字符串比对KMP算法
KMP算法
一种字符串比对算法,寻找存在于长字符串中的子串,由Knuth、Morris、Pratt三个同时提出来。
算法的基本思路是:
假设有序列seq(长度为m)和子序列subseq(长度为n),
首先遍历序列seq,找到subseq[0]第一次出现在seq中的位置k;
然后顺次比较subseq[j](0<j<n)与seq[i](k<i<m), 即比较以subseq[0]开头长度为j的序列subseq[0,j]和以seq[i-1]结尾长度为j的序列seq[i-j,i](此处之所以选择用i而不是k来描述seq的片段,是想通过序列片段而不是单个字符进行移动,继而减少移动次数):
若在j==n-1时,每个元素都相等,则定位了一个子序列,重复上面的过程即可找到所有子序列的分布位置,若对某一特定的j(0<j<n)使得subseq[j] <> seq[i], 这时则宣告最初找到的k不合意(应该后移),无法继续延伸,这时需要调整j的值, 继而影响到k的值。为了调整后,依然满足subseq[0,j’] == subseq[i-j’,i],就要求j’应该满足subseq[0,j’]==subseq[j-j’,j],同时最佳j’应该是满足这个条件的最大值且j’ <> j, 使得能继续参与匹配的序列最长。当然也存在不能找到合适的j’的情况,这时另j’等于0, 而增加i的值直到再次碰到subseq[0]。同时,我们可以看到j’的取值只取决于subseq,因此我们可以提前构建出这样一个索引数组来指导错误匹配之后的移动。
索引数组index的构建:
索引数组的构建就是对字符串进行自我扫描的过程,每个值都可以递归的由其前一个位的值得出。
假设index[j-1] = k, 只要index[k] == index[j],index[j] = index[j-1]+1, 因为index[0,k] = index[j-k,j], 所以index[0,k+1] == index[j-k, j+1]。若index[k] != index[j],这时可以回溯到index[index[j-1],若依然不满足则递归index[index[index[j-1]],直到满足或取 值为0, 由此得到index数组。
假设seq = ‘abaebacddebacdbace’, subseq = ‘bace’.
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
a b a e b a c d d e b a c d b a c e
b a c e
0 1 2 3
当k=1时,subseq[0] == seq[k],
然后subseq[1] == seq[2], subseq[2] != seq[3], 这时需要调整j的值,使得subseq[0,j’]依然等同于
seq[i-j’+1,i], 这就使得j’要满足使得subseq[0,j]的前j’的字母和后j’个字母完全相同, 由于没有合适的,
j’取值0,此时继续增大i的值直到再次在seq里遇到subseq[0], 重复以上过程。这个例子比较特殊没有利用到算法的优势。
若子序列不存在重复,用KMP算法和用下面的一个bruteforce程序效果应该相差不大, 另外python的find和index在处理字符串时速度也很可观。
另一个例子是matrix67上的, 大家可过去参考下。
参考:http://www.matrix67.com/blog/archives/115
http://stackoverflow.com/questions/425604/best-way-to-determine-if-a-sequence-is-in-another-sequence-in-python
#!/usr/bin/env python #Last Change: 2010-11-03 13:38:36 def compute_prefix_function(p): ''' compute_prefix_function(p) --> a list [p] is a pattern sequence to be searched in a longer sequence. ''' m = len(p) pi = [] * m k = for q in range(1, m): while k > and p[k] != p[q]: k = pi[k - 1] if p[k] == p[q]: k = k + 1 pi[q] = k return pi def kmp_matcher(t, p): ''' kmp_matcher(t, p) --> a list [t, p] is two sequences, which 'p' maybe partof 't'. ''' n = len(t) m = len(p) pi = compute_prefix_function(p) q = index = [] for i in range(n): while q > and p[q] != t[i]: q = pi[q - 1] if p[q] == t[i]: q = q + 1 if q == m: q = index.append(i-m+2) #-----end of for--------------------------- return -1 if len(index) == else index if __name__ == '__main__': str = "1230012123211212213012300121232112122130123001212321121221312300121232112122130" substr = "1212" #for i in range(10000000): # kmp_matcher(str, substr) print kmp_matcher(str, substr) |
def index(subseq, seq): ''''' index(subseq, seq) -->a list of numbers or -1 Return an index of [subseq] in the [seq]. Or -1 if [subseq] is not a subsequence of [seq]. The time complexity of the algorithm is O(n*m), where n, m = len(seq), len(subseq). >>>index('12', '0112') [2] >>>index([1,2], [011212]) [2, 4] >>>index('13', '0112') -1 ''' i, n, m = -1, len(seq), len(subseq) index = [] try: while True: i = seq.index(subseq[], i+1, n - m + 1) if subseq == seq[i:i+m]: index.append(i) except ValueError: return index if len(index) > else -1 def subseqInSeq(subseq, seq): ''''' subseqInSeq(subseq, seq) ---> list or -1 The same as index. ''' indexList = [] m = len(subseq) subseqRepla = '*' * m while subseq[] in seq: index = seq.index(subseq[]) if subseq == seq[index:index+m]: indexList.append(index) seq = seq.replace(subseq, subseqRepla, 1) else: seq = seq.replace(subseq[], '*', 1) return (indexList if len(indexList) > else -1) def main(): print index('ab', 'abcdab') print subseqInSeq('ab', 'abcdab') |
#include #include #include int kmp(char *substr, char *str, int *index); int prefix(char *substr, int lensub, int *subIndex); /* this is a string compare function using KMP */ int kmp(char *substr, char *str, int *index) { int lensub, lenstr; int i = ; int q = ; int k = ; //a index of array-index lensub = strlen(substr); lenstr = strlen(str); int subIndex[lensub]; prefix(substr, lensub, subIndex); for( i = ; i < lenstr ; i++ ) { while( (q > ) && (*(substr+q) != *(str+i))) { q = subIndex[q-1]; } if( *(substr+q) == *(str+i) ) { q++; } if( q == lensub ) { q = subIndex[q-1]; *(index+k) = i-lensub+2;//1 or 2 k++; } } return ; } /* this does a self kmp of short string to get the prefix function */ int prefix(char *substr, int lensub, int *subIndex) { int k = ; int q = ; for( q = ; q < lensub ; q++ ) { *(subIndex+q) = ; } for( q = 1 ; q < lensub ; q++ ) { while( k > && *(substr+k) != *(substr+q) ) { k = subIndex[k-1]; } if( *(substr+k) == *(substr+q) ) { k++; } *(subIndex+q) = k; } return ; } int main(int argc, char *argv[]) { clock_t start, finish; start = clock(); double duration; char *str = "CGTAACGCGTGCCGCGGCTACGTAACGCGGCTA"; char *substr = "CGCG"; int lenindex; lenindex = strlen(str)/strlen(substr); int index[lenindex]; int i; for( i = ; i < lenindex ; i++ ) { index[i] = -1; } kmp(substr, str, &index[]); for( i = ; i < lenindex ; i++ ) { if( index[i] == -1 ) { break; } printf("%d\n", index[i]); } finish = clock(); duration = (double)(finish-start) / CLOCKS_PER_SEC; printf("Done, %f seconds\n", duration); return ; } |
#include #include #include #include typedef unsigned char byte; int kmp(byte *bitsub, byte *bit, unsigned int lensub, unsigned int lenbit, unsigned int *index); int prefix(byte *bitsub, unsigned int lensub, unsigned int *subIndex); /* this is a string compare function using KMP */ int kmp(byte *bitsub, byte *bit, unsigned int lensub, unsigned int lenbit, unsigned int *index) { unsigned int i = ; unsigned int q = ; unsigned int k = ; //a index of array-index unsigned int subIndex[lensub]; prefix(bitsub, lensub, subIndex); for( i = ; i < lenbit ; i++ ) { while( (q > ) && (*(bitsub+q) ^ *(bit+i)))//*(bitsub+q) != *(bit+i)) { q = subIndex[q-1]; } if( !(*(bitsub+q) ^ *(bit+i)) )//*(bitsub+q) == *(bit+i)) { q++; } if( !(q ^ lensub) ) //q == lensub { q = ; *(index+k) = i-lensub+2; k++; } } return ; } /* this does a self kmp of short string to get the prefix function */ int prefix(byte *bitsub, unsigned int lensub, unsigned int *subIndex) { unsigned int k = ; unsigned int q = ; for( q = ; q < lensub ; q++ ) { *(subIndex+q) = ; } for( q = 1 ; q < lensub ; q++ ) { while( k > && (*(bitsub+k) ^ *(bitsub+q)) )//*(bitsub+k) != *(bitsub+q) { k = subIndex[k-1]; } if(!(*(bitsub+k) ^ *(bitsub+q)))//*(bitsub+k) == *(bitsub+q) ) { k++; } *(subIndex+q) = k; } return ; } int main(int argc, char *argv[]) { unsigned int i; struct timeval start; struct timeval finish; gettimeofday(&start, NULL); double duration; byte bit[79] = {1,2,3,,,1,2,1,2,3,2,1,1,2,1,2,2,1,3,, 1,2,3,,,1,2,1,2,3,2,1,1,2,1,2,2,1,3,,1,2,3,,,1,2, 1,2,3,2,1,1,2,1,2,2,1,3,1,2,3,,,1,2,1,2,3,2,1,1,2,1,2,2,1,3,}; byte bitsub[4] = {1,2,1,2}; unsigned int lenindex; unsigned int lenbit = sizeof(bit)/sizeof(byte); unsigned int lensub = sizeof(bitsub)/sizeof(byte); lenindex = lenbit/lensub; unsigned int index[lenindex]; for( i = ; i < lenindex ; i++ ) { index[i] = ; } for( i = ; i < 1000000 ; i++ ) { kmp(bitsub, bit, lensub, lenbit, &index[]); } kmp(bitsub, bit, lensub, lenbit, &index[]); for( i = ; i < lenindex ; i++ ) { if( index[i] == ) { break; } printf("%d\n", *(index+i)); } gettimeofday(&finish, NULL); duration = (finish.tv_sec-start.tv_sec) + (finish.tv_usec - start.tv_usec) / (1000000.1); printf("Done, %f seconds\n", duration); return ; } |