字符串比对KMP算法

KMP算法

一种字符串比对算法,寻找存在于长字符串中的子串,由Knuth、Morris、Pratt三个同时提出来。

算法的基本思路是:

假设有序列seq(长度为m)和子序列subseq(长度为n),
首先遍历序列seq,找到subseq[0]第一次出现在seq中的位置k;

然后顺次比较subseq[j](0<j<n)与seq[i](k<i<m), 即比较以subseq[0]开头长度为j的序列subseq[0,j]和以seq[i-1]结尾长度为j的序列seq[i-j,i](此处之所以选择用i而不是k来描述seq的片段,是想通过序列片段而不是单个字符进行移动,继而减少移动次数):

若在j==n-1时,每个元素都相等,则定位了一个子序列,重复上面的过程即可找到所有子序列的分布位置,若对某一特定的j(0<j<n)使得subseq[j] <> seq[i], 这时则宣告最初找到的k不合意(应该后移),无法继续延伸,这时需要调整j的值, 继而影响到k的值。为了调整后,依然满足subseq[0,j’] == subseq[i-j’,i],就要求j’应该满足subseq[0,j’]==subseq[j-j’,j],同时最佳j’应该是满足这个条件的最大值且j’ <> j, 使得能继续参与匹配的序列最长。当然也存在不能找到合适的j’的情况,这时另j’等于0, 而增加i的值直到再次碰到subseq[0]。同时,我们可以看到j’的取值只取决于subseq,因此我们可以提前构建出这样一个索引数组来指导错误匹配之后的移动。

索引数组index的构建:
索引数组的构建就是对字符串进行自我扫描的过程,每个值都可以递归的由其前一个位的值得出。
假设index[j-1] = k, 只要index[k] == index[j],index[j] = index[j-1]+1, 因为index[0,k] = index[j-k,j], 所以index[0,k+1] == index[j-k, j+1]。若index[k] != index[j],这时可以回溯到index[index[j-1],若依然不满足则递归index[index[index[j-1]],直到满足或取 值为0, 由此得到index数组。

假设seq = ‘abaebacddebacdbace’, subseq = ‘bace’.
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
a b a e b a c d d e b  a  c  d  b  a  c  e

b a c e
0 1 2 3

当k=1时,subseq[0] == seq[k],
然后subseq[1] == seq[2], subseq[2] != seq[3], 这时需要调整j的值,使得subseq[0,j’]依然等同于
seq[i-j’+1,i], 这就使得j’要满足使得subseq[0,j]的前j’的字母和后j’个字母完全相同, 由于没有合适的,
j’取值0,此时继续增大i的值直到再次在seq里遇到subseq[0], 重复以上过程。这个例子比较特殊没有利用到算法的优势。
若子序列不存在重复,用KMP算法和用下面的一个bruteforce程序效果应该相差不大, 另外python的find和index在处理字符串时速度也很可观。

另一个例子是matrix67上的, 大家可过去参考下。

参考:http://www.matrix67.com/blog/archives/115

http://stackoverflow.com/questions/425604/best-way-to-determine-if-a-sequence-is-in-another-sequence-in-python

#!/usr/bin/env python
 
#Last Change:  2010-11-03 13:38:36
def compute_prefix_function(p):
    '''
    compute_prefix_function(p) --> a list
 
    [p] is a pattern sequence to be searched in a longer sequence.
    '''
    m = len(p)
    pi = [] * m
    k = 
    for q in range(1, m):
        while k &gt;  and p[k] != p[q]:
            k = pi[k - 1]
        if p[k] == p[q]:
            k = k + 1
        pi[q] = k
    return pi
 
def kmp_matcher(t, p):
    '''
    kmp_matcher(t, p) --> a list
 
    [t, p] is two sequences, which 'p' maybe partof 't'.
    '''
    n = len(t)
    m = len(p)
    pi = compute_prefix_function(p)
    q = 
    index = []
    for i in range(n):
        while q &gt;  and p[q] != t[i]:
            q = pi[q - 1]
        if p[q] == t[i]:
            q = q + 1
        if q == m:
            q = 
            index.append(i-m+2)
    #-----end of for---------------------------
    return -1 if len(index) ==  else index
 
if __name__ == '__main__':
    str = "1230012123211212213012300121232112122130123001212321121221312300121232112122130"
    substr = "1212"
    #for i in range(10000000):
    #    kmp_matcher(str, substr)
    print kmp_matcher(str, substr)

def index(subseq, seq):
    '''''
    index(subseq, seq) -->a list of numbers or -1
    Return an index of [subseq] in the [seq].
    Or -1 if [subseq] is not a subsequence of [seq].
    The time complexity of the algorithm is O(n*m), where
    n, m = len(seq), len(subseq).
    >>>index('12', '0112')
    [2]
    >>>index([1,2], [011212])
    [2, 4]
    >>>index('13', '0112')
    -1
    '''
    i, n, m = -1, len(seq), len(subseq)
    index = []
    try:
        while True:
            i = seq.index(subseq[], i+1, n - m + 1)
            if subseq == seq[i:i+m]:
                index.append(i)
    except ValueError:
        return index if len(index) &gt;  else -1
def subseqInSeq(subseq, seq):
    '''''
    subseqInSeq(subseq, seq) ---> list or -1
    The same as index.
    '''
    indexList = []
    m = len(subseq)
    subseqRepla = '*' * m
    while subseq[] in seq:
        index = seq.index(subseq[])
        if subseq == seq[index:index+m]:
            indexList.append(index)
            seq = seq.replace(subseq, subseqRepla, 1)
        else:
            seq = seq.replace(subseq[], '*', 1)
    return (indexList if len(indexList) &gt;  else -1)
def main():
    print index('ab', 'abcdab')
    print subseqInSeq('ab', 'abcdab')

#include
#include
#include 
 
int kmp(char *substr, char *str, int *index);
int prefix(char *substr, int lensub, int *subIndex);
 
/* this is a string compare function using KMP */
int kmp(char *substr, char *str, int *index)
{
	int lensub, lenstr;
	int i = ;
	int q = ;
	int k = ; //a index of array-index
	lensub = strlen(substr);
	lenstr = strlen(str);
	int subIndex[lensub];
 
	prefix(substr, lensub, subIndex);
 
	for( i =  ; i &lt; lenstr ; i++ )
	{
		while( (q &gt; ) &amp;&amp; (*(substr+q) != *(str+i)))
		{
			q = subIndex[q-1];
		}
		if( *(substr+q) == *(str+i) )
		{
			q++;
		}
		if( q == lensub )
		{
			q = subIndex[q-1];
			*(index+k) = i-lensub+2;//1 or 2
			k++;
		}
	}
	return ;
}
 
/* this does a self kmp of short string to get the prefix function */
int prefix(char *substr, int lensub, int *subIndex)
{
	int k = ;
	int q = ;
 
	for( q =  ; q &lt; lensub ; q++ )
	{
		*(subIndex+q) = ;
	}
 
	for( q = 1 ; q &lt; lensub ; q++ )
	{
		while( k &gt;  &amp;&amp; *(substr+k) != *(substr+q) )
		{
			k = subIndex[k-1];
		}
		if( *(substr+k) == *(substr+q) )
		{
			k++;
		}
		*(subIndex+q) = k;
	}
	return ;
}
 
int main(int argc, char *argv[])
{
	clock_t start, finish;
	start = clock();
	double duration;
	char *str = "CGTAACGCGTGCCGCGGCTACGTAACGCGGCTA";
 
	char *substr = "CGCG";
 
	int lenindex;
	lenindex = strlen(str)/strlen(substr);
	int index[lenindex];
	int i;
	for( i =  ; i &lt; lenindex ; i++ )
	{
		index[i] = -1;
	}
 
	kmp(substr, str, &amp;index[]);
 
	for( i =  ; i &lt; lenindex ; i++ )
	{
		if( index[i] == -1 )
		{
			break;
		}
		printf("%d\n", index[i]);
	}
	finish = clock();
	duration = (double)(finish-start) / CLOCKS_PER_SEC;
	printf("Done, %f seconds\n", duration);
 
	return ;
}

#include
#include
#include
#include 
 
typedef unsigned char byte;
 
int kmp(byte *bitsub, byte *bit, unsigned int lensub,
		unsigned int lenbit, unsigned int *index);
int prefix(byte *bitsub, unsigned int lensub, unsigned int *subIndex);
 
/* this is a string compare function using KMP */
int kmp(byte *bitsub, byte *bit, unsigned int lensub,
		unsigned int lenbit, unsigned int *index)
{
	unsigned int i = ;
	unsigned int q = ;
	unsigned int k = ; //a index of array-index
	unsigned int subIndex[lensub];
 
	prefix(bitsub, lensub, subIndex);
 
	for( i =  ; i &lt; lenbit ; i++ )
	{
		while( (q &gt; ) &amp;&amp; (*(bitsub+q) ^ *(bit+i)))//*(bitsub+q) != *(bit+i))
		{
			q = subIndex[q-1];
		}
		if( !(*(bitsub+q) ^ *(bit+i)) )//*(bitsub+q) == *(bit+i))
		{
			q++;
		}
		if( !(q ^ lensub) ) //q == lensub
		{
			q = ;
			*(index+k) = i-lensub+2;
			k++;
		}
	}
 
	return ;
}
 
/* this does a self kmp of short string to get the prefix function */
int prefix(byte *bitsub, unsigned int lensub, unsigned int *subIndex)
{
	unsigned int k = ;
	unsigned int q = ;
 
	for( q =  ; q &lt; lensub ; q++ )
	{
		*(subIndex+q) = ;
	}
 
	for( q = 1 ; q &lt; lensub ; q++ )
	{
		while( k &gt;  &amp;&amp; (*(bitsub+k) ^ *(bitsub+q)) )//*(bitsub+k) != *(bitsub+q)
		{
			k = subIndex[k-1];
		}
		if(!(*(bitsub+k) ^ *(bitsub+q)))//*(bitsub+k) == *(bitsub+q) )
		{
			k++;
		}
		*(subIndex+q) = k;
	}
	return ;
}
 
int main(int argc, char *argv[])
{
 
	unsigned int i;
	struct timeval start;
	struct timeval finish;
	gettimeofday(&amp;start, NULL);
 
	double duration;
	byte bit[79] = {1,2,3,,,1,2,1,2,3,2,1,1,2,1,2,2,1,3,,
		1,2,3,,,1,2,1,2,3,2,1,1,2,1,2,2,1,3,,1,2,3,,,1,2,
		1,2,3,2,1,1,2,1,2,2,1,3,1,2,3,,,1,2,1,2,3,2,1,1,2,1,2,2,1,3,};
	byte bitsub[4] = {1,2,1,2};
 
	unsigned int lenindex;
	unsigned int lenbit = sizeof(bit)/sizeof(byte);
	unsigned int lensub = sizeof(bitsub)/sizeof(byte);
	lenindex = lenbit/lensub;
	unsigned int index[lenindex];
	for( i =  ; i &lt; lenindex ; i++ )
	{
		index[i] = ;
	}
	for( i =  ; i &lt; 1000000 ; i++ )
	{
		kmp(bitsub, bit, lensub, lenbit, &amp;index[]);
	}
 
	kmp(bitsub, bit, lensub, lenbit, &amp;index[]);
 
	for( i =  ; i &lt; lenindex ; i++ )
	{
		if( index[i] ==  )
		{
			break;
		}
		printf("%d\n", *(index+i));
	}
 
	gettimeofday(&amp;finish, NULL);
 
	duration = (finish.tv_sec-start.tv_sec)
		+ (finish.tv_usec - start.tv_usec) / (1000000.1);
	printf("Done, %f seconds\n", duration);
 
	return ;
}
CHENTONG
版权声明:本文为博主原创文章,转载请注明出处。
alipay.png

CHENTONG

CHENTONG
积微,月不胜日,时不胜月,岁不胜时。凡人好敖慢小事,大事至,然后兴之务之。如是,则常不胜夫敦比于小事者矣!何也?小事之至也数,其悬日也博,其为积也大。大事之至也希,其悬日也浅,其为积也小。故善日者王,善时者霸,补漏者危,大荒者亡!故,王者敬日,霸者敬时,仅存之国危而后戚之。亡国至亡而后知亡,至死而后知死,亡国之祸败,不可胜悔也。霸者之善著也,可以时托也。王者之功名,不可胜日志也。财物货宝以大为重,政教功名者反是,能积微者速成。诗曰:德如毛,民鲜能克举之。此之谓也。

R 学习

R语言是比较常用的统计分析和绘图语言,拥有强大的统计库、绘图库和生信分析的Bioconductor库,是学习生物信息分析的必备语言之一。Rstudio是编辑、运行R语言的最为理想的工具之一,支持纯R脚本、Rmarkdown (脚本文档混排)、Bookdown (脚本文档混排...… Continue reading

本地使用Rfam 12.0+

Published on June 16, 2017

Linux学习(一)- 文件和目录

Published on June 08, 2017