Knuth-Morris-Pratt 算法



用于模式匹配的 KMP 算法

KMP 算法用于解决模式匹配问题,模式匹配问题是在文本中查找给定模式的所有出现位置的任务。在查找多个模式时非常有用。例如,如果文本是“aabbaaccaabbaadde”,模式是“aabaa”,则模式在文本中出现了两次,分别位于索引 0 和 8 处。

此问题的朴素解决方案是从最左边的位置开始,向右移动,将模式与文本的每个可能的子字符串进行比较。这需要 O(n*m) 时间,其中 'n' 是文本的长度,'m' 是模式的长度。

当我们处理长文本文档时,蛮力法和朴素方法可能会导致冗余比较。为了避免这种冗余,Knuth、Morris 和 Pratt 开发了一种线性序列匹配算法,称为 KMP 模式匹配算法。它也称为 Knuth Morris Pratt 模式匹配算法。

KMP 算法如何工作?

KMP 算法从左到右开始搜索操作。它使用前缀函数来避免在搜索模式时进行不必要的比较。此函数存储迄今为止匹配的字符数,称为LPS 值。KMP 算法涉及以下步骤:

  • 定义前缀函数。

  • 滑动模式以进行比较。

  • 如果所有字符都匹配,则表示已找到匹配项。

  • 如果没有匹配,则使用前缀函数跳过不必要的比较。如果与不匹配字符之前的字符的 LPS 值为“0”,则从模式的索引 0 开始与文本中的下一个字符进行比较。但是,如果 LPS 值大于“0”,则从等于之前不匹配字符的 LPS 值的索引值开始比较。

KMP Solution

KMP 算法需要 O(n + m) 时间和 O(m) 空间。它比朴素解决方案更快,因为它跳过了冗余比较,并且每个文本字符最多只比较一次。

让我们通过一个示例了解模式匹配问题的输入输出场景:

Input:
main String: "AAAABCAAAABCBAAAABC" 
pattern: "AAABC"
Output:
Pattern found at position: 1
Pattern found at position: 7
Pattern found at position: 14

示例

以下示例从实践上说明了用于模式匹配的 KMP 算法。

#include<stdio.h>
#include<stdlib.h>
#include<string.h>
// function to find prefix
void prefixSearch(char* pat, int m, int* pps) {
   int length = 0; 
   // array to store prefix
   pps[0] = 0;      
   int i = 1; 
   while(i < m) { 
      // to check if the current character matches the previous character
      if(pat[i] == pat[length]) { 
          // increment the length
         length++; 
         // store the length in the prefix array  
         pps[i] = length;  
      }else { 
         if(length != 0) { 
            // to update length of previous prefix length
            length = pps[length - 1]; 
            i--; 
         } else 
            // if the length is 0, store 0 in the prefix array
            pps[i] = 0; 
      }
      i++; // incrementing i 
   }
}
// function to search for pattern
void patrnSearch(char* orgnString, char* patt, int m, int *locArray, int *loc) {
   int n, i = 0, j = 0; 
   n = strlen(orgnString); 
   // array to store the prefix values  
   int* prefixArray = (int*)malloc(m * sizeof(int)); // allocate memory for the prefix array
   // calling prefix function to fill the prefix array
   prefixSearch(patt, m, prefixArray); 
   *loc = 0; // initialize the location index
   while(i < n) { 
       // checking if main string character matches pattern string character
      if(orgnString[i] == patt[j]) { 
         // increment both i and j
         i++; 
         j++; 
      }
      // if j and m are equal pattern is found
      if(j == m) { 
         // store the location of the pattern
         locArray[*loc] = i-j; 
         (*loc)++; // increment the location index
          // update j to the previous prefix value
         j = prefixArray[j-1];
      // checking if i is less than n and the current characters do not match
      }else if(i < n && patt[j] != orgnString[i]) { 
         if(j != 0) 
            // update j to the previous prefix value
            j = prefixArray[j-1]; 
         // if j is zero    
         else 
            i++; // increment i
      }
   }
   free(prefixArray); // free the memory of the prefix array
}
int main() {
    // declare the original text
   char* orgnStr = "AAAABCAEAAABCBDDAAAABC"; 
   // pattern to be found
   char* patrn = "AAABC"; 
   // get the size of the pattern
   int m = strlen(patrn);
   // array to store the locations of the pattern
   int locationArray[strlen(orgnStr)]; 
   // to store the number of locations
   int index; 
   // calling pattern search function
   patrnSearch(orgnStr, patrn, m, locationArray, &index); 
   // to loop through location array
   for(int i = 0; i<index; i++) { 
      // print the location of the pattern
      printf("Pattern found at location: %d\n", locationArray[i]); 
   }
}
#include<iostream>
using namespace std; 
// function to find prefix
void prefixSearch(string pattern, int m, int storePrefx[]) {
   int length = 0; 
   // array to store prefix
   storePrefx[0] = 0;      
   int i = 1; 
   while(i < m) { 
      // to check if the current character matches the previous character
      if(pattern[i] == pattern[length]) { 
         // increment the length
         length++; 
         // store the length in the prefix array  
         storePrefx[i] = length;  
      }else { 
         if(length != 0) { 
            // to update length of previous prefix length
            length = storePrefx[length - 1]; 
            i--; 
         } else 
            // if the length is 0, store 0 in the prefix array
            storePrefx[i] = 0; 
      }
      i++; // incrementing i 
   }
}
// function to search for pattern
void patrnSearch(string orgnString, string patt, int *locArray, int &loc) {
   int n, m, i = 0, j = 0; 
   n = orgnString.size(); 
   m = patt.size(); 
   // array to store the prefix values  
   int prefixArray[m];  
   // calling prefix function to fill the prefix array
   prefixSearch(patt, m, prefixArray); 
   loc = 0; // initialize the location index
   while(i < n) { 
      // checking if main string character matches pattern string character
      if(orgnString[i] == patt[j]) { 
         // increment both i and j
         i++; 
         j++; 
      }
      // if j and m are equal pattern is found
      if(j == m) { 
         // store the location of the pattern
         locArray[loc] = i-j; 
         loc++; // increment the location index
         // update j to the previous prefix value
         j = prefixArray[j-1];
      // checking if i is less than n and the current characters do not match
      }else if(i < n && patt[j] != orgnString[i]) { 
         if(j != 0) 
            // update j to the previous prefix value
            j = prefixArray[j-1]; 
         // if j is zero    
         else 
            i++; // increment i
      }
   }
}
int main() {
   // declare the original text
   string orgnStr = "AAAABCAEAAABCBDDAAAABC"; 
   // pattern to be found
   string patrn = "AAABC"; 
   // array to store the locations of the pattern
   int locationArray[orgnStr.size()]; 
   // to store the number of locations
   int index; 
   // calling pattern search function
   patrnSearch(orgnStr, patrn, locationArray, index); 
   // to loop through location array
   for(int i = 0; i<index; i++) { 
      // print the location of the pattern
      cout << "Pattern found at location: " <<locationArray[i] << endl; 
   }
}
import java.io.*; 
// class to implement the KMP algorithm
public class KMPalgo {
   // function to find prefix
   public static void prefixSearch(String pat, int m, int[] storePrefx) {
      int length = 0;
      // array to store prefix
      storePrefx[0] = 0;
      int i = 1;
      while (i < m) {
         // to check if the current character matches the previous character
         if (pat.charAt(i) == pat.charAt(length)) {
            // increment the length
            length++;
            // store the length in the prefix array
            storePrefx[i] = length;
         } else {
            if (length != 0) {
               // to update length of previous prefix length
               length = storePrefx[length - 1];
               i--;
            } else
               // if the length is 0, store 0 in the prefix array
               storePrefx[i] = 0;
            }
         i++; // incrementing i
      }
   }
   // function to search for pattern
   public static int patrnSearch(String orgnString, String patt, int[] locArray) {
      int n, m, i = 0, j = 0;
      n = orgnString.length();
      m = patt.length();
      // array to store the prefix values
      int[] prefixArray = new int[m]; // allocate memory for the prefix array
      // calling prefix function to fill the prefix array
      prefixSearch(patt, m, prefixArray);
      int loc = 0; // initialize the location index
      while (i < n) {
         // checking if main string character matches pattern string character
         if (orgnString.charAt(i) == patt.charAt(j)) {
            // increment both i and j
            i++;
            j++;
         }
         // if j and m are equal pattern is found
         if (j == m) {
            // store the location of the pattern
            locArray[loc] = i - j;
            loc++; // increment the location index
            // update j to the previous prefix value
            j = prefixArray[j - 1];
            // checking if i is less than n and the current characters do not match
         } else if (i < n && patt.charAt(j) != orgnString.charAt(i)) {
            if (j != 0)
               // update j to the previous prefix value
               j = prefixArray[j - 1];
               // if j is zero
            else
               i++; // increment i
         }
      }
      return loc;
   }
   public static void main(String[] args) throws IOException {
      // declare the original text
      String orgnStr = "AAAABCAEAAABCBDDAAAABC";
      // pattern to be found
      String patrn = "AAABC";
      // array to store the locations of the pattern
      int[] locationArray = new int[orgnStr.length()];
      // calling pattern search function
      int index = patrnSearch(orgnStr, patrn, locationArray);
      // to loop through location array
      for (int i = 0; i < index; i++) {
         // print the location of pattern
         System.out.println("Pattern found at location: " + locationArray[i]);
      }
   }
}
# function to find prefix
def prefix_search(pattern, m, store_prefx):
   length = 0
   # array to store prefix
   store_prefx[0] = 0
   i = 1
   while i < m:
      # to check if the current character matches the previous character
      if pattern[i] == pattern[length]:
         # increment the length
         length += 1
         # store the length in the prefix array
         store_prefx[i] = length
      else:
         if length != 0:
            # to update length of previous prefix length
            length = store_prefx[length - 1]
            i -= 1
         else:
            # if the length is 0, store 0 in the prefix array
            store_prefx[i] = 0
      i += 1  # incrementing i
# function to search for pattern
def pattern_search(orgn_string, patt, loc_array):
   n = len(orgn_string)
   m = len(patt)
   i = j = loc = 0
   # array to store the prefix values
   prefix_array = [0] * m
   # calling prefix function to fill the prefix array
   prefix_search(patt, m, prefix_array)
   while i < n:
      # checking if main string character matches pattern string character
      if orgn_string[i] == patt[j]:
         # increment both i and j
         i += 1
         j += 1
      # if j and m are equal pattern is found
      if j == m:
         # store the location of the pattern
         loc_array[loc] = i - j
         loc += 1  # increment the location index
         # update j to the previous prefix value
         j = prefix_array[j - 1]
      # checking if i is less than n and the current characters do not match
      elif i < n and patt[j] != orgn_string[i]:
         if j != 0:
            # update j to the previous prefix value
            j = prefix_array[j - 1]
         else:
            i += 1  # increment i
   return loc
# main function
def main():
   # declare the original text
   orgn_str = "AAAABCAEAAABCBDDAAAABC"
   # pattern to be found
   patrn = "AAABC"
   # array to store the locations of the pattern
   location_array = [0] * len(orgn_str)
   # calling pattern search function
   index = pattern_search(orgn_str, patrn, location_array)
   # to loop through location array
   for i in range(index):
      # print the location of the pattern
      print("Pattern found at location:", location_array[i])
# call the main function
if __name__ == "__main__":
   main()

输出

Pattern found at location: 1
Pattern found at location: 8
Pattern found at location: 17
广告