Rabin-Karp 算法



Rabin-Karp 算法是一种模式匹配算法,它使用哈希来比较模式和文本。这里,哈希指的是将较大的输入值映射到较小的输出值(称为哈希值)的过程。此过程有助于避免不必要的比较,从而优化该算法的复杂度。因此,Rabin-Karp 算法的时间复杂度为O(n + m),其中 n 是文本的长度,m 是模式的长度。

Rabin-Karp 算法是如何工作的?

Rabin-Karp 算法通过逐个移动窗口来检查文本中是否存在给定的模式,但它不会在所有情况下都检查所有字符,而是查找哈希值。然后,将此哈希值与文本中所有与模式长度相同的子串的哈希值进行比较。

如果哈希值匹配,则模式和子串可能相等,我们可以通过逐个字符比较来验证。如果哈希值不匹配,则可以跳过子串并继续下一个子串。在下一节中,我们将了解如何计算哈希值。

在 Rabin-Karp 算法中计算哈希值

计算哈希值的步骤如下:

步骤 1:分配模数和基值

假设我们有一个文本Txt = "DAACABCDBA"和一个模式Ptrn = "CAB"。我们将首先根据字符的排名为文本的字符分配数值。最左边的字符排名为 1,最右边的字符排名为 10。此外,对于我们的哈希函数,我们将使用基数b = 10(文本中字符的数量)和模数m = 11。需要注意的是,模数m需要是一个素数,因为它有助于避免溢出问题。

Ranking

步骤 2:计算模式的哈希值

计算模式哈希值的公式如下:

  hash value(Ptrn) = Σ(r * bl-i-1) mod 11 
     where, r: ranking of character
            l: length of Pattern
            i: index of character within the pattern

因此,Patrn的哈希值为:

     h(Ptrn) = ((4 * 102) + (5 * 101) + (6 * 100)) mod 11 
             = 456 mod 11 
             = 5

步骤 3:计算第一个文本窗口的哈希值

开始通过滑动文本中的字符来计算所有字符的哈希值。我们将从第一个子串开始,如下所示:

     h(DAA) = ((1 * 102) + (2 * 101) + (3 * 100)) mod 11 
            = 123 mod 11 
            = 6

现在,比较模式和子串的哈希值。如果它们匹配,则检查字符是否匹配。如果匹配,则找到匹配项,否则,移动到下一个字符。

在上面的示例中,哈希值不匹配。因此,我们移动到下一个字符。

步骤 4:更新哈希值

现在,我们需要删除前一个字符并移动到下一个字符。在此过程中,也应该更新哈希值,直到找到匹配项。

示例

以下示例实际上演示了 Rabin-Karp 算法的工作原理。

#include<stdio.h>
#include<string.h>
#define MAXCHAR 256 
// Function to perform Rabin-Karp algorithm
void rabinKSearch(char orgnlString[], char pattern[], int prime, int array[], int *index) {
   int patLen = strlen(pattern);
   int strLen = strlen(orgnlString);
   int charIndex, pattHash = 0, strHash = 0, h = 1; 
   // Calculate the value of helper variable
   for(int i = 0; i<patLen-1; i++) {
      h = (h*MAXCHAR) % prime;   
   }
   // Calculating initial hash values and first window 
   for(int i = 0; i<patLen; i++) {
      pattHash = (MAXCHAR*pattHash + pattern[i]) % prime;    
      strHash = (MAXCHAR*strHash + orgnlString[i]) % prime;   
   }
   // Slide the pattern over the text one by one
   for(int i = 0; i<=(strLen-patLen); i++) {
      // Check the hash values of current window of text and pattern
      if(pattHash == strHash) {      
         for(charIndex = 0; charIndex < patLen; charIndex++) {
            if(orgnlString[i+charIndex] != pattern[charIndex])
               break;
         }

         if(charIndex == patLen) {   
            (*index)++;
            array[(*index)] = i;
         }
      }
      // Calculating hash value for next window of text
      if(i < (strLen-patLen)) {    
         strHash = (MAXCHAR*(strHash - orgnlString[i]*h) + orgnlString[i+patLen])%prime;
         // If strHash is negative, convert it to positive
         if(strHash < 0) {
            strHash += prime;    
         }
      }
   }
}
int main() {
   char orgnlString[] = "AAAABCAEAAABCBDDAAAABC"; 
   char pattern[] = "AABC"; 
   int locArray[strlen(orgnlString)]; 
   int prime = 101; 
   int index = -1; 
   // Calling Rabin-Karp search function
   rabinKSearch(orgnlString, pattern, prime, locArray, &index); 
   for(int i = 0; i <= index; i++) {
      printf("Pattern found at position: %d\n", locArray[i]);
   }
   return 0;
}
#include<iostream> 
#define MAXCHAR 256 
using namespace std; 
// Function to perform Rabin-Karp algorithm
void rabinKSearch(string orgnlString, string pattern, int prime, int array[], int *index) {
   int patLen = pattern.size();
   int strLen = orgnlString.size();
   int charIndex, pattHash = 0, strHash = 0, h = 1; 
   // Calculate the value of helper variable
   for(int i = 0; i<patLen-1; i++) {
      h = (h*MAXCHAR) % prime;   
   }
   // Calculating initial hash values and first window 
   for(int i = 0; i<patLen; i++) {
      pattHash = (MAXCHAR*pattHash + pattern[i]) % prime;    
      strHash = (MAXCHAR*strHash + orgnlString[i]) % prime;   
   }
   // Slide the pattern over the text one by one
   for(int i = 0; i<=(strLen-patLen); i++) {
      // Check the hash values of current window of text and pattern
      if(pattHash == strHash) {      
         for(charIndex = 0; charIndex < patLen; charIndex++) {
            if(orgnlString[i+charIndex] != pattern[charIndex])
               break;
         }

         if(charIndex == patLen) {   
            (*index)++;
            array[(*index)] = i;
         }
      }
      // Calculating hash value for next window of text
      if(i < (strLen-patLen)) {    
         strHash = (MAXCHAR*(strHash - orgnlString[i]*h) + orgnlString[i+patLen])%prime;
         // If strHash is negative, convert it to positive
         if(strHash < 0) {
            strHash += prime;    
         }
      }
   }
}
int main() {
   string orgnlString = "AAAABCAEAAABCBDDAAAABC"; 
   // Pattern to be searched
   string pattern = "AABC"; 
   // Array to store the locations of the pattern
   int locArray[orgnlString.size()]; 
   int prime = 101; 
   int index = -1; 
   // Calling Rabin-Karp search function
   rabinKSearch(orgnlString, pattern, prime, locArray, &index); 
   // print the result
   for(int i = 0; i <= index; i++) {
      cout << "Pattern found at position: " << locArray[i]<<endl;
   }
}
import java.util.ArrayList;
public class Main {
   static final int MAXCHAR = 256;
   // method to perform Rabin-Karp algorithm
   static void rabinKSearch(String orgnlString, String pattern, int prime, ArrayList<Integer> locArray) {
      int patLen = pattern.length();
      int strLen = orgnlString.length();
      int charIndex, pattHash = 0, strHash = 0, h = 1;
      // Calculating value of helper variable
      for (int i = 0; i < patLen - 1; i++) {
         h = (h * MAXCHAR) % prime;
      }
      // Calculating initial hash values and first window 
      for (int i = 0; i < patLen; i++) {
         pattHash = (MAXCHAR * pattHash + pattern.charAt(i)) % prime;
         strHash = (MAXCHAR * strHash + orgnlString.charAt(i)) % prime;
      }
      // Slide the pattern over the text one by one 
      for (int i = 0; i <= (strLen - patLen); i++) {
         // Check the hash values of current window of text and pattern
         if (pattHash == strHash) {
            for (charIndex = 0; charIndex < patLen; charIndex++) {
               if (orgnlString.charAt(i + charIndex) != pattern.charAt(charIndex))
                  break;
            }

            if (charIndex == patLen) {
               locArray.add(i);
            }
         }
         // Calculating hash value for next window of text
         if (i < (strLen - patLen)) {
            strHash = (MAXCHAR * (strHash - orgnlString.charAt(i) * h) + orgnlString.charAt(i + patLen)) % prime;
            // If strHash is negative, convert it to positive
            if (strHash < 0) {
               strHash += prime;
            }
         }
      }
   }
   public static void main(String[] args) {
      String orgnlString = "AAAABCAEAAABCBDDAAAABC";
      // Pattern to be searched
      String pattern = "AABC";
      // Array to store the locations of the pattern
      ArrayList<Integer> locArray = new ArrayList<>();
      int prime = 101;
      // Calling Rabin-Karp method
      rabinKSearch(orgnlString, pattern, prime, locArray);
      // print the result
      for (int i = 0; i < locArray.size(); i++) {
         System.out.println("Pattern found at position: " + locArray.get(i));
      }
   }
}
MAXCHAR = 256 
# method to perform Rabin-Karp algorithm
def rabinKSearch(orgnlString, pattern, prime):
    patLen = len(pattern)
    strLen = len(orgnlString)
    pattHash = 0
    strHash = 0
    h = 1
    locArray = []
    # Calculating value of helper variable
    for i in range(patLen-1):
        h = (h*MAXCHAR) % prime
    # Calculating initial hash values and first window 
    for i in range(patLen):
        pattHash = (MAXCHAR*pattHash + ord(pattern[i])) % prime
        strHash = (MAXCHAR*strHash + ord(orgnlString[i])) % prime
    # Slide the pattern over the text one by one 
    for i in range(strLen-patLen+1):
        if pattHash == strHash:
            for charIndex in range(patLen):
                if orgnlString[i+charIndex] != pattern[charIndex]:
                    break
            else:
                locArray.append(i)
        # Calculating hash value for next window of text
        if i < strLen-patLen:
            strHash = (MAXCHAR*(strHash - ord(orgnlString[i])*h) + ord(orgnlString[i+patLen])) % prime
            if strHash < 0:
                strHash += prime

    return locArray

def main():
    orgnlString = "AAAABCAEAAABCBDDAAAABC"
    pattern = "AABC"
    prime = 101
    locArray = rabinKSearch(orgnlString, pattern, prime)
    for i in locArray:
        print(f"Pattern found at position: {i}")

if __name__ == "__main__":
    main()

输出

Pattern found at position: 2
Pattern found at position: 9
Pattern found at position: 18
广告