- 数据结构与算法
- DSA - 首页
- DSA - 概述
- DSA - 环境设置
- DSA - 算法基础
- DSA - 渐进分析
- 数据结构
- DSA - 数据结构基础
- DSA - 数据结构和类型
- DSA - 数组数据结构
- 链表
- DSA - 链表数据结构
- DSA - 双向链表数据结构
- DSA - 循环链表数据结构
- 栈与队列
- DSA - 栈数据结构
- DSA - 表达式解析
- DSA - 队列数据结构
- 搜索算法
- DSA - 搜索算法
- DSA - 线性搜索算法
- DSA - 二分搜索算法
- DSA - 插值搜索
- DSA - 跳跃搜索算法
- DSA - 指数搜索
- DSA - 斐波那契搜索
- DSA - 子列表搜索
- DSA - 哈希表
- 排序算法
- DSA - 排序算法
- DSA - 冒泡排序算法
- DSA - 插入排序算法
- DSA - 选择排序算法
- DSA - 归并排序算法
- DSA - 希尔排序算法
- DSA - 堆排序
- DSA - 桶排序算法
- DSA - 计数排序算法
- DSA - 基数排序算法
- DSA - 快速排序算法
- 图数据结构
- DSA - 图数据结构
- DSA - 深度优先遍历
- DSA - 广度优先遍历
- DSA - 生成树
- 树数据结构
- DSA - 树数据结构
- DSA - 树遍历
- DSA - 二叉搜索树
- DSA - AVL树
- DSA - 红黑树
- DSA - B树
- DSA - B+树
- DSA - 伸展树
- DSA - Trie树
- DSA - 堆数据结构
- 递归
- DSA - 递归算法
- DSA - 使用递归的汉诺塔
- DSA - 使用递归的斐波那契数列
- 分治法
- DSA - 分治法
- DSA - 最大-最小问题
- DSA - Strassen矩阵乘法
- DSA - Karatsuba算法
- 贪心算法
- DSA - 贪心算法
- DSA - 旅行商问题(贪心法)
- DSA - Prim最小生成树
- DSA - Kruskal最小生成树
- DSA - Dijkstra最短路径算法
- DSA - 地图着色算法
- DSA - 分数背包问题
- DSA - 带截止日期的作业排序
- DSA - 最优合并模式算法
- 动态规划
- DSA - 动态规划
- DSA - 矩阵链乘法
- DSA - Floyd-Warshall算法
- DSA - 0-1背包问题
- DSA - 最长公共子序列算法
- DSA - 旅行商问题(动态规划法)
- 近似算法
- DSA - 近似算法
- DSA - 顶点覆盖算法
- DSA - 集合覆盖问题
- DSA - 旅行商问题(近似算法)
- 随机化算法
- DSA - 随机化算法
- DSA - 随机化快速排序算法
- DSA - Karger最小割算法
- DSA - Fisher-Yates洗牌算法
- DSA有用资源
- DSA - 问答
- DSA - 快速指南
- DSA - 有用资源
- DSA - 讨论
Aho-Corasick算法
用于模式匹配的Aho-Corasick算法
Aho-Corasick算法是一种字典匹配算法,它可以在线性时间内找到给定文本中所有模式集的所有出现。它由Alfred V. Aho和Margaret J. Corasick于1975年开发,广泛应用于恶意软件检测、文本分析和自然语言处理等应用。
Aho-Corasick算法是如何工作的?
Aho-Corasick算法只需要遍历文本一次即可搜索所有模式,并且不会进行任何不必要的回溯。它可以轻松处理不同长度的多个关键字,也可以轻松处理重叠匹配。
该算法借助Trie数据结构跟踪已搜索的模式。Trie是一种主要用于存储字符串的树形数据结构。
让我们通过一个例子来理解:
Input:
set of patterns = {their, there, any, bye}
main string = "isthereanyanswerokgoodbye"
Output:
Word there location: 2
Word any location: 7
Word bye location: 22
Aho-Corasick算法包含以下步骤:
- 预处理
- 搜索/匹配
预处理阶段
在预处理步骤中,我们从关键字构建有限状态机(Trie)。Trie的每个节点代表关键字的一个前缀,每个可能的扩展前缀都有一条用字符标记的边。Trie的根节点代表空前缀,最后一个节点标记为最终节点。
预处理步骤进一步分为三个子步骤:
跳转阶段 - 此阶段根据模式的字符定义状态之间的转换。它用二维数组表示。
失败阶段 - 它定义了当发生不匹配时状态之间的转换。它用一维数组表示。
输出阶段 - 在此阶段,算法存储在给定状态下结束的所有模式的索引。它也由一维数组表示。
搜索/匹配阶段
搜索步骤是通过从左到右扫描文本,并根据文本中的字符在Trie中跟随边和失败链接来完成的。每当我们到达最终节点时,我们都会报告文本中相应关键字的匹配。
示例
以下示例演示了Aho-Corasick算法在不同编程语言中的工作原理。
#include <stdio.h>
#include <string.h>
#define MAXS 500 // sum of the length of all patterns
#define MAXC 26 // as 26 letters in alphabet
int output[MAXS];
int fail[MAXS];
int gotoMat[MAXS][MAXC];
int buildTree(char* array[], int size) {
for(int i = 0; i<MAXS; i++)
output[i] = 0; // all element of output array is 0
for(int i = 0; i<MAXS; i++)
fail[i] = -1; // all element of failure array is -1
for(int i = 0; i<MAXS; i++)
for(int j = 0; j<MAXC; j++)
gotoMat[i][j] = -1; // all element of goto matrix is -1
// initial state
int state = 1;
// make trie for all pattern in array
for (int i = 0; i < size; i++) {
char* word = array[i];
int presentState = 0;
// adding pattern
for (int j = 0; j < strlen(word); ++j) {
int ch = word[j] - 'a';
if (gotoMat[presentState][ch] == -1)
// increasing state
gotoMat[presentState][ch] = state++;
presentState = gotoMat[presentState][ch];
}
// adding current word in the output
output[presentState] |= (1 << i);
}
// if ch is not directly connected to root node
for (int ch = 0; ch < MAXC; ++ch)
if (gotoMat[0][ch] == -1)
gotoMat[0][ch] = 0;
// node goes to previous state when fails
for (int ch = 0; ch < MAXC; ++ch) {
if (gotoMat[0][ch] != 0) {
fail[gotoMat[0][ch]] = 0;
// adding next level node to the queue
int q[MAXS], front = 0, rear = 0;
q[rear++] = gotoMat[0][ch];
while (front != rear) {
// removing front node
int state = q[front++];
for (int ch = 0; ch <= MAXC; ++ch) {
// if goto state is present
if (gotoMat[state][ch] != -1) {
int failure = fail[state];
// find deepest node with proper suffix
while (gotoMat[failure][ch] == -1)
failure = fail[failure];
failure = gotoMat[failure][ch];
fail[gotoMat[state][ch]] = failure;
// Merging output values
output[gotoMat[state][ch]] |= output[failure];
// adding next level node to the queue
q[rear++] = gotoMat[state][ch];
}
}
}
}
}
return state;
}
int getNextState(int presentState, char nextChar) {
int answer = presentState;
int ch = nextChar - 'a'; //subtract ascii of 'a'
// if go to is not found, use failure function
while (gotoMat[answer][ch] == -1)
answer = fail[answer];
return gotoMat[answer][ch];
}
void patternSearch(char* arr[], int size, char* text) {
buildTree(arr, size); // make the trie structure
int presentState = 0; // make current state as 0
// to find all occurances of pattern
for (int i = 0; i < strlen(text); i++) {
presentState = getNextState(presentState, text[i]);
// matching found and print words
for (int j = 0; j < size; ++j) {
if (output[presentState] & (1 << j)) {
printf("Word %s location: %zu\n", arr[j], i - strlen(arr[j]) + 1);
}
}
}
}
int main() {
char* arr[] = {"their", "there", "answer", "any", "bye"};
char* text = "isthereanyanswerokgoodbye";
int k = sizeof(arr)/sizeof(arr[0]);
patternSearch(arr, k, text);
return 0;
}
#include <iostream>
#include <queue>
#define MAXS 500 // sum of the length of all patterns
#define MAXC 26 // as 26 letters in alphabet
using namespace std;
int output[MAXS];
int fail[MAXS];
int gotoMat[MAXS][MAXC];
int buildTree(string array[], int size) {
for(int i = 0; i<MAXS; i++)
output[i] = 0; // all element of output array is 0
for(int i = 0; i<MAXS; i++)
fail[i] = -1; // all element of failure array is -1
for(int i = 0; i<MAXS; i++)
for(int j = 0; j<MAXC; j++)
gotoMat[i][j] = -1; // all element of goto matrix is -1
// initial state
int state = 1;
// make trie for all pattern in array
for (int i = 0; i < size; i++) {
string word = array[i];
int presentState = 0;
// adding pattern
for (int j = 0; j < word.size(); ++j) {
int ch = word[j] - 'a';
if (gotoMat[presentState][ch] == -1)
// increasing state
gotoMat[presentState][ch] = state++;
presentState = gotoMat[presentState][ch];
}
// adding current word in the output
output[presentState] |= (1 << i);
}
// if ch is not directly connected to root node
for (int ch = 0; ch < MAXC; ++ch)
if (gotoMat[0][ch] == -1)
gotoMat[0][ch] = 0;
queue<int> q;
// node goes to previous state when fails
for (int ch = 0; ch < MAXC; ++ch) {
if (gotoMat[0][ch] != 0) {
fail[gotoMat[0][ch]] = 0;
q.push(gotoMat[0][ch]);
}
}
while (q.size()) {
// removing front node
int state = q.front();
q.pop();
for (int ch = 0; ch <= MAXC; ++ch) {
// if goto state is present
if (gotoMat[state][ch] != -1) {
int failure = fail[state];
// find deepest node with proper suffix
while (gotoMat[failure][ch] == -1)
failure = fail[failure];
failure = gotoMat[failure][ch];
fail[gotoMat[state][ch]] = failure;
// Merging output values
output[gotoMat[state][ch]] |= output[failure];
// adding next level node to the queue
q.push(gotoMat[state][ch]);
}
}
}
return state;
}
int getNextState(int presentState, char nextChar) {
int answer = presentState;
int ch = nextChar - 'a'; //subtract ascii of 'a'
// if go to is not found, use failure function
while (gotoMat[answer][ch] == -1)
answer = fail[answer];
return gotoMat[answer][ch];
}
void patternSearch(string arr[], int size, string text) {
buildTree(arr, size); // make the trie structure
int presentState = 0; // make current state as 0
// to find all occurances of pattern
for (int i = 0; i < text.size(); i++) {
presentState = getNextState(presentState, text[i]);
// matching found and print words
for (int j = 0; j < size; ++j) {
if (output[presentState] & (1 << j)) {
cout << "Word " << arr[j] << " location: " << i - arr[j].size() + 1 << endl;
}
}
}
}
int main() {
string arr[] = {"their", "there", "answer", "any", "bye"};
string text = "isthereanyanswerokgoodbye";
int k = sizeof(arr)/sizeof(arr[0]);
patternSearch(arr, k, text);
return 0;
}
import java.util.*;
public class Main {
static final int MAXS = 500; // sum of the length of all patterns
static final int MAXC = 26; // as 26 letters in alphabet
static int[] output = new int[MAXS];
static int[] fail = new int[MAXS];
static int[][] gotoMat = new int[MAXS][MAXC];
// method to construct trie
static int buildTree(String[] array, int size) {
for(int i = 0; i<MAXS; i++)
output[i] = 0; // all element of output array is 0
for(int i = 0; i<MAXS; i++)
fail[i] = -1; // all element of failure array is -1
for(int i = 0; i<MAXS; i++)
for(int j = 0; j<MAXC; j++)
gotoMat[i][j] = -1; // all element of goto matrix is -1
// initial state
int state = 1;
// make trie for all pattern in array
for (int i = 0; i < size; i++) {
String word = array[i];
int presentState = 0;
// adding pattern
for (int j = 0; j < word.length(); ++j) {
int ch = word.charAt(j) - 'a';
if (gotoMat[presentState][ch] == -1)
// increasing state
gotoMat[presentState][ch] = state++;
presentState = gotoMat[presentState][ch];
}
// adding current word in the output
output[presentState] |= (1 << i);
}
// if ch is not directly connected to root node
for (int ch = 0; ch < MAXC; ++ch)
if (gotoMat[0][ch] == -1)
gotoMat[0][ch] = 0;
Queue<Integer> q = new LinkedList<>();
// node goes to previous state when fails
for (int ch = 0; ch < MAXC; ++ch) {
if (gotoMat[0][ch] != 0) {
fail[gotoMat[0][ch]] = 0;
q.add(gotoMat[0][ch]);
}
}
while (!q.isEmpty()) {
// removing front node
state = q.poll();
for (int ch = 0; ch < MAXC; ++ch) {
// if goto state is present
if (gotoMat[state][ch] != -1) {
int failure = fail[state];
// find deepest node with proper suffix
while (gotoMat[failure][ch] == -1)
failure = fail[failure];
failure = gotoMat[failure][ch];
fail[gotoMat[state][ch]] = failure;
// Merging output values
output[gotoMat[state][ch]] |= output[failure];
// adding next level node to the queue
q.add(gotoMat[state][ch]);
}
}
}
return state;
}
static int getNextState(int presentState, char nextChar) {
int answer = presentState;
int ch = nextChar - 'a'; //subtract ascii of 'a'
// if go to is not found, use failure function
while (gotoMat[answer][ch] == -1)
answer = fail[answer];
return gotoMat[answer][ch];
}
static void patternSearch(String[] arr, int size, String text) {
buildTree(arr, size); // make the trie structure
int presentState = 0; // make current state as 0
// to find all occurances of pattern
for (int i = 0; i < text.length(); i++) {
presentState = getNextState(presentState, text.charAt(i));
// matching found and print words
for (int j = 0; j < size; ++j) {
if ((output[presentState] & (1 << j)) != 0) {
System.out.println("Word " + arr[j] + " location: " + (i - arr[j].length() + 1));
}
}
}
}
public static void main(String[] args) {
String[] arr = {"their", "there", "answer", "any", "bye"};
String text = "isthereanyanswerokgoodbye";
int k = arr.length;
patternSearch(arr, k, text);
}
}
from collections import deque
MAXS = 500 # sum of the length of all patterns
MAXC = 26 # as 26 letters in alphabet
output = [0]*MAXS
fail = [-1]*MAXS
gotoMat = [[-1]*MAXC for _ in range(MAXS)]
# function to construct trie
def buildTree(array):
global output, fail, gotoMat
size = len(array)
# initial state
state = 1
# make trie for all pattern in array
for i in range(size):
word = array[i]
presentState = 0
# adding pattern
for j in range(len(word)):
ch = ord(word[j]) - ord('a')
if gotoMat[presentState][ch] == -1:
# increasing state
gotoMat[presentState][ch] = state
state += 1
presentState = gotoMat[presentState][ch]
# adding current word in the output
output[presentState] |= (1 << i)
# if ch is not directly connected to root node
for ch in range(MAXC):
if gotoMat[0][ch] == -1:
gotoMat[0][ch] = 0
q = deque()
# node goes to previous state when fails
for ch in range(MAXC):
if gotoMat[0][ch] != 0:
fail[gotoMat[0][ch]] = 0
q.append(gotoMat[0][ch])
while q:
# removing front node
state = q.popleft()
for ch in range(MAXC):
# if goto state is present
if gotoMat[state][ch] != -1:
failure = fail[state]
# find deepest node with proper suffix
while gotoMat[failure][ch] == -1:
failure = fail[failure]
failure = gotoMat[failure][ch]
fail[gotoMat[state][ch]] = failure
# Merging output values
output[gotoMat[state][ch]] |= output[failure]
# adding next level node to the queue
q.append(gotoMat[state][ch])
return state
def getNextState(presentState, nextChar):
answer = presentState
ch = ord(nextChar) - ord('a') #subtract ascii of 'a'
# if go to is not found, use failure function
while gotoMat[answer][ch] == -1:
answer = fail[answer]
return gotoMat[answer][ch]
def patternSearch(arr, text):
buildTree(arr) # make the trie structure
presentState = 0 # make current state as 0
size = len(arr)
# to find all occurances of pattern
for i in range(len(text)):
presentState = getNextState(presentState, text[i])
# matching found and print words
for j in range(size):
if (output[presentState] & (1 << j)) != 0:
print(f"Word {arr[j]} location: {i - len(arr[j]) + 1}")
def main():
arr = ["their", "there", "answer", "any", "bye"]
text = "isthereanyanswerokgoodbye"
patternSearch(arr, text)
if __name__ == "__main__":
main()
输出
Word there location: 2 Word any location: 7 Word answer location: 10 Word bye location: 22
广告