需求
一个应用系统包含了搜索框、评论输入、聊天对话等功能,用户可能会在这些地方输入一些敏感词/违禁词,系统需要对敏感词/违禁词进行检查和处理(比如把词替换成*)
电商应用中,搜索结果页是根据用户的输入内容来呈现的,有些特殊标签是在搜索内容命中了词库才会展示
运营给了1000个标签id,要求匹配到这些id时做一些逻辑处理
什么是DFA算法
我们已经有了一个词库,很容易就能写出下面的代码,针对每次输入,遍历整个词库进行查找匹配
const searchWords = A_CONFIG.SEARCH_WORDS || []
return searchWords.some(word => word.toLowerCase() === userWord.toLowerCase())
但是每次都要遍历整个词库,效率太低。
DFA(Deterministic Finite Automaton):确定有限状态自动机,假设词库里有王八蛋和王八羔子两个词,那么构造的数据结构如下图:
{
"isEnd": false,
"王": {
"isEnd": false,
"八": {
"isEnd": false,
"蛋": { "isEnd": true },
"羔": {
"isEnd": false,
"子": { "isEnd": true }
}
}
}
}
我们只需要从构造好的数据结构中查找,比如用户输入的词是王八不好,其查找流程如下图:
js代码实现
// 根据词库来构造map
function initDirtyWord(words = []) {
const result = new Map()
let currMap = new Map()
for (let i = 0; i < words.length; i++) {
const word = words[i]
currMap = result
for (let j = 0; j < word.length; j++) {
const char = word[j]
let wordMap = currMap.get(char)
if (wordMap) {
currMap = wordMap
} else {
const tempMap = new Map()
tempMap.set('isEnd', false)
currMap.set(char, tempMap)
currMap = tempMap
}
if (j === word.length - 1) {
currMap.set('isEnd', true)
}
}
}
return result
}
// 根据map来查找过滤内容
function checkDirtyWord(content, index, dirtyWordMap) {
let currMap = dirtyWordMap
for(let i = index; i < content.length; i++) {
const char = content[i]
currMap = currMap.get(char)
if (currMap) {
if (currMap.get('isEnd')) {
return true
}
} else {
return false
}
}
return false
}
// @example
const dirtyMap = initDirtyWord(['王八蛋', '王八羔子'])
checkDirtyWord('王八不好', 0, dirtyMap)
性能比较
const searchWords = Array.from({ length: 10000 }).fill('111').concat('000') // 词库
const result = initDirtyWord(searchWords)
const start = process.hrtime()
checkDirtyWord('000', 0, result)
console.log('基于DFA: ', process.hrtime(start))
function test2(searchWords, userWord) {
return searchWords.some(word => word.toLowerCase() === userWord.toLowerCase())
}
const start2 = process.hrtime()
test2(searchWords, '000')
console.log('对照: ', process.hrtime(start2))
golang代码实现
package main
type WordMap struct {
isEnd bool
child map[rune]*WordMap
}
func createWordMap() *WordMap {
return &WordMap{isEnd: false, child: make(map[rune]*WordMap)}
}
func InitWordMap(words []string) *WordMap {
result := createWordMap()
var current *WordMap
for _, word := range words {
current = result
for _, char := range word {
if _, ok := current.child[char]; !ok {
newWordMap := createWordMap()
current.child[char] = newWordMap
}
current = current.child[char]
}
current.isEnd = true
}
return result
}
func CheckWord(content string, startIndex int, wordMap *WordMap ) bool {
currMap := wordMap
for _, char := range content[startIndex:] {
if val, ok := currMap.child[char]; ok {
if val.isEnd {
return true
}
currMap = val
} else {
return false
}
}
return false
}
// @example
func main () {
dirtyMap := InitWordMap([]string{"王八蛋", "王八羔子"})
CheckWord("王八不好", 0, dirtyMap)
}
性能比较
func test2(a []string, userWord string) bool {
userWordLower := strings.ToLower(userWord)
for _, word := range a {
if strings.ToLower(word) == userWordLower {
return true
}
}
return false
}
func main () {
searchWords := make([]string, 10000)
for i := range searchWords {
searchWords[i] = "111"
}
searchWords = append(searchWords, "000")
b := InitWordMap(searchWords)
start := time.Now()
CheckWord("000", 0, b)
elapsed := time.Since(start)
fmt.Println("基于DFA: ", elapsed.Nanoseconds())
start2 := time.Now()
test2(searchWords, "000")
elapsed2 := time.Since(start2)
fmt.Println("对照组: ", elapsed2.Nanoseconds())
}
延伸
输入场景,有时候还要考虑以下的情况:
把王八蛋替换为***
处理空格、换行符等空白字符
have是一个包含av的正常单词,不应该被匹配替换为h##e
作者:前端爆冲
链接:https://juejin.cn