sae wordpress 圖片北京網站優化站優化
文章目錄
- 敏感詞過濾
- 方案一:正則表達式
- 方案二:基于DFA算法的敏感詞過濾工具框架-sensitive-word
- springboot集成sensitive-word
- 步驟一:引入pom
- 步驟二:自定義配置
- 步驟三:自定義敏感詞+白名單
- 步驟四:核心方法測試
敏感詞過濾
敏感詞過濾通常是指從文本中檢測并移除或替換掉被認為是不適當、冒犯性或違反特定社區準則的詞匯。這個過程常用于在線平臺、論壇、社交媒體和聊天系統等,以確保交流環境的健康和積極。
方案一:正則表達式
使用正則表達式實現敏感詞過濾。只適合于敏感詞較少、文本量較少的場合,并且無法處理同音字、錯別字等。案例:
/**
 * Demo entry point: masks a fixed list of sensitive words in a sample sentence,
 * prints the result, then runs the sensitive-word framework demo.
 *
 * @param args unused command-line arguments
 */
public static void main(String[] args) {
    String text = "這是一個(gè)包含敏感詞匯的文本,例如色情、賭博等。";
    String[] sensitiveWords = {"色情", "賭博"};
    for (String word : sensitiveWords) {
        text = filterSensitiveWords(text, word);
    }
    System.out.println("過(guò)濾后的文本: " + text);
    testSensitiveWordFrame();
}

/**
 * Approach 1: regex-based sensitive-word filtering.
 *
 * <p>Only suitable when both the word list and the text are small, and it cannot
 * catch homophones or misspellings.
 *
 * <p>The word is matched literally: regex metacharacters in it (for example
 * {@code .}, {@code +}, {@code (}) have no special meaning, so a word such as
 * {@code "c++"} neither throws {@code PatternSyntaxException} nor over-matches.
 *
 * @param text          the text to filter
 * @param sensitiveWord the word to mask; when {@code null} or empty the text is returned unchanged
 * @return {@code text} with every occurrence of {@code sensitiveWord} replaced by {@code "***"}
 */
public static String filterSensitiveWords(String text, String sensitiveWord) {
    // An empty pattern matches at every position and would insert "***" between all characters.
    if (sensitiveWord == null || sensitiveWord.isEmpty()) {
        return text;
    }
    Pattern pattern = Pattern.compile(sensitiveWord, Pattern.LITERAL);
    Matcher matcher = pattern.matcher(text);
    return matcher.replaceAll("***");
}
方案二:基于DFA算法的敏感詞過濾工具框架-sensitive-word
* 6W+ 詞庫,且不斷優化更新
* 基于 DFA 算法,性能較好
* 基于 fluent-api 實現,使用優雅簡潔
* 支持敏感詞的判斷、返回、脫敏等常見操作
* 支持全角半角互換
* 支持英文大小寫互換
* 支持數字常見形式的互換
* 支持中文繁簡體互換
* 支持英文常見形式的互換
* 支持用戶自定義敏感詞和白名單
* 支持數據的動態更新,實時生效
springboot集成sensitive-word
步驟一:引入pom
<!-- sensitive-word: the DFA-based sensitive-word filtering library used in the steps below -->
<dependency><groupId>com.github.houbb</groupId><artifactId>sensitive-word</artifactId><version>0.2.0</version>
</dependency>
步驟二:自定義配置
@Configuration
public class MySensitiveWordBs {@Autowiredprivate MyWordAllow myWordAllow;@Autowiredprivate MyWordDeny myWordDeny;@Autowiredprivate MyWordReplace myWordReplace;/*** 初始化引導(dǎo)類(lèi)** @return 初始化引導(dǎo)類(lèi)* @since 1.0.0*/@Beanpublic SensitiveWordBs sensitiveWordBs() {SensitiveWordBs sensitiveWordBs = SensitiveWordBs.newInstance()
// .wordAllow(WordAllows.chains(WordAllows.defaults(), myWordAllow)) // 設(shè)置多個(gè)敏感詞,系統(tǒng)默認(rèn)和自定義
// .wordDeny(WordDenys.chains(WordDenys.defaults(), myWordDeny)) // 設(shè)置多個(gè)敏感詞,系統(tǒng)默認(rèn)和自定義.wordAllow(WordAllows.chains(myWordAllow)) // 自定義.wordDeny(WordDenys.chains(myWordDeny)) // 自定義.wordReplace(myWordReplace) // 自定義替換規(guī)則.ignoreCase(true) // 忽略大小寫(xiě).ignoreWidth(true) // 忽略半角圓角.ignoreNumStyle(true) // 忽略數(shù)字的寫(xiě)法.ignoreChineseStyle(true) // 忽略中文的書(shū)寫(xiě)格式.ignoreEnglishStyle(true) // 忽略英文的書(shū)寫(xiě)格式.ignoreRepeat(true) // 忽略重復(fù)詞.enableNumCheck(true) // 是否啟用數(shù)字檢測(cè)。默認(rèn)連續(xù) 8 位數(shù)字認(rèn)為是敏感詞.enableEmailCheck(true) // 是有啟用郵箱檢測(cè).enableUrlCheck(true) // 是否啟用鏈接檢測(cè).init();return sensitiveWordBs;}
}
步驟三:自定義敏感詞+白名單
/*** 自定義非敏感詞* 注意每一行為一個(gè)非敏感詞,單行不能只包括空格,否則,也會(huì)把空格識(shí)別為非敏感詞*/
@Component
@Slf4j
public class MyWordAllow implements IWordAllow {@Overridepublic List<String> allow() {List<String> allowWords = new ArrayList<>();try {ClassPathResource resource = new ClassPathResource("myAllowWords.txt");Path myAllowWordsPath = Paths.get(resource.getUrl().toURI());allowWords = Files.readAllLines(myAllowWordsPath, StandardCharsets.UTF_8);} catch (IOException ioException) {log.error("讀取非敏感詞文件錯(cuò)誤:{}", ioException);} catch (URISyntaxException e) {throw new RuntimeException(e);}return allowWords;}
}
@Component
@Slf4j
public class MyWordDeny implements IWordDeny {@Overridepublic List<String> deny() {List<String> denyWords = new ArrayList<>();try {ClassPathResource resource = new ClassPathResource("myDenyWords.txt");Path myAllowWordsPath = Paths.get(resource.getUrl().toURI());denyWords = Files.readAllLines(myAllowWordsPath, StandardCharsets.UTF_8);} catch (IOException ioException) {log.error("讀取敏感詞文件錯(cuò)誤:{}", ioException);} catch (URISyntaxException e) {throw new RuntimeException(e);}return denyWords;}
}
/*** 自定義敏感詞對(duì)應(yīng)的替換值.* 場(chǎng)景說(shuō)明:有時(shí)候我們希望不同的敏感詞有不同的替換結(jié)果。比如【游戲】替換為【電子競(jìng)技】,【失業(yè)】替換為【靈活就業(yè)】。*/
@Configuration
public class MyWordReplace implements IWordReplace {@Overridepublic void replace(StringBuilder stringBuilder, final char[] rawChars, IWordResult wordResult, IWordContext wordContext) {String sensitiveWord = InnerWordCharUtils.getString(rawChars, wordResult);if ("zhupeng".equals(sensitiveWord)) {stringBuilder.append("朱鵬");} else {// 其他默認(rèn)使用 * 代替int wordLength = wordResult.endIndex() - wordResult.startIndex();for (int i = 0; i < wordLength; i++) {stringBuilder.append('-');}}}
}
步驟四:核心方法測試
public class SensitiveWordController {@Autowiredprivate MyWordReplace myWordReplace;@Autowiredprivate SensitiveWordBs sensitiveWordBs;private static final String text = "五星紅旗迎風(fēng)飄揚(yáng),毛主席的畫(huà)像屹立在天安門(mén)前,zhuzhuhzu";@GetMapping("/pattern")public void testSensitiveWord2() {String text = "這是一個(gè)包含敏感詞匯的文本,例如色情、賭博等。";String[] sensitiveWords = {"色情", "賭博"};for (String word : sensitiveWords) {text = filterSensitiveWords(text, word);}System.out.println("過(guò)濾后的文本: " + text);}/*** 方案二:基于DFA算法的敏感詞過(guò)濾工具框架-sensitive-word:https://github.com/houbb/sensitive-word* 6W+ 詞庫(kù),且不斷優(yōu)化更新* 基于 DFA 算法,性能較好* 基于 fluent-api 實(shí)現(xiàn),使用優(yōu)雅簡(jiǎn)潔* 支持敏感詞的判斷、返回、脫敏等常見(jiàn)操作* 支持全角半角互換* 支持英文大小寫(xiě)互換* 支持?jǐn)?shù)字常見(jiàn)形式的互換* 支持中文繁簡(jiǎn)體互換* 支持英文常見(jiàn)形式的互換* 支持用戶(hù)自定義敏感詞和白名單* 支持?jǐn)?shù)據(jù)的數(shù)據(jù)動(dòng)態(tài)更新,實(shí)時(shí)生效*/@GetMapping("/filter")public void testSensitiveWord() {System.out.println("SensitiveWordHelper.contains(text) = " + SensitiveWordHelper.contains(text));System.out.println("SensitiveWordHelper.findAll(text) = " + SensitiveWordHelper.findAll(text));System.out.println("SensitiveWordHelper.replace(text,myWordReplace) = " + SensitiveWordHelper.replace(text, myWordReplace));// 如果自定義敏感詞,不要使用SensitiveWordHelper的方法,要使用SensitiveWordBsSystem.out.println("sensitiveWordBs.contains(text) = " + sensitiveWordBs.contains(text));System.out.println("sensitiveWordBs.findAll(text) = " + sensitiveWordBs.findAll(text));System.out.println("sensitiveWordBs.replace(text) = " + sensitiveWordBs.replace(text));}
}