当前位置: 首页 > 图灵资讯 > 技术篇> java过滤关键字(DFA算法)

java过滤关键字(DFA算法)

来源:图灵教育
时间:2023-07-02 17:06:56


项目中有需要过滤关键字的地方,在这里自己记录一下。DFA 版本的核心逻辑只用到 JDK 自带的类(示例中从类路径读取词库文件的部分用到了 Guava 和 Spring),可以直接执行 main 方法。如果在项目中具体使用,一般是在项目启动时加载关键字文件,用静态 Map 存储,之后直接调用。下面有两种方法:一种是 DFA 算法,推荐使用;另一种是用 ik 分词器分词后再匹配,不推荐,因为效率低、匹配度低。


1:DFA算法(推荐)

package com.itcorey;

import com.google.common.io.Files;
import org.springframework.core.io.ClassPathResource;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.*;

/**
 * Sensitive-word utility built on a character trie (the "DFA" approach).
 *
 * <p>{@link #init(Set)} turns a lexicon into nested maps: every character of a
 * word is a key whose value is the map for the next character, and each node
 * also carries an {@code "isEnd"} entry ({@code "1"} when a lexicon word ends
 * at that node, {@code "0"} otherwise). The query methods walk that structure
 * from every start offset of the text.
 *
 * <p>{@link #init(Set)} must be called before any query method; until then
 * {@link #sensitiveWordMap} is {@code null}.
 *
 * <p>Matches shorter than two characters are discarded, so single-character
 * lexicon entries are never reported.
 *
 * @author sam
 * @since 2017/9/4
 */
public class SensitiveWordUtil {

    /**
     * Minimum-match rule: stop at the first complete word.
     * Lexicon ["中国", "中国人"], text "我是中国人" gives 我是[中国]人.
     */
    public static final int MinMatchTYpe = 1;

    /**
     * Maximum-match rule: keep going and report the longest complete word.
     * Lexicon ["中国", "中国人"], text "我是中国人" gives 我是[中国人].
     */
    public static final int MaxMatchType = 2;

    /** Root of the trie; keys are {@link Character}s, values are child maps. */
    @SuppressWarnings("rawtypes")
    public static HashMap sensitiveWordMap;

    /** Key of the per-node marker telling whether a word ends at that node. */
    private static final String END_KEY = "isEnd";
    private static final String END_YES = "1";
    private static final String END_NO = "0";

    /** Matches shorter than this are discarded. */
    private static final int MIN_WORD_LENGTH = 2;

    /**
     * Builds the trie from the given lexicon and installs it as the active one.
     *
     * @param sensitiveWordSet the lexicon
     */
    public static synchronized void init(Set<String> sensitiveWordSet) {
        initSensitiveWordMap(sensitiveWordSet);
    }

    /**
     * Builds the trie in a local map and assigns {@link #sensitiveWordMap} only
     * once it is complete, so the field never refers to a half-built structure.
     * {@code null} and empty entries are skipped.
     *
     * @param sensitiveWordSet the lexicon
     */
    @SuppressWarnings("unchecked")
    private static void initSensitiveWordMap(Set<String> sensitiveWordSet) {
        // Presize with the lexicon size to limit rehashing of the root map.
        HashMap<Object, Object> root = new HashMap<>(sensitiveWordSet.size());
        for (String word : sensitiveWordSet) {
            if (word == null || word.isEmpty()) {
                continue;
            }
            Map<Object, Object> node = root;
            for (int i = 0; i < word.length(); i++) {
                Character ch = word.charAt(i);
                Map<Object, Object> child = (Map<Object, Object>) node.get(ch);
                if (child == null) {
                    // New node: not the end of a word until proven otherwise.
                    child = new HashMap<>();
                    child.put(END_KEY, END_NO);
                    node.put(ch, child);
                }
                node = child;
            }
            // The node reached by the last character terminates a word.
            node.put(END_KEY, END_YES);
        }
        sensitiveWordMap = root;
    }

    /**
     * Tells whether the text contains at least one sensitive word.
     *
     * @param txt       the text to scan; {@code null} yields {@code false}
     * @param matchType {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return {@code true} if a sensitive word is found, {@code false} otherwise
     */
    public static boolean contains(String txt, int matchType) {
        if (txt == null) {
            return false;
        }
        for (int i = 0; i < txt.length(); i++) {
            if (checkSensitiveWord(txt, i, matchType) > 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the text contains at least one sensitive word, using the
     * maximum-match rule.
     *
     * @param txt the text to scan
     * @return {@code true} if a sensitive word is found, {@code false} otherwise
     */
    public static boolean contains(String txt) {
        return contains(txt, MaxMatchType);
    }

    /**
     * Collects the distinct sensitive words found in the text.
     *
     * @param txt       the text to scan; {@code null} yields an empty set
     * @param matchType {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return the distinct matched words, never {@code null}
     */
    public static Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> found = new HashSet<>();
        if (txt == null) {
            return found;
        }
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                found.add(txt.substring(i, i + length));
                i += length; // resume right after the match
            } else {
                i++;
            }
        }
        return found;
    }

    /**
     * Collects the distinct sensitive words found in the text, using the
     * maximum-match rule.
     *
     * @param txt the text to scan
     * @return the distinct matched words, never {@code null}
     */
    public static Set<String> getSensitiveWord(String txt) {
        return getSensitiveWord(txt, MaxMatchType);
    }

    /**
     * Masks every matched word character by character.
     * Text 我爱中国人, word 中国人, replacement {@code *} gives 我爱***.
     *
     * <p>The text is rewritten in one left-to-right pass over the match
     * positions, so words are treated literally (never as regular expressions)
     * and the result does not depend on set iteration order.
     *
     * @param txt         the text; {@code null} or empty text is returned unchanged
     * @param replaceChar the masking character
     * @param matchType   {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return the masked text
     */
    public static String replaceSensitiveWord(String txt, char replaceChar, int matchType) {
        if (txt == null || txt.isEmpty()) {
            return txt;
        }
        StringBuilder out = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                for (int k = 0; k < length; k++) {
                    out.append(replaceChar);
                }
                i += length;
            } else {
                out.append(txt.charAt(i));
                i++;
            }
        }
        return out.toString();
    }

    /**
     * Masks every matched word character by character, using the
     * maximum-match rule.
     *
     * @param txt         the text
     * @param replaceChar the masking character
     * @return the masked text
     */
    public static String replaceSensitiveWord(String txt, char replaceChar) {
        return replaceSensitiveWord(txt, replaceChar, MaxMatchType);
    }

    /**
     * Replaces every matched word with a fixed string.
     * Text 我爱中国人, word 中国人, replacement [屏蔽] gives 我爱[屏蔽].
     *
     * <p>The replacement is inserted literally; {@code $} and {@code \} have no
     * special meaning.
     *
     * @param txt        the text; {@code null} or empty text is returned unchanged
     * @param replaceStr the replacement string, must not be {@code null}
     * @param matchType  {@link #MinMatchTYpe} or {@link #MaxMatchType}
     * @return the rewritten text
     * @throws NullPointerException if {@code replaceStr} is {@code null}
     */
    public static String replaceSensitiveWord(String txt, String replaceStr, int matchType) {
        Objects.requireNonNull(replaceStr, "replaceStr");
        if (txt == null || txt.isEmpty()) {
            return txt;
        }
        StringBuilder out = new StringBuilder(txt.length());
        int i = 0;
        while (i < txt.length()) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                out.append(replaceStr);
                i += length;
            } else {
                out.append(txt.charAt(i));
                i++;
            }
        }
        return out.toString();
    }

    /**
     * Replaces every matched word with a fixed string, using the
     * maximum-match rule.
     *
     * @param txt        the text
     * @param replaceStr the replacement string
     * @return the rewritten text
     */
    public static String replaceSensitiveWord(String txt, String replaceStr) {
        return replaceSensitiveWord(txt, replaceStr, MaxMatchType);
    }

    /**
     * Walks the trie starting at {@code beginIndex} and returns the length of
     * the sensitive word that starts there.
     *
     * <p>Only the position of the last node flagged as a word end counts, so a
     * partial walk into a longer lexicon entry is never reported as a match.
     *
     * @param txt        the text
     * @param beginIndex offset at which the word must start
     * @param matchType  {@link #MinMatchTYpe} stops at the first complete word;
     *                   any other value keeps going for the longest one
     * @return the matched length, or 0 when no word of at least two characters
     *         starts at {@code beginIndex}
     */
    private static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        int matchedLength = 0;
        Map<?, ?> node = sensitiveWordMap;
        for (int i = beginIndex; i < txt.length(); i++) {
            node = (Map<?, ?>) node.get(Character.valueOf(txt.charAt(i)));
            if (node == null) {
                break; // no lexicon word continues with this character
            }
            if (END_YES.equals(node.get(END_KEY))) {
                matchedLength = i - beginIndex + 1;
                if (MinMatchTYpe == matchType) {
                    break;
                }
            }
        }
        return matchedLength < MIN_WORD_LENGTH ? 0 : matchedLength;
    }

    /**
     * Loads the lexicon from the class-path resource 短信模板审核关键词.txt (read
     * as GBK, one word per line), initialises the trie and prints the size of
     * its root map.
     *
     * <p>NOTE(review): {@code ClassPathResource.getFile()} presumably fails when
     * the resource is packaged inside a jar; confirm against the deployment.
     *
     * @param s not used by this method
     * @throws IOException if the resource cannot be resolved or read
     */
    public static void getRank(String s) throws IOException {
        ClassPathResource classPathResource = new ClassPathResource("短信模板审核关键词.txt");
        List<String> list = Files.readLines(classPathResource.getFile(), Charset.forName("gbk"));
        // Initialise the lexicon.
        SensitiveWordUtil.init(new HashSet<String>(list));
        System.out.println("敏感词的数量:" + SensitiveWordUtil.sensitiveWordMap.size());
    }

    /**
     * Demo: initialises a two-word lexicon and exercises every public method on
     * a sample sentence.
     *
     * @param args ignored
     * @throws IOException declared for parity with the file-based variant below
     */
    public static void main(String[] args) throws IOException {
        // File-based variant used in the author's own project:
        // ClassPathResource classPathResource = new ClassPathResource("关键词.txt");
        // List<String> list = Files.readLines(classPathResource.getFile(), Charset.forName("gbk"));
        List<String> list = Arrays.asList("关键词", "加薇新");
        // Initialise the lexicon.
        SensitiveWordUtil.init(new HashSet<String>(list));
        System.out.println("敏感词的数量:" + SensitiveWordUtil.sensitiveWordMap.size());
        String string = "太多的悲伤可能仅限于饲养基地 屏幕上的情节。"
                + "然后我们在网报【圣诞红包】网审平台上的扮演加薇欣跟随主人公的喜红客联盟 怒哀乐而过于牵强地将自己的情绪附加到银幕情节中,然后感动得流泪,"
                + "难过的时候,躺在某个人的关键词里,尽情地解释自己的心,或者在线报纸和在线审计平台上,手机店主V相信一个婊子,一杯红酒,一部电影,晚上 深人静的夜晚,关上电话,静静地发呆。";
        System.out.println("待测语句字数:" + string.length());

        // Does the sentence contain a sensitive word?
        boolean result = SensitiveWordUtil.contains(string);
        System.out.println(result);
        result = SensitiveWordUtil.contains(string, SensitiveWordUtil.MinMatchTYpe);
        System.out.println(result);

        // Which sensitive words does it contain?
        Set<String> set = SensitiveWordUtil.getSensitiveWord(string);
        System.out.println("句子中包含的敏感词数为:" + set.size() + "。包含:" + set);
        set = SensitiveWordUtil.getSensitiveWord(string, SensitiveWordUtil.MinMatchTYpe);
        System.out.println("语句中包含敏感词的数量为:" + set.size() + "。包含:" + set);

        // Replace the sensitive words.
        String filterStr = SensitiveWordUtil.replaceSensitiveWord(string, '*');
        System.out.println(filterStr);
        filterStr = SensitiveWordUtil.replaceSensitiveWord(string, '*', SensitiveWordUtil.MinMatchTYpe);
        System.out.println(filterStr);
        String filterStr2 = SensitiveWordUtil.replaceSensitiveWord(string, "[*敏感词*]");
        System.out.println(filterStr2);
        filterStr2 = SensitiveWordUtil.replaceSensitiveWord(string, "[*敏感词*]", SensitiveWordUtil.MinMatchTYpe);
        System.out.println(filterStr2);
    }
}

2:

使用 ik 分词器分词(不推荐):需要先在 pom 文件中引入两个依赖,一个是 ik 分词器,一个是 Google Guava。

<!-- IK Analyzer: supplies the org.wltea.analyzer classes used for word segmentation. -->
<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>

<!-- Guava: supplies com.google.common.io.Files used to read the word list. -->
<!-- https://mvnrepository.com/artifact/com.google.guava/guava -->
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>27.0.1-jre</version>
</dependency>

package com.itcorey;

import com.google.common.io.Files;
import org.springframework.core.io.ClassPathResource;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.*;

/**
 * Sensitive-word utility that segments the text with IKAnalyzer and looks each
 * resulting token up in the lexicon.
 *
 * <p>{@link #init(Set)} must be called before any query method; until then
 * {@link #sensitiveWordMap} is {@code null}.
 *
 * @author sam
 * @since 2017/9/4
 */
public class SensitiveWordUtil2 {

    /** The lexicon; every word is stored as both key and value. */
    @SuppressWarnings("rawtypes")
    public static HashMap sensitiveWordMap;

    /**
     * Loads the lexicon. The map is filled locally and assigned to
     * {@link #sensitiveWordMap} only once it is complete.
     *
     * @param sensitiveWordSet the lexicon
     */
    public static synchronized void init(Set<String> sensitiveWordSet) {
        // Presize with the lexicon size to limit rehashing.
        HashMap<String, String> words = new HashMap<>(sensitiveWordSet.size());
        for (String sensitiveWord : sensitiveWordSet) {
            words.put(sensitiveWord, sensitiveWord);
        }
        sensitiveWordMap = words;
    }

    /**
     * Tells whether any token of the segmented text is in the lexicon.
     *
     * @param txt the text to scan
     * @return {@code true} if a token is a sensitive word, {@code false} otherwise
     * @throws Exception if segmentation fails
     */
    public static boolean contains(String txt) throws Exception {
        for (String word : segment(txt)) {
            if (sensitiveWordMap.get(word) != null) {
                return true;
            }
        }
        return false;
    }

    /**
     * Collects the distinct tokens of the segmented text that are in the lexicon.
     *
     * @param txt the text to scan
     * @return the distinct matched words, never {@code null}
     * @throws IOException if segmentation fails
     */
    public static Set<String> getSensitiveWord(String txt) throws IOException {
        Set<String> found = new HashSet<>();
        for (String word : segment(txt)) {
            if (sensitiveWordMap.get(word) != null) {
                found.add(word);
            }
        }
        return found;
    }

    /**
     * Masks every occurrence of each matched word character by character.
     * Text 我爱中国人, word 中国人, replacement {@code *} gives 我爱***.
     *
     * <p>Words are replaced as literal text, never as regular expressions.
     *
     * @param txt         the text
     * @param replaceChar the masking character
     * @return the masked text
     * @throws IOException if segmentation fails
     */
    public static String replaceSensitiveWord(String txt, char replaceChar) throws IOException {
        String resultTxt = txt;
        for (String sensitiveWord : getSensitiveWord(txt)) {
            String mask = getReplaceChars(replaceChar, sensitiveWord.length());
            resultTxt = resultTxt.replace(sensitiveWord, mask);
        }
        return resultTxt;
    }

    /**
     * Replaces every occurrence of each matched word with a fixed string.
     * Text 我爱中国人, word 中国人, replacement [屏蔽] gives 我爱[屏蔽].
     *
     * <p>Both the word and the replacement are treated as literal text.
     *
     * @param txt        the text
     * @param replaceStr the replacement string
     * @return the rewritten text
     * @throws IOException if segmentation fails
     */
    public static String replaceSensitiveWord(String txt, String replaceStr) throws IOException {
        String resultTxt = txt;
        for (String sensitiveWord : getSensitiveWord(txt)) {
            resultTxt = resultTxt.replace(sensitiveWord, replaceStr);
        }
        return resultTxt;
    }

    /**
     * Builds a string made of {@code replaceChar} repeated {@code length}
     * times; the result always holds at least one character.
     *
     * @param replaceChar the character to repeat
     * @param length      the requested length
     * @return the repeated-character string
     */
    private static String getReplaceChars(char replaceChar, int length) {
        StringBuilder mask = new StringBuilder(Math.max(length, 1));
        mask.append(replaceChar);
        for (int i = 1; i < length; i++) {
            mask.append(replaceChar);
        }
        return mask.toString();
    }

    /**
     * Splits the text into tokens with {@code IKSegmenter}, constructed with
     * {@code true} as its second argument.
     *
     * @param text the sentence to segment
     * @return the tokens in the order the segmenter produced them
     * @throws IOException if the segmenter fails to read the text
     */
    private static List<String> segment(String text) throws IOException {
        List<String> tokens = new ArrayList<>();
        try (StringReader reader = new StringReader(text)) {
            IKSegmenter ik = new IKSegmenter(reader, true);
            Lexeme lex;
            while ((lex = ik.next()) != null) {
                tokens.add(lex.getLexemeText());
            }
        }
        return tokens;
    }

    /**
     * Demo: loads the lexicon from the class-path resource 短信模板审核关键词.txt
     * (read as GBK, one word per line) and exercises every public method on a
     * sample sentence.
     *
     * @param args ignored
     * @throws IOException if the lexicon cannot be read or segmentation fails
     */
    public static void main(String[] args) throws IOException {
        ClassPathResource classPathResource = new ClassPathResource("短信模板审核关键词.txt");
        List<String> list = Files.readLines(classPathResource.getFile(), Charset.forName("gbk"));
        // Initialise the lexicon.
        SensitiveWordUtil2.init(new HashSet<String>(list));

        // The target string to process.
        System.out.println("敏感词的数量:" + SensitiveWordUtil2.sensitiveWordMap.size());
        String string = "太多的悲伤可能仅限于饲养基地 屏幕上的情节。"
                + "然后 我们加薇薇芯号的颜色是跟随主人公的喜红客联盟 怒哀乐而过于牵强地将自己的情绪附加到银幕情节中,然后感动得流泪,"
                + "难过的时候躺在某个人的怀里,把你拉进群里的手机卡复制器,一个婊子,一杯红酒,一部电影,晚上 深人静的夜晚,关掉电话,静静地发呆。";
        System.out.println("待测语句字数:" + string.length());

        // Does the sentence contain a sensitive word?
        try {
            boolean result = SensitiveWordUtil2.contains(string);
            System.out.println(result);
        } catch (Exception e) {
            e.printStackTrace();
        }

        // Which sensitive words does it contain?
        Set<String> set = SensitiveWordUtil2.getSensitiveWord(string);
        System.out.println("语句中包含敏感词的数量为:" + set.size() + "。包含:" + set);

        // Replace the sensitive words.
        String filterStr = SensitiveWordUtil2.replaceSensitiveWord(string, '*');
        System.out.println(filterStr);
        String filterStr2 = SensitiveWordUtil2.replaceSensitiveWord(string, "[*敏感词*]");
        System.out.println(filterStr2);
    }
}