。"; // init stopwords StopWordsHandler sw = new StopWordsHandler(); // get StopWords List stopwords = sw.getStopWords(); // split String[] contents = paragraph.split(",|。||(|)"); List segment = new ArrayList(); for (String content : contents) { // ⽤空格分隔 String con = content " "; // 长度限制 String[] words = con.split("\s "); // 使用链表存储 LinkedList list = new LinkedList(Arrays.asList(words)); // list转array String[] arrays = list.toArray(new String[0]); // 对数组进⾏预处理 for (int i = 0; i < arrays.length; i ) { String string = arrays[i]; // 去除长度大于3的 if (string.length() > 2) { continue; } // 去除⾮中⽂ if (isContainsEnglish(string)) { arrays[i] = ""; } // 去除停顿词 if (stopwords.contains(string)) { arrays[i] = ""; } } // prefetch the remain int prefetch = 0; // 取出为""的 for (int i = 0; i < arrays.length; i ) { String string = arrays[i]; if (string == "") { prefetch ; } else { if (prefetch >= 2) { String[] temp = Arrays.copyOfRange(arrays, i - prefetch, i); String tp = getArrayString(temp); if (isContainsEnglish(tp)) { continue; } if (stopwords.contains(tp)) { break; } segment.add(tp); // prefetch = 0; i = i - prefetch 1; prefetch = 0; continue; } else { prefetch = 0; continue; } } } } // 输出 for (String string : segment) { System.out.println(string); } } public static boolean isContainsEnglish(String input) { String regex = ".*[a-zA-Z] .*"; Matcher m = Pattern.compile(regex).matcher(input); return m.matches(); } public static String getArrayString(String[] args) { char[] arr = { 0 }; StringBuffer sb = new StringBuffer(); for (String i : args) { i = i " "; sb.append(i); } String str = sb.toString(); str = str.substring(0, str.length() - 1); return str; }}
免責聲明:若本站收錄的資源侵犯了您的權益,我們會及時刪除侵權內容,謝謝合作!