| import nltk |
| import jieba |
| import sudachipy |
| import langid |
# Make sure the Punkt sentence-tokenizer model is installed. Checking the
# local NLTK data path first avoids contacting the download index on every
# import when the model is already present.
# NOTE(review): recent NLTK releases load the 'punkt_tab' resource for
# sent_tokenize instead of 'punkt' -- confirm which NLTK version is pinned.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Restrict language identification to the three languages the sentence
# splitter below knows how to handle.
langid.set_languages(['en', 'zh', 'ja'])
|
|
# Characters that close a Chinese sentence. Both fullwidth and ASCII forms of
# the exclamation/question marks are accepted, and the ellipsis is matched per
# character so that it is recognised whether the segmenter emits "……" as one
# token or as two separate "…" tokens.
_ZH_SENTENCE_ENDINGS = frozenset("。！？!?…")

# Sudachi dictionary shared across calls; loaded lazily on first Japanese input
# because constructing it is far more expensive than creating a tokenizer.
_sudachi_dictionary = None


def _get_sudachi_dictionary():
    """Return the shared Sudachi dictionary, loading it on first use."""
    global _sudachi_dictionary
    if _sudachi_dictionary is None:
        _sudachi_dictionary = sudachipy.Dictionary()
    return _sudachi_dictionary


def _is_zh_sentence_ending(segment):
    """Return True if *segment* consists solely of sentence-ending marks."""
    return bool(segment) and all(ch in _ZH_SENTENCE_ENDINGS for ch in segment)


def _split_chinese(text):
    """Split Chinese *text* into sentences using jieba segments."""
    sentences = []
    current = []
    ended = False  # True while `current` ends with sentence-ending marks.
    for segment in jieba.cut(text, cut_all=False):
        is_ending = _is_zh_sentence_ending(segment)
        # Flush only when regular text follows the closing punctuation, so a
        # run such as "！！" or "……" stays attached to its sentence instead of
        # becoming punctuation-only sentences.
        if ended and not is_ending:
            sentences.append("".join(current))
            current = []
        current.append(segment)
        ended = is_ending
    if current:
        sentences.append("".join(current))
    return sentences


def _split_japanese(text):
    """Split Japanese *text* into sentences at Sudachi full-stop tokens."""
    tokenizer = _get_sudachi_dictionary().create()
    sentences = []
    current = []
    for token in tokenizer.tokenize(text):
        current.append(token.surface())
        pos = token.part_of_speech()
        # A token tagged 補助記号 / 句点 (supplementary symbol / full stop)
        # closes the current sentence.
        if pos[0] == "補助記号" and pos[1] == "句点":
            sentences.append("".join(current))
            current = []
    if current:
        sentences.append("".join(current))
    return sentences


def split_text_into_sentences(text):
    """Split *text* into a list of sentences.

    The language is detected once with ``langid`` and the text is dispatched
    to a language-specific splitter:

    * ``"en"`` -- ``nltk.tokenize.sent_tokenize``.
    * ``"zh"`` -- jieba segmentation, splitting after runs of the marks
      ``。``, ``！``, ``？``, ``!``, ``?`` and ``…``.
    * ``"ja"`` -- Sudachi tokenization, splitting after tokens whose part of
      speech is 補助記号 / 句点.

    Sentence-ending punctuation stays attached to the sentence it closes, and
    any trailing text without closing punctuation is returned as a final
    sentence.

    Args:
        text: The string to split.

    Returns:
        A list of sentence strings; empty when *text* is empty or contains
        only whitespace.

    Raises:
        RuntimeError: If the detected language is not one of the three above.
            This is not expected while ``langid`` is restricted to
            ``en``/``zh``/``ja`` at module level.
    """
    # Nothing to classify or split; also avoids running language detection
    # on blank input.
    if not text or text.isspace():
        return []

    # Classify once instead of re-running the detector for every branch.
    language = langid.classify(text)[0]

    if language == "en":
        return nltk.tokenize.sent_tokenize(text)
    if language == "zh":
        return _split_chinese(text)
    if language == "ja":
        return _split_japanese(text)

    raise RuntimeError("It is impossible to reach here.")