Upload 2 files
- ForkLangSegment/LangSegment.py +538 -0
- ForkLangSegment/__init__.py +8 -0
ForkLangSegment/LangSegment.py
ADDED
@@ -0,0 +1,538 @@
"""
This file bundles language identification functions.

Modifications (fork): Copyright (c) 2021, Adrien Barbaresi.

Original code: Copyright (c) 2011 Marco Lui <saffsd@gmail.com>.
Based on research by Marco Lui and Tim Baldwin.

See LICENSE file for more info.
https://github.com/adbar/py3langid

Projects:
https://github.com/juntaosun/LangSegment
"""

import re
from collections import defaultdict

# import langid
import py3langid as langid
# pip install py3langid==0.2.2


# -----------------------------------
# Changelog: the new version segments words more accurately.
# -----------------------------------


# Word segmentation function:
# automatically identifies and splits the words (Chinese/English/Japanese/Korean) in an article or
# sentence by language, making the text better suited to TTS processing.
# This code was written for front-end multilingual mixed-text annotation in TTS projects, and for
# mixed multi-language training and inference.
# The results mainly target Chinese = zh, Japanese = ja, English = en and Korean = ko; mixed text
# in up to 97 different languages can actually be handled.

# ===========================================================================================================
# (1) Automatic segmentation: "韩语中的오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型"
# (2) Manual segmentation:    "你的名字叫<ja>佐々木?<ja>吗?"
# ===========================================================================================================


# Manual segmentation tag specification: <language tag>text content</language tag>
# ===========================================================================================================
# For manual segmentation, tags need to appear in pairs, e.g. "<ja>佐々木<ja>" or "<ja>佐々木</ja>".
# Error demonstration: "你的名字叫<ja>佐々木。" A single, unpaired <ja> tag in a sentence is ignored
# and not processed.
# ===========================================================================================================
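
# Usage sketch (illustrative; assumes the package imports under the name used in __init__.py):
#
#   import ForkLangSegment as LangSegment
#   for seg in LangSegment.getTexts("你的名字叫<ja>佐々木</ja>吗?"):
#       print(seg["lang"], seg["text"])
#
# Each result item has the shape {"lang": "<code>", "text": "<segment>"}; see the demo under
# __main__ at the bottom of this file for full output.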

class LangSegment():

    _text_cache = None
    _text_lasts = None
    _text_langs = None
    _text_waits = None  # pending ambiguous segments (referenced below but previously undeclared; added for consistency)
    _lang_count = None
    _lang_eos   = None

    # Customizable language matching tags; all of these forms are supported:
    # <zh>你好<zh> , <ja>佐々木</ja> , <en>OK<en> , <ko>오빠</ko>
    SYMBOLS_PATTERN = r'(<([a-zA-Z|-]*)>(.*?)<\/*[a-zA-Z|-]*>)'

    # Language filter group: the languages to keep. Languages not in the filter group are cleared,
    # so you can match whatever languages your TTS engine supports.
    Langfilters = ["zh", "en", "ja", "ko"]
    # Abbreviated filters are also supported; combine language codes freely, for example:
    # Langfilters = ["zh"]            # Chinese only
    # Langfilters = ["en"]            # English only
    # Langfilters = ["ja"]            # Japanese only
    # Langfilters = ["ko"]            # Korean only
    # Langfilters = ["zh_ja"]         # mixed Chinese-Japanese
    # Langfilters = ["zh_en"]         # mixed Chinese-English
    # Langfilters = ["ja_en"]         # mixed Japanese-English
    # Langfilters = ["zh_ko"]         # mixed Chinese-Korean
    # Langfilters = ["ja_ko"]         # mixed Japanese-Korean
    # Langfilters = ["en_ko"]         # mixed English-Korean
    # Langfilters = ["zh_ja_en"]      # mixed Chinese-Japanese-English
    # Langfilters = ["zh_ja_en_ko"]   # mixed Chinese-Japanese-English-Korean

    # For more filter combinations, combine as you like.
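    # A hedged equivalence sketch: because the filter check in _saveData below also tests
    # membership in filters[0], the abbreviated form and the list form behave the same:
    #   LangSegment.Langfilters = ["zh_ja"]      # keep only Chinese and Japanese
    #   LangSegment.Langfilters = ["zh", "ja"]   # equivalent list form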

    # DEFINITION
    PARSE_TAG = re.compile(r'(⑥\$\d+[\d]{6,}⑥)')
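    # Placeholder format produced by _pattern_symbols below: "⑥" + tag + 6-digit index + "⑥",
    # e.g. "⑥$1000000⑥". PARSE_TAG only matches the "$"-tagged channels; the numeric "00"
    # channel (see TAG_NUM in _parse_symbols) stays embedded in the text and is restored
    # later by _restore_number.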

    @staticmethod
    def _clears():
        LangSegment._text_cache = None
        LangSegment._text_lasts = None
        LangSegment._text_langs = None
        LangSegment._text_waits = None
        LangSegment._lang_count = None
        LangSegment._lang_eos   = None

    @staticmethod
    def _is_english_word(word):
        return bool(re.match(r'^[a-zA-Z]+$', word))

    @staticmethod
    def _is_chinese(word):
        for char in word:
            if '\u4e00' <= char <= '\u9fff':
                return True
        return False

    @staticmethod
    def _is_japanese_kana(word):
        pattern = re.compile(r'[\u3040-\u309F\u30A0-\u30FF]+')
        matches = pattern.findall(word)
        return len(matches) > 0

    @staticmethod
    def _insert_english_uppercase(word):
        modified_text = re.sub(r'(?<!\b)([A-Z])', r' \1', word)
        modified_text = modified_text.strip('-')
        return modified_text + " "
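    # Worked examples for _insert_english_uppercase (a space is inserted before each uppercase
    # letter that does not start a word, which matches the demo output at the bottom of the file):
    #   "iPhone" -> "i Phone " ; "AppleWatch" -> "Apple Watch " ; "LCD" -> "L C D "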

    @staticmethod
    def _saveData(words, language:str, text:str):
        # Language word statistics
        lang_count = LangSegment._lang_count
        if lang_count is None: lang_count = defaultdict(int)
        if not "|" in language: lang_count[language] += int(len(text)//2) if language == "en" else len(text)
        LangSegment._lang_count = lang_count
        # Merge with the previous segment if it has the same language, then save the result
        preData = words[-1] if len(words) > 0 else None
        if preData and (preData["lang"] == language):
            text = preData["text"] + text
            preData["text"] = text
            return preData
        data = {"lang": language, "text": text}
        filters = LangSegment.Langfilters
        if filters is None or len(filters) == 0 or "?" in language or \
           language in filters or language in filters[0] or \
           filters[0] == "*" or filters[0] in "alls-mixs-autos":
            words.append(data)
        return data

    @staticmethod
    def _addwords(words, language, text):
        if text is None or len(text.strip()) == 0: return True
        if language is None: language = ""
        language = language.lower()
        if language == 'en': text = LangSegment._insert_english_uppercase(text)
        # text = re.sub(r'[(())]', ',' , text) # Keep it.
        text_waits = LangSegment._text_waits
        ispre_waits = len(text_waits) > 0
        preResult = text_waits.pop() if ispre_waits else None
        if preResult is None: preResult = words[-1] if len(words) > 0 else None
        if preResult and ("|" in preResult["lang"]):
            pre_lang = preResult["lang"]
            if language in pre_lang: preResult["lang"] = language = language.split("|")[0]
            else: preResult["lang"] = pre_lang.split("|")[0]
            if ispre_waits: preResult = LangSegment._saveData(words, preResult["lang"], preResult["text"])
        pre_lang = preResult["lang"] if preResult else None
        if ("|" in language) and (pre_lang and not pre_lang in language and not "…" in language): language = language.split("|")[0]
        if "|" in language: LangSegment._text_waits.append({"lang": language, "text": text})
        else: LangSegment._saveData(words, language, text)
        return False

    @staticmethod
    def _get_prev_data(words):
        data = words[-1] if words and len(words) > 0 else None
        if data: return (data["lang"], data["text"])
        return (None, "")

    @staticmethod
    def _match_ending(input, index):
        if input is None or len(input) == 0: return False, None
        input = re.sub(r'\s+', '', input)
        if len(input) == 0 or abs(index) > len(input): return False, None
        ending_pattern = re.compile(r'([「」“”‘’"\'::。.!!?.?])')
        return ending_pattern.match(input[index]), input[index]

    @staticmethod
    def _cleans_text(cleans_text):
        cleans_text = re.sub(r'([^\w]+)', '', cleans_text)
        return cleans_text

    @staticmethod
    def _lang_classify(cleans_text):
        language, *_ = langid.classify(cleans_text)
        return language

    @staticmethod
    def _parse_language(words, segment):
        LANG_JA = "ja"
        LANG_ZH = "zh"
        language = LANG_ZH
        regex_pattern = re.compile(r'([^\w\s]+)')
        lines = regex_pattern.split(segment)
        lines_max = len(lines)
        LANG_EOS = LangSegment._lang_eos
        for index, text in enumerate(lines):
            if len(text) == 0: continue
            EOS = index >= (lines_max - 1)
            nextId = index + 1
            nextText = lines[nextId] if not EOS else ""
            nextPunc = len(re.sub(regex_pattern, '', re.sub(r'\n+', '', nextText)).strip()) == 0
            textPunc = len(re.sub(regex_pattern, '', re.sub(r'\n+', '', text)).strip()) == 0
            # Punctuation-only pieces are merged into the following piece.
            if not EOS and (textPunc == True or (len(nextText.strip()) >= 0 and nextPunc == True)):
                lines[nextId] = f'{text}{nextText}'
                continue
            number_tags = re.compile(r'(⑥\d{6,}⑥)')
            cleans_text = re.sub(number_tags, '', text)
            cleans_text = LangSegment._cleans_text(cleans_text)
            language = LangSegment._lang_classify(cleans_text)
            prev_language, prev_text = LangSegment._get_prev_data(words)
            if len(cleans_text) <= 3 and LangSegment._is_chinese(cleans_text):
                if EOS and LANG_EOS: language = LANG_ZH if len(cleans_text) <= 1 else language
                elif LangSegment._is_japanese_kana(cleans_text): language = LANG_JA
                else:
                    LANG_UNKNOWN = f'{LANG_ZH}|{LANG_JA}'
                    match_end, match_char = LangSegment._match_ending(text, -1)
                    referen = prev_language in LANG_UNKNOWN or LANG_UNKNOWN in prev_language if prev_language else False
                    if match_char in "。.": language = prev_language if referen and len(words) > 0 else language
                    else: language = f"{LANG_UNKNOWN}|…"
            text, *_ = re.subn(number_tags, LangSegment._restore_number, text)
            LangSegment._addwords(words, language, text)
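    # Note on the "zh|ja|…" marker above: short CJK runs without kana cannot be classified
    # reliably, so the segment is parked in _text_waits and resolved in _addwords once the
    # language of the following segment is known.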

    @staticmethod
    def _restore_number(match):
        value = match.group(0)
        text_cache = LangSegment._text_cache
        if value in text_cache:
            process, data = text_cache[value]
            tag, match = data
            value = match
        return value

    @staticmethod
    def _pattern_symbols(item, text):
        if text is None: return text
        tag, pattern, process = item
        matches = pattern.findall(text)
        if len(matches) == 1 and "".join(matches[0]) == text:
            return text
        for i, match in enumerate(matches):
            key = f"⑥{tag}{i:06d}⑥"
            text = re.sub(pattern, key, text, count=1)
            LangSegment._text_cache[key] = (process, (tag, match))
        return text
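    # Worked example (illustrative, applying only the TAG_P1 quote rule via _pattern_symbols;
    # in the full pipeline the English rule would claim this span first):
    #   '你好"OK"世界'  ->  '你好⑥$2000000⑥世界'
    # with _text_cache["⑥$2000000⑥"] = (_process_quotes, ("$2", ('"', 'OK', '"')))
    # ready for dispatch in _process_tags.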

    @staticmethod
    def _process_symbol(words, data):
        tag, match = data
        language = match[1]
        text = match[2]
        LangSegment._addwords(words, language, text)

    @staticmethod
    def _process_english(words, data):
        tag, match = data
        text = match[0]
        language = "en"
        LangSegment._addwords(words, language, text)

    @staticmethod
    def _process_korean(words, data):
        tag, match = data
        text = match[0]
        language = "ko"
        LangSegment._addwords(words, language, text)

    @staticmethod
    def _process_quotes(words, data):
        tag, match = data
        text = "".join(match)
        childs = LangSegment.PARSE_TAG.findall(text)
        if len(childs) > 0:
            LangSegment._process_tags(words, text, False)
        else:
            cleans_text = LangSegment._cleans_text(match[1])
            if len(cleans_text) <= 3:
                LangSegment._parse_language(words, text)
            else:
                language = LangSegment._lang_classify(cleans_text)
                LangSegment._addwords(words, language, text)

    @staticmethod
    def _process_number(words, data):  # "$0" process only
        """
        Numbers alone cannot reliably identify a language,
        because digits are universal across languages,
        so this is not executed in the default pipeline; it exists for testing.
        """
        tag, match = data
        language = words[0]["lang"] if len(words) > 0 else "zh"
        text = match
        LangSegment._addwords(words, language, text)

    @staticmethod
    def _process_tags(words, text, root_tag):
        text_cache = LangSegment._text_cache
        segments = re.split(LangSegment.PARSE_TAG, text)
        segments_len = len(segments) - 1
        for index, text in enumerate(segments):
            if root_tag: LangSegment._lang_eos = index >= segments_len
            if LangSegment.PARSE_TAG.match(text):
                process, data = text_cache[text]
                if process: process(words, data)
            else:
                LangSegment._parse_language(words, text)
        return words

    @staticmethod
    def _parse_symbols(text):
        TAG_NUM = "00"  # "00" => default channel , "$0" => testing channel
        TAG_S1, TAG_P1, TAG_P2, TAG_EN, TAG_KO = "$1", "$2", "$3", "$4", "$5"
        process_list = [
            ( TAG_S1  , re.compile(LangSegment.SYMBOLS_PATTERN) , LangSegment._process_symbol ),   # symbol tags
            ( TAG_KO  , re.compile(r'(([【《((“‘"\']*(\d+\W*\s*)*[\uac00-\ud7a3]+[\W\s]*)+)') , LangSegment._process_korean ),  # Korean words
            ( TAG_NUM , re.compile(r'(\W*\d+\W+\d*\W*\d*)') , LangSegment._process_number ),       # numbers: universal across languages, left in place
            ( TAG_EN  , re.compile(r'(([【《((“‘"\']*[a-zA-Z]+[\W\s]*)+)') , LangSegment._process_english ),  # English words
            ( TAG_P1  , re.compile(r'(["\'])(.*?)(\1)') , LangSegment._process_quotes ),           # regular quotes
            ( TAG_P2  , re.compile(r'([\n]*[【《((“‘])([^【《((“‘’”))》】]{3,})([’”))》】][\W\s]*[\n]{,1})') , LangSegment._process_quotes ),  # paired CJK quotes/brackets
        ]
        LangSegment._lang_eos = False
        LangSegment._text_cache = {}
        for item in process_list:
            text = LangSegment._pattern_symbols(item, text)
        words = LangSegment._process_tags([], text, True)
        lang_count = LangSegment._lang_count
        if lang_count and len(lang_count) > 0:
            lang_count = dict(sorted(lang_count.items(), key=lambda x: x[1], reverse=True))
            lang_count = list(lang_count.items())
            LangSegment._lang_count = lang_count
        return words
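    # Pipeline order above is significant: explicit <lang> tags are protected first,
    # then Korean, number and English spans, then quoted spans; whatever text remains
    # is split on punctuation and classified per segment by py3langid in _parse_language.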

    @staticmethod
    def setfilters(filters):
        # When the filter changes, clear the cache
        if LangSegment.Langfilters != filters:
            LangSegment._clears()
            LangSegment.Langfilters = filters

    @staticmethod
    def getfilters():
        return LangSegment.Langfilters

    @staticmethod
    def getCounts():
        lang_count = LangSegment._lang_count
        if lang_count is not None: return lang_count
        text_langs = LangSegment._text_langs
        if text_langs is None or len(text_langs) == 0: return [("zh", 0)]
        lang_counts = defaultdict(int)
        for d in text_langs: lang_counts[d['lang']] += int(len(d['text'])//2) if d['lang'] == "en" else len(d['text'])
        lang_counts = dict(sorted(lang_counts.items(), key=lambda x: x[1], reverse=True))
        lang_counts = list(lang_counts.items())
        LangSegment._lang_count = lang_counts
        return lang_counts

    @staticmethod
    def getTexts(text:str):
        if text is None or len(text.strip()) == 0:
            LangSegment._clears()
            return []
        # cached result for the same input
        text_langs = LangSegment._text_langs
        if LangSegment._text_lasts == text and text_langs is not None: return text_langs
        # parse
        LangSegment._text_waits = []
        LangSegment._lang_count = None
        LangSegment._text_lasts = text
        text = LangSegment._parse_symbols(text)
        LangSegment._text_langs = text
        return text
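    # Caching note: getTexts memoizes on the exact input string (_text_lasts), and
    # setfilters clears that cache, so change filters before segmenting, e.g.:
    #   LangSegment.setfilters(["zh", "ja"])
    #   segments = LangSegment.getTexts(text)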

    @staticmethod
    def classify(text:str):
        return LangSegment.getTexts(text)

def setfilters(filters):
    """
    Language filter group: specify which languages to keep.
    Languages not in the filter group are cleared, so you can match the languages
    your TTS engine supports.

    Args:
        filters (list): e.g. ["zh", "en", "ja", "ko"]
    """
    LangSegment.setfilters(filters)

def getfilters():
    """
    Return the current language filter group (see setfilters).

    Returns:
        list: e.g. ["zh", "en", "ja", "ko"]
    """
    return LangSegment.getfilters()

# @Deprecated: use the shorter setfilters
def setLangfilters(filters):
    """
    Deprecated since 0.1.9: use the shorter setfilters.
    """
    setfilters(filters)

# @Deprecated: use the shorter getfilters
def getLangfilters():
    """
    Deprecated since 0.1.9: use the shorter getfilters.
    """
    return getfilters()

def getTexts(text:str):
    """
    Segment the input text by language.

    Args:
        text (str): text content

    Returns:
        list: e.g. [{'lang': 'zh', 'text': '?'}, ...] where lang is the language code
        and text is the segment content
    """
    return LangSegment.getTexts(text)

def getCounts():
    """
    Statistics over the segmentation result, sorted by character count per language in
    descending order; useful for determining the dominant language.

    Returns:
        list: e.g. [('zh', 5), ('ja', 2), ('en', 1)] = [(language, character count incl. punctuation)]
    """
    return LangSegment.getCounts()

def classify(text:str):
    """
    Compatibility interface (same as getTexts).
    """
    return LangSegment.classify(text)

def printList(langlist):
    """
    Print the list of segmentation results.
    """
    print("\n\n===================【打印结果】===================")
    if langlist is None or len(langlist) == 0:
        print("无内容结果,No content result")
        return
    for line in langlist:
        print(line)


if __name__ == "__main__":

    # -----------------------------------
    # Changelog: the new version segments words more accurately.
    # -----------------------------------

    # Input example 1: (contains Japanese and Chinese)
    # text = "“昨日は雨が降った,音楽、映画。。。”你今天学习日语了吗?春は桜の季節です。语种分词是语音合成必不可少的环节。言語分詞は音声合成に欠かせない環節である!"

    # Input example 2: (contains Japanese and Chinese)
    # text = "欢迎来玩。東京,は日本の首都です。欢迎来玩. 太好了!"

    # Input example 3: (contains Japanese and Chinese)
    # text = "明日、私たちは海辺にバカンスに行きます。你会说日语吗:“中国語、話せますか” 你的日语真好啊!"

    # Input example 4: (contains Japanese, Chinese, Korean and English)
    text = "你的名字叫<ja>佐々木?<ja>吗?韩语中的안녕 오빠读什么呢?あなたの体育の先生は誰ですか? 此次发布会带来了四款iPhone 15系列机型和三款Apple Watch等一系列新品,这次的iPad Air采用了LCD屏幕"

    # Segmentation: integrating with a TTS project takes a single line of code.
    langlist = LangSegment.getTexts(text)
    printList(langlist)


    # Language statistics:
    print("\n===================【语种统计】===================")
    # Get counts for all languages, sorted by character count in descending order
    langCounts = LangSegment.getCounts()
    print(langCounts, "\n")

    # The dominant language of the input (language, character count incl. punctuation)
    lang, count = langCounts[0]
    print(f"输入内容的主要语言为 = {lang} ,字数 = {count}")
    print("==================================================\n")


    # Segmentation output: lang = language, text = content
    # ===================【打印结果】===================
    # {'lang': 'zh', 'text': '你的名字叫'}
    # {'lang': 'ja', 'text': '佐々木?'}
    # {'lang': 'zh', 'text': '吗?韩语中的'}
    # {'lang': 'ko', 'text': '안녕 오빠'}
    # {'lang': 'zh', 'text': '读什么呢?'}
    # {'lang': 'ja', 'text': 'あなたの体育の先生は誰ですか?'}
    # {'lang': 'zh', 'text': ' 此次发布会带来了四款'}
    # {'lang': 'en', 'text': 'i Phone '}
    # {'lang': 'zh', 'text': '15系列机型和三款'}
    # {'lang': 'en', 'text': 'Apple Watch '}
    # {'lang': 'zh', 'text': '等一系列新品,这次的'}
    # {'lang': 'en', 'text': 'i Pad Air '}
    # {'lang': 'zh', 'text': '采用了'}
    # {'lang': 'en', 'text': 'L C D '}
    # {'lang': 'zh', 'text': '屏幕'}

    # ===================【语种统计】===================
    # [('zh', 51), ('ja', 19), ('en', 18), ('ko', 5)]

    # 输入内容的主要语言为 = zh ,字数 = 51
    # ==================================================
ForkLangSegment/__init__.py
ADDED
@@ -0,0 +1,8 @@
from .LangSegment import LangSegment, getTexts, classify, getCounts, printList, setLangfilters, getLangfilters, setfilters, getfilters

# release
__version__ = '0.2.0'


# develop
__develop__ = 'dev-0.0.1'