Spaces:
Build error
Build error
Update tools/sentence.py
Browse files- tools/sentence.py +13 -24
tools/sentence.py
CHANGED
|
@@ -107,6 +107,8 @@ def remove_numeric_annotations(text):
|
|
| 107 |
pattern = r'“\d+”|【\d+】|〔\d+〕'
|
| 108 |
# 使用正则表达式替换掉这些注释
|
| 109 |
cleaned_text = re.sub(pattern, '', text)
|
|
|
|
|
|
|
| 110 |
return cleaned_text
|
| 111 |
|
| 112 |
def merge_adjacent_japanese(sentences):
|
|
@@ -128,11 +130,14 @@ def extrac(text):
|
|
| 128 |
text = replace_quotes(remove_numeric_annotations(text)) # 替换引号
|
| 129 |
text = re.sub("<[^>]*>", "", text) # 移除 HTML 标签
|
| 130 |
# 使用换行符和标点符号进行初步分割,确保标点符号保留在句子末尾
|
| 131 |
-
preliminary_sentences = re.split(r'(?<=[\n
|
| 132 |
final_sentences = []
|
| 133 |
|
| 134 |
for piece in preliminary_sentences:
|
| 135 |
if is_single_language(piece):
|
|
|
|
|
|
|
|
|
|
| 136 |
final_sentences.append(piece)
|
| 137 |
else:
|
| 138 |
sub_sentences = split_mixed_language(piece)
|
|
@@ -199,7 +204,7 @@ def extract_text_from_file(inputFile):
|
|
| 199 |
def split_by_punctuation(sentence):
|
| 200 |
"""按照中文次级标点符号分割句子"""
|
| 201 |
# 常见的中文次级分隔符号:逗号、分号等
|
| 202 |
-
parts = re.split(r'([
|
| 203 |
# 将标点符号与前面的词语合并,避免单独标点符号成为一个部分
|
| 204 |
merged_parts = []
|
| 205 |
for part in parts:
|
|
@@ -211,29 +216,13 @@ def split_by_punctuation(sentence):
|
|
| 211 |
|
| 212 |
def split_long_sentences(sentence, max_length=30):
|
| 213 |
"""如果中文句子太长,先按标点分割,必要时使用jieba进行分词并分割"""
|
| 214 |
-
if len(sentence) > max_length and is_chinese(sentence):
|
| 215 |
-
# 首先尝试按照次级标点符号分割
|
| 216 |
-
preliminary_parts = split_by_punctuation(sentence)
|
| 217 |
-
new_sentences = []
|
| 218 |
-
|
| 219 |
-
for part in preliminary_parts:
|
| 220 |
-
# 如果部分仍然太长,使用jieba进行分词
|
| 221 |
-
if len(part) > max_length:
|
| 222 |
-
words = jieba.lcut(part)
|
| 223 |
-
current_sentence = ""
|
| 224 |
-
for word in words:
|
| 225 |
-
if len(current_sentence) + len(word) > max_length:
|
| 226 |
-
new_sentences.append(current_sentence)
|
| 227 |
-
current_sentence = word
|
| 228 |
-
else:
|
| 229 |
-
current_sentence += word
|
| 230 |
-
if current_sentence:
|
| 231 |
-
new_sentences.append(current_sentence)
|
| 232 |
-
else:
|
| 233 |
-
new_sentences.append(part)
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
def extract_and_convert(text):
|
| 239 |
|
|
|
|
| 107 |
pattern = r'“\d+”|【\d+】|〔\d+〕'
|
| 108 |
# 使用正则表达式替换掉这些注释
|
| 109 |
cleaned_text = re.sub(pattern, '', text)
|
| 110 |
+
cleaned_text = re.sub('「', '', cleaned_text)
|
| 111 |
+
cleaned_text = re.sub('」', '', cleaned_text)
|
| 112 |
return cleaned_text
|
| 113 |
|
| 114 |
def merge_adjacent_japanese(sentences):
|
|
|
|
| 130 |
text = replace_quotes(remove_numeric_annotations(text)) # 替换引号
|
| 131 |
text = re.sub("<[^>]*>", "", text) # 移除 HTML 标签
|
| 132 |
# 使用换行符和标点符号进行初步分割,确保标点符号保留在句子末尾
|
| 133 |
+
preliminary_sentences = re.split(r'(?<=[\n。;!?\.\?!。])', text)
|
| 134 |
final_sentences = []
|
| 135 |
|
| 136 |
for piece in preliminary_sentences:
|
| 137 |
if is_single_language(piece):
|
| 138 |
+
if len(piece) > 15:
|
| 139 |
+
sub_sentences = split_long_sentences(piece)
|
| 140 |
+
final_sentences.extend(sub_sentences)
|
| 141 |
final_sentences.append(piece)
|
| 142 |
else:
|
| 143 |
sub_sentences = split_mixed_language(piece)
|
|
|
|
| 204 |
def split_by_punctuation(sentence):
|
| 205 |
"""按照中文次级标点符号分割句子"""
|
| 206 |
# 常见的中文次级分隔符号:逗号、分号等
|
| 207 |
+
parts = re.split(r'([,,;;…、『』「」])', sentence)
|
| 208 |
# 将标点符号与前面的词语合并,避免单独标点符号成为一个部分
|
| 209 |
merged_parts = []
|
| 210 |
for part in parts:
|
|
|
|
| 216 |
|
| 217 |
def split_long_sentences(sentence, max_length=30):
    """Break a sentence into pieces at secondary punctuation marks.

    The sentence is handed to ``split_by_punctuation`` and every piece it
    produces is returned, in the same order, in a newly built list.

    Args:
        sentence: Text to split.
        max_length: Kept in the signature for callers that pass it; the
            body does not consult it, so the split is applied regardless
            of how long ``sentence`` is.

    Returns:
        A new list containing each piece yielded by
        ``split_by_punctuation(sentence)``.
    """
    # Build a fresh list rather than handing back the helper's own object.
    return list(split_by_punctuation(sentence))
|
| 226 |
|
| 227 |
def extract_and_convert(text):
|
| 228 |
|