Spaces:
Build error
Build error
Update tools/sentence.py
Browse files- tools/sentence.py +4 -20
tools/sentence.py
CHANGED
|
@@ -127,36 +127,20 @@ def merge_adjacent_japanese(sentences):
|
|
| 127 |
def extrac(text):
    """Split *text* into a list of cleaned sentence fragments.

    Pipeline, in order:
      1. Normalise the text with ``remove_numeric_annotations`` and
         ``replace_quotes`` and drop anything that looks like an HTML tag.
      2. Split on newlines and sentence-ending punctuation. The pattern uses
         a capturing group, so each delimiter is kept as its own element.
      3. Pieces that ``is_single_language`` rejects are broken up further by
         ``split_mixed_language``.
      4. Every piece goes through ``split_long_sentences`` (per the original
         comment this is jieba-based segmentation -- verify in the helper).
      5. ``merge_adjacent_japanese`` re-joins neighbouring Japanese pieces.
      6. Elements for which ``is_only_punctuation`` is true are discarded.
      7. Double quotes and surrounding whitespace are removed, and anything
         left empty is dropped.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Non-empty, stripped fragments without ``"`` characters.
    """
    text = replace_quotes(remove_numeric_annotations(text))  # normalise quotes
    text = re.sub("<[^>]*>", "", text)  # strip HTML tags

    # Preliminary split on newlines and punctuation. The original computed
    # this identical split twice; once is enough.
    preliminary_sentences = re.split(r'([\n。;!?\.\?!])', text)

    final_sentences = []
    for piece in preliminary_sentences:
        if is_single_language(piece):
            final_sentences.append(piece)
        else:
            final_sentences.extend(split_mixed_language(piece))

    # Break up long sentences.
    split_sentences = []
    for sentence in final_sentences:
        split_sentences.extend(split_long_sentences(sentence))

    # Merge adjacent Japanese sentences.
    merged_japanese_sentences = merge_adjacent_japanese(split_sentences)

    # Drop elements that consist solely of punctuation.
    clean_sentences = [
        s for s in merged_japanese_sentences if not is_only_punctuation(s)
    ]

    # Strip quotes/whitespace first, then filter: testing emptiness only
    # before stripping would let whitespace-only or quote-only entries
    # through as "".
    stripped = (s.replace('"', '').strip() for s in clean_sentences if s)
    return [s for s in stripped if s]
|
| 156 |
-
|
| 157 |
|
|
|
|
|
|
|
| 158 |
|
| 159 |
-
# 移除空字符串
|
| 160 |
|
| 161 |
def is_mixed_language(sentence):
|
| 162 |
contains_chinese = re.search(r'[\u4e00-\u9fff]', sentence) is not None
|
|
|
|
| 127 |
def extrac(text):
    """Split *text* into a list of cleaned sentence fragments.

    The text is normalised with ``remove_numeric_annotations`` and
    ``replace_quotes``, HTML-like tags are removed, and the result is split
    after every newline or sentence-ending punctuation mark so that the
    punctuation stays attached to the end of its sentence. Pieces that
    ``is_single_language`` rejects are broken up further by
    ``split_mixed_language``.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Non-empty, stripped fragments without ``"`` characters.
    """
    text = replace_quotes(remove_numeric_annotations(text))  # normalise quotes
    text = re.sub("<[^>]*>", "", text)  # strip HTML tags

    # Zero-width split *after* a newline or terminator (lookbehind), so the
    # delimiter is kept at the end of the preceding piece.
    # NOTE: splitting on an empty match requires Python 3.7+.
    preliminary_sentences = re.split(r'(?<=[\n。;!?\.\?!])', text)

    final_sentences = []
    for piece in preliminary_sentences:
        if is_single_language(piece):
            final_sentences.append(piece)
        else:
            final_sentences.extend(split_mixed_language(piece))

    # Remove double quotes and surrounding whitespace, then drop whatever is
    # left empty. Filtering only before stripping would return "" for pieces
    # such as "\n" or '"', which the lookbehind split readily produces.
    stripped = (s.replace('"', '').strip() for s in final_sentences if s)
    return [s for s in stripped if s]
|
| 143 |
|
|
|
|
| 144 |
|
| 145 |
def is_mixed_language(sentence):
|
| 146 |
contains_chinese = re.search(r'[\u4e00-\u9fff]', sentence) is not None
|