Spaces:
Running
Running
fix: clean and validate MFA dictionaries by whitespace tokens
Browse files
- src/mfa_model_downloader.py +19 -9
- src/mfa_runner.py +15 -7
src/mfa_model_downloader.py
CHANGED
|
@@ -130,13 +130,14 @@ def _verify_file_integrity(
|
|
| 130 |
try:
|
| 131 |
valid_line_count = 0
|
| 132 |
invalid_line_count = 0
|
| 133 |
-
with open(file_path, 'r', encoding='utf-8') as f:
|
| 134 |
for line in f:
|
| 135 |
stripped = line.strip()
|
| 136 |
if not stripped:
|
| 137 |
continue
|
| 138 |
-
|
| 139 |
-
|
|
|
|
| 140 |
valid_line_count += 1
|
| 141 |
else:
|
| 142 |
invalid_line_count += 1
|
|
@@ -277,18 +278,27 @@ def _clean_dictionary_file(
|
|
| 277 |
返回: 清理的空行数量
|
| 278 |
"""
|
| 279 |
try:
|
| 280 |
-
with open(dict_path, 'r', encoding='utf-8') as f:
|
| 281 |
lines = f.readlines()
|
| 282 |
|
| 283 |
-
# 过滤空行
|
| 284 |
-
|
| 285 |
-
removed_count =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
if removed_count > 0:
|
| 288 |
with open(dict_path, 'w', encoding='utf-8') as f:
|
| 289 |
-
f.writelines(
|
| 290 |
if progress_callback:
|
| 291 |
-
progress_callback(f"已清理 {removed_count} 个空行")
|
| 292 |
|
| 293 |
return removed_count
|
| 294 |
except Exception as e:
|
|
|
|
| 130 |
try:
|
| 131 |
valid_line_count = 0
|
| 132 |
invalid_line_count = 0
|
| 133 |
+
with open(file_path, 'r', encoding='utf-8-sig') as f:
|
| 134 |
for line in f:
|
| 135 |
stripped = line.strip()
|
| 136 |
if not stripped:
|
| 137 |
continue
|
| 138 |
+
|
| 139 |
+
# MFA 字典支持任意空白分隔,至少需 2 列
|
| 140 |
+
if len(stripped.split()) >= 2:
|
| 141 |
valid_line_count += 1
|
| 142 |
else:
|
| 143 |
invalid_line_count += 1
|
|
|
|
| 278 |
返回: 清理的空行数量
|
| 279 |
"""
|
| 280 |
try:
|
| 281 |
+
with open(dict_path, 'r', encoding='utf-8-sig') as f:
|
| 282 |
lines = f.readlines()
|
| 283 |
|
| 284 |
+
# 过滤空行与格式异常行(至少应有 2 个 token)
|
| 285 |
+
cleaned_lines = []
|
| 286 |
+
removed_count = 0
|
| 287 |
+
for line in lines:
|
| 288 |
+
stripped = line.strip()
|
| 289 |
+
if not stripped:
|
| 290 |
+
removed_count += 1
|
| 291 |
+
continue
|
| 292 |
+
if len(stripped.split()) < 2:
|
| 293 |
+
removed_count += 1
|
| 294 |
+
continue
|
| 295 |
+
cleaned_lines.append(line)
|
| 296 |
|
| 297 |
if removed_count > 0:
|
| 298 |
with open(dict_path, 'w', encoding='utf-8') as f:
|
| 299 |
+
f.writelines(cleaned_lines)
|
| 300 |
if progress_callback:
|
| 301 |
+
progress_callback(f"已清理 {removed_count} 个空行/无效行")
|
| 302 |
|
| 303 |
return removed_count
|
| 304 |
except Exception as e:
|
src/mfa_runner.py
CHANGED
|
@@ -225,23 +225,28 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
|
|
| 225 |
返回: 清理的无效行数量
|
| 226 |
"""
|
| 227 |
try:
|
| 228 |
-
|
|
|
|
| 229 |
lines = f.readlines()
|
| 230 |
|
| 231 |
original_count = len(lines)
|
| 232 |
|
| 233 |
-
# 过滤空行和
|
| 234 |
-
#
|
|
|
|
| 235 |
valid_lines = []
|
|
|
|
| 236 |
for line in lines:
|
| 237 |
stripped = line.strip()
|
| 238 |
# 跳过空行
|
| 239 |
if not stripped:
|
| 240 |
continue
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
|
|
|
| 244 |
continue
|
|
|
|
| 245 |
valid_lines.append(line)
|
| 246 |
|
| 247 |
removed_count = original_count - len(valid_lines)
|
|
@@ -249,7 +254,10 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
|
|
| 249 |
if removed_count > 0:
|
| 250 |
with open(dict_path, 'w', encoding='utf-8') as f:
|
| 251 |
f.writelines(valid_lines)
|
| 252 |
-
logger.info(
|
|
|
|
|
|
|
|
|
|
| 253 |
else:
|
| 254 |
logger.info(f"字典文件无需清理: {original_count} 行")
|
| 255 |
|
|
|
|
| 225 |
返回: 清理的无效行数量
|
| 226 |
"""
|
| 227 |
try:
|
| 228 |
+
# utf-8-sig: 自动兼容可能存在的 BOM
|
| 229 |
+
with open(dict_path, 'r', encoding='utf-8-sig') as f:
|
| 230 |
lines = f.readlines()
|
| 231 |
|
| 232 |
original_count = len(lines)
|
| 233 |
|
| 234 |
+
# 过滤空行和无效行
|
| 235 |
+
# MFA 字典允许空白分隔,不应强制要求制表符
|
| 236 |
+
# 合法行至少应有 2 个 token(词条 + 至少一个音素)
|
| 237 |
valid_lines = []
|
| 238 |
+
malformed_count = 0
|
| 239 |
for line in lines:
|
| 240 |
stripped = line.strip()
|
| 241 |
# 跳过空行
|
| 242 |
if not stripped:
|
| 243 |
continue
|
| 244 |
+
|
| 245 |
+
tokens = stripped.split()
|
| 246 |
+
if len(tokens) < 2:
|
| 247 |
+
malformed_count += 1
|
| 248 |
continue
|
| 249 |
+
|
| 250 |
valid_lines.append(line)
|
| 251 |
|
| 252 |
removed_count = original_count - len(valid_lines)
|
|
|
|
| 254 |
if removed_count > 0:
|
| 255 |
with open(dict_path, 'w', encoding='utf-8') as f:
|
| 256 |
f.writelines(valid_lines)
|
| 257 |
+
logger.info(
|
| 258 |
+
f"字典文件清理完成: 原 {original_count} 行, 现 {len(valid_lines)} 行, "
|
| 259 |
+
f"移除 {removed_count} 行(其中格式异常 {malformed_count} 行)"
|
| 260 |
+
)
|
| 261 |
else:
|
| 262 |
logger.info(f"字典文件无需清理: {original_count} 行")
|
| 263 |
|