Spaces:
Running
Running
fix: drop probability-only dictionary rows before MFA parse
Browse files
- src/mfa_model_downloader.py +30 -2
- src/mfa_runner.py +14 -1
src/mfa_model_downloader.py
CHANGED
|
@@ -11,10 +11,12 @@ import logging
|
|
| 11 |
import urllib.request
|
| 12 |
import urllib.error
|
| 13 |
import time
|
|
|
|
| 14 |
from pathlib import Path
|
| 15 |
from typing import Optional, Callable
|
| 16 |
|
| 17 |
logger = logging.getLogger(__name__)
|
|
|
|
| 18 |
|
| 19 |
# 模型下载基础 URL
|
| 20 |
GITHUB_RELEASE_BASE = "https://github.com/MontrealCorpusTools/mfa-models/releases/download"
|
|
@@ -136,8 +138,22 @@ def _verify_file_integrity(
|
|
| 136 |
if not stripped:
|
| 137 |
continue
|
| 138 |
|
|
|
|
| 139 |
# MFA 字典支持任意空白分隔,至少需 2 列
|
| 140 |
-
if len(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
valid_line_count += 1
|
| 142 |
else:
|
| 143 |
invalid_line_count += 1
|
|
@@ -285,6 +301,7 @@ def _clean_dictionary_file(
|
|
| 285 |
cleaned_lines = []
|
| 286 |
removed_count = 0
|
| 287 |
comment_count = 0
|
|
|
|
| 288 |
for line in lines:
|
| 289 |
stripped = line.replace('\ufeff', '').strip()
|
| 290 |
if not stripped:
|
|
@@ -299,6 +316,15 @@ def _clean_dictionary_file(
|
|
| 299 |
continue
|
| 300 |
|
| 301 |
tokens = stripped.split()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
cleaned_lines.append(f"{tokens[0]}\t{' '.join(tokens[1:])}\n")
|
| 303 |
|
| 304 |
# 无论是否移除行,都重写为标准 tab 分隔格式
|
|
@@ -307,7 +333,9 @@ def _clean_dictionary_file(
|
|
| 307 |
|
| 308 |
if progress_callback:
|
| 309 |
if removed_count > 0:
|
| 310 |
-
progress_callback(
|
|
|
|
|
|
|
| 311 |
else:
|
| 312 |
progress_callback(f"字典标准化完成,共 {len(cleaned_lines)} 行(已统一为 tab 分隔)")
|
| 313 |
|
|
|
|
| 11 |
import urllib.request
|
| 12 |
import urllib.error
|
| 13 |
import time
|
| 14 |
+
import re
|
| 15 |
from pathlib import Path
|
| 16 |
from typing import Optional, Callable
|
| 17 |
|
| 18 |
logger = logging.getLogger(__name__)
|
| 19 |
+
PROB_PATTERN = re.compile(r"\b(\d+\.\d+|1)\b")
|
| 20 |
|
| 21 |
# 模型下载基础 URL
|
| 22 |
GITHUB_RELEASE_BASE = "https://github.com/MontrealCorpusTools/mfa-models/releases/download"
|
|
|
|
| 138 |
if not stripped:
|
| 139 |
continue
|
| 140 |
|
| 141 |
+
parts = stripped.split()
|
| 142 |
# MFA 字典支持任意空白分隔,至少需 2 列
|
| 143 |
+
if len(parts) < 2:
|
| 144 |
+
invalid_line_count += 1
|
| 145 |
+
continue
|
| 146 |
+
|
| 147 |
+
# 与 MFA parse_dictionary_file 对齐:概率字段后必须有音素
|
| 148 |
+
rest = parts[1:]
|
| 149 |
+
idx = 0
|
| 150 |
+
while idx < len(rest) and idx < 4 and PROB_PATTERN.match(rest[idx]):
|
| 151 |
+
idx += 1
|
| 152 |
+
if idx >= len(rest):
|
| 153 |
+
invalid_line_count += 1
|
| 154 |
+
continue
|
| 155 |
+
|
| 156 |
+
if len(parts) >= 2:
|
| 157 |
valid_line_count += 1
|
| 158 |
else:
|
| 159 |
invalid_line_count += 1
|
|
|
|
| 301 |
cleaned_lines = []
|
| 302 |
removed_count = 0
|
| 303 |
comment_count = 0
|
| 304 |
+
prob_only_count = 0
|
| 305 |
for line in lines:
|
| 306 |
stripped = line.replace('\ufeff', '').strip()
|
| 307 |
if not stripped:
|
|
|
|
| 316 |
continue
|
| 317 |
|
| 318 |
tokens = stripped.split()
|
| 319 |
+
rest = tokens[1:]
|
| 320 |
+
idx = 0
|
| 321 |
+
while idx < len(rest) and idx < 4 and PROB_PATTERN.match(rest[idx]):
|
| 322 |
+
idx += 1
|
| 323 |
+
if idx >= len(rest):
|
| 324 |
+
prob_only_count += 1
|
| 325 |
+
removed_count += 1
|
| 326 |
+
continue
|
| 327 |
+
|
| 328 |
cleaned_lines.append(f"{tokens[0]}\t{' '.join(tokens[1:])}\n")
|
| 329 |
|
| 330 |
# 无论是否移除行,都重写为标准 tab 分隔格式
|
|
|
|
| 333 |
|
| 334 |
if progress_callback:
|
| 335 |
if removed_count > 0:
|
| 336 |
+
progress_callback(
|
| 337 |
+
f"已清理 {removed_count} 个空行/无效行(含注释 {comment_count} 行, 概率无音素 {prob_only_count} 行)"
|
| 338 |
+
)
|
| 339 |
else:
|
| 340 |
progress_callback(f"字典标准化完成,共 {len(cleaned_lines)} 行(已统一为 tab 分隔)")
|
| 341 |
|
src/mfa_runner.py
CHANGED
|
@@ -10,10 +10,12 @@ import shutil
|
|
| 10 |
import subprocess
|
| 11 |
import logging
|
| 12 |
import time
|
|
|
|
| 13 |
from pathlib import Path
|
| 14 |
from typing import Optional, Callable
|
| 15 |
|
| 16 |
logger = logging.getLogger(__name__)
|
|
|
|
| 17 |
|
| 18 |
# 定位路径
|
| 19 |
BASE_DIR = Path(__file__).parent.parent.absolute()
|
|
@@ -235,6 +237,7 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
|
|
| 235 |
# 这样可以最大化规避 MFA 解析器在边缘行上的 IndexError。
|
| 236 |
sanitized_lines = []
|
| 237 |
malformed_count = 0
|
|
|
|
| 238 |
comment_count = 0
|
| 239 |
for line in lines:
|
| 240 |
stripped = line.replace('\ufeff', '').strip()
|
|
@@ -252,6 +255,16 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
|
|
| 252 |
malformed_count += 1
|
| 253 |
continue
|
| 254 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
word = tokens[0]
|
| 256 |
pronunciation = " ".join(tokens[1:])
|
| 257 |
sanitized_lines.append(f"{word}\t{pronunciation}\n")
|
|
@@ -265,7 +278,7 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
|
|
| 265 |
if removed_count > 0:
|
| 266 |
logger.info(
|
| 267 |
f"字典文件清理完成: 原 {original_count} 行, 现 {len(sanitized_lines)} 行, "
|
| 268 |
-
f"移除 {removed_count} 行(注释 {comment_count} 行, 格式异常 {malformed_count} 行)"
|
| 269 |
)
|
| 270 |
else:
|
| 271 |
logger.info(f"字典文件标准化完成: 共 {len(sanitized_lines)} 行(已统一为 tab 分隔)")
|
|
|
|
| 10 |
import subprocess
|
| 11 |
import logging
|
| 12 |
import time
|
| 13 |
+
import re
|
| 14 |
from pathlib import Path
|
| 15 |
from typing import Optional, Callable
|
| 16 |
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
+
PROB_PATTERN = re.compile(r"\b(\d+\.\d+|1)\b")
|
| 19 |
|
| 20 |
# 定位路径
|
| 21 |
BASE_DIR = Path(__file__).parent.parent.absolute()
|
|
|
|
| 237 |
# 这样可以最大化规避 MFA 解析器在边缘行上的 IndexError。
|
| 238 |
sanitized_lines = []
|
| 239 |
malformed_count = 0
|
| 240 |
+
prob_only_count = 0
|
| 241 |
comment_count = 0
|
| 242 |
for line in lines:
|
| 243 |
stripped = line.replace('\ufeff', '').strip()
|
|
|
|
| 255 |
malformed_count += 1
|
| 256 |
continue
|
| 257 |
|
| 258 |
+
rest = tokens[1:]
|
| 259 |
+
# 与 MFA parse_dictionary_file 对齐:允许 1~4 个前置概率字段
|
| 260 |
+
# 但概率字段之后必须至少有一个音素,否则 MFA 内部会 IndexError。
|
| 261 |
+
idx = 0
|
| 262 |
+
while idx < len(rest) and idx < 4 and PROB_PATTERN.match(rest[idx]):
|
| 263 |
+
idx += 1
|
| 264 |
+
if idx >= len(rest):
|
| 265 |
+
prob_only_count += 1
|
| 266 |
+
continue
|
| 267 |
+
|
| 268 |
word = tokens[0]
|
| 269 |
pronunciation = " ".join(tokens[1:])
|
| 270 |
sanitized_lines.append(f"{word}\t{pronunciation}\n")
|
|
|
|
| 278 |
if removed_count > 0:
|
| 279 |
logger.info(
|
| 280 |
f"字典文件清理完成: 原 {original_count} 行, 现 {len(sanitized_lines)} 行, "
|
| 281 |
+
f"移除 {removed_count} 行(注释 {comment_count} 行, 格式异常 {malformed_count} 行, 概率无音素 {prob_only_count} 行)"
|
| 282 |
)
|
| 283 |
else:
|
| 284 |
logger.info(f"字典文件标准化完成: 共 {len(sanitized_lines)} 行(已统一为 tab 分隔)")
|