TNOT committed on
Commit
3baa123
·
1 Parent(s): 20eb387

fix: drop probability-only dictionary rows before MFA parse

Browse files
Files changed (2) hide show
  1. src/mfa_model_downloader.py +30 -2
  2. src/mfa_runner.py +14 -1
src/mfa_model_downloader.py CHANGED
@@ -11,10 +11,12 @@ import logging
11
  import urllib.request
12
  import urllib.error
13
  import time
 
14
  from pathlib import Path
15
  from typing import Optional, Callable
16
 
17
  logger = logging.getLogger(__name__)
 
18
 
19
  # 模型下载基础 URL
20
  GITHUB_RELEASE_BASE = "https://github.com/MontrealCorpusTools/mfa-models/releases/download"
@@ -136,8 +138,22 @@ def _verify_file_integrity(
136
  if not stripped:
137
  continue
138
 
 
139
  # MFA 字典支持任意空白分隔,至少需 2 列
140
- if len(stripped.split()) >= 2:
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  valid_line_count += 1
142
  else:
143
  invalid_line_count += 1
@@ -285,6 +301,7 @@ def _clean_dictionary_file(
285
  cleaned_lines = []
286
  removed_count = 0
287
  comment_count = 0
 
288
  for line in lines:
289
  stripped = line.replace('\ufeff', '').strip()
290
  if not stripped:
@@ -299,6 +316,15 @@ def _clean_dictionary_file(
299
  continue
300
 
301
  tokens = stripped.split()
 
 
 
 
 
 
 
 
 
302
  cleaned_lines.append(f"{tokens[0]}\t{' '.join(tokens[1:])}\n")
303
 
304
  # 无论是否移除行,都重写为标准 tab 分隔格式
@@ -307,7 +333,9 @@ def _clean_dictionary_file(
307
 
308
  if progress_callback:
309
  if removed_count > 0:
310
- progress_callback(f"已清理 {removed_count} 个空行/无效行(含注释 {comment_count} 行)")
 
 
311
  else:
312
  progress_callback(f"字典标准化完成,共 {len(cleaned_lines)} 行(已统一为 tab 分隔)")
313
 
 
11
  import urllib.request
12
  import urllib.error
13
  import time
14
+ import re
15
  from pathlib import Path
16
  from typing import Optional, Callable
17
 
18
  logger = logging.getLogger(__name__)
19
+ PROB_PATTERN = re.compile(r"\b(\d+\.\d+|1)\b")
20
 
21
  # 模型下载基础 URL
22
  GITHUB_RELEASE_BASE = "https://github.com/MontrealCorpusTools/mfa-models/releases/download"
 
138
  if not stripped:
139
  continue
140
 
141
+ parts = stripped.split()
142
  # MFA 字典支持任意空白分隔,至少需 2 列
143
+ if len(parts) < 2:
144
+ invalid_line_count += 1
145
+ continue
146
+
147
+ # 与 MFA parse_dictionary_file 对齐:概率字段后必须有音素
148
+ rest = parts[1:]
149
+ idx = 0
150
+ while idx < len(rest) and idx < 4 and PROB_PATTERN.match(rest[idx]):
151
+ idx += 1
152
+ if idx >= len(rest):
153
+ invalid_line_count += 1
154
+ continue
155
+
156
+ if len(parts) >= 2:
157
  valid_line_count += 1
158
  else:
159
  invalid_line_count += 1
 
301
  cleaned_lines = []
302
  removed_count = 0
303
  comment_count = 0
304
+ prob_only_count = 0
305
  for line in lines:
306
  stripped = line.replace('\ufeff', '').strip()
307
  if not stripped:
 
316
  continue
317
 
318
  tokens = stripped.split()
319
+ rest = tokens[1:]
320
+ idx = 0
321
+ while idx < len(rest) and idx < 4 and PROB_PATTERN.match(rest[idx]):
322
+ idx += 1
323
+ if idx >= len(rest):
324
+ prob_only_count += 1
325
+ removed_count += 1
326
+ continue
327
+
328
  cleaned_lines.append(f"{tokens[0]}\t{' '.join(tokens[1:])}\n")
329
 
330
  # 无论是否移除行,都重写为标准 tab 分隔格式
 
333
 
334
  if progress_callback:
335
  if removed_count > 0:
336
+ progress_callback(
337
+ f"已清理 {removed_count} 个空行/无效行(含注释 {comment_count} 行, 概率无音素 {prob_only_count} 行)"
338
+ )
339
  else:
340
  progress_callback(f"字典标准化完成,共 {len(cleaned_lines)} 行(已统一为 tab 分隔)")
341
 
src/mfa_runner.py CHANGED
@@ -10,10 +10,12 @@ import shutil
10
  import subprocess
11
  import logging
12
  import time
 
13
  from pathlib import Path
14
  from typing import Optional, Callable
15
 
16
  logger = logging.getLogger(__name__)
 
17
 
18
  # 定位路径
19
  BASE_DIR = Path(__file__).parent.parent.absolute()
@@ -235,6 +237,7 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
235
  # 这样可以最大化规避 MFA 解析器在边缘行上的 IndexError。
236
  sanitized_lines = []
237
  malformed_count = 0
 
238
  comment_count = 0
239
  for line in lines:
240
  stripped = line.replace('\ufeff', '').strip()
@@ -252,6 +255,16 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
252
  malformed_count += 1
253
  continue
254
 
 
 
 
 
 
 
 
 
 
 
255
  word = tokens[0]
256
  pronunciation = " ".join(tokens[1:])
257
  sanitized_lines.append(f"{word}\t{pronunciation}\n")
@@ -265,7 +278,7 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
265
  if removed_count > 0:
266
  logger.info(
267
  f"字典文件清理完成: 原 {original_count} 行, 现 {len(sanitized_lines)} 行, "
268
- f"移除 {removed_count} 行(注释 {comment_count} 行, 格式异常 {malformed_count} 行)"
269
  )
270
  else:
271
  logger.info(f"字典文件标准化完成: 共 {len(sanitized_lines)} 行(已统一为 tab 分隔)")
 
10
  import subprocess
11
  import logging
12
  import time
13
+ import re
14
  from pathlib import Path
15
  from typing import Optional, Callable
16
 
17
  logger = logging.getLogger(__name__)
18
+ PROB_PATTERN = re.compile(r"\b(\d+\.\d+|1)\b")
19
 
20
  # 定位路径
21
  BASE_DIR = Path(__file__).parent.parent.absolute()
 
237
  # 这样可以最大化规避 MFA 解析器在边缘行上的 IndexError。
238
  sanitized_lines = []
239
  malformed_count = 0
240
+ prob_only_count = 0
241
  comment_count = 0
242
  for line in lines:
243
  stripped = line.replace('\ufeff', '').strip()
 
255
  malformed_count += 1
256
  continue
257
 
258
+ rest = tokens[1:]
259
+ # 与 MFA parse_dictionary_file 对齐:允许 1~4 个前置概率字段
260
+ # 但概率字段之后必须至少有一个音素,否则 MFA 内部会 IndexError。
261
+ idx = 0
262
+ while idx < len(rest) and idx < 4 and PROB_PATTERN.match(rest[idx]):
263
+ idx += 1
264
+ if idx >= len(rest):
265
+ prob_only_count += 1
266
+ continue
267
+
268
  word = tokens[0]
269
  pronunciation = " ".join(tokens[1:])
270
  sanitized_lines.append(f"{word}\t{pronunciation}\n")
 
278
  if removed_count > 0:
279
  logger.info(
280
  f"字典文件清理完成: 原 {original_count} 行, 现 {len(sanitized_lines)} 行, "
281
+ f"移除 {removed_count} 行(注释 {comment_count} 行, 格式异常 {malformed_count} 行, 概率无音素 {prob_only_count} 行)"
282
  )
283
  else:
284
  logger.info(f"字典文件标准化完成: 共 {len(sanitized_lines)} 行(已统一为 tab 分隔)")