TNOT committed on
Commit
c3a9a08
·
1 Parent(s): 56fcba9

fix: clean and validate MFA dictionaries by whitespace tokens

Browse files
Files changed (2) hide show
  1. src/mfa_model_downloader.py +19 -9
  2. src/mfa_runner.py +15 -7
src/mfa_model_downloader.py CHANGED
@@ -130,13 +130,14 @@ def _verify_file_integrity(
130
  try:
131
  valid_line_count = 0
132
  invalid_line_count = 0
133
- with open(file_path, 'r', encoding='utf-8') as f:
134
  for line in f:
135
  stripped = line.strip()
136
  if not stripped:
137
  continue
138
- # 检查是否有制表符分隔(字典格式要求)
139
- if '\t' in line:
 
140
  valid_line_count += 1
141
  else:
142
  invalid_line_count += 1
@@ -277,18 +278,27 @@ def _clean_dictionary_file(
277
  返回: 清理的空行数量
278
  """
279
  try:
280
- with open(dict_path, 'r', encoding='utf-8') as f:
281
  lines = f.readlines()
282
 
283
- # 过滤空行
284
- non_empty_lines = [line for line in lines if line.strip()]
285
- removed_count = len(lines) - len(non_empty_lines)
 
 
 
 
 
 
 
 
 
286
 
287
  if removed_count > 0:
288
  with open(dict_path, 'w', encoding='utf-8') as f:
289
- f.writelines(non_empty_lines)
290
  if progress_callback:
291
- progress_callback(f"已清理 {removed_count} 个空行")
292
 
293
  return removed_count
294
  except Exception as e:
 
130
  try:
131
  valid_line_count = 0
132
  invalid_line_count = 0
133
+ with open(file_path, 'r', encoding='utf-8-sig') as f:
134
  for line in f:
135
  stripped = line.strip()
136
  if not stripped:
137
  continue
138
+
139
+ # MFA 字典支持任意空白分隔,至少需 2 列
140
+ if len(stripped.split()) >= 2:
141
  valid_line_count += 1
142
  else:
143
  invalid_line_count += 1
 
278
  返回: 清理的空行数量
279
  """
280
  try:
281
+ with open(dict_path, 'r', encoding='utf-8-sig') as f:
282
  lines = f.readlines()
283
 
284
+ # 过滤空行与格式异常行(至少应有 2 个 token)
285
+ cleaned_lines = []
286
+ removed_count = 0
287
+ for line in lines:
288
+ stripped = line.strip()
289
+ if not stripped:
290
+ removed_count += 1
291
+ continue
292
+ if len(stripped.split()) < 2:
293
+ removed_count += 1
294
+ continue
295
+ cleaned_lines.append(line)
296
 
297
  if removed_count > 0:
298
  with open(dict_path, 'w', encoding='utf-8') as f:
299
+ f.writelines(cleaned_lines)
300
  if progress_callback:
301
+ progress_callback(f"已清理 {removed_count} 个空行/无效行")
302
 
303
  return removed_count
304
  except Exception as e:
src/mfa_runner.py CHANGED
@@ -225,23 +225,28 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
225
  返回: 清理的无效行数量
226
  """
227
  try:
228
- with open(dict_path, 'r', encoding='utf-8') as f:
 
229
  lines = f.readlines()
230
 
231
  original_count = len(lines)
232
 
233
- # 过滤空行和只有空白字符的
234
- # 同时过滤没有制表符分隔的无效行(字典格式: word\tprob\t...)
 
235
  valid_lines = []
 
236
  for line in lines:
237
  stripped = line.strip()
238
  # 跳过空行
239
  if not stripped:
240
  continue
241
- # 跳过没有制表符的行(无效格式)
242
- if '\t' not in line:
243
- logger.warning(f"跳过无效字典行: {stripped[:50]}...")
 
244
  continue
 
245
  valid_lines.append(line)
246
 
247
  removed_count = original_count - len(valid_lines)
@@ -249,7 +254,10 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
249
  if removed_count > 0:
250
  with open(dict_path, 'w', encoding='utf-8') as f:
251
  f.writelines(valid_lines)
252
- logger.info(f"字典文件清理完成: 原 {original_count} 行, 现 {len(valid_lines)} 行, 移除 {removed_count} 行")
 
 
 
253
  else:
254
  logger.info(f"字典文件无需清理: {original_count} 行")
255
 
 
225
  返回: 清理的无效行数量
226
  """
227
  try:
228
+ # utf-8-sig: 自动兼容可能存在的 BOM
229
+ with open(dict_path, 'r', encoding='utf-8-sig') as f:
230
  lines = f.readlines()
231
 
232
  original_count = len(lines)
233
 
234
+ # 过滤空行和无效
235
+ # MFA 字典允许空白分隔,不应强要求制表符
236
+ # 合法行至少应有 2 个 token(词条 + 至少一个音素)
237
  valid_lines = []
238
+ malformed_count = 0
239
  for line in lines:
240
  stripped = line.strip()
241
  # 跳过空行
242
  if not stripped:
243
  continue
244
+
245
+ tokens = stripped.split()
246
+ if len(tokens) < 2:
247
+ malformed_count += 1
248
  continue
249
+
250
  valid_lines.append(line)
251
 
252
  removed_count = original_count - len(valid_lines)
 
254
  if removed_count > 0:
255
  with open(dict_path, 'w', encoding='utf-8') as f:
256
  f.writelines(valid_lines)
257
+ logger.info(
258
+ f"字典文件清理完成: 原 {original_count} 行, 现 {len(valid_lines)} 行, "
259
+ f"移除 {removed_count} 行(其中格式异常 {malformed_count} 行)"
260
+ )
261
  else:
262
  logger.info(f"字典文件无需清理: {original_count} 行")
263