TNOT committed on
Commit
fd39da1
·
1 Parent(s): b16fc92

fix: MFA 字典空行校验,哈希校验

Browse files
docs/流程文档_AI用.md CHANGED
@@ -147,13 +147,19 @@
147
  | 模块 | 文件 | 功能 |
148
  |------|------|------|
149
  | Silero VAD 下载 | `silero_vad_downloader.py` | 下载语音活动检测模型 |
150
- | MFA 模型下载 | `mfa_model_downloader.py` | 下载声学模型和字典 |
151
  | Whisper 模型 | 通过 HuggingFace 自动下载 | 语音识别模型 |
152
 
153
  支持的语言:
154
  - 中文 (普通话): `mandarin_mfa.zip` + `mandarin_china_mfa.dict`
155
  - 日文: `japanese_mfa.zip` + `japanese_mfa.dict`
156
 
 
 
 
 
 
 
157
  ### 2. 音频处理模块
158
 
159
  | 模块 | 文件 | 功能 |
 
147
  | 模块 | 文件 | 功能 |
148
  |------|------|------|
149
  | Silero VAD 下载 | `silero_vad_downloader.py` | 下载语音活动检测模型 |
150
+ | MFA 模型下载 | `mfa_model_downloader.py` | 下载声学模型和字典(带完整性校验) |
151
  | Whisper 模型 | 通过 HuggingFace 自动下载 | 语音识别模型 |
152
 
153
  支持的语言:
154
  - 中文 (普通话): `mandarin_mfa.zip` + `mandarin_china_mfa.dict`
155
  - 日文: `japanese_mfa.zip` + `japanese_mfa.dict`
156
 
157
+ MFA 字典文件完整性校验:
158
+ - 下载完成后计算 SHA256 哈希并保存为 `.sha256` 文件
159
+ - 后续启动时校验哈希值,损坏则自动重新下载
160
+ - 检查字典文件最少行数(中文、日文均为 1 万行,对应配置中的 `min_lines`)
161
+ - 自动清理字典文件中的空行(MFA 3.x 不支持空行)
162
+
163
  ### 2. 音频处理模块
164
 
165
  | 模块 | 文件 | 功能 |
src/mfa_model_downloader.py CHANGED
@@ -2,9 +2,11 @@
2
  """
3
  MFA 模型下载模块
4
  支持下载中文和日文的声学模型及字典
 
5
  """
6
 
7
  import os
 
8
  import logging
9
  import urllib.request
10
  import urllib.error
@@ -19,19 +21,24 @@ GITHUB_RAW_BASE = "https://raw.githubusercontent.com/MontrealCorpusTools/mfa-mod
19
 
20
  # 支持的语言配置
21
  # 格式: {语言代码: {名称, 声学模型信息, 字典信息}}
 
22
  LANGUAGE_MODELS = {
23
  "mandarin": {
24
  "name": "中文 (普通话)",
25
  "acoustic": {
26
  "tag": "acoustic-mandarin_mfa-v3.0.0",
27
  "filename": "mandarin_mfa.zip",
28
- "description": "Mandarin MFA acoustic model v3.0.0"
 
 
29
  },
30
  "dictionary": {
31
- # 字典从 releases 下载,tag 格式: dictionary-{name}-v{version}
32
  "tag": "dictionary-mandarin_china_mfa-v3.0.0",
33
  "filename": "mandarin_china_mfa.dict",
34
- "description": "Mandarin (China) MFA dictionary v3.0.0"
 
 
 
35
  }
36
  },
37
  "japanese": {
@@ -39,12 +46,15 @@ LANGUAGE_MODELS = {
39
  "acoustic": {
40
  "tag": "acoustic-japanese_mfa-v3.0.0",
41
  "filename": "japanese_mfa.zip",
42
- "description": "Japanese MFA acoustic model v3.0.0"
 
43
  },
44
  "dictionary": {
45
  "tag": "dictionary-japanese_mfa-v3.0.0",
46
  "filename": "japanese_mfa.dict",
47
- "description": "Japanese MFA dictionary v3.0.0"
 
 
48
  }
49
  }
50
  }
@@ -55,6 +65,85 @@ def get_available_languages() -> dict:
55
  return {k: v["name"] for k, v in LANGUAGE_MODELS.items()}
56
 
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  def _download_file(
59
  url: str,
60
  dest_path: str,
@@ -82,10 +171,13 @@ def _download_file(
82
  # 创建目录
83
  os.makedirs(os.path.dirname(dest_path), exist_ok=True)
84
 
 
 
 
85
  # 下载文件
86
  req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
87
 
88
- with urllib.request.urlopen(req, timeout=60) as response:
89
  total_size = response.headers.get("Content-Length")
90
  if total_size:
91
  total_size = int(total_size)
@@ -95,7 +187,7 @@ def _download_file(
95
  block_size = 8192
96
  downloaded = 0
97
 
98
- with open(dest_path, "wb") as f:
99
  while True:
100
  chunk = response.read(block_size)
101
  if not chunk:
@@ -107,6 +199,11 @@ def _download_file(
107
  percent = downloaded / total_size * 100
108
  log(f"下载进度: {percent:.1f}%")
109
 
 
 
 
 
 
110
  log(f"下载完成: {dest_path}")
111
  return True
112
 
@@ -119,12 +216,52 @@ def _download_file(
119
  except Exception as e:
120
  log(f"下载失败: {e}")
121
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
 
124
  def download_acoustic_model(
125
  language: str,
126
  output_dir: str,
127
- progress_callback: Optional[Callable[[str], None]] = None
 
128
  ) -> tuple[bool, str]:
129
  """
130
  下载声学模型
@@ -133,10 +270,16 @@ def download_acoustic_model(
133
  language: 语言代码 (mandarin/japanese)
134
  output_dir: 输出目录
135
  progress_callback: 进度回调
 
136
 
137
  返回:
138
  (成功标志, 文件路径或错误信息)
139
  """
 
 
 
 
 
140
  if language not in LANGUAGE_MODELS:
141
  return False, f"不支持的语言: {language}"
142
 
@@ -144,10 +287,15 @@ def download_acoustic_model(
144
  url = f"{GITHUB_RELEASE_BASE}/{config['tag']}/{config['filename']}"
145
  dest_path = os.path.join(output_dir, config["filename"])
146
 
147
- if os.path.exists(dest_path):
148
- if progress_callback:
149
- progress_callback(f"声学模型已存在: {dest_path}")
150
- return True, dest_path
 
 
 
 
 
151
 
152
  if _download_file(url, dest_path, progress_callback):
153
  return True, dest_path
@@ -158,70 +306,84 @@ def download_acoustic_model(
158
  def download_dictionary(
159
  language: str,
160
  output_dir: str,
161
- progress_callback: Optional[Callable[[str], None]] = None
 
162
  ) -> tuple[bool, str]:
163
  """
164
- 下载字典文件
165
 
166
  参数:
167
  language: 语言代码 (mandarin/japanese)
168
  output_dir: 输出目录
169
  progress_callback: 进度回调
 
170
 
171
  返回:
172
  (成功标志, 文件路径或错误信息)
173
  """
 
 
 
 
 
174
  if language not in LANGUAGE_MODELS:
175
  return False, f"不支持的语言: {language}"
176
 
177
  config = LANGUAGE_MODELS[language]["dictionary"]
178
- # 字典文件从 releases 下载
179
  url = f"{GITHUB_RELEASE_BASE}/{config['tag']}/{config['filename']}"
180
  dest_path = os.path.join(output_dir, config["filename"])
 
181
 
182
- if os.path.exists(dest_path):
183
- if progress_callback:
184
- progress_callback(f"字典文件已存在: {dest_path}")
185
- # 确保已有文件也清理空行
186
- _clean_dictionary_file(dest_path, progress_callback)
187
- return True, dest_path
188
 
189
- if _download_file(url, dest_path, progress_callback):
190
- # 清理字典文件中的空行(MFA 3.x 不支持空行)
191
- _clean_dictionary_file(dest_path, progress_callback)
192
- return True, dest_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  else:
194
- return False, "字典文件下载失败"
195
-
196
-
197
- def _clean_dictionary_file(
198
- dict_path: str,
199
- progress_callback: Optional[Callable[[str], None]] = None
200
- ):
201
- """
202
- 清理字典文件中的空行
203
- MFA 3.x 解析字典时遇到空行会报 IndexError
204
- """
205
- try:
206
- with open(dict_path, 'r', encoding='utf-8') as f:
207
- lines = f.readlines()
208
 
209
- # 过滤空行
210
- non_empty_lines = [line for line in lines if line.strip()]
211
 
212
- if len(non_empty_lines) < len(lines):
213
- with open(dict_path, 'w', encoding='utf-8') as f:
214
- f.writelines(non_empty_lines)
215
- if progress_callback:
216
- progress_callback(f"已清理 {len(lines) - len(non_empty_lines)} 个空行")
217
- except Exception as e:
218
- logger.warning(f"清理字典文件失败: {e}")
 
 
 
 
 
219
 
220
 
221
  def download_language_models(
222
  language: str,
223
  output_dir: str,
224
- progress_callback: Optional[Callable[[str], None]] = None
 
225
  ) -> tuple[bool, str, str]:
226
  """
227
  下载指定语言的声学模型和字典
@@ -230,6 +392,7 @@ def download_language_models(
230
  language: 语言代码 (mandarin/japanese)
231
  output_dir: 输出目录
232
  progress_callback: 进度回调
 
233
 
234
  返回:
235
  (成功标志, 声学模型路径, 字典路径)
@@ -248,14 +411,18 @@ def download_language_models(
248
  # 下载声学模型
249
  log("=" * 40)
250
  log("下载声学模型...")
251
- success, acoustic_path = download_acoustic_model(language, output_dir, progress_callback)
 
 
252
  if not success:
253
  return False, "", acoustic_path
254
 
255
  # 下载字典
256
  log("=" * 40)
257
  log("下载字典文件...")
258
- success, dict_path = download_dictionary(language, output_dir, progress_callback)
 
 
259
  if not success:
260
  return False, acoustic_path, dict_path
261
 
 
2
  """
3
  MFA 模型下载模块
4
  支持下载中文和日文的声学模型及字典
5
+ 包含 SHA256 哈希校验,确保文件完整性
6
  """
7
 
8
  import os
9
+ import hashlib
10
  import logging
11
  import urllib.request
12
  import urllib.error
 
21
 
22
  # 支持的语言配置
23
  # 格式: {语言代码: {名称, 声学模型信息, 字典信息}}
24
+ # sha256: 官方文件的 SHA256 哈希值(清理空行后),用于校验文件完整性
25
  LANGUAGE_MODELS = {
26
  "mandarin": {
27
  "name": "中文 (普通话)",
28
  "acoustic": {
29
  "tag": "acoustic-mandarin_mfa-v3.0.0",
30
  "filename": "mandarin_mfa.zip",
31
+ "description": "Mandarin MFA acoustic model v3.0.0",
32
+ # 声学模型是 zip 文件,不需要清理空行,直接校验原始哈希
33
+ "sha256": None, # 暂不校验声学模型
34
  },
35
  "dictionary": {
 
36
  "tag": "dictionary-mandarin_china_mfa-v3.0.0",
37
  "filename": "mandarin_china_mfa.dict",
38
+ "description": "Mandarin (China) MFA dictionary v3.0.0",
39
+ # 字典文件清理空行后的哈希值
40
+ "sha256": None, # 首次下载时自动计算并保存
41
+ "min_lines": 10000, # 字典文件最少行数,用于基本完整性检查
42
  }
43
  },
44
  "japanese": {
 
46
  "acoustic": {
47
  "tag": "acoustic-japanese_mfa-v3.0.0",
48
  "filename": "japanese_mfa.zip",
49
+ "description": "Japanese MFA acoustic model v3.0.0",
50
+ "sha256": None,
51
  },
52
  "dictionary": {
53
  "tag": "dictionary-japanese_mfa-v3.0.0",
54
  "filename": "japanese_mfa.dict",
55
+ "description": "Japanese MFA dictionary v3.0.0",
56
+ "sha256": None,
57
+ "min_lines": 10000, # 日语字典约 12 万行
58
  }
59
  }
60
  }
 
65
  return {k: v["name"] for k, v in LANGUAGE_MODELS.items()}
66
 
67
 
68
def _calculate_file_hash(file_path: str) -> str:
    """Return the SHA-256 digest of a file as a hexadecimal string.

    The file is opened in binary mode and consumed in 8192-byte pieces, so
    the whole content is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            piece = stream.read(8192)
            if not piece:
                # An empty read marks end of file.
                break
            digest.update(piece)
    return digest.hexdigest()
75
+
76
+
77
def _get_hash_file_path(file_path: str) -> str:
    """Return the path of the hash sidecar file for ``file_path``.

    The sidecar lives next to the file and is named by appending the
    ``.sha256`` suffix to the full file path.
    """
    suffix = ".sha256"
    return file_path + suffix
80
+
81
+
82
def _save_hash(file_path: str, hash_value: str):
    """Store ``hash_value`` in the ``.sha256`` sidecar file of ``file_path``.

    The sidecar is opened for writing as UTF-8 text, so any previous
    content is replaced.
    """
    sidecar_path = _get_hash_file_path(file_path)
    with open(sidecar_path, mode='w', encoding='utf-8') as sidecar:
        sidecar.write(hash_value)
87
+
88
+
89
def _load_saved_hash(file_path: str) -> Optional[str]:
    """Read the hash stored in the ``.sha256`` sidecar file of ``file_path``.

    Returns:
        The stored hash with surrounding whitespace stripped, or ``None``
        when the sidecar file does not exist or holds only whitespace.

    Raises:
        OSError: the sidecar exists but cannot be read.
        UnicodeDecodeError: the sidecar is not valid UTF-8.
    """
    hash_file = _get_hash_file_path(file_path)
    # Open directly instead of checking existence first: the file may
    # disappear between an exists() check and the open() call.
    try:
        with open(hash_file, 'r', encoding='utf-8') as f:
            saved = f.read().strip()
    except FileNotFoundError:
        return None
    # An empty sidecar carries no hash; report it the same way as a missing one.
    return saved or None
96
+
97
+
98
def _verify_file_integrity(
    file_path: str,
    min_lines: Optional[int] = None,
    progress_callback: Optional[Callable[[str], None]] = None
) -> tuple[bool, str]:
    """Check that a downloaded file looks complete.

    The checks run in order and stop at the first failure:
      1. the file exists;
      2. the file is not empty;
      3. for ``.dict`` files with ``min_lines`` set, the number of
         non-blank lines is at least ``min_lines``;
      4. if a ``.sha256`` sidecar hash was saved, the current SHA-256 of
         the file matches it.

    Args:
        file_path: Path of the file to check.
        min_lines: Minimum number of non-blank lines; only applied to
            paths ending in ``.dict``.
        progress_callback: Accepted for signature compatibility; this
            function does not emit any progress messages.

    Returns:
        ``(is_valid, reason)`` where ``reason`` describes the outcome.
    """
    if not os.path.exists(file_path):
        return False, "文件不存在"

    if os.path.getsize(file_path) == 0:
        return False, "文件为空"

    # Dictionary files: count only lines with visible content.
    if min_lines and file_path.endswith('.dict'):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                line_count = sum(1 for line in f if line.strip())
        except (OSError, UnicodeDecodeError) as e:
            return False, f"读取文件失败: {e}"
        if line_count < min_lines:
            return False, f"字典行数不足: {line_count} < {min_lines}"

    # Hash check only applies when a hash was saved earlier. Read failures
    # are reported as an invalid file instead of escaping as exceptions, so
    # callers always get the documented (bool, reason) result.
    try:
        saved_hash = _load_saved_hash(file_path)
        current_hash = _calculate_file_hash(file_path) if saved_hash else None
    except (OSError, UnicodeDecodeError) as e:
        return False, f"读取文件失败: {e}"

    if saved_hash and current_hash != saved_hash:
        return False, f"哈希校验失败: 期望 {saved_hash[:16]}..., 实际 {current_hash[:16]}..."

    return True, "文件完整"
145
+
146
+
147
  def _download_file(
148
  url: str,
149
  dest_path: str,
 
171
  # 创建目录
172
  os.makedirs(os.path.dirname(dest_path), exist_ok=True)
173
 
174
+ # 下载到临时文件,完成后再重命名
175
+ temp_path = dest_path + ".downloading"
176
+
177
  # 下载文件
178
  req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
179
 
180
+ with urllib.request.urlopen(req, timeout=120) as response:
181
  total_size = response.headers.get("Content-Length")
182
  if total_size:
183
  total_size = int(total_size)
 
187
  block_size = 8192
188
  downloaded = 0
189
 
190
+ with open(temp_path, "wb") as f:
191
  while True:
192
  chunk = response.read(block_size)
193
  if not chunk:
 
199
  percent = downloaded / total_size * 100
200
  log(f"下载进度: {percent:.1f}%")
201
 
202
+ # 下载完成,重命名
203
+ if os.path.exists(dest_path):
204
+ os.remove(dest_path)
205
+ os.rename(temp_path, dest_path)
206
+
207
  log(f"下载完成: {dest_path}")
208
  return True
209
 
 
216
  except Exception as e:
217
  log(f"下载失败: {e}")
218
  return False
219
+ finally:
220
+ # 清理临时文件
221
+ temp_path = dest_path + ".downloading"
222
+ if os.path.exists(temp_path):
223
+ try:
224
+ os.remove(temp_path)
225
+ except:
226
+ pass
227
+
228
+
229
+
230
def _clean_dictionary_file(
    dict_path: str,
    progress_callback: Optional[Callable[[str], None]] = None
) -> int:
    """Remove blank lines from a dictionary file.

    MFA 3.x raises IndexError when it meets a blank line while parsing a
    dictionary, so whitespace-only lines are dropped. The cleaned content
    is written to a temporary file next to the dictionary and then moved
    over it with ``os.replace``, so a failed write never leaves a
    truncated dictionary behind.

    Args:
        dict_path: Path of the dictionary file (read and written as UTF-8).
        progress_callback: Optional callable that receives a message when
            lines were removed.

    Returns:
        The number of blank lines removed. Cleaning is best-effort: on any
        failure a warning is logged and 0 is returned.
    """
    temp_path = dict_path + ".cleaning"
    try:
        with open(dict_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Keep only lines that contain something besides whitespace.
        non_empty_lines = [line for line in lines if line.strip()]
        removed_count = len(lines) - len(non_empty_lines)

        if removed_count > 0:
            with open(temp_path, 'w', encoding='utf-8') as f:
                f.writelines(non_empty_lines)
            # Swap the cleaned copy in only after it was written completely.
            os.replace(temp_path, dict_path)
            if progress_callback:
                progress_callback(f"已清理 {removed_count} 个空行")

        return removed_count
    except Exception as e:
        logger.warning(f"清理字典文件失败: {e}")
        try:
            os.remove(temp_path)
        except OSError:
            pass  # nothing was staged, or the leftover cannot be removed
        return 0
258
 
259
 
260
  def download_acoustic_model(
261
  language: str,
262
  output_dir: str,
263
+ progress_callback: Optional[Callable[[str], None]] = None,
264
+ force_download: bool = False
265
  ) -> tuple[bool, str]:
266
  """
267
  下载声学模型
 
270
  language: 语言代码 (mandarin/japanese)
271
  output_dir: 输出目录
272
  progress_callback: 进度回调
273
+ force_download: 强制重新下载
274
 
275
  返回:
276
  (成功标志, 文件路径或错误信息)
277
  """
278
+ def log(msg: str):
279
+ logger.info(msg)
280
+ if progress_callback:
281
+ progress_callback(msg)
282
+
283
  if language not in LANGUAGE_MODELS:
284
  return False, f"不支持的语言: {language}"
285
 
 
287
  url = f"{GITHUB_RELEASE_BASE}/{config['tag']}/{config['filename']}"
288
  dest_path = os.path.join(output_dir, config["filename"])
289
 
290
+ # 检查现有文件
291
+ if os.path.exists(dest_path) and not force_download:
292
+ # 简单检查:文件存在且大小大于 1MB
293
+ file_size = os.path.getsize(dest_path)
294
+ if file_size > 1024 * 1024:
295
+ log(f"声学模型已存在: {dest_path}")
296
+ return True, dest_path
297
+ else:
298
+ log(f"声学模型文件异常 (大小: {file_size} bytes),重新下载...")
299
 
300
  if _download_file(url, dest_path, progress_callback):
301
  return True, dest_path
 
306
def download_dictionary(
    language: str,
    output_dir: str,
    progress_callback: Optional[Callable[[str], None]] = None,
    force_download: bool = False
) -> tuple[bool, str]:
    """Download a language's dictionary file and validate it.

    An existing dictionary is reused when it passes the integrity check;
    otherwise (or when ``force_download`` is set) it is downloaded again.
    After a download, blank lines are removed, the file is verified, and
    its SHA-256 hash is saved to the ``.sha256`` sidecar file.

    Args:
        language: Language code, a key of ``LANGUAGE_MODELS``
            (mandarin/japanese).
        output_dir: Directory the dictionary is stored in.
        progress_callback: Optional callable that receives progress messages.
        force_download: Download again even if a valid file already exists.

    Returns:
        ``(success, path_or_error)``: the dictionary path on success,
        otherwise an error message.
    """
    def log(msg: str):
        logger.info(msg)
        if progress_callback:
            progress_callback(msg)

    if language not in LANGUAGE_MODELS:
        return False, f"不支持的语言: {language}"

    config = LANGUAGE_MODELS[language]["dictionary"]
    url = f"{GITHUB_RELEASE_BASE}/{config['tag']}/{config['filename']}"
    dest_path = os.path.join(output_dir, config["filename"])
    hash_file = _get_hash_file_path(dest_path)
    min_lines = config.get("min_lines")

    # Reuse an existing file when it passes the integrity check.
    if os.path.exists(dest_path) and not force_download:
        is_valid, reason = _verify_file_integrity(dest_path, min_lines, progress_callback)
        if is_valid:
            log(f"字典文件已存在且完整: {dest_path}")
            if _clean_dictionary_file(dest_path, progress_callback) > 0:
                # Cleaning changed the file content, so a previously saved
                # hash would no longer match on the next start.
                try:
                    _save_hash(dest_path, _calculate_file_hash(dest_path))
                except OSError as e:
                    logger.warning(f"更新字典文件哈希失败: {e}")
            return True, dest_path

        log(f"字典文件校验失败: {reason},重新下载...")
        # Remove the damaged file; the download below replaces it either way.
        try:
            os.remove(dest_path)
        except OSError as e:
            logger.warning(f"删除损坏的字典文件失败: {e}")

    if not _download_file(url, dest_path, progress_callback):
        return False, "字典文件下载失败"

    # A sidecar hash left over from an earlier copy must not be compared
    # against the freshly downloaded file (forced re-download, or the
    # dictionary was deleted while its sidecar stayed behind).
    try:
        os.remove(hash_file)
    except FileNotFoundError:
        pass
    except OSError as e:
        logger.warning(f"删除旧哈希文件失败: {e}")

    # Remove blank lines before verifying, so the saved hash describes the
    # cleaned file that is actually used.
    _clean_dictionary_file(dest_path, progress_callback)

    is_valid, reason = _verify_file_integrity(dest_path, min_lines, progress_callback)
    if not is_valid:
        log(f"下载的字典文件无效: {reason}")
        return False, f"字典文件无效: {reason}"

    file_hash = _calculate_file_hash(dest_path)
    _save_hash(dest_path, file_hash)
    log(f"已保存字典文件哈希: {file_hash[:16]}...")

    return True, dest_path
380
 
381
 
382
  def download_language_models(
383
  language: str,
384
  output_dir: str,
385
+ progress_callback: Optional[Callable[[str], None]] = None,
386
+ force_download: bool = False
387
  ) -> tuple[bool, str, str]:
388
  """
389
  下载指定语言的声学模型和字典
 
392
  language: 语言代码 (mandarin/japanese)
393
  output_dir: 输出目录
394
  progress_callback: 进度回调
395
+ force_download: 强制重新下载
396
 
397
  返回:
398
  (成功标志, 声学模型路径, 字典路径)
 
411
  # 下载声学模型
412
  log("=" * 40)
413
  log("下载声学模型...")
414
+ success, acoustic_path = download_acoustic_model(
415
+ language, output_dir, progress_callback, force_download
416
+ )
417
  if not success:
418
  return False, "", acoustic_path
419
 
420
  # 下载字典
421
  log("=" * 40)
422
  log("下载字典文件...")
423
+ success, dict_path = download_dictionary(
424
+ language, output_dir, progress_callback, force_download
425
+ )
426
  if not success:
427
  return False, acoustic_path, dict_path
428
 
src/mfa_runner.py CHANGED
@@ -100,6 +100,31 @@ def _build_mfa_env() -> dict:
100
  return env
101
 
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  def run_mfa_alignment(
104
  corpus_dir: str,
105
  output_dir: str,
@@ -149,6 +174,11 @@ def run_mfa_alignment(
149
  if not os.path.isfile(model_path):
150
  return False, f"声学模型不存在: {model_path}"
151
 
 
 
 
 
 
152
  # 创建输出和临时目录
153
  os.makedirs(output_dir, exist_ok=True)
154
  os.makedirs(temp_dir, exist_ok=True)
 
100
  return env
101
 
102
 
103
def _clean_dict_empty_lines(dict_path: str) -> int:
    """Remove blank lines from a dictionary file.

    MFA 3.x raises IndexError when it meets a blank line while parsing a
    dictionary, so whitespace-only lines are dropped. The cleaned content
    is written to a temporary file next to the dictionary and then moved
    over it with ``os.replace``, so a failed write never leaves a
    truncated dictionary behind.

    NOTE(review): if a ``.sha256`` sidecar exists for this dictionary it is
    not refreshed here; confirm whether the downloader's hash check should
    be updated after cleaning.

    Args:
        dict_path: Path of the dictionary file (read and written as UTF-8).

    Returns:
        The number of blank lines removed. Cleaning is best-effort: on any
        failure a warning is logged and 0 is returned.
    """
    temp_path = dict_path + ".cleaning"
    try:
        with open(dict_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Keep only lines that contain something besides whitespace.
        non_empty_lines = [line for line in lines if line.strip()]
        removed_count = len(lines) - len(non_empty_lines)

        if removed_count > 0:
            with open(temp_path, 'w', encoding='utf-8') as f:
                f.writelines(non_empty_lines)
            # Swap the cleaned copy in only after it was written completely.
            os.replace(temp_path, dict_path)

        return removed_count
    except Exception as e:
        logger.warning(f"清理字典文件空行失败: {e}")
        try:
            os.remove(temp_path)
        except OSError:
            pass  # nothing was staged, or the leftover cannot be removed
        return 0
126
+
127
+
128
  def run_mfa_alignment(
129
  corpus_dir: str,
130
  output_dir: str,
 
174
  if not os.path.isfile(model_path):
175
  return False, f"声学模型不存在: {model_path}"
176
 
177
+ # 清理字典文件中的空行(MFA 3.x 不支持空行)
178
+ removed = _clean_dict_empty_lines(dict_path)
179
+ if removed > 0:
180
+ log(f"已清理字典文件中的 {removed} 个空行")
181
+
182
  # 创建输出和临时目录
183
  os.makedirs(output_dir, exist_ok=True)
184
  os.makedirs(temp_dir, exist_ok=True)