TNOT committed on
Commit
a383b79
·
1 Parent(s): 3f6e731

fix: validate zip file integrity after MFA model download

Browse files

- Added _is_valid_zip() to check acoustic models are valid zip files
- Detect corrupted/incomplete zip files and re-download automatically
- Prevents 'not a zip file' errors from breaking alignment
- Validates after each download from any mirror source
- Removes corrupted files to force fresh download

Files changed (1) hide show
  1. src/mfa_model_downloader.py +52 -5
src/mfa_model_downloader.py CHANGED
@@ -12,12 +12,42 @@ import urllib.request
12
  import urllib.error
13
  import time
14
  import re
 
15
  from pathlib import Path
16
  from typing import Optional, Callable
17
 
18
  logger = logging.getLogger(__name__)
19
  PROB_PATTERN = re.compile(r"\b(\d+\.\d+|1)\b")
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  # 模型下载基础 URL
22
  GITHUB_RELEASE_BASE = "https://github.com/MontrealCorpusTools/mfa-models/releases/download"
23
  GITHUB_RAW_BASE = "https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main"
@@ -377,17 +407,34 @@ def download_acoustic_model(
377
 
378
  # 检查现有文件
379
  if os.path.exists(dest_path) and not force_download:
380
- # 简单检查:文件存在且大小大于 1MB
381
  file_size = os.path.getsize(dest_path)
382
- if file_size > 1024 * 1024:
383
- log(f"声学模型已存在: {dest_path}")
 
 
 
 
 
384
  return True, dest_path
385
  else:
386
- log(f"声学模型文件异常 (大小: {file_size} bytes),重新下载...")
 
 
 
 
387
 
388
  for candidate_url in _build_mirror_urls(url):
389
  if _download_file(candidate_url, dest_path, progress_callback, retries=2, timeout=300):
390
- return True, dest_path
 
 
 
 
 
 
 
 
 
391
  return False, "声学模型下载失败"
392
 
393
 
 
12
  import urllib.error
13
  import time
14
  import re
15
+ import zipfile
16
  from pathlib import Path
17
  from typing import Optional, Callable
18
 
19
  logger = logging.getLogger(__name__)
20
  PROB_PATTERN = re.compile(r"\b(\d+\.\d+|1)\b")
21
 
22
+
23
+ def _is_valid_zip(file_path: str) -> bool:
24
+ """检查文件是否为有效的 zip 文件
25
+
26
+ 参数:
27
+ file_path: 文件路径
28
+
29
+ 返回:
30
+ 是否为有效的 zip 文件
31
+ """
32
+ if not os.path.exists(file_path):
33
+ return False
34
+
35
+ try:
36
+ with zipfile.ZipFile(file_path, 'r') as zf:
37
+ # 尝试读取 zip 文件的中央目录
38
+ # testzip() 返回 None if OK,返回第一个坏文件名 if 有错
39
+ bad_file = zf.testzip()
40
+ if bad_file:
41
+ logger.warning(f"zip 文件损坏,坏文件: {bad_file}")
42
+ return False
43
+ return True
44
+ except zipfile.BadZipFile:
45
+ logger.warning(f"不是有效的 zip 文件: {file_path}")
46
+ return False
47
+ except Exception as e:
48
+ logger.warning(f"检查 zip 文件异常: {e}")
49
+ return False
50
+
51
  # 模型下载基础 URL
52
  GITHUB_RELEASE_BASE = "https://github.com/MontrealCorpusTools/mfa-models/releases/download"
53
  GITHUB_RAW_BASE = "https://raw.githubusercontent.com/MontrealCorpusTools/mfa-models/main"
 
407
 
408
  # 检查现有文件
409
  if os.path.exists(dest_path) and not force_download:
 
410
  file_size = os.path.getsize(dest_path)
411
+
412
+ # 首先检查大小(快速验证)
413
+ if file_size < 1024 * 1024:
414
+ log(f"声学模型文件异常 (大小: {file_size} bytes),重新下载...")
415
+ # 然后检查 zip 的有效性
416
+ elif _is_valid_zip(dest_path):
417
+ log(f"声学模型已存在且有效: {dest_path}")
418
  return True, dest_path
419
  else:
420
+ log(f"声学模型文件损坏(非有效 zip)删除并重新下载...")
421
+ try:
422
+ os.remove(dest_path)
423
+ except Exception as e:
424
+ log(f"删除损坏文件失败: {e}")
425
 
426
  for candidate_url in _build_mirror_urls(url):
427
  if _download_file(candidate_url, dest_path, progress_callback, retries=2, timeout=300):
428
+ # 下载完成后验证 zip 有效性
429
+ if _is_valid_zip(dest_path):
430
+ log(f"声学模型下载并验证成功: {dest_path}")
431
+ return True, dest_path
432
+ else:
433
+ log(f"下载的文件不是有效的 zip,尝试下一个源...")
434
+ try:
435
+ os.remove(dest_path)
436
+ except Exception:
437
+ pass
438
  return False, "声学模型下载失败"
439
 
440