Spaces:

TNOT
/

JinrikiHelper

Running

TNOT committed 27 days ago

Commit

c3a9a08

1 Parent(s): 56fcba9

fix: clean and validate MFA dictionaries by whitespace tokens

Files changed (2) hide show

src/mfa_model_downloader.py CHANGED Viewed

@@ -130,13 +130,14 @@ def _verify_file_integrity(
         try:
             valid_line_count = 0
             invalid_line_count = 0
-            with open(file_path, 'r', encoding='utf-8') as f:
                 for line in f:
                     stripped = line.strip()
                     if not stripped:
                         continue
-                    # 检查是否有制表符分隔（字典格式要求）
-                    if '\t' in line:
                         valid_line_count += 1
                     else:
                         invalid_line_count += 1
@@ -277,18 +278,27 @@ def _clean_dictionary_file(
     返回: 清理的空行数量
     """
     try:
-        with open(dict_path, 'r', encoding='utf-8') as f:
             lines = f.readlines()
-        # 过滤空行
-        non_empty_lines = [line for line in lines if line.strip()]
-        removed_count = len(lines) - len(non_empty_lines)
         if removed_count > 0:
             with open(dict_path, 'w', encoding='utf-8') as f:
-                f.writelines(non_empty_lines)
             if progress_callback:
-                progress_callback(f"已清理 {removed_count} 个空行")
         return removed_count
     except Exception as e:

         try:
             valid_line_count = 0
             invalid_line_count = 0
+            with open(file_path, 'r', encoding='utf-8-sig') as f:
                 for line in f:
                     stripped = line.strip()
                     if not stripped:
                         continue
+                    # MFA 字典支持任意空白分隔，至少需 2 列
+                    if len(stripped.split()) >= 2:
                         valid_line_count += 1
                     else:
                         invalid_line_count += 1
     返回: 清理的空行数量
     """
     try:
+        with open(dict_path, 'r', encoding='utf-8-sig') as f:
             lines = f.readlines()
+        # 过滤空行与格式异常行（至少应有 2 个 token）
+        cleaned_lines = []
+        removed_count = 0
+        for line in lines:
+            stripped = line.strip()
+            if not stripped:
+                removed_count += 1
+                continue
+            if len(stripped.split()) < 2:
+                removed_count += 1
+                continue
+            cleaned_lines.append(line)
         if removed_count > 0:
             with open(dict_path, 'w', encoding='utf-8') as f:
+                f.writelines(cleaned_lines)
             if progress_callback:
+                progress_callback(f"已清理 {removed_count} 个空行/无效行")
         return removed_count
     except Exception as e:

src/mfa_runner.py CHANGED Viewed

@@ -225,23 +225,28 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
     返回: 清理的无效行数量
     """
     try:
-        with open(dict_path, 'r', encoding='utf-8') as f:
             lines = f.readlines()
         original_count = len(lines)
-        # 过滤空行和只有空白字符的行
-        # 同时过滤没有制表符分隔的无效行（字典格式: word\tprob\t...）
         valid_lines = []
         for line in lines:
             stripped = line.strip()
             # 跳过空行
             if not stripped:
                 continue
-            # 跳过没有制表符的行（无效格式）
-            if '\t' not in line:
-                logger.warning(f"跳过无效字典行: {stripped[:50]}...")
                 continue
             valid_lines.append(line)
         removed_count = original_count - len(valid_lines)
@@ -249,7 +254,10 @@ def _clean_dict_empty_lines(dict_path: str) -> int:
         if removed_count > 0:
             with open(dict_path, 'w', encoding='utf-8') as f:
                 f.writelines(valid_lines)
-            logger.info(f"字典文件清理完成: 原 {original_count} 行, 现 {len(valid_lines)} 行, 移除 {removed_count} 行")
         else:
             logger.info(f"字典文件无需清理: {original_count} 行")

     返回: 清理的无效行数量
     """
     try:
+        # utf-8-sig: 自动兼容可能存在的 BOM
+        with open(dict_path, 'r', encoding='utf-8-sig') as f:
             lines = f.readlines()
         original_count = len(lines)
+        # 过滤空行和无效行
+        # MFA 字典允许空白分隔，不应强制要求制表符
+        # 合法行至少应有 2 个 token（词条 + 至少一个音素）
         valid_lines = []
+        malformed_count = 0
         for line in lines:
             stripped = line.strip()
             # 跳过空行
             if not stripped:
                 continue
+            tokens = stripped.split()
+            if len(tokens) < 2:
+                malformed_count += 1
                 continue
             valid_lines.append(line)
         removed_count = original_count - len(valid_lines)
         if removed_count > 0:
             with open(dict_path, 'w', encoding='utf-8') as f:
                 f.writelines(valid_lines)
+            logger.info(
+                f"字典文件清理完成: 原 {original_count} 行, 现 {len(valid_lines)} 行, "
+                f"移除 {removed_count} 行（其中格式异常 {malformed_count} 行）"
+            )
         else:
             logger.info(f"字典文件无需清理: {original_count} 行")