yujuanqin
/

TestTranslator

Model card Files Files and versions

yujuanqin committed on Oct 23, 2025

Commit

152a3e8

·

1 Parent(s): 79f9224

update script

Files changed (2) hide show

scripts/caculate_cer.py +1 -1
scripts/run_whisper_finetuned.py +39 -1

scripts/caculate_cer.py CHANGED Viewed

@@ -3,7 +3,7 @@ from lib.utils import run_textdistance, clean_text_for_comparison_zh, highlight_
 # import Levenshtein
-def calculate_distance(reference: str, hypothesis: str) -> float:
     """
     使用 python-Levenshtein 库计算字符错误率 (CER)。

 # import Levenshtein
+def calculate_distance(reference: str, hypothesis: str):
     """
     使用 python-Levenshtein 库计算字符错误率 (CER)。

scripts/run_whisper_finetuned.py CHANGED Viewed

@@ -176,6 +176,44 @@ def run_recordings():
         except Exception as e:
             print(f"{audio.name} -> 失败: {e}")
     save_csv("csv/fine-tune_whisper.csv", rows)
 if __name__ == "__main__":
     # main()
-    run_recordings()

         except Exception as e:
             print(f"{audio.name} -> 失败: {e}")
     save_csv("csv/fine-tune_whisper.csv", rows)
+def run_test_dataset():
+    from scripts.asr_utils import read_dataset
+    model, processor = load_model()
+    test_data = Path("../tests/test_data/dataset.txt")
+    audio_parent = Path("../tests/test_data/")
+    rows = [["file_name", "time", "inference_result"]]
+    result_list = []
+    count = 0
+    try:
+        for audio_path, sentence, duration in read_dataset(test_data):
+            count += 1
+            print(f"processing {count}: {audio_path}")
+            t1 = time.time()
+            text = transcribe_file(
+                str(audio_parent/audio_path), model, processor
+            )
+            t = time.time() - t1
+            print("inference time:", t)
+            print(text)
+            result_list.append({
+                "index": count,
+                "audio_path": audio_path,
+                "reference": sentence,
+                "duration": duration,
+                "inference_time": round(t, 3),
+                "inference_result": text
+            })
+    except Exception as e:
+        print(e)
+    except KeyboardInterrupt as e:
+        print(e)
+    import json
+    with open("csv/whisper_finetuned_dataset_results.json", "w", encoding="utf-8") as f:
+        json.dump(result_list, f, ensure_ascii=False, indent=2)
 if __name__ == "__main__":
     # main()
+    run_test_dataset()