|
|
import json |
|
|
from lib.utils import run_textdistance, clean_text_for_comparison_zh, highlight_diff |
|
|
import re |
|
|
|
|
|
|
|
|
def calculate_distance(reference: str, hypothesis: str):
    """Compute the edit distance between a reference and an ASR hypothesis.

    Both strings are normalized with ``clean_text_for_comparison_zh`` and the
    distance is then obtained from ``run_textdistance`` on the normalized
    strings. This function returns the raw distance, not the character error
    rate (CER) itself; a CER can be derived from it as::

        CER = (substitutions + deletions + insertions) / reference characters
            = distance / reference characters

    NOTE(review): the distance is presumably a Levenshtein distance; that
    depends on ``run_textdistance`` -- confirm against its implementation.

    Args:
        reference: Ground-truth transcription.
        hypothesis: Transcription predicted by the ASR model.

    Returns:
        A ``(distance, diff)`` pair. ``distance`` is the first value returned
        by ``run_textdistance``. ``diff`` is the result of ``highlight_diff``
        on the normalized strings when ``distance`` is positive, and an empty
        string otherwise.
    """
    reference_clean = clean_text_for_comparison_zh(reference)
    hypothesis_clean = clean_text_for_comparison_zh(hypothesis)

    # run_textdistance returns two values; only the first one is needed here.
    distance, _ = run_textdistance(reference_clean, hypothesis_clean)

    # Skip the diff rendering entirely when no edits were found.
    if distance > 0:
        return distance, highlight_diff(reference_clean, hypothesis_clean, spliter="")
    return distance, ""
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: read the ASR results, print a per-item distance
    # report, and finish with the corpus-level character error rate (CER).

    # NOTE(review): cn2an is not referenced by the code visible here; the
    # import is kept in case other parts of the workflow rely on it.
    import cn2an

    # Use a context manager so the results file is closed after parsing.
    with open("csv/funasr_wenet_results.json", encoding="utf-8") as results_file:
        results_list = json.load(results_file)

    distance_sum = 0
    reference_sum = 0

    for count, item in enumerate(results_list, start=1):
        reference = item["reference"]
        hypothesis = item["inference_result"]

        distance, diff = calculate_distance(reference, hypothesis)
        print(f"{count}. distance: {distance}")

        # Only items with at least one edit get the detailed report.
        if distance > 0:
            print(f"Audio Path: {item['audio_path']}")
            print(f"Reference: {reference}")
            print(f"Hypothesis: {hypothesis}")
            print(f"Diff: {diff}")

        distance_sum += distance
        # The distance is measured on the cleaned text, so the denominator
        # must be the cleaned reference length too; counting characters of
        # the raw reference would mix two different measures in the CER.
        reference_sum += len(clean_text_for_comparison_zh(reference))

    # Guard against an empty result set (or all-empty references).
    cer = distance_sum / reference_sum if reference_sum > 0 else 0
    print(f"Total Distance: {distance_sum}, Total Reference Length: {reference_sum}, CER: {cer:.4f}")