import json
from lib.utils import run_textdistance, clean_text_for_comparison_zh, highlight_diff
import re
# import Levenshtein

def calculate_distance(reference: str, hypothesis: str):
    """
    使用 python-Levenshtein 库计算字符错误率 (CER)。

    CER = (Substitutions + Deletions + Insertions) / Total Characters in Reference
        = Levenshtein Distance / Total Characters in Reference

    Args:
        reference: 真实的文本转录 (Ground Truth)。
        hypothesis: ASR 模型的预测结果。

    Returns:
        字符错误率 (CER)。
    """
    text1_clean = clean_text_for_comparison_zh(reference)
    text2_clean = clean_text_for_comparison_zh(hypothesis)
    d, nd = run_textdistance(text1_clean, text2_clean)
    diff = ""
    if d > 0:
        diff = highlight_diff(text1_clean, text2_clean, spliter="")
    return d, diff

if __name__ == '__main__':
    import cn2an
    results_list = json.load(open("csv/funasr_wenet_results.json", encoding="utf-8"))
    count = 0
    distance_sum = 0
    reference_sum = 0
    for item in results_list:
        count += 1
        reference = item["reference"]
        hypothesis = item["inference_result"]
        # # 如果是 whisper，使用 cn2an替换数字为中文
        # if re.search(r"\d", hypothesis):
        #     hypothesis = cn2an.transform(hypothesis, "an2cn")
        distance, diff = calculate_distance(reference, hypothesis)
        print(f"{count}. distance: {distance}")
        if distance > 0:
            print(f"Audio Path: {item['audio_path']}")
            print(f"Reference: {reference}")
            print(f"Hypothesis: {hypothesis}")
            print(f"Diff: {diff}")
        distance_sum += distance
        reference_sum += len(reference)
    cer = distance_sum / reference_sum if reference_sum > 0 else 0
    print(f"Total Distance: {distance_sum}, Total Reference Length: {reference_sum}, CER: {cer:.4f}")