File size: 1,930 Bytes

ade4f6a

import csv
import re
import textdistance
import difflib


def replace_symbol(text):
    """Return *text* with punctuation marks and newlines removed.

    The characters stripped are the ASCII comma, period, exclamation mark and
    question mark, the full-width comma and full-width period, and the newline
    character. Every other character is kept unchanged.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any of the characters listed above.
    """
    # Character class: each listed symbol is deleted wherever it occurs.
    return re.compile("[,.，。!?\n]").sub("", text)


def run_textdistance(text1, text2):
    """Compute the Levenshtein distance between two texts, raw and normalized.

    The texts are compared exactly as given; no punctuation stripping is
    applied before measuring.

    Args:
        text1: The reference text. Its length is the normalization denominator.
        text2: The text compared against ``text1``.

    Returns:
        A ``(distance, normalized_distance)`` tuple. ``distance`` is the value
        returned by ``textdistance.levenshtein.distance(text1, text2)``.
        ``normalized_distance`` is ``distance / len(text1)``; when ``text1``
        is empty the denominator is clamped to 1, so the result is simply
        ``distance`` instead of a ``ZeroDivisionError``.
    """
    d = textdistance.levenshtein.distance(text1, text2)
    # Clamp the denominator so an empty reference text does not divide by zero.
    nd = d / max(len(text1), 1)
    return d, nd

def highlight_diff(a, b):
    """Render the difference between *a* and *b* as one inline-marked string.

    The two sequences are aligned with ``difflib.SequenceMatcher``. Spans that
    are common to both are emitted verbatim, spans present only in ``a`` are
    wrapped as ``[-...-]``, and spans present only in ``b`` are wrapped as
    ``{+...+}``. A replaced span is emitted as the removal marker immediately
    followed by the insertion marker.

    Args:
        a: The original text.
        b: The text to compare against ``a``.

    Returns:
        The concatenation of all marked-up spans, in order.
    """
    pieces = []
    opcodes = difflib.SequenceMatcher(None, a, b).get_opcodes()
    for op, i1, i2, j1, j2 in opcodes:
        removed = a[i1:i2]
        added = b[j1:j2]
        if op == 'equal':
            pieces.append(removed)
            continue
        # 'replace' contributes both markers: the removal first, then the insertion.
        if op in ('delete', 'replace'):
            pieces.append(f"[-{removed}-]")
        if op in ('insert', 'replace'):
            pieces.append(f"{{+{added}+}}")
    return ''.join(pieces)

def read_csv(file_path):
    """Load a CSV file into a dict mapping each row's first cell to its last cell.

    Every row is treated as data; no header row is skipped. If the same first
    cell appears in several rows, the last occurrence wins. Rows that parse as
    empty (for example blank lines) are ignored.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A dict of ``{row[0]: row[-1]}`` for every non-empty row.
    """
    res = {}
    # newline='' hands line-ending handling to the csv module, as its docs advise.
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # csv.reader yields [] for a blank line; indexing it would raise IndexError.
            if not row:
                continue
            res[row[0]] = row[-1]
    return res

def save_csv(file_path, rows):
    """Write *rows* to a UTF-8 CSV file and print a confirmation message.

    Any existing file at ``file_path`` is overwritten.

    Args:
        file_path: Destination path of the CSV file.
        rows: An iterable of rows, each an iterable of cell values, passed to
            ``csv.writer.writerows``.
    """
    # newline="" stops the text layer from translating the writer's "\r\n"
    # terminator, which would otherwise produce blank rows on Windows.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(rows)
        print(f"write csv to {file_path}")

def main():
    """Compare the texts in ``run_funasr.csv`` and ``run_quant.csv`` per file.

    Both CSV files are loaded with ``read_csv`` (first cell as key, last cell
    as text) and printed. For every key in ``run_funasr.csv`` that also exists
    in ``run_quant.csv``, the Levenshtein distance, its normalized value, and
    an inline diff are printed and collected. The collected rows are written
    to ``compare_asr.csv`` under the header
    ``file_name, diff, distance, normalized_d``.

    Keys that are missing from ``run_quant.csv`` are reported on stdout and
    skipped, so one absent entry does not abort the whole comparison.
    """
    funasr_text = read_csv("run_funasr.csv")
    quant_text = read_csv("run_quant.csv")
    print(funasr_text)
    print(quant_text)
    rows = [["file_name", "diff", "distance", "normalized_d"]]
    for key, v in funasr_text.items():
        # Look the counterpart up once; a plain quant_text[key] would raise
        # KeyError for entries that only exist in run_funasr.csv.
        quant_v = quant_text.get(key)
        if quant_v is None:
            print(f"skip {key}: not found in run_quant.csv")
            continue
        d, normalized_d = run_textdistance(v, quant_v)
        opt = highlight_diff(v, quant_v)
        print(key, opt, d, normalized_d)
        rows.append([key, opt, d, normalized_d])
    save_csv("compare_asr.csv", rows)


# Run the comparison only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()