"""Compare two sets of ASR transcripts and write a per-file diff report.

Each input CSV maps a file name (first column) to a transcript (last column).
For every file present in both inputs the script computes the Levenshtein
distance, a length-normalized distance, and an inline diff, then writes the
results to an output CSV.
"""

import csv
import difflib
import re
import sys

try:
    import textdistance
except ImportError:  # optional dependency; a pure-Python fallback is used
    textdistance = None

# Punctuation (ASCII and full-width) and newlines removed by replace_symbol.
_SYMBOL_PATTERN = re.compile("[,.,。!?\n]")


def replace_symbol(text):
    """Return *text* with punctuation marks and newlines removed."""
    return _SYMBOL_PATTERN.sub("", text)


def _levenshtein(a, b):
    """Return the Levenshtein edit distance between sequences *a* and *b*."""
    if a == b:
        return 0
    # Keep the inner row as short as possible.
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, item_a in enumerate(a, start=1):
        current = [i]
        for j, item_b in enumerate(b, start=1):
            substitution = previous[j - 1] + (1 if item_a != item_b else 0)
            current.append(min(previous[j] + 1, current[j - 1] + 1, substitution))
        previous = current
    return previous[-1]


def run_textdistance(text1, text2):
    """Return the Levenshtein distance between two texts and its normalization.

    The texts are compared as given; punctuation is not stripped (callers can
    apply ``replace_symbol`` first if that is wanted).

    Args:
        text1: Reference text; its length is the normalization denominator.
        text2: Text compared against the reference.

    Returns:
        A ``(distance, normalized_distance)`` tuple, where the normalized
        distance is ``distance / len(text1)``.  When ``text1`` is empty the
        ratio is undefined, so it is reported as ``0.0`` if both texts are
        empty and ``float("inf")`` otherwise, instead of raising
        ``ZeroDivisionError``.
    """
    if textdistance is not None:
        d = textdistance.levenshtein.distance(text1, text2)
    else:
        d = _levenshtein(text1, text2)
    if not text1:
        return d, (0.0 if d == 0 else float("inf"))
    return d, d / len(text1)


def highlight_diff(a, b):
    """Return an inline diff of *a* against *b*.

    Unchanged spans are copied verbatim, spans only in *a* are wrapped as
    ``[-...-]`` and spans only in *b* as ``{+...+}``; a replacement is emitted
    as a deletion immediately followed by an insertion.
    """
    matcher = difflib.SequenceMatcher(None, a, b)
    output = []
    for tag, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        if tag == "equal":
            output.append(a[a_start:a_end])
        elif tag == "delete":
            output.append(f"[-{a[a_start:a_end]}-]")
        elif tag == "insert":
            output.append(f"{{+{b[b_start:b_end]}+}}")
        elif tag == "replace":
            output.append(f"[-{a[a_start:a_end]}-]{{+{b[b_start:b_end]}+}}")
    return "".join(output)


def read_csv(file_path):
    """Read a CSV file into a dict mapping first column to last column.

    Empty rows (e.g. blank lines) are skipped.  If a key occurs more than
    once, the last occurrence wins.
    """
    res = {}
    # newline="" lets the csv module handle line endings and quoted newlines.
    with open(file_path, "r", encoding="utf-8", newline="") as csvfile:
        for row in csv.reader(csvfile):
            if not row:
                continue
            res[row[0]] = row[-1]
    return res


def save_csv(file_path, rows):
    """Write *rows* (an iterable of sequences) to *file_path* as UTF-8 CSV."""
    # newline="" prevents "\r\r\n" line endings (blank rows) on Windows.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)
    print(f"write csv to {file_path}")


def main(
    funasr_path="run_funasr.csv",
    quant_path="run_quant.csv",
    output_path="compare_asr.csv",
):
    """Compare the transcripts in two CSV files and save a diff report.

    Args:
        funasr_path: CSV with the reference transcripts.
        quant_path: CSV with the transcripts to compare against the reference.
        output_path: Destination CSV with one row per compared file.

    Files that appear in ``funasr_path`` but not in ``quant_path`` are skipped
    with a warning on stderr rather than aborting the whole run.
    """
    funasr_text = read_csv(funasr_path)
    quant_text = read_csv(quant_path)
    print(funasr_text)
    print(quant_text)

    rows = [["file_name", "diff", "distance", "normalized_d"]]
    for key, reference in funasr_text.items():
        if key not in quant_text:
            print(f"skip {key}: not found in {quant_path}", file=sys.stderr)
            continue
        candidate = quant_text[key]
        d, normalized_d = run_textdistance(reference, candidate)
        opt = highlight_diff(reference, candidate)
        print(key, opt, d, normalized_d)
        rows.append([key, opt, d, normalized_d])

    save_csv(output_path, rows)


if __name__ == "__main__":
    main()