|
|
import csv |
|
|
import re |
|
|
import textdistance |
|
|
import difflib |
|
|
|
|
|
|
|
|
def replace_symbol(text):
    """Return *text* with the listed punctuation marks and newlines removed.

    Every occurrence of a character in the set ``, . 。 ! ?`` or of the
    newline character is deleted; all other characters are kept in order.
    """
    # The character class lists each symbol to drop (the comma appears twice,
    # which is harmless inside a class).
    stripper = re.compile("[,.,。!?\n]")
    return stripper.sub("", text)
|
|
|
|
|
|
|
|
def run_textdistance(text1, text2):
    """Compute the Levenshtein distance between two texts and normalize it.

    Args:
        text1: Reference sequence; its length is the normalization denominator.
        text2: Sequence compared against ``text1``.

    Returns:
        tuple: ``(d, nd)`` where ``d`` is the Levenshtein edit distance and
        ``nd`` is ``d / len(text1)``. When ``text1`` is empty the denominator
        is treated as 1, so ``nd == float(d)`` (``0.0`` if both are empty)
        instead of raising ``ZeroDivisionError``.
    """
    ref_len = len(text1)
    if ref_len == 0:
        # Turning an empty sequence into text2 takes exactly len(text2)
        # insertions, so the distance is known without calling the library,
        # and dividing by a zero reference length is avoided.
        d = len(text2)
        return d, float(d)

    d = textdistance.levenshtein.distance(text1, text2)
    return d, d / ref_len
|
|
|
|
|
def highlight_diff(a, b):
    """Render the differences between *a* and *b* as one annotated string.

    Segments common to both inputs are copied through unchanged. Segments
    present only in *a* are wrapped as ``[-...-]`` and segments present only
    in *b* as ``{+...+}``; a replacement emits the removed part followed by
    the added part.
    """
    pieces = []
    opcodes = difflib.SequenceMatcher(None, a, b).get_opcodes()
    for op, i1, i2, j1, j2 in opcodes:
        removed = a[i1:i2]
        added = b[j1:j2]
        if op == 'equal':
            pieces.append(removed)
            continue
        # A 'replace' is rendered as a deletion immediately followed by an
        # insertion, so it passes both checks below.
        if op in ('delete', 'replace'):
            pieces.append("[-{}-]".format(removed))
        if op in ('insert', 'replace'):
            pieces.append("{{+{}+}}".format(added))
    return ''.join(pieces)
|
|
|
|
|
def read_csv(file_path):
    """Load a CSV file into a dict mapping each row's first column to its last.

    Args:
        file_path: Path of a UTF-8 encoded CSV file.

    Returns:
        dict: ``{row[0]: row[-1]}`` for every non-empty row, in file order.
        If a first-column value occurs more than once, the last row wins; a
        single-column row maps that value to itself.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires, so newlines inside quoted fields survive intact.
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        # csv.reader yields [] for blank lines; skip them rather than letting
        # row[0] raise IndexError.
        return {row[0]: row[-1] for row in csv.reader(csvfile) if row}
|
|
|
|
|
def save_csv(file_path, rows):
    """Write *rows* to a UTF-8 CSV file and print the destination path.

    Args:
        file_path: Destination path; an existing file is overwritten.
        rows: Iterable of rows, each an iterable of field values.
    """
    # newline="" stops the text layer from translating the "\r\n" that the
    # csv writer emits; without it Windows output gets "\r\r\n", which shows
    # up as a blank line after every row.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)
    print(f"write csv to {file_path}")
|
|
|
|
|
def main():
    """Compare the transcripts in two result CSVs and write a diff report.

    Reads ``run_funasr.csv`` and ``run_quant.csv``, pairs their entries by
    key, and for each pair computes the edit distance, its normalized value
    and an inline diff. One row per pair is printed and then written to
    ``compare_asr.csv`` under a header row.

    Keys found in ``run_funasr.csv`` but absent from ``run_quant.csv`` are
    reported and skipped, so one missing entry does not abort the whole run
    with ``KeyError`` before any output is written.
    """
    funasr_text = read_csv("run_funasr.csv")
    quant_text = read_csv("run_quant.csv")
    print(funasr_text)
    print(quant_text)

    rows = [["file_name", "diff", "distance", "normalized_d"]]
    for key, v in funasr_text.items():
        # Look the counterpart up once; values read from CSV are always
        # strings, so None unambiguously means "key not present".
        quant_v = quant_text.get(key)
        if quant_v is None:
            print(f"skip {key}: not found in run_quant.csv")
            continue

        d, normalized_d = run_textdistance(v, quant_v)
        opt = highlight_diff(v, quant_v)
        print(key, opt, d, normalized_d)
        rows.append([key, opt, d, normalized_d])

    save_csv("compare_asr.csv", rows)
|
|
|
|
|
|
|
|
# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|