File size: 1,930 Bytes
ade4f6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import csv
import re
import textdistance
import difflib
def replace_symbol(text):
    """Return ``text`` with punctuation marks and newlines stripped out.

    The characters removed are the comma, the period, the ideographic full
    stop, the exclamation mark, the question mark and the newline character.
    Everything else is left as it is.
    """
    return re.sub("[,.,。!?\n]", "", text)
def run_textdistance(text1, text2):
    """Return the Levenshtein distance between two texts and its normalized form.

    The inputs are compared exactly as given; no punctuation stripping is
    applied here.

    Args:
        text1: Reference text; its length is the normalization denominator.
        text2: Text compared against ``text1``.

    Returns:
        A ``(d, nd)`` tuple where ``d`` is the Levenshtein edit distance and
        ``nd`` is ``d / len(text1)``. When ``text1`` is empty the ratio is
        undefined, so ``nd`` is ``0.0`` if ``text2`` is empty too and
        ``float("inf")`` otherwise.
    """
    if not text1:
        # Turning an empty string into text2 takes exactly len(text2)
        # insertions, so the distance is known without calling the library,
        # and dividing by len(text1) == 0 is avoided.
        d = len(text2)
        return d, (0.0 if d == 0 else float("inf"))

    d = textdistance.levenshtein.distance(text1, text2)
    return d, d / len(text1)
def highlight_diff(a, b):
    """Render the difference between ``a`` and ``b`` as one annotated string.

    Segments common to both inputs are copied unchanged. Text present only
    in ``a`` is wrapped as ``[-...-]`` and text present only in ``b`` is
    wrapped as ``{+...+}``; a replacement shows the removed text followed
    directly by the added text.
    """
    pieces = []
    opcodes = difflib.SequenceMatcher(None, a, b).get_opcodes()
    for op, i1, i2, j1, j2 in opcodes:
        removed = a[i1:i2]
        added = b[j1:j2]
        if op == 'equal':
            pieces.append(removed)
            continue
        # 'replace' is simply a deletion immediately followed by an insertion.
        marked = ""
        if op in ('delete', 'replace'):
            marked += f"[-{removed}-]"
        if op in ('insert', 'replace'):
            marked += f"{{+{added}+}}"
        pieces.append(marked)
    return ''.join(pieces)
def read_csv(file_path):
    """Load a CSV file into a dict keyed by each row's first column.

    Args:
        file_path: Path of a UTF-8 encoded CSV file.

    Returns:
        A dict mapping the first field of every non-empty row to that row's
        last field. If a key occurs more than once, the last row wins.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; otherwise newlines inside quoted fields may be
    # misread.
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        # csv.reader yields [] for blank lines; skip those rather than
        # failing with IndexError on row[0].
        return {row[0]: row[-1] for row in csv.reader(csvfile) if row}
def save_csv(file_path, rows):
    """Write ``rows`` to ``file_path`` as a UTF-8 CSV file and report the path.

    Args:
        file_path: Destination path; an existing file is overwritten.
        rows: Iterable of rows, each an iterable of field values.
    """
    # newline="" stops the text layer from translating the "\r\n" that
    # csv.writer emits; without it Windows output becomes "\r\r\n", which
    # shows up as a blank row after every record.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)
    print(f"write csv to {file_path}")
def main():
    """Compare the texts in ``run_funasr.csv`` and ``run_quant.csv``.

    Both files are loaded as ``{first column: last column}`` mappings. For
    every key of ``run_funasr.csv`` that also exists in ``run_quant.csv``,
    the inline diff, the Levenshtein distance and the distance normalized by
    the ``run_funasr.csv`` text length are printed and collected. The result
    is written to ``compare_asr.csv``.

    Keys missing from ``run_quant.csv`` are reported and skipped so that one
    absent entry does not abort the whole comparison.
    """
    funasr_text = read_csv("run_funasr.csv")
    quant_text = read_csv("run_quant.csv")
    print(funasr_text)
    print(quant_text)

    rows = [["file_name", "diff", "distance", "normalized_d"]]
    for key, reference in funasr_text.items():
        candidate = quant_text.get(key)
        if candidate is None:
            # Previously this raised KeyError before any output was saved.
            print(f"skip {key}: not found in run_quant.csv")
            continue
        d, normalized_d = run_textdistance(reference, candidate)
        opt = highlight_diff(reference, candidate)
        print(key, opt, d, normalized_d)
        rows.append([key, opt, d, normalized_d])
    save_csv("compare_asr.csv", rows)
# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
|