# TestTranslator / scripts / compare_text.py
# yujuanqin's picture
# init project code
# ade4f6a
import csv
import re
import textdistance
import difflib
def replace_symbol(text):
    """Return *text* with punctuation marks and newlines removed.

    Every occurrence of a comma, full stop, ideographic full stop (``。``),
    exclamation mark, question mark or newline is deleted; all other
    characters are left untouched.
    """
    punctuation = re.compile("[,.,。!?\n]")
    return punctuation.sub("", text)
def run_textdistance(text1, text2):
    """Compute the Levenshtein distance between two texts.

    Args:
        text1: Reference text; its length is the normalisation denominator.
        text2: Text compared against the reference.

    Returns:
        A ``(distance, normalized_distance)`` tuple. The normalised value is
        the raw distance divided by ``len(text1)``. When ``text1`` is empty
        the denominator is clamped to 1, so the normalised value equals the
        raw distance instead of raising ``ZeroDivisionError``.
    """
    # NOTE: the texts are compared as given; punctuation is not stripped
    # with replace_symbol() before measuring.
    d = textdistance.levenshtein.distance(text1, text2)
    # Guard against an empty reference, which would otherwise divide by zero.
    nd = d / max(len(text1), 1)
    return d, nd
def highlight_diff(a, b):
    """Render the differences between *a* and *b* as one annotated string.

    Unchanged stretches are copied verbatim from *a*. Text present only in
    *a* is wrapped as ``[-...-]`` and text present only in *b* is wrapped as
    ``{+...+}``; a replacement emits the removed part followed directly by
    the added part.
    """
    def render(tag, removed, added):
        # Unchanged text passes through without any markers.
        if tag == 'equal':
            return removed
        pieces = []
        if tag in ('delete', 'replace'):
            pieces.append(f"[-{removed}-]")
        if tag in ('insert', 'replace'):
            pieces.append(f"{{+{added}+}}")
        return ''.join(pieces)

    opcodes = difflib.SequenceMatcher(None, a, b).get_opcodes()
    return ''.join(
        render(tag, a[a_lo:a_hi], b[b_lo:b_hi])
        for tag, a_lo, a_hi, b_lo, b_hi in opcodes
    )
def read_csv(file_path):
    """Load a UTF-8 CSV file into a ``{first column: last column}`` dict.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A dict mapping each row's first field to its last field. If the same
        first field occurs in several rows, the last occurrence wins. Blank
        rows are ignored.
    """
    # newline='' is what the csv module expects: it keeps newlines embedded
    # in quoted fields intact instead of letting text mode translate them.
    with open(file_path, 'r', encoding='utf-8', newline='') as csvfile:
        # Blank lines parse to an empty list, which has no row[0]; skip them.
        return {row[0]: row[-1] for row in csv.reader(csvfile) if row}
def save_csv(file_path, rows):
    """Write *rows* to *file_path* as a UTF-8 CSV file and report the path.

    Args:
        file_path: Destination path; an existing file is overwritten.
        rows: Iterable of rows, each an iterable of field values.
    """
    # newline="" leaves line endings to the csv writer; without it, platforms
    # that translate "\n" on write turn each "\r\n" row terminator into
    # "\r\r\n", which shows up as a blank line after every row.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f).writerows(rows)
    print(f"write csv to {file_path}")
def main():
    """Compare the texts in ``run_funasr.csv`` and ``run_quant.csv``.

    Both files are loaded with ``read_csv``. For every key of the first file
    that also exists in the second, the Levenshtein distance and an inline
    diff of the two texts are computed, printed, and collected. The collected
    rows are written to ``compare_asr.csv``. Keys missing from the second
    file are reported and skipped rather than aborting the whole run.
    """
    funasr_text = read_csv("run_funasr.csv")
    quant_text = read_csv("run_quant.csv")
    print(funasr_text)
    print(quant_text)
    rows = [["file_name", "diff", "distance", "normalized_d"]]
    for key, reference in funasr_text.items():
        hypothesis = quant_text.get(key)
        if hypothesis is None:
            # A file present in only one CSV must not lose the results
            # gathered for all the other files.
            print(f"skip {key}: no entry in run_quant.csv")
            continue
        d, normalized_d = run_textdistance(reference, hypothesis)
        opt = highlight_diff(reference, hypothesis)
        print(key, opt, d, normalized_d)
        rows.append([key, opt, d, normalized_d])
    save_csv("compare_asr.csv", rows)
# Run the comparison only when this file is executed as a script, so that
# importing the module for its helpers has no side effects.
if __name__ == '__main__':
    main()