yujuanqin committed on
Commit
5627195
·
1 Parent(s): 27d3144

update text comparison

Browse files
Files changed (3) hide show
  1. lib/report.py +13 -3
  2. lib/temp.py +41 -0
  3. lib/utils.py +32 -14
lib/report.py CHANGED
@@ -4,7 +4,7 @@ from dataclasses import dataclass, astuple
4
 
5
  from tabulate import tabulate
6
  from lib.log_parser import LogTag, LogItem, WebItem
7
- from lib.utils import save_csv, run_textdistance, highlight_diff, time_to_float
8
 
9
  class LogReport:
10
  """用于处理 log文件"""
@@ -240,8 +240,18 @@ class AccuracyItem:
240
  asr_accuracy: tuple= (0,1)
241
  text_compare: str = ""
242
  def __post_init__(self):
243
- self.asr_accuracy = run_textdistance(self.audio_text, self.src_text)
244
- self.text_compare = highlight_diff(self.audio_text, self.src_text)
 
 
 
 
 
 
 
 
 
 
245
  def to_list(self):
246
  return [self.audio, self.translation_type, self.audio_length,
247
  self.asr_accuracy[0], self.asr_accuracy[1],
 
4
 
5
  from tabulate import tabulate
6
  from lib.log_parser import LogTag, LogItem, WebItem
7
+ from lib.utils import *
8
 
9
  class LogReport:
10
  """用于处理 log文件"""
 
240
  asr_accuracy: tuple= (0,1)
241
  text_compare: str = ""
242
  def __post_init__(self):
243
+
244
+ if self.translation_type == "en2zh":
245
+ text1 = clean_text_for_comparison_en(self.audio_text)
246
+ text2 = clean_text_for_comparison_en(self.src_text)
247
+ spliter = " "
248
+ else:
249
+ text1 = clean_text_for_comparison_zh(self.audio_text)
250
+ text2 = clean_text_for_comparison_zh(self.src_text)
251
+ spliter = ""
252
+ self.asr_accuracy = run_textdistance(text1, text2)
253
+ self.text_compare = highlight_diff(text1, text2, spliter)
254
+
255
  def to_list(self):
256
  return [self.audio, self.translation_type, self.audio_length,
257
  self.asr_accuracy[0], self.asr_accuracy[1],
lib/temp.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import difflib
3
+
4
+
5
def replace_symbol(text):
    """Return *text* lowercased with the listed punctuation and newlines removed."""
    without_symbols = re.sub("[,.,。、!?\n]", "", text)
    return without_symbols.lower()
9
def tokenize(text):
    """Split *text* on single spaces.

    Consecutive spaces produce empty-string tokens, and an empty input
    yields a list holding one empty string.
    """
    separator = " "
    return text.split(separator)
12
def highlight_diff(a, b):
    """Return a word-level inline diff of *a* against *b*.

    Both texts are first normalised with ``replace_symbol`` and split with
    ``tokenize``. Unchanged runs are emitted as-is, runs present only in *a*
    are wrapped as ``[-...-]`` and runs present only in *b* as ``{+...+}``;
    a replacement emits both markers back to back. Segments are joined with
    single spaces.
    """
    tokens_a = tokenize(replace_symbol(a))
    tokens_b = tokenize(replace_symbol(b))

    # autojunk=False: the default heuristic silently ignores tokens that are
    # frequent in inputs of 200+ tokens, which degrades diffs of long texts.
    matcher = difflib.SequenceMatcher(None, tokens_a, tokens_b, autojunk=False)

    output = []
    for tag, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        deleted = ' '.join(tokens_a[a_start:a_end])
        inserted = ' '.join(tokens_b[b_start:b_end])
        if tag == 'equal':
            output.append(deleted)
        elif tag == 'delete':
            output.append(f"[-{deleted}-]")
        elif tag == 'insert':
            output.append(f"{{+{inserted}+}}")
        elif tag == 'replace':
            output.append(f"[-{deleted}-]{{+{inserted}+}}")

    return ' '.join(output)
36
+
37
+
38
if __name__ == '__main__':
    # Manual smoke run: diff two transcripts of the same English passage.
    text_1 = "We built computers to expand our brains. Originally scientists built computers to solve arithmetic, but they turned out to be incredibly useful for many other things as well. Running the entire internet, lifelike graphics, artificial brains, or simulating the universe. But amazingly all of it boils down to just flipping zeros and ones. Computers have become smaller and more powerful at an incredible rate. There was more computing power in your cell phone than there was in the entire world in the mid-1960s. And the entire Apollo moon landing could have been run on a couple of Nintendos. Computer science is a subject that studies what computers can do. It's a diverse and overlapping field, but I'm going to split it into three parts. The fundamental theory of computer science, computer engineering, and applications."
    text_2 = "We built computers to expand our brains. Originally, scientists built computers to solve arithmetic, but they turned out to be incredibly useful for many other things as well. running the entire internet, life-like graphics, artificial brains, or simulating the universe. But amazingly, all of it boils down to just flipping zeros and ones. Computers have become smaller, more powerful and at an incredible rate. There's more computing power in your cell phone than there was in the entire world in the mid-1960s. And the entire Apollo moon landing could have been run on a couple of Nintendos. Computer science is a subject that studies what computers can do. It's a diverse and overlapping field, but I'm going to split it into three parts. The fundamental theory of computer science, computer engineering, and communications. "
    diff_text = highlight_diff(text_1, text_2)
    print(diff_text)
lib/utils.py CHANGED
@@ -37,34 +37,45 @@ def cmd(command: str, check=True, capture_output=False) -> CompletedProcess:
37
  print(ret.stdout)
38
  return ret
39
 
40
- def replace_symbol(text):
41
- symbol_pattern = "[,.,。!?\n]"
 
 
 
 
 
42
  to = ""
43
  return re.sub(symbol_pattern, to, text).lower()
44
 
45
 
46
  def run_textdistance(text1, text2):
47
- text1 = replace_symbol(text1)
48
- text2 = replace_symbol(text2)
49
  d = textdistance.levenshtein.distance(text1, text2)
50
  nd = d / len(text1)
51
  # print("Levenshtein distance of texts:", d, "normalized distance is:", nd)
52
  return d, nd
53
 
54
- def highlight_diff(a, b):
55
- a, b = replace_symbol(a), replace_symbol(b)
 
 
56
  matcher = difflib.SequenceMatcher(None, a, b)
57
  output = []
 
58
  for tag, a_start, a_end, b_start, b_end in matcher.get_opcodes():
59
  if tag == 'equal':
60
- output.append(a[a_start:a_end])
61
  elif tag == 'delete':
62
- output.append(f"[-{a[a_start:a_end]}-]")
 
63
  elif tag == 'insert':
64
- output.append(f"{{+{b[b_start:b_end]}+}}")
 
65
  elif tag == 'replace':
66
- output.append(f"[-{a[a_start:a_end]}-]{{+{b[b_start:b_end]}+}}")
67
- return ''.join(output)
 
 
 
68
 
69
  def time_to_float(s: str):
70
  if d := s.replace("s", ""):
@@ -73,7 +84,14 @@ def time_to_float(s: str):
73
 
74
 
75
  if __name__ == '__main__':
76
- text_1 = "This sequence of events is an example of what is known as the Butterfly Effect, a manifestation of Chaos Theory. For many centuries, the world was explained through the laws of Isaac Newton in classical physics. According to these laws, if the current state of an object is known, its future behavior can be predicted with relative ease. Chaos Theory questions this deterministic vision. Not everything is predictable anymore, nor does it work like a quirk. Since the 1800s, mathematicians have raised the idea that not all phenomena could be predicted by Newtonian laws. But a meteorologist named Edward Lawrence made Chaos Theory a visible phenomenon. It all started in 1961, when he was working on a mathematical model to forecast the weather. Lawrence entered data such as temperature, humidity, pressure, and wind direction into his computer. His computer would draw a graph modeling what the weather would be like. Not always accurate, but very close to reality. G."
77
- text_2 = "This sequence of events is an example of what is known as the butterfly effect, a manifestation of chaos theory. For many centuries, the world was explained through the laws of Isaac Newton in classical physics. According to these laws, if the current state of an object is known, Its future behavior can be predicted with relative ease. Chaos theory questions this deterministic vision. Not everything is predictable anymore. nor does it work like clockwork. Since the 1800s, mathematicians have raised the idea that not all phenomena could be predicted by Newtonian laws. But a meteorologist named Edward Lawrence made Kale's theory a visible phenomenon. It all started in 1961, when he was working on a mathematical model to forecast the weather. Lawrence entered data such as temperature, humidity, pressure, and wind direction into his computer. His computer would draw a graph, modeling what the weather would be like. Not always accurate, but very close to reality. "
 
 
 
 
 
78
  print(run_textdistance(text_1, text_2))
79
- print(highlight_diff(text_1, text_2))
 
 
 
37
  print(ret.stdout)
38
  return ret
39
 
40
def clean_text_for_comparison_zh(text):
    """Return *text* lowercased with spaces, listed punctuation and newlines removed."""
    without_symbols = re.sub("[ ,。、!?\n]", "", text)
    return without_symbols.lower()
44
+
45
def clean_text_for_comparison_en(text):
    """Return *text* lowercased with commas, periods and newlines removed.

    Spaces are kept so the result can still be split into words.
    """
    without_symbols = re.sub("[,.\n]", "", text)
    return without_symbols.lower()
49
 
50
 
51
  def run_textdistance(text1, text2):
 
 
52
  d = textdistance.levenshtein.distance(text1, text2)
53
  nd = d / len(text1)
54
  # print("Levenshtein distance of texts:", d, "normalized distance is:", nd)
55
  return d, nd
56
 
57
def highlight_diff(a, b, spliter=""):
    """Return an inline diff of *a* against *b*.

    Args:
        a: first text.
        b: second text.
        spliter: token separator. When non-empty, both texts are split on it
            and compared token by token; when empty (the default) they are
            compared character by character.

    Returns:
        A single string in which unchanged runs appear as-is, runs present
        only in *a* are wrapped as ``[-...-]`` and runs present only in *b*
        as ``{+...+}`` (a replacement emits both, back to back). Segments are
        joined with ``spliter``.
    """
    if spliter:
        a = a.split(spliter)
        b = b.split(spliter)

    # autojunk=False: by default SequenceMatcher ignores elements that are
    # frequent in sequences of 200+ items. For character-level comparison of
    # long texts that discards common characters and turns real matches into
    # large replace blocks.
    matcher = difflib.SequenceMatcher(None, a, b, autojunk=False)

    output = []
    for tag, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        deleted = spliter.join(a[a_start:a_end])
        inserted = spliter.join(b[b_start:b_end])
        if tag == 'equal':
            output.append(deleted)
        elif tag == 'delete':
            output.append(f"[-{deleted}-]")
        elif tag == 'insert':
            output.append(f"{{+{inserted}+}}")
        elif tag == 'replace':
            output.append(f"[-{deleted}-]{{+{inserted}+}}")

    return spliter.join(output)
79
 
80
  def time_to_float(s: str):
81
  if d := s.replace("s", ""):
 
84
 
85
 
86
  if __name__ == '__main__':
87
+ text_1 = "This sequence of events is an example of what is known as the Butterfly Effect, a manifestation of Chaos Theory. For many centuries, the world was explained through the laws of Isaac Newton in classical physics. According to these laws, if the current state of an object is known, its future behavior can be predicted with relative ease. Chaos Theory questions this deterministic vision. Not everything is predictable anymore, nor does it work like a clockwork. Since the 1800s, mathematicians have raised the idea that not all phenomena could be predicted by Newtonian laws. But a meteorologist named Edward Lawrence made Chaos Theory a visible phenomenon. It all started in 1961, when he was working on a mathematical model to forecast the weather. Lawrence entered data such as temperature, humidity, pressure, and wind direction into his computer. His computer would draw a graph modeling what the weather would be like. Not always accurate, but very close to reality."
88
+ text_2 = "of events is an example of what is known as the butterfly effect, a manifestation of chaos theory. For many centuries, the world was explained through the laws of Isaac Newton in classical physics. According to these laws, if the current state of an object is known, Its future behavior can be predicted with relative ease. Chaos theory questions this deterministic vision. Not everything is predictable anymore. nor does it work like clockwork. Since the 1800s, mathematicians have raised the idea that not all phenomena could be predicted by Newtonian laws. but a meteorologist named Edward Lawrence made chaos theory a visible phenomenon. It all started in 1961. when he was working on a mathematical model to forecast the weather. Lawrence entered data such as temperature, humidity, pressure and wind direction into his computer. His computer would draw a graph, modeling what the weather would be like. not always accurate, but very close to reality. "
89
+ text_3 = "后来我自己总结啊,微积分这么难,入门主要有几个原因。首先呢大部分的教材为了追求严谨,从一开始就使用了现代数学的这个所谓极限的概念。在它的基础之上向你介绍微积分。呃,问题是它是一个非常抽象的概念。对于大部分在接触微积分之前啊,主要的学习经验就是刷题啊,甚至是连题也不刷的。同学们来说呢,这种抽象语言会很陌生。而且如果你去了解微积分的历史的时候,你会发现极限这个概念啊是微积分创立之后大概一两百年才出现的这么个东西。你等于说我们现在公认的这些微积分的创始人,这些大佬们、牛顿啊、莱布尼茨、欧拉啊,连他们都不知道极限是什么。但是人家就是凭着直觉创建了微积分,当然作为教科书嘛,追求严谨无可厚非啊。虽然对于我来讲,过早的追求这种严谨,导致学习的人入门困难甚至入不了门。"
90
+ text_4 = "后来我自己总结啊,微积分这么难,入门主要有几个原因。首先呢大部分的教材为了追求严谨。从一开始就使用了现代数学的这个所谓极限的概念在它的基础之上。向你介绍微积分呃,问题是它是一个非常抽象的概念。对于大部分在接触微积分之前。主要的学习经验就是刷题,甚至是连题也不刷的。同学们来说呢,这种抽象语言会很陌生。而且如果你去了解微积分的历史的时候,你会发现极限这个概念啊是微积分创立之后大概一两百年。才出现的这么个东西。你等于说我们现在公认的这些微积分的创始人,这些大佬。牛顿啊、莱布尼茨啊、欧拉啊啊,连他们都不知道极限是谁,但是人家就是凭着直觉。创建了危机,当然作为教科书嘛追求严谨,无可厚非啊,虽然对于我来讲。过早的追求这种严谨,导致学习的人入门困难,甚至入不了门。"
91
+
92
+ text_1, text_2 = clean_text_for_comparison_en(text_1), clean_text_for_comparison_en(text_2)
93
+ text_3, text_4 = clean_text_for_comparison_zh(text_3), clean_text_for_comparison_zh(text_4)
94
  print(run_textdistance(text_1, text_2))
95
+ print(highlight_diff(text_1, text_2, " "))
96
+ print(run_textdistance(text_3, text_4))
97
+ print(highlight_diff(text_3, text_4))