yujuanqin committed on
Commit
e8abd42
·
1 Parent(s): 90cfc5c

update delay report

Browse files
Files changed (3) hide show
  1. lib/pages.py +1 -1
  2. lib/report.py +124 -65
  3. lib/utils.py +13 -1
lib/pages.py CHANGED
@@ -53,7 +53,7 @@ class TranslatorPage:
53
  print("click button to set translation off")
54
 
55
 
56
- def get_current_node_text(self, duration=0, interval=0.1)-> List[WebItem]:
57
  """在一定时间内持续读取页面最新的 node的内容"""
58
  print(f"capture page latest content for duration: {duration}s")
59
  translate_items = []
 
53
  print("click button to set translation off")
54
 
55
 
56
+ def get_current_node_text(self, duration=0, interval=0.05)-> List[WebItem]:
57
  """在一定时间内持续读取页面最新的 node的内容"""
58
  print(f"capture page latest content for duration: {duration}s")
59
  translate_items = []
lib/report.py CHANGED
@@ -1,27 +1,10 @@
 
1
  from typing import List
2
  from dataclasses import dataclass, astuple
3
 
4
  from tabulate import tabulate
5
  from lib.log_parser import LogTag, LogItem, WebItem
6
- from lib.utils import save_csv, run_textdistance, highlight_diff
7
-
8
- @dataclass
9
- class Row:
10
- audio_end_tsp:str = ""
11
- audio_length:str =""
12
- tsb_end_tsp:str =""
13
- tsb_opt:str =""
14
- tsb_cost:str =""
15
- tsl_ipt:str =""
16
- tsl_end_tsp:str =""
17
- tsl_opt:str =""
18
- tsl_cost:str =""
19
- web_tsp:str =""
20
- web_src:str =""
21
- web_dst:str =""
22
- def __repr__(self):
23
- return f"Row(audio_length={self.audio_length}, tsb_opt={self.tsb_opt})"
24
-
25
 
26
  class LogReport:
27
  """用于处理 log文件"""
@@ -83,6 +66,45 @@ class LogReport:
83
  rows = [[row[i] for i in header_mapping.values() if i < len(row)] for row in rows]
84
  save_csv(csv_path, header, rows)
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  @dataclass
87
  class DelayItem:
88
  """存储delay 报告中每一个 case的结果"""
@@ -93,29 +115,35 @@ class DelayItem:
93
  log_items: List[LogItem] = None
94
 
95
  def to_rows(self):
96
- """将 log和 web的结果合并成 csv行的形式
97
  返回 row_0包含音频信息和 load 时间
98
  rows 是每次推理的详细信息"""
99
  print(f"length of log_items: {len(self.log_items)}")
100
  web_items_dict = {i.src_text + i.dst_text: i for i in self.web_items}
101
 
102
- row_0 = [self.translation_type, self.audio, self.audio_length]
103
- rows = []
104
- current_row = Row()
 
105
  for i in self.log_items:
106
- if i.tag in [LogTag.load_start, LogTag.load_end]:
107
- row_0 += [i.tag.name, i.timestamp]
 
 
 
108
  elif i.tag == LogTag.audio_end:
 
 
109
  # 每次到 audio_end就是新的一行
110
- rows.append(current_row)
111
- current_row = Row()
112
  current_row.audio_end_tsp = i.timestamp
113
- current_row.audio_length = i.content.replace(" s", "")
114
  elif i.tag == LogTag.transcribe_end:
115
  current_row.tsb_end_tsp = i.timestamp
116
  current_row.tsb_opt = i.content
117
  elif i.tag == LogTag.transcribe_cost:
118
- current_row.tsb_cost = i.content.replace(" s", "")
 
119
  elif i.tag == LogTag.translate_start:
120
  current_row.tsl_ipt = i.content
121
  elif i.tag in [LogTag.translate_end, LogTag.translate_large_end]:
@@ -126,52 +154,79 @@ class DelayItem:
126
  current_row.web_tsp = web_item.timestamp
127
  current_row.web_src = web_item.src_text
128
  current_row.web_dst = web_item.dst_text
 
 
129
  # 删除 dict已匹配过的内容,避免多次匹配
130
  web_items_dict.pop(current_row.tsb_opt+current_row.tsl_opt)
 
 
 
131
  elif i.tag in [LogTag.translate_cost, LogTag.translate_large_cost]:
132
- current_row.tsl_cost = i.content.replace(" s", "")
133
- # print("rows value in DelayItem:",rows)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- return row_0, rows # [astuple(i) for i in rows]
136
 
137
  class DelayReport:
138
  """存储delay 报告中所有 case的结果"""
139
  start_line = 0
140
  items: List[DelayItem] = []
141
- # summary_items = {
142
- # "translation_type": "",
143
- # "audio length": "",
144
- # "load_model": "",
145
- # "total_transcribe": "",
146
- # "average_transcribe": "",
147
- # "total_translate": "",
148
- # "average_translate": "",
149
- # "asr accuracy": "",
150
- # "llm translation score": "",
151
- # "delay": "",
152
- # }
153
  def print_summary(self, data):
154
- header = ["audio", "load", "total audio len","total tsb","total tsl"]
155
- print(tabulate(data, header))
156
 
157
  def to_csv(self, csv_path):
158
- all_rows = []
159
- summaries = []
 
 
 
 
 
 
160
  for i in self.items:
161
- row_0, rows = i.to_rows()
162
- all_rows.append(row_0)
163
- all_rows += [astuple(i) for i in rows]
164
- all_rows += [] # 每个 case后加一个空行
165
-
166
- audios = [float(r.audio_length) for r in rows if r.audio_length]
167
- transcribes = [float(r.tsb_cost) for r in rows if r.tsb_cost]
168
- translates = [float(r.tsl_cost) for r in rows if r.tsl_cost]
169
- if len(row_0) >=7:
170
- summaries.append([row_0[1], row_0[6]-row_0[4], sum(audios), sum(transcribes), sum(translates)])
171
- else:
172
- summaries.append([row_0[1], 0, sum(audios), sum(transcribes), sum(translates)])
173
- save_csv(csv_path, [], all_rows)
174
  self.print_summary(summaries)
 
175
 
176
  @dataclass
177
  class AccuracyItem:
@@ -188,9 +243,9 @@ class AccuracyItem:
188
  self.asr_accuracy = run_textdistance(self.audio_text, self.src_text)
189
  self.text_compare = highlight_diff(self.audio_text, self.src_text)
190
  def to_list(self):
191
- return [self.translation_type, self.audio, self.audio_length, self.src_text,
192
- # self.dst_text,
193
- self.asr_accuracy, self.text_compare]
194
 
195
  class AccuracyReport:
196
  items:List[AccuracyItem] = []
@@ -201,7 +256,11 @@ class AccuracyReport:
201
  print(tabulate(rows, header))
202
 
203
  def to_csv(self, csv_path):
204
- save_csv(csv_path, [], [i.to_list() for i in self.items])
205
  self.print_summary()
 
 
 
 
206
 
207
 
 
1
+ from datetime import datetime, timedelta
2
  from typing import List
3
  from dataclasses import dataclass, astuple
4
 
5
  from tabulate import tabulate
6
  from lib.log_parser import LogTag, LogItem, WebItem
7
+ from lib.utils import save_csv, run_textdistance, highlight_diff, time_to_float
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  class LogReport:
10
  """用于处理 log文件"""
 
66
  rows = [[row[i] for i in header_mapping.values() if i < len(row)] for row in rows]
67
  save_csv(csv_path, header, rows)
68
 
69
@dataclass
class DelaySummary:
    """Aggregated result of one delay-report case.

    Field order matters: instances are flattened with ``astuple`` and written
    as one CSV row, so the order here is the column order.
    """

    # Case identification, copied from the owning DelayItem.
    audio_name: str = ""
    trans_type: str = ""
    audio_length: str = ""

    # Model-load window; ``load`` is load_end - load_start in seconds.
    load_start: datetime = None
    load_end: datetime = None
    load: float = 0

    # Mean of the per-row audio lengths.
    avg_audio_len: float = 0

    # Transcription: summed cost and mean cost per second of audio.
    total_tsb: float = 0
    avg_tsb_per_second: float = 0

    # Translation: summed cost and mean cost per second of audio.
    total_tsl: float = 0
    avg_tsl_per_second: float = 0

    # Web display: summed delay, mean delay per second of audio, and mean
    # interval between consecutive web updates.
    total_web: float = 0
    avg_web_per_second: float = 0
    avg_web_freq: float = 0
85
+
86
@dataclass
class DelayDetailRow:
    """One inference round (audio chunk -> transcribe -> translate -> web).

    Field order matters: rows are flattened with ``astuple`` and written to
    CSV, so the order here is the column order.

    Timestamp fields default to ``None`` (not ``""``) so the default agrees
    with the ``datetime`` annotation and with ``DelaySummary``. ``None`` is
    still falsy and the csv writer emits it as an empty cell, so truthiness
    checks and CSV output are unchanged.
    """

    # Audio chunk: end timestamp and length in seconds.
    audio_end_tsp: datetime = None
    audio_length: float = 0

    # Transcription: end timestamp, output text, cost and cost per audio second.
    tsb_end_tsp: datetime = None
    tsb_opt: str = ""
    tsb_cost: float = 0
    tsb_cost_per_second: float = 0

    # Translation: input text, end timestamp, output text, cost and cost per
    # audio second.
    tsl_ipt: str = ""
    tsl_end_tsp: datetime = None
    tsl_opt: str = ""
    tsl_cost: float = 0
    tsl_cost_per_second: float = 0

    # Matching web item: timestamp, source/destination text, delay relative to
    # the audio end, delay per audio second, and interval since the previous
    # row's web timestamp.
    web_tsp: datetime = None
    web_src: str = ""
    web_dst: str = ""
    web_delay: float = 0
    web_delay_per_second: float = 0
    web_freq: float = 0

    def __repr__(self):
        # Short debugging form; uses the real class name instead of the stale
        # "Row" left over from before the class was renamed.
        return (f"{type(self).__name__}(audio_length={self.audio_length}, "
                f"tsb_opt={self.tsb_opt})")
107
+
108
  @dataclass
109
  class DelayItem:
110
  """存储delay 报告中每一个 case的结果"""
 
115
  log_items: List[LogItem] = None
116
 
117
  def to_rows(self):
118
+ """将 log和 web的结果合并, 返回 DelaySummary和 DelayDetail的列表
119
  返回 row_0包含音频信息和 load 时间
120
  rows 是每次推理的详细信息"""
121
  print(f"length of log_items: {len(self.log_items)}")
122
  web_items_dict = {i.src_text + i.dst_text: i for i in self.web_items}
123
 
124
+ summary = DelaySummary(audio_name=self.audio,trans_type=self.translation_type,
125
+ audio_length=self.audio_length)
126
+ detail_rows = []
127
+ current_row = DelayDetailRow()
128
  for i in self.log_items:
129
+ if i.tag == LogTag.load_start:
130
+ summary.load_start = i.timestamp
131
+ elif i.tag == LogTag.load_end:
132
+ summary.load_end = i.timestamp
133
+ summary.load = (summary.load_end-summary.load_start).total_seconds()
134
  elif i.tag == LogTag.audio_end:
135
+ if current_row.audio_length > 0:
136
+ detail_rows.append(current_row)
137
  # 每次到 audio_end就是新的一行
138
+ current_row = DelayDetailRow()
 
139
  current_row.audio_end_tsp = i.timestamp
140
+ current_row.audio_length = time_to_float(i.content)
141
  elif i.tag == LogTag.transcribe_end:
142
  current_row.tsb_end_tsp = i.timestamp
143
  current_row.tsb_opt = i.content
144
  elif i.tag == LogTag.transcribe_cost:
145
+ current_row.tsb_cost = time_to_float(i.content)
146
+ current_row.tsb_cost_per_second = current_row.tsb_cost/current_row.audio_length
147
  elif i.tag == LogTag.translate_start:
148
  current_row.tsl_ipt = i.content
149
  elif i.tag in [LogTag.translate_end, LogTag.translate_large_end]:
 
154
  current_row.web_tsp = web_item.timestamp
155
  current_row.web_src = web_item.src_text
156
  current_row.web_dst = web_item.dst_text
157
+ current_row.web_delay = (current_row.web_tsp - current_row.audio_end_tsp).total_seconds()
158
+ current_row.web_delay_per_second = current_row.web_delay / current_row.audio_length
159
  # 删除 dict已匹配过的内容,避免多次匹配
160
  web_items_dict.pop(current_row.tsb_opt+current_row.tsl_opt)
161
+ if len(detail_rows)>=1 and detail_rows[-1].web_tsp:
162
+ current_row.web_freq = (current_row.web_tsp - detail_rows[-1].web_tsp).total_seconds()
163
+
164
  elif i.tag in [LogTag.translate_cost, LogTag.translate_large_cost]:
165
+ current_row.tsl_cost = time_to_float(i.content)
166
+ current_row.tsl_cost_per_second = current_row.tsl_cost/current_row.audio_length
167
+ summary = self.get_summary(summary, detail_rows)
168
+ return summary, detail_rows # [astuple(i) for i in rows]
169
def get_summary(self, summary: "DelaySummary", detail_rows):
    """Fill the aggregate fields of ``summary`` from ``detail_rows``.

    Args:
        summary: the summary object to update in place.
        detail_rows: per-round rows exposing ``audio_length``, ``tsb_cost``,
            ``tsb_cost_per_second``, ``tsl_cost``, ``tsl_cost_per_second``,
            ``web_delay``, ``web_delay_per_second`` and ``web_freq``.

    Returns:
        The same ``summary`` object.

    Falsy values (the field defaults) are left out of every sum and average.
    An average over no values is 0 instead of raising ZeroDivisionError,
    which previously happened with no rows or with fewer than two
    web-matched rows (empty ``web_freq``).
    """

    def collect(attr):
        # Keep only the rows where this metric was actually recorded.
        return [getattr(row, attr) for row in detail_rows if getattr(row, attr)]

    def mean(values):
        return sum(values) / len(values) if values else 0

    summary.avg_audio_len = mean(collect("audio_length"))
    summary.total_tsb = sum(collect("tsb_cost"))
    summary.avg_tsb_per_second = mean(collect("tsb_cost_per_second"))
    summary.total_tsl = sum(collect("tsl_cost"))
    summary.avg_tsl_per_second = mean(collect("tsl_cost_per_second"))
    summary.total_web = sum(collect("web_delay"))
    summary.avg_web_per_second = mean(collect("web_delay_per_second"))
    summary.avg_web_freq = mean(collect("web_freq"))
    return summary
205
 
 
206
 
207
  class DelayReport:
208
  """存储delay 报告中所有 case的结果"""
209
  start_line = 0
210
  items: List[DelayItem] = []
 
 
 
 
 
 
 
 
 
 
 
 
211
  def print_summary(self, data):
212
+ print(tabulate(data))
 
213
 
214
  def to_csv(self, csv_path):
215
+ summaries = [["audio_name", "translation", "audio_length",
216
+ "load_start", "load_end", "load", "avg_audio_len",
217
+ "total_tsb", "avg_tsb_per_sec", "total_tsl", "avg_tsl_per_sec",
218
+ "total_web", "avg_web_per_sec", "avg_web_freq"]]
219
+ details = [["audio_end_tsp", "audio_length",
220
+ "tsb_end_tsp", "tsp_opt", "tsb_cost", "tsb_cost_per_sec",
221
+ "tsl_ipt", "tsl_end_tsp", "tsl_opt", "tsl_cost", "tsl_cost_per_sec",
222
+ "web_tsp", "web_src", "web_dst", "web_delay", "web_delay_per_sec", "web_freq"]]
223
  for i in self.items:
224
+ summary, detail_rows = i.to_rows()
225
+ summaries.append(astuple(summary))
226
+ details += [astuple(i) for i in detail_rows]
227
+ details.append([])
 
 
 
 
 
 
 
 
 
228
  self.print_summary(summaries)
229
+ save_csv(csv_path, [], summaries+[[]]+details)
230
 
231
  @dataclass
232
  class AccuracyItem:
 
243
  self.asr_accuracy = run_textdistance(self.audio_text, self.src_text)
244
  self.text_compare = highlight_diff(self.audio_text, self.src_text)
245
  def to_list(self):
246
+ return [self.audio, self.translation_type, self.audio_length, self.src_text,
247
+ self.dst_text,
248
+ self.asr_accuracy[0], self.asr_accuracy[1], self.text_compare]
249
 
250
  class AccuracyReport:
251
  items:List[AccuracyItem] = []
 
256
  print(tabulate(rows, header))
257
 
258
  def to_csv(self, csv_path):
259
+ print("accuracy item length: ", len(self.items))
260
  self.print_summary()
261
+ header = ["audio_name", "translation", "audio_length", "src text", "dst text",
262
+ "distance", "normalized distance", "text compare"]
263
+ save_csv(csv_path, header, [i.to_list() for i in self.items])
264
+
265
 
266
 
lib/utils.py CHANGED
@@ -22,7 +22,8 @@ def get_time_str(level:Literal["d","s","ms"]="d"):
22
  def save_csv(file_path, header, rows):
23
  with open(file_path, "w", encoding="utf-8") as f:
24
  writer = csv.writer(f)
25
- writer.writerow(header)
 
26
  writer.writerows(rows)
27
  print(f"write csv to {file_path}")
28
 
@@ -63,3 +64,14 @@ def highlight_diff(a, b):
63
  elif tag == 'replace':
64
  output.append(f"[-{a[a_start:a_end]}-]{{+{b[b_start:b_end]}+}}")
65
  return ''.join(output)
 
 
 
 
 
 
 
 
 
 
 
 
22
  def save_csv(file_path, header, rows):
23
  with open(file_path, "w", encoding="utf-8") as f:
24
  writer = csv.writer(f)
25
+ if header:
26
+ writer.writerow(header)
27
  writer.writerows(rows)
28
  print(f"write csv to {file_path}")
29
 
 
64
  elif tag == 'replace':
65
  output.append(f"[-{a[a_start:a_end]}-]{{+{b[b_start:b_end]}+}}")
66
  return ''.join(output)
67
+
68
def time_to_float(s: str):
    """Convert a duration string such as ``"3 s"``, ``"2s"`` or ``"1.0"`` to float.

    Every ``"s"`` character is removed and surrounding whitespace is ignored.
    Empty, ``None`` or unit-only input (e.g. ``" s"``) yields ``0.0``.

    Raises:
        ValueError: if what remains is not a number (e.g. ``"5m"``).
    """
    if not s:
        return 0.0
    # Strip after removing the unit so " s" becomes "" rather than " ",
    # which float() would reject.
    number = s.replace("s", "").strip()
    return float(number) if number else 0.0
72
+
73
+
74
if __name__ == '__main__':
    # Manual smoke check of time_to_float on representative inputs.
    samples = ["1", "1.0", "10000.0 s", "", "2s", "3 s", "4 s", "5m"]
    for sample in samples:
        try:
            print(time_to_float(sample))
        except ValueError as err:
            # "5m" is not a seconds value; report it instead of aborting
            # the remaining samples with a traceback.
            print(f"cannot parse {sample!r}: {err}")