update delay report
Browse files- lib/pages.py +1 -1
- lib/report.py +124 -65
- lib/utils.py +13 -1
lib/pages.py
CHANGED
|
@@ -53,7 +53,7 @@ class TranslatorPage:
|
|
| 53 |
print("click button to set translation off")
|
| 54 |
|
| 55 |
|
| 56 |
-
def get_current_node_text(self, duration=0, interval=0.
|
| 57 |
"""在一定时间内持续读取页面最新的 node的内容"""
|
| 58 |
print(f"capture page latest content for duration: {duration}s")
|
| 59 |
translate_items = []
|
|
|
|
| 53 |
print("click button to set translation off")
|
| 54 |
|
| 55 |
|
| 56 |
+
def get_current_node_text(self, duration=0, interval=0.05)-> List[WebItem]:
|
| 57 |
"""在一定时间内持续读取页面最新的 node的内容"""
|
| 58 |
print(f"capture page latest content for duration: {duration}s")
|
| 59 |
translate_items = []
|
lib/report.py
CHANGED
|
@@ -1,27 +1,10 @@
|
|
|
|
|
| 1 |
from typing import List
|
| 2 |
from dataclasses import dataclass, astuple
|
| 3 |
|
| 4 |
from tabulate import tabulate
|
| 5 |
from lib.log_parser import LogTag, LogItem, WebItem
|
| 6 |
-
from lib.utils import save_csv, run_textdistance, highlight_diff
|
| 7 |
-
|
| 8 |
-
@dataclass
|
| 9 |
-
class Row:
|
| 10 |
-
audio_end_tsp:str = ""
|
| 11 |
-
audio_length:str =""
|
| 12 |
-
tsb_end_tsp:str =""
|
| 13 |
-
tsb_opt:str =""
|
| 14 |
-
tsb_cost:str =""
|
| 15 |
-
tsl_ipt:str =""
|
| 16 |
-
tsl_end_tsp:str =""
|
| 17 |
-
tsl_opt:str =""
|
| 18 |
-
tsl_cost:str =""
|
| 19 |
-
web_tsp:str =""
|
| 20 |
-
web_src:str =""
|
| 21 |
-
web_dst:str =""
|
| 22 |
-
def __repr__(self):
|
| 23 |
-
return f"Row(audio_length={self.audio_length}, tsb_opt={self.tsb_opt})"
|
| 24 |
-
|
| 25 |
|
| 26 |
class LogReport:
|
| 27 |
"""用于处理 log文件"""
|
|
@@ -83,6 +66,45 @@ class LogReport:
|
|
| 83 |
rows = [[row[i] for i in header_mapping.values() if i < len(row)] for row in rows]
|
| 84 |
save_csv(csv_path, header, rows)
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
@dataclass
|
| 87 |
class DelayItem:
|
| 88 |
"""存储delay 报告中每一个 case的结果"""
|
|
@@ -93,29 +115,35 @@ class DelayItem:
|
|
| 93 |
log_items: List[LogItem] = None
|
| 94 |
|
| 95 |
def to_rows(self):
|
| 96 |
-
"""将 log和 web
|
| 97 |
返回 row_0包含音频信息和 load 时间
|
| 98 |
rows 是每次推理的详细信息"""
|
| 99 |
print(f"length of log_items: {len(self.log_items)}")
|
| 100 |
web_items_dict = {i.src_text + i.dst_text: i for i in self.web_items}
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
| 105 |
for i in self.log_items:
|
| 106 |
-
if i.tag
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
| 108 |
elif i.tag == LogTag.audio_end:
|
|
|
|
|
|
|
| 109 |
# 每次到 audio_end就是新的一行
|
| 110 |
-
|
| 111 |
-
current_row = Row()
|
| 112 |
current_row.audio_end_tsp = i.timestamp
|
| 113 |
-
current_row.audio_length = i.content
|
| 114 |
elif i.tag == LogTag.transcribe_end:
|
| 115 |
current_row.tsb_end_tsp = i.timestamp
|
| 116 |
current_row.tsb_opt = i.content
|
| 117 |
elif i.tag == LogTag.transcribe_cost:
|
| 118 |
-
current_row.tsb_cost = i.content
|
|
|
|
| 119 |
elif i.tag == LogTag.translate_start:
|
| 120 |
current_row.tsl_ipt = i.content
|
| 121 |
elif i.tag in [LogTag.translate_end, LogTag.translate_large_end]:
|
|
@@ -126,52 +154,79 @@ class DelayItem:
|
|
| 126 |
current_row.web_tsp = web_item.timestamp
|
| 127 |
current_row.web_src = web_item.src_text
|
| 128 |
current_row.web_dst = web_item.dst_text
|
|
|
|
|
|
|
| 129 |
# 删除 dict已匹配过的内容,避免多次匹配
|
| 130 |
web_items_dict.pop(current_row.tsb_opt+current_row.tsl_opt)
|
|
|
|
|
|
|
|
|
|
| 131 |
elif i.tag in [LogTag.translate_cost, LogTag.translate_large_cost]:
|
| 132 |
-
current_row.tsl_cost = i.content
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
-
return row_0, rows # [astuple(i) for i in rows]
|
| 136 |
|
| 137 |
class DelayReport:
|
| 138 |
"""存储delay 报告中所有 case的结果"""
|
| 139 |
start_line = 0
|
| 140 |
items: List[DelayItem] = []
|
| 141 |
-
# summary_items = {
|
| 142 |
-
# "translation_type": "",
|
| 143 |
-
# "audio length": "",
|
| 144 |
-
# "load_model": "",
|
| 145 |
-
# "total_transcribe": "",
|
| 146 |
-
# "average_transcribe": "",
|
| 147 |
-
# "total_translate": "",
|
| 148 |
-
# "average_translate": "",
|
| 149 |
-
# "asr accuracy": "",
|
| 150 |
-
# "llm translation score": "",
|
| 151 |
-
# "delay": "",
|
| 152 |
-
# }
|
| 153 |
def print_summary(self, data):
|
| 154 |
-
|
| 155 |
-
print(tabulate(data, header))
|
| 156 |
|
| 157 |
def to_csv(self, csv_path):
|
| 158 |
-
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
for i in self.items:
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
audios = [float(r.audio_length) for r in rows if r.audio_length]
|
| 167 |
-
transcribes = [float(r.tsb_cost) for r in rows if r.tsb_cost]
|
| 168 |
-
translates = [float(r.tsl_cost) for r in rows if r.tsl_cost]
|
| 169 |
-
if len(row_0) >=7:
|
| 170 |
-
summaries.append([row_0[1], row_0[6]-row_0[4], sum(audios), sum(transcribes), sum(translates)])
|
| 171 |
-
else:
|
| 172 |
-
summaries.append([row_0[1], 0, sum(audios), sum(transcribes), sum(translates)])
|
| 173 |
-
save_csv(csv_path, [], all_rows)
|
| 174 |
self.print_summary(summaries)
|
|
|
|
| 175 |
|
| 176 |
@dataclass
|
| 177 |
class AccuracyItem:
|
|
@@ -188,9 +243,9 @@ class AccuracyItem:
|
|
| 188 |
self.asr_accuracy = run_textdistance(self.audio_text, self.src_text)
|
| 189 |
self.text_compare = highlight_diff(self.audio_text, self.src_text)
|
| 190 |
def to_list(self):
|
| 191 |
-
return [self.
|
| 192 |
-
|
| 193 |
-
self.asr_accuracy, self.text_compare]
|
| 194 |
|
| 195 |
class AccuracyReport:
|
| 196 |
items:List[AccuracyItem] = []
|
|
@@ -201,7 +256,11 @@ class AccuracyReport:
|
|
| 201 |
print(tabulate(rows, header))
|
| 202 |
|
| 203 |
def to_csv(self, csv_path):
|
| 204 |
-
|
| 205 |
self.print_summary()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
|
|
|
|
| 1 |
+
from datetime import datetime, timedelta
|
| 2 |
from typing import List
|
| 3 |
from dataclasses import dataclass, astuple
|
| 4 |
|
| 5 |
from tabulate import tabulate
|
| 6 |
from lib.log_parser import LogTag, LogItem, WebItem
|
| 7 |
+
from lib.utils import save_csv, run_textdistance, highlight_diff, time_to_float
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
class LogReport:
|
| 10 |
"""用于处理 log文件"""
|
|
|
|
| 66 |
rows = [[row[i] for i in header_mapping.values() if i < len(row)] for row in rows]
|
| 67 |
save_csv(csv_path, header, rows)
|
| 68 |
|
| 69 |
+
@dataclass
class DelaySummary:
    """One summary line of the delay report (one line per case).

    The field order is significant: instances are flattened with ``astuple``
    and written under a positional CSV header, so fields must not be
    reordered.
    """

    # Identification of the case.
    audio_name: str = ""
    trans_type: str = ""
    audio_length: str = ""

    # Model loading: start/end timestamps and the elapsed seconds between them.
    load_start: datetime = None
    load_end: datetime = None
    load: float = 0

    # Aggregates over the detail rows ("tsb" = transcribe, "tsl" = translate).
    avg_audio_len: float = 0
    total_tsb: float = 0
    avg_tsb_per_second: float = 0
    total_tsl: float = 0
    avg_tsl_per_second: float = 0

    # Aggregates of the web-side delay measurements.
    total_web: float = 0
    avg_web_per_second: float = 0
    avg_web_freq: float = 0
|
| 85 |
+
|
| 86 |
+
@dataclass
class DelayDetailRow:
    """One detail line of the delay report (one line per audio segment).

    The field order is significant: instances are flattened with ``astuple``
    and written under a positional CSV header, so fields must not be
    reordered.

    Timestamp fields default to ``""`` rather than ``None``; an empty value
    means "not seen yet" and is tested by truthiness when rows are built.
    """

    # Audio segment.
    audio_end_tsp: datetime = ""
    audio_length: float = 0

    # Transcription ("tsb"): end timestamp, output text, cost and cost per
    # second of audio.
    tsb_end_tsp: datetime = ""
    tsb_opt: str = ""
    tsb_cost: float = 0
    tsb_cost_per_second: float = 0

    # Translation ("tsl"): input text, end timestamp, output text, cost and
    # cost per second of audio.
    tsl_ipt: str = ""
    tsl_end_tsp: datetime = ""
    tsl_opt: str = ""
    tsl_cost: float = 0
    tsl_cost_per_second: float = 0

    # Matching web item: timestamp, source/destination text, delay relative to
    # the audio end, delay per second of audio, and gap to the previous row's
    # web timestamp.
    web_tsp: datetime = ""
    web_src: str = ""
    web_dst: str = ""
    web_delay: float = 0
    web_delay_per_second: float = 0
    web_freq: float = 0

    def __repr__(self):
        """Return a short debug form showing only the audio length and transcript."""
        # Use the real class name so the output stays correct after renames.
        return (
            f"{type(self).__name__}"
            f"(audio_length={self.audio_length}, tsb_opt={self.tsb_opt})"
        )
|
| 107 |
+
|
| 108 |
@dataclass
|
| 109 |
class DelayItem:
|
| 110 |
"""存储delay 报告中每一个 case的结果"""
|
|
|
|
| 115 |
log_items: List[LogItem] = None
|
| 116 |
|
| 117 |
def to_rows(self):
|
| 118 |
+
"""将 log和 web的结果合并, 返回 DelaySummary和 DelayDetail的列表
|
| 119 |
返回 row_0包含音频信息和 load 时间
|
| 120 |
rows 是每次推理的详细信息"""
|
| 121 |
print(f"length of log_items: {len(self.log_items)}")
|
| 122 |
web_items_dict = {i.src_text + i.dst_text: i for i in self.web_items}
|
| 123 |
|
| 124 |
+
summary = DelaySummary(audio_name=self.audio,trans_type=self.translation_type,
|
| 125 |
+
audio_length=self.audio_length)
|
| 126 |
+
detail_rows = []
|
| 127 |
+
current_row = DelayDetailRow()
|
| 128 |
for i in self.log_items:
|
| 129 |
+
if i.tag == LogTag.load_start:
|
| 130 |
+
summary.load_start = i.timestamp
|
| 131 |
+
elif i.tag == LogTag.load_end:
|
| 132 |
+
summary.load_end = i.timestamp
|
| 133 |
+
summary.load = (summary.load_end-summary.load_start).total_seconds()
|
| 134 |
elif i.tag == LogTag.audio_end:
|
| 135 |
+
if current_row.audio_length > 0:
|
| 136 |
+
detail_rows.append(current_row)
|
| 137 |
# 每次到 audio_end就是新的一行
|
| 138 |
+
current_row = DelayDetailRow()
|
|
|
|
| 139 |
current_row.audio_end_tsp = i.timestamp
|
| 140 |
+
current_row.audio_length = time_to_float(i.content)
|
| 141 |
elif i.tag == LogTag.transcribe_end:
|
| 142 |
current_row.tsb_end_tsp = i.timestamp
|
| 143 |
current_row.tsb_opt = i.content
|
| 144 |
elif i.tag == LogTag.transcribe_cost:
|
| 145 |
+
current_row.tsb_cost = time_to_float(i.content)
|
| 146 |
+
current_row.tsb_cost_per_second = current_row.tsb_cost/current_row.audio_length
|
| 147 |
elif i.tag == LogTag.translate_start:
|
| 148 |
current_row.tsl_ipt = i.content
|
| 149 |
elif i.tag in [LogTag.translate_end, LogTag.translate_large_end]:
|
|
|
|
| 154 |
current_row.web_tsp = web_item.timestamp
|
| 155 |
current_row.web_src = web_item.src_text
|
| 156 |
current_row.web_dst = web_item.dst_text
|
| 157 |
+
current_row.web_delay = (current_row.web_tsp - current_row.audio_end_tsp).total_seconds()
|
| 158 |
+
current_row.web_delay_per_second = current_row.web_delay / current_row.audio_length
|
| 159 |
# 删除 dict已匹配过的内容,避免多次匹配
|
| 160 |
web_items_dict.pop(current_row.tsb_opt+current_row.tsl_opt)
|
| 161 |
+
if len(detail_rows)>=1 and detail_rows[-1].web_tsp:
|
| 162 |
+
current_row.web_freq = (current_row.web_tsp - detail_rows[-1].web_tsp).total_seconds()
|
| 163 |
+
|
| 164 |
elif i.tag in [LogTag.translate_cost, LogTag.translate_large_cost]:
|
| 165 |
+
current_row.tsl_cost = time_to_float(i.content)
|
| 166 |
+
current_row.tsl_cost_per_second = current_row.tsl_cost/current_row.audio_length
|
| 167 |
+
summary = self.get_summary(summary, detail_rows)
|
| 168 |
+
return summary, detail_rows # [astuple(i) for i in rows]
|
| 169 |
+
def get_summary(self, summary: "DelaySummary", detail_rows):
    """Fill the aggregate fields of ``summary`` from ``detail_rows``.

    Only truthy measurements take part: a value of ``0`` (the row default)
    is treated as "not measured" and left out of sums and averages.

    Args:
        summary: Object whose ``avg_*`` / ``total_*`` attributes are set.
        detail_rows: Iterable of rows exposing ``audio_length``,
            ``tsb_cost``, ``tsb_cost_per_second``, ``tsl_cost``,
            ``tsl_cost_per_second``, ``web_delay``,
            ``web_delay_per_second`` and ``web_freq``.

    Returns:
        The same ``summary`` object, updated in place.
    """
    rows = list(detail_rows)

    def _measured(attr):
        # Keep only the rows that actually carry this measurement.
        return [value for value in (getattr(row, attr) for row in rows) if value]

    def _mean(values):
        # An empty series averages to 0 instead of raising ZeroDivisionError
        # (e.g. web_freq is empty when fewer than two rows matched a web item).
        return sum(values) / len(values) if values else 0

    summary.avg_audio_len = _mean(_measured("audio_length"))
    summary.total_tsb = sum(_measured("tsb_cost"))
    summary.avg_tsb_per_second = _mean(_measured("tsb_cost_per_second"))
    summary.total_tsl = sum(_measured("tsl_cost"))
    summary.avg_tsl_per_second = _mean(_measured("tsl_cost_per_second"))
    summary.total_web = sum(_measured("web_delay"))
    summary.avg_web_per_second = _mean(_measured("web_delay_per_second"))
    summary.avg_web_freq = _mean(_measured("web_freq"))
    return summary
|
| 205 |
|
|
|
|
| 206 |
|
| 207 |
class DelayReport:
    """Stores the results of every case in the delay report."""

    start_line = 0
    # NOTE: class-level list; it is shared by every instance that does not
    # assign its own ``items``.
    items: List[DelayItem] = []

    def print_summary(self, data):
        """Print ``data`` (a list of rows) as a table on stdout."""
        print(tabulate(data))

    def to_csv(self, csv_path):
        """Print the summary table and write summary plus details to a CSV file.

        The file contains the summary header and one summary row per case,
        an empty row, then the detail header followed by each case's detail
        rows, with an empty row after every case.

        Args:
            csv_path: Destination path handed to ``save_csv``.
        """
        summaries = [[
            "audio_name", "translation", "audio_length",
            "load_start", "load_end", "load", "avg_audio_len",
            "total_tsb", "avg_tsb_per_sec", "total_tsl", "avg_tsl_per_sec",
            "total_web", "avg_web_per_sec", "avg_web_freq",
        ]]
        # Column names follow the DelayDetailRow field order; the fourth
        # column is the transcription output (``tsb_opt``).
        details = [[
            "audio_end_tsp", "audio_length",
            "tsb_end_tsp", "tsb_opt", "tsb_cost", "tsb_cost_per_sec",
            "tsl_ipt", "tsl_end_tsp", "tsl_opt", "tsl_cost", "tsl_cost_per_sec",
            "web_tsp", "web_src", "web_dst", "web_delay", "web_delay_per_sec",
            "web_freq",
        ]]
        for item in self.items:
            summary, detail_rows = item.to_rows()
            summaries.append(astuple(summary))
            details.extend(astuple(row) for row in detail_rows)
            # Empty row separates the detail rows of consecutive cases.
            details.append([])
        self.print_summary(summaries)
        save_csv(csv_path, [], summaries + [[]] + details)
|
| 230 |
|
| 231 |
@dataclass
|
| 232 |
class AccuracyItem:
|
|
|
|
| 243 |
self.asr_accuracy = run_textdistance(self.audio_text, self.src_text)
|
| 244 |
self.text_compare = highlight_diff(self.audio_text, self.src_text)
|
| 245 |
def to_list(self):
    """Flatten this item into a single list, suitable as one CSV row.

    Returns:
        ``[audio, translation_type, audio_length, src_text, dst_text,
        asr_accuracy[0], asr_accuracy[1], text_compare]``.
    """
    row = [
        self.audio,
        self.translation_type,
        self.audio_length,
        self.src_text,
        self.dst_text,
    ]
    # The two accuracy components become separate columns.
    row.extend((self.asr_accuracy[0], self.asr_accuracy[1]))
    row.append(self.text_compare)
    return row
|
| 249 |
|
| 250 |
class AccuracyReport:
|
| 251 |
items:List[AccuracyItem] = []
|
|
|
|
| 256 |
print(tabulate(rows, header))
|
| 257 |
|
| 258 |
def to_csv(self, csv_path):
    """Print the accuracy summary and write every item to a CSV file.

    Args:
        csv_path: Destination path handed to ``save_csv``.
    """
    print("accuracy item length: ", len(self.items))
    self.print_summary()
    column_names = [
        "audio_name",
        "translation",
        "audio_length",
        "src text",
        "dst text",
        "distance",
        "normalized distance",
        "text compare",
    ]
    # One CSV row per item, in the order the items were collected.
    body = []
    for entry in self.items:
        body.append(entry.to_list())
    save_csv(csv_path, column_names, body)
|
| 264 |
+
|
| 265 |
|
| 266 |
|
lib/utils.py
CHANGED
|
@@ -22,7 +22,8 @@ def get_time_str(level:Literal["d","s","ms"]="d"):
|
|
| 22 |
def save_csv(file_path, header, rows):
|
| 23 |
with open(file_path, "w", encoding="utf-8") as f:
|
| 24 |
writer = csv.writer(f)
|
| 25 |
-
|
|
|
|
| 26 |
writer.writerows(rows)
|
| 27 |
print(f"write csv to {file_path}")
|
| 28 |
|
|
@@ -63,3 +64,14 @@ def highlight_diff(a, b):
|
|
| 63 |
elif tag == 'replace':
|
| 64 |
output.append(f"[-{a[a_start:a_end]}-]{{+{b[b_start:b_end]}+}}")
|
| 65 |
return ''.join(output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
def save_csv(file_path, header, rows):
    """Write ``rows`` to ``file_path`` as a UTF-8 encoded CSV file.

    Args:
        file_path: Destination path; an existing file is overwritten.
        header: Sequence of column names written as the first row. Skipped
            when empty, so callers can embed their own header rows in ``rows``.
        rows: Iterable of row sequences.
    """
    # newline="" hands line-ending control to the csv module; without it,
    # platforms that translate "\n" would emit "\r\r\n" and show blank lines.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        if header:
            writer.writerow(header)
        writer.writerows(rows)
    print(f"write csv to {file_path}")
|
| 29 |
|
|
|
|
| 64 |
elif tag == 'replace':
|
| 65 |
output.append(f"[-{a[a_start:a_end]}-]{{+{b[b_start:b_end]}+}}")
|
| 66 |
return ''.join(output)
|
| 67 |
+
|
| 68 |
+
def time_to_float(s: str):
    """Convert a duration string such as ``"2s"`` or ``"10000.0 s"`` to a float.

    Every ``"s"`` character and any surrounding whitespace are removed before
    the conversion.

    Args:
        s: Duration text. ``None``, an empty string, or a string with nothing
            left after removing ``"s"`` and whitespace yields ``0.0``.

    Returns:
        The numeric value as a float.

    Raises:
        ValueError: If the remaining text is not a valid float (e.g. ``"5m"``).
    """
    if not s:
        return 0.0
    # Strip after removing the unit so inputs like " s" count as "no value"
    # instead of reaching float(" ") and raising.
    number = s.replace("s", "").strip()
    if not number:
        return 0.0
    return float(number)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
if __name__ == '__main__':
    # Manual smoke check of time_to_float on representative inputs.
    samples = ["1", "1.0", "10000.0 s", "", "2s", "3 s", "4 s", "5m"]
    for sample in samples:
        try:
            print(time_to_float(sample))
        except ValueError as err:
            # Unsupported formats (e.g. "5m") are reported instead of
            # aborting the remaining samples with a traceback.
            print(f"cannot convert {sample!r}: {err}")
|