unknown commited on
Commit
59afc96
·
1 Parent(s): 04000ce

Update UI

Browse files
README.md CHANGED
@@ -10,3 +10,36 @@ pinned: false
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+
14
+
15
+ ## ASR LLM Agent Upgrade
16
+
17
+ This version adds an LLM-based diagnosis layer on top of alignment/event statistics:
18
+
19
+ - `analysis/llm_analyzer.py`: sends representative ASR error cases + aggregate stats to an LLM
20
+ - `pipeline/run_analysis.py`: optionally runs LLM diagnosis when `OPENAI_API_KEY` is set
21
+ - `scripts/run_diagnostic.py`: regenerate `llm_diagnosis.json` and `diagnostic_report.md`
22
+ - `report.md`: now includes LLM semantic findings and priority actions
23
+
24
+ ### What the LLM adds
25
+
26
+ Compared with rule-only classification, the LLM layer can:
27
+
28
+ - separate surface-form differences from true semantic distortions
29
+ - identify meaning-preserving paraphrases vs business-critical errors
30
+ - infer likely causes from representative cases
31
+ - propose prioritized, actionable improvement suggestions
32
+
33
+ ### Quick start
34
+
35
+ ```bash
36
+ export OPENAI_API_KEY=your_key
37
+ python pipeline/run_all.py --manifest data/manifest.jsonl --model_name openai/whisper-small --llm_model gpt-4.1-mini
38
+ ```
39
+
40
+ Or rerun diagnosis only for an existing run:
41
+
42
+ ```bash
43
+ export OPENAI_API_KEY=your_key
44
+ python scripts/run_diagnostic.py --run_id <run_id> --model gpt-4.1-mini
45
+ ```
analysis/llm_analyzer.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ import pandas as pd
8
+ from openai import OpenAI
9
+
10
+
11
# Example of the per-utterance JSON object the LLM must emit; sent verbatim
# inside the prompt as the output schema for semantic judgements.
SEMANTIC_SCHEMA_EXAMPLE = dict(
    utt_id="string",
    semantic_judgement="语义基本等价|轻微偏差|明显偏差|严重失真",
    severity="high|medium|low",
    semantic_error_types=["string"],
    business_impact="high|medium|low",
    reason="string",
    improvement_suggestions=["string"],
    confidence=0.0,
)
21
+
22
+
23
+ def _safe_float(v: Any) -> Optional[float]:
24
+ try:
25
+ if v is None:
26
+ return None
27
+ return float(v)
28
+ except Exception:
29
+ return None
30
+
31
+
32
def build_case_pack(df_align: pd.DataFrame, df_events: pd.DataFrame, max_cases: int = 24) -> List[Dict[str, Any]]:
    """Select up to ``max_cases`` of the worst utterances (highest CER, then WER)
    and bundle each with its per-utterance event summary for the LLM prompt.

    Returns an empty list when ``df_align`` is ``None`` or empty.
    """
    if df_align is None or len(df_align) == 0:
        return []

    def _to_float(v: Any) -> Optional[float]:
        # Lenient coercion: None stays None, unparseable values become None.
        if v is None:
            return None
        try:
            return float(v)
        except Exception:
            return None

    ranked = df_align.copy()
    # Guarantee both metric columns so the descending sort below always applies.
    for metric in ("cer", "wer"):
        if metric not in ranked.columns:
            ranked[metric] = None
    ranked = ranked.sort_values(["cer", "wer"], ascending=False, na_position="last")

    def _summarize_events(utt_id: str) -> Dict[str, Any]:
        # Histogram of error classes / edit ops plus up to 10 concrete examples
        # for one utterance; empty structure when no events are available.
        if df_events is None or len(df_events) == 0 or "utt_id" not in df_events.columns:
            return {"error_classes": {}, "ops": {}, "examples": []}
        subset = df_events[df_events["utt_id"] == utt_id].copy()
        if len(subset) == 0:
            return {"error_classes": {}, "ops": {}, "examples": []}
        samples = [
            {
                "level": ev.get("level"),
                "op_type": ev.get("op_type"),
                "ref": ev.get("ref"),
                "hyp": ev.get("hyp"),
                "error_class": ev.get("error_class"),
            }
            for _, ev in subset.head(10).iterrows()
        ]
        classes: Dict[str, int] = {}
        if "error_class" in subset.columns:
            classes = {str(k): int(v) for k, v in subset["error_class"].value_counts().head(10).items()}
        ops: Dict[str, int] = {}
        if "op_type" in subset.columns:
            ops = {str(k): int(v) for k, v in subset["op_type"].value_counts().items()}
        return {"error_classes": classes, "ops": ops, "examples": samples}

    picked: List[Dict[str, Any]] = []
    used_ids = set()
    for _, rec in ranked.head(max_cases).iterrows():
        utt_id = str(rec.get("utt_id"))
        if utt_id in used_ids:
            continue
        used_ids.add(utt_id)
        # Only carry slice metadata that actually exists and is non-null.
        meta = {
            field: rec.get(field)
            for field in ("device", "domain", "accent", "speaker")
            if field in rec.index and pd.notna(rec.get(field))
        }
        picked.append({
            "utt_id": utt_id,
            "ref_text": rec.get("ref_text"),
            "hyp_text": rec.get("hyp_text"),
            "norm_ref": rec.get("norm_ref"),
            "norm_hyp": rec.get("norm_hyp"),
            "wer": _to_float(rec.get("wer")),
            "cer": _to_float(rec.get("cer")),
            "meta": meta,
            "event_summary": _summarize_events(utt_id),
        })
        if len(picked) >= max_cases:
            break

    return picked
91
+
92
+
93
def build_global_stats(df_align: pd.DataFrame, df_events: pd.DataFrame, summary: Dict[str, Any]) -> Dict[str, Any]:
    """Aggregate corpus-level statistics (counts, op/error histograms, worst
    slices by mean CER) into a JSON-serializable dict for the LLM prompt."""
    stats: Dict[str, Any] = {
        "summary": summary,
        "num_utterances": 0 if df_align is None else int(len(df_align)),
        "num_events": 0 if df_events is None else int(len(df_events)),
    }

    if df_events is not None and len(df_events) > 0:
        if "op_type" in df_events.columns:
            stats["op_counts"] = {str(k): int(v) for k, v in df_events["op_type"].value_counts().items()}
        if "error_class" in df_events.columns:
            # Cap at the 20 most frequent classes to keep the prompt compact.
            stats["error_class_counts"] = {
                str(k): int(v) for k, v in df_events["error_class"].value_counts().head(20).items()
            }

    by_slice: Dict[str, Any] = {}
    if df_align is not None and len(df_align) > 0 and "cer" in df_align.columns:
        for key in ["device", "domain", "accent", "speaker"]:
            if key in df_align.columns and df_align[key].notna().any():
                # Worst 10 slice values by mean CER, highest first.
                worst = df_align.groupby(key)["cer"].mean().dropna().sort_values(ascending=False).head(10)
                if len(worst) > 0:
                    by_slice[key] = [{"key": str(k), "cer": float(v)} for k, v in worst.items()]
    if by_slice:
        stats["slice_stats"] = by_slice
    return stats
115
+
116
+
117
# System prompt for the diagnosis LLM. Kept in Chinese because the downstream
# report is written in Chinese. Fix: the original ended the "不确定" sentence
# with mojibake (U+FFFD replacement characters) instead of a full stop.
SYSTEM_PROMPT = """你是资深 ASR 诊断专家,同时具备语音识别、语言学和业务语义分析能力。
你的任务不是只做 S/I/D 统计,而是识别:
1. 结构性错误(替换、删除、插入、数字、英文、专名等)
2. 语义层错误(是否改变原意、是否造成业务理解偏差、是否只是表面字词不同但语义基本等价)
3. 可能成因(口音、同音混淆、领域词缺失、分段/VAD、噪声、数字口语化、语言模型偏置等)
4. 可执行的改进建议

必须严格基于输入证据,不要编造音频层信息。若证据不足,明确写“不确定”。
请输出严格 JSON。"""
126
+
127
+
128
+ def _extract_json(text: str) -> Dict[str, Any]:
129
+ text = text.strip()
130
+ try:
131
+ return json.loads(text)
132
+ except Exception:
133
+ pass
134
+ start = text.find("{")
135
+ end = text.rfind("}")
136
+ if start >= 0 and end > start:
137
+ return json.loads(text[start:end + 1])
138
+ raise ValueError("LLM output is not valid JSON")
139
+
140
+
141
def analyze_with_llm(
    df_align: pd.DataFrame,
    df_events: pd.DataFrame,
    summary: Dict[str, Any],
    model: str = "gpt-4.1-mini",
    client: Optional[OpenAI] = None,
    max_cases: int = 24,
) -> Dict[str, Any]:
    """Run a single global LLM diagnosis over the worst cases + corpus stats.

    Builds a structured JSON prompt (schema, requirements, global stats,
    representative cases), requests a JSON-object response, and returns the
    parsed diagnosis annotated with the model name, case count, and stats.
    """
    if client is None:
        # Falls back to OPENAI_API_KEY from the environment.
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    cases = build_case_pack(df_align, df_events, max_cases=max_cases)
    stats = build_global_stats(df_align, df_events, summary)

    # Schema examples shown to the model so it mirrors these shapes.
    pattern_schema = {
        "title": "string",
        "priority": 1,
        "phenomenon": "string",
        "evidence": ["string"],
        "semantic_impact": "high|medium|low",
        "likely_causes": ["string"],
        "recommendations": ["string"],
        "confidence": 0.0,
    }
    case_schema = {
        "utt_id": "string",
        "semantic_judgement": "语义基本等价|轻微偏差|明显偏差|严重失真",
        "severity": "high|medium|low",
        "reason": "string",
        "semantic_error_types": ["string"],
        "suggestions": ["string"],
    }
    payload = {
        "task": "请对 ASR 结果做结构+语义联合诊断,并给出改进建议。",
        "instructions": {
            "output_schema": {
                "executive_summary": "string",
                "major_patterns": [pattern_schema],
                "case_findings": [case_schema],
                "priority_actions": ["string"],
                "uncertainties": ["string"],
            },
            "requirements": [
                "不要复述所有 case,只保留最有代表性的 findings。",
                "要区分字符差异和真正改变语义的错误。",
                "如果 ref/hyp 只是同义改写或口语差异,应该指出语义影响较低。",
                "建议必须可执行,优先包含数据、解码、后处理、术语表、prompt/context、评测集扩展等方向。",
            ],
        },
        "global_stats": stats,
        "cases": cases,
    }

    response = client.chat.completions.create(
        model=model,
        temperature=0.2,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
        ],
        response_format={"type": "json_object"},
    )
    content = response.choices[0].message.content or "{}"
    diagnosis = _extract_json(content)
    # Attach provenance so downstream reports can cite what was analyzed.
    diagnosis["model"] = model
    diagnosis["num_cases_sent"] = len(cases)
    diagnosis["global_stats"] = stats
    return diagnosis
210
+
211
+
212
+ def _build_semantic_rows(df_align: pd.DataFrame, df_events: pd.DataFrame) -> List[Dict[str, Any]]:
213
+ rows: List[Dict[str, Any]] = []
214
+ event_map: Dict[str, List[Dict[str, Any]]] = {}
215
+ if df_events is not None and len(df_events) > 0 and "utt_id" in df_events.columns:
216
+ for utt_id, sub in df_events.groupby("utt_id"):
217
+ examples = []
218
+ for _, row in sub.head(8).iterrows():
219
+ examples.append({
220
+ "op_type": row.get("op_type"),
221
+ "ref": row.get("ref"),
222
+ "hyp": row.get("hyp"),
223
+ "error_class": row.get("error_class"),
224
+ "level": row.get("level"),
225
+ })
226
+ event_map[str(utt_id)] = examples
227
+
228
+ for _, row in df_align.iterrows():
229
+ utt_id = str(row.get("utt_id"))
230
+ meta = {k: row.get(k) for k in ["device", "domain", "accent", "speaker"] if k in row.index and pd.notna(row.get(k))}
231
+ rows.append({
232
+ "utt_id": utt_id,
233
+ "ref_text": row.get("ref_text"),
234
+ "hyp_text": row.get("hyp_text"),
235
+ "norm_ref": row.get("norm_ref"),
236
+ "norm_hyp": row.get("norm_hyp"),
237
+ "wer": _safe_float(row.get("wer")),
238
+ "cer": _safe_float(row.get("cer")),
239
+ "meta": meta,
240
+ "events": event_map.get(utt_id, []),
241
+ })
242
+ return rows
243
+
244
+
245
+ def _normalize_semantic_item(item: Dict[str, Any], fallback: Dict[str, Any]) -> Dict[str, Any]:
246
+ semantic_types = item.get("semantic_error_types") or []
247
+ suggestions = item.get("improvement_suggestions") or item.get("suggestions") or []
248
+ return {
249
+ "utt_id": str(item.get("utt_id") or fallback.get("utt_id")),
250
+ "semantic_judgement": str(item.get("semantic_judgement") or "不确定"),
251
+ "severity": str(item.get("severity") or "low"),
252
+ "semantic_error_types": semantic_types if isinstance(semantic_types, list) else [str(semantic_types)],
253
+ "business_impact": str(item.get("business_impact") or "low"),
254
+ "reason": str(item.get("reason") or ""),
255
+ "improvement_suggestions": suggestions if isinstance(suggestions, list) else [str(suggestions)],
256
+ "confidence": _safe_float(item.get("confidence")) if item.get("confidence") is not None else None,
257
+ "ref_text": fallback.get("ref_text"),
258
+ "hyp_text": fallback.get("hyp_text"),
259
+ "wer": fallback.get("wer"),
260
+ "cer": fallback.get("cer"),
261
+ "device": (fallback.get("meta") or {}).get("device"),
262
+ "domain": (fallback.get("meta") or {}).get("domain"),
263
+ "accent": (fallback.get("meta") or {}).get("accent"),
264
+ "speaker": (fallback.get("meta") or {}).get("speaker"),
265
+ }
266
+
267
+
268
def analyze_semantic_per_utterance(
    df_align: pd.DataFrame,
    df_events: pd.DataFrame,
    model: str = "gpt-4.1-mini",
    client: Optional[OpenAI] = None,
    batch_size: int = 12,
) -> pd.DataFrame:
    """Ask the LLM for a per-utterance semantic judgement, in batches.

    Every utterance gets exactly one normalized row; utterances the model
    fails to return are filled with a "不确定" placeholder. Returns an empty
    DataFrame when there is nothing to judge.
    """
    if client is None:
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    pending = _build_semantic_rows(df_align, df_events)
    if not pending:
        return pd.DataFrame()

    normalized: List[Dict[str, Any]] = []
    for offset in range(0, len(pending), batch_size):
        chunk = pending[offset:offset + batch_size]
        request = {
            "task": "逐条 utterance 做语义级错误判断。",
            "requirements": [
                "逐条判断 ref_text 与 hyp_text 的语义偏差程度。",
                "不要因为表面字不同就判严重错误;如果基本不改变含义,应标注为语义基本等价或轻微偏差。",
                "结合 events 判断数字、时间、专名、否定、实体、动作关系等关键语义是否出错。",
                "输出必须覆盖 batch 中每个 utt_id,且仅输出 JSON 对象。",
            ],
            "output_schema": {"items": [SEMANTIC_SCHEMA_EXAMPLE]},
            "items": chunk,
        }
        completion = client.chat.completions.create(
            model=model,
            temperature=0,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": json.dumps(request, ensure_ascii=False)},
            ],
            response_format={"type": "json_object"},
        )
        text = completion.choices[0].message.content or "{}"
        data = _extract_json(text)
        returned = {
            str(entry.get("utt_id")): entry
            for entry in (data.get("items") or [])
            if isinstance(entry, dict)
        }
        for source in chunk:
            # Guarantee one output row per input row even if the model skipped it.
            found = returned.get(
                str(source.get("utt_id")),
                {"utt_id": source.get("utt_id"), "semantic_judgement": "不确定", "reason": "LLM 未返回该条结果。"},
            )
            normalized.append(_normalize_semantic_item(found, source))

    frame = pd.DataFrame(normalized)
    # Flatten list columns into pipe-joined strings for tabular export.
    if len(frame) > 0 and "semantic_error_types" in frame.columns:
        frame["semantic_error_types_str"] = frame["semantic_error_types"].apply(
            lambda xs: " | ".join(xs) if isinstance(xs, list) else str(xs)
        )
    if len(frame) > 0 and "improvement_suggestions" in frame.columns:
        frame["improvement_suggestions_str"] = frame["improvement_suggestions"].apply(
            lambda xs: " | ".join(xs) if isinstance(xs, list) else str(xs)
        )
    frame["llm_model"] = model
    return frame
pipeline/__init__.py ADDED
File without changes
pipeline/run_all.py CHANGED
@@ -10,6 +10,8 @@ def main():
10
  ap.add_argument("--model_name", default="small")
11
  ap.add_argument("--device", default="cpu")
12
  ap.add_argument("--compute_type", default="int8")
 
 
13
  args = ap.parse_args()
14
 
15
  run_id = run_asr(
@@ -18,7 +20,7 @@ def main():
18
  device=args.device,
19
  compute_type=args.compute_type,
20
  )
21
- run_analysis(run_id)
22
  print(f"Done. Run: runs/{run_id}")
23
 
24
 
 
10
  ap.add_argument("--model_name", default="small")
11
  ap.add_argument("--device", default="cpu")
12
  ap.add_argument("--compute_type", default="int8")
13
+ ap.add_argument("--llm_model", default="gpt-4.1-mini")
14
+ ap.add_argument("--disable_llm", action="store_true")
15
  args = ap.parse_args()
16
 
17
  run_id = run_asr(
 
20
  device=args.device,
21
  compute_type=args.compute_type,
22
  )
23
+ run_analysis(run_id, llm_enabled=not args.disable_llm, llm_model=args.llm_model)
24
  print(f"Done. Run: runs/{run_id}")
25
 
26
 
pipeline/run_analysis.py CHANGED
@@ -1,29 +1,42 @@
1
  from __future__ import annotations
2
  import json
 
3
  from pathlib import Path
4
- from typing import List, Dict
5
 
6
  import pandas as pd
 
7
 
8
  from core.io import read_jsonl, write_jsonl
9
  from analysis.align import align_one
10
  from analysis.events import extract_events
11
  from analysis.aggregate import aggregate_summary
 
12
  from report.generate import write_report
 
13
 
14
 
15
- def run_analysis(run_id: str, out_root: str = "runs") -> None:
 
 
 
 
 
 
16
  run_dir = Path(out_root) / run_id
17
  run_meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
18
 
19
  asr_path = run_dir / "asr_outputs.jsonl"
20
  aligned_path = run_dir / "aligned.jsonl"
21
  events_path = run_dir / "events.parquet"
 
 
22
 
23
  aligned_records: List[Dict] = []
24
  events_records: List[Dict] = []
25
 
26
- for r in read_jsonl(asr_path):
 
27
  utt_id = r["utt_id"]
28
  ref = r.get("ref_text")
29
  hyp = r.get("hyp_text", "")
@@ -44,17 +57,46 @@ def run_analysis(run_id: str, out_root: str = "runs") -> None:
44
  df_events = pd.DataFrame(events_records)
45
  df_events.to_parquet(events_path, index=False)
46
  else:
47
- df_events = pd.DataFrame(columns=["op_type", "error_class"])
48
-
49
- # merge meta slice fields into align if they exist
50
- # df_align currently doesn't have meta; add a few common keys if present
51
- # We'll reconstruct from asr_outputs for slicing.
52
- # (Simple approach) reload and join on utt_id:
53
- meta_map = {}
54
- for r in read_jsonl(asr_path):
55
- meta_map[r["utt_id"]] = r.get("meta", {}) or {}
56
  for key in ["device", "domain", "accent", "speaker"]:
57
  df_align[key] = df_align["utt_id"].map(lambda u: meta_map.get(u, {}).get(key))
58
 
59
  summary = aggregate_summary(df_events, df_align)
60
- write_report(run_dir, run_meta, summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
  import json
3
+ import os
4
  from pathlib import Path
5
+ from typing import List, Dict, Optional
6
 
7
  import pandas as pd
8
+ from openai import OpenAI
9
 
10
  from core.io import read_jsonl, write_jsonl
11
  from analysis.align import align_one
12
  from analysis.events import extract_events
13
  from analysis.aggregate import aggregate_summary
14
+ from analysis.llm_analyzer import analyze_with_llm, analyze_semantic_per_utterance
15
  from report.generate import write_report
16
+ from report.diagnostic_report import generate_report_with_openai
17
 
18
 
19
+ def run_analysis(
20
+ run_id: str,
21
+ out_root: str = "runs",
22
+ llm_enabled: bool = True,
23
+ llm_model: str = "gpt-4.1-mini",
24
+ write_diagnostic_report: bool = True,
25
+ ) -> None:
26
  run_dir = Path(out_root) / run_id
27
  run_meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
28
 
29
  asr_path = run_dir / "asr_outputs.jsonl"
30
  aligned_path = run_dir / "aligned.jsonl"
31
  events_path = run_dir / "events.parquet"
32
+ semantic_path = run_dir / "semantic_findings.parquet"
33
+ semantic_jsonl_path = run_dir / "semantic_findings.jsonl"
34
 
35
  aligned_records: List[Dict] = []
36
  events_records: List[Dict] = []
37
 
38
+ asr_rows = list(read_jsonl(asr_path))
39
+ for r in asr_rows:
40
  utt_id = r["utt_id"]
41
  ref = r.get("ref_text")
42
  hyp = r.get("hyp_text", "")
 
57
  df_events = pd.DataFrame(events_records)
58
  df_events.to_parquet(events_path, index=False)
59
  else:
60
+ df_events = pd.DataFrame(columns=["utt_id", "op_type", "error_class", "ref", "hyp", "level"])
61
+
62
+ meta_map = {r["utt_id"]: r.get("meta", {}) or {} for r in asr_rows}
 
 
 
 
 
 
63
  for key in ["device", "domain", "accent", "speaker"]:
64
  df_align[key] = df_align["utt_id"].map(lambda u: meta_map.get(u, {}).get(key))
65
 
66
  summary = aggregate_summary(df_events, df_align)
67
+
68
+ llm_diagnosis: Optional[Dict] = None
69
+ semantic_df = pd.DataFrame()
70
+ if llm_enabled and os.getenv("OPENAI_API_KEY") and len(df_align) > 0:
71
+ client = OpenAI()
72
+ llm_diagnosis = analyze_with_llm(
73
+ df_align=df_align,
74
+ df_events=df_events,
75
+ summary=summary,
76
+ model=llm_model,
77
+ client=client,
78
+ )
79
+ semantic_df = analyze_semantic_per_utterance(
80
+ df_align=df_align,
81
+ df_events=df_events,
82
+ model=llm_model,
83
+ client=client,
84
+ )
85
+ if len(semantic_df) > 0:
86
+ semantic_df.to_parquet(semantic_path, index=False)
87
+ write_jsonl(semantic_jsonl_path, semantic_df.to_dict(orient="records"))
88
+ if write_diagnostic_report:
89
+ report = generate_report_with_openai(llm_diagnosis, summary, client, model=llm_model)
90
+ (run_dir / "diagnostic_report.md").write_text(report, encoding="utf-8")
91
+ elif write_diagnostic_report and not (run_dir / "diagnostic_report.md").exists():
92
+ (run_dir / "diagnostic_report.md").write_text(
93
+ "LLM diagnostic report was skipped because OPENAI_API_KEY is not set.\n"
94
+ "You can still inspect summary.json and report.md, or rerun with an API key.",
95
+ encoding="utf-8",
96
+ )
97
+
98
+ write_report(run_dir, run_meta, summary, llm_diagnosis=llm_diagnosis)
99
+
100
+
101
+ if __name__ == "__main__":
102
+ raise SystemExit("Use pipeline/run_all.py or import run_analysis()")
report/diagnostic_report.py CHANGED
@@ -6,39 +6,40 @@ from typing import Dict, Any
6
 
7
  SYSTEM_PROMPT = """You are an ASR diagnostics expert.
8
  Write a concise but evidence-based ASR error analysis report in Chinese.
9
- Do not invent evidence. Only use the provided structured statistics.
10
  Focus on:
11
  1. major error patterns
12
- 2. likely root causes
13
- 3. confidence and uncertainty
14
- 4. actionable next steps
 
15
  """
16
 
17
 
18
- def build_prompt(root_cause: Dict[str, Any], summary: Dict[str, Any]) -> str:
19
  return f"""
20
  请基于下面的结构化分析结果,生成一份中文 ASR 错误诊断报告。
21
 
22
  要求:
23
  - 先写总体结论
24
  - 再写主要错误原因(按优先级排序)
25
- - 每个原因要包含:现象、证据、可能原因、改进建议
26
  - 最后给出一个优先级排序的行动清单
27
  - 如果证据不足,要明确说“不确定”
28
 
29
  【summary.json】
30
  {json.dumps(summary, ensure_ascii=False, indent=2)}
31
 
32
- root_cause.json】
33
- {json.dumps(root_cause, ensure_ascii=False, indent=2)}
34
  """
35
 
36
 
37
- def generate_report_with_openai(root_cause: Dict[str, Any], summary: Dict[str, Any], client) -> str:
38
- prompt = build_prompt(root_cause, summary)
39
 
40
  resp = client.chat.completions.create(
41
- model="gpt-4.1-mini",
42
  messages=[
43
  {"role": "system", "content": SYSTEM_PROMPT},
44
  {"role": "user", "content": prompt},
 
6
 
7
# System prompt steering the report-writer model: evidence-based analysis,
# report written in Chinese, built from metrics plus the LLM semantic diagnosis.
SYSTEM_PROMPT = """You are an ASR diagnostics expert.
Write a concise but evidence-based ASR error analysis report in Chinese.
Do not invent evidence. Use both structured metrics and LLM semantic diagnosis.
Focus on:
1. major error patterns
2. semantic impact of errors
3. likely root causes
4. confidence and uncertainty
5. actionable next steps
"""
17
 
18
 
19
def build_prompt(llm_diagnosis: Dict[str, Any], summary: Dict[str, Any]) -> str:
    """Assemble the Chinese report-generation prompt from the aggregate
    summary and the LLM diagnosis.

    Fix: restore the opening 【 before ``llm_diagnosis.json】`` so the section
    header matches the ``【summary.json】`` header above it.
    """
    return f"""
请基于下面的结构化分析结果,生成一份中文 ASR 错误诊断报告。

要求:
- 先写总体结论
- 再写主要错误原因(按优先级排序)
- 每个原因要包含:现象、证据、语义影响、可能原因、改进建议
- 最后给出一个优先级排序的行动清单
- 如果证据不足,要明确说“不确定”

【summary.json】
{json.dumps(summary, ensure_ascii=False, indent=2)}

【llm_diagnosis.json】
{json.dumps(llm_diagnosis, ensure_ascii=False, indent=2)}
"""
36
 
37
 
38
+ def generate_report_with_openai(llm_diagnosis: Dict[str, Any], summary: Dict[str, Any], client, model: str = "gpt-4.1-mini") -> str:
39
+ prompt = build_prompt(llm_diagnosis, summary)
40
 
41
  resp = client.chat.completions.create(
42
+ model=model,
43
  messages=[
44
  {"role": "system", "content": SYSTEM_PROMPT},
45
  {"role": "user", "content": prompt},
report/generate.py CHANGED
@@ -1,13 +1,15 @@
1
  from __future__ import annotations
2
  import json
3
  from pathlib import Path
4
- from typing import Dict
5
  from .templates import render_markdown
6
 
7
 
8
- def write_report(run_dir: Path, run_meta: Dict, summary: Dict) -> None:
9
  run_dir.mkdir(parents=True, exist_ok=True)
10
 
11
  (run_dir / "summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
12
- md = render_markdown(run_meta, summary)
 
 
13
  (run_dir / "report.md").write_text(md, encoding="utf-8")
 
1
  from __future__ import annotations
2
  import json
3
  from pathlib import Path
4
+ from typing import Dict, Optional
5
  from .templates import render_markdown
6
 
7
 
8
def write_report(run_dir: Path, run_meta: Dict, summary: Dict, llm_diagnosis: Optional[Dict] = None) -> None:
    """Persist run artifacts under *run_dir*: ``summary.json``, optionally
    ``llm_diagnosis.json`` (only when a diagnosis was produced), and the
    rendered ``report.md``."""
    run_dir.mkdir(parents=True, exist_ok=True)

    summary_text = json.dumps(summary, ensure_ascii=False, indent=2)
    (run_dir / "summary.json").write_text(summary_text, encoding="utf-8")

    if llm_diagnosis is not None:
        diagnosis_text = json.dumps(llm_diagnosis, ensure_ascii=False, indent=2)
        (run_dir / "llm_diagnosis.json").write_text(diagnosis_text, encoding="utf-8")

    markdown = render_markdown(run_meta, summary, llm_diagnosis=llm_diagnosis)
    (run_dir / "report.md").write_text(markdown, encoding="utf-8")
report/templates.py CHANGED
@@ -1,10 +1,10 @@
1
  from __future__ import annotations
2
- from typing import Dict
3
 
4
 
5
- def render_markdown(run_meta: Dict, summary: Dict) -> str:
6
  lines = []
7
- lines.append(f"# ASR Error Analysis Report\n")
8
  lines.append(f"**Run ID:** {run_meta.get('run_id')}")
9
  lines.append(f"**Model:** {run_meta.get('model_info')}\n")
10
 
@@ -25,7 +25,6 @@ def render_markdown(run_meta: Dict, summary: Dict) -> str:
25
  for k, v in (summary.get("top_confusions", {}) or {}).items():
26
  lines.append(f"- {k}: {v}")
27
 
28
- # Slice
29
  for key in ["device", "domain", "accent", "speaker"]:
30
  k2 = f"worst_{key}_by_cer"
31
  if k2 in summary:
@@ -33,9 +32,48 @@ def render_markdown(run_meta: Dict, summary: Dict) -> str:
33
  for item in summary[k2]:
34
  lines.append(f"- {item['key']}: {item['cer']:.4f}")
35
 
36
- lines.append("\n## Recommendations (auto-generated starter)")
37
- lines.append("- 优先检查 CER/WER 在特定子集(device/domain/accent)是否显著升高,针对性补数据或做增强。")
38
- lines.append("- 如果 top_confusions 集中在数字/时间类,可加入数字规范化与专门的后处理规则。")
39
- lines.append("- 如果 mixed_language 占比高,考虑加入英文热词/专名词表或做 LM/解码侧增强。")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  return "\n".join(lines)
 
1
  from __future__ import annotations
2
+ from typing import Dict, Optional
3
 
4
 
5
+ def render_markdown(run_meta: Dict, summary: Dict, llm_diagnosis: Optional[Dict] = None) -> str:
6
  lines = []
7
+ lines.append("# ASR Error Analysis Report\n")
8
  lines.append(f"**Run ID:** {run_meta.get('run_id')}")
9
  lines.append(f"**Model:** {run_meta.get('model_info')}\n")
10
 
 
25
  for k, v in (summary.get("top_confusions", {}) or {}).items():
26
  lines.append(f"- {k}: {v}")
27
 
 
28
  for key in ["device", "domain", "accent", "speaker"]:
29
  k2 = f"worst_{key}_by_cer"
30
  if k2 in summary:
 
32
  for item in summary[k2]:
33
  lines.append(f"- {item['key']}: {item['cer']:.4f}")
34
 
35
+ if llm_diagnosis:
36
+ lines.append("\n## LLM Executive Summary")
37
+ if llm_diagnosis.get("executive_summary"):
38
+ lines.append(llm_diagnosis["executive_summary"])
39
+
40
+ patterns = llm_diagnosis.get("major_patterns") or []
41
+ if patterns:
42
+ lines.append("\n## LLM Major Patterns")
43
+ for idx, item in enumerate(patterns, 1):
44
+ lines.append(f"### {idx}. {item.get('title', 'Untitled Pattern')}")
45
+ if item.get("phenomenon"):
46
+ lines.append(f"- 现象: {item['phenomenon']}")
47
+ if item.get("semantic_impact"):
48
+ lines.append(f"- 语义影响: {item['semantic_impact']}")
49
+ if item.get("confidence") is not None:
50
+ lines.append(f"- 置信度: {item['confidence']}")
51
+ for e in item.get("evidence", []) or []:
52
+ lines.append(f"- 证据: {e}")
53
+ for c in item.get("likely_causes", []) or []:
54
+ lines.append(f"- 可能原因: {c}")
55
+ for r in item.get("recommendations", []) or []:
56
+ lines.append(f"- 建议: {r}")
57
+
58
+ findings = llm_diagnosis.get("case_findings") or []
59
+ if findings:
60
+ lines.append("\n## Representative Semantic Findings")
61
+ for item in findings[:10]:
62
+ lines.append(f"- {item.get('utt_id')}: {item.get('semantic_judgement')};原因:{item.get('reason')}")
63
+
64
+ if llm_diagnosis.get("priority_actions"):
65
+ lines.append("\n## Priority Actions")
66
+ for x in llm_diagnosis["priority_actions"]:
67
+ lines.append(f"- {x}")
68
+
69
+ if llm_diagnosis.get("uncertainties"):
70
+ lines.append("\n## Uncertainties")
71
+ for x in llm_diagnosis["uncertainties"]:
72
+ lines.append(f"- {x}")
73
+ else:
74
+ lines.append("\n## Recommendations (auto-generated starter)")
75
+ lines.append("- 优先检查 CER/WER 在特定子集(device/domain/accent)是否显著升高,针对性补数据或做增强。")
76
+ lines.append("- 如果 top_confusions 集中在数字/时间类,可加入数字规范化与专门的后处理规则。")
77
+ lines.append("- 如果 mixed_language 占比高,考虑加入英文热词/专名词表或做 LM/解码侧增强。")
78
 
79
  return "\n".join(lines)
scripts/run_diagnostic.py CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
6
  import pandas as pd
7
  from openai import OpenAI
8
 
9
- from analysis.root_cause import infer_root_causes
10
  from report.diagnostic_report import generate_report_with_openai
11
 
12
 
@@ -20,21 +20,21 @@ def load_jsonl(path: Path):
20
  return rows
21
 
22
 
23
- def main(run_id: str, runs_dir: str = "runs"):
24
  run_dir = Path(runs_dir) / run_id
25
 
26
  df_align = pd.DataFrame(load_jsonl(run_dir / "aligned.jsonl"))
27
  df_events = pd.read_parquet(run_dir / "events.parquet") if (run_dir / "events.parquet").exists() else pd.DataFrame()
28
  summary = json.loads((run_dir / "summary.json").read_text(encoding="utf-8")) if (run_dir / "summary.json").exists() else {}
29
 
30
- root_cause = infer_root_causes(df_events, df_align)
31
- (run_dir / "root_cause.json").write_text(
32
- json.dumps(root_cause, ensure_ascii=False, indent=2),
 
33
  encoding="utf-8"
34
  )
35
 
36
- client = OpenAI()
37
- report = generate_report_with_openai(root_cause, summary, client)
38
  (run_dir / "diagnostic_report.md").write_text(report, encoding="utf-8")
39
 
40
  print(f"Diagnostic report written to: {run_dir / 'diagnostic_report.md'}")
@@ -45,5 +45,6 @@ if __name__ == "__main__":
45
  ap = argparse.ArgumentParser()
46
  ap.add_argument("--run_id", required=True)
47
  ap.add_argument("--runs_dir", default="runs")
 
48
  args = ap.parse_args()
49
- main(args.run_id, args.runs_dir)
 
6
  import pandas as pd
7
  from openai import OpenAI
8
 
9
+ from analysis.llm_analyzer import analyze_with_llm
10
  from report.diagnostic_report import generate_report_with_openai
11
 
12
 
 
20
  return rows
21
 
22
 
23
def main(run_id: str, runs_dir: str = "runs", model: str = "gpt-4.1-mini"):
    """Regenerate llm_diagnosis.json and diagnostic_report.md for an existing run."""
    run_dir = Path(runs_dir) / run_id

    df_align = pd.DataFrame(load_jsonl(run_dir / "aligned.jsonl"))
    events_file = run_dir / "events.parquet"
    df_events = pd.read_parquet(events_file) if events_file.exists() else pd.DataFrame()
    summary_file = run_dir / "summary.json"
    summary = json.loads(summary_file.read_text(encoding="utf-8")) if summary_file.exists() else {}

    client = OpenAI()
    # Global structural + semantic diagnosis; persisted for later inspection.
    llm_diagnosis = analyze_with_llm(df_align, df_events, summary, model=model, client=client)
    (run_dir / "llm_diagnosis.json").write_text(
        json.dumps(llm_diagnosis, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )

    report = generate_report_with_openai(llm_diagnosis, summary, client, model=model)
    (run_dir / "diagnostic_report.md").write_text(report, encoding="utf-8")

    print(f"Diagnostic report written to: {run_dir / 'diagnostic_report.md'}")
 
45
  ap = argparse.ArgumentParser()
46
  ap.add_argument("--run_id", required=True)
47
  ap.add_argument("--runs_dir", default="runs")
48
+ ap.add_argument("--model", default="gpt-4.1-mini")
49
  args = ap.parse_args()
50
+ main(args.run_id, args.runs_dir, args.model)
ui/app.py CHANGED
@@ -5,11 +5,15 @@ import subprocess
5
  import sys
6
  from pathlib import Path
7
 
8
- import pandas as pd
9
  import gradio as gr
 
10
 
11
  RUNS_DIR = Path("runs")
12
 
 
 
 
 
13
 
14
  def list_runs():
15
  if not RUNS_DIR.exists():
@@ -20,107 +24,213 @@ def list_runs():
20
  )
21
 
22
 
23
- def load_run(run_id: str):
24
- run_dir = RUNS_DIR / run_id
25
 
26
- meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
27
 
28
- summary = (
29
- json.loads((run_dir / "summary.json").read_text(encoding="utf-8"))
30
- if (run_dir / "summary.json").exists()
31
- else {}
32
- )
 
 
 
 
 
33
 
34
- aligned_path = run_dir / "aligned.jsonl"
35
- if aligned_path.exists():
36
- rows = []
37
- with aligned_path.open("r", encoding="utf-8") as f:
38
- for line in f:
39
- line = line.strip()
40
- if line:
41
- rows.append(json.loads(line))
42
- df_align = pd.DataFrame(rows)
43
- else:
44
- df_align = pd.DataFrame()
45
 
46
- events_path = run_dir / "events.parquet"
47
- df_events = pd.read_parquet(events_path) if events_path.exists() else pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
48
 
49
- diagnostic_path = run_dir / "diagnostic_report.md"
50
- diagnostic_text = (
51
- diagnostic_path.read_text(encoding="utf-8")
52
- if diagnostic_path.exists()
53
- else "No diagnostic report yet."
54
- )
55
 
56
- return meta, summary, df_align, df_events, diagnostic_text
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
- def build_summary_md(meta, summary):
60
  lines = []
61
  lines.append(f"### Run ID: `{meta.get('run_id')}`")
62
  lines.append(f"- Model: `{meta.get('model_info')}`")
63
-
64
  if "wer_mean" in summary and summary["wer_mean"] is not None:
65
  lines.append(f"- WER(mean): **{summary['wer_mean']:.4f}**")
66
-
67
  if "cer_mean" in summary and summary["cer_mean"] is not None:
68
  lines.append(f"- CER(mean): **{summary['cer_mean']:.4f}**")
69
-
70
  lines.append(f"- S/I/D: `{summary.get('sid_counts', {})}`")
71
-
72
  if "top_error_classes" in summary:
73
  lines.append(f"- Top error classes: `{summary.get('top_error_classes', {})}`")
 
 
 
 
 
 
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  return "\n".join(lines)
76
 
77
 
78
- def on_select_run(run_id):
79
- if not run_id:
80
- return "", pd.DataFrame(), pd.DataFrame(), "No diagnostic report yet."
 
 
81
 
82
- meta, summary, df_align, df_events, diagnostic_text = load_run(run_id)
83
- md = build_summary_md(meta, summary)
84
 
85
- align_view = (
86
- df_align[["utt_id", "wer", "cer"]].head(50)
87
- if len(df_align) and all(c in df_align.columns for c in ["utt_id", "wer", "cer"])
88
- else pd.DataFrame()
89
- )
90
 
91
- if len(df_events) and all(
92
- c in df_events.columns for c in ["utt_id", "op_type", "ref", "hyp", "error_class", "level"]
93
- ):
94
- events_view = df_events[["utt_id", "op_type", "ref", "hyp", "error_class", "level"]].head(100)
95
- else:
96
- events_view = pd.DataFrame()
97
 
98
- return md, align_view, events_view, diagnostic_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
 
101
  def search_events(run_id, error_class, contains):
102
  if not run_id:
103
  return pd.DataFrame()
104
-
105
- _, _, _, df_events, _ = load_run(run_id)
106
  if df_events is None or len(df_events) == 0:
107
  return pd.DataFrame()
108
-
109
  q = df_events.copy()
110
-
111
- if error_class and error_class != "ALL":
112
  q = q[q["error_class"] == error_class]
113
-
114
  if contains:
115
  contains = str(contains)
116
  q = q[
117
  q["ref"].astype(str).str.contains(contains, na=False)
118
  | q["hyp"].astype(str).str.contains(contains, na=False)
119
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- cols = ["utt_id", "op_type", "ref", "hyp", "error_class", "level"]
 
 
 
 
 
 
 
 
122
  cols = [c for c in cols if c in q.columns]
123
- return q[cols].head(200)
124
 
125
 
126
  def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples):
@@ -134,72 +244,49 @@ def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, lan
134
  "--language", language.strip(),
135
  "--num", str(int(num_samples)),
136
  ]
137
-
138
  if dataset_config and dataset_config.strip():
139
  cmd += ["--dataset_config", dataset_config.strip()]
140
 
141
  p = subprocess.run(cmd, capture_output=True, text=True)
142
  out = (p.stdout or "") + ("\n" + (p.stderr or "") if p.stderr else "")
143
-
144
  if p.returncode != 0:
145
- out += (
146
- "\n\n[HINT] If you see 401/403 for Common Voice: "
147
- "set HF_TOKEN in Space Settings Secrets, and accept dataset terms on HF."
148
- )
149
- return (
150
- out,
151
- gr.update(),
152
- "",
153
- pd.DataFrame(),
154
- pd.DataFrame(),
155
- "No diagnostic report yet.",
156
- )
157
 
158
  runs = list_runs()
159
  latest = runs[0] if runs else None
160
-
161
  if latest:
162
- md, align_view, events_view, diagnostic_text = on_select_run(latest)
163
  else:
164
- md, align_view, events_view, diagnostic_text = "", pd.DataFrame(), pd.DataFrame(), "No diagnostic report yet."
165
 
166
- return (
167
- out,
168
- gr.update(choices=runs, value=latest),
169
- md,
170
- align_view,
171
- events_view,
172
- diagnostic_text,
173
- )
174
 
175
 
176
  with gr.Blocks() as demo:
177
- gr.Markdown("# ASR Error Analysis (Stage 1)")
178
 
179
  with gr.Accordion("Run from Hugging Face", open=True):
180
  gr.Markdown(
181
  "Fill in a dataset and a Whisper model, then click **Run**. "
182
- "If the dataset is gated, set `HF_TOKEN` in Space **Settings → Secrets**."
 
183
  )
184
-
185
  with gr.Row():
186
  dataset_id = gr.Textbox(label="HF dataset repo id", value="fsicoli/common_voice_22_0")
187
  dataset_config = gr.Textbox(label="Dataset config (optional)", value="zh-CN")
188
-
189
  with gr.Row():
190
  split = gr.Textbox(label="Split", value="validation")
191
  text_field = gr.Textbox(label="Transcript field", value="sentence")
192
  num_samples = gr.Number(label="Num samples", value=50, precision=0)
193
-
194
  with gr.Row():
195
  model_repo_id = gr.Textbox(label="HF model repo id", value="openai/whisper-small")
196
  language = gr.Textbox(label="Language", value="zh")
197
-
198
  run_btn = gr.Button("Run")
199
  logs = gr.Textbox(label="Logs", lines=16)
200
 
201
  gr.Markdown("## Browse Existing Runs")
202
-
203
  runs = list_runs()
204
  run_dd = gr.Dropdown(choices=runs, label="Select run", value=(runs[0] if runs else None))
205
  summary_md = gr.Markdown()
@@ -210,15 +297,7 @@ with gr.Blocks() as demo:
210
 
211
  with gr.Accordion("Search Error Events", open=False):
212
  error_cls = gr.Dropdown(
213
- choices=[
214
- "ALL",
215
- "number_or_time",
216
- "mixed_language",
217
- "substitution",
218
- "deletion",
219
- "insertion",
220
- "other",
221
- ],
222
  value="ALL",
223
  label="error_class",
224
  )
@@ -226,30 +305,53 @@ with gr.Blocks() as demo:
226
  search_btn = gr.Button("Search")
227
  result_tbl = gr.Dataframe(label="Search results", interactive=False)
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  with gr.Accordion("Diagnostic Report", open=True):
230
  diagnostic_md = gr.Markdown("No diagnostic report yet.")
231
 
232
  if runs:
233
- md0, a0, e0, d0 = on_select_run(runs[0])
234
  summary_md.value = md0
235
  align_tbl.value = a0
236
  events_tbl.value = e0
 
 
237
  diagnostic_md.value = d0
 
 
238
 
239
  run_dd.change(
240
  on_select_run,
241
  inputs=[run_dd],
242
- outputs=[summary_md, align_tbl, events_tbl, diagnostic_md],
243
  )
244
 
245
- search_btn.click(
246
- search_events,
247
- inputs=[run_dd, error_cls, contains],
248
- outputs=[result_tbl],
 
 
249
  )
250
 
251
  run_btn.click(
252
  run_hf_job,
253
  inputs=[dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples],
254
- outputs=[logs, run_dd, summary_md, align_tbl, events_tbl, diagnostic_md],
255
  )
 
5
  import sys
6
  from pathlib import Path
7
 
 
8
  import gradio as gr
9
+ import pandas as pd
10
 
11
  RUNS_DIR = Path("runs")
12
 
13
+ SEMANTIC_JUDGEMENTS = ["ALL", "语义基本等价", "轻微偏差", "明显偏差", "严重失真", "不确定"]
14
+ SEVERITIES = ["ALL", "high", "medium", "low"]
15
+ BUSINESS_IMPACTS = ["ALL", "high", "medium", "low"]
16
+
17
 
18
  def list_runs():
19
  if not RUNS_DIR.exists():
 
24
  )
25
 
26
 
27
+ def _read_json(path: Path, default):
28
+ return json.loads(path.read_text(encoding="utf-8")) if path.exists() else default
29
 
 
30
 
31
+ def _read_jsonl(path: Path):
32
+ rows = []
33
+ if not path.exists():
34
+ return rows
35
+ with path.open("r", encoding="utf-8") as f:
36
+ for line in f:
37
+ line = line.strip()
38
+ if line:
39
+ rows.append(json.loads(line))
40
+ return rows
41
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
+ def _normalize_semantic_df(df: pd.DataFrame) -> pd.DataFrame:
44
+ if df is None or len(df) == 0:
45
+ return pd.DataFrame()
46
+ out = df.copy()
47
+ for col in ["semantic_error_types", "improvement_suggestions"]:
48
+ if col in out.columns:
49
+ out[col] = out[col].apply(lambda xs: xs if isinstance(xs, list) else ([] if pd.isna(xs) else [str(xs)]))
50
+ if "semantic_error_types" in out.columns and "semantic_error_types_str" not in out.columns:
51
+ out["semantic_error_types_str"] = out["semantic_error_types"].apply(lambda xs: " | ".join(xs))
52
+ if "improvement_suggestions" in out.columns and "improvement_suggestions_str" not in out.columns:
53
+ out["improvement_suggestions_str"] = out["improvement_suggestions"].apply(lambda xs: " | ".join(xs))
54
+ return out
55
 
 
 
 
 
 
 
56
 
57
+ def load_run(run_id: str):
58
+ run_dir = RUNS_DIR / run_id
59
+ meta = _read_json(run_dir / "run_meta.json", {})
60
+ summary = _read_json(run_dir / "summary.json", {})
61
+ df_align = pd.DataFrame(_read_jsonl(run_dir / "aligned.jsonl"))
62
+ df_events = pd.read_parquet(run_dir / "events.parquet") if (run_dir / "events.parquet").exists() else pd.DataFrame()
63
+ if (run_dir / "semantic_findings.parquet").exists():
64
+ df_semantic = _normalize_semantic_df(pd.read_parquet(run_dir / "semantic_findings.parquet"))
65
+ else:
66
+ df_semantic = pd.DataFrame()
67
+ llm_diagnosis = _read_json(run_dir / "llm_diagnosis.json", {})
68
+ diagnostic_text = (run_dir / "diagnostic_report.md").read_text(encoding="utf-8") if (run_dir / "diagnostic_report.md").exists() else "No diagnostic report yet."
69
+ return meta, summary, df_align, df_events, df_semantic, llm_diagnosis, diagnostic_text
70
 
71
 
72
+ def build_summary_md(meta, summary, df_semantic: pd.DataFrame | None = None):
73
  lines = []
74
  lines.append(f"### Run ID: `{meta.get('run_id')}`")
75
  lines.append(f"- Model: `{meta.get('model_info')}`")
 
76
  if "wer_mean" in summary and summary["wer_mean"] is not None:
77
  lines.append(f"- WER(mean): **{summary['wer_mean']:.4f}**")
 
78
  if "cer_mean" in summary and summary["cer_mean"] is not None:
79
  lines.append(f"- CER(mean): **{summary['cer_mean']:.4f}**")
 
80
  lines.append(f"- S/I/D: `{summary.get('sid_counts', {})}`")
 
81
  if "top_error_classes" in summary:
82
  lines.append(f"- Top error classes: `{summary.get('top_error_classes', {})}`")
83
+ if df_semantic is not None and len(df_semantic) > 0:
84
+ sem_counts = df_semantic["semantic_judgement"].fillna("不确定").value_counts().to_dict() if "semantic_judgement" in df_semantic.columns else {}
85
+ high_impact = int((df_semantic.get("business_impact", pd.Series(dtype=str)) == "high").sum()) if "business_impact" in df_semantic.columns else 0
86
+ lines.append(f"- Semantic judgements: `{sem_counts}`")
87
+ lines.append(f"- High business impact utterances: **{high_impact}**")
88
+ return "\n".join(lines)
89
+
90
 
91
+ def build_semantic_overview_md(df_semantic: pd.DataFrame, llm_diagnosis: dict):
92
+ if df_semantic is None or len(df_semantic) == 0:
93
+ return "### Semantic Overview\n暂无 per-utterance LLM 语义诊断结果。请先用配置了 `OPENAI_API_KEY` 的流程跑分析。"
94
+ lines = ["### Semantic Overview"]
95
+ if "semantic_judgement" in df_semantic.columns:
96
+ counts = df_semantic["semantic_judgement"].fillna("不确定").value_counts().to_dict()
97
+ lines.append(f"- 语义判断分布: `{counts}`")
98
+ if "business_impact" in df_semantic.columns:
99
+ impact = df_semantic["business_impact"].fillna("low").value_counts().to_dict()
100
+ lines.append(f"- 业务影响分布: `{impact}`")
101
+ if "semantic_error_types" in df_semantic.columns:
102
+ flat = []
103
+ for xs in df_semantic["semantic_error_types"].dropna().tolist():
104
+ flat.extend(xs if isinstance(xs, list) else [str(xs)])
105
+ if flat:
106
+ top_types = pd.Series(flat).value_counts().head(8).to_dict()
107
+ lines.append(f"- 高频语义错误类型: `{top_types}`")
108
+ if llm_diagnosis.get("priority_actions"):
109
+ lines.append("- 优先行动:")
110
+ for action in llm_diagnosis.get("priority_actions", [])[:5]:
111
+ lines.append(f" - {action}")
112
  return "\n".join(lines)
113
 
114
 
115
+ def _head_align(df_align: pd.DataFrame) -> pd.DataFrame:
116
+ if len(df_align) == 0:
117
+ return pd.DataFrame()
118
+ cols = [c for c in ["utt_id", "wer", "cer", "ref_text", "hyp_text"] if c in df_align.columns]
119
+ return df_align[cols].head(50)
120
 
 
 
121
 
122
+ def _head_events(df_events: pd.DataFrame) -> pd.DataFrame:
123
+ if len(df_events) == 0:
124
+ return pd.DataFrame()
125
+ cols = [c for c in ["utt_id", "op_type", "ref", "hyp", "error_class", "level"] if c in df_events.columns]
126
+ return df_events[cols].head(100)
127
 
 
 
 
 
 
 
128
 
129
+ def _head_semantic(df_semantic: pd.DataFrame) -> pd.DataFrame:
130
+ if len(df_semantic) == 0:
131
+ return pd.DataFrame()
132
+ cols = [
133
+ "utt_id", "semantic_judgement", "severity", "business_impact", "wer", "cer",
134
+ "semantic_error_types_str", "reason", "ref_text", "hyp_text",
135
+ ]
136
+ cols = [c for c in cols if c in df_semantic.columns]
137
+ return df_semantic.sort_values([c for c in ["business_impact", "severity", "cer"] if c in df_semantic.columns], ascending=[True, True, False][:len([c for c in ["business_impact", "severity", "cer"] if c in df_semantic.columns])]).head(100)[cols]
138
+
139
+
140
+ def on_select_run(run_id):
141
+ if not run_id:
142
+ empty = pd.DataFrame()
143
+ return "", empty, empty, empty, "", "No diagnostic report yet.", gr.update(choices=[]), gr.update(choices=[])
144
+
145
+ meta, summary, df_align, df_events, df_semantic, llm_diagnosis, diagnostic_text = load_run(run_id)
146
+ md = build_summary_md(meta, summary, df_semantic)
147
+ semantic_md = build_semantic_overview_md(df_semantic, llm_diagnosis)
148
+
149
+ type_choices = ["ALL"]
150
+ if len(df_semantic) > 0 and "semantic_error_types" in df_semantic.columns:
151
+ types = set()
152
+ for xs in df_semantic["semantic_error_types"].dropna().tolist():
153
+ if isinstance(xs, list):
154
+ types.update(str(x) for x in xs if x)
155
+ elif xs:
156
+ types.add(str(xs))
157
+ type_choices.extend(sorted(types))
158
+
159
+ domain_choices = ["ALL"]
160
+ if len(df_semantic) > 0 and "domain" in df_semantic.columns:
161
+ domain_choices.extend(sorted(str(x) for x in df_semantic["domain"].dropna().unique()))
162
+
163
+ return (
164
+ md,
165
+ _head_align(df_align),
166
+ _head_events(df_events),
167
+ _head_semantic(df_semantic),
168
+ semantic_md,
169
+ diagnostic_text,
170
+ gr.update(choices=type_choices, value="ALL"),
171
+ gr.update(choices=domain_choices, value="ALL"),
172
+ )
173
 
174
 
175
  def search_events(run_id, error_class, contains):
176
  if not run_id:
177
  return pd.DataFrame()
178
+ _, _, _, df_events, _, _, _ = load_run(run_id)
 
179
  if df_events is None or len(df_events) == 0:
180
  return pd.DataFrame()
 
181
  q = df_events.copy()
182
+ if error_class and error_class != "ALL" and "error_class" in q.columns:
 
183
  q = q[q["error_class"] == error_class]
 
184
  if contains:
185
  contains = str(contains)
186
  q = q[
187
  q["ref"].astype(str).str.contains(contains, na=False)
188
  | q["hyp"].astype(str).str.contains(contains, na=False)
189
  ]
190
+ cols = [c for c in ["utt_id", "op_type", "ref", "hyp", "error_class", "level"] if c in q.columns]
191
+ return q[cols].head(200)
192
+
193
+
194
+ def search_semantic(run_id, judgement, severity, business_impact, semantic_type, domain, contains, min_cer):
195
+ if not run_id:
196
+ return pd.DataFrame()
197
+ _, _, _, _, df_semantic, _, _ = load_run(run_id)
198
+ if df_semantic is None or len(df_semantic) == 0:
199
+ return pd.DataFrame()
200
+
201
+ q = df_semantic.copy()
202
+ if judgement and judgement != "ALL" and "semantic_judgement" in q.columns:
203
+ q = q[q["semantic_judgement"] == judgement]
204
+ if severity and severity != "ALL" and "severity" in q.columns:
205
+ q = q[q["severity"] == severity]
206
+ if business_impact and business_impact != "ALL" and "business_impact" in q.columns:
207
+ q = q[q["business_impact"] == business_impact]
208
+ if semantic_type and semantic_type != "ALL" and "semantic_error_types_str" in q.columns:
209
+ q = q[q["semantic_error_types_str"].astype(str).str.contains(str(semantic_type), na=False)]
210
+ if domain and domain != "ALL" and "domain" in q.columns:
211
+ q = q[q["domain"].astype(str) == str(domain)]
212
+ if contains:
213
+ contains = str(contains)
214
+ q = q[
215
+ q["ref_text"].astype(str).str.contains(contains, na=False)
216
+ | q["hyp_text"].astype(str).str.contains(contains, na=False)
217
+ | q.get("reason", pd.Series(dtype=str)).astype(str).str.contains(contains, na=False)
218
+ | q.get("semantic_error_types_str", pd.Series(dtype=str)).astype(str).str.contains(contains, na=False)
219
+ ]
220
+ if min_cer is not None and "cer" in q.columns:
221
+ q = q[q["cer"].fillna(0) >= float(min_cer)]
222
 
223
+ order_cols = [c for c in ["business_impact", "severity", "cer"] if c in q.columns]
224
+ if order_cols:
225
+ q = q.sort_values(order_cols, ascending=[True, True, False][:len(order_cols)])
226
+
227
+ cols = [
228
+ "utt_id", "semantic_judgement", "severity", "business_impact", "wer", "cer",
229
+ "semantic_error_types_str", "reason", "improvement_suggestions_str", "domain", "accent",
230
+ "ref_text", "hyp_text",
231
+ ]
232
  cols = [c for c in cols if c in q.columns]
233
+ return q[cols].head(300)
234
 
235
 
236
  def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples):
 
244
  "--language", language.strip(),
245
  "--num", str(int(num_samples)),
246
  ]
 
247
  if dataset_config and dataset_config.strip():
248
  cmd += ["--dataset_config", dataset_config.strip()]
249
 
250
  p = subprocess.run(cmd, capture_output=True, text=True)
251
  out = (p.stdout or "") + ("\n" + (p.stderr or "") if p.stderr else "")
 
252
  if p.returncode != 0:
253
+ out += "\n\n[HINT] If you see 401/403 for Common Voice: set HF_TOKEN in Space Settings → Secrets, and accept dataset terms on HF."
254
+ empty = pd.DataFrame()
255
+ return out, gr.update(), "", empty, empty, empty, "", "No diagnostic report yet.", gr.update(), gr.update()
 
 
 
 
 
 
 
 
 
256
 
257
  runs = list_runs()
258
  latest = runs[0] if runs else None
 
259
  if latest:
260
+ md, align_view, events_view, semantic_view, semantic_md, diagnostic_text, type_dd, domain_dd = on_select_run(latest)
261
  else:
262
+ md, align_view, events_view, semantic_view, semantic_md, diagnostic_text, type_dd, domain_dd = "", pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), "", "No diagnostic report yet.", gr.update(), gr.update()
263
 
264
+ return out, gr.update(choices=runs, value=latest), md, align_view, events_view, semantic_view, semantic_md, diagnostic_text, type_dd, domain_dd
 
 
 
 
 
 
 
265
 
266
 
267
  with gr.Blocks() as demo:
268
+ gr.Markdown("# ASR LLM Agent UI")
269
 
270
  with gr.Accordion("Run from Hugging Face", open=True):
271
  gr.Markdown(
272
  "Fill in a dataset and a Whisper model, then click **Run**. "
273
+ "If the dataset is gated, set `HF_TOKEN` in Space **Settings → Secrets**. "
274
+ "For LLM semantic diagnostics, make sure `OPENAI_API_KEY` is available."
275
  )
 
276
  with gr.Row():
277
  dataset_id = gr.Textbox(label="HF dataset repo id", value="fsicoli/common_voice_22_0")
278
  dataset_config = gr.Textbox(label="Dataset config (optional)", value="zh-CN")
 
279
  with gr.Row():
280
  split = gr.Textbox(label="Split", value="validation")
281
  text_field = gr.Textbox(label="Transcript field", value="sentence")
282
  num_samples = gr.Number(label="Num samples", value=50, precision=0)
 
283
  with gr.Row():
284
  model_repo_id = gr.Textbox(label="HF model repo id", value="openai/whisper-small")
285
  language = gr.Textbox(label="Language", value="zh")
 
286
  run_btn = gr.Button("Run")
287
  logs = gr.Textbox(label="Logs", lines=16)
288
 
289
  gr.Markdown("## Browse Existing Runs")
 
290
  runs = list_runs()
291
  run_dd = gr.Dropdown(choices=runs, label="Select run", value=(runs[0] if runs else None))
292
  summary_md = gr.Markdown()
 
297
 
298
  with gr.Accordion("Search Error Events", open=False):
299
  error_cls = gr.Dropdown(
300
+ choices=["ALL", "number_or_time", "mixed_language", "substitution", "deletion", "insertion", "other"],
 
 
 
 
 
 
 
 
301
  value="ALL",
302
  label="error_class",
303
  )
 
305
  search_btn = gr.Button("Search")
306
  result_tbl = gr.Dataframe(label="Search results", interactive=False)
307
 
308
+ gr.Markdown("## Per-Utterance Semantic Diagnostics")
309
+ semantic_overview_md = gr.Markdown("暂无语义诊断结果。")
310
+ semantic_tbl = gr.Dataframe(label="Semantic findings (head)", interactive=False)
311
+
312
+ with gr.Accordion("Filter Semantic Errors", open=True):
313
+ with gr.Row():
314
+ semantic_judgement = gr.Dropdown(choices=SEMANTIC_JUDGEMENTS, value="ALL", label="semantic_judgement")
315
+ semantic_severity = gr.Dropdown(choices=SEVERITIES, value="ALL", label="severity")
316
+ semantic_business_impact = gr.Dropdown(choices=BUSINESS_IMPACTS, value="ALL", label="business_impact")
317
+ with gr.Row():
318
+ semantic_type = gr.Dropdown(choices=["ALL"], value="ALL", label="semantic_error_type")
319
+ semantic_domain = gr.Dropdown(choices=["ALL"], value="ALL", label="domain")
320
+ semantic_min_cer = gr.Number(label="min CER", value=0.0)
321
+ semantic_contains = gr.Textbox(label="contains (ref/hyp/reason/type)")
322
+ semantic_search_btn = gr.Button("Filter semantic findings")
323
+ semantic_result_tbl = gr.Dataframe(label="Filtered semantic findings", interactive=False)
324
+
325
  with gr.Accordion("Diagnostic Report", open=True):
326
  diagnostic_md = gr.Markdown("No diagnostic report yet.")
327
 
328
  if runs:
329
+ md0, a0, e0, s0, so0, d0, type0, domain0 = on_select_run(runs[0])
330
  summary_md.value = md0
331
  align_tbl.value = a0
332
  events_tbl.value = e0
333
+ semantic_tbl.value = s0
334
+ semantic_overview_md.value = so0
335
  diagnostic_md.value = d0
336
+ semantic_type.choices = type0["choices"]
337
+ semantic_domain.choices = domain0["choices"]
338
 
339
  run_dd.change(
340
  on_select_run,
341
  inputs=[run_dd],
342
+ outputs=[summary_md, align_tbl, events_tbl, semantic_tbl, semantic_overview_md, diagnostic_md, semantic_type, semantic_domain],
343
  )
344
 
345
+ search_btn.click(search_events, inputs=[run_dd, error_cls, contains], outputs=[result_tbl])
346
+
347
+ semantic_search_btn.click(
348
+ search_semantic,
349
+ inputs=[run_dd, semantic_judgement, semantic_severity, semantic_business_impact, semantic_type, semantic_domain, semantic_contains, semantic_min_cer],
350
+ outputs=[semantic_result_tbl],
351
  )
352
 
353
  run_btn.click(
354
  run_hf_job,
355
  inputs=[dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples],
356
+ outputs=[logs, run_dd, summary_md, align_tbl, events_tbl, semantic_tbl, semantic_overview_md, diagnostic_md, semantic_type, semantic_domain],
357
  )