unknown committed on
Commit
2d79471
·
1 Parent(s): 4ed43e6

Add diagnostic report panel to UI

Browse files
analysis/root_cause.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, List, Any
4
+ import pandas as pd
5
+
6
+
7
+ def _safe_ratio(n: int, d: int) -> float:
8
+ return float(n / d) if d else 0.0
9
+
10
+
11
def infer_root_causes(df_events: pd.DataFrame, df_align: pd.DataFrame) -> Dict[str, Any]:
    """
    Rule/statistics based root-cause inference.

    Input:
        - df_events: events.parquet loaded as DataFrame; one row per error
          event (expected columns: op_type, error_class).
        - df_align: aligned.jsonl loaded as DataFrame; one row per utterance
          (may carry wer/cer metrics and slice metadata such as device/domain).
    Output:
        - dict with keys:
            overview:        corpus-level counts and mean WER/CER
            root_causes:     ordered list of hypothesis dicts
                             (cause / confidence / evidence / recommendations)
            evidence_tables: raw counts backing the hypotheses
    """
    result: Dict[str, Any] = {
        "overview": {},
        "root_causes": [],
        "evidence_tables": {},
    }

    total_events = len(df_events)
    total_utts = len(df_align)

    def _col_mean(col: str):
        # Mean of a metric column, or None when the column is absent or all-NaN.
        if col in df_align.columns and df_align[col].notna().any():
            return float(df_align[col].dropna().mean())
        return None

    result["overview"] = {
        "num_utterances": int(total_utts),
        "num_error_events": int(total_events),
        "wer_mean": _col_mean("wer"),
        "cer_mean": _col_mean("cer"),
    }

    def _add_cause(cause: str, confidence: float, evidence, recommendations) -> None:
        # All hypotheses share the same dict shape; build it in one place.
        result["root_causes"].append({
            "cause": cause,
            "confidence": confidence,
            "evidence": evidence,
            "recommendations": recommendations,
        })

    if total_events == 0:
        _add_cause(
            "no_errors_detected",
            1.0,
            ["No error events found in current run."],
            ["Use a weaker model or more difficult dataset to make diagnosis meaningful."],
        )
        return result

    # Basic counts
    op_counts = df_events["op_type"].value_counts().to_dict() if "op_type" in df_events.columns else {}
    cls_counts = df_events["error_class"].value_counts().to_dict() if "error_class" in df_events.columns else {}

    result["evidence_tables"]["op_counts"] = {k: int(v) for k, v in op_counts.items()}
    result["evidence_tables"]["error_class_counts"] = {k: int(v) for k, v in cls_counts.items()}

    def _ratio(n: int) -> float:
        # total_events > 0 past the early return; keep the guard for safety.
        return float(n / total_events) if total_events else 0.0

    # --- Cause 1: number/time normalization problems
    num_time_count = int(cls_counts.get("number_or_time", 0))
    num_time_ratio = _ratio(num_time_count)
    if num_time_ratio >= 0.15:
        _add_cause(
            "number_time_format",
            round(min(0.95, 0.5 + num_time_ratio), 3),
            [
                f"number_or_time events = {num_time_count}/{total_events}",
                "Large proportion of errors are related to numbers, dates, times, or units."
            ],
            [
                "Add number/date/time normalization in both reference and hypothesis.",
                "Create post-processing rules for time/unit expressions.",
                "Add more number-heavy utterances into evaluation/training."
            ],
        )

    # --- Cause 2: mixed-language problems
    mixed_count = int(cls_counts.get("mixed_language", 0))
    mixed_ratio = _ratio(mixed_count)
    if mixed_ratio >= 0.10:
        _add_cause(
            "mixed_language",
            round(min(0.95, 0.45 + mixed_ratio), 3),
            [
                f"mixed_language events = {mixed_count}/{total_events}",
                "Frequent English/Latin-token related substitutions suggest code-switching weakness."
            ],
            [
                "Add bilingual/code-switching evaluation samples.",
                "Add domain-specific English terms, abbreviations, and brand names.",
                "Add post-processing lexicon for mixed-language phrases."
            ],
        )

    # --- Cause 3: deletion-heavy => possible noise / far-field / VAD
    deletion_count = int(op_counts.get("D", 0))
    insertion_count = int(op_counts.get("I", 0))
    substitution_count = int(op_counts.get("S", 0))

    deletion_ratio = _ratio(deletion_count)
    if deletion_ratio >= 0.30:
        _add_cause(
            "noise_or_farfield_or_vad",
            round(min(0.95, 0.5 + deletion_ratio), 3),
            [
                f"deletion events = {deletion_count}/{total_events}",
                "High deletion ratio often indicates weak audibility, noise, far-field speech, or segmentation/VAD issues."
            ],
            [
                "Compare CER/WER across device / SNR / domain slices.",
                "Inspect quiet, noisy, or long utterances.",
                "Tune VAD or segmentation strategy.",
                "Add noisy / far-field augmented audio."
            ],
        )

    # --- Cause 4: insertion-heavy => possible segmentation/repetition/echo
    insertion_ratio = _ratio(insertion_count)
    if insertion_ratio >= 0.20:
        _add_cause(
            "segmentation_or_repetition",
            round(min(0.9, 0.45 + insertion_ratio), 3),
            [
                f"insertion events = {insertion_count}/{total_events}",
                "High insertion ratio often suggests repeated decoding, segmentation mismatch, or echo."
            ],
            [
                "Inspect duplicated filler words and repeated fragments.",
                "Review chunking / segmentation.",
                "Check whether punctuation or normalization creates false insertions."
            ],
        )

    # --- Cause 5: slice-based evidence (device/domain/accent/speaker)
    slice_findings = []
    for key in ["device", "domain", "accent", "speaker"]:
        if key in df_align.columns and df_align[key].notna().any() and "cer" in df_align.columns:
            g = df_align.groupby(key)["cer"].mean().dropna().sort_values(ascending=False)
            if len(g) >= 2:
                worst_key = str(g.index[0])
                worst_val = float(g.iloc[0])
                best_val = float(g.iloc[-1])
                # Only flag a slice when the gap is large (>= 1.5x the best group).
                if best_val > 0 and worst_val / best_val >= 1.5:
                    slice_findings.append({
                        "slice_key": key,
                        "worst_group": worst_key,
                        "worst_cer": worst_val,
                        "best_cer": best_val,
                        "ratio": worst_val / best_val
                    })

    if slice_findings:
        result["evidence_tables"]["slice_findings"] = slice_findings
        _add_cause(
            "slice_specific_weakness",
            0.85,
            [
                "Some slices show much worse CER than others.",
                *[
                    f"{x['slice_key']}={x['worst_group']} has CER {x['worst_cer']:.4f}, ratio vs best={x['ratio']:.2f}"
                    for x in slice_findings[:5]
                ]
            ],
            [
                "Prioritize the worst slices in future analysis/training.",
                "Check whether those slices correspond to accent, device, or scenario mismatch."
            ],
        )

    # --- Cause 6: substitution-dominant => pronunciation / lexical confusion
    substitution_ratio = _ratio(substitution_count)
    if substitution_ratio >= 0.60:
        _add_cause(
            "pronunciation_or_lexical_confusion",
            round(min(0.9, 0.45 + substitution_ratio), 3),
            [
                f"substitution events = {substitution_count}/{total_events}",
                "Substitutions dominate, which often indicates pronunciation ambiguity, lexical confusion, or near-homophone errors."
            ],
            [
                "Add confusion-pair statistics.",
                "Check near-homophone and accent-sensitive confusions.",
                "Build a pronunciation-aware analysis layer."
            ],
        )

    # Fallback so callers always receive at least one hypothesis.
    if not result["root_causes"]:
        _add_cause(
            "general_asr_mismatch",
            0.5,
            ["No single dominant root cause identified from current heuristics."],
            [
                "Inspect top confusion pairs and low-performing slices.",
                "Increase metadata coverage (device/domain/accent/snr)."
            ],
        )

    return result
report/diagnostic_report.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import Dict, Any
5
+
6
+
7
# System message for the chat model; pins the report to the supplied
# structured statistics so the LLM does not fabricate evidence.
SYSTEM_PROMPT = """You are an ASR diagnostics expert.
Write a concise but evidence-based ASR error analysis report in Chinese.
Do not invent evidence. Only use the provided structured statistics.
Focus on:
1. major error patterns
2. likely root causes
3. confidence and uncertainty
4. actionable next steps
"""
16
+
17
+
18
def build_prompt(root_cause: Dict[str, Any], summary: Dict[str, Any]) -> str:
    """Build the Chinese user prompt embedding both JSON artifacts verbatim."""
    summary_json = json.dumps(summary, ensure_ascii=False, indent=2)
    root_cause_json = json.dumps(root_cause, ensure_ascii=False, indent=2)
    return f"""
请基于下面的结构化分析结果,生成一份中文 ASR 错误诊断报告。

要求:
- 先写总体结论
- 再写主要错误原因(按优先级排序)
- 每个原因要包含:现象、证据、可能原因、改进建议
- 最后给出一个优先级排序的行动清单
- 如果证据不足,要明确说“不确定”

【summary.json】
{summary_json}

【root_cause.json】
{root_cause_json}
"""
35
+
36
+
37
def generate_report_with_openai(root_cause: Dict[str, Any], summary: Dict[str, Any], client) -> str:
    """Ask the injected OpenAI client for a narrative diagnostic report.

    Args:
        root_cause: structured output of the rule-based inference step.
        summary: run-level summary statistics.
        client: an OpenAI-compatible client exposing chat.completions.create.
    Returns:
        The model's report text (first choice of the completion).
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_prompt(root_cause, summary)},
    ]
    # Low temperature keeps the report close to the supplied evidence.
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=messages,
        temperature=0.2,
    )
    return response.choices[0].message.content
requirements.txt CHANGED
@@ -22,3 +22,4 @@ soundfile
22
  librosa
23
  pydantic>=2.0
24
  opencc-python-reimplemented
 
 
22
  librosa
23
  pydantic>=2.0
24
  opencc-python-reimplemented
25
+ openai>=1.30.0
scripts/run_diagnostic.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import pandas as pd
7
+ from openai import OpenAI
8
+
9
+ from analysis.root_cause import infer_root_causes
10
+ from report.diagnostic_report import generate_report_with_openai
11
+
12
+
13
def load_jsonl(path: Path):
    """Read a JSON-Lines file into a list of parsed rows, skipping blank lines."""
    with path.open("r", encoding="utf-8") as handle:
        return [json.loads(text) for raw in handle if (text := raw.strip())]
21
+
22
+
23
def main(run_id: str, runs_dir: str = "runs"):
    """Run root-cause inference for one pipeline run and write its artifacts.

    Reads <runs_dir>/<run_id>/aligned.jsonl (required) plus events.parquet
    and summary.json (optional), writes root_cause.json, then asks OpenAI
    for a narrative report saved as diagnostic_report.md.

    Args:
        run_id: identifier of the run directory under runs_dir.
        runs_dir: root directory containing run outputs.
    Raises:
        FileNotFoundError: if aligned.jsonl is missing for this run.
    """
    run_dir = Path(runs_dir) / run_id

    aligned_path = run_dir / "aligned.jsonl"
    if not aligned_path.exists():
        # Fail early with a clear message instead of a bare open() error,
        # consistent with how the optional artifacts are guarded below.
        raise FileNotFoundError(
            f"{aligned_path} not found - run the analysis pipeline first."
        )

    events_path = run_dir / "events.parquet"
    summary_path = run_dir / "summary.json"

    df_align = pd.DataFrame(load_jsonl(aligned_path))
    df_events = pd.read_parquet(events_path) if events_path.exists() else pd.DataFrame()
    summary = json.loads(summary_path.read_text(encoding="utf-8")) if summary_path.exists() else {}

    root_cause = infer_root_causes(df_events, df_align)
    (run_dir / "root_cause.json").write_text(
        json.dumps(root_cause, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )

    # NOTE(review): OpenAI() reads OPENAI_API_KEY from the environment — confirm
    # it is set wherever this script runs.
    client = OpenAI()
    report = generate_report_with_openai(root_cause, summary, client)
    (run_dir / "diagnostic_report.md").write_text(report, encoding="utf-8")

    print(f"Diagnostic report written to: {run_dir / 'diagnostic_report.md'}")
41
+
42
+
43
if __name__ == "__main__":
    # CLI entry point: python scripts/run_diagnostic.py --run_id <id> [--runs_dir runs]
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("--run_id", required=True)
    ap.add_argument("--runs_dir", default="runs")
    args = ap.parse_args()
    main(args.run_id, args.runs_dir)
scripts/run_hf_job.py CHANGED
@@ -136,7 +136,7 @@ def main():
136
  data_dir.mkdir(parents=True, exist_ok=True)
137
  manifest_path = data_dir / "manifest_hf.jsonl"
138
 
139
- print("[1/4] Building manifest from Hugging Face dataset...")
140
  n = build_manifest_from_hf(
141
  dataset_id=args.dataset_id,
142
  dataset_config=args.dataset_config.strip() or None,
@@ -148,7 +148,7 @@ def main():
148
  print(f" - Wrote {n} samples to {manifest_path}")
149
 
150
  # Run pipeline functions directly (faster than nested subprocess)
151
- print("[2/4] Running ASR inference...")
152
  from pipeline.run_asr import run_asr
153
 
154
  run_id = run_asr(
@@ -160,11 +160,15 @@ def main():
160
  )
161
  print(f" - ASR done. run_id={run_id}")
162
 
163
- print("[3/4] Running analysis (align/events/report)...")
164
  from pipeline.run_analysis import run_analysis
165
 
166
  run_analysis(run_id, out_root=args.out_root)
167
- print("[4/4] Done.")
 
 
 
 
168
  print(f"Run directory: {Path(args.out_root) / run_id}")
169
 
170
 
 
136
  data_dir.mkdir(parents=True, exist_ok=True)
137
  manifest_path = data_dir / "manifest_hf.jsonl"
138
 
139
+ print("[1/5] Building manifest from Hugging Face dataset...")
140
  n = build_manifest_from_hf(
141
  dataset_id=args.dataset_id,
142
  dataset_config=args.dataset_config.strip() or None,
 
148
  print(f" - Wrote {n} samples to {manifest_path}")
149
 
150
  # Run pipeline functions directly (faster than nested subprocess)
151
+ print("[2/5] Running ASR inference...")
152
  from pipeline.run_asr import run_asr
153
 
154
  run_id = run_asr(
 
160
  )
161
  print(f" - ASR done. run_id={run_id}")
162
 
163
+ print("[3/5] Running analysis (align/events/report)...")
164
  from pipeline.run_analysis import run_analysis
165
 
166
  run_analysis(run_id, out_root=args.out_root)
167
+ print("[4/5] Running diagnostic report...")
168
+ from scripts.run_diagnostic import main as run_diagnostic_main
169
+ run_diagnostic_main(run_id, args.out_root)
170
+
171
+ print("[5/5] Done.")
172
  print(f"Run directory: {Path(args.out_root) / run_id}")
173
 
174
 
ui/app.py CHANGED
@@ -22,8 +22,14 @@ def list_runs():
22
 
23
  def load_run(run_id: str):
24
  run_dir = RUNS_DIR / run_id
 
25
  meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
26
- summary = json.loads((run_dir / "summary.json").read_text(encoding="utf-8")) if (run_dir / "summary.json").exists() else {}
 
 
 
 
 
27
 
28
  aligned_path = run_dir / "aligned.jsonl"
29
  if aligned_path.exists():
@@ -40,51 +46,81 @@ def load_run(run_id: str):
40
  events_path = run_dir / "events.parquet"
41
  df_events = pd.read_parquet(events_path) if events_path.exists() else pd.DataFrame()
42
 
43
- return meta, summary, df_align, df_events
 
 
 
 
 
 
 
44
 
45
 
46
  def build_summary_md(meta, summary):
47
  lines = []
48
  lines.append(f"### Run ID: `{meta.get('run_id')}`")
49
  lines.append(f"- Model: `{meta.get('model_info')}`")
50
- if "wer_mean" in summary:
 
51
  lines.append(f"- WER(mean): **{summary['wer_mean']:.4f}**")
52
- if "cer_mean" in summary:
 
53
  lines.append(f"- CER(mean): **{summary['cer_mean']:.4f}**")
 
54
  lines.append(f"- S/I/D: `{summary.get('sid_counts', {})}`")
 
 
 
 
55
  return "\n".join(lines)
56
 
57
 
58
  def on_select_run(run_id):
59
  if not run_id:
60
- return "", pd.DataFrame(), pd.DataFrame()
61
 
62
- meta, summary, df_align, df_events = load_run(run_id)
63
  md = build_summary_md(meta, summary)
64
 
65
- align_view = df_align[["utt_id", "wer", "cer"]].head(50) if len(df_align) else pd.DataFrame()
 
 
 
 
66
 
67
- if len(df_events):
 
 
68
  events_view = df_events[["utt_id", "op_type", "ref", "hyp", "error_class", "level"]].head(100)
69
  else:
70
  events_view = pd.DataFrame()
71
 
72
- return md, align_view, events_view
73
 
74
 
75
  def search_events(run_id, error_class, contains):
76
  if not run_id:
77
  return pd.DataFrame()
78
- _, _, _, df_events = load_run(run_id)
 
79
  if df_events is None or len(df_events) == 0:
80
  return pd.DataFrame()
81
 
82
- q = df_events
 
83
  if error_class and error_class != "ALL":
84
  q = q[q["error_class"] == error_class]
 
85
  if contains:
86
- q = q[q["ref"].astype(str).str.contains(contains) | q["hyp"].astype(str).str.contains(contains)]
87
- return q[["utt_id", "op_type", "ref", "hyp", "error_class", "level"]].head(200)
 
 
 
 
 
 
 
88
 
89
 
90
  def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples):
@@ -98,6 +134,7 @@ def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, lan
98
  "--language", language.strip(),
99
  "--num", str(int(num_samples)),
100
  ]
 
101
  if dataset_config and dataset_config.strip():
102
  cmd += ["--dataset_config", dataset_config.strip()]
103
 
@@ -105,13 +142,35 @@ def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, lan
105
  out = (p.stdout or "") + ("\n" + (p.stderr or "") if p.stderr else "")
106
 
107
  if p.returncode != 0:
108
- out += "\n\n[HINT] If you see 401/403 for Common Voice: set HF_TOKEN in Space Settings → Secrets, and accept dataset terms on HF."
109
- return out, gr.update(), "", pd.DataFrame(), pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
110
 
111
  runs = list_runs()
112
  latest = runs[0] if runs else None
113
- md, align_view, events_view = on_select_run(latest)
114
- return out, gr.update(choices=runs, value=latest), md, align_view, events_view
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
 
117
  with gr.Blocks() as demo:
@@ -120,8 +179,7 @@ with gr.Blocks() as demo:
120
  with gr.Accordion("Run from Hugging Face", open=True):
121
  gr.Markdown(
122
  "Fill in a dataset and a Whisper model, then click **Run**. "
123
- "Default is Common Voice zh-CN validation (first 50 samples). "
124
- "If Common Voice is gated, set `HF_TOKEN` in Space **Settings → Secrets**."
125
  )
126
 
127
  with gr.Row():
@@ -138,7 +196,7 @@ with gr.Blocks() as demo:
138
  language = gr.Textbox(label="Language", value="zh")
139
 
140
  run_btn = gr.Button("Run")
141
- logs = gr.Textbox(label="Logs", lines=14)
142
 
143
  gr.Markdown("## Browse Existing Runs")
144
 
@@ -152,25 +210,46 @@ with gr.Blocks() as demo:
152
 
153
  with gr.Accordion("Search Error Events", open=False):
154
  error_cls = gr.Dropdown(
155
- choices=["ALL", "number_or_time", "mixed_language", "substitution", "deletion", "insertion", "other"],
 
 
 
 
 
 
 
 
156
  value="ALL",
157
  label="error_class",
158
  )
159
  contains = gr.Textbox(label="contains (ref/hyp substring)")
160
- btn = gr.Button("Search")
161
  result_tbl = gr.Dataframe(label="Search results", interactive=False)
162
 
 
 
 
163
  if runs:
164
- md0, a0, e0 = on_select_run(runs[0])
165
  summary_md.value = md0
166
  align_tbl.value = a0
167
  events_tbl.value = e0
 
168
 
169
- run_dd.change(on_select_run, inputs=[run_dd], outputs=[summary_md, align_tbl, events_tbl])
170
- btn.click(search_events, inputs=[run_dd, error_cls, contains], outputs=[result_tbl])
 
 
 
 
 
 
 
 
 
171
 
172
  run_btn.click(
173
  run_hf_job,
174
  inputs=[dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples],
175
- outputs=[logs, run_dd, summary_md, align_tbl, events_tbl],
176
  )
 
22
 
23
  def load_run(run_id: str):
24
  run_dir = RUNS_DIR / run_id
25
+
26
  meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
27
+
28
+ summary = (
29
+ json.loads((run_dir / "summary.json").read_text(encoding="utf-8"))
30
+ if (run_dir / "summary.json").exists()
31
+ else {}
32
+ )
33
 
34
  aligned_path = run_dir / "aligned.jsonl"
35
  if aligned_path.exists():
 
46
  events_path = run_dir / "events.parquet"
47
  df_events = pd.read_parquet(events_path) if events_path.exists() else pd.DataFrame()
48
 
49
+ diagnostic_path = run_dir / "diagnostic_report.md"
50
+ diagnostic_text = (
51
+ diagnostic_path.read_text(encoding="utf-8")
52
+ if diagnostic_path.exists()
53
+ else "No diagnostic report yet."
54
+ )
55
+
56
+ return meta, summary, df_align, df_events, diagnostic_text
57
 
58
 
59
def build_summary_md(meta, summary):
    """Render run metadata and summary metrics as a Markdown bullet list."""
    out = [
        f"### Run ID: `{meta.get('run_id')}`",
        f"- Model: `{meta.get('model_info')}`",
    ]

    # Metric lines appear only when the key exists and carries a real value.
    wer = summary.get("wer_mean")
    if "wer_mean" in summary and wer is not None:
        out.append(f"- WER(mean): **{wer:.4f}**")

    cer = summary.get("cer_mean")
    if "cer_mean" in summary and cer is not None:
        out.append(f"- CER(mean): **{cer:.4f}**")

    out.append(f"- S/I/D: `{summary.get('sid_counts', {})}`")

    if "top_error_classes" in summary:
        out.append(f"- Top error classes: `{summary.get('top_error_classes', {})}`")

    return "\n".join(out)
76
 
77
 
78
def on_select_run(run_id):
    """Load one run and shape its data for the UI widgets.

    Returns a 4-tuple (summary_markdown, align_table, events_table,
    diagnostic_text); empty placeholders when no run is selected or the
    expected columns are missing.
    """
    if not run_id:
        return "", pd.DataFrame(), pd.DataFrame(), "No diagnostic report yet."

    meta, summary, df_align, df_events, diagnostic_text = load_run(run_id)
    md = build_summary_md(meta, summary)

    align_cols = ["utt_id", "wer", "cer"]
    if len(df_align) and all(c in df_align.columns for c in align_cols):
        align_view = df_align[align_cols].head(50)
    else:
        align_view = pd.DataFrame()

    event_cols = ["utt_id", "op_type", "ref", "hyp", "error_class", "level"]
    events_view = (
        df_events[event_cols].head(100)
        if len(df_events) and all(c in df_events.columns for c in event_cols)
        else pd.DataFrame()
    )

    return md, align_view, events_view, diagnostic_text
+ return md, align_view, events_view, diagnostic_text
99
 
100
 
101
def search_events(run_id, error_class, contains):
    """Filter a run's error events by class and ref/hyp substring.

    Returns at most the first 200 matching rows, restricted to the display
    columns that actually exist; an empty DataFrame when there is nothing
    to search.
    """
    if not run_id:
        return pd.DataFrame()

    _, _, _, df_events, _ = load_run(run_id)
    if df_events is None or len(df_events) == 0:
        return pd.DataFrame()

    filtered = df_events.copy()

    if error_class and error_class != "ALL":
        filtered = filtered[filtered["error_class"] == error_class]

    if contains:
        needle = str(contains)
        # Match the substring in either side of the alignment; NaN never matches.
        ref_hit = filtered["ref"].astype(str).str.contains(needle, na=False)
        hyp_hit = filtered["hyp"].astype(str).str.contains(needle, na=False)
        filtered = filtered[ref_hit | hyp_hit]

    wanted = ["utt_id", "op_type", "ref", "hyp", "error_class", "level"]
    present = [c for c in wanted if c in filtered.columns]
    return filtered[present].head(200)
+ return q[cols].head(200)
124
 
125
 
126
  def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples):
 
134
  "--language", language.strip(),
135
  "--num", str(int(num_samples)),
136
  ]
137
+
138
  if dataset_config and dataset_config.strip():
139
  cmd += ["--dataset_config", dataset_config.strip()]
140
 
 
142
  out = (p.stdout or "") + ("\n" + (p.stderr or "") if p.stderr else "")
143
 
144
  if p.returncode != 0:
145
+ out += (
146
+ "\n\n[HINT] If you see 401/403 for Common Voice: "
147
+ "set HF_TOKEN in Space Settings → Secrets, and accept dataset terms on HF."
148
+ )
149
+ return (
150
+ out,
151
+ gr.update(),
152
+ "",
153
+ pd.DataFrame(),
154
+ pd.DataFrame(),
155
+ "No diagnostic report yet.",
156
+ )
157
 
158
  runs = list_runs()
159
  latest = runs[0] if runs else None
160
+
161
+ if latest:
162
+ md, align_view, events_view, diagnostic_text = on_select_run(latest)
163
+ else:
164
+ md, align_view, events_view, diagnostic_text = "", pd.DataFrame(), pd.DataFrame(), "No diagnostic report yet."
165
+
166
+ return (
167
+ out,
168
+ gr.update(choices=runs, value=latest),
169
+ md,
170
+ align_view,
171
+ events_view,
172
+ diagnostic_text,
173
+ )
174
 
175
 
176
  with gr.Blocks() as demo:
 
179
  with gr.Accordion("Run from Hugging Face", open=True):
180
  gr.Markdown(
181
  "Fill in a dataset and a Whisper model, then click **Run**. "
182
+ "If the dataset is gated, set `HF_TOKEN` in Space **Settings Secrets**."
 
183
  )
184
 
185
  with gr.Row():
 
196
  language = gr.Textbox(label="Language", value="zh")
197
 
198
  run_btn = gr.Button("Run")
199
+ logs = gr.Textbox(label="Logs", lines=16)
200
 
201
  gr.Markdown("## Browse Existing Runs")
202
 
 
210
 
211
  with gr.Accordion("Search Error Events", open=False):
212
  error_cls = gr.Dropdown(
213
+ choices=[
214
+ "ALL",
215
+ "number_or_time",
216
+ "mixed_language",
217
+ "substitution",
218
+ "deletion",
219
+ "insertion",
220
+ "other",
221
+ ],
222
  value="ALL",
223
  label="error_class",
224
  )
225
  contains = gr.Textbox(label="contains (ref/hyp substring)")
226
+ search_btn = gr.Button("Search")
227
  result_tbl = gr.Dataframe(label="Search results", interactive=False)
228
 
229
+ with gr.Accordion("Diagnostic Report", open=True):
230
+ diagnostic_md = gr.Markdown("No diagnostic report yet.")
231
+
232
  if runs:
233
+ md0, a0, e0, d0 = on_select_run(runs[0])
234
  summary_md.value = md0
235
  align_tbl.value = a0
236
  events_tbl.value = e0
237
+ diagnostic_md.value = d0
238
 
239
+ run_dd.change(
240
+ on_select_run,
241
+ inputs=[run_dd],
242
+ outputs=[summary_md, align_tbl, events_tbl, diagnostic_md],
243
+ )
244
+
245
+ search_btn.click(
246
+ search_events,
247
+ inputs=[run_dd, error_cls, contains],
248
+ outputs=[result_tbl],
249
+ )
250
 
251
  run_btn.click(
252
  run_hf_job,
253
  inputs=[dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples],
254
+ outputs=[logs, run_dd, summary_md, align_tbl, events_tbl, diagnostic_md],
255
  )