unknown committed on
Commit
2d79471
·
1 Parent(s): 4ed43e6

Add diagnostic report panel to UI

Browse files
analysis/root_cause.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, List, Any
4
+ import pandas as pd
5
+
6
+
7
+ def _safe_ratio(n: int, d: int) -> float:
8
+ return float(n / d) if d else 0.0
9
+
10
+
11
def infer_root_causes(df_events: pd.DataFrame, df_align: pd.DataFrame) -> Dict[str, Any]:
    """
    Rule/statistics based root-cause inference.

    Input:
        - df_events: events.parquet loaded as DataFrame; one row per error
          event (expected columns: op_type, error_class).
        - df_align: aligned.jsonl loaded as DataFrame; one row per utterance
          (may carry wer/cer metrics and slice metadata such as device/domain).
    Output:
        - dict with keys:
            overview:        corpus-level counts and mean WER/CER
            root_causes:     ordered list of hypothesis dicts
                             (cause / confidence / evidence / recommendations)
            evidence_tables: raw counts backing the hypotheses
    """
    result: Dict[str, Any] = {
        "overview": {},
        "root_causes": [],
        "evidence_tables": {},
    }

    total_events = len(df_events)
    total_utts = len(df_align)

    def _col_mean(col: str):
        # Mean of a metric column, or None when the column is absent or all-NaN.
        if col in df_align.columns and df_align[col].notna().any():
            return float(df_align[col].dropna().mean())
        return None

    result["overview"] = {
        "num_utterances": int(total_utts),
        "num_error_events": int(total_events),
        "wer_mean": _col_mean("wer"),
        "cer_mean": _col_mean("cer"),
    }

    def _add_cause(cause: str, confidence: float, evidence, recommendations) -> None:
        # All hypotheses share the same dict shape; build it in one place.
        result["root_causes"].append({
            "cause": cause,
            "confidence": confidence,
            "evidence": evidence,
            "recommendations": recommendations,
        })

    if total_events == 0:
        _add_cause(
            "no_errors_detected",
            1.0,
            ["No error events found in current run."],
            ["Use a weaker model or more difficult dataset to make diagnosis meaningful."],
        )
        return result

    # Basic counts
    op_counts = df_events["op_type"].value_counts().to_dict() if "op_type" in df_events.columns else {}
    cls_counts = df_events["error_class"].value_counts().to_dict() if "error_class" in df_events.columns else {}

    result["evidence_tables"]["op_counts"] = {k: int(v) for k, v in op_counts.items()}
    result["evidence_tables"]["error_class_counts"] = {k: int(v) for k, v in cls_counts.items()}

    def _ratio(n: int) -> float:
        # total_events > 0 past the early return; keep the guard for safety.
        return float(n / total_events) if total_events else 0.0

    # --- Cause 1: number/time normalization problems
    num_time_count = int(cls_counts.get("number_or_time", 0))
    num_time_ratio = _ratio(num_time_count)
    if num_time_ratio >= 0.15:
        _add_cause(
            "number_time_format",
            round(min(0.95, 0.5 + num_time_ratio), 3),
            [
                f"number_or_time events = {num_time_count}/{total_events}",
                "Large proportion of errors are related to numbers, dates, times, or units."
            ],
            [
                "Add number/date/time normalization in both reference and hypothesis.",
                "Create post-processing rules for time/unit expressions.",
                "Add more number-heavy utterances into evaluation/training."
            ],
        )

    # --- Cause 2: mixed-language problems
    mixed_count = int(cls_counts.get("mixed_language", 0))
    mixed_ratio = _ratio(mixed_count)
    if mixed_ratio >= 0.10:
        _add_cause(
            "mixed_language",
            round(min(0.95, 0.45 + mixed_ratio), 3),
            [
                f"mixed_language events = {mixed_count}/{total_events}",
                "Frequent English/Latin-token related substitutions suggest code-switching weakness."
            ],
            [
                "Add bilingual/code-switching evaluation samples.",
                "Add domain-specific English terms, abbreviations, and brand names.",
                "Add post-processing lexicon for mixed-language phrases."
            ],
        )

    # --- Cause 3: deletion-heavy => possible noise / far-field / VAD
    deletion_count = int(op_counts.get("D", 0))
    insertion_count = int(op_counts.get("I", 0))
    substitution_count = int(op_counts.get("S", 0))

    deletion_ratio = _ratio(deletion_count)
    if deletion_ratio >= 0.30:
        _add_cause(
            "noise_or_farfield_or_vad",
            round(min(0.95, 0.5 + deletion_ratio), 3),
            [
                f"deletion events = {deletion_count}/{total_events}",
                "High deletion ratio often indicates weak audibility, noise, far-field speech, or segmentation/VAD issues."
            ],
            [
                "Compare CER/WER across device / SNR / domain slices.",
                "Inspect quiet, noisy, or long utterances.",
                "Tune VAD or segmentation strategy.",
                "Add noisy / far-field augmented audio."
            ],
        )

    # --- Cause 4: insertion-heavy => possible segmentation/repetition/echo
    insertion_ratio = _ratio(insertion_count)
    if insertion_ratio >= 0.20:
        _add_cause(
            "segmentation_or_repetition",
            round(min(0.9, 0.45 + insertion_ratio), 3),
            [
                f"insertion events = {insertion_count}/{total_events}",
                "High insertion ratio often suggests repeated decoding, segmentation mismatch, or echo."
            ],
            [
                "Inspect duplicated filler words and repeated fragments.",
                "Review chunking / segmentation.",
                "Check whether punctuation or normalization creates false insertions."
            ],
        )

    # --- Cause 5: slice-based evidence (device/domain/accent/speaker)
    slice_findings = []
    for key in ["device", "domain", "accent", "speaker"]:
        if key in df_align.columns and df_align[key].notna().any() and "cer" in df_align.columns:
            g = df_align.groupby(key)["cer"].mean().dropna().sort_values(ascending=False)
            if len(g) >= 2:
                worst_key = str(g.index[0])
                worst_val = float(g.iloc[0])
                best_val = float(g.iloc[-1])
                # Only flag a slice when the gap is large (>= 1.5x the best group).
                if best_val > 0 and worst_val / best_val >= 1.5:
                    slice_findings.append({
                        "slice_key": key,
                        "worst_group": worst_key,
                        "worst_cer": worst_val,
                        "best_cer": best_val,
                        "ratio": worst_val / best_val
                    })

    if slice_findings:
        result["evidence_tables"]["slice_findings"] = slice_findings
        _add_cause(
            "slice_specific_weakness",
            0.85,
            [
                "Some slices show much worse CER than others.",
                *[
                    f"{x['slice_key']}={x['worst_group']} has CER {x['worst_cer']:.4f}, ratio vs best={x['ratio']:.2f}"
                    for x in slice_findings[:5]
                ]
            ],
            [
                "Prioritize the worst slices in future analysis/training.",
                "Check whether those slices correspond to accent, device, or scenario mismatch."
            ],
        )

    # --- Cause 6: substitution-dominant => pronunciation / lexical confusion
    substitution_ratio = _ratio(substitution_count)
    if substitution_ratio >= 0.60:
        _add_cause(
            "pronunciation_or_lexical_confusion",
            round(min(0.9, 0.45 + substitution_ratio), 3),
            [
                f"substitution events = {substitution_count}/{total_events}",
                "Substitutions dominate, which often indicates pronunciation ambiguity, lexical confusion, or near-homophone errors."
            ],
            [
                "Add confusion-pair statistics.",
                "Check near-homophone and accent-sensitive confusions.",
                "Build a pronunciation-aware analysis layer."
            ],
        )

    # Fallback so callers always receive at least one hypothesis.
    if not result["root_causes"]:
        _add_cause(
            "general_asr_mismatch",
            0.5,
            ["No single dominant root cause identified from current heuristics."],
            [
                "Inspect top confusion pairs and low-performing slices.",
                "Increase metadata coverage (device/domain/accent/snr)."
            ],
        )

    return result
report/diagnostic_report.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import Dict, Any
5
+
6
+
7
# System message for the chat model; pins the report to the supplied
# structured statistics so the LLM does not fabricate evidence.
SYSTEM_PROMPT = """You are an ASR diagnostics expert.
Write a concise but evidence-based ASR error analysis report in Chinese.
Do not invent evidence. Only use the provided structured statistics.
Focus on:
1. major error patterns
2. likely root causes
3. confidence and uncertainty
4. actionable next steps
"""
16
+
17
+
18
def build_prompt(root_cause: Dict[str, Any], summary: Dict[str, Any]) -> str:
    """Build the Chinese user prompt embedding both JSON artifacts verbatim."""
    summary_json = json.dumps(summary, ensure_ascii=False, indent=2)
    root_cause_json = json.dumps(root_cause, ensure_ascii=False, indent=2)
    return f"""
请基于下面的结构化分析结果,生成一份中文 ASR 错误诊断报告。

要求:
- 先写总体结论
- 再写主要错误原因(按优先级排序)
- 每个原因要包含:现象、证据、可能原因、改进建议
- 最后给出一个优先级排序的行动清单
- 如果证据不足,要明确说“不确定”

【summary.json】
{summary_json}

【root_cause.json】
{root_cause_json}
"""
35
+
36
+
37
def generate_report_with_openai(root_cause: Dict[str, Any], summary: Dict[str, Any], client) -> str:
    """Ask the injected OpenAI client for a narrative diagnostic report.

    Args:
        root_cause: structured output of the rule-based inference step.
        summary: run-level summary statistics.
        client: an OpenAI-compatible client exposing chat.completions.create.
    Returns:
        The model's report text (first choice of the completion).
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": build_prompt(root_cause, summary)},
    ]
    # Low temperature keeps the report close to the supplied evidence.
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=messages,
        temperature=0.2,
    )
    return response.choices[0].message.content
requirements.txt CHANGED
@@ -22,3 +22,4 @@ soundfile
22
  librosa
23
  pydantic>=2.0
24
  opencc-python-reimplemented
 
 
22
  librosa
23
  pydantic>=2.0
24
  opencc-python-reimplemented
25
+ openai>=1.30.0
scripts/run_diagnostic.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import pandas as pd
7
+ from openai import OpenAI
8
+
9
+ from analysis.root_cause import infer_root_causes
10
+ from report.diagnostic_report import generate_report_with_openai
11
+
12
+
13
def load_jsonl(path: Path):
    """Read a JSON-Lines file into a list of parsed rows, skipping blank lines."""
    with path.open("r", encoding="utf-8") as handle:
        return [json.loads(text) for raw in handle if (text := raw.strip())]
21
+
22
+
23
def main(run_id: str, runs_dir: str = "runs"):
    """Run root-cause inference for one pipeline run and write its artifacts.

    Reads <runs_dir>/<run_id>/aligned.jsonl (required) plus events.parquet
    and summary.json (optional), writes root_cause.json, then asks OpenAI
    for a narrative report saved as diagnostic_report.md.

    Args:
        run_id: identifier of the run directory under runs_dir.
        runs_dir: root directory containing run outputs.
    Raises:
        FileNotFoundError: if aligned.jsonl is missing for this run.
    """
    run_dir = Path(runs_dir) / run_id

    aligned_path = run_dir / "aligned.jsonl"
    if not aligned_path.exists():
        # Fail early with a clear message instead of a bare open() error,
        # consistent with how the optional artifacts are guarded below.
        raise FileNotFoundError(
            f"{aligned_path} not found - run the analysis pipeline first."
        )

    events_path = run_dir / "events.parquet"
    summary_path = run_dir / "summary.json"

    df_align = pd.DataFrame(load_jsonl(aligned_path))
    df_events = pd.read_parquet(events_path) if events_path.exists() else pd.DataFrame()
    summary = json.loads(summary_path.read_text(encoding="utf-8")) if summary_path.exists() else {}

    root_cause = infer_root_causes(df_events, df_align)
    (run_dir / "root_cause.json").write_text(
        json.dumps(root_cause, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )

    # NOTE(review): OpenAI() reads OPENAI_API_KEY from the environment — confirm
    # it is set wherever this script runs.
    client = OpenAI()
    report = generate_report_with_openai(root_cause, summary, client)
    (run_dir / "diagnostic_report.md").write_text(report, encoding="utf-8")

    print(f"Diagnostic report written to: {run_dir / 'diagnostic_report.md'}")
41
+
42
+
43
if __name__ == "__main__":
    # CLI entry point: python scripts/run_diagnostic.py --run_id <id> [--runs_dir runs]
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("--run_id", required=True)
    ap.add_argument("--runs_dir", default="runs")
    args = ap.parse_args()
    main(args.run_id, args.runs_dir)
scripts/run_hf_job.py CHANGED
@@ -136,7 +136,7 @@ def main():
136
  data_dir.mkdir(parents=True, exist_ok=True)
137
  manifest_path = data_dir / "manifest_hf.jsonl"
138
 
139
- print("[1/4] Building manifest from Hugging Face dataset...")
140
  n = build_manifest_from_hf(
141
  dataset_id=args.dataset_id,
142
  dataset_config=args.dataset_config.strip() or None,
@@ -148,7 +148,7 @@ def main():
148
  print(f" - Wrote {n} samples to {manifest_path}")
149
 
150
  # Run pipeline functions directly (faster than nested subprocess)
151
- print("[2/4] Running ASR inference...")
152
  from pipeline.run_asr import run_asr
153
 
154
  run_id = run_asr(
@@ -160,11 +160,15 @@ def main():
160
  )
161
  print(f" - ASR done. run_id={run_id}")
162
 
163
- print("[3/4] Running analysis (align/events/report)...")
164
  from pipeline.run_analysis import run_analysis
165
 
166
  run_analysis(run_id, out_root=args.out_root)
167
- print("[4/4] Done.")
 
 
 
 
168
  print(f"Run directory: {Path(args.out_root) / run_id}")
169
 
170
 
 
136
  data_dir.mkdir(parents=True, exist_ok=True)
137
  manifest_path = data_dir / "manifest_hf.jsonl"
138
 
139
+ print("[1/5] Building manifest from Hugging Face dataset...")
140
  n = build_manifest_from_hf(
141
  dataset_id=args.dataset_id,
142
  dataset_config=args.dataset_config.strip() or None,
 
148
  print(f" - Wrote {n} samples to {manifest_path}")
149
 
150
  # Run pipeline functions directly (faster than nested subprocess)
151
+ print("[2/5] Running ASR inference...")
152
  from pipeline.run_asr import run_asr
153
 
154
  run_id = run_asr(
 
160
  )
161
  print(f" - ASR done. run_id={run_id}")
162
 
163
+ print("[3/5] Running analysis (align/events/report)...")
164
  from pipeline.run_analysis import run_analysis
165
 
166
  run_analysis(run_id, out_root=args.out_root)
167
+ print("[4/5] Running diagnostic report...")
168
+ from scripts.run_diagnostic import main as run_diagnostic_main
169
+ run_diagnostic_main(run_id, args.out_root)
170
+
171
+ print("[5/5] Done.")
172
  print(f"Run directory: {Path(args.out_root) / run_id}")
173
 
174
 
ui/app.py CHANGED
@@ -22,8 +22,14 @@ def list_runs():
22
 
23
  def load_run(run_id: str):
24
  run_dir = RUNS_DIR / run_id
 
25
  meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
26
- summary = json.loads((run_dir / "summary.json").read_text(encoding="utf-8")) if (run_dir / "summary.json").exists() else {}
 
 
 
 
 
27
 
28
  aligned_path = run_dir / "aligned.jsonl"
29
  if aligned_path.exists():
@@ -40,51 +46,81 @@ def load_run(run_id: str):
40
  events_path = run_dir / "events.parquet"
41
  df_events = pd.read_parquet(events_path) if events_path.exists() else pd.DataFrame()
42
 
43
- return meta, summary, df_align, df_events
 
 
 
 
 
 
 
44
 
45
 
46
  def build_summary_md(meta, summary):
47
  lines = []
48
  lines.append(f"### Run ID: `{meta.get('run_id')}`")
49
  lines.append(f"- Model: `{meta.get('model_info')}`")
50
- if "wer_mean" in summary:
 
51
  lines.append(f"- WER(mean): **{summary['wer_mean']:.4f}**")
52
- if "cer_mean" in summary:
 
53
  lines.append(f"- CER(mean): **{summary['cer_mean']:.4f}**")
 
54
  lines.append(f"- S/I/D: `{summary.get('sid_counts', {})}`")
 
 
 
 
55
  return "\n".join(lines)
56
 
57
 
58
  def on_select_run(run_id):
59
  if not run_id:
60
- return "", pd.DataFrame(), pd.DataFrame()
61
 
62
- meta, summary, df_align, df_events = load_run(run_id)
63
  md = build_summary_md(meta, summary)
64
 
65
- align_view = df_align[["utt_id", "wer", "cer"]].head(50) if len(df_align) else pd.DataFrame()
 
 
 
 
66
 
67
- if len(df_events):
 
 
68
  events_view = df_events[["utt_id", "op_type", "ref", "hyp", "error_class", "level"]].head(100)
69
  else:
70
  events_view = pd.DataFrame()
71
 
72
- return md, align_view, events_view
73
 
74
 
75
  def search_events(run_id, error_class, contains):
76
  if not run_id:
77
  return pd.DataFrame()
78
- _, _, _, df_events = load_run(run_id)
 
79
  if df_events is None or len(df_events) == 0:
80
  return pd.DataFrame()
81
 
82
- q = df_events
 
83
  if error_class and error_class != "ALL":
84
  q = q[q["error_class"] == error_class]
 
85
  if contains:
86
- q = q[q["ref"].astype(str).str.contains(contains) | q["hyp"].astype(str).str.contains(contains)]
87
- return q[["utt_id", "op_type", "ref", "hyp", "error_class", "level"]].head(200)
 
 
 
 
 
 
 
88
 
89
 
90
  def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples):
@@ -98,6 +134,7 @@ def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, lan
98
  "--language", language.strip(),
99
  "--num", str(int(num_samples)),
100
  ]
 
101
  if dataset_config and dataset_config.strip():
102
  cmd += ["--dataset_config", dataset_config.strip()]
103
 
@@ -105,13 +142,35 @@ def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, lan
105
  out = (p.stdout or "") + ("\n" + (p.stderr or "") if p.stderr else "")
106
 
107
  if p.returncode != 0:
108
- out += "\n\n[HINT] If you see 401/403 for Common Voice: set HF_TOKEN in Space Settings → Secrets, and accept dataset terms on HF."
109
- return out, gr.update(), "", pd.DataFrame(), pd.DataFrame()
 
 
 
 
 
 
 
 
 
 
110
 
111
  runs = list_runs()
112
  latest = runs[0] if runs else None
113
- md, align_view, events_view = on_select_run(latest)
114
- return out, gr.update(choices=runs, value=latest), md, align_view, events_view
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
 
117
  with gr.Blocks() as demo:
@@ -120,8 +179,7 @@ with gr.Blocks() as demo:
120
  with gr.Accordion("Run from Hugging Face", open=True):
121
  gr.Markdown(
122
  "Fill in a dataset and a Whisper model, then click **Run**. "
123
- "Default is Common Voice zh-CN validation (first 50 samples). "
124
- "If Common Voice is gated, set `HF_TOKEN` in Space **Settings → Secrets**."
125
  )
126
 
127
  with gr.Row():
@@ -138,7 +196,7 @@ with gr.Blocks() as demo:
138
  language = gr.Textbox(label="Language", value="zh")
139
 
140
  run_btn = gr.Button("Run")
141
- logs = gr.Textbox(label="Logs", lines=14)
142
 
143
  gr.Markdown("## Browse Existing Runs")
144
 
@@ -152,25 +210,46 @@ with gr.Blocks() as demo:
152
 
153
  with gr.Accordion("Search Error Events", open=False):
154
  error_cls = gr.Dropdown(
155
- choices=["ALL", "number_or_time", "mixed_language", "substitution", "deletion", "insertion", "other"],
 
 
 
 
 
 
 
 
156
  value="ALL",
157
  label="error_class",
158
  )
159
  contains = gr.Textbox(label="contains (ref/hyp substring)")
160
- btn = gr.Button("Search")
161
  result_tbl = gr.Dataframe(label="Search results", interactive=False)
162
 
 
 
 
163
  if runs:
164
- md0, a0, e0 = on_select_run(runs[0])
165
  summary_md.value = md0
166
  align_tbl.value = a0
167
  events_tbl.value = e0
 
168
 
169
- run_dd.change(on_select_run, inputs=[run_dd], outputs=[summary_md, align_tbl, events_tbl])
170
- btn.click(search_events, inputs=[run_dd, error_cls, contains], outputs=[result_tbl])
 
 
 
 
 
 
 
 
 
171
 
172
  run_btn.click(
173
  run_hf_job,
174
  inputs=[dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples],
175
- outputs=[logs, run_dd, summary_md, align_tbl, events_tbl],
176
  )
 
22
 
23
  def load_run(run_id: str):
24
  run_dir = RUNS_DIR / run_id
25
+
26
  meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
27
+
28
+ summary = (
29
+ json.loads((run_dir / "summary.json").read_text(encoding="utf-8"))
30
+ if (run_dir / "summary.json").exists()
31
+ else {}
32
+ )
33
 
34
  aligned_path = run_dir / "aligned.jsonl"
35
  if aligned_path.exists():
 
46
  events_path = run_dir / "events.parquet"
47
  df_events = pd.read_parquet(events_path) if events_path.exists() else pd.DataFrame()
48
 
49
+ diagnostic_path = run_dir / "diagnostic_report.md"
50
+ diagnostic_text = (
51
+ diagnostic_path.read_text(encoding="utf-8")
52
+ if diagnostic_path.exists()
53
+ else "No diagnostic report yet."
54
+ )
55
+
56
+ return meta, summary, df_align, df_events, diagnostic_text
57
 
58
 
59
def build_summary_md(meta, summary):
    """Render run metadata and summary metrics as a Markdown bullet list."""
    out = [
        f"### Run ID: `{meta.get('run_id')}`",
        f"- Model: `{meta.get('model_info')}`",
    ]

    # Metric lines appear only when the key exists and carries a real value.
    wer = summary.get("wer_mean")
    if "wer_mean" in summary and wer is not None:
        out.append(f"- WER(mean): **{wer:.4f}**")

    cer = summary.get("cer_mean")
    if "cer_mean" in summary and cer is not None:
        out.append(f"- CER(mean): **{cer:.4f}**")

    out.append(f"- S/I/D: `{summary.get('sid_counts', {})}`")

    if "top_error_classes" in summary:
        out.append(f"- Top error classes: `{summary.get('top_error_classes', {})}`")

    return "\n".join(out)
76
 
77
 
78
def on_select_run(run_id):
    """Load one run and shape its data for the UI widgets.

    Returns a 4-tuple (summary_markdown, align_table, events_table,
    diagnostic_text); empty placeholders when no run is selected or the
    expected columns are missing.
    """
    if not run_id:
        return "", pd.DataFrame(), pd.DataFrame(), "No diagnostic report yet."

    meta, summary, df_align, df_events, diagnostic_text = load_run(run_id)
    md = build_summary_md(meta, summary)

    align_cols = ["utt_id", "wer", "cer"]
    if len(df_align) and all(c in df_align.columns for c in align_cols):
        align_view = df_align[align_cols].head(50)
    else:
        align_view = pd.DataFrame()

    event_cols = ["utt_id", "op_type", "ref", "hyp", "error_class", "level"]
    events_view = (
        df_events[event_cols].head(100)
        if len(df_events) and all(c in df_events.columns for c in event_cols)
        else pd.DataFrame()
    )

    return md, align_view, events_view, diagnostic_text
+ return md, align_view, events_view, diagnostic_text
99
 
100
 
101
def search_events(run_id, error_class, contains):
    """Filter a run's error events by class and ref/hyp substring.

    Returns at most the first 200 matching rows, restricted to the display
    columns that actually exist; an empty DataFrame when there is nothing
    to search.
    """
    if not run_id:
        return pd.DataFrame()

    _, _, _, df_events, _ = load_run(run_id)
    if df_events is None or len(df_events) == 0:
        return pd.DataFrame()

    filtered = df_events.copy()

    if error_class and error_class != "ALL":
        filtered = filtered[filtered["error_class"] == error_class]

    if contains:
        needle = str(contains)
        # Match the substring in either side of the alignment; NaN never matches.
        ref_hit = filtered["ref"].astype(str).str.contains(needle, na=False)
        hyp_hit = filtered["hyp"].astype(str).str.contains(needle, na=False)
        filtered = filtered[ref_hit | hyp_hit]

    wanted = ["utt_id", "op_type", "ref", "hyp", "error_class", "level"]
    present = [c for c in wanted if c in filtered.columns]
    return filtered[present].head(200)
+ return q[cols].head(200)
124
 
125
 
126
  def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples):
 
134
  "--language", language.strip(),
135
  "--num", str(int(num_samples)),
136
  ]
137
+
138
  if dataset_config and dataset_config.strip():
139
  cmd += ["--dataset_config", dataset_config.strip()]
140
 
 
142
  out = (p.stdout or "") + ("\n" + (p.stderr or "") if p.stderr else "")
143
 
144
  if p.returncode != 0:
145
+ out += (
146
+ "\n\n[HINT] If you see 401/403 for Common Voice: "
147
+ "set HF_TOKEN in Space Settings → Secrets, and accept dataset terms on HF."
148
+ )
149
+ return (
150
+ out,
151
+ gr.update(),
152
+ "",
153
+ pd.DataFrame(),
154
+ pd.DataFrame(),
155
+ "No diagnostic report yet.",
156
+ )
157
 
158
  runs = list_runs()
159
  latest = runs[0] if runs else None
160
+
161
+ if latest:
162
+ md, align_view, events_view, diagnostic_text = on_select_run(latest)
163
+ else:
164
+ md, align_view, events_view, diagnostic_text = "", pd.DataFrame(), pd.DataFrame(), "No diagnostic report yet."
165
+
166
+ return (
167
+ out,
168
+ gr.update(choices=runs, value=latest),
169
+ md,
170
+ align_view,
171
+ events_view,
172
+ diagnostic_text,
173
+ )
174
 
175
 
176
  with gr.Blocks() as demo:
 
179
  with gr.Accordion("Run from Hugging Face", open=True):
180
  gr.Markdown(
181
  "Fill in a dataset and a Whisper model, then click **Run**. "
182
+ "If the dataset is gated, set `HF_TOKEN` in Space **Settings Secrets**."
 
183
  )
184
 
185
  with gr.Row():
 
196
  language = gr.Textbox(label="Language", value="zh")
197
 
198
  run_btn = gr.Button("Run")
199
+ logs = gr.Textbox(label="Logs", lines=16)
200
 
201
  gr.Markdown("## Browse Existing Runs")
202
 
 
210
 
211
  with gr.Accordion("Search Error Events", open=False):
212
  error_cls = gr.Dropdown(
213
+ choices=[
214
+ "ALL",
215
+ "number_or_time",
216
+ "mixed_language",
217
+ "substitution",
218
+ "deletion",
219
+ "insertion",
220
+ "other",
221
+ ],
222
  value="ALL",
223
  label="error_class",
224
  )
225
  contains = gr.Textbox(label="contains (ref/hyp substring)")
226
+ search_btn = gr.Button("Search")
227
  result_tbl = gr.Dataframe(label="Search results", interactive=False)
228
 
229
+ with gr.Accordion("Diagnostic Report", open=True):
230
+ diagnostic_md = gr.Markdown("No diagnostic report yet.")
231
+
232
  if runs:
233
+ md0, a0, e0, d0 = on_select_run(runs[0])
234
  summary_md.value = md0
235
  align_tbl.value = a0
236
  events_tbl.value = e0
237
+ diagnostic_md.value = d0
238
 
239
+ run_dd.change(
240
+ on_select_run,
241
+ inputs=[run_dd],
242
+ outputs=[summary_md, align_tbl, events_tbl, diagnostic_md],
243
+ )
244
+
245
+ search_btn.click(
246
+ search_events,
247
+ inputs=[run_dd, error_cls, contains],
248
+ outputs=[result_tbl],
249
+ )
250
 
251
  run_btn.click(
252
  run_hf_job,
253
  inputs=[dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples],
254
+ outputs=[logs, run_dd, summary_md, align_tbl, events_tbl, diagnostic_md],
255
  )