Spaces:
Running
Running
unknown committed on
Commit ·
2d79471
1
Parent(s): 4ed43e6
Add diagnostic report panel to UI
Browse files- analysis/root_cause.py +186 -0
- report/diagnostic_report.py +48 -0
- requirements.txt +1 -0
- scripts/run_diagnostic.py +49 -0
- scripts/run_hf_job.py +8 -4
- ui/app.py +105 -26
analysis/root_cause.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Dict, List, Any
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def _safe_ratio(n: int, d: int) -> float:
|
| 8 |
+
return float(n / d) if d else 0.0
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def infer_root_causes(df_events: pd.DataFrame, df_align: pd.DataFrame) -> Dict[str, Any]:
|
| 12 |
+
"""
|
| 13 |
+
Rule/statistics based root-cause inference.
|
| 14 |
+
Input:
|
| 15 |
+
- df_events: events.parquet loaded as DataFrame
|
| 16 |
+
- df_align: aligned.jsonl loaded as DataFrame
|
| 17 |
+
Output:
|
| 18 |
+
- dict with evidence, issue hypotheses, and recommendations
|
| 19 |
+
"""
|
| 20 |
+
result: Dict[str, Any] = {
|
| 21 |
+
"overview": {},
|
| 22 |
+
"root_causes": [],
|
| 23 |
+
"evidence_tables": {},
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
total_events = len(df_events)
|
| 27 |
+
total_utts = len(df_align)
|
| 28 |
+
|
| 29 |
+
result["overview"] = {
|
| 30 |
+
"num_utterances": int(total_utts),
|
| 31 |
+
"num_error_events": int(total_events),
|
| 32 |
+
"wer_mean": float(df_align["wer"].dropna().mean()) if "wer" in df_align.columns and df_align["wer"].notna().any() else None,
|
| 33 |
+
"cer_mean": float(df_align["cer"].dropna().mean()) if "cer" in df_align.columns and df_align["cer"].notna().any() else None,
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
if total_events == 0:
|
| 37 |
+
result["root_causes"].append({
|
| 38 |
+
"cause": "no_errors_detected",
|
| 39 |
+
"confidence": 1.0,
|
| 40 |
+
"evidence": ["No error events found in current run."],
|
| 41 |
+
"recommendations": ["Use a weaker model or more difficult dataset to make diagnosis meaningful."]
|
| 42 |
+
})
|
| 43 |
+
return result
|
| 44 |
+
|
| 45 |
+
# Basic counts
|
| 46 |
+
op_counts = df_events["op_type"].value_counts().to_dict() if "op_type" in df_events.columns else {}
|
| 47 |
+
cls_counts = df_events["error_class"].value_counts().to_dict() if "error_class" in df_events.columns else {}
|
| 48 |
+
|
| 49 |
+
result["evidence_tables"]["op_counts"] = {k: int(v) for k, v in op_counts.items()}
|
| 50 |
+
result["evidence_tables"]["error_class_counts"] = {k: int(v) for k, v in cls_counts.items()}
|
| 51 |
+
|
| 52 |
+
# --- Cause 1: number/time normalization problems
|
| 53 |
+
num_time_count = int(cls_counts.get("number_or_time", 0))
|
| 54 |
+
if _safe_ratio(num_time_count, total_events) >= 0.15:
|
| 55 |
+
result["root_causes"].append({
|
| 56 |
+
"cause": "number_time_format",
|
| 57 |
+
"confidence": round(min(0.95, 0.5 + _safe_ratio(num_time_count, total_events)), 3),
|
| 58 |
+
"evidence": [
|
| 59 |
+
f"number_or_time events = {num_time_count}/{total_events}",
|
| 60 |
+
"Large proportion of errors are related to numbers, dates, times, or units."
|
| 61 |
+
],
|
| 62 |
+
"recommendations": [
|
| 63 |
+
"Add number/date/time normalization in both reference and hypothesis.",
|
| 64 |
+
"Create post-processing rules for time/unit expressions.",
|
| 65 |
+
"Add more number-heavy utterances into evaluation/training."
|
| 66 |
+
]
|
| 67 |
+
})
|
| 68 |
+
|
| 69 |
+
# --- Cause 2: mixed-language problems
|
| 70 |
+
mixed_count = int(cls_counts.get("mixed_language", 0))
|
| 71 |
+
if _safe_ratio(mixed_count, total_events) >= 0.10:
|
| 72 |
+
result["root_causes"].append({
|
| 73 |
+
"cause": "mixed_language",
|
| 74 |
+
"confidence": round(min(0.95, 0.45 + _safe_ratio(mixed_count, total_events)), 3),
|
| 75 |
+
"evidence": [
|
| 76 |
+
f"mixed_language events = {mixed_count}/{total_events}",
|
| 77 |
+
"Frequent English/Latin-token related substitutions suggest code-switching weakness."
|
| 78 |
+
],
|
| 79 |
+
"recommendations": [
|
| 80 |
+
"Add bilingual/code-switching evaluation samples.",
|
| 81 |
+
"Add domain-specific English terms, abbreviations, and brand names.",
|
| 82 |
+
"Add post-processing lexicon for mixed-language phrases."
|
| 83 |
+
]
|
| 84 |
+
})
|
| 85 |
+
|
| 86 |
+
# --- Cause 3: deletion-heavy => possible noise / far-field / VAD
|
| 87 |
+
deletion_count = int(op_counts.get("D", 0))
|
| 88 |
+
insertion_count = int(op_counts.get("I", 0))
|
| 89 |
+
substitution_count = int(op_counts.get("S", 0))
|
| 90 |
+
|
| 91 |
+
if _safe_ratio(deletion_count, total_events) >= 0.30:
|
| 92 |
+
result["root_causes"].append({
|
| 93 |
+
"cause": "noise_or_farfield_or_vad",
|
| 94 |
+
"confidence": round(min(0.95, 0.5 + _safe_ratio(deletion_count, total_events)), 3),
|
| 95 |
+
"evidence": [
|
| 96 |
+
f"deletion events = {deletion_count}/{total_events}",
|
| 97 |
+
"High deletion ratio often indicates weak audibility, noise, far-field speech, or segmentation/VAD issues."
|
| 98 |
+
],
|
| 99 |
+
"recommendations": [
|
| 100 |
+
"Compare CER/WER across device / SNR / domain slices.",
|
| 101 |
+
"Inspect quiet, noisy, or long utterances.",
|
| 102 |
+
"Tune VAD or segmentation strategy.",
|
| 103 |
+
"Add noisy / far-field augmented audio."
|
| 104 |
+
]
|
| 105 |
+
})
|
| 106 |
+
|
| 107 |
+
# --- Cause 4: insertion-heavy => possible segmentation/repetition/echo
|
| 108 |
+
if _safe_ratio(insertion_count, total_events) >= 0.20:
|
| 109 |
+
result["root_causes"].append({
|
| 110 |
+
"cause": "segmentation_or_repetition",
|
| 111 |
+
"confidence": round(min(0.9, 0.45 + _safe_ratio(insertion_count, total_events)), 3),
|
| 112 |
+
"evidence": [
|
| 113 |
+
f"insertion events = {insertion_count}/{total_events}",
|
| 114 |
+
"High insertion ratio often suggests repeated decoding, segmentation mismatch, or echo."
|
| 115 |
+
],
|
| 116 |
+
"recommendations": [
|
| 117 |
+
"Inspect duplicated filler words and repeated fragments.",
|
| 118 |
+
"Review chunking / segmentation.",
|
| 119 |
+
"Check whether punctuation or normalization creates false insertions."
|
| 120 |
+
]
|
| 121 |
+
})
|
| 122 |
+
|
| 123 |
+
# --- Cause 5: slice-based evidence (device/domain/accent/speaker)
|
| 124 |
+
slice_findings = []
|
| 125 |
+
for key in ["device", "domain", "accent", "speaker"]:
|
| 126 |
+
if key in df_align.columns and df_align[key].notna().any() and "cer" in df_align.columns:
|
| 127 |
+
g = df_align.groupby(key)["cer"].mean().dropna().sort_values(ascending=False)
|
| 128 |
+
if len(g) >= 2:
|
| 129 |
+
worst_key = str(g.index[0])
|
| 130 |
+
worst_val = float(g.iloc[0])
|
| 131 |
+
best_val = float(g.iloc[-1])
|
| 132 |
+
if best_val > 0 and worst_val / best_val >= 1.5:
|
| 133 |
+
slice_findings.append({
|
| 134 |
+
"slice_key": key,
|
| 135 |
+
"worst_group": worst_key,
|
| 136 |
+
"worst_cer": worst_val,
|
| 137 |
+
"best_cer": best_val,
|
| 138 |
+
"ratio": worst_val / best_val
|
| 139 |
+
})
|
| 140 |
+
|
| 141 |
+
if slice_findings:
|
| 142 |
+
result["evidence_tables"]["slice_findings"] = slice_findings
|
| 143 |
+
result["root_causes"].append({
|
| 144 |
+
"cause": "slice_specific_weakness",
|
| 145 |
+
"confidence": 0.85,
|
| 146 |
+
"evidence": [
|
| 147 |
+
"Some slices show much worse CER than others.",
|
| 148 |
+
*[
|
| 149 |
+
f"{x['slice_key']}={x['worst_group']} has CER {x['worst_cer']:.4f}, ratio vs best={x['ratio']:.2f}"
|
| 150 |
+
for x in slice_findings[:5]
|
| 151 |
+
]
|
| 152 |
+
],
|
| 153 |
+
"recommendations": [
|
| 154 |
+
"Prioritize the worst slices in future analysis/training.",
|
| 155 |
+
"Check whether those slices correspond to accent, device, or scenario mismatch."
|
| 156 |
+
]
|
| 157 |
+
})
|
| 158 |
+
|
| 159 |
+
# --- Cause 6: substitution-dominant => pronunciation / lexical confusion
|
| 160 |
+
if _safe_ratio(substitution_count, total_events) >= 0.60:
|
| 161 |
+
result["root_causes"].append({
|
| 162 |
+
"cause": "pronunciation_or_lexical_confusion",
|
| 163 |
+
"confidence": round(min(0.9, 0.45 + _safe_ratio(substitution_count, total_events)), 3),
|
| 164 |
+
"evidence": [
|
| 165 |
+
f"substitution events = {substitution_count}/{total_events}",
|
| 166 |
+
"Substitutions dominate, which often indicates pronunciation ambiguity, lexical confusion, or near-homophone errors."
|
| 167 |
+
],
|
| 168 |
+
"recommendations": [
|
| 169 |
+
"Add confusion-pair statistics.",
|
| 170 |
+
"Check near-homophone and accent-sensitive confusions.",
|
| 171 |
+
"Build a pronunciation-aware analysis layer."
|
| 172 |
+
]
|
| 173 |
+
})
|
| 174 |
+
|
| 175 |
+
if not result["root_causes"]:
|
| 176 |
+
result["root_causes"].append({
|
| 177 |
+
"cause": "general_asr_mismatch",
|
| 178 |
+
"confidence": 0.5,
|
| 179 |
+
"evidence": ["No single dominant root cause identified from current heuristics."],
|
| 180 |
+
"recommendations": [
|
| 181 |
+
"Inspect top confusion pairs and low-performing slices.",
|
| 182 |
+
"Increase metadata coverage (device/domain/accent/snr)."
|
| 183 |
+
]
|
| 184 |
+
})
|
| 185 |
+
|
| 186 |
+
return result
|
report/diagnostic_report.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from typing import Dict, Any
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
SYSTEM_PROMPT = """You are an ASR diagnostics expert.
Write a concise but evidence-based ASR error analysis report in Chinese.
Do not invent evidence. Only use the provided structured statistics.
Focus on:
1. major error patterns
2. likely root causes
3. confidence and uncertainty
4. actionable next steps
"""

# Default chat model; callers can override via the `model` parameter of
# generate_report_with_openai without touching this module.
DEFAULT_MODEL = "gpt-4.1-mini"


def build_prompt(root_cause: Dict[str, Any], summary: Dict[str, Any]) -> str:
    """Build the user prompt embedding the structured analysis results.

    Args:
        root_cause: output of analysis.root_cause.infer_root_causes.
        summary: parsed contents of the run's summary.json.

    Returns:
        Prompt text (Chinese instructions + pretty-printed JSON evidence).
    """
    return f"""
请基于下面的结构化分析结果,生成一份中文 ASR 错误诊断报告。

要求:
- 先写总体结论
- 再写主要错误原因(按优先级排序)
- 每个原因要包含:现象、证据、可能原因、改进建议
- 最后给出一个优先级排序的行动清单
- 如果证据不足,要明确说“不确定”

【summary.json】
{json.dumps(summary, ensure_ascii=False, indent=2)}

【root_cause.json】
{json.dumps(root_cause, ensure_ascii=False, indent=2)}
"""


def generate_report_with_openai(
    root_cause: Dict[str, Any],
    summary: Dict[str, Any],
    client,
    model: str = DEFAULT_MODEL,
) -> str:
    """Generate the Chinese diagnostic report via an OpenAI-compatible client.

    Args:
        root_cause: structured root-cause inference results.
        summary: run-level summary statistics.
        client: an OpenAI client instance (or any object with the same
            chat.completions.create interface).
        model: chat model name; defaults to DEFAULT_MODEL for backward
            compatibility with existing callers.

    Returns:
        The model's report text (first choice's message content).
    """
    prompt = build_prompt(root_cause, summary)

    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        # Low temperature: the report should stick to the supplied evidence.
        temperature=0.2,
    )
    return resp.choices[0].message.content
|
requirements.txt
CHANGED
|
@@ -22,3 +22,4 @@ soundfile
|
|
| 22 |
librosa
|
| 23 |
pydantic>=2.0
|
| 24 |
opencc-python-reimplemented
|
|
|
|
|
|
| 22 |
librosa
|
| 23 |
pydantic>=2.0
|
| 24 |
opencc-python-reimplemented
|
| 25 |
+
openai>=1.30.0
|
scripts/run_diagnostic.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
|
| 9 |
+
from analysis.root_cause import infer_root_causes
|
| 10 |
+
from report.diagnostic_report import generate_report_with_openai
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_jsonl(path: Path):
    """Parse a JSON-Lines file, skipping blank lines, into a list of records."""
    with path.open("r", encoding="utf-8") as fh:
        return [json.loads(raw) for raw in (line.strip() for line in fh) if raw]
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def main(run_id: str, runs_dir: str = "runs"):
    """Run root-cause inference for one run and write an LLM diagnostic report.

    Reads aligned.jsonl (required), events.parquet and summary.json (optional)
    from <runs_dir>/<run_id>, writes root_cause.json, then calls the OpenAI
    API to produce diagnostic_report.md in the same directory.

    NOTE(review): OpenAI() reads credentials from the environment — presumably
    OPENAI_API_KEY must be set; verify before running in CI/Spaces.
    """
    run_dir = Path(runs_dir) / run_id

    # aligned.jsonl is assumed to exist; missing events/summary degrade to
    # an empty DataFrame / empty dict so inference can still run.
    df_align = pd.DataFrame(load_jsonl(run_dir / "aligned.jsonl"))
    df_events = pd.read_parquet(run_dir / "events.parquet") if (run_dir / "events.parquet").exists() else pd.DataFrame()
    summary = json.loads((run_dir / "summary.json").read_text(encoding="utf-8")) if (run_dir / "summary.json").exists() else {}

    # Persist the structured inference before the LLM call, so the JSON
    # survives even if report generation fails.
    root_cause = infer_root_causes(df_events, df_align)
    (run_dir / "root_cause.json").write_text(
        json.dumps(root_cause, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )

    client = OpenAI()
    report = generate_report_with_openai(root_cause, summary, client)
    (run_dir / "diagnostic_report.md").write_text(report, encoding="utf-8")

    print(f"Diagnostic report written to: {run_dir / 'diagnostic_report.md'}")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
if __name__ == "__main__":
    # CLI entry point: parse flags and delegate to main().
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--run_id", required=True)
    parser.add_argument("--runs_dir", default="runs")
    cli_args = parser.parse_args()
    main(cli_args.run_id, cli_args.runs_dir)
|
scripts/run_hf_job.py
CHANGED
|
@@ -136,7 +136,7 @@ def main():
|
|
| 136 |
data_dir.mkdir(parents=True, exist_ok=True)
|
| 137 |
manifest_path = data_dir / "manifest_hf.jsonl"
|
| 138 |
|
| 139 |
-
print("[1/
|
| 140 |
n = build_manifest_from_hf(
|
| 141 |
dataset_id=args.dataset_id,
|
| 142 |
dataset_config=args.dataset_config.strip() or None,
|
|
@@ -148,7 +148,7 @@ def main():
|
|
| 148 |
print(f" - Wrote {n} samples to {manifest_path}")
|
| 149 |
|
| 150 |
# Run pipeline functions directly (faster than nested subprocess)
|
| 151 |
-
print("[2/
|
| 152 |
from pipeline.run_asr import run_asr
|
| 153 |
|
| 154 |
run_id = run_asr(
|
|
@@ -160,11 +160,15 @@ def main():
|
|
| 160 |
)
|
| 161 |
print(f" - ASR done. run_id={run_id}")
|
| 162 |
|
| 163 |
-
print("[3/
|
| 164 |
from pipeline.run_analysis import run_analysis
|
| 165 |
|
| 166 |
run_analysis(run_id, out_root=args.out_root)
|
| 167 |
-
print("[4/
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
print(f"Run directory: {Path(args.out_root) / run_id}")
|
| 169 |
|
| 170 |
|
|
|
|
| 136 |
data_dir.mkdir(parents=True, exist_ok=True)
|
| 137 |
manifest_path = data_dir / "manifest_hf.jsonl"
|
| 138 |
|
| 139 |
+
print("[1/5] Building manifest from Hugging Face dataset...")
|
| 140 |
n = build_manifest_from_hf(
|
| 141 |
dataset_id=args.dataset_id,
|
| 142 |
dataset_config=args.dataset_config.strip() or None,
|
|
|
|
| 148 |
print(f" - Wrote {n} samples to {manifest_path}")
|
| 149 |
|
| 150 |
# Run pipeline functions directly (faster than nested subprocess)
|
| 151 |
+
print("[2/5] Running ASR inference...")
|
| 152 |
from pipeline.run_asr import run_asr
|
| 153 |
|
| 154 |
run_id = run_asr(
|
|
|
|
| 160 |
)
|
| 161 |
print(f" - ASR done. run_id={run_id}")
|
| 162 |
|
| 163 |
+
print("[3/5] Running analysis (align/events/report)...")
|
| 164 |
from pipeline.run_analysis import run_analysis
|
| 165 |
|
| 166 |
run_analysis(run_id, out_root=args.out_root)
|
| 167 |
+
print("[4/5] Running diagnostic report...")
|
| 168 |
+
from scripts.run_diagnostic import main as run_diagnostic_main
|
| 169 |
+
run_diagnostic_main(run_id, args.out_root)
|
| 170 |
+
|
| 171 |
+
print("[5/5] Done.")
|
| 172 |
print(f"Run directory: {Path(args.out_root) / run_id}")
|
| 173 |
|
| 174 |
|
ui/app.py
CHANGED
|
@@ -22,8 +22,14 @@ def list_runs():
|
|
| 22 |
|
| 23 |
def load_run(run_id: str):
|
| 24 |
run_dir = RUNS_DIR / run_id
|
|
|
|
| 25 |
meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
aligned_path = run_dir / "aligned.jsonl"
|
| 29 |
if aligned_path.exists():
|
|
@@ -40,51 +46,81 @@ def load_run(run_id: str):
|
|
| 40 |
events_path = run_dir / "events.parquet"
|
| 41 |
df_events = pd.read_parquet(events_path) if events_path.exists() else pd.DataFrame()
|
| 42 |
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
def build_summary_md(meta, summary):
|
| 47 |
lines = []
|
| 48 |
lines.append(f"### Run ID: `{meta.get('run_id')}`")
|
| 49 |
lines.append(f"- Model: `{meta.get('model_info')}`")
|
| 50 |
-
|
|
|
|
| 51 |
lines.append(f"- WER(mean): **{summary['wer_mean']:.4f}**")
|
| 52 |
-
|
|
|
|
| 53 |
lines.append(f"- CER(mean): **{summary['cer_mean']:.4f}**")
|
|
|
|
| 54 |
lines.append(f"- S/I/D: `{summary.get('sid_counts', {})}`")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
return "\n".join(lines)
|
| 56 |
|
| 57 |
|
| 58 |
def on_select_run(run_id):
|
| 59 |
if not run_id:
|
| 60 |
-
return "", pd.DataFrame(), pd.DataFrame()
|
| 61 |
|
| 62 |
-
meta, summary, df_align, df_events = load_run(run_id)
|
| 63 |
md = build_summary_md(meta, summary)
|
| 64 |
|
| 65 |
-
align_view =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
-
if len(df_events)
|
|
|
|
|
|
|
| 68 |
events_view = df_events[["utt_id", "op_type", "ref", "hyp", "error_class", "level"]].head(100)
|
| 69 |
else:
|
| 70 |
events_view = pd.DataFrame()
|
| 71 |
|
| 72 |
-
return md, align_view, events_view
|
| 73 |
|
| 74 |
|
| 75 |
def search_events(run_id, error_class, contains):
|
| 76 |
if not run_id:
|
| 77 |
return pd.DataFrame()
|
| 78 |
-
|
|
|
|
| 79 |
if df_events is None or len(df_events) == 0:
|
| 80 |
return pd.DataFrame()
|
| 81 |
|
| 82 |
-
q = df_events
|
|
|
|
| 83 |
if error_class and error_class != "ALL":
|
| 84 |
q = q[q["error_class"] == error_class]
|
|
|
|
| 85 |
if contains:
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
|
| 90 |
def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples):
|
|
@@ -98,6 +134,7 @@ def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, lan
|
|
| 98 |
"--language", language.strip(),
|
| 99 |
"--num", str(int(num_samples)),
|
| 100 |
]
|
|
|
|
| 101 |
if dataset_config and dataset_config.strip():
|
| 102 |
cmd += ["--dataset_config", dataset_config.strip()]
|
| 103 |
|
|
@@ -105,13 +142,35 @@ def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, lan
|
|
| 105 |
out = (p.stdout or "") + ("\n" + (p.stderr or "") if p.stderr else "")
|
| 106 |
|
| 107 |
if p.returncode != 0:
|
| 108 |
-
out +=
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
runs = list_runs()
|
| 112 |
latest = runs[0] if runs else None
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
with gr.Blocks() as demo:
|
|
@@ -120,8 +179,7 @@ with gr.Blocks() as demo:
|
|
| 120 |
with gr.Accordion("Run from Hugging Face", open=True):
|
| 121 |
gr.Markdown(
|
| 122 |
"Fill in a dataset and a Whisper model, then click **Run**. "
|
| 123 |
-
"
|
| 124 |
-
"If Common Voice is gated, set `HF_TOKEN` in Space **Settings → Secrets**."
|
| 125 |
)
|
| 126 |
|
| 127 |
with gr.Row():
|
|
@@ -138,7 +196,7 @@ with gr.Blocks() as demo:
|
|
| 138 |
language = gr.Textbox(label="Language", value="zh")
|
| 139 |
|
| 140 |
run_btn = gr.Button("Run")
|
| 141 |
-
logs = gr.Textbox(label="Logs", lines=
|
| 142 |
|
| 143 |
gr.Markdown("## Browse Existing Runs")
|
| 144 |
|
|
@@ -152,25 +210,46 @@ with gr.Blocks() as demo:
|
|
| 152 |
|
| 153 |
with gr.Accordion("Search Error Events", open=False):
|
| 154 |
error_cls = gr.Dropdown(
|
| 155 |
-
choices=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
value="ALL",
|
| 157 |
label="error_class",
|
| 158 |
)
|
| 159 |
contains = gr.Textbox(label="contains (ref/hyp substring)")
|
| 160 |
-
|
| 161 |
result_tbl = gr.Dataframe(label="Search results", interactive=False)
|
| 162 |
|
|
|
|
|
|
|
|
|
|
| 163 |
if runs:
|
| 164 |
-
md0, a0, e0 = on_select_run(runs[0])
|
| 165 |
summary_md.value = md0
|
| 166 |
align_tbl.value = a0
|
| 167 |
events_tbl.value = e0
|
|
|
|
| 168 |
|
| 169 |
-
run_dd.change(
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
run_btn.click(
|
| 173 |
run_hf_job,
|
| 174 |
inputs=[dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples],
|
| 175 |
-
outputs=[logs, run_dd, summary_md, align_tbl, events_tbl],
|
| 176 |
)
|
|
|
|
| 22 |
|
| 23 |
def load_run(run_id: str):
|
| 24 |
run_dir = RUNS_DIR / run_id
|
| 25 |
+
|
| 26 |
meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
|
| 27 |
+
|
| 28 |
+
summary = (
|
| 29 |
+
json.loads((run_dir / "summary.json").read_text(encoding="utf-8"))
|
| 30 |
+
if (run_dir / "summary.json").exists()
|
| 31 |
+
else {}
|
| 32 |
+
)
|
| 33 |
|
| 34 |
aligned_path = run_dir / "aligned.jsonl"
|
| 35 |
if aligned_path.exists():
|
|
|
|
| 46 |
events_path = run_dir / "events.parquet"
|
| 47 |
df_events = pd.read_parquet(events_path) if events_path.exists() else pd.DataFrame()
|
| 48 |
|
| 49 |
+
diagnostic_path = run_dir / "diagnostic_report.md"
|
| 50 |
+
diagnostic_text = (
|
| 51 |
+
diagnostic_path.read_text(encoding="utf-8")
|
| 52 |
+
if diagnostic_path.exists()
|
| 53 |
+
else "No diagnostic report yet."
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
return meta, summary, df_align, df_events, diagnostic_text
|
| 57 |
|
| 58 |
|
| 59 |
def build_summary_md(meta, summary):
    """Render run metadata and summary metrics as a Markdown bullet list."""
    parts = [
        f"### Run ID: `{meta.get('run_id')}`",
        f"- Model: `{meta.get('model_info')}`",
    ]

    # Metric lines are emitted only when the value is present and non-null.
    wer = summary.get("wer_mean")
    if wer is not None:
        parts.append(f"- WER(mean): **{wer:.4f}**")
    cer = summary.get("cer_mean")
    if cer is not None:
        parts.append(f"- CER(mean): **{cer:.4f}**")

    parts.append(f"- S/I/D: `{summary.get('sid_counts', {})}`")

    if "top_error_classes" in summary:
        parts.append(f"- Top error classes: `{summary['top_error_classes']}`")

    return "\n".join(parts)
|
| 76 |
|
| 77 |
|
| 78 |
def on_select_run(run_id):
    """Gradio callback: load a run and return its summary markdown, the
    alignment preview table, the events preview table, and the diagnostic
    report text (placeholder string when no run is selected)."""
    if not run_id:
        return "", pd.DataFrame(), pd.DataFrame(), "No diagnostic report yet."

    meta, summary, df_align, df_events, diagnostic_text = load_run(run_id)

    def _preview(df, cols, limit):
        # Project onto `cols` only when the frame is non-empty and has all of
        # them; otherwise show an empty table rather than raising KeyError.
        if len(df) and all(c in df.columns for c in cols):
            return df[cols].head(limit)
        return pd.DataFrame()

    return (
        build_summary_md(meta, summary),
        _preview(df_align, ["utt_id", "wer", "cer"], 50),
        _preview(df_events, ["utt_id", "op_type", "ref", "hyp", "error_class", "level"], 100),
        diagnostic_text,
    )
|
| 99 |
|
| 100 |
|
| 101 |
def search_events(run_id, error_class, contains):
    """Filter the selected run's error events by class and ref/hyp substring.

    Args:
        run_id: run identifier; falsy values yield an empty result.
        error_class: error class to keep, or "ALL" (or falsy) for no filter.
        contains: plain substring to look for in the ref or hyp text.

    Returns:
        A DataFrame with at most 200 matching rows (empty when there is no
        run, no events, or the text columns are missing).
    """
    if not run_id:
        return pd.DataFrame()

    # load_run returns (meta, summary, df_align, df_events, diagnostic_text).
    _, _, _, df_events, _ = load_run(run_id)
    if df_events is None or len(df_events) == 0:
        return pd.DataFrame()

    q = df_events.copy()

    if error_class and error_class != "ALL":
        q = q[q["error_class"] == error_class]

    # regex=False: the UI advertises a plain substring search, and user input
    # containing regex metacharacters (e.g. "(") must not raise re.error.
    if contains and "ref" in q.columns and "hyp" in q.columns:
        contains = str(contains)
        q = q[
            q["ref"].astype(str).str.contains(contains, regex=False, na=False)
            | q["hyp"].astype(str).str.contains(contains, regex=False, na=False)
        ]

    cols = [c for c in ["utt_id", "op_type", "ref", "hyp", "error_class", "level"] if c in q.columns]
    return q[cols].head(200)
|
| 124 |
|
| 125 |
|
| 126 |
def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples):
|
|
|
|
| 134 |
"--language", language.strip(),
|
| 135 |
"--num", str(int(num_samples)),
|
| 136 |
]
|
| 137 |
+
|
| 138 |
if dataset_config and dataset_config.strip():
|
| 139 |
cmd += ["--dataset_config", dataset_config.strip()]
|
| 140 |
|
|
|
|
| 142 |
out = (p.stdout or "") + ("\n" + (p.stderr or "") if p.stderr else "")
|
| 143 |
|
| 144 |
if p.returncode != 0:
|
| 145 |
+
out += (
|
| 146 |
+
"\n\n[HINT] If you see 401/403 for Common Voice: "
|
| 147 |
+
"set HF_TOKEN in Space Settings → Secrets, and accept dataset terms on HF."
|
| 148 |
+
)
|
| 149 |
+
return (
|
| 150 |
+
out,
|
| 151 |
+
gr.update(),
|
| 152 |
+
"",
|
| 153 |
+
pd.DataFrame(),
|
| 154 |
+
pd.DataFrame(),
|
| 155 |
+
"No diagnostic report yet.",
|
| 156 |
+
)
|
| 157 |
|
| 158 |
runs = list_runs()
|
| 159 |
latest = runs[0] if runs else None
|
| 160 |
+
|
| 161 |
+
if latest:
|
| 162 |
+
md, align_view, events_view, diagnostic_text = on_select_run(latest)
|
| 163 |
+
else:
|
| 164 |
+
md, align_view, events_view, diagnostic_text = "", pd.DataFrame(), pd.DataFrame(), "No diagnostic report yet."
|
| 165 |
+
|
| 166 |
+
return (
|
| 167 |
+
out,
|
| 168 |
+
gr.update(choices=runs, value=latest),
|
| 169 |
+
md,
|
| 170 |
+
align_view,
|
| 171 |
+
events_view,
|
| 172 |
+
diagnostic_text,
|
| 173 |
+
)
|
| 174 |
|
| 175 |
|
| 176 |
with gr.Blocks() as demo:
|
|
|
|
| 179 |
with gr.Accordion("Run from Hugging Face", open=True):
|
| 180 |
gr.Markdown(
|
| 181 |
"Fill in a dataset and a Whisper model, then click **Run**. "
|
| 182 |
+
"If the dataset is gated, set `HF_TOKEN` in Space **Settings → Secrets**."
|
|
|
|
| 183 |
)
|
| 184 |
|
| 185 |
with gr.Row():
|
|
|
|
| 196 |
language = gr.Textbox(label="Language", value="zh")
|
| 197 |
|
| 198 |
run_btn = gr.Button("Run")
|
| 199 |
+
logs = gr.Textbox(label="Logs", lines=16)
|
| 200 |
|
| 201 |
gr.Markdown("## Browse Existing Runs")
|
| 202 |
|
|
|
|
| 210 |
|
| 211 |
with gr.Accordion("Search Error Events", open=False):
|
| 212 |
error_cls = gr.Dropdown(
|
| 213 |
+
choices=[
|
| 214 |
+
"ALL",
|
| 215 |
+
"number_or_time",
|
| 216 |
+
"mixed_language",
|
| 217 |
+
"substitution",
|
| 218 |
+
"deletion",
|
| 219 |
+
"insertion",
|
| 220 |
+
"other",
|
| 221 |
+
],
|
| 222 |
value="ALL",
|
| 223 |
label="error_class",
|
| 224 |
)
|
| 225 |
contains = gr.Textbox(label="contains (ref/hyp substring)")
|
| 226 |
+
search_btn = gr.Button("Search")
|
| 227 |
result_tbl = gr.Dataframe(label="Search results", interactive=False)
|
| 228 |
|
| 229 |
+
with gr.Accordion("Diagnostic Report", open=True):
|
| 230 |
+
diagnostic_md = gr.Markdown("No diagnostic report yet.")
|
| 231 |
+
|
| 232 |
if runs:
|
| 233 |
+
md0, a0, e0, d0 = on_select_run(runs[0])
|
| 234 |
summary_md.value = md0
|
| 235 |
align_tbl.value = a0
|
| 236 |
events_tbl.value = e0
|
| 237 |
+
diagnostic_md.value = d0
|
| 238 |
|
| 239 |
+
run_dd.change(
|
| 240 |
+
on_select_run,
|
| 241 |
+
inputs=[run_dd],
|
| 242 |
+
outputs=[summary_md, align_tbl, events_tbl, diagnostic_md],
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
search_btn.click(
|
| 246 |
+
search_events,
|
| 247 |
+
inputs=[run_dd, error_cls, contains],
|
| 248 |
+
outputs=[result_tbl],
|
| 249 |
+
)
|
| 250 |
|
| 251 |
run_btn.click(
|
| 252 |
run_hf_job,
|
| 253 |
inputs=[dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples],
|
| 254 |
+
outputs=[logs, run_dd, summary_md, align_tbl, events_tbl, diagnostic_md],
|
| 255 |
)
|