Spaces:
Sleeping
Sleeping
unknown committed on
Commit ·
59afc96
1
Parent(s): 04000ce
Update UI
Browse files- README.md +33 -0
- analysis/llm_analyzer.py +317 -0
- pipeline/__init__.py +0 -0
- pipeline/run_all.py +3 -1
- pipeline/run_analysis.py +55 -13
- report/diagnostic_report.py +12 -11
- report/generate.py +5 -3
- report/templates.py +46 -8
- scripts/run_diagnostic.py +9 -8
- ui/app.py +209 -107
README.md
CHANGED
|
@@ -10,3 +10,36 @@ pinned: false
|
|
| 10 |
---
|
| 11 |
|
| 12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
## ASR LLM Agent Upgrade
|
| 16 |
+
|
| 17 |
+
This version adds an LLM-based diagnosis layer on top of alignment/event statistics:
|
| 18 |
+
|
| 19 |
+
- `analysis/llm_analyzer.py`: sends representative ASR error cases + aggregate stats to an LLM
|
| 20 |
+
- `pipeline/run_analysis.py`: optionally runs LLM diagnosis when `OPENAI_API_KEY` is set
|
| 21 |
+
- `scripts/run_diagnostic.py`: regenerate `llm_diagnosis.json` and `diagnostic_report.md`
|
| 22 |
+
- `report.md`: now includes LLM semantic findings and priority actions
|
| 23 |
+
|
| 24 |
+
### What the LLM adds
|
| 25 |
+
|
| 26 |
+
Compared with rule-only classification, the LLM layer can:
|
| 27 |
+
|
| 28 |
+
- separate surface-form differences from true semantic distortions
|
| 29 |
+
- identify meaning-preserving paraphrases vs business-critical errors
|
| 30 |
+
- infer likely causes from representative cases
|
| 31 |
+
- propose prioritized, actionable improvement suggestions
|
| 32 |
+
|
| 33 |
+
### Quick start
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
export OPENAI_API_KEY=your_key
|
| 37 |
+
python pipeline/run_all.py --manifest data/manifest.jsonl --model_name openai/whisper-small --llm_model gpt-4.1-mini
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
Or rerun diagnosis only for an existing run:
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
export OPENAI_API_KEY=your_key
|
| 44 |
+
python scripts/run_diagnostic.py --run_id <run_id> --model gpt-4.1-mini
|
| 45 |
+
```
|
analysis/llm_analyzer.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from typing import Any, Dict, List, Optional
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from openai import OpenAI
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Example output schema embedded in the per-utterance semantic-judgement prompt
# (see analyze_semantic_per_utterance). The Chinese labels in
# "semantic_judgement" are runtime values matched downstream — do not translate.
SEMANTIC_SCHEMA_EXAMPLE = {
    "utt_id": "string",
    "semantic_judgement": "语义基本等价|轻微偏差|明显偏差|严重失真",
    "severity": "high|medium|low",
    "semantic_error_types": ["string"],
    "business_impact": "high|medium|low",
    "reason": "string",
    "improvement_suggestions": ["string"],
    # Placeholder showing the expected numeric type for the model's confidence.
    "confidence": 0.0,
}
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _safe_float(v: Any) -> Optional[float]:
|
| 24 |
+
try:
|
| 25 |
+
if v is None:
|
| 26 |
+
return None
|
| 27 |
+
return float(v)
|
| 28 |
+
except Exception:
|
| 29 |
+
return None
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def build_case_pack(df_align: pd.DataFrame, df_events: pd.DataFrame, max_cases: int = 24) -> List[Dict[str, Any]]:
    """Select up to *max_cases* worst utterances (by CER, then WER) as LLM evidence.

    Each case bundles the raw/normalized ref and hyp text, per-utterance
    metrics, available metadata keys, and a compact summary of that
    utterance's alignment events. Returns an empty list when there is no
    alignment data.
    """
    if df_align is None or len(df_align) == 0:
        return []

    # Work on a copy so missing metric columns can be filled without
    # mutating the caller's DataFrame.
    align = df_align.copy()
    if "cer" not in align.columns:
        align["cer"] = None
    if "wer" not in align.columns:
        align["wer"] = None

    # Highest error rates first; rows with missing metrics sink to the bottom.
    sort_cols = [c for c in ["cer", "wer"] if c in align.columns]
    if sort_cols:
        align = align.sort_values(sort_cols, ascending=False, na_position="last")

    cases: List[Dict[str, Any]] = []
    seen = set()

    def _event_summary(utt_id: str) -> Dict[str, Any]:
        # Condense this utterance's event rows into top error classes,
        # op-type counts, and up to 10 concrete ref/hyp examples.
        if df_events is None or len(df_events) == 0 or "utt_id" not in df_events.columns:
            return {"error_classes": {}, "ops": {}, "examples": []}
        x = df_events[df_events["utt_id"] == utt_id].copy()
        if len(x) == 0:
            return {"error_classes": {}, "ops": {}, "examples": []}
        examples = []
        for _, row in x.head(10).iterrows():
            examples.append({
                "level": row.get("level"),
                "op_type": row.get("op_type"),
                "ref": row.get("ref"),
                "hyp": row.get("hyp"),
                "error_class": row.get("error_class"),
            })
        return {
            # Cast keys/counts to plain str/int so the pack is JSON-serializable.
            "error_classes": {str(k): int(v) for k, v in x["error_class"].value_counts().head(10).to_dict().items()} if "error_class" in x.columns else {},
            "ops": {str(k): int(v) for k, v in x["op_type"].value_counts().to_dict().items()} if "op_type" in x.columns else {},
            "examples": examples,
        }

    for _, row in align.head(max_cases).iterrows():
        utt_id = str(row.get("utt_id"))
        # Guard against duplicate utt_ids in the alignment table.
        if utt_id in seen:
            continue
        seen.add(utt_id)
        # Only carry metadata keys that exist and are non-null for this row.
        meta = {k: row.get(k) for k in ["device", "domain", "accent", "speaker"] if k in row.index and pd.notna(row.get(k))}
        cases.append({
            "utt_id": utt_id,
            "ref_text": row.get("ref_text"),
            "hyp_text": row.get("hyp_text"),
            "norm_ref": row.get("norm_ref"),
            "norm_hyp": row.get("norm_hyp"),
            "wer": _safe_float(row.get("wer")),
            "cer": _safe_float(row.get("cer")),
            "meta": meta,
            "event_summary": _event_summary(utt_id),
        })
        if len(cases) >= max_cases:
            break

    return cases
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def build_global_stats(df_align: pd.DataFrame, df_events: pd.DataFrame, summary: Dict[str, Any]) -> Dict[str, Any]:
    """Assemble corpus-level statistics sent to the LLM alongside the case pack.

    Includes the precomputed summary, utterance/event counts, op-type and
    error-class frequencies, and per-slice mean CER for common metadata keys.
    Keys with no data are simply omitted.
    """
    stats: Dict[str, Any] = {
        "summary": summary,
        "num_utterances": 0 if df_align is None else int(len(df_align)),
        "num_events": 0 if df_events is None else int(len(df_events)),
    }

    if df_events is not None and len(df_events) > 0:
        # (source column, output key, optional top-k truncation)
        for col, dest, top_k in (("op_type", "op_counts", None), ("error_class", "error_class_counts", 20)):
            if col not in df_events.columns:
                continue
            counts = df_events[col].value_counts()
            if top_k is not None:
                counts = counts.head(top_k)
            # Plain str/int so the payload is JSON-serializable.
            stats[dest] = {str(name): int(n) for name, n in counts.to_dict().items()}

    per_slice: Dict[str, Any] = {}
    if df_align is not None and len(df_align) > 0 and "cer" in df_align.columns:
        for key in ["device", "domain", "accent", "speaker"]:
            if key not in df_align.columns or not df_align[key].notna().any():
                continue
            # Worst 10 slice values by mean CER, highest first.
            mean_cer = df_align.groupby(key)["cer"].mean().dropna().sort_values(ascending=False).head(10)
            if len(mean_cer) > 0:
                per_slice[key] = [{"key": str(name), "cer": float(val)} for name, val in mean_cer.items()]
    if per_slice:
        stats["slice_stats"] = per_slice
    return stats
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# Shared system prompt for both the global diagnosis and the per-utterance
# semantic-judgement calls. Written in Chinese because the reports are Chinese.
# Fix: the penultimate line previously ended in mojibake (U+FFFD replacement
# characters) where the closing full stop should be; restored to "。".
SYSTEM_PROMPT = """你是资深 ASR 诊断专家,同时具备语音识别、语言学和业务语义分析能力。
你的任务不是只做 S/I/D 统计,而是识别:
1. 结构性错误(替换、删除、插入、数字、英文、专名等)
2. 语义层错误(是否改变原意、是否造成业务理解偏差、是否只是表面字词不同但语义基本等价)
3. 可能成因(口音、同音混淆、领域词缺失、分段/VAD、噪声、数字口语化、语言模型偏置等)
4. 可执行的改进建议

必须严格基于输入证据,不要编造音频层信息。若证据不足,明确写“不确定”。
请输出严格 JSON。"""
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def _extract_json(text: str) -> Dict[str, Any]:
|
| 129 |
+
text = text.strip()
|
| 130 |
+
try:
|
| 131 |
+
return json.loads(text)
|
| 132 |
+
except Exception:
|
| 133 |
+
pass
|
| 134 |
+
start = text.find("{")
|
| 135 |
+
end = text.rfind("}")
|
| 136 |
+
if start >= 0 and end > start:
|
| 137 |
+
return json.loads(text[start:end + 1])
|
| 138 |
+
raise ValueError("LLM output is not valid JSON")
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def analyze_with_llm(
    df_align: pd.DataFrame,
    df_events: pd.DataFrame,
    summary: Dict[str, Any],
    model: str = "gpt-4.1-mini",
    client: Optional[OpenAI] = None,
    max_cases: int = 24,
) -> Dict[str, Any]:
    """Run a single global structural + semantic ASR diagnosis via the LLM.

    Sends aggregate stats plus up to *max_cases* worst utterances and asks
    the model for a strict-JSON report (executive summary, major patterns,
    case findings, priority actions, uncertainties). The parsed response is
    annotated with the model name, number of cases sent, and the global
    stats before being returned.

    Requires OPENAI_API_KEY when *client* is not supplied. Raises ValueError
    (via _extract_json) if the model does not return parseable JSON.
    """
    client = client or OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    case_pack = build_case_pack(df_align, df_events, max_cases=max_cases)
    global_stats = build_global_stats(df_align, df_events, summary)

    # The whole user turn is one JSON document: task, expected output schema,
    # hard requirements, and the evidence (stats + cases).
    user_prompt = {
        "task": "请对 ASR 结果做结构+语义联合诊断,并给出改进建议。",
        "instructions": {
            "output_schema": {
                "executive_summary": "string",
                "major_patterns": [
                    {
                        "title": "string",
                        "priority": 1,
                        "phenomenon": "string",
                        "evidence": ["string"],
                        "semantic_impact": "high|medium|low",
                        "likely_causes": ["string"],
                        "recommendations": ["string"],
                        "confidence": 0.0,
                    }
                ],
                "case_findings": [
                    {
                        "utt_id": "string",
                        "semantic_judgement": "语义基本等价|轻微偏差|明显偏差|严重失真",
                        "severity": "high|medium|low",
                        "reason": "string",
                        "semantic_error_types": ["string"],
                        "suggestions": ["string"],
                    }
                ],
                "priority_actions": ["string"],
                "uncertainties": ["string"],
            },
            "requirements": [
                "不要复述所有 case,只保留最有代表性的 findings。",
                "要区分字符差异和真正改变语义的错误。",
                "如果 ref/hyp 只是同义改写或口语差异,应该指出语义影响较低。",
                "建议必须可执行,优先包含数据、解码、后处理、术语表、prompt/context、评测集扩展等方向。",
            ],
        },
        "global_stats": global_stats,
        "cases": case_pack,
    }

    # Low temperature + json_object response format for stable, parseable output.
    resp = client.chat.completions.create(
        model=model,
        temperature=0.2,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": json.dumps(user_prompt, ensure_ascii=False)},
        ],
        response_format={"type": "json_object"},
    )
    raw = resp.choices[0].message.content or "{}"
    parsed = _extract_json(raw)
    # Record provenance so downstream reports can attribute the diagnosis.
    parsed["model"] = model
    parsed["num_cases_sent"] = len(case_pack)
    parsed["global_stats"] = global_stats
    return parsed
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def _build_semantic_rows(df_align: pd.DataFrame, df_events: pd.DataFrame) -> List[Dict[str, Any]]:
    """Build one JSON-serializable row per aligned utterance for LLM batching.

    Each row carries raw/normalized texts, metrics, non-null metadata, and up
    to 8 example alignment events for that utterance. Unlike build_case_pack,
    this covers EVERY row of *df_align*, not just the worst cases.
    """
    rows: List[Dict[str, Any]] = []
    # Pre-index events by utt_id so the per-row lookup below is O(1).
    event_map: Dict[str, List[Dict[str, Any]]] = {}
    if df_events is not None and len(df_events) > 0 and "utt_id" in df_events.columns:
        for utt_id, sub in df_events.groupby("utt_id"):
            examples = []
            # Cap at 8 events per utterance to bound prompt size.
            for _, row in sub.head(8).iterrows():
                examples.append({
                    "op_type": row.get("op_type"),
                    "ref": row.get("ref"),
                    "hyp": row.get("hyp"),
                    "error_class": row.get("error_class"),
                    "level": row.get("level"),
                })
            event_map[str(utt_id)] = examples

    for _, row in df_align.iterrows():
        utt_id = str(row.get("utt_id"))
        # Only carry metadata keys that exist and are non-null for this row.
        meta = {k: row.get(k) for k in ["device", "domain", "accent", "speaker"] if k in row.index and pd.notna(row.get(k))}
        rows.append({
            "utt_id": utt_id,
            "ref_text": row.get("ref_text"),
            "hyp_text": row.get("hyp_text"),
            "norm_ref": row.get("norm_ref"),
            "norm_hyp": row.get("norm_hyp"),
            "wer": _safe_float(row.get("wer")),
            "cer": _safe_float(row.get("cer")),
            "meta": meta,
            "events": event_map.get(utt_id, []),
        })
    return rows
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def _normalize_semantic_item(item: Dict[str, Any], fallback: Dict[str, Any]) -> Dict[str, Any]:
    """Coerce one LLM semantic finding into a flat, typed record.

    *item* is the (possibly incomplete) LLM output for an utterance;
    *fallback* is the input row built by _build_semantic_rows, used to fill
    utt_id, texts, metrics, and metadata. Missing LLM fields get conservative
    defaults ("不确定" judgement, "low" severity/impact, empty lists).
    """
    semantic_types = item.get("semantic_error_types") or []
    # Accept either key name; the two prompts use different spellings.
    suggestions = item.get("improvement_suggestions") or item.get("suggestions") or []
    return {
        "utt_id": str(item.get("utt_id") or fallback.get("utt_id")),
        "semantic_judgement": str(item.get("semantic_judgement") or "不确定"),
        "severity": str(item.get("severity") or "low"),
        # Wrap scalars in a list so downstream joins don't break.
        "semantic_error_types": semantic_types if isinstance(semantic_types, list) else [str(semantic_types)],
        "business_impact": str(item.get("business_impact") or "low"),
        "reason": str(item.get("reason") or ""),
        "improvement_suggestions": suggestions if isinstance(suggestions, list) else [str(suggestions)],
        "confidence": _safe_float(item.get("confidence")) if item.get("confidence") is not None else None,
        "ref_text": fallback.get("ref_text"),
        "hyp_text": fallback.get("hyp_text"),
        "wer": fallback.get("wer"),
        "cer": fallback.get("cer"),
        # Flatten metadata into top-level columns for easy DataFrame slicing.
        "device": (fallback.get("meta") or {}).get("device"),
        "domain": (fallback.get("meta") or {}).get("domain"),
        "accent": (fallback.get("meta") or {}).get("accent"),
        "speaker": (fallback.get("meta") or {}).get("speaker"),
    }
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def analyze_semantic_per_utterance(
    df_align: pd.DataFrame,
    df_events: pd.DataFrame,
    model: str = "gpt-4.1-mini",
    client: Optional[OpenAI] = None,
    batch_size: int = 12,
) -> pd.DataFrame:
    """Ask the LLM for a semantic judgement on EVERY aligned utterance.

    Utterances are sent in batches of *batch_size*; any utt_id the model
    fails to return is filled with an explicit "not returned" placeholder so
    the output always covers all input rows. Returns an empty DataFrame when
    there is nothing to analyze.

    Requires OPENAI_API_KEY when *client* is not supplied. One API call is
    made per batch (temperature 0, strict-JSON response format).
    """
    client = client or OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    rows = _build_semantic_rows(df_align, df_events)
    if not rows:
        return pd.DataFrame()

    results: List[Dict[str, Any]] = []
    for start in range(0, len(rows), batch_size):
        batch = rows[start:start + batch_size]
        payload = {
            "task": "逐条 utterance 做语义级错误判断。",
            "requirements": [
                "逐条判断 ref_text 与 hyp_text 的语义偏差程度。",
                "不要因为表面字不同就判严重错误;如果基本不改变含义,应标注为语义基本等价或轻微偏差。",
                "结合 events 判断数字、时间、专名、否定、实体、动作关系等关键语义是否出错。",
                "输出必须覆盖 batch 中每个 utt_id,且仅输出 JSON 对象。",
            ],
            "output_schema": {"items": [SEMANTIC_SCHEMA_EXAMPLE]},
            "items": batch,
        }
        resp = client.chat.completions.create(
            model=model,
            temperature=0,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": json.dumps(payload, ensure_ascii=False)},
            ],
            response_format={"type": "json_object"},
        )
        raw = resp.choices[0].message.content or "{}"
        parsed = _extract_json(raw)
        items = parsed.get("items") or []
        # Re-key by utt_id so missing/reordered items can be detected.
        by_id = {str(x.get("utt_id")): x for x in items if isinstance(x, dict)}
        for fallback in batch:
            item = by_id.get(str(fallback.get("utt_id")), {"utt_id": fallback.get("utt_id"), "semantic_judgement": "不确定", "reason": "LLM 未返回该条结果。"})
            results.append(_normalize_semantic_item(item, fallback))

    df = pd.DataFrame(results)
    # Flatten list columns into " | "-joined strings for parquet/UI display.
    if len(df) > 0 and "semantic_error_types" in df.columns:
        df["semantic_error_types_str"] = df["semantic_error_types"].apply(lambda xs: " | ".join(xs) if isinstance(xs, list) else str(xs))
    if len(df) > 0 and "improvement_suggestions" in df.columns:
        df["improvement_suggestions_str"] = df["improvement_suggestions"].apply(lambda xs: " | ".join(xs) if isinstance(xs, list) else str(xs))
    df["llm_model"] = model
    return df
|
pipeline/__init__.py
ADDED
|
File without changes
|
pipeline/run_all.py
CHANGED
|
@@ -10,6 +10,8 @@ def main():
|
|
| 10 |
ap.add_argument("--model_name", default="small")
|
| 11 |
ap.add_argument("--device", default="cpu")
|
| 12 |
ap.add_argument("--compute_type", default="int8")
|
|
|
|
|
|
|
| 13 |
args = ap.parse_args()
|
| 14 |
|
| 15 |
run_id = run_asr(
|
|
@@ -18,7 +20,7 @@ def main():
|
|
| 18 |
device=args.device,
|
| 19 |
compute_type=args.compute_type,
|
| 20 |
)
|
| 21 |
-
run_analysis(run_id)
|
| 22 |
print(f"Done. Run: runs/{run_id}")
|
| 23 |
|
| 24 |
|
|
|
|
| 10 |
ap.add_argument("--model_name", default="small")
|
| 11 |
ap.add_argument("--device", default="cpu")
|
| 12 |
ap.add_argument("--compute_type", default="int8")
|
| 13 |
+
ap.add_argument("--llm_model", default="gpt-4.1-mini")
|
| 14 |
+
ap.add_argument("--disable_llm", action="store_true")
|
| 15 |
args = ap.parse_args()
|
| 16 |
|
| 17 |
run_id = run_asr(
|
|
|
|
| 20 |
device=args.device,
|
| 21 |
compute_type=args.compute_type,
|
| 22 |
)
|
| 23 |
+
run_analysis(run_id, llm_enabled=not args.disable_llm, llm_model=args.llm_model)
|
| 24 |
print(f"Done. Run: runs/{run_id}")
|
| 25 |
|
| 26 |
|
pipeline/run_analysis.py
CHANGED
|
@@ -1,29 +1,42 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
import json
|
|
|
|
| 3 |
from pathlib import Path
|
| 4 |
-
from typing import List, Dict
|
| 5 |
|
| 6 |
import pandas as pd
|
|
|
|
| 7 |
|
| 8 |
from core.io import read_jsonl, write_jsonl
|
| 9 |
from analysis.align import align_one
|
| 10 |
from analysis.events import extract_events
|
| 11 |
from analysis.aggregate import aggregate_summary
|
|
|
|
| 12 |
from report.generate import write_report
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
-
def run_analysis(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
run_dir = Path(out_root) / run_id
|
| 17 |
run_meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
|
| 18 |
|
| 19 |
asr_path = run_dir / "asr_outputs.jsonl"
|
| 20 |
aligned_path = run_dir / "aligned.jsonl"
|
| 21 |
events_path = run_dir / "events.parquet"
|
|
|
|
|
|
|
| 22 |
|
| 23 |
aligned_records: List[Dict] = []
|
| 24 |
events_records: List[Dict] = []
|
| 25 |
|
| 26 |
-
|
|
|
|
| 27 |
utt_id = r["utt_id"]
|
| 28 |
ref = r.get("ref_text")
|
| 29 |
hyp = r.get("hyp_text", "")
|
|
@@ -44,17 +57,46 @@ def run_analysis(run_id: str, out_root: str = "runs") -> None:
|
|
| 44 |
df_events = pd.DataFrame(events_records)
|
| 45 |
df_events.to_parquet(events_path, index=False)
|
| 46 |
else:
|
| 47 |
-
df_events = pd.DataFrame(columns=["op_type", "error_class"])
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
# df_align currently doesn't have meta; add a few common keys if present
|
| 51 |
-
# We'll reconstruct from asr_outputs for slicing.
|
| 52 |
-
# (Simple approach) reload and join on utt_id:
|
| 53 |
-
meta_map = {}
|
| 54 |
-
for r in read_jsonl(asr_path):
|
| 55 |
-
meta_map[r["utt_id"]] = r.get("meta", {}) or {}
|
| 56 |
for key in ["device", "domain", "accent", "speaker"]:
|
| 57 |
df_align[key] = df_align["utt_id"].map(lambda u: meta_map.get(u, {}).get(key))
|
| 58 |
|
| 59 |
summary = aggregate_summary(df_events, df_align)
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
import json
|
| 3 |
+
import os
|
| 4 |
from pathlib import Path
|
| 5 |
+
from typing import List, Dict, Optional
|
| 6 |
|
| 7 |
import pandas as pd
|
| 8 |
+
from openai import OpenAI
|
| 9 |
|
| 10 |
from core.io import read_jsonl, write_jsonl
|
| 11 |
from analysis.align import align_one
|
| 12 |
from analysis.events import extract_events
|
| 13 |
from analysis.aggregate import aggregate_summary
|
| 14 |
+
from analysis.llm_analyzer import analyze_with_llm, analyze_semantic_per_utterance
|
| 15 |
from report.generate import write_report
|
| 16 |
+
from report.diagnostic_report import generate_report_with_openai
|
| 17 |
|
| 18 |
|
| 19 |
+
def run_analysis(
|
| 20 |
+
run_id: str,
|
| 21 |
+
out_root: str = "runs",
|
| 22 |
+
llm_enabled: bool = True,
|
| 23 |
+
llm_model: str = "gpt-4.1-mini",
|
| 24 |
+
write_diagnostic_report: bool = True,
|
| 25 |
+
) -> None:
|
| 26 |
run_dir = Path(out_root) / run_id
|
| 27 |
run_meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
|
| 28 |
|
| 29 |
asr_path = run_dir / "asr_outputs.jsonl"
|
| 30 |
aligned_path = run_dir / "aligned.jsonl"
|
| 31 |
events_path = run_dir / "events.parquet"
|
| 32 |
+
semantic_path = run_dir / "semantic_findings.parquet"
|
| 33 |
+
semantic_jsonl_path = run_dir / "semantic_findings.jsonl"
|
| 34 |
|
| 35 |
aligned_records: List[Dict] = []
|
| 36 |
events_records: List[Dict] = []
|
| 37 |
|
| 38 |
+
asr_rows = list(read_jsonl(asr_path))
|
| 39 |
+
for r in asr_rows:
|
| 40 |
utt_id = r["utt_id"]
|
| 41 |
ref = r.get("ref_text")
|
| 42 |
hyp = r.get("hyp_text", "")
|
|
|
|
| 57 |
df_events = pd.DataFrame(events_records)
|
| 58 |
df_events.to_parquet(events_path, index=False)
|
| 59 |
else:
|
| 60 |
+
df_events = pd.DataFrame(columns=["utt_id", "op_type", "error_class", "ref", "hyp", "level"])
|
| 61 |
+
|
| 62 |
+
meta_map = {r["utt_id"]: r.get("meta", {}) or {} for r in asr_rows}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
for key in ["device", "domain", "accent", "speaker"]:
|
| 64 |
df_align[key] = df_align["utt_id"].map(lambda u: meta_map.get(u, {}).get(key))
|
| 65 |
|
| 66 |
summary = aggregate_summary(df_events, df_align)
|
| 67 |
+
|
| 68 |
+
llm_diagnosis: Optional[Dict] = None
|
| 69 |
+
semantic_df = pd.DataFrame()
|
| 70 |
+
if llm_enabled and os.getenv("OPENAI_API_KEY") and len(df_align) > 0:
|
| 71 |
+
client = OpenAI()
|
| 72 |
+
llm_diagnosis = analyze_with_llm(
|
| 73 |
+
df_align=df_align,
|
| 74 |
+
df_events=df_events,
|
| 75 |
+
summary=summary,
|
| 76 |
+
model=llm_model,
|
| 77 |
+
client=client,
|
| 78 |
+
)
|
| 79 |
+
semantic_df = analyze_semantic_per_utterance(
|
| 80 |
+
df_align=df_align,
|
| 81 |
+
df_events=df_events,
|
| 82 |
+
model=llm_model,
|
| 83 |
+
client=client,
|
| 84 |
+
)
|
| 85 |
+
if len(semantic_df) > 0:
|
| 86 |
+
semantic_df.to_parquet(semantic_path, index=False)
|
| 87 |
+
write_jsonl(semantic_jsonl_path, semantic_df.to_dict(orient="records"))
|
| 88 |
+
if write_diagnostic_report:
|
| 89 |
+
report = generate_report_with_openai(llm_diagnosis, summary, client, model=llm_model)
|
| 90 |
+
(run_dir / "diagnostic_report.md").write_text(report, encoding="utf-8")
|
| 91 |
+
elif write_diagnostic_report and not (run_dir / "diagnostic_report.md").exists():
|
| 92 |
+
(run_dir / "diagnostic_report.md").write_text(
|
| 93 |
+
"LLM diagnostic report was skipped because OPENAI_API_KEY is not set.\n"
|
| 94 |
+
"You can still inspect summary.json and report.md, or rerun with an API key.",
|
| 95 |
+
encoding="utf-8",
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
write_report(run_dir, run_meta, summary, llm_diagnosis=llm_diagnosis)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
if __name__ == "__main__":
|
| 102 |
+
raise SystemExit("Use pipeline/run_all.py or import run_analysis()")
|
report/diagnostic_report.py
CHANGED
|
@@ -6,39 +6,40 @@ from typing import Dict, Any
|
|
| 6 |
|
| 7 |
SYSTEM_PROMPT = """You are an ASR diagnostics expert.
|
| 8 |
Write a concise but evidence-based ASR error analysis report in Chinese.
|
| 9 |
-
Do not invent evidence.
|
| 10 |
Focus on:
|
| 11 |
1. major error patterns
|
| 12 |
-
2.
|
| 13 |
-
3.
|
| 14 |
-
4.
|
|
|
|
| 15 |
"""
|
| 16 |
|
| 17 |
|
| 18 |
-
def build_prompt(
|
| 19 |
return f"""
|
| 20 |
请基于下面的结构化分析结果,生成一份中文 ASR 错误诊断报告。
|
| 21 |
|
| 22 |
要求:
|
| 23 |
- 先写总体结论
|
| 24 |
- 再写主要错误原因(按优先级排序)
|
| 25 |
-
- 每个原因要包含:现象、证据、可能原因、改进建议
|
| 26 |
- 最后给出一个优先级排序的行动清单
|
| 27 |
- 如果证据不足,要明确说“不确定”
|
| 28 |
|
| 29 |
【summary.json】
|
| 30 |
{json.dumps(summary, ensure_ascii=False, indent=2)}
|
| 31 |
|
| 32 |
-
【
|
| 33 |
-
{json.dumps(
|
| 34 |
"""
|
| 35 |
|
| 36 |
|
| 37 |
-
def generate_report_with_openai(
|
| 38 |
-
prompt = build_prompt(
|
| 39 |
|
| 40 |
resp = client.chat.completions.create(
|
| 41 |
-
model=
|
| 42 |
messages=[
|
| 43 |
{"role": "system", "content": SYSTEM_PROMPT},
|
| 44 |
{"role": "user", "content": prompt},
|
|
|
|
| 6 |
|
| 7 |
SYSTEM_PROMPT = """You are an ASR diagnostics expert.
|
| 8 |
Write a concise but evidence-based ASR error analysis report in Chinese.
|
| 9 |
+
Do not invent evidence. Use both structured metrics and LLM semantic diagnosis.
|
| 10 |
Focus on:
|
| 11 |
1. major error patterns
|
| 12 |
+
2. semantic impact of errors
|
| 13 |
+
3. likely root causes
|
| 14 |
+
4. confidence and uncertainty
|
| 15 |
+
5. actionable next steps
|
| 16 |
"""
|
| 17 |
|
| 18 |
|
| 19 |
+
def build_prompt(llm_diagnosis: Dict[str, Any], summary: Dict[str, Any]) -> str:
    """Render the Chinese user prompt that asks for the final diagnostic report.

    Embeds both the aggregate summary and the LLM diagnosis as pretty-printed
    JSON so the report model can quote concrete evidence.
    """
    summary_json = json.dumps(summary, ensure_ascii=False, indent=2)
    diagnosis_json = json.dumps(llm_diagnosis, ensure_ascii=False, indent=2)
    return f"""
请基于下面的结构化分析结果,生成一份中文 ASR 错误诊断报告。

要求:
- 先写总体结论
- 再写主要错误原因(按优先级排序)
- 每个原因要包含:现象、证据、语义影响、可能原因、改进建议
- 最后给出一个优先级排序的行动清单
- 如果证据不足,要明确说“不确定”

【summary.json】
{summary_json}

【llm_diagnosis.json】
{diagnosis_json}
"""
|
| 36 |
|
| 37 |
|
| 38 |
+
def generate_report_with_openai(llm_diagnosis: Dict[str, Any], summary: Dict[str, Any], client, model: str = "gpt-4.1-mini") -> str:
|
| 39 |
+
prompt = build_prompt(llm_diagnosis, summary)
|
| 40 |
|
| 41 |
resp = client.chat.completions.create(
|
| 42 |
+
model=model,
|
| 43 |
messages=[
|
| 44 |
{"role": "system", "content": SYSTEM_PROMPT},
|
| 45 |
{"role": "user", "content": prompt},
|
report/generate.py
CHANGED
|
@@ -1,13 +1,15 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
import json
|
| 3 |
from pathlib import Path
|
| 4 |
-
from typing import Dict
|
| 5 |
from .templates import render_markdown
|
| 6 |
|
| 7 |
|
| 8 |
-
def write_report(run_dir: Path, run_meta: Dict, summary: Dict) -> None:
|
| 9 |
run_dir.mkdir(parents=True, exist_ok=True)
|
| 10 |
|
| 11 |
(run_dir / "summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
|
| 12 |
-
|
|
|
|
|
|
|
| 13 |
(run_dir / "report.md").write_text(md, encoding="utf-8")
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
import json
|
| 3 |
from pathlib import Path
|
| 4 |
+
from typing import Dict, Optional
|
| 5 |
from .templates import render_markdown
|
| 6 |
|
| 7 |
|
| 8 |
+
def write_report(run_dir: Path, run_meta: Dict, summary: Dict, llm_diagnosis: Optional[Dict] = None) -> None:
    """Persist run artifacts: summary.json, optional llm_diagnosis.json, report.md.

    *llm_diagnosis* is forwarded to render_markdown so the markdown report
    includes the LLM sections when a diagnosis is available; when None, only
    summary.json and the plain report are written.
    """
    run_dir.mkdir(parents=True, exist_ok=True)

    (run_dir / "summary.json").write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    if llm_diagnosis is not None:
        # Keep the raw diagnosis on disk so it can be re-rendered without a new API call.
        (run_dir / "llm_diagnosis.json").write_text(json.dumps(llm_diagnosis, ensure_ascii=False, indent=2), encoding="utf-8")
    md = render_markdown(run_meta, summary, llm_diagnosis=llm_diagnosis)
    (run_dir / "report.md").write_text(md, encoding="utf-8")
|
report/templates.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
-
from typing import Dict
|
| 3 |
|
| 4 |
|
| 5 |
-
def render_markdown(run_meta: Dict, summary: Dict) -> str:
|
| 6 |
lines = []
|
| 7 |
-
lines.append(
|
| 8 |
lines.append(f"**Run ID:** {run_meta.get('run_id')}")
|
| 9 |
lines.append(f"**Model:** {run_meta.get('model_info')}\n")
|
| 10 |
|
|
@@ -25,7 +25,6 @@ def render_markdown(run_meta: Dict, summary: Dict) -> str:
|
|
| 25 |
for k, v in (summary.get("top_confusions", {}) or {}).items():
|
| 26 |
lines.append(f"- {k}: {v}")
|
| 27 |
|
| 28 |
-
# Slice
|
| 29 |
for key in ["device", "domain", "accent", "speaker"]:
|
| 30 |
k2 = f"worst_{key}_by_cer"
|
| 31 |
if k2 in summary:
|
|
@@ -33,9 +32,48 @@ def render_markdown(run_meta: Dict, summary: Dict) -> str:
|
|
| 33 |
for item in summary[k2]:
|
| 34 |
lines.append(f"- {item['key']}: {item['cer']:.4f}")
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
return "\n".join(lines)
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
+
from typing import Dict, Optional
|
| 3 |
|
| 4 |
|
| 5 |
+
def render_markdown(run_meta: Dict, summary: Dict, llm_diagnosis: Optional[Dict] = None) -> str:
|
| 6 |
lines = []
|
| 7 |
+
lines.append("# ASR Error Analysis Report\n")
|
| 8 |
lines.append(f"**Run ID:** {run_meta.get('run_id')}")
|
| 9 |
lines.append(f"**Model:** {run_meta.get('model_info')}\n")
|
| 10 |
|
|
|
|
| 25 |
for k, v in (summary.get("top_confusions", {}) or {}).items():
|
| 26 |
lines.append(f"- {k}: {v}")
|
| 27 |
|
|
|
|
| 28 |
for key in ["device", "domain", "accent", "speaker"]:
|
| 29 |
k2 = f"worst_{key}_by_cer"
|
| 30 |
if k2 in summary:
|
|
|
|
| 32 |
for item in summary[k2]:
|
| 33 |
lines.append(f"- {item['key']}: {item['cer']:.4f}")
|
| 34 |
|
| 35 |
+
if llm_diagnosis:
|
| 36 |
+
lines.append("\n## LLM Executive Summary")
|
| 37 |
+
if llm_diagnosis.get("executive_summary"):
|
| 38 |
+
lines.append(llm_diagnosis["executive_summary"])
|
| 39 |
+
|
| 40 |
+
patterns = llm_diagnosis.get("major_patterns") or []
|
| 41 |
+
if patterns:
|
| 42 |
+
lines.append("\n## LLM Major Patterns")
|
| 43 |
+
for idx, item in enumerate(patterns, 1):
|
| 44 |
+
lines.append(f"### {idx}. {item.get('title', 'Untitled Pattern')}")
|
| 45 |
+
if item.get("phenomenon"):
|
| 46 |
+
lines.append(f"- 现象: {item['phenomenon']}")
|
| 47 |
+
if item.get("semantic_impact"):
|
| 48 |
+
lines.append(f"- 语义影响: {item['semantic_impact']}")
|
| 49 |
+
if item.get("confidence") is not None:
|
| 50 |
+
lines.append(f"- 置信度: {item['confidence']}")
|
| 51 |
+
for e in item.get("evidence", []) or []:
|
| 52 |
+
lines.append(f"- 证据: {e}")
|
| 53 |
+
for c in item.get("likely_causes", []) or []:
|
| 54 |
+
lines.append(f"- 可能原因: {c}")
|
| 55 |
+
for r in item.get("recommendations", []) or []:
|
| 56 |
+
lines.append(f"- 建议: {r}")
|
| 57 |
+
|
| 58 |
+
findings = llm_diagnosis.get("case_findings") or []
|
| 59 |
+
if findings:
|
| 60 |
+
lines.append("\n## Representative Semantic Findings")
|
| 61 |
+
for item in findings[:10]:
|
| 62 |
+
lines.append(f"- {item.get('utt_id')}: {item.get('semantic_judgement')};原因:{item.get('reason')}")
|
| 63 |
+
|
| 64 |
+
if llm_diagnosis.get("priority_actions"):
|
| 65 |
+
lines.append("\n## Priority Actions")
|
| 66 |
+
for x in llm_diagnosis["priority_actions"]:
|
| 67 |
+
lines.append(f"- {x}")
|
| 68 |
+
|
| 69 |
+
if llm_diagnosis.get("uncertainties"):
|
| 70 |
+
lines.append("\n## Uncertainties")
|
| 71 |
+
for x in llm_diagnosis["uncertainties"]:
|
| 72 |
+
lines.append(f"- {x}")
|
| 73 |
+
else:
|
| 74 |
+
lines.append("\n## Recommendations (auto-generated starter)")
|
| 75 |
+
lines.append("- 优先检查 CER/WER 在特定子集(device/domain/accent)是否显著升高,针对性补数据或做增强。")
|
| 76 |
+
lines.append("- 如果 top_confusions 集中在数字/时间类,可加入数字规范化与专门的后处理规则。")
|
| 77 |
+
lines.append("- 如果 mixed_language 占比高,考虑加入英文热词/专名词表或做 LM/解码侧增强。")
|
| 78 |
|
| 79 |
return "\n".join(lines)
|
scripts/run_diagnostic.py
CHANGED
|
@@ -6,7 +6,7 @@ from pathlib import Path
|
|
| 6 |
import pandas as pd
|
| 7 |
from openai import OpenAI
|
| 8 |
|
| 9 |
-
from analysis.
|
| 10 |
from report.diagnostic_report import generate_report_with_openai
|
| 11 |
|
| 12 |
|
|
@@ -20,21 +20,21 @@ def load_jsonl(path: Path):
|
|
| 20 |
return rows
|
| 21 |
|
| 22 |
|
| 23 |
-
def main(run_id: str, runs_dir: str = "runs"):
|
| 24 |
run_dir = Path(runs_dir) / run_id
|
| 25 |
|
| 26 |
df_align = pd.DataFrame(load_jsonl(run_dir / "aligned.jsonl"))
|
| 27 |
df_events = pd.read_parquet(run_dir / "events.parquet") if (run_dir / "events.parquet").exists() else pd.DataFrame()
|
| 28 |
summary = json.loads((run_dir / "summary.json").read_text(encoding="utf-8")) if (run_dir / "summary.json").exists() else {}
|
| 29 |
|
| 30 |
-
|
| 31 |
-
(
|
| 32 |
-
|
|
|
|
| 33 |
encoding="utf-8"
|
| 34 |
)
|
| 35 |
|
| 36 |
-
|
| 37 |
-
report = generate_report_with_openai(root_cause, summary, client)
|
| 38 |
(run_dir / "diagnostic_report.md").write_text(report, encoding="utf-8")
|
| 39 |
|
| 40 |
print(f"Diagnostic report written to: {run_dir / 'diagnostic_report.md'}")
|
|
@@ -45,5 +45,6 @@ if __name__ == "__main__":
|
|
| 45 |
ap = argparse.ArgumentParser()
|
| 46 |
ap.add_argument("--run_id", required=True)
|
| 47 |
ap.add_argument("--runs_dir", default="runs")
|
|
|
|
| 48 |
args = ap.parse_args()
|
| 49 |
-
main(args.run_id, args.runs_dir)
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
from openai import OpenAI
|
| 8 |
|
| 9 |
+
from analysis.llm_analyzer import analyze_with_llm
|
| 10 |
from report.diagnostic_report import generate_report_with_openai
|
| 11 |
|
| 12 |
|
|
|
|
| 20 |
return rows
|
| 21 |
|
| 22 |
|
| 23 |
+
def main(run_id: str, runs_dir: str = "runs", model: str = "gpt-4.1-mini"):
    """Regenerate ``llm_diagnosis.json`` and ``diagnostic_report.md`` for one run.

    Args:
        run_id: Name of the run directory under ``runs_dir``.
        runs_dir: Root directory holding all run artifacts.
        model: OpenAI model used for both diagnosis and report generation.

    Side effects: writes the two files into the run directory and prints the
    report path. Requires OPENAI_API_KEY in the environment (used by OpenAI()).
    """
    run_dir = Path(runs_dir) / run_id

    aligned_path = run_dir / "aligned.jsonl"
    events_path = run_dir / "events.parquet"
    summary_path = run_dir / "summary.json"

    # Missing artifacts degrade to empty containers so a partial run still diagnoses.
    df_align = pd.DataFrame(load_jsonl(aligned_path))
    df_events = pd.read_parquet(events_path) if events_path.exists() else pd.DataFrame()
    if summary_path.exists():
        summary = json.loads(summary_path.read_text(encoding="utf-8"))
    else:
        summary = {}

    client = OpenAI()

    llm_diagnosis = analyze_with_llm(df_align, df_events, summary, model=model, client=client)
    (run_dir / "llm_diagnosis.json").write_text(
        json.dumps(llm_diagnosis, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )

    report = generate_report_with_openai(llm_diagnosis, summary, client, model=model)
    (run_dir / "diagnostic_report.md").write_text(report, encoding="utf-8")

    print(f"Diagnostic report written to: {run_dir / 'diagnostic_report.md'}")
|
|
|
|
| 45 |
ap = argparse.ArgumentParser()
|
| 46 |
ap.add_argument("--run_id", required=True)
|
| 47 |
ap.add_argument("--runs_dir", default="runs")
|
| 48 |
+
ap.add_argument("--model", default="gpt-4.1-mini")
|
| 49 |
args = ap.parse_args()
|
| 50 |
+
main(args.run_id, args.runs_dir, args.model)
|
ui/app.py
CHANGED
|
@@ -5,11 +5,15 @@ import subprocess
|
|
| 5 |
import sys
|
| 6 |
from pathlib import Path
|
| 7 |
|
| 8 |
-
import pandas as pd
|
| 9 |
import gradio as gr
|
|
|
|
| 10 |
|
| 11 |
RUNS_DIR = Path("runs")
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def list_runs():
|
| 15 |
if not RUNS_DIR.exists():
|
|
@@ -20,107 +24,213 @@ def list_runs():
|
|
| 20 |
)
|
| 21 |
|
| 22 |
|
| 23 |
-
def
|
| 24 |
-
|
| 25 |
|
| 26 |
-
meta = json.loads((run_dir / "run_meta.json").read_text(encoding="utf-8"))
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
-
aligned_path = run_dir / "aligned.jsonl"
|
| 35 |
-
if aligned_path.exists():
|
| 36 |
-
rows = []
|
| 37 |
-
with aligned_path.open("r", encoding="utf-8") as f:
|
| 38 |
-
for line in f:
|
| 39 |
-
line = line.strip()
|
| 40 |
-
if line:
|
| 41 |
-
rows.append(json.loads(line))
|
| 42 |
-
df_align = pd.DataFrame(rows)
|
| 43 |
-
else:
|
| 44 |
-
df_align = pd.DataFrame()
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
diagnostic_path = run_dir / "diagnostic_report.md"
|
| 50 |
-
diagnostic_text = (
|
| 51 |
-
diagnostic_path.read_text(encoding="utf-8")
|
| 52 |
-
if diagnostic_path.exists()
|
| 53 |
-
else "No diagnostic report yet."
|
| 54 |
-
)
|
| 55 |
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
|
| 59 |
-
def build_summary_md(meta, summary):
|
| 60 |
lines = []
|
| 61 |
lines.append(f"### Run ID: `{meta.get('run_id')}`")
|
| 62 |
lines.append(f"- Model: `{meta.get('model_info')}`")
|
| 63 |
-
|
| 64 |
if "wer_mean" in summary and summary["wer_mean"] is not None:
|
| 65 |
lines.append(f"- WER(mean): **{summary['wer_mean']:.4f}**")
|
| 66 |
-
|
| 67 |
if "cer_mean" in summary and summary["cer_mean"] is not None:
|
| 68 |
lines.append(f"- CER(mean): **{summary['cer_mean']:.4f}**")
|
| 69 |
-
|
| 70 |
lines.append(f"- S/I/D: `{summary.get('sid_counts', {})}`")
|
| 71 |
-
|
| 72 |
if "top_error_classes" in summary:
|
| 73 |
lines.append(f"- Top error classes: `{summary.get('top_error_classes', {})}`")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
return "\n".join(lines)
|
| 76 |
|
| 77 |
|
| 78 |
-
def
|
| 79 |
-
if
|
| 80 |
-
return
|
|
|
|
|
|
|
| 81 |
|
| 82 |
-
meta, summary, df_align, df_events, diagnostic_text = load_run(run_id)
|
| 83 |
-
md = build_summary_md(meta, summary)
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
)
|
| 90 |
|
| 91 |
-
if len(df_events) and all(
|
| 92 |
-
c in df_events.columns for c in ["utt_id", "op_type", "ref", "hyp", "error_class", "level"]
|
| 93 |
-
):
|
| 94 |
-
events_view = df_events[["utt_id", "op_type", "ref", "hyp", "error_class", "level"]].head(100)
|
| 95 |
-
else:
|
| 96 |
-
events_view = pd.DataFrame()
|
| 97 |
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
|
| 101 |
def search_events(run_id, error_class, contains):
|
| 102 |
if not run_id:
|
| 103 |
return pd.DataFrame()
|
| 104 |
-
|
| 105 |
-
_, _, _, df_events, _ = load_run(run_id)
|
| 106 |
if df_events is None or len(df_events) == 0:
|
| 107 |
return pd.DataFrame()
|
| 108 |
-
|
| 109 |
q = df_events.copy()
|
| 110 |
-
|
| 111 |
-
if error_class and error_class != "ALL":
|
| 112 |
q = q[q["error_class"] == error_class]
|
| 113 |
-
|
| 114 |
if contains:
|
| 115 |
contains = str(contains)
|
| 116 |
q = q[
|
| 117 |
q["ref"].astype(str).str.contains(contains, na=False)
|
| 118 |
| q["hyp"].astype(str).str.contains(contains, na=False)
|
| 119 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
cols = [c for c in cols if c in q.columns]
|
| 123 |
-
return q[cols].head(
|
| 124 |
|
| 125 |
|
| 126 |
def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples):
|
|
@@ -134,72 +244,49 @@ def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, lan
|
|
| 134 |
"--language", language.strip(),
|
| 135 |
"--num", str(int(num_samples)),
|
| 136 |
]
|
| 137 |
-
|
| 138 |
if dataset_config and dataset_config.strip():
|
| 139 |
cmd += ["--dataset_config", dataset_config.strip()]
|
| 140 |
|
| 141 |
p = subprocess.run(cmd, capture_output=True, text=True)
|
| 142 |
out = (p.stdout or "") + ("\n" + (p.stderr or "") if p.stderr else "")
|
| 143 |
-
|
| 144 |
if p.returncode != 0:
|
| 145 |
-
out +=
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
)
|
| 149 |
-
return (
|
| 150 |
-
out,
|
| 151 |
-
gr.update(),
|
| 152 |
-
"",
|
| 153 |
-
pd.DataFrame(),
|
| 154 |
-
pd.DataFrame(),
|
| 155 |
-
"No diagnostic report yet.",
|
| 156 |
-
)
|
| 157 |
|
| 158 |
runs = list_runs()
|
| 159 |
latest = runs[0] if runs else None
|
| 160 |
-
|
| 161 |
if latest:
|
| 162 |
-
md, align_view, events_view, diagnostic_text = on_select_run(latest)
|
| 163 |
else:
|
| 164 |
-
md, align_view, events_view, diagnostic_text = "", pd.DataFrame(), pd.DataFrame(), "No diagnostic report yet."
|
| 165 |
|
| 166 |
-
return (
|
| 167 |
-
out,
|
| 168 |
-
gr.update(choices=runs, value=latest),
|
| 169 |
-
md,
|
| 170 |
-
align_view,
|
| 171 |
-
events_view,
|
| 172 |
-
diagnostic_text,
|
| 173 |
-
)
|
| 174 |
|
| 175 |
|
| 176 |
with gr.Blocks() as demo:
|
| 177 |
-
gr.Markdown("# ASR
|
| 178 |
|
| 179 |
with gr.Accordion("Run from Hugging Face", open=True):
|
| 180 |
gr.Markdown(
|
| 181 |
"Fill in a dataset and a Whisper model, then click **Run**. "
|
| 182 |
-
"If the dataset is gated, set `HF_TOKEN` in Space **Settings → Secrets**."
|
|
|
|
| 183 |
)
|
| 184 |
-
|
| 185 |
with gr.Row():
|
| 186 |
dataset_id = gr.Textbox(label="HF dataset repo id", value="fsicoli/common_voice_22_0")
|
| 187 |
dataset_config = gr.Textbox(label="Dataset config (optional)", value="zh-CN")
|
| 188 |
-
|
| 189 |
with gr.Row():
|
| 190 |
split = gr.Textbox(label="Split", value="validation")
|
| 191 |
text_field = gr.Textbox(label="Transcript field", value="sentence")
|
| 192 |
num_samples = gr.Number(label="Num samples", value=50, precision=0)
|
| 193 |
-
|
| 194 |
with gr.Row():
|
| 195 |
model_repo_id = gr.Textbox(label="HF model repo id", value="openai/whisper-small")
|
| 196 |
language = gr.Textbox(label="Language", value="zh")
|
| 197 |
-
|
| 198 |
run_btn = gr.Button("Run")
|
| 199 |
logs = gr.Textbox(label="Logs", lines=16)
|
| 200 |
|
| 201 |
gr.Markdown("## Browse Existing Runs")
|
| 202 |
-
|
| 203 |
runs = list_runs()
|
| 204 |
run_dd = gr.Dropdown(choices=runs, label="Select run", value=(runs[0] if runs else None))
|
| 205 |
summary_md = gr.Markdown()
|
|
@@ -210,15 +297,7 @@ with gr.Blocks() as demo:
|
|
| 210 |
|
| 211 |
with gr.Accordion("Search Error Events", open=False):
|
| 212 |
error_cls = gr.Dropdown(
|
| 213 |
-
choices=[
|
| 214 |
-
"ALL",
|
| 215 |
-
"number_or_time",
|
| 216 |
-
"mixed_language",
|
| 217 |
-
"substitution",
|
| 218 |
-
"deletion",
|
| 219 |
-
"insertion",
|
| 220 |
-
"other",
|
| 221 |
-
],
|
| 222 |
value="ALL",
|
| 223 |
label="error_class",
|
| 224 |
)
|
|
@@ -226,30 +305,53 @@ with gr.Blocks() as demo:
|
|
| 226 |
search_btn = gr.Button("Search")
|
| 227 |
result_tbl = gr.Dataframe(label="Search results", interactive=False)
|
| 228 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
with gr.Accordion("Diagnostic Report", open=True):
|
| 230 |
diagnostic_md = gr.Markdown("No diagnostic report yet.")
|
| 231 |
|
| 232 |
if runs:
|
| 233 |
-
md0, a0, e0, d0 = on_select_run(runs[0])
|
| 234 |
summary_md.value = md0
|
| 235 |
align_tbl.value = a0
|
| 236 |
events_tbl.value = e0
|
|
|
|
|
|
|
| 237 |
diagnostic_md.value = d0
|
|
|
|
|
|
|
| 238 |
|
| 239 |
run_dd.change(
|
| 240 |
on_select_run,
|
| 241 |
inputs=[run_dd],
|
| 242 |
-
outputs=[summary_md, align_tbl, events_tbl, diagnostic_md],
|
| 243 |
)
|
| 244 |
|
| 245 |
-
search_btn.click(
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
|
|
|
|
|
|
| 249 |
)
|
| 250 |
|
| 251 |
run_btn.click(
|
| 252 |
run_hf_job,
|
| 253 |
inputs=[dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples],
|
| 254 |
-
outputs=[logs, run_dd, summary_md, align_tbl, events_tbl, diagnostic_md],
|
| 255 |
)
|
|
|
|
| 5 |
import sys
|
| 6 |
from pathlib import Path
|
| 7 |
|
|
|
|
| 8 |
import gradio as gr
|
| 9 |
+
import pandas as pd
|
| 10 |
|
| 11 |
RUNS_DIR = Path("runs")
|
| 12 |
|
| 13 |
+
SEMANTIC_JUDGEMENTS = ["ALL", "语义基本等价", "轻微偏差", "明显偏差", "严重失真", "不确定"]
|
| 14 |
+
SEVERITIES = ["ALL", "high", "medium", "low"]
|
| 15 |
+
BUSINESS_IMPACTS = ["ALL", "high", "medium", "low"]
|
| 16 |
+
|
| 17 |
|
| 18 |
def list_runs():
|
| 19 |
if not RUNS_DIR.exists():
|
|
|
|
| 24 |
)
|
| 25 |
|
| 26 |
|
| 27 |
+
def _read_json(path: Path, default):
|
| 28 |
+
return json.loads(path.read_text(encoding="utf-8")) if path.exists() else default
|
| 29 |
|
|
|
|
| 30 |
|
| 31 |
+
def _read_jsonl(path: Path):
|
| 32 |
+
rows = []
|
| 33 |
+
if not path.exists():
|
| 34 |
+
return rows
|
| 35 |
+
with path.open("r", encoding="utf-8") as f:
|
| 36 |
+
for line in f:
|
| 37 |
+
line = line.strip()
|
| 38 |
+
if line:
|
| 39 |
+
rows.append(json.loads(line))
|
| 40 |
+
return rows
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
+
def _normalize_semantic_df(df: pd.DataFrame) -> pd.DataFrame:
|
| 44 |
+
if df is None or len(df) == 0:
|
| 45 |
+
return pd.DataFrame()
|
| 46 |
+
out = df.copy()
|
| 47 |
+
for col in ["semantic_error_types", "improvement_suggestions"]:
|
| 48 |
+
if col in out.columns:
|
| 49 |
+
out[col] = out[col].apply(lambda xs: xs if isinstance(xs, list) else ([] if pd.isna(xs) else [str(xs)]))
|
| 50 |
+
if "semantic_error_types" in out.columns and "semantic_error_types_str" not in out.columns:
|
| 51 |
+
out["semantic_error_types_str"] = out["semantic_error_types"].apply(lambda xs: " | ".join(xs))
|
| 52 |
+
if "improvement_suggestions" in out.columns and "improvement_suggestions_str" not in out.columns:
|
| 53 |
+
out["improvement_suggestions_str"] = out["improvement_suggestions"].apply(lambda xs: " | ".join(xs))
|
| 54 |
+
return out
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
+
def load_run(run_id: str):
    """Load every artifact produced for one run directory.

    Returns a 7-tuple: (meta, summary, df_align, df_events, df_semantic,
    llm_diagnosis, diagnostic_text). Missing artifacts degrade to empty
    dicts/DataFrames or a placeholder report string.
    """
    base = RUNS_DIR / run_id

    events_path = base / "events.parquet"
    semantic_path = base / "semantic_findings.parquet"
    report_path = base / "diagnostic_report.md"

    meta = _read_json(base / "run_meta.json", {})
    summary = _read_json(base / "summary.json", {})
    df_align = pd.DataFrame(_read_jsonl(base / "aligned.jsonl"))
    df_events = pd.read_parquet(events_path) if events_path.exists() else pd.DataFrame()
    df_semantic = (
        _normalize_semantic_df(pd.read_parquet(semantic_path))
        if semantic_path.exists()
        else pd.DataFrame()
    )
    llm_diagnosis = _read_json(base / "llm_diagnosis.json", {})
    if report_path.exists():
        diagnostic_text = report_path.read_text(encoding="utf-8")
    else:
        diagnostic_text = "No diagnostic report yet."

    return meta, summary, df_align, df_events, df_semantic, llm_diagnosis, diagnostic_text
|
| 70 |
|
| 71 |
|
| 72 |
+
def build_summary_md(meta, summary, df_semantic: pd.DataFrame | None = None):
    """Render a markdown summary: run metadata, metrics, and (when available)
    per-utterance semantic-judgement statistics."""
    parts = [
        f"### Run ID: `{meta.get('run_id')}`",
        f"- Model: `{meta.get('model_info')}`",
    ]

    if summary.get("wer_mean") is not None:
        parts.append(f"- WER(mean): **{summary['wer_mean']:.4f}**")
    if summary.get("cer_mean") is not None:
        parts.append(f"- CER(mean): **{summary['cer_mean']:.4f}**")
    parts.append(f"- S/I/D: `{summary.get('sid_counts', {})}`")
    if "top_error_classes" in summary:
        parts.append(f"- Top error classes: `{summary.get('top_error_classes', {})}`")

    if df_semantic is not None and len(df_semantic) > 0:
        if "semantic_judgement" in df_semantic.columns:
            sem_counts = df_semantic["semantic_judgement"].fillna("不确定").value_counts().to_dict()
        else:
            sem_counts = {}
        if "business_impact" in df_semantic.columns:
            high_impact = int((df_semantic["business_impact"] == "high").sum())
        else:
            high_impact = 0
        parts.append(f"- Semantic judgements: `{sem_counts}`")
        parts.append(f"- High business impact utterances: **{high_impact}**")

    return "\n".join(parts)
|
| 89 |
+
|
| 90 |
|
| 91 |
+
def build_semantic_overview_md(df_semantic: pd.DataFrame, llm_diagnosis: dict):
    """Render a markdown overview of the LLM per-utterance semantic diagnostics."""
    if df_semantic is None or len(df_semantic) == 0:
        return "### Semantic Overview\n暂无 per-utterance LLM 语义诊断结果。请先用配置了 `OPENAI_API_KEY` 的流程跑分析。"

    md = ["### Semantic Overview"]
    cols = df_semantic.columns

    if "semantic_judgement" in cols:
        judgement_dist = df_semantic["semantic_judgement"].fillna("不确定").value_counts().to_dict()
        md.append(f"- 语义判断分布: `{judgement_dist}`")

    if "business_impact" in cols:
        impact_dist = df_semantic["business_impact"].fillna("low").value_counts().to_dict()
        md.append(f"- 业务影响分布: `{impact_dist}`")

    if "semantic_error_types" in cols:
        # Flatten the per-row type lists so we can count overall frequency.
        flattened = []
        for entry in df_semantic["semantic_error_types"].dropna().tolist():
            if isinstance(entry, list):
                flattened.extend(entry)
            else:
                flattened.append(str(entry))
        if flattened:
            frequent = pd.Series(flattened).value_counts().head(8).to_dict()
            md.append(f"- 高频语义错误类型: `{frequent}`")

    actions = llm_diagnosis.get("priority_actions")
    if actions:
        md.append("- 优先行动:")
        for action in actions[:5]:
            md.append(f" - {action}")

    return "\n".join(md)
|
| 113 |
|
| 114 |
|
| 115 |
+
def _head_align(df_align: pd.DataFrame) -> pd.DataFrame:
|
| 116 |
+
if len(df_align) == 0:
|
| 117 |
+
return pd.DataFrame()
|
| 118 |
+
cols = [c for c in ["utt_id", "wer", "cer", "ref_text", "hyp_text"] if c in df_align.columns]
|
| 119 |
+
return df_align[cols].head(50)
|
| 120 |
|
|
|
|
|
|
|
| 121 |
|
| 122 |
+
def _head_events(df_events: pd.DataFrame) -> pd.DataFrame:
|
| 123 |
+
if len(df_events) == 0:
|
| 124 |
+
return pd.DataFrame()
|
| 125 |
+
cols = [c for c in ["utt_id", "op_type", "ref", "hyp", "error_class", "level"] if c in df_events.columns]
|
| 126 |
+
return df_events[cols].head(100)
|
| 127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
+
def _head_semantic(df_semantic: pd.DataFrame) -> pd.DataFrame:
|
| 130 |
+
if len(df_semantic) == 0:
|
| 131 |
+
return pd.DataFrame()
|
| 132 |
+
cols = [
|
| 133 |
+
"utt_id", "semantic_judgement", "severity", "business_impact", "wer", "cer",
|
| 134 |
+
"semantic_error_types_str", "reason", "ref_text", "hyp_text",
|
| 135 |
+
]
|
| 136 |
+
cols = [c for c in cols if c in df_semantic.columns]
|
| 137 |
+
return df_semantic.sort_values([c for c in ["business_impact", "severity", "cer"] if c in df_semantic.columns], ascending=[True, True, False][:len([c for c in ["business_impact", "severity", "cer"] if c in df_semantic.columns])]).head(100)[cols]
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def on_select_run(run_id):
    """Gradio callback: refresh every panel for the newly selected run.

    Returns (summary_md, align_head, events_head, semantic_head, semantic_md,
    diagnostic_text, semantic_type_dd_update, domain_dd_update).
    """
    if not run_id:
        blank = pd.DataFrame()
        return "", blank, blank, blank, "", "No diagnostic report yet.", gr.update(choices=[]), gr.update(choices=[])

    meta, summary, df_align, df_events, df_semantic, llm_diagnosis, diagnostic_text = load_run(run_id)

    has_semantic = len(df_semantic) > 0

    # Collect the distinct semantic error types for the filter dropdown.
    error_type_options = ["ALL"]
    if has_semantic and "semantic_error_types" in df_semantic.columns:
        seen = set()
        for entry in df_semantic["semantic_error_types"].dropna().tolist():
            if isinstance(entry, list):
                seen.update(str(item) for item in entry if item)
            elif entry:
                seen.add(str(entry))
        error_type_options += sorted(seen)

    # Collect the distinct domains for the filter dropdown.
    domain_options = ["ALL"]
    if has_semantic and "domain" in df_semantic.columns:
        domain_options += sorted(str(value) for value in df_semantic["domain"].dropna().unique())

    return (
        build_summary_md(meta, summary, df_semantic),
        _head_align(df_align),
        _head_events(df_events),
        _head_semantic(df_semantic),
        build_semantic_overview_md(df_semantic, llm_diagnosis),
        diagnostic_text,
        gr.update(choices=error_type_options, value="ALL"),
        gr.update(choices=domain_options, value="ALL"),
    )
|
| 173 |
|
| 174 |
|
| 175 |
def search_events(run_id, error_class, contains):
    """Filter a run's error events by class and literal ref/hyp substring.

    Args:
        run_id: Run to load; falsy values return an empty frame.
        error_class: Dropdown value; "ALL" (or a missing column) disables it.
        contains: Literal substring searched in both ref and hyp text.

    Fixes vs previous version: the text filter now guards the ``ref``/``hyp``
    columns (the class filter was guarded but these were not, so a frame
    lacking them raised KeyError) and matches literally (``regex=False``),
    so user input like ``(`` no longer crashes as a malformed regex.
    """
    if not run_id:
        return pd.DataFrame()
    _, _, _, df_events, _, _, _ = load_run(run_id)
    if df_events is None or len(df_events) == 0:
        return pd.DataFrame()

    q = df_events.copy()
    if error_class and error_class != "ALL" and "error_class" in q.columns:
        q = q[q["error_class"] == error_class]
    if contains:
        needle = str(contains)
        mask = pd.Series(False, index=q.index)
        for col in ("ref", "hyp"):
            if col in q.columns:
                mask |= q[col].astype(str).str.contains(needle, na=False, regex=False)
        q = q[mask]

    cols = [c for c in ["utt_id", "op_type", "ref", "hyp", "error_class", "level"] if c in q.columns]
    return q[cols].head(200)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def search_semantic(run_id, judgement, severity, business_impact, semantic_type, domain, contains, min_cer):
    """Filter and rank a run's per-utterance semantic findings.

    Args:
        run_id: Run to load; falsy values return an empty frame.
        judgement / severity / business_impact: Dropdown values; "ALL" (or a
            missing column) disables each exact-match filter.
        semantic_type: Literal substring matched inside semantic_error_types_str.
        domain: Exact domain match; "ALL" disables it.
        contains: Literal substring searched across ref/hyp/reason/type columns.
        min_cer: Keep rows with CER >= this value (NaN treated as 0).

    Fixes vs previous version:
    - Text search builds its mask per *present* column. The old expression
      OR-ed full-length masks with ``q.get(col, pd.Series(dtype=str))``
      defaults, whose empty index misaligned the combined boolean mask when
      a column was missing.
    - Literal matching (``regex=False``), so user text with regex special
      characters no longer crashes.
    - Sort flags are paired with their column explicitly; the old
      ``ascending=[True, True, False][:n]`` slice shifted flags (sorting CER
      ascending) whenever business_impact/severity were absent.
    """
    if not run_id:
        return pd.DataFrame()
    _, _, _, _, df_semantic, _, _ = load_run(run_id)
    if df_semantic is None or len(df_semantic) == 0:
        return pd.DataFrame()

    q = df_semantic.copy()

    # Exact-match dropdown filters.
    for col, wanted in (
        ("semantic_judgement", judgement),
        ("severity", severity),
        ("business_impact", business_impact),
    ):
        if wanted and wanted != "ALL" and col in q.columns:
            q = q[q[col] == wanted]

    if semantic_type and semantic_type != "ALL" and "semantic_error_types_str" in q.columns:
        q = q[q["semantic_error_types_str"].astype(str).str.contains(str(semantic_type), na=False, regex=False)]
    if domain and domain != "ALL" and "domain" in q.columns:
        q = q[q["domain"].astype(str) == str(domain)]

    if contains:
        needle = str(contains)
        mask = pd.Series(False, index=q.index)
        for col in ("ref_text", "hyp_text", "reason", "semantic_error_types_str"):
            if col in q.columns:
                mask |= q[col].astype(str).str.contains(needle, na=False, regex=False)
        q = q[mask]

    if min_cer is not None and "cer" in q.columns:
        q = q[q["cer"].fillna(0) >= float(min_cer)]

    # Worst-first ordering: business_impact/severity ascending ("high" sorts
    # first alphabetically), CER descending.
    sort_spec = [("business_impact", True), ("severity", True), ("cer", False)]
    by = [(col, asc) for col, asc in sort_spec if col in q.columns]
    if by:
        q = q.sort_values([c for c, _ in by], ascending=[a for _, a in by])

    cols = [
        "utt_id", "semantic_judgement", "severity", "business_impact", "wer", "cer",
        "semantic_error_types_str", "reason", "improvement_suggestions_str", "domain", "accent",
        "ref_text", "hyp_text",
    ]
    cols = [c for c in cols if c in q.columns]
    return q[cols].head(300)
|
| 234 |
|
| 235 |
|
| 236 |
def run_hf_job(dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples):
|
|
|
|
| 244 |
"--language", language.strip(),
|
| 245 |
"--num", str(int(num_samples)),
|
| 246 |
]
|
|
|
|
| 247 |
if dataset_config and dataset_config.strip():
|
| 248 |
cmd += ["--dataset_config", dataset_config.strip()]
|
| 249 |
|
| 250 |
p = subprocess.run(cmd, capture_output=True, text=True)
|
| 251 |
out = (p.stdout or "") + ("\n" + (p.stderr or "") if p.stderr else "")
|
|
|
|
| 252 |
if p.returncode != 0:
|
| 253 |
+
out += "\n\n[HINT] If you see 401/403 for Common Voice: set HF_TOKEN in Space Settings → Secrets, and accept dataset terms on HF."
|
| 254 |
+
empty = pd.DataFrame()
|
| 255 |
+
return out, gr.update(), "", empty, empty, empty, "", "No diagnostic report yet.", gr.update(), gr.update()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
runs = list_runs()
|
| 258 |
latest = runs[0] if runs else None
|
|
|
|
| 259 |
if latest:
|
| 260 |
+
md, align_view, events_view, semantic_view, semantic_md, diagnostic_text, type_dd, domain_dd = on_select_run(latest)
|
| 261 |
else:
|
| 262 |
+
md, align_view, events_view, semantic_view, semantic_md, diagnostic_text, type_dd, domain_dd = "", pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), "", "No diagnostic report yet.", gr.update(), gr.update()
|
| 263 |
|
| 264 |
+
return out, gr.update(choices=runs, value=latest), md, align_view, events_view, semantic_view, semantic_md, diagnostic_text, type_dd, domain_dd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
|
| 267 |
with gr.Blocks() as demo:
|
| 268 |
+
gr.Markdown("# ASR LLM Agent UI")
|
| 269 |
|
| 270 |
with gr.Accordion("Run from Hugging Face", open=True):
|
| 271 |
gr.Markdown(
|
| 272 |
"Fill in a dataset and a Whisper model, then click **Run**. "
|
| 273 |
+
"If the dataset is gated, set `HF_TOKEN` in Space **Settings → Secrets**. "
|
| 274 |
+
"For LLM semantic diagnostics, make sure `OPENAI_API_KEY` is available."
|
| 275 |
)
|
|
|
|
| 276 |
with gr.Row():
|
| 277 |
dataset_id = gr.Textbox(label="HF dataset repo id", value="fsicoli/common_voice_22_0")
|
| 278 |
dataset_config = gr.Textbox(label="Dataset config (optional)", value="zh-CN")
|
|
|
|
| 279 |
with gr.Row():
|
| 280 |
split = gr.Textbox(label="Split", value="validation")
|
| 281 |
text_field = gr.Textbox(label="Transcript field", value="sentence")
|
| 282 |
num_samples = gr.Number(label="Num samples", value=50, precision=0)
|
|
|
|
| 283 |
with gr.Row():
|
| 284 |
model_repo_id = gr.Textbox(label="HF model repo id", value="openai/whisper-small")
|
| 285 |
language = gr.Textbox(label="Language", value="zh")
|
|
|
|
| 286 |
run_btn = gr.Button("Run")
|
| 287 |
logs = gr.Textbox(label="Logs", lines=16)
|
| 288 |
|
| 289 |
gr.Markdown("## Browse Existing Runs")
|
|
|
|
| 290 |
runs = list_runs()
|
| 291 |
run_dd = gr.Dropdown(choices=runs, label="Select run", value=(runs[0] if runs else None))
|
| 292 |
summary_md = gr.Markdown()
|
|
|
|
| 297 |
|
| 298 |
with gr.Accordion("Search Error Events", open=False):
|
| 299 |
error_cls = gr.Dropdown(
|
| 300 |
+
choices=["ALL", "number_or_time", "mixed_language", "substitution", "deletion", "insertion", "other"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
value="ALL",
|
| 302 |
label="error_class",
|
| 303 |
)
|
|
|
|
| 305 |
search_btn = gr.Button("Search")
|
| 306 |
result_tbl = gr.Dataframe(label="Search results", interactive=False)
|
| 307 |
|
| 308 |
+
gr.Markdown("## Per-Utterance Semantic Diagnostics")
|
| 309 |
+
semantic_overview_md = gr.Markdown("暂无语义诊断结果。")
|
| 310 |
+
semantic_tbl = gr.Dataframe(label="Semantic findings (head)", interactive=False)
|
| 311 |
+
|
| 312 |
+
with gr.Accordion("Filter Semantic Errors", open=True):
|
| 313 |
+
with gr.Row():
|
| 314 |
+
semantic_judgement = gr.Dropdown(choices=SEMANTIC_JUDGEMENTS, value="ALL", label="semantic_judgement")
|
| 315 |
+
semantic_severity = gr.Dropdown(choices=SEVERITIES, value="ALL", label="severity")
|
| 316 |
+
semantic_business_impact = gr.Dropdown(choices=BUSINESS_IMPACTS, value="ALL", label="business_impact")
|
| 317 |
+
with gr.Row():
|
| 318 |
+
semantic_type = gr.Dropdown(choices=["ALL"], value="ALL", label="semantic_error_type")
|
| 319 |
+
semantic_domain = gr.Dropdown(choices=["ALL"], value="ALL", label="domain")
|
| 320 |
+
semantic_min_cer = gr.Number(label="min CER", value=0.0)
|
| 321 |
+
semantic_contains = gr.Textbox(label="contains (ref/hyp/reason/type)")
|
| 322 |
+
semantic_search_btn = gr.Button("Filter semantic findings")
|
| 323 |
+
semantic_result_tbl = gr.Dataframe(label="Filtered semantic findings", interactive=False)
|
| 324 |
+
|
| 325 |
with gr.Accordion("Diagnostic Report", open=True):
|
| 326 |
diagnostic_md = gr.Markdown("No diagnostic report yet.")
|
| 327 |
|
| 328 |
if runs:
|
| 329 |
+
md0, a0, e0, s0, so0, d0, type0, domain0 = on_select_run(runs[0])
|
| 330 |
summary_md.value = md0
|
| 331 |
align_tbl.value = a0
|
| 332 |
events_tbl.value = e0
|
| 333 |
+
semantic_tbl.value = s0
|
| 334 |
+
semantic_overview_md.value = so0
|
| 335 |
diagnostic_md.value = d0
|
| 336 |
+
semantic_type.choices = type0["choices"]
|
| 337 |
+
semantic_domain.choices = domain0["choices"]
|
| 338 |
|
| 339 |
run_dd.change(
|
| 340 |
on_select_run,
|
| 341 |
inputs=[run_dd],
|
| 342 |
+
outputs=[summary_md, align_tbl, events_tbl, semantic_tbl, semantic_overview_md, diagnostic_md, semantic_type, semantic_domain],
|
| 343 |
)
|
| 344 |
|
| 345 |
+
search_btn.click(search_events, inputs=[run_dd, error_cls, contains], outputs=[result_tbl])
|
| 346 |
+
|
| 347 |
+
semantic_search_btn.click(
|
| 348 |
+
search_semantic,
|
| 349 |
+
inputs=[run_dd, semantic_judgement, semantic_severity, semantic_business_impact, semantic_type, semantic_domain, semantic_contains, semantic_min_cer],
|
| 350 |
+
outputs=[semantic_result_tbl],
|
| 351 |
)
|
| 352 |
|
| 353 |
run_btn.click(
|
| 354 |
run_hf_job,
|
| 355 |
inputs=[dataset_id, dataset_config, split, text_field, model_repo_id, language, num_samples],
|
| 356 |
+
outputs=[logs, run_dd, summary_md, align_tbl, events_tbl, semantic_tbl, semantic_overview_md, diagnostic_md, semantic_type, semantic_domain],
|
| 357 |
)
|