# ASR_AGENT_/analysis/root_cause.py
# Author: unknown
# Commit 2d79471: Add diagnostic report panel to UI
from __future__ import annotations
from typing import Dict, List, Any
import pandas as pd
def _safe_ratio(n: int, d: int) -> float:
return float(n / d) if d else 0.0
def infer_root_causes(df_events: pd.DataFrame, df_align: pd.DataFrame) -> Dict[str, Any]:
"""
Rule/statistics based root-cause inference.
Input:
- df_events: events.parquet loaded as DataFrame
- df_align: aligned.jsonl loaded as DataFrame
Output:
- dict with evidence, issue hypotheses, and recommendations
"""
result: Dict[str, Any] = {
"overview": {},
"root_causes": [],
"evidence_tables": {},
}
total_events = len(df_events)
total_utts = len(df_align)
result["overview"] = {
"num_utterances": int(total_utts),
"num_error_events": int(total_events),
"wer_mean": float(df_align["wer"].dropna().mean()) if "wer" in df_align.columns and df_align["wer"].notna().any() else None,
"cer_mean": float(df_align["cer"].dropna().mean()) if "cer" in df_align.columns and df_align["cer"].notna().any() else None,
}
if total_events == 0:
result["root_causes"].append({
"cause": "no_errors_detected",
"confidence": 1.0,
"evidence": ["No error events found in current run."],
"recommendations": ["Use a weaker model or more difficult dataset to make diagnosis meaningful."]
})
return result
# Basic counts
op_counts = df_events["op_type"].value_counts().to_dict() if "op_type" in df_events.columns else {}
cls_counts = df_events["error_class"].value_counts().to_dict() if "error_class" in df_events.columns else {}
result["evidence_tables"]["op_counts"] = {k: int(v) for k, v in op_counts.items()}
result["evidence_tables"]["error_class_counts"] = {k: int(v) for k, v in cls_counts.items()}
# --- Cause 1: number/time normalization problems
num_time_count = int(cls_counts.get("number_or_time", 0))
if _safe_ratio(num_time_count, total_events) >= 0.15:
result["root_causes"].append({
"cause": "number_time_format",
"confidence": round(min(0.95, 0.5 + _safe_ratio(num_time_count, total_events)), 3),
"evidence": [
f"number_or_time events = {num_time_count}/{total_events}",
"Large proportion of errors are related to numbers, dates, times, or units."
],
"recommendations": [
"Add number/date/time normalization in both reference and hypothesis.",
"Create post-processing rules for time/unit expressions.",
"Add more number-heavy utterances into evaluation/training."
]
})
# --- Cause 2: mixed-language problems
mixed_count = int(cls_counts.get("mixed_language", 0))
if _safe_ratio(mixed_count, total_events) >= 0.10:
result["root_causes"].append({
"cause": "mixed_language",
"confidence": round(min(0.95, 0.45 + _safe_ratio(mixed_count, total_events)), 3),
"evidence": [
f"mixed_language events = {mixed_count}/{total_events}",
"Frequent English/Latin-token related substitutions suggest code-switching weakness."
],
"recommendations": [
"Add bilingual/code-switching evaluation samples.",
"Add domain-specific English terms, abbreviations, and brand names.",
"Add post-processing lexicon for mixed-language phrases."
]
})
# --- Cause 3: deletion-heavy => possible noise / far-field / VAD
deletion_count = int(op_counts.get("D", 0))
insertion_count = int(op_counts.get("I", 0))
substitution_count = int(op_counts.get("S", 0))
if _safe_ratio(deletion_count, total_events) >= 0.30:
result["root_causes"].append({
"cause": "noise_or_farfield_or_vad",
"confidence": round(min(0.95, 0.5 + _safe_ratio(deletion_count, total_events)), 3),
"evidence": [
f"deletion events = {deletion_count}/{total_events}",
"High deletion ratio often indicates weak audibility, noise, far-field speech, or segmentation/VAD issues."
],
"recommendations": [
"Compare CER/WER across device / SNR / domain slices.",
"Inspect quiet, noisy, or long utterances.",
"Tune VAD or segmentation strategy.",
"Add noisy / far-field augmented audio."
]
})
# --- Cause 4: insertion-heavy => possible segmentation/repetition/echo
if _safe_ratio(insertion_count, total_events) >= 0.20:
result["root_causes"].append({
"cause": "segmentation_or_repetition",
"confidence": round(min(0.9, 0.45 + _safe_ratio(insertion_count, total_events)), 3),
"evidence": [
f"insertion events = {insertion_count}/{total_events}",
"High insertion ratio often suggests repeated decoding, segmentation mismatch, or echo."
],
"recommendations": [
"Inspect duplicated filler words and repeated fragments.",
"Review chunking / segmentation.",
"Check whether punctuation or normalization creates false insertions."
]
})
# --- Cause 5: slice-based evidence (device/domain/accent/speaker)
slice_findings = []
for key in ["device", "domain", "accent", "speaker"]:
if key in df_align.columns and df_align[key].notna().any() and "cer" in df_align.columns:
g = df_align.groupby(key)["cer"].mean().dropna().sort_values(ascending=False)
if len(g) >= 2:
worst_key = str(g.index[0])
worst_val = float(g.iloc[0])
best_val = float(g.iloc[-1])
if best_val > 0 and worst_val / best_val >= 1.5:
slice_findings.append({
"slice_key": key,
"worst_group": worst_key,
"worst_cer": worst_val,
"best_cer": best_val,
"ratio": worst_val / best_val
})
if slice_findings:
result["evidence_tables"]["slice_findings"] = slice_findings
result["root_causes"].append({
"cause": "slice_specific_weakness",
"confidence": 0.85,
"evidence": [
"Some slices show much worse CER than others.",
*[
f"{x['slice_key']}={x['worst_group']} has CER {x['worst_cer']:.4f}, ratio vs best={x['ratio']:.2f}"
for x in slice_findings[:5]
]
],
"recommendations": [
"Prioritize the worst slices in future analysis/training.",
"Check whether those slices correspond to accent, device, or scenario mismatch."
]
})
# --- Cause 6: substitution-dominant => pronunciation / lexical confusion
if _safe_ratio(substitution_count, total_events) >= 0.60:
result["root_causes"].append({
"cause": "pronunciation_or_lexical_confusion",
"confidence": round(min(0.9, 0.45 + _safe_ratio(substitution_count, total_events)), 3),
"evidence": [
f"substitution events = {substitution_count}/{total_events}",
"Substitutions dominate, which often indicates pronunciation ambiguity, lexical confusion, or near-homophone errors."
],
"recommendations": [
"Add confusion-pair statistics.",
"Check near-homophone and accent-sensitive confusions.",
"Build a pronunciation-aware analysis layer."
]
})
if not result["root_causes"]:
result["root_causes"].append({
"cause": "general_asr_mismatch",
"confidence": 0.5,
"evidence": ["No single dominant root cause identified from current heuristics."],
"recommendations": [
"Inspect top confusion pairs and low-performing slices.",
"Increase metadata coverage (device/domain/accent/snr)."
]
})
return result