Spaces:
Running
Running
from __future__ import annotations

from typing import Any, Dict, List

import pandas as pd
| def _safe_ratio(n: int, d: int) -> float: | |
| return float(n / d) if d else 0.0 | |
def _ratio(n: int, d: int) -> float:
    """Return n / d as a float, treating a zero denominator as 0.0."""
    return n / d if d else 0.0


def _column_mean(df: pd.DataFrame, col: str) -> float | None:
    """Mean of a numeric column, or None when it is missing or all-NaN."""
    if col in df.columns and df[col].notna().any():
        return float(df[col].dropna().mean())
    return None


def _int_value_counts(df: pd.DataFrame, col: str) -> Dict[Any, int]:
    """Value counts of a column as a plain {value: int} dict ({} if absent).

    Values are converted to Python ints so the result is JSON-serializable.
    """
    if col not in df.columns:
        return {}
    return {k: int(v) for k, v in df[col].value_counts().items()}


def _append_ratio_cause(
    result: Dict[str, Any],
    *,
    count: int,
    total: int,
    min_share: float,
    base: float,
    cap: float,
    cause: str,
    label: str,
    note: str,
    recommendations: List[str],
) -> None:
    """Append one threshold-rule hypothesis to result["root_causes"].

    Fires when count/total >= min_share. Confidence is base + share,
    capped at `cap` so heuristics never claim near-certainty.
    """
    share = _ratio(count, total)
    if share < min_share:
        return
    result["root_causes"].append({
        "cause": cause,
        "confidence": round(min(cap, base + share), 3),
        "evidence": [f"{label} = {count}/{total}", note],
        "recommendations": recommendations,
    })


def _slice_findings(df_align: pd.DataFrame) -> List[Dict[str, Any]]:
    """Per-slice CER gaps: groups whose mean CER is >= 1.5x the best group's."""
    findings: List[Dict[str, Any]] = []
    # "cer" requirement is loop-invariant; check it once up front.
    if "cer" not in df_align.columns:
        return findings
    for key in ("device", "domain", "accent", "speaker"):
        if key not in df_align.columns or not df_align[key].notna().any():
            continue
        g = df_align.groupby(key)["cer"].mean().dropna().sort_values(ascending=False)
        if len(g) < 2:
            continue
        worst_val = float(g.iloc[0])
        best_val = float(g.iloc[-1])
        # The ratio test only makes sense with a non-zero baseline.
        if best_val > 0 and worst_val / best_val >= 1.5:
            findings.append({
                "slice_key": key,
                "worst_group": str(g.index[0]),
                "worst_cer": worst_val,
                "best_cer": best_val,
                "ratio": worst_val / best_val,
            })
    return findings


def infer_root_causes(df_events: pd.DataFrame, df_align: pd.DataFrame) -> Dict[str, Any]:
    """
    Rule/statistics based root-cause inference.
    Input:
        - df_events: events.parquet loaded as DataFrame; one row per error
          event, with optional "op_type" ('S'/'D'/'I') and "error_class"
          columns.
        - df_align: aligned.jsonl loaded as DataFrame; one row per utterance,
          with optional "wer"/"cer" metric columns and optional
          "device"/"domain"/"accent"/"speaker" slice columns.
    Output:
        - dict with evidence, issue hypotheses, and recommendations:
          {"overview": {...}, "root_causes": [...], "evidence_tables": {...}}
    """
    result: Dict[str, Any] = {
        "overview": {},
        "root_causes": [],
        "evidence_tables": {},
    }
    total_events = len(df_events)
    result["overview"] = {
        "num_utterances": int(len(df_align)),
        "num_error_events": int(total_events),
        "wer_mean": _column_mean(df_align, "wer"),
        "cer_mean": _column_mean(df_align, "cer"),
    }

    # With no error events there is nothing to diagnose; report that directly.
    if total_events == 0:
        result["root_causes"].append({
            "cause": "no_errors_detected",
            "confidence": 1.0,
            "evidence": ["No error events found in current run."],
            "recommendations": ["Use a weaker model or more difficult dataset to make diagnosis meaningful."]
        })
        return result

    # Basic counts (plain ints, safe to serialize as evidence).
    op_counts = _int_value_counts(df_events, "op_type")
    cls_counts = _int_value_counts(df_events, "error_class")
    result["evidence_tables"]["op_counts"] = op_counts
    result["evidence_tables"]["error_class_counts"] = cls_counts

    # --- Cause 1: number/time normalization problems
    _append_ratio_cause(
        result,
        count=cls_counts.get("number_or_time", 0),
        total=total_events,
        min_share=0.15,
        base=0.5,
        cap=0.95,
        cause="number_time_format",
        label="number_or_time events",
        note="Large proportion of errors are related to numbers, dates, times, or units.",
        recommendations=[
            "Add number/date/time normalization in both reference and hypothesis.",
            "Create post-processing rules for time/unit expressions.",
            "Add more number-heavy utterances into evaluation/training."
        ],
    )

    # --- Cause 2: mixed-language problems
    _append_ratio_cause(
        result,
        count=cls_counts.get("mixed_language", 0),
        total=total_events,
        min_share=0.10,
        base=0.45,
        cap=0.95,
        cause="mixed_language",
        label="mixed_language events",
        note="Frequent English/Latin-token related substitutions suggest code-switching weakness.",
        recommendations=[
            "Add bilingual/code-switching evaluation samples.",
            "Add domain-specific English terms, abbreviations, and brand names.",
            "Add post-processing lexicon for mixed-language phrases."
        ],
    )

    # --- Cause 3: deletion-heavy => possible noise / far-field / VAD
    _append_ratio_cause(
        result,
        count=op_counts.get("D", 0),
        total=total_events,
        min_share=0.30,
        base=0.5,
        cap=0.95,
        cause="noise_or_farfield_or_vad",
        label="deletion events",
        note="High deletion ratio often indicates weak audibility, noise, far-field speech, or segmentation/VAD issues.",
        recommendations=[
            "Compare CER/WER across device / SNR / domain slices.",
            "Inspect quiet, noisy, or long utterances.",
            "Tune VAD or segmentation strategy.",
            "Add noisy / far-field augmented audio."
        ],
    )

    # --- Cause 4: insertion-heavy => possible segmentation/repetition/echo
    _append_ratio_cause(
        result,
        count=op_counts.get("I", 0),
        total=total_events,
        min_share=0.20,
        base=0.45,
        cap=0.9,
        cause="segmentation_or_repetition",
        label="insertion events",
        note="High insertion ratio often suggests repeated decoding, segmentation mismatch, or echo.",
        recommendations=[
            "Inspect duplicated filler words and repeated fragments.",
            "Review chunking / segmentation.",
            "Check whether punctuation or normalization creates false insertions."
        ],
    )

    # --- Cause 5: slice-based evidence (device/domain/accent/speaker)
    slice_findings = _slice_findings(df_align)
    if slice_findings:
        result["evidence_tables"]["slice_findings"] = slice_findings
        result["root_causes"].append({
            "cause": "slice_specific_weakness",
            "confidence": 0.85,
            "evidence": [
                "Some slices show much worse CER than others.",
                *[
                    f"{x['slice_key']}={x['worst_group']} has CER {x['worst_cer']:.4f}, ratio vs best={x['ratio']:.2f}"
                    for x in slice_findings[:5]
                ]
            ],
            "recommendations": [
                "Prioritize the worst slices in future analysis/training.",
                "Check whether those slices correspond to accent, device, or scenario mismatch."
            ]
        })

    # --- Cause 6: substitution-dominant => pronunciation / lexical confusion
    _append_ratio_cause(
        result,
        count=op_counts.get("S", 0),
        total=total_events,
        min_share=0.60,
        base=0.45,
        cap=0.9,
        cause="pronunciation_or_lexical_confusion",
        label="substitution events",
        note="Substitutions dominate, which often indicates pronunciation ambiguity, lexical confusion, or near-homophone errors.",
        recommendations=[
            "Add confusion-pair statistics.",
            "Check near-homophone and accent-sensitive confusions.",
            "Build a pronunciation-aware analysis layer."
        ],
    )

    # Fallback: no single heuristic fired, so give a generic hypothesis
    # rather than returning an empty cause list.
    if not result["root_causes"]:
        result["root_causes"].append({
            "cause": "general_asr_mismatch",
            "confidence": 0.5,
            "evidence": ["No single dominant root cause identified from current heuristics."],
            "recommendations": [
                "Inspect top confusion pairs and low-performing slices.",
                "Increase metadata coverage (device/domain/accent/snr)."
            ]
        })

    return result