"""Structural fidelity evaluation. Checks whether the pipeline preserves structural properties across stages: Encoded DAS → Localized DAS → Decoded dialogue Metrics: 1. Turn count preservation (encoded → localized → decoded) 2. Speaker role preservation (encoded → localized) 3. Function preservation (are communicative intents carried through?) """ import json import re import sys from pathlib import Path from typing import Any, Dict, List def load_json(path: str) -> List[Dict[str, Any]]: return json.loads(Path(path).read_text(encoding="utf-8")) def normalize_speaker(speaker: Any) -> str: s = str(speaker).strip().lower() s = re.sub(r"^speaker_?", "", s) if s in ("1", "a"): return "A" if s in ("2", "b"): return "B" return s.upper() def normalize_speaker_role(speaker: Any) -> str: """Normalize to role identity (A/B) — treats named speakers by position.""" s = str(speaker).strip().lower() s = re.sub(r"^speaker_?", "", s) if s in ("1", "a"): return "A" if s in ("2", "b"): return "B" return s.upper() def extract_function_names(functions_field: Any) -> List[str]: """Extract top-level function names from a functions field.""" if isinstance(functions_field, list): text = "; ".join(str(f) for f in functions_field) else: text = str(functions_field) return re.findall(r"(\w+)\(", text) def evaluate_dialogue( dialogue_id: Any, encoded_das: List[Dict], localized_das: List[Dict], decoded_turns: List[str], ) -> Dict[str, Any]: """Evaluate structural fidelity for one dialogue.""" result: Dict[str, Any] = { "dialogue_id": dialogue_id, "encoded_turns": len(encoded_das), "localized_turns": len(localized_das), "decoded_turns": len(decoded_turns), "issues": [], } # 1. Turn count if len(encoded_das) != len(localized_das): result["issues"].append( f"Turn count mismatch: encoded={len(encoded_das)}, localized={len(localized_das)}" ) if len(encoded_das) != len(decoded_turns): result["issues"].append( f"Turn count mismatch: encoded={len(encoded_das)}, decoded={len(decoded_turns)}" ) result["turn_count_preserved"] = ( len(encoded_das) == len(localized_das) == len(decoded_turns) ) # 2. Speaker roles — check alternation pattern is preserved, not exact names encoded_speakers = [normalize_speaker(t.get("speaker_id", "")) for t in encoded_das] localized_speakers = [normalize_speaker(t.get("speaker_id", "")) for t in localized_das] # Build role mapping: first unique speaker = A, second = B def to_role_sequence(speakers: List[str]) -> List[str]: mapping: Dict[str, str] = {} role_counter = 0 roles = [] for s in speakers: if s not in mapping: mapping[s] = chr(ord("A") + role_counter) role_counter += 1 roles.append(mapping[s]) return roles encoded_roles = to_role_sequence(encoded_speakers) localized_roles = to_role_sequence(localized_speakers) speaker_mismatches = [] for i, (er, lr) in enumerate(zip(encoded_roles, localized_roles)): if er != lr: speaker_mismatches.append( f"Turn {i+1}: encoded_role={er} ({encoded_speakers[i]}), " f"localized_role={lr} ({localized_speakers[i]})" ) if speaker_mismatches: result["issues"].append( f"Speaker role mismatches: {speaker_mismatches}" ) result["speaker_roles_preserved"] = len(speaker_mismatches) == 0 # 3. Communicative intent (function names) encoded_funcs = [extract_function_names(t.get("functions", "")) for t in encoded_das] localized_funcs = [extract_function_names(t.get("functions", "")) for t in localized_das] intent_mismatches = [] for i, (ef, lf) in enumerate(zip(encoded_funcs, localized_funcs)): if set(ef) != set(lf): intent_mismatches.append({ "turn": i + 1, "encoded": ef, "localized": lf, "missing": list(set(ef) - set(lf)), "added": list(set(lf) - set(ef)), }) result["intent_mismatches"] = intent_mismatches result["intents_preserved"] = len(intent_mismatches) == 0 result["intent_preservation_rate"] = ( 1.0 - len(intent_mismatches) / max(len(encoded_funcs), 1) ) result["fully_faithful"] = ( result["turn_count_preserved"] and result["speaker_roles_preserved"] and result["intents_preserved"] ) return result def run_evaluation( encoded_path: str, localized_path: str, decoded_path: str, label: str = "", ) -> List[Dict[str, Any]]: encoded = load_json(encoded_path) localized = load_json(localized_path) decoded = load_json(decoded_path) results = [] for i, (enc, loc, dec) in enumerate(zip(encoded, localized, decoded)): dialogue_id = enc.get("id", loc.get("dialogue_id", i + 1)) decoded_turns = dec.get("decoded_swahili", []) result = evaluate_dialogue( dialogue_id=dialogue_id, encoded_das=enc["das_encoding"], localized_das=loc["localized_das"], decoded_turns=decoded_turns, ) results.append(result) # Summary n = len(results) turn_ok = sum(1 for r in results if r["turn_count_preserved"]) speaker_ok = sum(1 for r in results if r["speaker_roles_preserved"]) intent_ok = sum(1 for r in results if r["intents_preserved"]) fully_ok = sum(1 for r in results if r["fully_faithful"]) avg_intent_rate = sum(r["intent_preservation_rate"] for r in results) / max(n, 1) print(f"\n{'=' * 60}") print(f"Structural Fidelity: {label}") print(f"{'=' * 60}") print(f" Dialogues evaluated: {n}") print(f" Turn count preserved: {turn_ok}/{n}") print(f" Speaker roles preserved: {speaker_ok}/{n}") print(f" All intents preserved: {intent_ok}/{n}") print(f" Avg intent preservation: {avg_intent_rate:.1%}") print(f" Fully faithful: {fully_ok}/{n}") # Show issues for r in results: if not r["fully_faithful"]: print(f"\n ⚠ Dialogue {r['dialogue_id']}:") for issue in r["issues"]: print(f" - {issue}") for im in r["intent_mismatches"]: print( f" - Turn {im['turn']}: " f"encoded={im['encoded']} → localized={im['localized']}" ) return results def main() -> None: encoded_path = "data/encoded/dailydialog_encoded.json" regions = { "Kenya - Nairobi": "swahili_kenya___nairobi", "Tanzania - Zanzibar": "swahili_tanzania___zanzibar", } models = { "gpt-5.1": "gpt_5_1", "qwen-3.5-122b": "qwen_3_5_122b", "gemma-3-27b-it": "gemma_3_27b_it", } all_results = {} for region_name, region_tag in regions.items(): for model_name, model_tag in models.items(): label = f"{region_name} × {model_name}" loc_path = f"data/localized/dailydialog_{region_tag}_{model_tag}_localized.json" dec_path = f"data/decoded/dailydialog_{region_tag}_{model_tag}_decoded.json" if not Path(loc_path).exists() or not Path(dec_path).exists(): print(f"\n SKIP {label}: files not found") continue results = run_evaluation(encoded_path, loc_path, dec_path, label) all_results[label] = results # Overall summary table print(f"\n{'=' * 60}") print("SUMMARY TABLE") print(f"{'=' * 60}") print(f"{'Config':<40} {'Turns':>6} {'Speak':>6} {'Intent':>6} {'Full':>6}") print("-" * 64) for label, results in all_results.items(): n = len(results) t = sum(1 for r in results if r["turn_count_preserved"]) s = sum(1 for r in results if r["speaker_roles_preserved"]) i = sum(1 for r in results if r["intents_preserved"]) f = sum(1 for r in results if r["fully_faithful"]) print(f"{label:<40} {t}/{n:>4} {s}/{n:>4} {i}/{n:>4} {f}/{n:>4}") if __name__ == "__main__": main()