Spaces:

millicentochieng
/

afridialeval

Running

File size: 8,377 Bytes

e2b8b61

"""Structural fidelity evaluation.

Checks whether the pipeline preserves structural properties across stages:
  Encoded DAS → Localized DAS → Decoded dialogue

Metrics:
  1. Turn count preservation (encoded → localized → decoded)
  2. Speaker role preservation (encoded → localized)
  3. Function preservation (encoded → localized): are each turn's communicative intents carried through?
"""

import json
import re
import sys
from itertools import zip_longest
from pathlib import Path
from typing import Any, Dict, List


def load_json(path: str) -> List[Dict[str, Any]]:
    """Parse the UTF-8 encoded JSON file at *path* and return its content.

    The annotation reflects how callers in this file use the result (a list
    of dialogue records); the parsed value itself is returned unchecked.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def normalize_speaker(speaker: Any) -> str:
    """Map a raw speaker label to a canonical upper-case identifier.

    The label is stringified, trimmed and lower-cased, then a leading
    ``speaker`` tag is removed together with any separator that follows it
    (whitespace, underscore or hyphen), so ``"speaker_1"``, ``"Speaker 1"``
    and ``"speaker-1"`` all reduce to ``"1"``.

    Args:
        speaker: Any value; it is converted with ``str()`` first.

    Returns:
        ``"A"`` for ``1``/``a``, ``"B"`` for ``2``/``b``, otherwise the
        remaining label upper-cased (possibly the empty string).
    """
    label = str(speaker).strip().lower()
    # Swallow the separator too: stripping happened *before* the prefix was
    # removed, so "speaker 1" would otherwise leave a stray leading space.
    label = re.sub(r"^speaker[\s_\-]*", "", label)
    if label in ("1", "a"):
        return "A"
    if label in ("2", "b"):
        return "B"
    return label.upper()


def normalize_speaker_role(speaker: Any) -> str:
    """Normalize a speaker label to a role identity where one is recognisable.

    A leading ``speaker`` tag and the separator after it (whitespace,
    underscore or hyphen) are dropped; ``1``/``a`` then map to ``"A"`` and
    ``2``/``b`` to ``"B"``.

    Named speakers are NOT assigned a role by position here: any other label
    is simply returned upper-cased. Callers that need positional roles must
    derive them from the order of appearance themselves.

    Args:
        speaker: Any value; it is converted with ``str()`` first.

    Returns:
        ``"A"``, ``"B"``, or the cleaned label in upper case.
    """
    cleaned = str(speaker).strip().lower()
    # Remove the tag and its separator in one go so "Speaker 1" -> "1".
    cleaned = re.sub(r"^speaker[\s_\-]*", "", cleaned)
    known_roles = {"1": "A", "a": "A", "2": "B", "b": "B"}
    return known_roles.get(cleaned, cleaned.upper())


def extract_function_names(functions_field: Any) -> List[str]:
    """Extract top-level function names from a functions field.

    A function name is a run of word characters immediately followed by
    ``(``. Only names found outside any parentheses are returned, so in
    ``Request(Inform(x))`` the result is ``["Request"]``; calls nested in
    another call's arguments are skipped.

    Args:
        functions_field: Either a list/tuple of items (each is stringified
            and scanned on its own) or any other value, which is stringified
            and scanned as a single text.

    Returns:
        The top-level names in order of appearance, duplicates included.
    """
    if isinstance(functions_field, (list, tuple)):
        chunks = [str(item) for item in functions_field]
    else:
        chunks = [str(functions_field)]

    names: List[str] = []
    for chunk in chunks:
        # Depth restarts for every item so an unbalanced parenthesis in one
        # entry cannot hide the functions of the following entries.
        depth = 0
        for token in re.finditer(r"(\w+)\(|[()]", chunk):
            if token.group(0) == ")":
                # Clamp at zero: a stray ")" must not push us "above" top level.
                depth = max(depth - 1, 0)
                continue
            if depth == 0 and token.group(1) is not None:
                names.append(token.group(1))
            depth += 1
    return names


def evaluate_dialogue(
    dialogue_id: Any,
    encoded_das: List[Dict],
    localized_das: List[Dict],
    decoded_turns: List[str],
) -> Dict[str, Any]:
    """Evaluate structural fidelity for one dialogue.

    Three checks are performed and recorded in the returned dict:

    1. Turn count: the encoded, localized and decoded sequences must all
       have the same length.
    2. Speaker roles: each turn's ``speaker_id`` is normalised and replaced
       by a role letter assigned in order of first appearance, so the
       turn-taking pattern is compared rather than the literal names. Only
       turns present on both sides are compared.
    3. Intents: the set of function names in each turn's ``functions`` field
       must be equal between encoded and localized. A turn that exists on
       only one side is compared against an empty set, so dropped or
       invented turns that carry functions count as mismatches instead of
       being silently ignored.

    Args:
        dialogue_id: Identifier copied into the result.
        encoded_das: Encoded turns; ``speaker_id`` and ``functions`` are read
            with ``.get`` and default to the empty string.
        localized_das: Localized turns, read the same way.
        decoded_turns: Decoded utterances; only their count is used.

    Returns:
        A dict with the keys ``dialogue_id``, ``encoded_turns``,
        ``localized_turns``, ``decoded_turns``, ``issues``,
        ``turn_count_preserved``, ``speaker_roles_preserved``,
        ``intent_mismatches``, ``intents_preserved``,
        ``intent_preservation_rate`` (in ``[0, 1]``) and ``fully_faithful``.
    """
    n_encoded = len(encoded_das)
    n_localized = len(localized_das)
    n_decoded = len(decoded_turns)

    result: Dict[str, Any] = {
        "dialogue_id": dialogue_id,
        "encoded_turns": n_encoded,
        "localized_turns": n_localized,
        "decoded_turns": n_decoded,
        "issues": [],
    }

    # 1. Turn count
    if n_encoded != n_localized:
        result["issues"].append(
            f"Turn count mismatch: encoded={n_encoded}, localized={n_localized}"
        )
    if n_encoded != n_decoded:
        result["issues"].append(
            f"Turn count mismatch: encoded={n_encoded}, decoded={n_decoded}"
        )

    result["turn_count_preserved"] = n_encoded == n_localized == n_decoded

    # 2. Speaker roles — check alternation pattern is preserved, not exact names
    encoded_speakers = [normalize_speaker(t.get("speaker_id", "")) for t in encoded_das]
    localized_speakers = [normalize_speaker(t.get("speaker_id", "")) for t in localized_das]

    def to_role_sequence(speakers: List[str]) -> List[str]:
        """Relabel speakers as A, B, C, ... in order of first appearance."""
        mapping: Dict[str, str] = {}
        roles = []
        for name in speakers:
            # len(mapping) is the number of distinct speakers seen so far.
            roles.append(mapping.setdefault(name, chr(ord("A") + len(mapping))))
        return roles

    encoded_roles = to_role_sequence(encoded_speakers)
    localized_roles = to_role_sequence(localized_speakers)

    # zip() deliberately limits this check to turns that exist on both sides;
    # a differing number of turns is already reported by check 1.
    speaker_mismatches = [
        f"Turn {i+1}: encoded_role={er} ({encoded_speakers[i]}), "
        f"localized_role={lr} ({localized_speakers[i]})"
        for i, (er, lr) in enumerate(zip(encoded_roles, localized_roles))
        if er != lr
    ]
    if speaker_mismatches:
        result["issues"].append(
            f"Speaker role mismatches: {speaker_mismatches}"
        )

    result["speaker_roles_preserved"] = not speaker_mismatches

    # 3. Communicative intent (function names)
    encoded_funcs = [extract_function_names(t.get("functions", "")) for t in encoded_das]
    localized_funcs = [extract_function_names(t.get("functions", "")) for t in localized_das]

    intent_mismatches = []
    # zip_longest pads the shorter side with "no functions" so that intents of
    # dropped (or invented) turns are reported rather than truncated away.
    paired_funcs = zip_longest(encoded_funcs, localized_funcs, fillvalue=[])
    for turn_no, (ef, lf) in enumerate(paired_funcs, start=1):
        enc_set, loc_set = set(ef), set(lf)
        if enc_set != loc_set:
            intent_mismatches.append({
                "turn": turn_no,
                "encoded": list(ef),
                "localized": list(lf),
                # Sorted so the report is identical from run to run.
                "missing": sorted(enc_set - loc_set),
                "added": sorted(loc_set - enc_set),
            })

    # Divide by the number of turns actually compared so that extra localized
    # turns cannot drive the rate below zero; the floor of 1 avoids 0/0.
    compared_turns = max(len(encoded_funcs), len(localized_funcs), 1)

    result["intent_mismatches"] = intent_mismatches
    result["intents_preserved"] = not intent_mismatches
    result["intent_preservation_rate"] = (
        1.0 - len(intent_mismatches) / compared_turns
    )

    result["fully_faithful"] = (
        result["turn_count_preserved"]
        and result["speaker_roles_preserved"]
        and result["intents_preserved"]
    )

    return result


def run_evaluation(
    encoded_path: str,
    localized_path: str,
    decoded_path: str,
    label: str = "",
) -> List[Dict[str, Any]]:
    """Evaluate every dialogue of one configuration and print a report.

    The three JSON files are loaded and their records are paired by position.
    If the files hold different numbers of dialogues a warning is printed and
    only the common leading part is evaluated.

    Args:
        encoded_path: JSON file whose records provide ``das_encoding`` and,
            optionally, ``id``.
        localized_path: JSON file whose records provide ``localized_das``
            and, optionally, ``dialogue_id``.
        decoded_path: JSON file whose records may provide
            ``decoded_swahili`` (defaults to an empty list).
        label: Heading used in the printed report.

    Returns:
        One result dict per evaluated dialogue, as produced by
        ``evaluate_dialogue``.

    Raises:
        KeyError: If a record lacks ``das_encoding`` or ``localized_das``.
    """
    encoded = load_json(encoded_path)
    localized = load_json(localized_path)
    decoded = load_json(decoded_path)

    # zip() below stops at the shortest input; make that truncation visible
    # instead of silently reporting on fewer dialogues than expected.
    counts = (len(encoded), len(localized), len(decoded))
    if len(set(counts)) > 1:
        print(
            f"\n  WARNING {label}: dialogue counts differ "
            f"(encoded={counts[0]}, localized={counts[1]}, decoded={counts[2]}); "
            f"evaluating only the first {min(counts)}"
        )

    results = []
    for position, (enc, loc, dec) in enumerate(zip(encoded, localized, decoded), start=1):
        results.append(
            evaluate_dialogue(
                # Prefer the encoded id, then the localized one, then the position.
                dialogue_id=enc.get("id", loc.get("dialogue_id", position)),
                encoded_das=enc["das_encoding"],
                localized_das=loc["localized_das"],
                decoded_turns=dec.get("decoded_swahili", []),
            )
        )

    def count_true(key: str) -> int:
        """Number of dialogues whose flag *key* is truthy."""
        return sum(1 for r in results if r[key])

    # Summary
    n = len(results)
    turn_ok = count_true("turn_count_preserved")
    speaker_ok = count_true("speaker_roles_preserved")
    intent_ok = count_true("intents_preserved")
    fully_ok = count_true("fully_faithful")
    # max(n, 1) keeps the average defined when nothing was evaluated.
    avg_intent_rate = sum(r["intent_preservation_rate"] for r in results) / max(n, 1)

    print(f"\n{'=' * 60}")
    print(f"Structural Fidelity: {label}")
    print(f"{'=' * 60}")
    print(f"  Dialogues evaluated:       {n}")
    print(f"  Turn count preserved:      {turn_ok}/{n}")
    print(f"  Speaker roles preserved:   {speaker_ok}/{n}")
    print(f"  All intents preserved:     {intent_ok}/{n}")
    print(f"  Avg intent preservation:   {avg_intent_rate:.1%}")
    print(f"  Fully faithful:            {fully_ok}/{n}")

    # Show issues
    for r in results:
        if r["fully_faithful"]:
            continue
        print(f"\n  ⚠ Dialogue {r['dialogue_id']}:")
        for issue in r["issues"]:
            print(f"    - {issue}")
        for im in r["intent_mismatches"]:
            print(
                f"    - Turn {im['turn']}: "
                f"encoded={im['encoded']} → localized={im['localized']}"
            )

    return results


def main() -> None:
    """Evaluate every region × model configuration and print a summary table.

    For each combination the localized and decoded file paths are derived
    from the region and model tags. Combinations whose files are missing are
    skipped with a message; the rest are passed to ``run_evaluation``.
    Finally one table row per evaluated configuration is printed, showing
    how many dialogues passed each check.
    """
    encoded_path = "data/encoded/dailydialog_encoded.json"
    regions = {
        "Kenya - Nairobi": "swahili_kenya___nairobi",
        "Tanzania - Zanzibar": "swahili_tanzania___zanzibar",
    }
    models = {
        "gpt-5.1": "gpt_5_1",
        "qwen-3.5-122b": "qwen_3_5_122b",
        "gemma-3-27b-it": "gemma_3_27b_it",
    }

    all_results: Dict[str, List[Dict[str, Any]]] = {}

    for region_name, region_tag in regions.items():
        for model_name, model_tag in models.items():
            label = f"{region_name} × {model_name}"
            loc_path = f"data/localized/dailydialog_{region_tag}_{model_tag}_localized.json"
            dec_path = f"data/decoded/dailydialog_{region_tag}_{model_tag}_decoded.json"

            if not (Path(loc_path).exists() and Path(dec_path).exists()):
                print(f"\n  SKIP {label}: files not found")
                continue

            all_results[label] = run_evaluation(encoded_path, loc_path, dec_path, label)

    # Overall summary table
    columns = (
        ("Turns", "turn_count_preserved"),
        ("Speak", "speaker_roles_preserved"),
        ("Intent", "intents_preserved"),
        ("Full", "fully_faithful"),
    )
    # One shared width for headers and cells; wide enough for "1000/1000".
    col_width = 9
    header = f"{'Config':<40}" + "".join(
        f" {title:>{col_width}}" for title, _ in columns
    )

    print(f"\n{'=' * 60}")
    print("SUMMARY TABLE")
    print(f"{'=' * 60}")
    print(header)
    print("-" * len(header))
    for label, results in all_results.items():
        total = len(results)
        # Format the whole "passed/total" ratio first, then right-align it as
        # one cell; padding only the denominator leaves gaps like "5/  20"
        # and rows that do not line up with the header.
        cells = [
            f"{sum(1 for r in results if r[key])}/{total}" for _, key in columns
        ]
        print(f"{label:<40}" + "".join(f" {cell:>{col_width}}" for cell in cells))


# Run the evaluation only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()