Spaces:

momenalhamza
/

multilingual-chatbot

Sleeping

File size: 17,812 Bytes

469ef7f

"""Compile the final evaluation report for the multilingual chatbot.

Two parts:

  1. Aggregate per-model metrics from existing eval_results.json files
     (lang_detector, intent_classifier, ner_model). These were measured on
     the proper held-out test splits during their training scripts.

  2. Run an end-to-end smoke eval on a curated 15-case test set covering
     each language x several intents. Verifies the full pipeline (lang ->
     intent -> NER -> RAG -> Qwen) and measures per-stage latency.

Outputs:
  evaluation_report.md       (human-readable)
  evaluation_summary.json    (machine-readable)

Usage:
  python src/evaluate.py
  python src/evaluate.py --skip-e2e         # only aggregate stored metrics
  python src/evaluate.py --quick            # fewer E2E cases
"""

from __future__ import annotations

import argparse
import json
import statistics
import sys
import time
from pathlib import Path
from typing import Any

# Repository root: two directory levels above this file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Put the repository root on sys.path so that `src.chatbot` can be imported
# when this script is launched directly as `python src/evaluate.py`.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Stored per-model evaluation results (inputs).
LANG_EVAL = PROJECT_ROOT.joinpath("models", "lang_detector", "eval_results.json")
INTENT_EVAL = PROJECT_ROOT.joinpath("models", "intent_classifier", "eval_results.json")
NER_EVAL = PROJECT_ROOT.joinpath("models", "ner_model", "eval_results.json")

# Generated reports (outputs), written at the repository root.
REPORT_MD = PROJECT_ROOT.joinpath("evaluation_report.md")
REPORT_JSON = PROJECT_ROOT.joinpath("evaluation_summary.json")

# Curated end-to-end smoke cases, one tuple per utterance:
#   (text, expected_lang, expected_intents)
# expected_intents is a SET — many natural utterances are validly classified
# as either "booking" or "inquiry", so we accept anything in the set.
# Layout: five cases per language (EN, then AR, then FR), each group ordered
# greeting, booking, inquiry, complaint, farewell.
# NOTE: main() runs only the first 6 entries under --quick, so reordering this
# list changes what the quick run covers.
E2E_CASES: list[tuple[str, str, set[str]]] = [
    ("Hello there!",                                      "EN", {"greeting"}),
    ("How can I book a hotel in Paris?",                  "EN", {"booking", "inquiry"}),
    ("What payment methods do you accept?",               "EN", {"inquiry"}),
    ("My order arrived damaged, I want a refund.",        "EN", {"complaint"}),
    ("Goodbye, see you next time.",                       "EN", {"farewell"}),

    ("مرحباً، كيف حالك؟",                                  "AR", {"greeting"}),
    ("أحتاج أحجز فندقاً في مكة المكرمة",                   "AR", {"booking", "inquiry"}),
    ("ما هي طرق الدفع المتاحة لديكم؟",                    "AR", {"inquiry"}),
    ("لدي شكوى بخصوص طلبي الأخير",                        "AR", {"complaint"}),
    ("إلى اللقاء، شكراً لكم",                              "AR", {"farewell"}),

    ("Bonjour, comment allez-vous ?",                     "FR", {"greeting"}),
    ("Comment puis-je réserver un vol pour Lyon ?",       "FR", {"booking", "inquiry"}),
    ("Quels sont vos horaires d'ouverture ?",             "FR", {"inquiry"}),
    ("Je voudrais déposer une réclamation.",              "FR", {"complaint"}),
    ("Au revoir et bonne journée",                        "FR", {"farewell"}),
]


# --------------------------------------------------------------------------- #
# Part 1 — aggregate stored per-model metrics
# --------------------------------------------------------------------------- #

def _load_json_safely(path: Path) -> dict[str, Any] | None:
    """Read *path* as a UTF-8 JSON object, or return ``None`` if it is unusable.

    ``None`` is returned when the path is missing or cannot be read, when the
    content is not valid UTF-8 or not valid JSON, and when the document parses
    to something other than a JSON object. Callers immediately use ``.get`` on
    the result, so a top-level list or scalar is rejected here rather than
    crashing later.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed object as a dict, or ``None``.
    """
    try:
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable path; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return None
    return payload if isinstance(payload, dict) else None


def _as_dict(value: Any) -> dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _label_names(raw: Any) -> list[Any]:
    """Normalise a ``labels`` field to a list: dict -> its keys, list -> its items."""
    if isinstance(raw, (dict, list, tuple)):
        return list(raw)
    return []


def _missing_entry(path: Path) -> dict[str, str]:
    """Build the ``{"error": ...}`` entry for a metrics file that could not be used."""
    # All three files share the basename eval_results.json, so the model
    # directory is included to say which one is meant.
    return {"error": f"missing {path.parent.name}/{path.name}"}


def _classifier_summary(data: dict[str, Any]) -> dict[str, Any]:
    """Extract the headline fields shared by the two classifier result files."""
    metrics = _as_dict(data.get("test_metrics"))
    return {
        "model": data.get("model_name"),
        "task": data.get("task"),
        "test_accuracy": metrics.get("test_accuracy"),
        # The stored "test_f1" value is exposed under the weighted-F1 key.
        "test_f1_weighted": metrics.get("test_f1"),
        "test_f1_macro": metrics.get("test_f1_macro"),
        "labels": _label_names(data.get("labels")),
    }


def aggregate_model_metrics() -> dict[str, Any]:
    """Pull headline numbers from each per-model eval_results.json.

    Reads LANG_EVAL, INTENT_EVAL and NER_EVAL through ``_load_json_safely``.

    Returns:
        A dict with the keys ``language_detector``, ``intent_classifier`` and
        ``ner_model``. Each value is either the extracted summary or
        ``{"error": "missing <model_dir>/eval_results.json"}`` when the file
        is absent, unparsable, empty, or not a JSON object. Fields that are
        absent from a file come back as ``None`` (or an empty list/dict).
    """
    # Aggregate rows of the classification report that are not entity types.
    aggregate_rows = ("micro avg", "macro avg", "weighted avg", "accuracy")
    summary: dict[str, Any] = {}

    lang = _load_json_safely(LANG_EVAL)
    if isinstance(lang, dict) and lang:
        summary["language_detector"] = _classifier_summary(lang)
    else:
        summary["language_detector"] = _missing_entry(LANG_EVAL)

    intent = _load_json_safely(INTENT_EVAL)
    if isinstance(intent, dict) and intent:
        summary["intent_classifier"] = {
            **_classifier_summary(intent),
            "per_language": _as_dict(intent.get("per_language")),
        }
    else:
        summary["intent_classifier"] = _missing_entry(INTENT_EVAL)

    ner = _load_json_safely(NER_EVAL)
    if isinstance(ner, dict) and ner:
        metrics = _as_dict(ner.get("test_metrics"))
        report = _as_dict(ner.get("classification_report"))
        summary["ner_model"] = {
            "model": ner.get("model_name"),
            "task": ner.get("task"),
            # The stored test_f1/precision/recall are exposed under micro-* keys.
            "test_f1_micro": metrics.get("test_f1"),
            "test_precision_micro": metrics.get("test_precision"),
            "test_recall_micro": metrics.get("test_recall"),
            "labels": _label_names(ner.get("labels")),
            "per_language": _as_dict(ner.get("per_language")),
            "per_entity_f1": {
                name: row.get("f1-score")
                for name, row in report.items()
                if isinstance(row, dict) and name not in aggregate_rows
            },
        }
    else:
        summary["ner_model"] = _missing_entry(NER_EVAL)

    return summary


# --------------------------------------------------------------------------- #
# Part 2 — end-to-end smoke eval
# --------------------------------------------------------------------------- #

def _latency_stats(latencies: list[float]) -> dict[str, float | None]:
    """Summarise wall-clock latencies in seconds, each rounded to 3 decimals.

    Returns the keys ``mean``, ``median``, ``p95`` and ``max``. All four are
    ``None`` for an empty list; ``p95`` is also ``None`` below 5 samples.
    """
    count = len(latencies)
    if count == 0:
        return {"mean": None, "median": None, "p95": None, "max": None}

    p95 = None
    if count >= 5:
        # Nearest-rank percentile: rank = ceil(0.95 * count), done in integer
        # arithmetic so float rounding cannot shift the rank. Truncating
        # (int(0.95 * n) - 1) lands one sample too low whenever 0.95 * n is
        # not a whole number, e.g. the 14th of 15 samples.
        rank = (95 * count + 99) // 100
        p95 = round(sorted(latencies)[rank - 1], 3)

    return {
        "mean": round(statistics.mean(latencies), 3),
        "median": round(statistics.median(latencies), 3),
        "p95": p95,
        "max": round(max(latencies), 3),
    }


def run_e2e(cases: list[tuple[str, str, set[str]]]) -> dict[str, Any]:
    """Run the full chatbot pipeline against curated cases.

    The Chatbot is imported lazily so --skip-e2e doesn't pay the
    model-loading cost. Each case is sent through ``bot.respond(text)``; the
    returned turn is read for ``language``, ``intent``, ``intent_confidence``,
    ``entities``, ``retrieved``, ``generator_used`` and ``reply``. One progress
    line per case is printed to stdout.

    Args:
        cases: ``(text, expected_language, expected_intents)`` tuples. A
            prediction counts as correct when the intent is in the set.

    Returns:
        A dict with ``n_cases``, ``language_accuracy``,
        ``intent_accuracy_in_set``, ``retrieval_triggered`` (a count),
        ``nonempty_reply_rate``, ``latency_seconds`` (mean/median/p95/max)
        and ``cases`` (per-case detail rows). With no cases the rates are
        0.0 and the latency statistics are ``None``.
    """
    from src.chatbot import Chatbot  # noqa: PLC0415 — intentional lazy import

    print("\n[e2e] loading chatbot (this triggers model load + ~10 s startup) ...")
    bot = Chatbot()

    total = len(cases)
    results: list[dict[str, Any]] = []
    latencies: list[float] = []
    correct_lang = 0
    correct_intent = 0
    rag_used = 0
    nonempty_replies = 0

    for i, (text, exp_lang, exp_intents) in enumerate(cases, start=1):
        # Time only the respond() call, not the bookkeeping below.
        t0 = time.perf_counter()
        turn = bot.respond(text)
        dt = time.perf_counter() - t0
        latencies.append(dt)

        lang_ok = (turn.language == exp_lang)
        intent_ok = (turn.intent in exp_intents)
        reply_ok = bool(turn.reply.strip())

        correct_lang += lang_ok
        correct_intent += intent_ok
        nonempty_replies += reply_ok
        if turn.retrieved:
            rag_used += 1

        results.append({
            "i": i,
            "text": text,
            "expected_language": exp_lang,
            "predicted_language": turn.language,
            "language_ok": lang_ok,
            # Sets are not JSON-serialisable; store a sorted list instead.
            "expected_intents": sorted(exp_intents),
            "predicted_intent": turn.intent,
            "intent_confidence": turn.intent_confidence,
            "intent_ok": intent_ok,
            "n_entities": len(turn.entities),
            "n_retrieved": len(turn.retrieved),
            "generator_used": turn.generator_used,
            "reply": turn.reply,
            "latency_s": round(dt, 3),
        })

        flag = "✓" if (lang_ok and intent_ok and reply_ok) else "✗"
        print(f"  [{flag}] case {i:>2}/{total}  "
              f"lang={turn.language}({'ok' if lang_ok else f'≠{exp_lang}'})  "
              f"intent={turn.intent}({'ok' if intent_ok else 'unexpected'})  "
              f"latency={dt:.2f}s")

    def rate(hits: int) -> float:
        # Guard the empty case instead of dividing by zero.
        return hits / total if total else 0.0

    return {
        "n_cases": total,
        "language_accuracy": rate(correct_lang),
        "intent_accuracy_in_set": rate(correct_intent),
        "retrieval_triggered": rag_used,
        "nonempty_reply_rate": rate(nonempty_replies),
        "latency_seconds": _latency_stats(latencies),
        "cases": results,
    }


# --------------------------------------------------------------------------- #
# Markdown rendering
# --------------------------------------------------------------------------- #

def _fmt_pct(x: float | None, digits: int = 4) -> str:
    """Format a numeric metric with *digits* decimals; anything else becomes "—".

    The value is printed as given: it is not multiplied by 100 and no percent
    sign is appended.
    """
    if not isinstance(x, (int, float)):
        return "—"
    return format(x, f".{digits}f")


def _md_cell(text: str, limit: int = 50) -> str:
    """Prepare *text* for a single-line Markdown table cell.

    Line breaks become spaces, text longer than *limit* characters is cut to
    ``limit - 3`` characters plus "...", and only then are pipes escaped.
    Escaping last matters: truncating an already-escaped string can cut
    between the backslash and the pipe, and the escape characters would count
    against the limit.
    """
    flat = text.replace("\r", " ").replace("\n", " ")
    if len(flat) > limit:
        flat = flat[:limit - 3] + "..."
    return flat.replace("|", "\\|")


def _fmt_seconds(value: float | None) -> str:
    """Render a latency as "<value>s", or "—" (without a unit) when it is None."""
    return f"{value}s" if value is not None else "—"


def render_markdown(model_summary: dict[str, Any],
                    e2e: dict[str, Any] | None) -> str:
    """Render the evaluation report as a Markdown document.

    Args:
        model_summary: Mapping with the keys ``language_detector``,
            ``intent_classifier`` and ``ner_model``; missing keys or metrics
            are rendered as "?" / "—".
        e2e: End-to-end result dict (``n_cases``, the three rates,
            ``retrieval_triggered``, ``latency_seconds`` and ``cases``), or
            ``None`` when the end-to-end run was skipped.

    Returns:
        The report text, with lines joined by "\\n" and no trailing newline.
    """
    out: list[str] = []
    out.append("# Multilingual Chatbot — Evaluation Report")
    out.append("")
    out.append("End-to-end multilingual customer-service chatbot built around three "
               "fine-tuned DistilBERT classifiers, a FAISS retrieval index over a "
               "36-row knowledge base, and Qwen2.5-0.5B-Instruct as the response "
               "generator. This report aggregates per-model held-out test metrics "
               "and a full-pipeline smoke evaluation.")
    out.append("")

    # --- Section 1: stored per-model metrics -------------------------------
    out.append("## 1. Per-model test metrics")
    out.append("")
    out.append("| Component | Model | Headline metric (held-out test) |")
    out.append("|-----------|-------|---------------------------------|")
    ld = model_summary.get("language_detector", {})
    out.append(f"| Language detector | `{ld.get('model', '?')}` | "
               f"acc {_fmt_pct(ld.get('test_accuracy'))} · "
               f"F1 macro {_fmt_pct(ld.get('test_f1_macro'))} · "
               f"F1 weighted {_fmt_pct(ld.get('test_f1_weighted'))} |")
    ic = model_summary.get("intent_classifier", {})
    out.append(f"| Intent classifier | `{ic.get('model', '?')}` | "
               f"acc {_fmt_pct(ic.get('test_accuracy'))} · "
               f"F1 macro {_fmt_pct(ic.get('test_f1_macro'))} · "
               f"F1 weighted {_fmt_pct(ic.get('test_f1_weighted'))} |")
    nm = model_summary.get("ner_model", {})
    out.append(f"| NER (entity-level) | `{nm.get('model', '?')}` | "
               f"P {_fmt_pct(nm.get('test_precision_micro'))} · "
               f"R {_fmt_pct(nm.get('test_recall_micro'))} · "
               f"F1 micro {_fmt_pct(nm.get('test_f1_micro'))} |")
    out.append("")

    # The optional breakdown tables are emitted only when data is present.
    if isinstance(ic.get("per_language"), dict) and ic["per_language"]:
        out.append("### Intent — per language")
        out.append("")
        out.append("| Language | n | accuracy | F1 weighted | F1 macro |")
        out.append("|----------|---|----------|-------------|----------|")
        for lang, m in sorted(ic["per_language"].items()):
            out.append(f"| {lang} | {m.get('n', '—')} | "
                       f"{_fmt_pct(m.get('accuracy'))} | "
                       f"{_fmt_pct(m.get('f1_weighted'))} | "
                       f"{_fmt_pct(m.get('f1_macro'))} |")
        out.append("")

    if isinstance(nm.get("per_entity_f1"), dict) and nm["per_entity_f1"]:
        out.append("### NER — per entity type (F1)")
        out.append("")
        out.append("| Entity | F1 |")
        out.append("|--------|----|")
        for ent, f1 in sorted(nm["per_entity_f1"].items()):
            out.append(f"| {ent} | {_fmt_pct(f1)} |")
        out.append("")

    if isinstance(nm.get("per_language"), dict) and nm["per_language"]:
        out.append("### NER — per language (entity-level F1)")
        out.append("")
        out.append("| Language | n | precision | recall | F1 |")
        out.append("|----------|---|-----------|--------|----|")
        for lang, m in sorted(nm["per_language"].items()):
            out.append(f"| {lang} | {m.get('n', '—')} | "
                       f"{_fmt_pct(m.get('precision'))} | "
                       f"{_fmt_pct(m.get('recall'))} | "
                       f"{_fmt_pct(m.get('f1'))} |")
        out.append("")

    # --- Section 2: end-to-end smoke evaluation ----------------------------
    out.append("## 2. End-to-end pipeline evaluation")
    out.append("")
    if e2e is None:
        out.append("_E2E evaluation skipped (use `python src/evaluate.py` without "
                   "`--skip-e2e` to run it)._")
        out.append("")
    else:
        n = e2e["n_cases"]
        cases = e2e["cases"]
        # Describe what was actually run (a --quick run covers fewer
        # languages). Intents are listed in first-seen order.
        languages = "/".join(sorted({c["expected_language"] for c in cases}))
        intents = " / ".join(dict.fromkeys(
            intent for c in cases for intent in c["expected_intents"]))
        coverage = f" covering {languages} × {intents}" if cases else ""
        out.append(f"Tested **{n} curated messages**{coverage}.")
        out.append("")
        out.append("| Metric | Value |")
        out.append("|--------|-------|")
        # Hit counts are recovered from the stored rates (rate * n).
        out.append(f"| Language accuracy | {_fmt_pct(e2e['language_accuracy'])} "
                   f"({int(round(e2e['language_accuracy'] * n))}/{n}) |")
        out.append(f"| Intent accuracy (in-set) | "
                   f"{_fmt_pct(e2e['intent_accuracy_in_set'])} "
                   f"({int(round(e2e['intent_accuracy_in_set'] * n))}/{n}) |")
        out.append(f"| Non-empty reply rate | "
                   f"{_fmt_pct(e2e['nonempty_reply_rate'])} "
                   f"({int(round(e2e['nonempty_reply_rate'] * n))}/{n}) |")
        out.append(f"| Retrieval triggered | {e2e['retrieval_triggered']}/{n} "
                   "(expected: only on inquiry/booking/complaint) |")
        lat = e2e["latency_seconds"]
        out.append(f"| Latency: mean / median / p95 / max | "
                   f"{_fmt_seconds(lat['mean'])} / {_fmt_seconds(lat['median'])} / "
                   f"{_fmt_seconds(lat['p95'])} / "
                   f"{_fmt_seconds(lat['max'])} |")
        out.append("")

        out.append("### Per-case detail")
        out.append("")
        out.append("| # | text | lang (pred / exp) | intent (pred / exp) | "
                   "retrieved | gen | latency |")
        out.append("|---|------|------|------|-----------|-----|---------|")
        for c in cases:
            # Mismatches are bolded and marked so they stand out in the table.
            lang_cell = f"{c['predicted_language']} / {c['expected_language']}"
            if not c["language_ok"]:
                lang_cell = f"**{lang_cell} ✗**"
            intent_cell = f"{c['predicted_intent']} / {','.join(c['expected_intents'])}"
            if not c["intent_ok"]:
                intent_cell = f"**{intent_cell} ✗**"
            out.append(f"| {c['i']} | {_md_cell(c['text'])} | {lang_cell} | "
                       f"{intent_cell} | {c['n_retrieved']} | "
                       f"{'Y' if c['generator_used'] else 'n'} | "
                       f"{c['latency_s']}s |")
        out.append("")

    # --- Section 3: static architecture description ------------------------
    out.append("## 3. Architecture summary")
    out.append("")
    out.append("```")
    out.append("user_text")
    out.append("  -> Language Detector  (DistilBERT, CPU)        AR / CS / EN / FR")
    out.append("  -> Intent Classifier  (DistilBERT, CPU)        booking | complaint | farewell |")
    out.append("                                                 greeting | inquiry | other")
    out.append("  -> NER                (DistilBERT, CPU)        PER / LOC / ORG / DATE (BIO)")
    out.append("  -> Branch on intent:")
    out.append("       greeting / farewell  -> canned reply (instant)")
    out.append("       inquiry / booking /  -> RAG retrieve top-3 (FAISS, MiniLM, CPU)")
    out.append("         complaint            + Qwen2.5-0.5B generates reply (cuda fp16)")
    out.append("       other                -> Qwen2.5-0.5B from general knowledge")
    out.append("  -> Reply (always in user's detected language; CS uses EN)")
    out.append("```")
    out.append("")
    out.append("All metrics in §1 were computed on held-out test splits during each "
               "model's training script. The §2 numbers are an additional integration "
               "check on hand-curated messages.")
    return "\n".join(out)


# --------------------------------------------------------------------------- #
# Entry point
# --------------------------------------------------------------------------- #

def main() -> int:
    """Command-line entry point: build and write both evaluation reports.

    Aggregates the stored per-model metrics, optionally runs the end-to-end
    smoke evaluation (all cases, or the first 6 with --quick; none with
    --skip-e2e), then writes REPORT_MD and REPORT_JSON as UTF-8.

    Returns:
        0, for use as the process exit status.
    """
    # __doc__ is None when docstrings are stripped (python -OO).
    parser = argparse.ArgumentParser(description=(__doc__ or "").split("\n")[0])
    parser.add_argument("--skip-e2e", action="store_true",
                        help="Don't load models or run the E2E smoke eval; only "
                             "aggregate stored eval_results.json files.")
    parser.add_argument("--quick", action="store_true",
                        help="Run only the first 6 E2E cases.")
    args = parser.parse_args()

    print("=" * 72)
    print("Multilingual chatbot — final evaluation")
    print("=" * 72)

    print("\n[1/2] Aggregating per-model metrics from eval_results.json files ...")
    model_summary = aggregate_model_metrics()
    for k, v in model_summary.items():
        if "error" in v:
            print(f"  - {k}: {v['error']}")
        else:
            print(f"  - {k}: loaded ({v.get('model', '?')})")

    e2e = None
    if args.skip_e2e:
        print("\n[2/2] Skipping E2E evaluation (--skip-e2e).")
    else:
        cases = E2E_CASES[:6] if args.quick else E2E_CASES
        print(f"\n[2/2] Running E2E pipeline on {len(cases)} curated cases ...")
        e2e = run_e2e(cases)

    md = render_markdown(model_summary, e2e)
    REPORT_MD.write_text(md, encoding="utf-8")
    summary = {"models": model_summary, "e2e": e2e}
    # ensure_ascii=False keeps the non-ASCII test utterances readable, so the
    # file must be written as UTF-8 explicitly; the platform default encoding
    # may be unable to encode them.
    # NOTE(review): json.dumps raises TypeError if the chatbot returns values
    # that are not JSON-native (e.g. a tensor-like intent_confidence) — confirm.
    REPORT_JSON.write_text(json.dumps(summary, indent=2, ensure_ascii=False),
                           encoding="utf-8")

    print(f"\n[OK] Wrote {REPORT_MD.relative_to(PROJECT_ROOT)}")
    print(f"[OK] Wrote {REPORT_JSON.relative_to(PROJECT_ROOT)}")
    return 0


if __name__ == "__main__":
    sys.exit(main())