# Source: multilingual-chatbot / src / evaluate.py
# Last commit by momenalhamza (469ef7f, verified):
#   Deploy chatbot: code + RAG + Qwen (3 BERT classifiers loaded from HF Hub)
"""Compile the final evaluation report for the multilingual chatbot.
Two parts:
1. Aggregate per-model metrics from existing eval_results.json files
(lang_detector, intent_classifier, ner_model). These were measured on
the proper held-out test splits during their training scripts.
2. Run an end-to-end smoke eval on a curated 15-case test set covering
each language x several intents. Verifies the full pipeline (lang ->
intent -> NER -> RAG -> Qwen) and measures per-stage latency.
Outputs:
evaluation_report.md (human-readable)
evaluation_summary.json (machine-readable)
Usage:
python src/evaluate.py
python src/evaluate.py --skip-e2e # only aggregate stored metrics
python src/evaluate.py --quick # fewer E2E cases
"""
from __future__ import annotations
import argparse
import json
import statistics
import sys
import time
from pathlib import Path
from typing import Any
# Repository root: two directory levels above this file, which is expected to
# be run as `python src/evaluate.py`.
PROJECT_ROOT = Path(__file__).resolve().parent.parent

# Put the repository root on sys.path (once) so that `src.chatbot` can be
# imported when this file is executed directly as a script.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Stored per-model evaluation results that the aggregation step reads.
LANG_EVAL = PROJECT_ROOT.joinpath("models", "lang_detector", "eval_results.json")
INTENT_EVAL = PROJECT_ROOT.joinpath("models", "intent_classifier", "eval_results.json")
NER_EVAL = PROJECT_ROOT.joinpath("models", "ner_model", "eval_results.json")

# Output files written by main(), both placed at the repository root.
REPORT_MD = PROJECT_ROOT.joinpath("evaluation_report.md")
REPORT_JSON = PROJECT_ROOT.joinpath("evaluation_summary.json")
# Curated end-to-end smoke cases, one tuple per user message:
#   (text, expected_lang, expected_intents)
# expected_intents is a SET: many natural utterances are validly classified
# as either "booking" or "inquiry", so any predicted intent contained in the
# set counts as correct.
# The list is ordered EN, AR, FR with five messages each; `--quick` takes a
# prefix of this list, so the ordering determines which cases it runs.
E2E_CASES: list[tuple[str, str, set[str]]] = [
    # English
    ("Hello there!", "EN", {"greeting"}),
    ("How can I book a hotel in Paris?", "EN", {"booking", "inquiry"}),
    ("What payment methods do you accept?", "EN", {"inquiry"}),
    ("My order arrived damaged, I want a refund.", "EN", {"complaint"}),
    ("Goodbye, see you next time.", "EN", {"farewell"}),
    # Arabic
    ("مرحباً، كيف حالك؟", "AR", {"greeting"}),
    ("أحتاج أحجز فندقاً في مكة المكرمة", "AR", {"booking", "inquiry"}),
    ("ما هي طرق الدفع المتاحة لديكم؟", "AR", {"inquiry"}),
    ("لدي شكوى بخصوص طلبي الأخير", "AR", {"complaint"}),
    ("إلى اللقاء، شكراً لكم", "AR", {"farewell"}),
    # French
    ("Bonjour, comment allez-vous ?", "FR", {"greeting"}),
    ("Comment puis-je réserver un vol pour Lyon ?", "FR", {"booking", "inquiry"}),
    ("Quels sont vos horaires d'ouverture ?", "FR", {"inquiry"}),
    ("Je voudrais déposer une réclamation.", "FR", {"complaint"}),
    ("Au revoir et bonne journée", "FR", {"farewell"}),
]
# --------------------------------------------------------------------------- #
# Part 1 — aggregate stored per-model metrics
# --------------------------------------------------------------------------- #
def _load_json_safely(path: Path) -> dict[str, Any] | None:
    """Read *path* as UTF-8 JSON and return it if the top-level value is an object.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded ``dict``, or ``None`` when the file is missing or
        unreadable, is not valid UTF-8 / JSON, or holds a non-object root
        (list, number, ...). Returning ``None`` for a non-object root keeps
        the declared return type honest for callers that use ``.get``.
    """
    try:
        # Decode explicitly as UTF-8 instead of the platform locale encoding.
        payload = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing or unreadable file.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        return None
    return payload if isinstance(payload, dict) else None
def aggregate_model_metrics() -> dict[str, Any]:
    """Collect headline test metrics from the three stored eval_results.json files.

    Returns:
        A dict with the keys ``language_detector``, ``intent_classifier`` and
        ``ner_model`` (in that order). Each value is either the extracted
        metrics or ``{"error": "missing <file name>"}`` when the file could
        not be loaded or decoded to an empty object.
    """

    def classifier_headline(payload: dict[str, Any]) -> dict[str, Any]:
        # Fields shared by the language detector and the intent classifier.
        scores = payload.get("test_metrics", {})
        return {
            "model": payload.get("model_name"),
            "task": payload.get("task"),
            "test_accuracy": scores.get("test_accuracy"),
            "test_f1_weighted": scores.get("test_f1"),
            "test_f1_macro": scores.get("test_f1_macro"),
            "labels": list(payload.get("labels", {}).keys()),
        }

    report: dict[str, Any] = {}

    # Language detector.
    lang_payload = _load_json_safely(LANG_EVAL)
    if not lang_payload:
        report["language_detector"] = {"error": f"missing {LANG_EVAL.name}"}
    else:
        report["language_detector"] = classifier_headline(lang_payload)

    # Intent classifier: same headline fields plus the per-language breakdown.
    intent_payload = _load_json_safely(INTENT_EVAL)
    if not intent_payload:
        report["intent_classifier"] = {"error": f"missing {INTENT_EVAL.name}"}
    else:
        intent_entry = classifier_headline(intent_payload)
        intent_entry["per_language"] = intent_payload.get("per_language", {})
        report["intent_classifier"] = intent_entry

    # NER model: micro-averaged scores plus per-language and per-entity detail.
    ner_payload = _load_json_safely(NER_EVAL)
    if not ner_payload:
        report["ner_model"] = {"error": f"missing {NER_EVAL.name}"}
    else:
        scores = ner_payload.get("test_metrics", {})
        ner_entry: dict[str, Any] = {
            "model": ner_payload.get("model_name"),
            "task": ner_payload.get("task"),
            "test_f1_micro": scores.get("test_f1"),
            "test_precision_micro": scores.get("test_precision"),
            "test_recall_micro": scores.get("test_recall"),
            "labels": list(ner_payload.get("labels", {}).keys()),
            "per_language": ner_payload.get("per_language", {}),
        }
        # Keep only the per-entity rows of the classification report; the
        # aggregate rows and any non-dict entries are skipped.
        aggregate_rows = ("micro avg", "macro avg", "weighted avg", "accuracy")
        per_entity: dict[str, Any] = {}
        for name, row in (ner_payload.get("classification_report") or {}).items():
            if not isinstance(row, dict) or name in aggregate_rows:
                continue
            per_entity[name] = row.get("f1-score")
        ner_entry["per_entity_f1"] = per_entity
        report["ner_model"] = ner_entry

    return report
# --------------------------------------------------------------------------- #
# Part 2 — end-to-end smoke eval
# --------------------------------------------------------------------------- #
def run_e2e(cases: list[tuple[str, str, set[str]]]) -> dict[str, Any]:
    """Run the full chatbot pipeline against curated cases and summarise it.

    The Chatbot is imported lazily so that ``--skip-e2e`` (and an empty case
    list) never pays the model-loading cost.

    Args:
        cases: ``(text, expected_language, expected_intents)`` tuples. A
            predicted intent is counted as correct when it is a member of
            ``expected_intents``.

    Returns:
        A dict with ``n_cases``, ``language_accuracy``,
        ``intent_accuracy_in_set``, ``retrieval_triggered`` (a count),
        ``nonempty_reply_rate``, ``latency_seconds`` (mean / median / p95 /
        max, rounded to 3 decimals; p95 is ``None`` for fewer than 5 cases)
        and ``cases`` (one detail dict per input). For an empty ``cases``
        list the rates are ``0.0`` and every latency statistic is ``None``.
    """
    n = len(cases)
    if n == 0:
        # Nothing to run: avoid loading models and dividing by zero below.
        return {
            "n_cases": 0,
            "language_accuracy": 0.0,
            "intent_accuracy_in_set": 0.0,
            "retrieval_triggered": 0,
            "nonempty_reply_rate": 0.0,
            "latency_seconds": {"mean": None, "median": None,
                                "p95": None, "max": None},
            "cases": [],
        }

    from src.chatbot import Chatbot  # noqa: PLC0415 — intentional lazy import

    print("\n[e2e] loading chatbot (this triggers model load + ~10 s startup) ...")
    bot = Chatbot()

    results: list[dict[str, Any]] = []
    correct_lang = 0
    correct_intent = 0
    rag_used = 0
    nonempty_replies = 0
    latencies: list[float] = []

    for i, (text, exp_lang, exp_intents) in enumerate(cases, start=1):
        # Time only the respond() call itself.
        t0 = time.perf_counter()
        turn = bot.respond(text)
        dt = time.perf_counter() - t0
        latencies.append(dt)

        lang_ok = (turn.language == exp_lang)
        intent_ok = (turn.intent in exp_intents)
        reply_ok = bool(turn.reply.strip())
        if lang_ok:
            correct_lang += 1
        if intent_ok:
            correct_intent += 1
        if turn.retrieved:
            rag_used += 1
        if reply_ok:
            nonempty_replies += 1

        results.append({
            "i": i,
            "text": text,
            "expected_language": exp_lang,
            "predicted_language": turn.language,
            "language_ok": lang_ok,
            # Sorted list: sets are not JSON-serialisable and have no stable order.
            "expected_intents": sorted(exp_intents),
            "predicted_intent": turn.intent,
            "intent_confidence": turn.intent_confidence,
            "intent_ok": intent_ok,
            "n_entities": len(turn.entities),
            "n_retrieved": len(turn.retrieved),
            "generator_used": turn.generator_used,
            "reply": turn.reply,
            "latency_s": round(dt, 3),
        })
        flag = "✓" if (lang_ok and intent_ok and reply_ok) else "✗"
        print(f" [{flag}] case {i:>2}/{len(cases)} "
              f"lang={turn.language}({'ok' if lang_ok else f'≠{exp_lang}'}) "
              f"intent={turn.intent}({'ok' if intent_ok else 'unexpected'}) "
              f"latency={dt:.2f}s")

    # Nearest-rank 95th percentile: the value at 1-based rank ceil(0.95 * n).
    # Integer arithmetic gives the ceiling without float rounding surprises.
    p95_rank = (95 * n + 99) // 100
    p95 = round(sorted(latencies)[p95_rank - 1], 3) if n >= 5 else None

    return {
        "n_cases": n,
        "language_accuracy": correct_lang / n,
        "intent_accuracy_in_set": correct_intent / n,
        "retrieval_triggered": rag_used,
        "nonempty_reply_rate": nonempty_replies / n,
        "latency_seconds": {
            "mean": round(statistics.mean(latencies), 3),
            "median": round(statistics.median(latencies), 3),
            "p95": p95,
            "max": round(max(latencies), 3),
        },
        "cases": results,
    }
# --------------------------------------------------------------------------- #
# Markdown rendering
# --------------------------------------------------------------------------- #
def _fmt_pct(x: float | None, digits: int = 4) -> str:
    """Format a numeric metric with *digits* decimal places, or return an em dash.

    Despite the name, the value is not scaled to a percentage; it is printed
    as given in fixed-point notation.

    Args:
        x: The metric value. Anything that is not an ``int`` or ``float``
            (including ``None`` and ``bool``) is treated as "not available".
        digits: Number of digits after the decimal point.

    Returns:
        The formatted number, or ``"—"`` when no numeric value is available.
    """
    # bool is a subclass of int; without this check True would print as "1.0000".
    if isinstance(x, bool) or not isinstance(x, (int, float)):
        return "—"
    return f"{x:.{digits}f}"
def _fmt_seconds(value: float | None) -> str:
    """Render a latency as ``"<value>s"``, or an em dash when it is unavailable."""
    return f"{value}s" if value is not None else "—"


def _short_table_text(text: str, limit: int = 50) -> str:
    """Shorten *text* to at most *limit* characters, then escape ``|`` for a table cell."""
    # Truncate first so the cut can never land inside a "\|" escape sequence.
    shortened = text if len(text) <= limit else text[: limit - 3] + "..."
    return shortened.replace("|", "\\|")


def render_markdown(model_summary: dict[str, Any],
                    e2e: dict[str, Any] | None) -> str:
    """Render the evaluation report as a Markdown document.

    Args:
        model_summary: Per-model metrics with optional ``language_detector``,
            ``intent_classifier`` and ``ner_model`` entries. Missing models
            are shown as ``?`` and missing metrics as ``—``.
        e2e: End-to-end result dict (``n_cases``, the accuracy / rate fields,
            ``retrieval_triggered``, ``latency_seconds`` and ``cases``), or
            ``None`` when the end-to-end run was skipped.

    Returns:
        The report as a single newline-joined string.
    """
    out: list[str] = []
    out.append("# Multilingual Chatbot — Evaluation Report")
    out.append("")
    out.append("End-to-end multilingual customer-service chatbot built around three "
               "fine-tuned DistilBERT classifiers, a FAISS retrieval index over a "
               "36-row knowledge base, and Qwen2.5-0.5B-Instruct as the response "
               "generator. This report aggregates per-model held-out test metrics "
               "and a full-pipeline smoke evaluation.")
    out.append("")

    # ---- Section 1: one headline row per model ---------------------------
    out.append("## 1. Per-model test metrics")
    out.append("")
    out.append("| Component | Model | Headline metric (held-out test) |")
    out.append("|-----------|-------|---------------------------------|")
    ld = model_summary.get("language_detector", {})
    out.append(f"| Language detector | `{ld.get('model', '?')}` | "
               f"acc {_fmt_pct(ld.get('test_accuracy'))} · "
               f"F1 macro {_fmt_pct(ld.get('test_f1_macro'))} · "
               f"F1 weighted {_fmt_pct(ld.get('test_f1_weighted'))} |")
    ic = model_summary.get("intent_classifier", {})
    out.append(f"| Intent classifier | `{ic.get('model', '?')}` | "
               f"acc {_fmt_pct(ic.get('test_accuracy'))} · "
               f"F1 macro {_fmt_pct(ic.get('test_f1_macro'))} · "
               f"F1 weighted {_fmt_pct(ic.get('test_f1_weighted'))} |")
    nm = model_summary.get("ner_model", {})
    out.append(f"| NER (entity-level) | `{nm.get('model', '?')}` | "
               f"P {_fmt_pct(nm.get('test_precision_micro'))} · "
               f"R {_fmt_pct(nm.get('test_recall_micro'))} · "
               f"F1 micro {_fmt_pct(nm.get('test_f1_micro'))} |")
    out.append("")

    # Optional breakdown tables: emitted only when the data is a non-empty dict.
    if isinstance(ic.get("per_language"), dict) and ic["per_language"]:
        out.append("### Intent — per language")
        out.append("")
        out.append("| Language | n | accuracy | F1 weighted | F1 macro |")
        out.append("|----------|---|----------|-------------|----------|")
        for lang, m in sorted(ic["per_language"].items()):
            out.append(f"| {lang} | {m.get('n', '—')} | "
                       f"{_fmt_pct(m.get('accuracy'))} | "
                       f"{_fmt_pct(m.get('f1_weighted'))} | "
                       f"{_fmt_pct(m.get('f1_macro'))} |")
        out.append("")
    if isinstance(nm.get("per_entity_f1"), dict) and nm["per_entity_f1"]:
        out.append("### NER — per entity type (F1)")
        out.append("")
        out.append("| Entity | F1 |")
        out.append("|--------|----|")
        for ent, f1 in sorted(nm["per_entity_f1"].items()):
            out.append(f"| {ent} | {_fmt_pct(f1)} |")
        out.append("")
    if isinstance(nm.get("per_language"), dict) and nm["per_language"]:
        out.append("### NER — per language (entity-level F1)")
        out.append("")
        out.append("| Language | n | precision | recall | F1 |")
        out.append("|----------|---|-----------|--------|----|")
        for lang, m in sorted(nm["per_language"].items()):
            out.append(f"| {lang} | {m.get('n', '—')} | "
                       f"{_fmt_pct(m.get('precision'))} | "
                       f"{_fmt_pct(m.get('recall'))} | "
                       f"{_fmt_pct(m.get('f1'))} |")
        out.append("")

    # ---- Section 2: end-to-end results (or a note that they were skipped) --
    out.append("## 2. End-to-end pipeline evaluation")
    out.append("")
    if e2e is None:
        out.append("_E2E evaluation skipped (use `python src/evaluate.py` without "
                   "`--skip-e2e` to run it)._")
        out.append("")
    else:
        n = e2e["n_cases"]
        out.append(f"Tested **{n} curated messages** covering AR/EN/FR × greeting / "
                   "booking / inquiry / complaint / farewell.")
        out.append("")
        out.append("| Metric | Value |")
        out.append("|--------|-------|")
        # Only rates are stored, so the "k/n" counts are recovered as rate * n.
        out.append(f"| Language accuracy | {_fmt_pct(e2e['language_accuracy'])} "
                   f"({int(round(e2e['language_accuracy'] * n))}/{n}) |")
        out.append(f"| Intent accuracy (in-set) | "
                   f"{_fmt_pct(e2e['intent_accuracy_in_set'])} "
                   f"({int(round(e2e['intent_accuracy_in_set'] * n))}/{n}) |")
        out.append(f"| Non-empty reply rate | "
                   f"{_fmt_pct(e2e['nonempty_reply_rate'])} "
                   f"({int(round(e2e['nonempty_reply_rate'] * n))}/{n}) |")
        out.append(f"| Retrieval triggered | {e2e['retrieval_triggered']}/{n} "
                   "(expected: only on inquiry/booking/complaint) |")
        lat = e2e["latency_seconds"]
        # A missing statistic renders as a bare em dash, without a unit suffix.
        out.append(f"| Latency: mean / median / p95 / max | "
                   f"{_fmt_seconds(lat['mean'])} / {_fmt_seconds(lat['median'])} / "
                   f"{_fmt_seconds(lat['p95'])} / "
                   f"{_fmt_seconds(lat['max'])} |")
        out.append("")
        out.append("### Per-case detail")
        out.append("")
        out.append("| # | text | lang (pred / exp) | intent (pred / exp) | "
                   "retrieved | gen | latency |")
        out.append("|---|------|------|------|-----------|-----|---------|")
        for c in e2e["cases"]:
            # Mismatches are bolded and marked so they stand out in the table.
            lang_cell = f"{c['predicted_language']} / {c['expected_language']}"
            if not c["language_ok"]:
                lang_cell = f"**{lang_cell} ✗**"
            intent_cell = f"{c['predicted_intent']} / {','.join(c['expected_intents'])}"
            if not c["intent_ok"]:
                intent_cell = f"**{intent_cell} ✗**"
            text_short = _short_table_text(c["text"])
            out.append(f"| {c['i']} | {text_short} | {lang_cell} | "
                       f"{intent_cell} | {c['n_retrieved']} | "
                       f"{'Y' if c['generator_used'] else 'n'} | "
                       f"{c['latency_s']}s |")
        out.append("")

    # ---- Section 3: static architecture description -----------------------
    out.append("## 3. Architecture summary")
    out.append("")
    out.append("```")
    out.append("user_text")
    out.append(" -> Language Detector (DistilBERT, CPU) AR / CS / EN / FR")
    out.append(" -> Intent Classifier (DistilBERT, CPU) booking | complaint | farewell |")
    out.append(" greeting | inquiry | other")
    out.append(" -> NER (DistilBERT, CPU) PER / LOC / ORG / DATE (BIO)")
    out.append(" -> Branch on intent:")
    out.append(" greeting / farewell -> canned reply (instant)")
    out.append(" inquiry / booking / -> RAG retrieve top-3 (FAISS, MiniLM, CPU)")
    out.append(" complaint + Qwen2.5-0.5B generates reply (cuda fp16)")
    out.append(" other -> Qwen2.5-0.5B from general knowledge")
    out.append(" -> Reply (always in user's detected language; CS uses EN)")
    out.append("```")
    out.append("")
    out.append("All metrics in §1 were computed on held-out test splits during each "
               "model's training script. The §2 numbers are an additional integration "
               "check on hand-curated messages.")
    return "\n".join(out)
# --------------------------------------------------------------------------- #
# Entry point
# --------------------------------------------------------------------------- #
def main() -> int:
    """Command-line entry point: build both report files and return an exit code.

    Parses ``--skip-e2e`` and ``--quick``, aggregates the stored per-model
    metrics, optionally runs the end-to-end evaluation, then writes the
    Markdown report and the JSON summary (both UTF-8) to the project root.

    Returns:
        ``0`` on completion.
    """
    # __doc__ is None when docstrings are stripped (python -OO).
    parser = argparse.ArgumentParser(description=(__doc__ or "").split("\n")[0])
    parser.add_argument("--skip-e2e", action="store_true",
                        help="Don't load models or run the E2E smoke eval; only "
                             "aggregate stored eval_results.json files.")
    parser.add_argument("--quick", action="store_true",
                        help="Run only the first 6 E2E cases.")
    args = parser.parse_args()

    print("=" * 72)
    print("Multilingual chatbot — final evaluation")
    print("=" * 72)

    print("\n[1/2] Aggregating per-model metrics from eval_results.json files ...")
    model_summary = aggregate_model_metrics()
    for k, v in model_summary.items():
        if "error" in v:
            print(f" - {k}: {v['error']}")
        else:
            print(f" - {k}: loaded ({v.get('model', '?')})")

    e2e = None
    if args.skip_e2e:
        print("\n[2/2] Skipping E2E evaluation (--skip-e2e).")
    else:
        cases = E2E_CASES[:6] if args.quick else E2E_CASES
        print(f"\n[2/2] Running E2E pipeline on {len(cases)} curated cases ...")
        e2e = run_e2e(cases)

    md = render_markdown(model_summary, e2e)
    REPORT_MD.write_text(md, encoding="utf-8")
    summary = {"models": model_summary, "e2e": e2e}
    # ensure_ascii=False keeps non-ASCII text readable in the JSON, so the
    # file must be written as UTF-8 rather than in the locale encoding.
    REPORT_JSON.write_text(json.dumps(summary, indent=2, ensure_ascii=False),
                           encoding="utf-8")
    print(f"\n[OK] Wrote {REPORT_MD.relative_to(PROJECT_ROOT)}")
    print(f"[OK] Wrote {REPORT_JSON.relative_to(PROJECT_ROOT)}")
    return 0


if __name__ == "__main__":
    sys.exit(main())