Spaces:

momenalhamza
/

multilingual-chatbot

Sleeping

App Files Files Community

multilingual-chatbot / src /evaluate.py

momenalhamza

Deploy chatbot: code + RAG + Qwen (3 BERT classifiers loaded from HF Hub)

469ef7f verified 10 days ago

raw

history blame contribute delete

17.8 kB

	"""Compile the final evaluation report for the multilingual chatbot.

	Two parts:

	1. Aggregate per-model metrics from existing eval_results.json files
	(lang_detector, intent_classifier, ner_model). These were measured on
	the proper held-out test splits during their training scripts.

	2. Run an end-to-end smoke eval on a curated 15-case test set covering
	each language x several intents. Verifies the full pipeline (lang ->
	intent -> NER -> RAG -> Qwen) and measures per-stage latency.

	Outputs:
	evaluation_report.md (human-readable)
	evaluation_summary.json (machine-readable)

	Usage:
	python src/evaluate.py
	python src/evaluate.py --skip-e2e # only aggregate stored metrics
	python src/evaluate.py --quick # fewer E2E cases
	"""

	from __future__ import annotations

	import argparse
	import json
	import statistics
	import sys
	import time
	from pathlib import Path
	from typing import Any

	# Repository root: two directory levels above this file.
	PROJECT_ROOT = Path(__file__).resolve().parent.parent
	# Prepend the repository root to sys.path (once) so that `src.chatbot` stays
	# importable when this script is launched as `python src/evaluate.py`.
	if sys.path.count(str(PROJECT_ROOT)) == 0:
	    sys.path.insert(0, str(PROJECT_ROOT))

	# Stored per-model evaluation results read by aggregate_model_metrics().
	LANG_EVAL = PROJECT_ROOT.joinpath("models", "lang_detector", "eval_results.json")
	INTENT_EVAL = PROJECT_ROOT.joinpath("models", "intent_classifier", "eval_results.json")
	NER_EVAL = PROJECT_ROOT.joinpath("models", "ner_model", "eval_results.json")

	# Report files written by main().
	REPORT_MD = PROJECT_ROOT.joinpath("evaluation_report.md")
	REPORT_JSON = PROJECT_ROOT.joinpath("evaluation_summary.json")

	# Curated end-to-end smoke cases, one tuple per user message:
	#   (text, expected_lang, expected_intents)
	# expected_intents is a SET: many natural utterances are validly classified
	# as either "booking" or "inquiry", so any predicted intent found in the set
	# counts as correct.
	# Order matters: main() runs only the first 6 entries when --quick is given.
	E2E_CASES: list[tuple[str, str, set[str]]] = [
	# English
	("Hello there!", "EN", {"greeting"}),
	("How can I book a hotel in Paris?", "EN", {"booking", "inquiry"}),
	("What payment methods do you accept?", "EN", {"inquiry"}),
	("My order arrived damaged, I want a refund.", "EN", {"complaint"}),
	("Goodbye, see you next time.", "EN", {"farewell"}),

	# Arabic
	("مرحباً، كيف حالك؟", "AR", {"greeting"}),
	("أحتاج أحجز فندقاً في مكة المكرمة", "AR", {"booking", "inquiry"}),
	("ما هي طرق الدفع المتاحة لديكم؟", "AR", {"inquiry"}),
	("لدي شكوى بخصوص طلبي الأخير", "AR", {"complaint"}),
	("إلى اللقاء، شكراً لكم", "AR", {"farewell"}),

	# French
	("Bonjour, comment allez-vous ?", "FR", {"greeting"}),
	("Comment puis-je réserver un vol pour Lyon ?", "FR", {"booking", "inquiry"}),
	("Quels sont vos horaires d'ouverture ?", "FR", {"inquiry"}),
	("Je voudrais déposer une réclamation.", "FR", {"complaint"}),
	("Au revoir et bonne journée", "FR", {"farewell"}),
	]


	# --------------------------------------------------------------------------- #
	# Part 1 — aggregate stored per-model metrics
	# --------------------------------------------------------------------------- #

	def _load_json_safely(path: Path) -> dict[str, Any] \| None:
	if not path.exists():
	return None
	try:
	return json.loads(path.read_text())
	except json.JSONDecodeError:
	return None


	def _missing_eval(path: Path) -> dict[str, str]:
	    """Build the summary entry for an eval file that could not be used."""
	    # All three files are named eval_results.json, so the parent directory is
	    # included to tell the reader which model is affected.
	    return {"error": f"missing or unreadable {path.parent.name}/{path.name}"}


	def _classifier_headline(data: dict[str, Any]) -> dict[str, Any]:
	    """Extract the headline fields shared by the two classifier eval files."""
	    # `or {}` also covers an explicit JSON null for these keys.
	    metrics = data.get("test_metrics") or {}
	    return {
	        "model": data.get("model_name"),
	        "task": data.get("task"),
	        "test_accuracy": metrics.get("test_accuracy"),
	        # NOTE(review): the stored `test_f1` is reported as the weighted F1;
	        # that naming comes from this script — confirm against the trainers.
	        "test_f1_weighted": metrics.get("test_f1"),
	        "test_f1_macro": metrics.get("test_f1_macro"),
	        # list() yields the keys of a mapping and also accepts a plain list.
	        "labels": list(data.get("labels") or {}),
	    }


	def aggregate_model_metrics() -> dict[str, Any]:
	    """Pull headline numbers from each per-model eval_results.json.

	    Returns:
	        A dict with the keys ``language_detector``, ``intent_classifier`` and
	        ``ner_model``. Each value is either the extracted metrics or
	        ``{"error": ...}`` when the corresponding file is missing, unreadable
	        or empty.
	    """
	    summary: dict[str, Any] = {}

	    lang = _load_json_safely(LANG_EVAL)
	    summary["language_detector"] = (
	        _classifier_headline(lang) if lang else _missing_eval(LANG_EVAL)
	    )

	    intent = _load_json_safely(INTENT_EVAL)
	    if intent:
	        summary["intent_classifier"] = {
	            **_classifier_headline(intent),
	            "per_language": intent.get("per_language", {}),
	        }
	    else:
	        summary["intent_classifier"] = _missing_eval(INTENT_EVAL)

	    ner = _load_json_safely(NER_EVAL)
	    if ner:
	        metrics = ner.get("test_metrics") or {}
	        report = ner.get("classification_report") or {}
	        # Rows of the report that are aggregates rather than entity types.
	        aggregate_rows = ("micro avg", "macro avg", "weighted avg", "accuracy")
	        summary["ner_model"] = {
	            "model": ner.get("model_name"),
	            "task": ner.get("task"),
	            # NOTE(review): stored test_* values are reported as micro
	            # averages; that naming comes from this script — confirm.
	            "test_f1_micro": metrics.get("test_f1"),
	            "test_precision_micro": metrics.get("test_precision"),
	            "test_recall_micro": metrics.get("test_recall"),
	            "labels": list(ner.get("labels") or {}),
	            "per_language": ner.get("per_language", {}),
	            "per_entity_f1": {
	                name: row.get("f1-score")
	                for name, row in report.items()
	                if isinstance(row, dict) and name not in aggregate_rows
	            },
	        }
	    else:
	        summary["ner_model"] = _missing_eval(NER_EVAL)

	    return summary


	# --------------------------------------------------------------------------- #
	# Part 2 — end-to-end smoke eval
	# --------------------------------------------------------------------------- #

	def _latency_stats(latencies: list[float]) -> dict[str, float | None]:
	    """Summarise per-case latencies in seconds, rounded to three decimals."""
	    ordered = sorted(latencies)
	    count = len(ordered)
	    # Nearest-rank p95: the sample at rank ceil(0.95 * count). The ceiling is
	    # computed with integers so float rounding cannot shift the rank.
	    p95_rank = (95 * count + 99) // 100
	    return {
	        "mean": round(statistics.mean(ordered), 3),
	        "median": round(statistics.median(ordered), 3),
	        # Too few samples make a tail percentile meaningless.
	        "p95": round(ordered[p95_rank - 1], 3) if count >= 5 else None,
	        "max": round(ordered[-1], 3),
	    }


	def run_e2e(cases: list[tuple[str, str, set[str]]]) -> dict[str, Any]:
	    """Run the full chatbot pipeline against curated cases.

	    The Chatbot is imported lazily so that --skip-e2e does not pay the
	    model-loading cost.

	    Args:
	        cases: ``(text, expected_language, expected_intents)`` tuples. A
	            predicted intent is correct when it is a member of
	            ``expected_intents``.

	    Returns:
	        A dict with ``n_cases``, the accuracy / rate figures, the number of
	        cases where retrieval returned something, a ``latency_seconds``
	        summary and the per-case records under ``cases``.

	    Raises:
	        ValueError: If ``cases`` is empty (checked before any model is loaded).
	    """
	    if not cases:
	        raise ValueError("run_e2e() needs at least one case")

	    from src.chatbot import Chatbot  # noqa: PLC0415 — intentional lazy import

	    print("\n[e2e] loading chatbot (this triggers model load + ~10 s startup) ...")
	    bot = Chatbot()

	    total = len(cases)
	    results: list[dict[str, Any]] = []
	    correct_lang = 0
	    correct_intent = 0
	    rag_used = 0
	    nonempty_replies = 0
	    latencies: list[float] = []

	    for i, (text, exp_lang, exp_intents) in enumerate(cases, start=1):
	        # Time only the pipeline call itself.
	        t0 = time.perf_counter()
	        turn = bot.respond(text)
	        dt = time.perf_counter() - t0
	        latencies.append(dt)

	        lang_ok = turn.language == exp_lang
	        intent_ok = turn.intent in exp_intents
	        reply_ok = bool(turn.reply.strip())

	        correct_lang += lang_ok
	        correct_intent += intent_ok
	        nonempty_replies += reply_ok
	        if turn.retrieved:
	            rag_used += 1

	        results.append({
	            "i": i,
	            "text": text,
	            "expected_language": exp_lang,
	            "predicted_language": turn.language,
	            "language_ok": lang_ok,
	            # Sets are not JSON-serialisable; store a sorted list instead.
	            "expected_intents": sorted(exp_intents),
	            "predicted_intent": turn.intent,
	            "intent_confidence": turn.intent_confidence,
	            "intent_ok": intent_ok,
	            "n_entities": len(turn.entities),
	            "n_retrieved": len(turn.retrieved),
	            "generator_used": turn.generator_used,
	            "reply": turn.reply,
	            "latency_s": round(dt, 3),
	        })

	        flag = "✓" if (lang_ok and intent_ok and reply_ok) else "✗"
	        lang_note = "ok" if lang_ok else f"≠{exp_lang}"
	        intent_note = "ok" if intent_ok else "unexpected"
	        print(f" [{flag}] case {i:>2}/{total} "
	              f"lang={turn.language}({lang_note}) "
	              f"intent={turn.intent}({intent_note}) "
	              f"latency={dt:.2f}s")

	    return {
	        "n_cases": total,
	        "language_accuracy": correct_lang / total,
	        "intent_accuracy_in_set": correct_intent / total,
	        "retrieval_triggered": rag_used,
	        "nonempty_reply_rate": nonempty_replies / total,
	        "latency_seconds": _latency_stats(latencies),
	        "cases": results,
	    }


	# --------------------------------------------------------------------------- #
	# Markdown rendering
	# --------------------------------------------------------------------------- #

	def _fmt_pct(x: float \| None, digits: int = 4) -> str:
	return f"{x:.{digits}f}" if isinstance(x, (int, float)) else "—"


	def render_markdown(model_summary: dict[str, Any],
	                    e2e: dict[str, Any] | None) -> str:
	    """Render the evaluation report as a Markdown document.

	    Args:
	        model_summary: Per-model metrics keyed by ``language_detector``,
	            ``intent_classifier`` and ``ner_model``. Missing entries or
	            metrics are shown as ``?`` / an em dash.
	        e2e: Result dict of the end-to-end run, or ``None`` when that run was
	            skipped (a short note is rendered instead).

	    Returns:
	        The full report as a single newline-joined string.
	    """
	    out: list[str] = []
	    out.append("# Multilingual Chatbot — Evaluation Report")
	    out.append("")
	    out.append("End-to-end multilingual customer-service chatbot built around three "
	               "fine-tuned DistilBERT classifiers, a FAISS retrieval index over a "
	               "36-row knowledge base, and Qwen2.5-0.5B-Instruct as the response "
	               "generator. This report aggregates per-model held-out test metrics "
	               "and a full-pipeline smoke evaluation.")
	    out.append("")

	    # ---- Section 1: headline metrics per model -------------------------- #
	    out.append("## 1. Per-model test metrics")
	    out.append("")
	    out.append("| Component | Model | Headline metric (held-out test) |")
	    out.append("|-----------|-------|---------------------------------|")
	    ld = model_summary.get("language_detector", {})
	    out.append(f"| Language detector | `{ld.get('model', '?')}` | "
	               f"acc {_fmt_pct(ld.get('test_accuracy'))} · "
	               f"F1 macro {_fmt_pct(ld.get('test_f1_macro'))} · "
	               f"F1 weighted {_fmt_pct(ld.get('test_f1_weighted'))} |")
	    ic = model_summary.get("intent_classifier", {})
	    out.append(f"| Intent classifier | `{ic.get('model', '?')}` | "
	               f"acc {_fmt_pct(ic.get('test_accuracy'))} · "
	               f"F1 macro {_fmt_pct(ic.get('test_f1_macro'))} · "
	               f"F1 weighted {_fmt_pct(ic.get('test_f1_weighted'))} |")
	    nm = model_summary.get("ner_model", {})
	    out.append(f"| NER (entity-level) | `{nm.get('model', '?')}` | "
	               f"P {_fmt_pct(nm.get('test_precision_micro'))} · "
	               f"R {_fmt_pct(nm.get('test_recall_micro'))} · "
	               f"F1 micro {_fmt_pct(nm.get('test_f1_micro'))} |")
	    out.append("")

	    # Optional breakdown tables: emitted only when the data is present.
	    if isinstance(ic.get("per_language"), dict) and ic["per_language"]:
	        out.append("### Intent — per language")
	        out.append("")
	        out.append("| Language | n | accuracy | F1 weighted | F1 macro |")
	        out.append("|----------|---|----------|-------------|----------|")
	        for lang, m in sorted(ic["per_language"].items()):
	            out.append(f"| {lang} | {m.get('n', '—')} | "
	                       f"{_fmt_pct(m.get('accuracy'))} | "
	                       f"{_fmt_pct(m.get('f1_weighted'))} | "
	                       f"{_fmt_pct(m.get('f1_macro'))} |")
	        out.append("")

	    if isinstance(nm.get("per_entity_f1"), dict) and nm["per_entity_f1"]:
	        out.append("### NER — per entity type (F1)")
	        out.append("")
	        out.append("| Entity | F1 |")
	        out.append("|--------|----|")
	        for ent, f1 in sorted(nm["per_entity_f1"].items()):
	            out.append(f"| {ent} | {_fmt_pct(f1)} |")
	        out.append("")

	    if isinstance(nm.get("per_language"), dict) and nm["per_language"]:
	        out.append("### NER — per language (entity-level F1)")
	        out.append("")
	        out.append("| Language | n | precision | recall | F1 |")
	        out.append("|----------|---|-----------|--------|----|")
	        for lang, m in sorted(nm["per_language"].items()):
	            out.append(f"| {lang} | {m.get('n', '—')} | "
	                       f"{_fmt_pct(m.get('precision'))} | "
	                       f"{_fmt_pct(m.get('recall'))} | "
	                       f"{_fmt_pct(m.get('f1'))} |")
	        out.append("")

	    # ---- Section 2: end-to-end run -------------------------------------- #
	    out.append("## 2. End-to-end pipeline evaluation")
	    out.append("")
	    if e2e is None:
	        out.append("_E2E evaluation skipped (use `python src/evaluate.py` without "
	                   "`--skip-e2e` to run it)._")
	        out.append("")
	    else:
	        n = e2e["n_cases"]
	        out.append(f"Tested {n} curated messages covering AR/EN/FR × greeting / "
	                   "booking / inquiry / complaint / farewell.")
	        out.append("")
	        out.append("| Metric | Value |")
	        out.append("|--------|-------|")
	        # Rates are stored as fractions; multiply back by n for the k/n form.
	        out.append(f"| Language accuracy | {_fmt_pct(e2e['language_accuracy'])} "
	                   f"({int(round(e2e['language_accuracy'] * n))}/{n}) |")
	        out.append(f"| Intent accuracy (in-set) | "
	                   f"{_fmt_pct(e2e['intent_accuracy_in_set'])} "
	                   f"({int(round(e2e['intent_accuracy_in_set'] * n))}/{n}) |")
	        out.append(f"| Non-empty reply rate | "
	                   f"{_fmt_pct(e2e['nonempty_reply_rate'])} "
	                   f"({int(round(e2e['nonempty_reply_rate'] * n))}/{n}) |")
	        out.append(f"| Retrieval triggered | {e2e['retrieval_triggered']}/{n} "
	                   "(expected: only on inquiry/booking/complaint) |")
	        lat = e2e["latency_seconds"]
	        # p95 is None for very small runs; show a bare dash without a unit.
	        p95_cell = f"{lat['p95']}s" if lat["p95"] is not None else "—"
	        out.append(f"| Latency: mean / median / p95 / max | "
	                   f"{lat['mean']}s / {lat['median']}s / "
	                   f"{p95_cell} / "
	                   f"{lat['max']}s |")
	        out.append("")

	        out.append("### Per-case detail")
	        out.append("")
	        out.append("| # | text | lang (pred / exp) | intent (pred / exp) | "
	                   "retrieved | gen | latency |")
	        out.append("|---|------|------|------|-----------|-----|---------|")
	        for c in e2e["cases"]:
	            lang_cell = f"{c['predicted_language']} / {c['expected_language']}"
	            if not c["language_ok"]:
	                lang_cell = f"{lang_cell} ✗"
	            intent_cell = f"{c['predicted_intent']} / {','.join(c['expected_intents'])}"
	            if not c["intent_ok"]:
	                intent_cell = f"{intent_cell} ✗"
	            # Truncate first, then escape, so the cut can never land in the
	            # middle of an escaped pipe.
	            raw_text = c["text"]
	            text_short = raw_text if len(raw_text) <= 50 else raw_text[:47] + "..."
	            text_cell = text_short.replace("|", "\\|")
	            out.append(f"| {c['i']} | {text_cell} | {lang_cell} | "
	                       f"{intent_cell} | {c['n_retrieved']} | "
	                       f"{'Y' if c['generator_used'] else 'n'} | "
	                       f"{c['latency_s']}s |")
	        out.append("")

	    # ---- Section 3: static architecture overview ------------------------ #
	    out.append("## 3. Architecture summary")
	    out.append("")
	    out.append("```")
	    out.append("user_text")
	    out.append(" -> Language Detector (DistilBERT, CPU) AR / CS / EN / FR")
	    out.append(" -> Intent Classifier (DistilBERT, CPU) booking | complaint | farewell |")
	    out.append(" greeting | inquiry | other")
	    out.append(" -> NER (DistilBERT, CPU) PER / LOC / ORG / DATE (BIO)")
	    out.append(" -> Branch on intent:")
	    out.append(" greeting / farewell -> canned reply (instant)")
	    out.append(" inquiry / booking / -> RAG retrieve top-3 (FAISS, MiniLM, CPU)")
	    out.append(" complaint + Qwen2.5-0.5B generates reply (cuda fp16)")
	    out.append(" other -> Qwen2.5-0.5B from general knowledge")
	    out.append(" -> Reply (always in user's detected language; CS uses EN)")
	    out.append("```")
	    out.append("")
	    out.append("All metrics in §1 were computed on held-out test splits during each "
	               "model's training script. The §2 numbers are an additional integration "
	               "check on hand-curated messages.")
	    return "\n".join(out)


	# --------------------------------------------------------------------------- #
	# Entry point
	# --------------------------------------------------------------------------- #

	def main() -> int:
	    """Command-line entry point: build and write both report files.

	    Aggregates the stored per-model metrics, optionally runs the end-to-end
	    evaluation (skipped with ``--skip-e2e``, limited to the first 6 cases with
	    ``--quick``), then writes the Markdown and JSON reports as UTF-8.

	    Returns:
	        ``0`` as the process exit code.
	    """
	    # __doc__ is None when docstrings are stripped (python -OO).
	    parser = argparse.ArgumentParser(description=(__doc__ or "").split("\n")[0])
	    parser.add_argument("--skip-e2e", action="store_true",
	                        help="Don't load models or run the E2E smoke eval; only "
	                             "aggregate stored eval_results.json files.")
	    parser.add_argument("--quick", action="store_true",
	                        help="Run only the first 6 E2E cases.")
	    args = parser.parse_args()

	    print("=" * 72)
	    print("Multilingual chatbot — final evaluation")
	    print("=" * 72)

	    print("\n[1/2] Aggregating per-model metrics from eval_results.json files ...")
	    model_summary = aggregate_model_metrics()
	    for k, v in model_summary.items():
	        if "error" in v:
	            print(f" - {k}: {v['error']}")
	        else:
	            print(f" - {k}: loaded ({v.get('model', '?')})")

	    e2e = None
	    if args.skip_e2e:
	        print("\n[2/2] Skipping E2E evaluation (--skip-e2e).")
	    else:
	        cases = E2E_CASES[:6] if args.quick else E2E_CASES
	        print(f"\n[2/2] Running E2E pipeline on {len(cases)} curated cases ...")
	        e2e = run_e2e(cases)

	    md = render_markdown(model_summary, e2e)
	    REPORT_MD.write_text(md, encoding="utf-8")
	    summary = {"models": model_summary, "e2e": e2e}
	    # ensure_ascii=False keeps the Arabic/French text readable, so the file
	    # must be written as UTF-8 rather than in the locale's default encoding.
	    REPORT_JSON.write_text(json.dumps(summary, indent=2, ensure_ascii=False),
	                           encoding="utf-8")

	    print(f"\n[OK] Wrote {REPORT_MD.relative_to(PROJECT_ROOT)}")
	    print(f"[OK] Wrote {REPORT_JSON.relative_to(PROJECT_ROOT)}")
	    return 0


	# Script entry: hand main()'s return value to the interpreter as exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())