Spaces:
Sleeping
Sleeping
| """ | |
| NLU eval harness — intent accuracy, threshold sweep, per-intent recall. | |
| ======================================================================= | |
| Usage: | |
| python eval_nlu.py # leave-one-out over INTENT_EXAMPLES | |
| python eval_nlu.py labeled.csv # external set, columns: text,intent | |
| Leave-one-out is a *sanity* tool: for each example phrase, it is held out of | |
| its own intent's centroid, then classified. It tells you whether the centroids | |
| are internally separable and lets you sweep CONFIDENCE_THRESHOLD. It does NOT | |
| substitute for a held-out, human-labeled Hausa test set — collect that from the | |
| turn logs (logging_util.py) and pass it as the CSV argument. | |
| """ | |
| from __future__ import annotations | |
| import csv | |
| import sys | |
| from collections import defaultdict | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from nlu import INTENT_EXAMPLES, EMBEDDING_MODEL_ID, CONFIDENCE_THRESHOLD | |
| THRESHOLDS = [0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60] | |
| def _centroid(vecs, exclude=None): | |
| if exclude is not None: | |
| vecs = [v for i, v in enumerate(vecs) if i != exclude] | |
| c = np.mean(vecs, axis=0) | |
| return c / np.linalg.norm(c) | |
| def _build_records(encoder): | |
| """Returns list of (text, gold_intent, pred_intent, confidence).""" | |
| emb = {it: encoder.encode(ph, normalize_embeddings=True) | |
| for it, ph in INTENT_EXAMPLES.items()} | |
| if len(sys.argv) > 1: | |
| rows = list(csv.DictReader(open(sys.argv[1], encoding="utf-8"))) | |
| cents = {it: _centroid(list(v)) for it, v in emb.items()} | |
| records = [] | |
| for r in rows: | |
| q = encoder.encode(r["text"], normalize_embeddings=True) | |
| scores = {it: float(np.dot(q, c)) for it, c in cents.items()} | |
| best = max(scores, key=scores.get) | |
| records.append((r["text"], r["intent"], best, scores[best])) | |
| return records | |
| # leave-one-out over the examples themselves | |
| records = [] | |
| for it, vecs in emb.items(): | |
| vecs = list(vecs) | |
| for i, q in enumerate(vecs): | |
| cents = {jt: (_centroid(list(v), exclude=i) if jt == it else _centroid(list(v))) | |
| for jt, v in emb.items()} | |
| scores = {jt: float(np.dot(q, c)) for jt, c in cents.items()} | |
| best = max(scores, key=scores.get) | |
| records.append((INTENT_EXAMPLES[it][i], it, best, scores[best])) | |
| return records | |
| def main(): | |
| print(f"Loading {EMBEDDING_MODEL_ID} …") | |
| encoder = SentenceTransformer(EMBEDDING_MODEL_ID, device="cpu") | |
| records = _build_records(encoder) | |
| n = len(records) | |
| mode = "external CSV" if len(sys.argv) > 1 else "leave-one-out" | |
| print(f"Evaluating {n} utterances ({mode}).\n") | |
| print("threshold accuracy coverage (below thresh → predicted 'unknown')") | |
| for th in THRESHOLDS: | |
| correct = cov = 0 | |
| for _, gold, pred, conf in records: | |
| p = pred if conf >= th else "unknown" | |
| if p != "unknown": | |
| cov += 1 | |
| if p == gold: | |
| correct += 1 | |
| marker = " <- current" if abs(th - CONFIDENCE_THRESHOLD) < 1e-9 else "" | |
| print(f" {th:.2f} {correct/n:.3f} {cov/n:.3f}{marker}") | |
| th = CONFIDENCE_THRESHOLD | |
| confusion = defaultdict(lambda: defaultdict(int)) | |
| for _, gold, pred, conf in records: | |
| p = pred if conf >= th else "unknown" | |
| confusion[gold][p] += 1 | |
| print(f"\nPer-intent recall @ {th:.2f}:") | |
| for gold in sorted(confusion): | |
| total = sum(confusion[gold].values()) | |
| hit = confusion[gold][gold] | |
| worst = sorted(((c, p) for p, c in confusion[gold].items() if p != gold), reverse=True) | |
| leak = f" (most confused → {worst[0][1]}×{worst[0][0]})" if worst else "" | |
| print(f" {gold:14s} {hit}/{total} recall={hit/total:.2f}{leak}") | |
| if __name__ == "__main__": | |
| main() | |