# Voice-AI-Agent-Clean / eval_nlu.py
# Toadoum's picture
# Upload 4 files
# 3c8a5a9 verified
# Raw
# History Blame Contribute Delete
# 3.89 kB
"""
NLU eval harness — intent accuracy, threshold sweep, per-intent recall.
=======================================================================
Usage:
    python eval_nlu.py              # leave-one-out over INTENT_EXAMPLES
    python eval_nlu.py labeled.csv  # external set, columns: text,intent

Leave-one-out is a *sanity* tool: for each example phrase, it is held out of
its own intent's centroid, then classified. It tells you whether the centroids
are internally separable and lets you sweep CONFIDENCE_THRESHOLD. It does NOT
substitute for a held-out, human-labeled Hausa test set — collect that from the
turn logs (logging_util.py) and pass it as the CSV argument.
"""
from __future__ import annotations
import csv
import sys
from collections import defaultdict
import numpy as np
from sentence_transformers import SentenceTransformer
from nlu import INTENT_EXAMPLES, EMBEDDING_MODEL_ID, CONFIDENCE_THRESHOLD
# Candidate confidence cut-offs swept by main(): a prediction whose confidence
# falls below the cut-off is reported as "unknown".
THRESHOLDS = [0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]
def _centroid(vecs, exclude=None):
    """Return the unit-length mean of *vecs*.

    Args:
        vecs: sequence of equally shaped vectors (a list of arrays or a 2-D
            array whose rows are the vectors).
        exclude: optional index of one vector to leave out of the mean; an
            index that matches no vector leaves the input unchanged.

    Returns:
        The mean vector scaled to L2 norm 1. If the mean is exactly the zero
        vector it is returned as-is, since it has no direction to normalise.

    Raises:
        ValueError: if no vectors remain after applying *exclude*.
    """
    kept = [v for i, v in enumerate(vecs) if exclude is None or i != exclude]
    if not kept:
        # np.mean of an empty list would only warn and hand back NaN, which
        # then surfaces far from the real cause.
        raise ValueError("cannot compute a centroid from zero vectors")
    mean = np.mean(kept, axis=0)
    norm = np.linalg.norm(mean)
    if norm == 0:
        # Dividing would yield an all-NaN centroid; a zero vector simply
        # scores 0 against every query.
        return mean
    return mean / norm
def _build_records(encoder):
    """Classify each evaluation utterance and return the raw predictions.

    The mode is chosen by the command line:

    * ``sys.argv[1]`` given: read that CSV (columns ``text`` and ``intent``)
      and score every row against the centroids of all intents.
    * otherwise: leave-one-out over ``INTENT_EXAMPLES`` — each example phrase
      is removed from its own intent's centroid before being scored. A phrase
      that is its intent's only example leaves nothing to average, so it is
      scored against the remaining intents only.

    Args:
        encoder: object whose ``encode(texts, normalize_embeddings=True)``
            returns one embedding per input text (``main`` passes a
            ``SentenceTransformer``).

    Returns:
        List of ``(text, gold_intent, pred_intent, confidence)`` tuples, where
        ``confidence`` is the dot product between the utterance embedding and
        the winning centroid.
    """
    emb = {
        intent: list(encoder.encode(phrases, normalize_embeddings=True))
        for intent, phrases in INTENT_EXAMPLES.items()
    }
    # Centroids over *all* examples: used as-is in CSV mode, and for every
    # intent except the held-out one in leave-one-out mode.
    full_cents = {intent: _centroid(vecs) for intent, vecs in emb.items()}

    def _classify(query, cents):
        """Return (best_intent, score) for *query* against *cents*."""
        scores = {intent: float(np.dot(query, c)) for intent, c in cents.items()}
        best = max(scores, key=scores.get)
        return best, scores[best]

    if len(sys.argv) > 1:
        # utf-8-sig also reads plain UTF-8 but drops a leading BOM, which would
        # otherwise corrupt the first header name; newline="" is what the csv
        # module expects so quoted line breaks survive.
        with open(sys.argv[1], encoding="utf-8-sig", newline="") as fh:
            rows = list(csv.DictReader(fh))
        records = []
        for row in rows:
            query = encoder.encode(row["text"], normalize_embeddings=True)
            best, score = _classify(query, full_cents)
            records.append((row["text"], row["intent"], best, score))
        return records

    # leave-one-out over the examples themselves
    records = []
    for intent, vecs in emb.items():
        for i, query in enumerate(vecs):
            # Only the held-out intent's centroid changes; copying keeps the
            # intent order (and therefore tie-breaking in max) unchanged.
            cents = dict(full_cents)
            if len(vecs) > 1:
                cents[intent] = _centroid(vecs, exclude=i)
            else:
                del cents[intent]
            if not cents:
                # A single intent with a single example: nothing to compare to.
                continue
            best, score = _classify(query, cents)
            records.append((INTENT_EXAMPLES[intent][i], intent, best, score))
    return records
def main():
    """Run the evaluation and print a threshold sweep and per-intent recall.

    Loads the embedding model on CPU, obtains prediction records from
    ``_build_records`` and prints:

    1. for each value in ``THRESHOLDS``, accuracy and coverage when
       predictions below the threshold are replaced by ``"unknown"``;
    2. at ``CONFIDENCE_THRESHOLD``, recall per gold intent together with the
       label it is most often confused with.

    Returns early with a message when there are no records, because every
    ratio below divides by the record count.
    """
    print(f"Loading {EMBEDDING_MODEL_ID} …")
    encoder = SentenceTransformer(EMBEDDING_MODEL_ID, device="cpu")
    records = _build_records(encoder)
    n = len(records)
    mode = "external CSV" if len(sys.argv) > 1 else "leave-one-out"
    print(f"Evaluating {n} utterances ({mode}).\n")
    if n == 0:
        # e.g. a CSV containing only a header row
        print("No utterances to evaluate.")
        return

    print("threshold accuracy coverage (below thresh → predicted 'unknown')")
    for th in THRESHOLDS:
        correct = cov = 0
        for _, gold, pred, conf in records:
            if conf < th:
                # Rejected as "unknown": neither covered nor correct.
                continue
            cov += 1
            if pred == gold:
                correct += 1
        # Float-tolerant comparison so the configured threshold is flagged.
        marker = " <- current" if abs(th - CONFIDENCE_THRESHOLD) < 1e-9 else ""
        print(f" {th:.2f} {correct/n:.3f} {cov/n:.3f}{marker}")

    th = CONFIDENCE_THRESHOLD
    confusion = defaultdict(lambda: defaultdict(int))
    for _, gold, pred, conf in records:
        label = pred if conf >= th else "unknown"
        confusion[gold][label] += 1

    print(f"\nPer-intent recall @ {th:.2f}:")
    for gold in sorted(confusion):
        total = sum(confusion[gold].values())
        hit = confusion[gold][gold]
        misses = [(count, label) for label, count in confusion[gold].items()
                  if label != gold]
        leak = ""
        if misses:
            # Largest count wins; ties resolve to the alphabetically last label.
            count, label = max(misses)
            leak = f" (most confused → {label}×{count})"
        print(f" {gold:14s} {hit}/{total} recall={hit/total:.2f}{leak}")
# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()