Spaces:

PlotweaverAI
/

Voice-AI-Agent-Clean

Sleeping

App Files Files Community

Voice-AI-Agent-Clean / eval_nlu.py

Toadoum

Upload 4 files

3c8a5a9 verified 16 days ago

Raw

History Blame Contribute Delete

3.89 kB

	"""
	NLU eval harness — intent accuracy, threshold sweep, per-intent recall.
	=======================================================================
	Usage:
	python eval_nlu.py # leave-one-out over INTENT_EXAMPLES
	python eval_nlu.py labeled.csv # external set, columns: text,intent

	Leave-one-out is a sanity tool: for each example phrase, it is held out of
	its own intent's centroid, then classified. It tells you whether the centroids
	are internally separable and lets you sweep CONFIDENCE_THRESHOLD. It does NOT
	substitute for a held-out, human-labeled Hausa test set — collect that from the
	turn logs (logging_util.py) and pass it as the CSV argument.
	"""
	from __future__ import annotations

	import csv
	import sys
	from collections import defaultdict

	import numpy as np
	from sentence_transformers import SentenceTransformer

	from nlu import INTENT_EXAMPLES, EMBEDDING_MODEL_ID, CONFIDENCE_THRESHOLD

	# Confidence cut-offs swept by main() when reporting accuracy and coverage.
	THRESHOLDS = [0.30, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60]


	def _centroid(vecs, exclude=None):
	    """Return the unit-length mean of ``vecs``, optionally leaving one out.

	    Args:
	        vecs: Non-empty sequence of equal-length embedding vectors.
	        exclude: Position of a vector to drop before averaging (used for
	            leave-one-out). ``None`` keeps every vector; a value that
	            matches no position drops nothing.

	    Returns:
	        The mean vector scaled to unit L2 norm. A zero vector of the same
	        dimensionality is returned when no vectors remain after the
	        exclusion or when the mean itself has zero norm, so a dot product
	        against it is a finite 0 rather than NaN.

	    Raises:
	        ValueError: If ``vecs`` contains no vectors at all.
	    """
	    mat = np.asarray(vecs)
	    if mat.size == 0:
	        raise ValueError("cannot compute a centroid from zero vectors")
	    if exclude is not None:
	        # A boolean mask (not np.delete) keeps the "no match drops nothing"
	        # behaviour for out-of-range or negative values of `exclude`.
	        mat = mat[np.arange(len(mat)) != exclude]
	    if len(mat) == 0:
	        # The only vector was held out: nothing to average.
	        return np.zeros(mat.shape[1:])
	    c = mat.mean(axis=0)
	    norm = np.linalg.norm(c)
	    # Skip the division for a zero-norm mean to avoid NaN/inf components.
	    return c / norm if norm > 0 else c


	def _build_records(encoder):
	    """Classify every evaluation utterance against the intent centroids.

	    With a command-line argument, utterances come from that CSV file
	    (columns ``text`` and ``intent``) and are scored against centroids built
	    from all of ``INTENT_EXAMPLES``. Without one, each example phrase is
	    scored leave-one-out: it is removed from its own intent's centroid
	    before being classified.

	    Args:
	        encoder: Object whose ``encode(texts, normalize_embeddings=True)``
	            returns one embedding per text (a single vector for a single
	            string).

	    Returns:
	        List of ``(text, gold_intent, pred_intent, confidence)`` tuples,
	        where ``confidence`` is the dot product between the utterance
	        embedding and the winning centroid.
	    """
	    emb = {it: encoder.encode(ph, normalize_embeddings=True)
	           for it, ph in INTENT_EXAMPLES.items()}
	    # Centroids over all examples are the same for every utterance, so
	    # build them once instead of once per scored utterance.
	    full_cents = {it: _centroid(list(v)) for it, v in emb.items()}

	    def _best(q, cents):
	        """Return (intent, score) of the centroid most similar to ``q``."""
	        scores = {it: float(np.dot(q, c)) for it, c in cents.items()}
	        top = max(scores, key=scores.get)
	        return top, scores[top]

	    records = []

	    if len(sys.argv) > 1:
	        # newline="" is what the csv module expects; the with-block makes
	        # sure the handle is closed even if parsing fails.
	        with open(sys.argv[1], encoding="utf-8", newline="") as fh:
	            rows = list(csv.DictReader(fh))
	        for r in rows:
	            q = encoder.encode(r["text"], normalize_embeddings=True)
	            pred, conf = _best(q, full_cents)
	            records.append((r["text"], r["intent"], pred, conf))
	        return records

	    # leave-one-out over the examples themselves
	    for it, vecs in emb.items():
	        vecs = list(vecs)
	        for i, q in enumerate(vecs):
	            # Copying keeps the intent order, so ties resolve as before.
	            cents = dict(full_cents)
	            if len(vecs) > 1:
	                cents[it] = _centroid(vecs, exclude=i)
	            else:
	                # Holding out the only example leaves no centroid for this
	                # intent; drop it from the candidates instead of scoring
	                # against an undefined (NaN) vector.
	                del cents[it]
	            pred, conf = _best(q, cents)
	            records.append((INTENT_EXAMPLES[it][i], it, pred, conf))
	    return records


	def main():
	    """Run the evaluation and print the report to stdout.

	    Loads the embedding model on CPU, builds the evaluation records
	    (external CSV when a command-line argument is given, otherwise
	    leave-one-out) and prints:

	    * accuracy and coverage for every cut-off in ``THRESHOLDS``, where a
	      prediction whose confidence is below the cut-off is replaced by
	      "unknown";
	    * per-intent recall at ``CONFIDENCE_THRESHOLD`` together with the most
	      frequent wrong prediction for each intent.

	    Returns early with a message when there is nothing to evaluate.
	    """
	    print(f"Loading {EMBEDDING_MODEL_ID} …")
	    encoder = SentenceTransformer(EMBEDDING_MODEL_ID, device="cpu")
	    records = _build_records(encoder)
	    n = len(records)
	    mode = "external CSV" if len(sys.argv) > 1 else "leave-one-out"
	    print(f"Evaluating {n} utterances ({mode}).\n")
	    if n == 0:
	        # Accuracy and coverage divide by n; bail out instead of raising
	        # ZeroDivisionError on an empty evaluation set.
	        print("Nothing to evaluate: no utterances were found.")
	        return

	    print("threshold accuracy coverage (below thresh → predicted 'unknown')")
	    for th in THRESHOLDS:
	        # (gold, effective prediction) after applying the cut-off.
	        labelled = [(gold, pred if conf >= th else "unknown")
	                    for _, gold, pred, conf in records]
	        cov = sum(p != "unknown" for _, p in labelled)
	        correct = sum(p == gold for gold, p in labelled)
	        marker = " <- current" if abs(th - CONFIDENCE_THRESHOLD) < 1e-9 else ""
	        print(f" {th:.2f} {correct/n:.3f} {cov/n:.3f}{marker}")

	    th = CONFIDENCE_THRESHOLD
	    # confusion[gold][predicted] -> count, at the configured threshold.
	    confusion = defaultdict(lambda: defaultdict(int))
	    for _, gold, pred, conf in records:
	        p = pred if conf >= th else "unknown"
	        confusion[gold][p] += 1

	    print(f"\nPer-intent recall @ {th:.2f}:")
	    for gold in sorted(confusion):
	        row = confusion[gold]
	        total = sum(row.values())
	        # .get avoids inserting a zero entry into the defaultdict.
	        hit = row.get(gold, 0)
	        wrong = [(c, p) for p, c in row.items() if p != gold]
	        leak = ""
	        if wrong:
	            # Highest count wins; ties go to the later label alphabetically.
	            count, label = max(wrong)
	            leak = f" (most confused → {label}×{count})"
	        print(f" {gold:14s} {hit}/{total} recall={hit/total:.2f}{leak}")


	# Run the evaluation only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()