"""
EvaluationProbe — Vitalis FSI

Measures whether the system has learned to distinguish
primitive speech acts after the curriculum.

Speech acts measured:
  - question (interrogative)
  - instruction (imperative)
  - explanation (declarative/expository)

No labeled audio required at runtime — uses helix prototype
clustering for zero-shot evaluation.
"""

from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

from src.dream_engine.helix_memory import HelixMemory
from src.hdc_encoder.encoder import encode
from src.audio_ear.feature_extractor import extract_features


class EvaluationProbe:
    """Probe that scores speech-act separation using stored helix prototypes."""

    N_CLUSTERS = 3
    LABELS = ["question", "instruction", "explanation"]

    # k-means settings used by _build_centroids; fixed for reproducibility.
    _KMEANS_SEED = 42
    _KMEANS_ITERATIONS = 6

    def __init__(self, helix_path: Optional[Path] = None):
        """
        Load the helix memory backing this probe.

        Args:
            helix_path: Location of the helix memory file. Falls back to
                ``~/.vitalis_workspace/helix_memory.pkl`` when not given.
        """
        self.helix_path = helix_path or (
            Path.home() / ".vitalis_workspace" / "helix_memory.pkl"
        )
        self.helix = HelixMemory(self.helix_path)

    def _build_centroids(self) -> Dict[str, np.ndarray]:
        """
        Build one centroid per speech act by clustering all
        stored helix prototypes.

        Runs a fixed number of seeded k-means iterations using the count of
        mismatching positions as the distance, and an element-wise majority
        vote (sign of the sum, ties resolved to +1) as the centroid update.

        Returns:
            Mapping from each label in ``LABELS`` to an int8 centroid vector.

        Raises:
            RuntimeError: If fewer than ``N_CLUSTERS`` helix entries exist.
        """
        if len(self.helix.entries) < self.N_CLUSTERS:
            raise RuntimeError(
                f"Need at least {self.N_CLUSTERS} helix codes. "
                f"Run curriculum first."
            )

        # Each entry is unpacked as a 4-tuple; the second field is the
        # prototype vector.
        all_protos = np.stack(
            [proto for _, proto, _, _ in self.helix.entries]
        ).astype(np.int8)

        # Seeded k-means for reproducibility
        rng = np.random.default_rng(self._KMEANS_SEED)
        idx = rng.choice(len(all_protos), self.N_CLUSTERS, replace=False)
        centroids = all_protos[idx].copy()

        for _ in range(self._KMEANS_ITERATIONS):
            # dists[p, k] = number of positions where prototype p differs
            # from centroid k.
            dists = np.stack(
                [np.sum(all_protos != c, axis=1) for c in centroids],
                axis=1,
            )
            assigns = np.argmin(dists, axis=1)

            for i in range(self.N_CLUSTERS):
                mask = assigns == i
                if not np.any(mask):
                    # Empty cluster: keep its previous centroid.
                    continue
                # Widen before summing so the int8 values cannot overflow.
                summed = all_protos[mask].astype(np.int32).sum(axis=0)
                new_c = np.sign(summed).astype(np.int8)
                new_c[new_c == 0] = 1  # break ties towards +1
                centroids[i] = new_c

        # NOTE(review): clusters are paired with LABELS purely by position;
        # nothing here ties a given cluster to the meaning of its label.
        # Confirm this is the intended zero-shot behaviour.
        return dict(zip(self.LABELS, centroids))

    def _semantic_fingerprint(self, hv: np.ndarray) -> np.ndarray:
        """
        Retrieve the top-3 helix prototypes and bundle them by element-wise
        majority vote (sign of the sum, ties resolved to +1).
        Reduces noise in the raw hypervector.

        Args:
            hv: Raw hypervector to look up in helix memory.

        Returns:
            The bundled int8 vector, or a copy of ``hv`` when helix memory
            returns no matches.
        """
        matches = self.helix.retrieve(hv, top_k=3)
        if not matches:
            return hv.copy()

        # Each match is unpacked as a pair; only the first element (the
        # prototype) is used here.
        protos = [proto for proto, _ in matches]
        stacked = np.stack(protos).astype(np.int32).sum(axis=0)
        result = np.sign(stacked).astype(np.int8)
        result[result == 0] = 1
        return result

    def evaluate_file(
        self,
        wav_path: Path,
        true_label: str,
        centroids: Dict[str, np.ndarray],
    ) -> Tuple[str, float, bool]:
        """
        Evaluate one audio file.

        Args:
            wav_path: Audio file to classify.
            true_label: Expected label for the file.
            centroids: Label-to-centroid mapping, as produced by
                ``_build_centroids``.

        Returns:
            ``(predicted, confidence, correct)`` where ``confidence`` is the
            fraction of positions at which the file's fingerprint equals the
            winning centroid.
        """
        mfcc, prosody = extract_features(wav_path)
        raw_hv = encode(mfcc, prosody)
        semantic_hv = self._semantic_fingerprint(raw_hv)

        sims = {
            label: float(np.mean(semantic_hv == centroid))
            for label, centroid in centroids.items()
        }
        predicted = max(sims, key=sims.get)
        confidence = sims[predicted]
        correct = predicted == true_label
        return predicted, confidence, correct

    def evaluate_directory(self, probe_dir: Path) -> Dict:
        """
        Evaluate all wav files in probe_dir.
        Directory structure: probe_dir/label/file.wav

        Sub-directories whose name is not one of ``LABELS`` are skipped and
        reported under ``skipped_dirs`` instead of aborting the run.

        Args:
            probe_dir: Root directory holding one sub-directory per label.

        Returns:
            A dict whose ``status`` is ``"probe_dir_not_found"``,
            ``"no_files_found"`` or ``"complete"``. A complete result also
            carries overall and per-class accuracy, file and helix-code
            counts, per-file details and the skipped directory names.

        Raises:
            RuntimeError: Propagated from ``_build_centroids`` when helix
                memory holds too few entries.
        """
        # is_dir() also rejects a path that exists but is a regular file,
        # which would otherwise fail later in iterdir().
        if not probe_dir.is_dir():
            return {"status": "probe_dir_not_found", "path": str(probe_dir)}

        centroids = self._build_centroids()
        results = {label: [] for label in self.LABELS}
        skipped_dirs = []
        total = 0
        correct = 0

        # Sorted so the order of the per-file details is reproducible.
        for label_dir in sorted(probe_dir.iterdir()):
            if not label_dir.is_dir():
                continue
            label = label_dir.name
            if label not in results:
                # Unknown label: it can never match a prediction, and
                # indexing results with it would raise KeyError.
                skipped_dirs.append(label)
                continue
            for wav in sorted(label_dir.glob("*.wav")):
                pred, conf, is_correct = self.evaluate_file(
                    wav, label, centroids
                )
                results[label].append({
                    "file": wav.name,
                    "predicted": pred,
                    "confidence": round(conf, 4),
                    "correct": is_correct,
                })
                total += 1
                correct += int(is_correct)

        if total == 0:
            return {"status": "no_files_found"}

        per_class_acc = {
            label: (
                round(sum(r["correct"] for r in items) / len(items), 4)
                if items
                else 0.0
            )
            for label, items in results.items()
        }

        return {
            "status": "complete",
            "overall_accuracy": round(correct / total, 4),
            "per_class": per_class_acc,
            "total_files": total,
            "helix_codes": len(self.helix.entries),
            "details": results,
            "skipped_dirs": skipped_dirs,
        }

    def evaluate_helix_health(self) -> Dict:
        """
        Evaluate helix memory health without audio files.
        Tests clustering quality and prototype diversity.

        Returns:
            ``{"status": "insufficient_data"}`` when fewer than two entries
            exist. Otherwise a dict with the status (``"healthy"`` when the
            diversity score exceeds 0.1, else ``"low_diversity"``), the entry
            count, the diversity score (1 - average pairwise similarity),
            the average similarity, the summed usage counts, and the index
            within ``helix.entries`` of the entry with the highest count.
        """
        if len(self.helix.entries) < 2:
            return {"status": "insufficient_data"}

        protos = np.stack(
            [p for _, p, _, _ in self.helix.entries]
        ).astype(np.float32)
        n = len(protos)

        # Pairwise similarity = fraction of equal positions for every
        # unordered pair (i, j) with i < j. Each row is compared against all
        # later rows in one vectorised step rather than pair by pair.
        pair_sims = np.concatenate([
            np.mean(protos[i + 1:] == protos[i], axis=1)
            for i in range(n - 1)
        ])
        avg_sim = float(np.mean(pair_sims))
        diversity = round(1.0 - avg_sim, 4)

        # Third field of each entry is treated as its usage count.
        usage_counts = [cnt for _, _, cnt, _ in self.helix.entries]

        return {
            "status": "healthy" if diversity > 0.1 else "low_diversity",
            "helix_codes": n,
            "diversity_score": diversity,
            "avg_similarity": round(avg_sim, 4),
            "total_ingestions": sum(usage_counts),
            "most_used_code": int(np.argmax(usage_counts)),
        }