# File size: 6,284 Bytes
#
# Revision: 653b8c1

"""
EvaluationProbe — Vitalis FSI

Measures whether the system has learned to distinguish
primitive speech acts after the curriculum.

Speech acts measured:
  - question    (interrogative)
  - instruction (imperative)
  - explanation (declarative/expository)

No labeled audio required at runtime —
uses helix prototype clustering for zero-shot evaluation.
"""
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple

from src.dream_engine.helix_memory import HelixMemory
from src.hdc_encoder.encoder import encode
from src.audio_ear.feature_extractor import extract_features


class EvaluationProbe:
    """
    Speech-act probe that works purely from stored helix prototypes.

    The prototypes held by ``HelixMemory`` are clustered into
    ``N_CLUSTERS`` centroids; an audio file is classified by the
    centroid its (denoised) hypervector agrees with most.
    """
    N_CLUSTERS  = 3
    LABELS      = ["question", "instruction", "explanation"]

    # k-means settings. Both are fixed so repeated runs over the same
    # helix memory yield the same centroids.
    KMEANS_SEED  = 42
    KMEANS_ITERS = 6

    def __init__(self, helix_path: Path = None):
        """
        Open the helix memory backing this probe.

        Args:
            helix_path: Location handed to ``HelixMemory``. When falsy,
                ``~/.vitalis_workspace/helix_memory.pkl`` is used.
        """
        self.helix_path = helix_path or (
            Path.home() / ".vitalis_workspace" / "helix_memory.pkl"
        )
        self.helix = HelixMemory(self.helix_path)

    def _build_centroids(self) -> Dict[str, np.ndarray]:
        """
        Build one centroid per speech act by clustering
        all stored helix prototypes.

        Runs a seeded k-means using Hamming distance (count of
        differing positions); each centroid is the element-wise sign
        of the sum of its members, with ties resolved to +1.

        NOTE(review): centroids are paired with ``LABELS`` purely by
        cluster index. Nothing in this method ties a cluster to the
        speech act it is named after — confirm this is intended.

        Returns:
            Mapping of label -> int8 centroid vector.

        Raises:
            RuntimeError: fewer than ``N_CLUSTERS`` helix entries exist.
        """
        if len(self.helix.entries) < self.N_CLUSTERS:
            raise RuntimeError(
                f"Need at least {self.N_CLUSTERS} helix codes. "
                f"Run curriculum first."
            )

        # Entries are 4-tuples; the prototype vector is the second field.
        all_protos = np.stack(
            [proto for _, proto, _, _ in self.helix.entries]
        ).astype(np.int8)

        # Seeded k-means for reproducibility
        rng        = np.random.default_rng(self.KMEANS_SEED)
        idx        = rng.choice(len(all_protos), self.N_CLUSTERS, replace=False)
        centroids  = all_protos[idx].copy()

        prev_assigns = None
        for _ in range(self.KMEANS_ITERS):
            # dists[p, c] = number of positions where prototype p differs
            # from centroid c.
            dists   = np.stack([
                np.sum(all_protos != c, axis=1) for c in centroids
            ], axis=1)
            assigns = np.argmin(dists, axis=1)

            # Unchanged assignments would reproduce the same centroids,
            # so further iterations cannot alter the result.
            if prev_assigns is not None and np.array_equal(assigns, prev_assigns):
                break
            prev_assigns = assigns

            for i in range(self.N_CLUSTERS):
                mask = assigns == i
                # An empty cluster keeps its previous centroid.
                if np.any(mask):
                    summed      = all_protos[mask].astype(np.int32).sum(axis=0)
                    new_c       = np.sign(summed).astype(np.int8)
                    new_c[new_c == 0] = 1
                    centroids[i] = new_c

        return dict(zip(self.LABELS, centroids))

    def _semantic_fingerprint(self, hv: np.ndarray) -> np.ndarray:
        """
        Retrieve the top-3 helix prototypes for ``hv`` and bundle them
        by element-wise majority vote (sum, then sign; ties become +1).
        Reduces noise in the raw hypervector.

        Returns:
            The bundled int8 vector, or a copy of ``hv`` when the helix
            returns no matches.
        """
        matches = self.helix.retrieve(hv, top_k=3)
        if not matches:
            return hv.copy()
        # Each match is unpacked as a (prototype, <second field>) pair;
        # only the prototype is used.
        protos = [proto for proto, _ in matches]
        stacked = np.stack(protos).astype(np.int32).sum(axis=0)
        result = np.sign(stacked).astype(np.int8)
        result[result == 0] = 1
        return result

    def evaluate_file(
        self,
        wav_path: Path,
        true_label: str,
        centroids: Dict[str, np.ndarray],
    ) -> Tuple[str, float, bool]:
        """
        Evaluate one audio file.

        Args:
            wav_path: Audio file passed to ``extract_features``.
            true_label: Expected label, compared against the prediction.
            centroids: Label -> centroid mapping to score against.

        Returns:
            ``(predicted, confidence, correct)`` where ``confidence`` is
            the fraction of positions at which the file's fingerprint
            equals the winning centroid.
        """
        mfcc, prosody  = extract_features(wav_path)
        raw_hv         = encode(mfcc, prosody)
        semantic_hv    = self._semantic_fingerprint(raw_hv)

        sims = {
            label: float(np.mean(semantic_hv == centroid))
            for label, centroid in centroids.items()
        }
        predicted   = max(sims, key=sims.get)
        confidence  = sims[predicted]
        correct     = predicted == true_label
        return predicted, confidence, correct

    def evaluate_directory(self, probe_dir: Path) -> Dict:
        """
        Evaluate all wav files in probe_dir.
        Directory structure: probe_dir/label/file.wav

        Sub-directories whose name is not one of ``LABELS`` are not
        evaluated; their names are reported under ``"skipped_dirs"``.
        Directories and files are visited in sorted order so the
        ``"details"`` lists are reproducible.

        Returns:
            A dict whose ``"status"`` is one of:
              - ``"probe_dir_not_found"``: ``probe_dir`` is missing or
                is not a directory.
              - ``"no_files_found"``: no wav file under a known label.
              - ``"complete"``: accompanied by overall / per-class
                accuracy, counts and per-file details.

        Raises:
            RuntimeError: propagated from ``_build_centroids`` when the
                helix holds too few codes.
        """
        # is_dir() is False for a missing path as well as for a regular
        # file, so iterdir() below can no longer hit a non-directory.
        if not probe_dir.is_dir():
            return {"status": "probe_dir_not_found", "path": str(probe_dir)}

        centroids = self._build_centroids()
        results   = {label: [] for label in self.LABELS}
        skipped   = []
        total     = 0
        correct   = 0

        for label_dir in sorted(probe_dir.iterdir()):
            if not label_dir.is_dir():
                continue
            label = label_dir.name
            # A directory outside LABELS can never be predicted and has
            # no slot in `results`; skip it instead of raising KeyError.
            if label not in results:
                skipped.append(label)
                continue
            for wav in sorted(label_dir.glob("*.wav")):
                pred, conf, is_correct = self.evaluate_file(
                    wav, label, centroids
                )
                results[label].append({
                    "file":      wav.name,
                    "predicted": pred,
                    "confidence": round(conf, 4),
                    "correct":   is_correct,
                })
                total   += 1
                correct += int(is_correct)

        if total == 0:
            return {"status": "no_files_found", "skipped_dirs": skipped}

        # Labels with no files report 0.0 rather than dividing by zero.
        per_class_acc = {
            label: round(
                sum(r["correct"] for r in items) / len(items), 4
            ) if items else 0.0
            for label, items in results.items()
        }

        return {
            "status":          "complete",
            "overall_accuracy": round(correct / total, 4),
            "per_class":       per_class_acc,
            "total_files":     total,
            "helix_codes":     len(self.helix.entries),
            "details":         results,
            "skipped_dirs":    skipped,
        }

    def evaluate_helix_health(self) -> Dict:
        """
        Evaluate helix memory health without audio files.

        Computes the mean pairwise agreement between stored prototypes
        (fraction of equal positions) and reports
        ``diversity = 1 - mean agreement``.

        Returns:
            ``{"status": "insufficient_data"}`` with fewer than two
            entries; otherwise a dict with ``"status"`` set to
            ``"healthy"`` when diversity exceeds 0.1 and
            ``"low_diversity"`` if not, plus the supporting figures.
        """
        if len(self.helix.entries) < 2:
            return {"status": "insufficient_data"}

        protos = np.stack(
            [p for _, p, _, _ in self.helix.entries]
        ).astype(np.float32)

        n = len(protos)
        # One row per prototype so the per-pair mean covers every element.
        flat = protos.reshape(n, -1)

        # Compare prototype i against all later prototypes in one
        # vectorised step; pair order matches a nested i < j loop.
        pair_sims = np.concatenate([
            np.mean(flat[i] == flat[i + 1:], axis=1)
            for i in range(n - 1)
        ])

        avg_sim   = float(np.mean(pair_sims))
        diversity = round(1.0 - avg_sim, 4)

        # Third field of each entry, treated here as a usage count.
        usage_counts = [cnt for _, _, cnt, _ in self.helix.entries]

        return {
            "status":          "healthy" if diversity > 0.1 else "low_diversity",
            "helix_codes":     n,
            "diversity_score": diversity,
            "avg_similarity":  round(avg_sim, 4),
            "total_ingestions": sum(usage_counts),
            "most_used_code":  int(np.argmax(usage_counts)),
        }