# File size: 6,284 Bytes | 653b8c1
"""
EvaluationProbe — Vitalis FSI
Measures whether the system has learned to distinguish
primitive speech acts after the curriculum.
Speech acts measured:
- question (interrogative)
- instruction (imperative)
- explanation (declarative/expository)
No labeled audio required at runtime —
uses helix prototype clustering for zero-shot evaluation.
"""
import numpy as np
from pathlib import Path
from typing import Dict, List, Tuple
from src.dream_engine.helix_memory import HelixMemory
from src.hdc_encoder.encoder import encode
from src.audio_ear.feature_extractor import extract_features
class EvaluationProbe:
    """Probe that scores speech-act discrimination against helix memory.

    Stored helix prototypes are clustered into ``N_CLUSTERS`` centroids,
    one per entry of ``LABELS``; audio files are then classified by the
    centroid their (denoised) hypervector agrees with most.
    """

    N_CLUSTERS = 3
    LABELS = ["question", "instruction", "explanation"]
    # Upper bound on k-means refinement passes in _build_centroids.
    _KMEANS_MAX_ITERS = 6

    def __init__(self, helix_path: Path = None):
        """Open the helix memory backing this probe.

        Args:
            helix_path: Location handed to ``HelixMemory``. When omitted
                (or falsy), ``~/.vitalis_workspace/helix_memory.pkl`` is
                used.
        """
        self.helix_path = helix_path or (
            Path.home() / ".vitalis_workspace" / "helix_memory.pkl"
        )
        self.helix = HelixMemory(self.helix_path)

    def _build_centroids(self) -> Dict[str, np.ndarray]:
        """Cluster all stored helix prototypes into one centroid per label.

        Runs a seeded k-means variant that uses the count of differing
        components as the distance and a component-wise majority vote
        (ties resolved to +1) as the centroid update.

        Returns:
            Mapping from each entry of ``LABELS`` to an ``int8`` centroid.

        Raises:
            RuntimeError: If fewer than ``N_CLUSTERS`` helix entries exist.
        """
        entries = self.helix.entries
        if len(entries) < self.N_CLUSTERS:
            raise RuntimeError(
                f"Need at least {self.N_CLUSTERS} helix codes. "
                f"Run curriculum first."
            )

        all_protos = np.stack(
            [proto for _, proto, _, _ in entries]
        ).astype(np.int8)

        # Fixed seed so repeated runs pick the same initial centroids.
        rng = np.random.default_rng(42)
        idx = rng.choice(len(all_protos), self.N_CLUSTERS, replace=False)
        centroids = all_protos[idx].copy()

        for _ in range(self._KMEANS_MAX_ITERS):
            previous = centroids.copy()
            # dists[r, k] = number of components where prototype r and
            # centroid k disagree.
            dists = np.stack(
                [np.sum(all_protos != c, axis=1) for c in centroids],
                axis=1,
            )
            assigns = np.argmin(dists, axis=1)
            for i in range(self.N_CLUSTERS):
                mask = assigns == i
                # An empty cluster keeps its previous centroid.
                if np.any(mask):
                    summed = all_protos[mask].astype(np.int32).sum(axis=0)
                    new_c = np.sign(summed).astype(np.int8)
                    new_c[new_c == 0] = 1  # break ties towards +1
                    centroids[i] = new_c
            # Unchanged centroids are a fixed point: further passes would
            # reproduce the same assignments, so stopping is lossless.
            if np.array_equal(centroids, previous):
                break

        # NOTE(review): clusters are paired with LABELS purely by position;
        # nothing here ties a given cluster to a given speech act. Confirm
        # this is the intended zero-shot labelling.
        return dict(zip(self.LABELS, centroids))

    def _semantic_fingerprint(self, hv: np.ndarray) -> np.ndarray:
        """Denoise ``hv`` by bundling its nearest helix prototypes.

        Asks the helix memory for the top-3 matches and combines their
        prototypes by component-wise majority vote (sum, then sign; ties
        resolved to +1).

        Args:
            hv: Raw hypervector to look up.

        Returns:
            The bundled ``int8`` vector, or a copy of ``hv`` when the
            memory returns no matches.
        """
        matches = self.helix.retrieve(hv, top_k=3)
        if not matches:
            return hv.copy()
        protos = [proto for proto, _ in matches]
        stacked = np.stack(protos).astype(np.int32).sum(axis=0)
        result = np.sign(stacked).astype(np.int8)
        result[result == 0] = 1
        return result

    def evaluate_file(
        self,
        wav_path: Path,
        true_label: str,
        centroids: Dict[str, np.ndarray],
    ) -> Tuple[str, float, bool]:
        """Classify one audio file against the given centroids.

        Args:
            wav_path: Audio file passed to ``extract_features``.
            true_label: Expected label, compared with the prediction.
            centroids: Label-to-centroid mapping, e.g. the result of
                ``_build_centroids``.

        Returns:
            ``(predicted, confidence, correct)`` where ``confidence`` is
            the fraction of components on which the fingerprint equals
            the winning centroid.
        """
        mfcc, prosody = extract_features(wav_path)
        raw_hv = encode(mfcc, prosody)
        semantic_hv = self._semantic_fingerprint(raw_hv)
        sims = {
            label: float(np.mean(semantic_hv == centroid))
            for label, centroid in centroids.items()
        }
        predicted = max(sims, key=sims.get)
        confidence = sims[predicted]
        correct = predicted == true_label
        return predicted, confidence, correct

    def evaluate_directory(self, probe_dir: Path) -> Dict:
        """Evaluate every ``*.wav`` file under ``probe_dir``.

        Expected layout: ``probe_dir/<label>/<file>.wav`` where ``<label>``
        is one of ``LABELS``. Sub-directories with any other name are not
        evaluated; their names are reported under ``"skipped_dirs"``.
        Directories and files are visited in sorted order so the
        ``"details"`` lists are reproducible.

        Args:
            probe_dir: Root directory of the probe set.

        Returns:
            A dict whose ``"status"`` is ``"probe_dir_not_found"``,
            ``"no_files_found"`` or ``"complete"``; the complete form also
            carries overall and per-class accuracy, counts and per-file
            details.

        Raises:
            RuntimeError: Propagated from ``_build_centroids`` when the
                helix memory holds too few entries.
        """
        probe_dir = Path(probe_dir)
        # is_dir() also rejects a regular file, which would otherwise make
        # iterdir() raise.
        if not probe_dir.is_dir():
            return {"status": "probe_dir_not_found", "path": str(probe_dir)}

        centroids = self._build_centroids()
        results = {label: [] for label in self.LABELS}
        skipped_dirs = []
        total = 0
        correct = 0

        for label_dir in sorted(probe_dir.iterdir()):
            if not label_dir.is_dir():
                continue
            label = label_dir.name
            if label not in results:
                # Unknown label: no bucket exists for it and no prediction
                # could ever match it, so record and move on.
                skipped_dirs.append(label)
                continue
            for wav in sorted(label_dir.glob("*.wav")):
                pred, conf, is_correct = self.evaluate_file(
                    wav, label, centroids
                )
                results[label].append({
                    "file": wav.name,
                    "predicted": pred,
                    "confidence": round(conf, 4),
                    "correct": is_correct,
                })
                total += 1
                correct += int(is_correct)

        if total == 0:
            return {"status": "no_files_found", "skipped_dirs": skipped_dirs}

        per_class_acc = {
            label: round(
                sum(r["correct"] for r in items) / len(items), 4
            ) if items else 0.0
            for label, items in results.items()
        }
        return {
            "status": "complete",
            "overall_accuracy": round(correct / total, 4),
            "per_class": per_class_acc,
            "total_files": total,
            "helix_codes": len(self.helix.entries),
            "details": results,
            "skipped_dirs": skipped_dirs,
        }

    def evaluate_helix_health(self) -> Dict:
        """Summarise helix memory diversity and usage without any audio.

        Diversity is ``1 - mean pairwise similarity``, where similarity of
        two prototypes is the fraction of components on which they are
        equal.

        Returns:
            ``{"status": "insufficient_data"}`` with fewer than two
            entries; otherwise a dict with status (``"healthy"`` when
            diversity exceeds 0.1, else ``"low_diversity"``), entry count,
            diversity, average similarity, the sum of the per-entry
            counts and the index of the entry with the largest count.
        """
        entries = self.helix.entries
        if len(entries) < 2:
            return {"status": "insufficient_data"}

        protos = np.stack(
            [p for _, p, _, _ in entries]
        ).astype(np.float32)
        n = len(protos)
        flat = protos.reshape(n, -1)

        # One vectorised comparison per row covers all pairs (i, j > i),
        # in the same order a nested i/j loop would visit them.
        row_sims = [
            np.mean(flat[i + 1:] == flat[i], axis=1)
            for i in range(n - 1)
        ]
        sims = np.concatenate(row_sims)
        avg_sim = float(np.mean(sims)) if sims.size else 0.0
        diversity = round(1.0 - avg_sim, 4)

        usage_counts = [cnt for _, _, cnt, _ in entries]
        return {
            "status": "healthy" if diversity > 0.1 else "low_diversity",
            "helix_codes": n,
            "diversity_score": diversity,
            "avg_similarity": round(avg_sim, 4),
            "total_ingestions": sum(usage_counts),
            "most_used_code": int(np.argmax(usage_counts)),
        }
|