# FerrellSyntheticIntelligence
# Add deep cognition layer, curriculum runner, evaluation probe, updated loop and benchmark
# 653b8c1
"""
EvaluationProbe — Vitalis FSI
Measures whether the system has learned to distinguish
primitive speech acts after the curriculum.
Speech acts measured:
- question (interrogative)
- instruction (imperative)
- explanation (declarative/expository)
No labeled audio required at runtime —
uses helix prototype clustering for zero-shot evaluation.
"""
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

from src.dream_engine.helix_memory import HelixMemory
from src.hdc_encoder.encoder import encode
from src.audio_ear.feature_extractor import extract_features
class EvaluationProbe:
    """Score speech-act separation using the prototypes stored in helix memory.

    The probe clusters every stored helix prototype into ``N_CLUSTERS``
    centroids, pairs those centroids with ``LABELS``, and classifies an audio
    file by the centroid its denoised hypervector agrees with most.
    """

    N_CLUSTERS = 3
    LABELS = ["question", "instruction", "explanation"]
    # Upper bound on k-means refinement passes in ``_build_centroids``.
    _MAX_KMEANS_ITERS = 6

    def __init__(self, helix_path: Optional[Path] = None):
        """Open the helix memory at *helix_path*.

        Args:
            helix_path: Location handed to ``HelixMemory``. When falsy, falls
                back to ``~/.vitalis_workspace/helix_memory.pkl``.
        """
        self.helix_path = helix_path or (
            Path.home() / ".vitalis_workspace" / "helix_memory.pkl"
        )
        self.helix = HelixMemory(self.helix_path)

    @staticmethod
    def _majority_bundle(vectors: np.ndarray) -> np.ndarray:
        """Bundle stacked vectors by component-wise majority vote.

        Components are summed along axis 0 (in int32 to avoid int8 overflow)
        and reduced to their sign; a zero sum (tie) resolves to ``+1``.
        """
        summed = np.asarray(vectors).astype(np.int32).sum(axis=0)
        bundled = np.sign(summed).astype(np.int8)
        bundled[bundled == 0] = 1
        return bundled

    def _build_centroids(self) -> Dict[str, np.ndarray]:
        """Cluster all stored helix prototypes into one centroid per label.

        Runs a seeded k-means that uses the number of differing components
        as the distance and a majority-vote bundle as the centroid update.

        Returns:
            Mapping from each entry of ``LABELS`` to an int8 centroid.

        Raises:
            RuntimeError: If fewer than ``N_CLUSTERS`` helix entries exist.
        """
        if len(self.helix.entries) < self.N_CLUSTERS:
            raise RuntimeError(
                f"Need at least {self.N_CLUSTERS} helix codes. "
                f"Run curriculum first."
            )
        all_protos = np.stack(
            [proto for _, proto, _, _ in self.helix.entries]
        ).astype(np.int8)

        # Fixed seed so the initial centroids are reproducible.
        rng = np.random.default_rng(42)
        seed_idx = rng.choice(len(all_protos), self.N_CLUSTERS, replace=False)
        centroids = all_protos[seed_idx].copy()

        previous = None
        for _ in range(self._MAX_KMEANS_ITERS):
            dists = np.stack(
                [np.count_nonzero(all_protos != c, axis=1) for c in centroids],
                axis=1,
            )
            assigns = np.argmin(dists, axis=1)
            # Unchanged assignments reproduce the same centroids, so any
            # further pass would be a no-op.
            if previous is not None and np.array_equal(assigns, previous):
                break
            previous = assigns
            for i in range(self.N_CLUSTERS):
                members = assigns == i
                # An empty cluster keeps its previous centroid.
                if np.any(members):
                    centroids[i] = self._majority_bundle(all_protos[members])

        # NOTE(review): clusters are paired with LABELS purely by position;
        # nothing in this method aligns cluster i with the speech act named
        # LABELS[i]. Confirm whether an alignment step is expected elsewhere.
        return dict(zip(self.LABELS, centroids))

    def _semantic_fingerprint(self, hv: np.ndarray) -> np.ndarray:
        """Denoise *hv* by majority-bundling its top-3 helix matches.

        Asks ``self.helix.retrieve`` for the three best matches and combines
        their prototypes by component-wise majority vote (ties become ``+1``).
        When nothing is retrieved, a copy of *hv* is returned unchanged.
        """
        matches = self.helix.retrieve(hv, top_k=3)
        if not matches:
            return hv.copy()
        return self._majority_bundle(np.stack([proto for proto, _ in matches]))

    def evaluate_file(
        self,
        wav_path: Path,
        true_label: str,
        centroids: Dict[str, np.ndarray],
    ) -> Tuple[str, float, bool]:
        """Classify one audio file against the given centroids.

        Args:
            wav_path: Audio file passed to ``extract_features``.
            true_label: Expected label, used only to compute ``correct``.
            centroids: Label-to-centroid mapping, e.g. from
                ``_build_centroids``.

        Returns:
            ``(predicted, confidence, correct)`` where ``confidence`` is the
            fraction of components on which the fingerprint equals the
            winning centroid.
        """
        mfcc, prosody = extract_features(wav_path)
        raw_hv = encode(mfcc, prosody)
        semantic_hv = self._semantic_fingerprint(raw_hv)
        sims = {
            label: float(np.mean(semantic_hv == centroid))
            for label, centroid in centroids.items()
        }
        # On equal similarity the first label in iteration order wins.
        predicted = max(sims, key=sims.get)
        return predicted, sims[predicted], predicted == true_label

    def evaluate_directory(self, probe_dir: Path) -> Dict:
        """Evaluate every ``*.wav`` under ``probe_dir/<label>/``.

        Only sub-directories named after an entry of ``LABELS`` are scored;
        any other sub-directory is skipped and reported under
        ``skipped_dirs`` instead of raising ``KeyError``. Directories and
        files are visited in sorted order so ``details`` is reproducible.

        Args:
            probe_dir: Root directory laid out as ``probe_dir/label/file.wav``.

        Returns:
            A status dict. ``{"status": "probe_dir_not_found", ...}`` when
            *probe_dir* is not an existing directory,
            ``{"status": "no_files_found"}`` when no wav file was scored,
            otherwise a ``"complete"`` report with overall and per-class
            accuracy.

        Raises:
            RuntimeError: Propagated from ``_build_centroids`` when helix
                memory holds too few entries.
        """
        probe_dir = Path(probe_dir)
        if not probe_dir.is_dir():
            return {"status": "probe_dir_not_found", "path": str(probe_dir)}

        centroids = self._build_centroids()
        results = {label: [] for label in self.LABELS}
        skipped_dirs = []
        total = 0
        correct = 0
        for label_dir in sorted(probe_dir.iterdir()):
            if not label_dir.is_dir():
                continue
            label = label_dir.name
            if label not in results:
                # No centroid can ever predict this label, so it is not scored.
                skipped_dirs.append(label)
                continue
            for wav in sorted(label_dir.glob("*.wav")):
                pred, conf, is_correct = self.evaluate_file(
                    wav, label, centroids
                )
                results[label].append({
                    "file": wav.name,
                    "predicted": pred,
                    "confidence": round(conf, 4),
                    "correct": is_correct,
                })
                total += 1
                correct += int(is_correct)

        if total == 0:
            return {"status": "no_files_found"}

        per_class_acc = {
            label: round(
                sum(r["correct"] for r in items) / len(items), 4
            ) if items else 0.0
            for label, items in results.items()
        }
        return {
            "status": "complete",
            "overall_accuracy": round(correct / total, 4),
            "per_class": per_class_acc,
            "total_files": total,
            "helix_codes": len(self.helix.entries),
            "details": results,
            "skipped_dirs": skipped_dirs,
        }

    def evaluate_helix_health(self) -> Dict:
        """Report prototype diversity of helix memory without any audio.

        Similarity between two prototypes is the fraction of components on
        which they are equal; ``diversity_score`` is one minus the average
        similarity over all unordered pairs.

        Returns:
            ``{"status": "insufficient_data"}`` with fewer than two entries,
            otherwise a report whose status is ``"healthy"`` when the rounded
            diversity exceeds 0.1 and ``"low_diversity"`` otherwise.
        """
        entries = self.helix.entries
        if len(entries) < 2:
            return {"status": "insufficient_data"}

        protos = np.stack([p for _, p, _, _ in entries])
        n = len(protos)
        flat = protos.reshape(n, -1)

        # Count matching components over every pair i < j, one row at a time;
        # integer counts keep the average exact until the final division.
        matching = 0
        for i in range(n - 1):
            matching += int(np.count_nonzero(flat[i + 1:] == flat[i]))
        compared = (n * (n - 1) // 2) * flat.shape[1]
        avg_sim = matching / compared if compared else 0.0
        diversity = round(1.0 - avg_sim, 4)

        usage_counts = [cnt for _, _, cnt, _ in entries]
        return {
            "status": "healthy" if diversity > 0.1 else "low_diversity",
            "helix_codes": n,
            "diversity_score": diversity,
            "avg_similarity": round(avg_sim, 4),
            "total_ingestions": sum(usage_counts),
            # Position within helix.entries of the highest count.
            "most_used_code": int(np.argmax(usage_counts)),
        }