FerrellSyntheticIntelligence

Add deep cognition layer, curriculum runner, evaluation probe, updated loop and benchmark

653b8c1 about 11 hours ago

6.28 kB

	"""
	EvaluationProbe — Vitalis FSI

	Measures whether the system has learned to distinguish
	primitive speech acts after the curriculum.

	Speech acts measured:
	- question (interrogative)
	- instruction (imperative)
	- explanation (declarative/expository)

	No labeled audio required at runtime —
	uses helix prototype clustering for zero-shot evaluation.
	"""
	import numpy as np
	from pathlib import Path
	from typing import Dict, List, Tuple

	from src.dream_engine.helix_memory import HelixMemory
	from src.hdc_encoder.encoder import encode
	from src.audio_ear.feature_extractor import extract_features


	class EvaluationProbe:
	    """Zero-shot probe of how well helix memory separates speech acts.

	    The probe clusters every prototype stored in helix memory into
	    ``N_CLUSTERS`` groups, names the groups after ``LABELS`` and then
	    classifies audio files by similarity to the resulting centroids.

	    NOTE(review): the hypervector arithmetic below (``np.sign`` with zeros
	    forced to ``+1``) assumes prototypes are bipolar (+1/-1) — TODO confirm
	    against ``HelixMemory``.
	    """

	    N_CLUSTERS = 3
	    LABELS = ["question", "instruction", "explanation"]
	    # k-means settings used by ``_build_centroids``.
	    KMEANS_ITERATIONS = 6
	    KMEANS_SEED = 42

	    def __init__(self, helix_path: Path = None):
	        """Open the helix memory used for evaluation.

	        Args:
	            helix_path: Location handed to ``HelixMemory``. When omitted,
	                ``~/.vitalis_workspace/helix_memory.pkl`` is used.
	        """
	        self.helix_path = helix_path or (
	            Path.home() / ".vitalis_workspace" / "helix_memory.pkl"
	        )
	        self.helix = HelixMemory(self.helix_path)

	    def _build_centroids(self) -> Dict[str, np.ndarray]:
	        """Cluster all stored helix prototypes into one centroid per label.

	        Runs a seeded k-means variant that uses Hamming distance for the
	        assignment step and an element-wise majority vote for the update
	        step.

	        NOTE(review): centroids are paired with ``LABELS`` purely by
	        position; nothing in this method ties a cluster to the speech act
	        it is named after.

	        Returns:
	            Mapping from label name to its centroid hypervector (int8).

	        Raises:
	            RuntimeError: If helix memory holds fewer than ``N_CLUSTERS``
	                entries.
	        """
	        entries = self.helix.entries
	        if len(entries) < self.N_CLUSTERS:
	            raise RuntimeError(
	                f"Need at least {self.N_CLUSTERS} helix codes. "
	                f"Run curriculum first."
	            )

	        all_protos = np.stack(
	            [proto for _, proto, _, _ in entries]
	        ).astype(np.int8)

	        # Fixed seed so repeated evaluations pick the same initial centroids.
	        rng = np.random.default_rng(self.KMEANS_SEED)
	        idx = rng.choice(len(all_protos), self.N_CLUSTERS, replace=False)
	        centroids = all_protos[idx].copy()

	        for _ in range(self.KMEANS_ITERATIONS):
	            # Hamming distance from every prototype to every centroid.
	            dists = np.stack(
	                [np.sum(all_protos != c, axis=1) for c in centroids],
	                axis=1,
	            )
	            assigns = np.argmin(dists, axis=1)

	            updated = centroids.copy()
	            for i in range(self.N_CLUSTERS):
	                members = all_protos[assigns == i]
	                if len(members) == 0:
	                    # An empty cluster keeps its previous centroid.
	                    continue
	                # Widen before summing so int8 cannot overflow.
	                summed = members.astype(np.int32).sum(axis=0)
	                new_c = np.sign(summed).astype(np.int8)
	                new_c[new_c == 0] = 1  # break ties towards +1
	                updated[i] = new_c

	            # A fixed point cannot change in later rounds, so stop early.
	            if np.array_equal(updated, centroids):
	                break
	            centroids = updated

	        return dict(zip(self.LABELS, centroids))

	    def _semantic_fingerprint(
	        self, hv: np.ndarray, top_k: int = 3
	    ) -> np.ndarray:
	        """Denoise ``hv`` by bundling its nearest helix prototypes.

	        The ``top_k`` prototypes returned by ``self.helix.retrieve`` are
	        summed element-wise and reduced with ``np.sign`` (a majority vote,
	        not an XOR); ties resolve to ``+1``.

	        Args:
	            hv: Raw hypervector to look up.
	            top_k: Number of prototypes to request from helix memory.

	        Returns:
	            The bundled int8 hypervector, or a copy of ``hv`` when helix
	            memory returns no matches.
	        """
	        matches = self.helix.retrieve(hv, top_k=top_k)
	        if not matches:
	            return hv.copy()
	        protos = [proto for proto, _ in matches]
	        stacked = np.stack(protos).astype(np.int32).sum(axis=0)
	        result = np.sign(stacked).astype(np.int8)
	        result[result == 0] = 1
	        return result

	    def evaluate_file(
	        self,
	        wav_path: Path,
	        true_label: str,
	        centroids: Dict[str, np.ndarray],
	    ) -> Tuple[str, float, bool]:
	        """Classify one audio file against the given centroids.

	        Args:
	            wav_path: Audio file passed to ``extract_features``.
	            true_label: Expected label, compared with the prediction.
	            centroids: Label-to-centroid mapping, as produced by
	                ``_build_centroids``.

	        Returns:
	            ``(predicted, confidence, correct)`` where ``confidence`` is the
	            fraction of hypervector positions that match the winning
	            centroid. On a tie the first label in ``centroids`` wins.
	        """
	        mfcc, prosody = extract_features(wav_path)
	        raw_hv = encode(mfcc, prosody)
	        semantic_hv = self._semantic_fingerprint(raw_hv)

	        sims = {
	            label: float(np.mean(semantic_hv == centroid))
	            for label, centroid in centroids.items()
	        }
	        predicted = max(sims, key=sims.get)
	        confidence = sims[predicted]
	        correct = predicted == true_label
	        return predicted, confidence, correct

	    def evaluate_directory(self, probe_dir: Path) -> Dict:
	        """Evaluate every ``*.wav`` file under ``probe_dir``.

	        Expected layout: ``probe_dir/<label>/<file>.wav`` where ``<label>``
	        is one of ``LABELS``. Sub-directories with any other name cannot be
	        scored; they are skipped and listed under ``"skipped_dirs"``.
	        Directories and files are visited in sorted order so the report is
	        reproducible.

	        Args:
	            probe_dir: Root directory of the labelled probe set.

	        Returns:
	            A report dict whose ``"status"`` is ``"probe_dir_not_found"``
	            (missing path or not a directory), ``"no_files_found"``, or
	            ``"complete"`` with overall and per-class accuracy.

	        Raises:
	            RuntimeError: Propagated from ``_build_centroids`` when helix
	                memory holds too few entries.
	        """
	        # is_dir() also rejects a regular file, which iterdir() cannot walk.
	        if not probe_dir.is_dir():
	            return {"status": "probe_dir_not_found", "path": str(probe_dir)}

	        centroids = self._build_centroids()
	        results = {label: [] for label in self.LABELS}
	        skipped_dirs = []
	        total = 0
	        correct = 0

	        for label_dir in sorted(probe_dir.iterdir()):
	            if not label_dir.is_dir():
	                continue
	            label = label_dir.name
	            if label not in results:
	                # Unknown label: no centroid carries this name.
	                skipped_dirs.append(label)
	                continue
	            for wav in sorted(label_dir.glob("*.wav")):
	                pred, conf, is_correct = self.evaluate_file(
	                    wav, label, centroids
	                )
	                results[label].append({
	                    "file": wav.name,
	                    "predicted": pred,
	                    "confidence": round(conf, 4),
	                    "correct": is_correct,
	                })
	                total += 1
	                correct += int(is_correct)

	        if total == 0:
	            return {"status": "no_files_found", "skipped_dirs": skipped_dirs}

	        per_class_acc = {
	            label: round(
	                sum(r["correct"] for r in items) / len(items), 4
	            ) if items else 0.0
	            for label, items in results.items()
	        }

	        return {
	            "status": "complete",
	            "overall_accuracy": round(correct / total, 4),
	            "per_class": per_class_acc,
	            "total_files": total,
	            "helix_codes": len(self.helix.entries),
	            "skipped_dirs": skipped_dirs,
	            "details": results,
	        }

	    def evaluate_helix_health(self) -> Dict:
	        """Summarise prototype diversity and usage without any audio.

	        Diversity is ``1 - mean pairwise similarity``, where similarity is
	        the fraction of positions at which two prototypes agree.

	        Returns:
	            ``{"status": "insufficient_data"}`` when fewer than two entries
	            exist; otherwise a report whose status is ``"healthy"`` when
	            diversity exceeds 0.1 and ``"low_diversity"`` if not.
	        """
	        entries = self.helix.entries
	        n = len(entries)
	        if n < 2:
	            return {"status": "insufficient_data"}

	        protos = np.stack([p for _, p, _, _ in entries])

	        # Compare each prototype with all later ones in a single vectorized
	        # step; pairs come out in the order (0,1), (0,2), ..., (n-2,n-1).
	        pair_sims = np.concatenate([
	            np.mean(protos[i + 1:] == protos[i], axis=1)
	            for i in range(n - 1)
	        ])

	        avg_sim = float(np.mean(pair_sims))
	        diversity = round(1.0 - avg_sim, 4)

	        # The third field of each entry is summed as its ingestion count.
	        usage_counts = [cnt for _, _, cnt, _ in entries]

	        return {
	            "status": "healthy" if diversity > 0.1 else "low_diversity",
	            "helix_codes": n,
	            "diversity_score": diversity,
	            "avg_similarity": round(avg_sim, 4),
	            "total_ingestions": sum(usage_counts),
	            "most_used_code": int(np.argmax(usage_counts)),
	        }