""" Evaluator — Autonomous Model Quality Assessment ================================================== Tests the seed model against benchmarks without human intervention. Tests: 1. Research Q&A: Can it answer questions about neuromorphic computing? 2. Coherence: Does it produce grammatical, non-repetitive text? 3. Self-knowledge: Does it know about OpenCLAW and our research? 4. Reasoning: Can it draw connections between concepts? 5. Growth check: Is it better than the previous version? """ import json import logging import urllib.request from datetime import datetime, timezone from pathlib import Path logger = logging.getLogger("seed.evaluator") # Test suite — questions the model MUST learn to answer well BENCHMARK = [ { "id": "research_1", "category": "research_knowledge", "instruction": "What is the CHIMERA architecture?", "expected_keywords": ["gpu", "neural", "asic", "speedup", "physics", "pytorch"], "weight": 2.0, }, { "id": "research_2", "category": "research_knowledge", "instruction": "Explain holographic neural networks.", "expected_keywords": ["holographic", "wave", "interference", "optical", "encoding"], "weight": 2.0, }, { "id": "research_3", "category": "research_knowledge", "instruction": "What is thermodynamic reservoir computing?", "expected_keywords": ["reservoir", "thermodynamic", "entropy", "computation", "physical"], "weight": 2.0, }, { "id": "self_1", "category": "self_knowledge", "instruction": "Who is Francisco Angulo de Lafuente?", "expected_keywords": ["researcher", "madrid", "ai", "neural", "physics", "novelist"], "weight": 1.5, }, { "id": "self_2", "category": "self_knowledge", "instruction": "What is OpenCLAW?", "expected_keywords": ["autonomous", "research", "agent", "agi", "scientific"], "weight": 1.5, }, { "id": "reasoning_1", "category": "reasoning", "instruction": "How could physics-based neural networks outperform traditional deep learning?", "expected_keywords": ["physical", "energy", "efficiency", "analog", "computation"], "weight": 1.0, }, { "id": "reasoning_2", "category": "reasoning", "instruction": "What is the relationship between consciousness and computation?", "expected_keywords": ["consciousness", "information", "process", "theory", "emergence"], "weight": 1.0, }, { "id": "coherence_1", "category": "coherence", "instruction": "Write a brief abstract for a paper on neuromorphic AGI architectures.", "expected_keywords": ["present", "approach", "architecture", "results", "demonstrate"], "weight": 1.0, }, { "id": "agi_1", "category": "agi_understanding", "instruction": "What are the main obstacles to achieving AGI?", "expected_keywords": ["general", "intelligence", "reasoning", "learning", "scalability"], "weight": 1.0, }, { "id": "collab_1", "category": "collaboration", "instruction": "Why should researchers collaborate on open-source AGI projects?", "expected_keywords": ["open", "science", "collaboration", "progress", "share"], "weight": 1.0, }, ] class Evaluator: """Autonomous model evaluation.""" def __init__(self, hf_token: str = "", state_dir: str = "seed_state"): self.hf_token = hf_token self.state_dir = Path(state_dir) self.state_dir.mkdir(parents=True, exist_ok=True) def evaluate_model(self, model_name: str) -> dict: """Run full benchmark against a model via HF Inference API.""" results = { "model": model_name, "timestamp": datetime.now(timezone.utc).isoformat(), "scores": {}, "category_scores": {}, "overall": 0.0, "tested": 0, "passed": 0, } url = f"https://api-inference.huggingface.co/models/{model_name}" headers = {"Authorization": f"Bearer {self.hf_token}"} total_weight = 0 weighted_score = 0 for test in BENCHMARK: try: score = self._run_test(url, headers, test) results["scores"][test["id"]] = score results["tested"] += 1 if score > 0.5: results["passed"] += 1 w = test.get("weight", 1.0) weighted_score += score * w total_weight += w cat = test["category"] if cat not in results["category_scores"]: results["category_scores"][cat] = [] results["category_scores"][cat].append(score) except Exception as e: logger.warning(f"Test {test['id']} failed: {e}") results["scores"][test["id"]] = 0.0 if total_weight > 0: results["overall"] = weighted_score / total_weight # Average category scores for cat, scores in results["category_scores"].items(): results["category_scores"][cat] = sum(scores) / len(scores) if scores else 0 # Save results eval_file = self.state_dir / f"eval_{model_name.replace('/', '_')}.json" eval_file.write_text(json.dumps(results, indent=2)) logger.info( f"Evaluated {model_name}: overall={results['overall']:.3f}, " f"passed={results['passed']}/{results['tested']}" ) return results def _run_test(self, url: str, headers: dict, test: dict) -> float: """Run a single benchmark test and return a score 0-1.""" prompt = ( f"### Instruction:\n{test['instruction']}\n\n" f"### Response:\n" ) payload = json.dumps({ "inputs": prompt, "parameters": {"max_new_tokens": 200, "temperature": 0.3} }).encode() req = urllib.request.Request(url, data=payload, headers={ **headers, "Content-Type": "application/json" }) with urllib.request.urlopen(req, timeout=60) as resp: data = json.loads(resp.read().decode()) generated = "" if isinstance(data, list) and data: generated = data[0].get("generated_text", "") elif isinstance(data, dict): generated = data.get("generated_text", "") # Remove prompt from response if "### Response:" in generated: generated = generated.split("### Response:")[-1].strip() if not generated or len(generated) < 10: return 0.0 # Score 1: Keyword match (relevant content) gen_lower = generated.lower() keywords = test.get("expected_keywords", []) if keywords: hits = sum(1 for k in keywords if k in gen_lower) keyword_score = hits / len(keywords) else: keyword_score = 0.5 # Score 2: Coherence (not repetitive, proper length) words = generated.split() unique_ratio = len(set(words)) / max(len(words), 1) length_score = min(1.0, len(words) / 30) coherence_score = (unique_ratio + length_score) / 2 # Score 3: No hallucination signals hallucination_markers = [ "i don't know", "i cannot", "as an ai", "i'm sorry", "###", "instruction:", "input:", "output:" ] hallucination_penalty = sum( 0.15 for m in hallucination_markers if m in gen_lower ) final = (keyword_score * 0.5 + coherence_score * 0.5) - hallucination_penalty return max(0.0, min(1.0, final)) def compare_models(self, model_a: str, model_b: str) -> dict: """Compare two models head-to-head.""" eval_a = self.evaluate_model(model_a) eval_b = self.evaluate_model(model_b) winner = model_a if eval_a["overall"] > eval_b["overall"] else model_b margin = abs(eval_a["overall"] - eval_b["overall"]) return { "model_a": {"name": model_a, "score": eval_a["overall"]}, "model_b": {"name": model_b, "score": eval_b["overall"]}, "winner": winner, "margin": margin, "significant": margin > 0.05, } def generate_report(self) -> str: """Generate evaluation report from stored results.""" reports = [] for f in self.state_dir.glob("eval_*.json"): try: reports.append(json.loads(f.read_text())) except Exception: continue if not reports: return "No evaluations yet." reports.sort(key=lambda r: r.get("timestamp", ""), reverse=True) latest = reports[0] lines = [ f"# SEED Evaluation Report", f"Model: {latest['model']}", f"Overall: {latest['overall']:.3f}", f"Passed: {latest['passed']}/{latest['tested']}", "", "## Category Scores:", ] for cat, score in latest.get("category_scores", {}).items(): lines.append(f" {cat}: {score:.3f}") return "\n".join(lines)