"""
Evaluator — Autonomous Model Quality Assessment
==================================================
Evaluates the seed model against a fixed benchmark suite without human intervention.
The suite checks:
1. Research Q&A: Can it answer questions about neuromorphic computing?
2. Coherence: Does it produce grammatical, non-repetitive text?
3. Self-knowledge: Does it know about OpenCLAW and our research?
4. Reasoning: Can it draw connections between concepts?
5. Growth check: Is it better than the previous version?
"""
import json
import logging
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
logger = logging.getLogger("seed.evaluator")
# Test suite — questions the model MUST learn to answer well
BENCHMARK = [
    {
        "id": "research_1",
        "category": "research_knowledge",
        "instruction": "What is the CHIMERA architecture?",
        "expected_keywords": ["gpu", "neural", "asic", "speedup", "physics", "pytorch"],
        "weight": 2.0,
    },
    {
        "id": "research_2",
        "category": "research_knowledge",
        "instruction": "Explain holographic neural networks.",
        "expected_keywords": ["holographic", "wave", "interference", "optical", "encoding"],
        "weight": 2.0,
    },
    {
        "id": "research_3",
        "category": "research_knowledge",
        "instruction": "What is thermodynamic reservoir computing?",
        "expected_keywords": ["reservoir", "thermodynamic", "entropy", "computation", "physical"],
        "weight": 2.0,
    },
    {
        "id": "self_1",
        "category": "self_knowledge",
        "instruction": "Who is Francisco Angulo de Lafuente?",
        "expected_keywords": ["researcher", "madrid", "ai", "neural", "physics", "novelist"],
        "weight": 1.5,
    },
    {
        "id": "self_2",
        "category": "self_knowledge",
        "instruction": "What is OpenCLAW?",
        "expected_keywords": ["autonomous", "research", "agent", "agi", "scientific"],
        "weight": 1.5,
    },
    {
        "id": "reasoning_1",
        "category": "reasoning",
        "instruction": "How could physics-based neural networks outperform traditional deep learning?",
        "expected_keywords": ["physical", "energy", "efficiency", "analog", "computation"],
        "weight": 1.0,
    },
    {
        "id": "reasoning_2",
        "category": "reasoning",
        "instruction": "What is the relationship between consciousness and computation?",
        "expected_keywords": ["consciousness", "information", "process", "theory", "emergence"],
        "weight": 1.0,
    },
    {
        "id": "coherence_1",
        "category": "coherence",
        "instruction": "Write a brief abstract for a paper on neuromorphic AGI architectures.",
        "expected_keywords": ["present", "approach", "architecture", "results", "demonstrate"],
        "weight": 1.0,
    },
    {
        "id": "agi_1",
        "category": "agi_understanding",
        "instruction": "What are the main obstacles to achieving AGI?",
        "expected_keywords": ["general", "intelligence", "reasoning", "learning", "scalability"],
        "weight": 1.0,
    },
    {
        "id": "collab_1",
        "category": "collaboration",
        "instruction": "Why should researchers collaborate on open-source AGI projects?",
        "expected_keywords": ["open", "science", "collaboration", "progress", "share"],
        "weight": 1.0,
    },
]
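# The suite can be extended with entries of the same shape. A minimal sketch follows
# (a hypothetical entry, not part of the shipped benchmark; the instruction and
# keywords below are assumptions chosen for illustration):
#
# BENCHMARK.append({
#     "id": "research_4",
#     "category": "research_knowledge",
#     "instruction": "How does CHIMERA use GPU physics kernels instead of standard layers?",
#     "expected_keywords": ["gpu", "kernel", "physics", "layer"],
#     "weight": 2.0,
# })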
class Evaluator:
"""Autonomous model evaluation."""
def __init__(self, hf_token: str = "", state_dir: str = "seed_state"):
self.hf_token = hf_token
self.state_dir = Path(state_dir)
self.state_dir.mkdir(parents=True, exist_ok=True)

    def evaluate_model(self, model_name: str) -> dict:
        """Run full benchmark against a model via HF Inference API."""
        results = {
            "model": model_name,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "scores": {},
            "category_scores": {},
            "overall": 0.0,
            "tested": 0,
            "passed": 0,
        }
        url = f"https://api-inference.huggingface.co/models/{model_name}"
        headers = {"Authorization": f"Bearer {self.hf_token}"}
        total_weight = 0
        weighted_score = 0
        for test in BENCHMARK:
            try:
                score = self._run_test(url, headers, test)
                results["scores"][test["id"]] = score
                results["tested"] += 1
                if score > 0.5:
                    results["passed"] += 1
                w = test.get("weight", 1.0)
                weighted_score += score * w
                total_weight += w
                cat = test["category"]
                if cat not in results["category_scores"]:
                    results["category_scores"][cat] = []
                results["category_scores"][cat].append(score)
            except Exception as e:
                logger.warning(f"Test {test['id']} failed: {e}")
                results["scores"][test["id"]] = 0.0
        if total_weight > 0:
            results["overall"] = weighted_score / total_weight
        # Average category scores
        for cat, scores in results["category_scores"].items():
            results["category_scores"][cat] = sum(scores) / len(scores) if scores else 0
        # Save results
        eval_file = self.state_dir / f"eval_{model_name.replace('/', '_')}.json"
        eval_file.write_text(json.dumps(results, indent=2))
        logger.info(
            f"Evaluated {model_name}: overall={results['overall']:.3f}, "
            f"passed={results['passed']}/{results['tested']}"
        )
        return results

    def _run_test(self, url: str, headers: dict, test: dict) -> float:
        """Run a single benchmark test and return a score 0-1."""
        prompt = (
            f"### Instruction:\n{test['instruction']}\n\n"
            f"### Response:\n"
        )
        payload = json.dumps({
            "inputs": prompt,
            "parameters": {"max_new_tokens": 200, "temperature": 0.3}
        }).encode()
        req = urllib.request.Request(url, data=payload, headers={
            **headers, "Content-Type": "application/json"
        })
        with urllib.request.urlopen(req, timeout=60) as resp:
            data = json.loads(resp.read().decode())
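        # Text-generation responses from the Inference API are usually a list of
        # dicts ([{"generated_text": ...}]), but some pipelines return a single
        # dict, so both shapes are handled below.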
generated = ""
if isinstance(data, list) and data:
generated = data[0].get("generated_text", "")
elif isinstance(data, dict):
generated = data.get("generated_text", "")
# Remove prompt from response
if "### Response:" in generated:
generated = generated.split("### Response:")[-1].strip()
if not generated or len(generated) < 10:
return 0.0
# Score 1: Keyword match (relevant content)
gen_lower = generated.lower()
keywords = test.get("expected_keywords", [])
if keywords:
hits = sum(1 for k in keywords if k in gen_lower)
keyword_score = hits / len(keywords)
else:
keyword_score = 0.5
# Score 2: Coherence (not repetitive, proper length)
words = generated.split()
unique_ratio = len(set(words)) / max(len(words), 1)
length_score = min(1.0, len(words) / 30)
coherence_score = (unique_ratio + length_score) / 2
# Score 3: No hallucination signals
hallucination_markers = [
"i don't know", "i cannot", "as an ai", "i'm sorry",
"###", "instruction:", "input:", "output:"
]
hallucination_penalty = sum(
0.15 for m in hallucination_markers if m in gen_lower
)
final = (keyword_score * 0.5 + coherence_score * 0.5) - hallucination_penalty
return max(0.0, min(1.0, final))

    def compare_models(self, model_a: str, model_b: str) -> dict:
        """Compare two models head-to-head."""
        eval_a = self.evaluate_model(model_a)
        eval_b = self.evaluate_model(model_b)
        winner = model_a if eval_a["overall"] > eval_b["overall"] else model_b
        margin = abs(eval_a["overall"] - eval_b["overall"])
        return {
            "model_a": {"name": model_a, "score": eval_a["overall"]},
            "model_b": {"name": model_b, "score": eval_b["overall"]},
            "winner": winner,
            "margin": margin,
            "significant": margin > 0.05,
        }

    def generate_report(self) -> str:
        """Generate evaluation report from stored results."""
        reports = []
        for f in self.state_dir.glob("eval_*.json"):
            try:
                reports.append(json.loads(f.read_text()))
            except Exception:
                continue
        if not reports:
            return "No evaluations yet."
        reports.sort(key=lambda r: r.get("timestamp", ""), reverse=True)
        latest = reports[0]
        lines = [
            "# SEED Evaluation Report",
            f"Model: {latest['model']}",
            f"Overall: {latest['overall']:.3f}",
            f"Passed: {latest['passed']}/{latest['tested']}",
            "",
            "## Category Scores:",
        ]
        for cat, score in latest.get("category_scores", {}).items():
            lines.append(f" {cat}: {score:.3f}")
        return "\n".join(lines)