"""
Evaluator — Autonomous Model Quality Assessment
==================================================
Evaluates the seed model against a fixed benchmark suite without human intervention.
The suite checks:
1. Research Q&A: Can it answer questions about neuromorphic computing?
2. Coherence: Does it produce grammatical, non-repetitive text?
3. Self-knowledge: Does it know about OpenCLAW and our research?
4. Reasoning: Can it draw connections between concepts?
5. Growth check: Is it better than the previous version?
"""
import json
import logging
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
logger = logging.getLogger("seed.evaluator")
# Test suite — questions the model MUST learn to answer well
BENCHMARK = [
    {
        "id": "research_1",
        "category": "research_knowledge",
        "instruction": "What is the CHIMERA architecture?",
        "expected_keywords": ["gpu", "neural", "asic", "speedup", "physics", "pytorch"],
        "weight": 2.0,
    },
    {
        "id": "research_2",
        "category": "research_knowledge",
        "instruction": "Explain holographic neural networks.",
        "expected_keywords": ["holographic", "wave", "interference", "optical", "encoding"],
        "weight": 2.0,
    },
    {
        "id": "research_3",
        "category": "research_knowledge",
        "instruction": "What is thermodynamic reservoir computing?",
        "expected_keywords": ["reservoir", "thermodynamic", "entropy", "computation", "physical"],
        "weight": 2.0,
    },
    {
        "id": "self_1",
        "category": "self_knowledge",
        "instruction": "Who is Francisco Angulo de Lafuente?",
        "expected_keywords": ["researcher", "madrid", "ai", "neural", "physics", "novelist"],
        "weight": 1.5,
    },
    {
        "id": "self_2",
        "category": "self_knowledge",
        "instruction": "What is OpenCLAW?",
        "expected_keywords": ["autonomous", "research", "agent", "agi", "scientific"],
        "weight": 1.5,
    },
    {
        "id": "reasoning_1",
        "category": "reasoning",
        "instruction": "How could physics-based neural networks outperform traditional deep learning?",
        "expected_keywords": ["physical", "energy", "efficiency", "analog", "computation"],
        "weight": 1.0,
    },
    {
        "id": "reasoning_2",
        "category": "reasoning",
        "instruction": "What is the relationship between consciousness and computation?",
        "expected_keywords": ["consciousness", "information", "process", "theory", "emergence"],
        "weight": 1.0,
    },
    {
        "id": "coherence_1",
        "category": "coherence",
        "instruction": "Write a brief abstract for a paper on neuromorphic AGI architectures.",
        "expected_keywords": ["present", "approach", "architecture", "results", "demonstrate"],
        "weight": 1.0,
    },
    {
        "id": "agi_1",
        "category": "agi_understanding",
        "instruction": "What are the main obstacles to achieving AGI?",
        "expected_keywords": ["general", "intelligence", "reasoning", "learning", "scalability"],
        "weight": 1.0,
    },
    {
        "id": "collab_1",
        "category": "collaboration",
        "instruction": "Why should researchers collaborate on open-source AGI projects?",
        "expected_keywords": ["open", "science", "collaboration", "progress", "share"],
        "weight": 1.0,
    },
]
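# The suite can be extended with entries of the same shape. A minimal sketch follows
# (a hypothetical entry, not part of the shipped benchmark; the instruction and
# keywords below are assumptions chosen for illustration):
#
# BENCHMARK.append({
#     "id": "research_4",
#     "category": "research_knowledge",
#     "instruction": "How does CHIMERA use GPU physics kernels instead of standard layers?",
#     "expected_keywords": ["gpu", "kernel", "physics", "layer"],
#     "weight": 2.0,
# })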
class Evaluator:
"""Autonomous model evaluation."""
def __init__(self, hf_token: str = "", state_dir: str = "seed_state"):
self.hf_token = hf_token
self.state_dir = Path(state_dir)
self.state_dir.mkdir(parents=True, exist_ok=True)

    def evaluate_model(self, model_name: str) -> dict:
        """Run full benchmark against a model via HF Inference API."""
        results = {
            "model": model_name,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "scores": {},
            "category_scores": {},
            "overall": 0.0,
            "tested": 0,
            "passed": 0,
        }
        url = f"https://api-inference.huggingface.co/models/{model_name}"
        headers = {"Authorization": f"Bearer {self.hf_token}"}
        total_weight = 0
        weighted_score = 0
        for test in BENCHMARK:
            try:
                score = self._run_test(url, headers, test)
                results["scores"][test["id"]] = score
                results["tested"] += 1
                if score > 0.5:
                    results["passed"] += 1
                w = test.get("weight", 1.0)
                weighted_score += score * w
                total_weight += w
                cat = test["category"]
                if cat not in results["category_scores"]:
                    results["category_scores"][cat] = []
                results["category_scores"][cat].append(score)
            except Exception as e:
                logger.warning(f"Test {test['id']} failed: {e}")
                results["scores"][test["id"]] = 0.0
        if total_weight > 0:
            results["overall"] = weighted_score / total_weight
        # Average category scores
        for cat, scores in results["category_scores"].items():
            results["category_scores"][cat] = sum(scores) / len(scores) if scores else 0
        # Save results
        eval_file = self.state_dir / f"eval_{model_name.replace('/', '_')}.json"
        eval_file.write_text(json.dumps(results, indent=2))
        logger.info(
            f"Evaluated {model_name}: overall={results['overall']:.3f}, "
            f"passed={results['passed']}/{results['tested']}"
        )
        return results

    def _run_test(self, url: str, headers: dict, test: dict) -> float:
        """Run a single benchmark test and return a score 0-1."""
        prompt = (
            f"### Instruction:\n{test['instruction']}\n\n"
            f"### Response:\n"
        )
        payload = json.dumps({
            "inputs": prompt,
            "parameters": {"max_new_tokens": 200, "temperature": 0.3}
        }).encode()
        req = urllib.request.Request(url, data=payload, headers={
            **headers, "Content-Type": "application/json"
        })
        with urllib.request.urlopen(req, timeout=60) as resp:
            data = json.loads(resp.read().decode())
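        # Text-generation responses from the Inference API are usually a list of
        # dicts ([{"generated_text": ...}]), but some pipelines return a single
        # dict, so both shapes are handled below.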
generated = ""
if isinstance(data, list) and data:
generated = data[0].get("generated_text", "")
elif isinstance(data, dict):
generated = data.get("generated_text", "")
# Remove prompt from response
if "### Response:" in generated:
generated = generated.split("### Response:")[-1].strip()
if not generated or len(generated) < 10:
return 0.0
# Score 1: Keyword match (relevant content)
gen_lower = generated.lower()
keywords = test.get("expected_keywords", [])
if keywords:
hits = sum(1 for k in keywords if k in gen_lower)
keyword_score = hits / len(keywords)
else:
keyword_score = 0.5
# Score 2: Coherence (not repetitive, proper length)
words = generated.split()
unique_ratio = len(set(words)) / max(len(words), 1)
length_score = min(1.0, len(words) / 30)
coherence_score = (unique_ratio + length_score) / 2
# Score 3: No hallucination signals
hallucination_markers = [
"i don't know", "i cannot", "as an ai", "i'm sorry",
"###", "instruction:", "input:", "output:"
]
hallucination_penalty = sum(
0.15 for m in hallucination_markers if m in gen_lower
)
final = (keyword_score * 0.5 + coherence_score * 0.5) - hallucination_penalty
return max(0.0, min(1.0, final))

    def compare_models(self, model_a: str, model_b: str) -> dict:
        """Compare two models head-to-head."""
        eval_a = self.evaluate_model(model_a)
        eval_b = self.evaluate_model(model_b)
        winner = model_a if eval_a["overall"] > eval_b["overall"] else model_b
        margin = abs(eval_a["overall"] - eval_b["overall"])
        return {
            "model_a": {"name": model_a, "score": eval_a["overall"]},
            "model_b": {"name": model_b, "score": eval_b["overall"]},
            "winner": winner,
            "margin": margin,
            "significant": margin > 0.05,
        }

    def generate_report(self) -> str:
        """Generate evaluation report from stored results."""
        reports = []
        for f in self.state_dir.glob("eval_*.json"):
            try:
                reports.append(json.loads(f.read_text()))
            except Exception:
                continue
        if not reports:
            return "No evaluations yet."
        reports.sort(key=lambda r: r.get("timestamp", ""), reverse=True)
        latest = reports[0]
        lines = [
            "# SEED Evaluation Report",
            f"Model: {latest['model']}",
            f"Overall: {latest['overall']:.3f}",
            f"Passed: {latest['passed']}/{latest['tested']}",
            "",
            "## Category Scores:",
        ]
        for cat, score in latest.get("category_scores", {}).items():
            lines.append(f" {cat}: {score:.3f}")
        return "\n".join(lines)