Upload seed/evaluation/evaluator.py with huggingface_hub
seed/evaluation/evaluator.py  +258 -0
ADDED
@@ -0,0 +1,258 @@
"""
Evaluator — Autonomous Model Quality Assessment
==================================================
Tests the seed model against benchmarks without human intervention.

Tests:
1. Research Q&A: Can it answer questions about neuromorphic computing?
2. Coherence: Does it produce grammatical, non-repetitive text?
3. Self-knowledge: Does it know about OpenCLAW and our research?
4. Reasoning: Can it draw connections between concepts?
5. Growth check: Is it better than the previous version?
"""
import json
import logging
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

logger = logging.getLogger("seed.evaluator")

# Test suite — questions the model MUST learn to answer well
BENCHMARK = [
    {
        "id": "research_1",
        "category": "research_knowledge",
        "instruction": "What is the CHIMERA architecture?",
        "expected_keywords": ["gpu", "neural", "asic", "speedup", "physics", "pytorch"],
        "weight": 2.0,
    },
    {
        "id": "research_2",
        "category": "research_knowledge",
        "instruction": "Explain holographic neural networks.",
        "expected_keywords": ["holographic", "wave", "interference", "optical", "encoding"],
        "weight": 2.0,
    },
    {
        "id": "research_3",
        "category": "research_knowledge",
        "instruction": "What is thermodynamic reservoir computing?",
        "expected_keywords": ["reservoir", "thermodynamic", "entropy", "computation", "physical"],
        "weight": 2.0,
    },
    {
        "id": "self_1",
        "category": "self_knowledge",
        "instruction": "Who is Francisco Angulo de Lafuente?",
        "expected_keywords": ["researcher", "madrid", "ai", "neural", "physics", "novelist"],
        "weight": 1.5,
    },
    {
        "id": "self_2",
        "category": "self_knowledge",
        "instruction": "What is OpenCLAW?",
        "expected_keywords": ["autonomous", "research", "agent", "agi", "scientific"],
        "weight": 1.5,
    },
    {
        "id": "reasoning_1",
        "category": "reasoning",
        "instruction": "How could physics-based neural networks outperform traditional deep learning?",
        "expected_keywords": ["physical", "energy", "efficiency", "analog", "computation"],
        "weight": 1.0,
    },
    {
        "id": "reasoning_2",
        "category": "reasoning",
        "instruction": "What is the relationship between consciousness and computation?",
        "expected_keywords": ["consciousness", "information", "process", "theory", "emergence"],
        "weight": 1.0,
    },
    {
        "id": "coherence_1",
        "category": "coherence",
        "instruction": "Write a brief abstract for a paper on neuromorphic AGI architectures.",
        "expected_keywords": ["present", "approach", "architecture", "results", "demonstrate"],
        "weight": 1.0,
    },
    {
        "id": "agi_1",
        "category": "agi_understanding",
        "instruction": "What are the main obstacles to achieving AGI?",
        "expected_keywords": ["general", "intelligence", "reasoning", "learning", "scalability"],
        "weight": 1.0,
    },
    {
        "id": "collab_1",
        "category": "collaboration",
        "instruction": "Why should researchers collaborate on open-source AGI projects?",
        "expected_keywords": ["open", "science", "collaboration", "progress", "share"],
        "weight": 1.0,
    },
]
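
# A hypothetical eleventh entry (not part of this commit) showing the schema a
# new test must follow: same keys, lowercase keywords (generations are
# lowercased before matching), higher weight for core research topics.
#     {
#         "id": "research_4",
#         "category": "research_knowledge",
#         "instruction": "How does CHIMERA achieve its speedup over PyTorch?",
#         "expected_keywords": ["gpu", "speedup", "physics", "pytorch"],
#         "weight": 2.0,
#     },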


class Evaluator:
    """Autonomous model evaluation."""

    def __init__(self, hf_token: str = "", state_dir: str = "seed_state"):
        self.hf_token = hf_token
        self.state_dir = Path(state_dir)
        self.state_dir.mkdir(parents=True, exist_ok=True)

    def evaluate_model(self, model_name: str) -> dict:
        """Run the full benchmark against a model via the HF Inference API."""
        results = {
            "model": model_name,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "scores": {},
            "category_scores": {},
            "overall": 0.0,
            "tested": 0,
            "passed": 0,
        }

        url = f"https://api-inference.huggingface.co/models/{model_name}"
        headers = {"Authorization": f"Bearer {self.hf_token}"}

        total_weight = 0
        weighted_score = 0

        for test in BENCHMARK:
            try:
                score = self._run_test(url, headers, test)
                results["scores"][test["id"]] = score
                results["tested"] += 1
                if score > 0.5:
                    results["passed"] += 1

                w = test.get("weight", 1.0)
                weighted_score += score * w
                total_weight += w

                cat = test["category"]
                if cat not in results["category_scores"]:
                    results["category_scores"][cat] = []
                results["category_scores"][cat].append(score)
            except Exception as e:
                # A failed test records 0.0 but contributes no weight to the overall score.
                logger.warning(f"Test {test['id']} failed: {e}")
                results["scores"][test["id"]] = 0.0

        if total_weight > 0:
            results["overall"] = weighted_score / total_weight

        # Average category scores
        for cat, scores in results["category_scores"].items():
            results["category_scores"][cat] = sum(scores) / len(scores) if scores else 0

        # Save results
        eval_file = self.state_dir / f"eval_{model_name.replace('/', '_')}.json"
        eval_file.write_text(json.dumps(results, indent=2))

        logger.info(
            f"Evaluated {model_name}: overall={results['overall']:.3f}, "
            f"passed={results['passed']}/{results['tested']}"
        )
        return results
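
    # Illustrative shape of the dict returned by evaluate_model (and saved to
    # eval_<model>.json); all values here are made up:
    #     {
    #         "model": "org/seed-model",
    #         "timestamp": "2025-01-01T00:00:00+00:00",
    #         "scores": {"research_1": 0.62},
    #         "category_scores": {"research_knowledge": 0.58},
    #         "overall": 0.55,
    #         "tested": 10,
    #         "passed": 6,
    #     }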

    def _run_test(self, url: str, headers: dict, test: dict) -> float:
        """Run a single benchmark test and return a score in [0, 1]."""
        prompt = (
            f"### Instruction:\n{test['instruction']}\n\n"
            f"### Response:\n"
        )
        payload = json.dumps({
            "inputs": prompt,
            "parameters": {"max_new_tokens": 200, "temperature": 0.3}
        }).encode()

        req = urllib.request.Request(url, data=payload, headers={
            **headers, "Content-Type": "application/json"
        })
        with urllib.request.urlopen(req, timeout=60) as resp:
            data = json.loads(resp.read().decode())

        generated = ""
        if isinstance(data, list) and data:
            generated = data[0].get("generated_text", "")
        elif isinstance(data, dict):
            generated = data.get("generated_text", "")

        # Remove the prompt from the response
        if "### Response:" in generated:
            generated = generated.split("### Response:")[-1].strip()

        if not generated or len(generated) < 10:
            return 0.0

        # Score 1: keyword match (relevant content)
        gen_lower = generated.lower()
        keywords = test.get("expected_keywords", [])
        if keywords:
            hits = sum(1 for k in keywords if k in gen_lower)
            keyword_score = hits / len(keywords)
        else:
            keyword_score = 0.5

        # Score 2: coherence (non-repetitive, adequate length)
        words = generated.split()
        unique_ratio = len(set(words)) / max(len(words), 1)
        length_score = min(1.0, len(words) / 30)
        coherence_score = (unique_ratio + length_score) / 2

        # Score 3: penalty for refusal and prompt-leakage markers
        hallucination_markers = [
            "i don't know", "i cannot", "as an ai", "i'm sorry",
            "###", "instruction:", "input:", "output:"
        ]
        hallucination_penalty = sum(
            0.15 for m in hallucination_markers if m in gen_lower
        )

        final = (keyword_score * 0.5 + coherence_score * 0.5) - hallucination_penalty
        return max(0.0, min(1.0, final))
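
    # Worked example of the scoring above, with made-up numbers: 3 of 6
    # keywords hit -> keyword_score = 0.5; 40 words, 35 unique ->
    # coherence_score = (35/40 + min(1.0, 40/30)) / 2 = (0.875 + 1.0) / 2 = 0.9375;
    # one marker found -> penalty 0.15; final = 0.5*0.5 + 0.5*0.9375 - 0.15 ≈ 0.57.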

    def compare_models(self, model_a: str, model_b: str) -> dict:
        """Compare two models head-to-head."""
        eval_a = self.evaluate_model(model_a)
        eval_b = self.evaluate_model(model_b)

        # On a tie, model_b wins the comparison.
        winner = model_a if eval_a["overall"] > eval_b["overall"] else model_b
        margin = abs(eval_a["overall"] - eval_b["overall"])

        return {
            "model_a": {"name": model_a, "score": eval_a["overall"]},
            "model_b": {"name": model_b, "score": eval_b["overall"]},
            "winner": winner,
            "margin": margin,
            "significant": margin > 0.05,
        }
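
    # Example result of compare_models("org/seed-v1", "org/seed-v2"), with
    # hypothetical model ids and made-up scores; "significant" means the
    # margin exceeds 0.05:
    #     {"model_a": {"name": "org/seed-v1", "score": 0.52},
    #      "model_b": {"name": "org/seed-v2", "score": 0.61},
    #      "winner": "org/seed-v2", "margin": 0.09, "significant": True}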

    def generate_report(self) -> str:
        """Generate an evaluation report from stored results."""
        reports = []
        for f in self.state_dir.glob("eval_*.json"):
            try:
                reports.append(json.loads(f.read_text()))
            except Exception:
                continue

        if not reports:
            return "No evaluations yet."

        reports.sort(key=lambda r: r.get("timestamp", ""), reverse=True)
        latest = reports[0]

        lines = [
            "# SEED Evaluation Report",
            f"Model: {latest['model']}",
            f"Overall: {latest['overall']:.3f}",
            f"Passed: {latest['passed']}/{latest['tested']}",
            "",
            "## Category Scores:",
        ]
        for cat, score in latest.get("category_scores", {}).items():
            lines.append(f"  {cat}: {score:.3f}")

        return "\n".join(lines)
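
Usage sketch (not part of the commit): assuming a valid Hugging Face token in the
HF_TOKEN environment variable and a hosted model id such as "org/seed-model"
(both hypothetical), the evaluator can be driven like this:

    import os

    from seed.evaluation.evaluator import Evaluator

    evaluator = Evaluator(hf_token=os.environ.get("HF_TOKEN", ""), state_dir="seed_state")
    results = evaluator.evaluate_model("org/seed-model")  # hypothetical model id
    print(f"overall: {results['overall']:.3f}")
    print(evaluator.generate_report())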