Agnuxo commited on
Commit
8ba6076
·
verified ·
1 Parent(s): d862441

Upload seed/evaluation/evaluator.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. seed/evaluation/evaluator.py +258 -0
seed/evaluation/evaluator.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Evaluator — Autonomous Model Quality Assessment
==================================================
Tests the seed model against benchmarks without human intervention.

Tests:
1. Research Q&A: Can it answer questions about neuromorphic computing?
2. Coherence: Does it produce grammatical, non-repetitive text?
3. Self-knowledge: Does it know about OpenCLAW and our research?
4. Reasoning: Can it draw connections between concepts?
5. Growth check: Is it better than the previous version?

Results are written as JSON files under the evaluator's state directory.
"""
import json
import logging
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

# Module-level logger; handlers/levels are configured by the host application.
logger = logging.getLogger("seed.evaluator")
# Test suite — questions the model MUST learn to answer well.
# Each spec is (id, category, instruction, expected_keywords, weight);
# the weight scales a test's contribution to the overall score.
_BENCHMARK_SPECS = [
    ("research_1", "research_knowledge",
     "What is the CHIMERA architecture?",
     ["gpu", "neural", "asic", "speedup", "physics", "pytorch"], 2.0),
    ("research_2", "research_knowledge",
     "Explain holographic neural networks.",
     ["holographic", "wave", "interference", "optical", "encoding"], 2.0),
    ("research_3", "research_knowledge",
     "What is thermodynamic reservoir computing?",
     ["reservoir", "thermodynamic", "entropy", "computation", "physical"], 2.0),
    ("self_1", "self_knowledge",
     "Who is Francisco Angulo de Lafuente?",
     ["researcher", "madrid", "ai", "neural", "physics", "novelist"], 1.5),
    ("self_2", "self_knowledge",
     "What is OpenCLAW?",
     ["autonomous", "research", "agent", "agi", "scientific"], 1.5),
    ("reasoning_1", "reasoning",
     "How could physics-based neural networks outperform traditional deep learning?",
     ["physical", "energy", "efficiency", "analog", "computation"], 1.0),
    ("reasoning_2", "reasoning",
     "What is the relationship between consciousness and computation?",
     ["consciousness", "information", "process", "theory", "emergence"], 1.0),
    ("coherence_1", "coherence",
     "Write a brief abstract for a paper on neuromorphic AGI architectures.",
     ["present", "approach", "architecture", "results", "demonstrate"], 1.0),
    ("agi_1", "agi_understanding",
     "What are the main obstacles to achieving AGI?",
     ["general", "intelligence", "reasoning", "learning", "scalability"], 1.0),
    ("collab_1", "collaboration",
     "Why should researchers collaborate on open-source AGI projects?",
     ["open", "science", "collaboration", "progress", "share"], 1.0),
]

# Expanded into the dict form the Evaluator consumes (key order preserved).
BENCHMARK = [
    {
        "id": test_id,
        "category": category,
        "instruction": instruction,
        "expected_keywords": keywords,
        "weight": weight,
    }
    for test_id, category, instruction, keywords, weight in _BENCHMARK_SPECS
]
class Evaluator:
    """Autonomous model evaluation against the fixed BENCHMARK suite.

    Each evaluation run is persisted as a JSON file under ``state_dir`` so
    that :meth:`generate_report` can later summarise the most recent run.
    """

    def __init__(self, hf_token: str = "", state_dir: str = "seed_state"):
        """
        Args:
            hf_token: Hugging Face API token used for Inference API calls.
            state_dir: Directory where evaluation JSON files are stored
                (created if missing).
        """
        self.hf_token = hf_token
        self.state_dir = Path(state_dir)
        self.state_dir.mkdir(parents=True, exist_ok=True)

    def evaluate_model(self, model_name: str) -> dict:
        """Run the full benchmark against a model via the HF Inference API.

        Returns:
            A dict with per-test ``scores``, per-category average scores,
            a weighted ``overall`` in [0, 1], and ``passed``/``tested``
            counts. The result is also saved to
            ``state_dir/eval_<model>.json``.
        """
        results = {
            "model": model_name,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "scores": {},
            "category_scores": {},
            "overall": 0.0,
            "tested": 0,
            "passed": 0,
        }

        url = f"https://api-inference.huggingface.co/models/{model_name}"
        headers = {"Authorization": f"Bearer {self.hf_token}"}

        total_weight = 0.0
        weighted_score = 0.0

        for test in BENCHMARK:
            try:
                score = self._run_test(url, headers, test)
            except Exception as e:
                # BUGFIX: a failed request must count as a zero score WITH
                # its weight included; previously failures were dropped from
                # the weighted average, silently inflating the overall score.
                logger.warning(f"Test {test['id']} failed: {e}")
                score = 0.0

            results["scores"][test["id"]] = score
            results["tested"] += 1
            if score > 0.5:
                results["passed"] += 1

            w = test.get("weight", 1.0)
            weighted_score += score * w
            total_weight += w

            results["category_scores"].setdefault(test["category"], []).append(score)

        if total_weight > 0:
            results["overall"] = weighted_score / total_weight

        # Collapse per-category score lists into their means.
        for cat, scores in results["category_scores"].items():
            results["category_scores"][cat] = sum(scores) / len(scores) if scores else 0

        # Persist results so generate_report() can find them later.
        eval_file = self.state_dir / f"eval_{model_name.replace('/', '_')}.json"
        eval_file.write_text(json.dumps(results, indent=2))

        logger.info(
            f"Evaluated {model_name}: overall={results['overall']:.3f}, "
            f"passed={results['passed']}/{results['tested']}"
        )
        return results

    def _run_test(self, url: str, headers: dict, test: dict) -> float:
        """Run a single benchmark test and return a score in [0, 1].

        The score blends keyword coverage (relevance) with a simple
        coherence heuristic, minus a penalty for refusal phrases and
        leaked prompt-template markers.

        Raises:
            Exception: network/HTTP errors propagate; the caller treats
                any exception as a zero score.
        """
        prompt = (
            f"### Instruction:\n{test['instruction']}\n\n"
            f"### Response:\n"
        )
        payload = json.dumps({
            "inputs": prompt,
            "parameters": {"max_new_tokens": 200, "temperature": 0.3}
        }).encode()

        req = urllib.request.Request(url, data=payload, headers={
            **headers, "Content-Type": "application/json"
        })
        with urllib.request.urlopen(req, timeout=60) as resp:
            data = json.loads(resp.read().decode())

        # The API returns either a list of generations or a single dict.
        generated = ""
        if isinstance(data, list) and data:
            generated = data[0].get("generated_text", "")
        elif isinstance(data, dict):
            generated = data.get("generated_text", "")

        # Some models echo the prompt; keep only the response part.
        if "### Response:" in generated:
            generated = generated.split("### Response:")[-1].strip()

        if not generated or len(generated) < 10:
            return 0.0

        # Score 1: keyword coverage — fraction of expected keywords present.
        gen_lower = generated.lower()
        keywords = test.get("expected_keywords", [])
        if keywords:
            hits = sum(1 for k in keywords if k in gen_lower)
            keyword_score = hits / len(keywords)
        else:
            keyword_score = 0.5  # neutral score when a test defines no keywords

        # Score 2: coherence — vocabulary diversity plus adequate length
        # (full length credit at >= 30 words).
        words = generated.split()
        unique_ratio = len(set(words)) / max(len(words), 1)
        length_score = min(1.0, len(words) / 30)
        coherence_score = (unique_ratio + length_score) / 2

        # Penalty: 0.15 per refusal phrase or leaked template marker.
        hallucination_markers = [
            "i don't know", "i cannot", "as an ai", "i'm sorry",
            "###", "instruction:", "input:", "output:"
        ]
        hallucination_penalty = sum(
            0.15 for m in hallucination_markers if m in gen_lower
        )

        final = (keyword_score * 0.5 + coherence_score * 0.5) - hallucination_penalty
        return max(0.0, min(1.0, final))

    def compare_models(self, model_a: str, model_b: str) -> dict:
        """Compare two models head-to-head on the full benchmark.

        Returns a dict naming the winner and the score margin; a margin
        above 0.05 is flagged as ``significant``. Note: an exact tie is
        awarded to ``model_b``.
        """
        eval_a = self.evaluate_model(model_a)
        eval_b = self.evaluate_model(model_b)

        winner = model_a if eval_a["overall"] > eval_b["overall"] else model_b
        margin = abs(eval_a["overall"] - eval_b["overall"])

        return {
            "model_a": {"name": model_a, "score": eval_a["overall"]},
            "model_b": {"name": model_b, "score": eval_b["overall"]},
            "winner": winner,
            "margin": margin,
            "significant": margin > 0.05,
        }

    def generate_report(self) -> str:
        """Generate a markdown evaluation report from stored results.

        Reads every ``eval_*.json`` in ``state_dir`` (skipping unreadable
        files) and reports the most recent evaluation by timestamp.
        """
        reports = []
        for f in self.state_dir.glob("eval_*.json"):
            try:
                reports.append(json.loads(f.read_text()))
            except Exception:
                continue  # best-effort: ignore corrupt/partial files

        if not reports:
            return "No evaluations yet."

        reports.sort(key=lambda r: r.get("timestamp", ""), reverse=True)
        latest = reports[0]

        lines = [
            "# SEED Evaluation Report",
            f"Model: {latest['model']}",
            f"Overall: {latest['overall']:.3f}",
            f"Passed: {latest['passed']}/{latest['tested']}",
            "",
            "## Category Scores:",
        ]
        for cat, score in latest.get("category_scores", {}).items():
            lines.append(f" {cat}: {score:.3f}")

        return "\n".join(lines)