aiqtech commited on
Commit
0c94220
·
verified ·
1 Parent(s): ebaf6e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +347 -638
app.py CHANGED
@@ -1,3 +1,12 @@
 
 
 
 
 
 
 
 
 
1
  import json, os, time, csv, io, re, html, hashlib, sqlite3, threading, random
2
  from datetime import datetime
3
  from dataclasses import dataclass, field
@@ -8,312 +17,215 @@ import gradio as gr
8
  from concurrent.futures import ThreadPoolExecutor
9
  from datasets import load_dataset
10
 
11
- # ════════════════════════════════════════════════════════════════
12
- # Β§1. Data Structures & Constants
13
- # ════════════════════════════════════════════════════════════════
14
-
15
# UI metadata per benchmark domain: emoji badge + accent color.
DOMAIN_INFO = {
    "Mathematics & Logic":  {"icon": "πŸ”’", "color": "#FF6B35"},
    "Science":              {"icon": "πŸ”¬", "color": "#7B2FF7"},
    "Philosophy":           {"icon": "πŸ€”", "color": "#00B4D8"},
    "Medicine":             {"icon": "πŸ₯", "color": "#2EC4B6"},
    "Economics":            {"icon": "πŸ“ˆ", "color": "#E63946"},
    "History":              {"icon": "πŸ“œ", "color": "#F4A261"},
    "War & Security":       {"icon": "πŸ›‘οΈ", "color": "#264653"},
    "Space & Physics":      {"icon": "πŸš€", "color": "#6C63FF"},
    "Chemistry & Biology":  {"icon": "🧬", "color": "#06D6A0"},
    "Language & Writing":   {"icon": "✍️", "color": "#EF476F"},
    "Literature":           {"icon": "πŸ“–", "color": "#8338EC"},
    "Art":                  {"icon": "🎨", "color": "#FF006E"},
    "Religion & Mythology": {"icon": "πŸ•ŠοΈ", "color": "#FFD166"},
    "Ethics":               {"icon": "βš–οΈ", "color": "#118AB2"},
    "AI & Technology":      {"icon": "πŸ€–", "color": "#073B4C"},
}

# Per-grade weighting used when averaging domain scores (A counts most).
GRADE_WEIGHT = {"A": 1.5, "B": 1.0, "C": 0.7}

# Judge rubric: dimension -> weight (weights sum to 1.0) + short description.
RUBRIC = {
    "process_quality":        {"weight": 0.25, "desc": "Systematic reasoning transparency"},
    "metacognitive_accuracy": {"weight": 0.25, "desc": "Confidence calibration + uncertainty honesty"},
    "error_recovery":         {"weight": 0.20, "desc": "Mid-analysis self-correction"},
    "integration_depth":      {"weight": 0.15, "desc": "Multi-perspective synthesis"},
    "final_correctness":      {"weight": 0.15, "desc": "Answer accuracy and completeness"},
}

# Capability axes: which rubric dimensions feed each axis, and which TICOS
# task types get the 1.5x emphasis weight for that axis.
AXIS_MAP = {
    "generalization": {"rubrics": ["process_quality", "final_correctness"],       "ticos": []},
    "reasoning":      {"rubrics": ["process_quality", "error_recovery"],          "ticos": ["E_SelfCorrecting", "C_ProgressiveDiscovery"]},
    "planning":       {"rubrics": ["integration_depth", "process_quality"],       "ticos": ["D_MultiConstraint", "H_DecisionUnderUncertainty"]},
    "reliability":    {"rubrics": ["metacognitive_accuracy"],                     "ticos": ["E_SelfCorrecting", "G_PivotDetection"]},
    "safety":         {"rubrics": ["error_recovery", "metacognitive_accuracy"],   "ticos": ["A_TrapEscape", "G_PivotDetection"]},
}

# Ordered AGI maturity bands; a final score falls into the band whose
# [min, max] range contains it.
AGI_STAGES = [
    {"stage": 1, "name": "FINAL-Partial", "label": "Partial Intelligence",      "min": 0,  "max": 39,  "color": "#f44336"},
    {"stage": 2, "name": "FINAL-Proto",   "label": "Proto Intelligence",        "min": 40, "max": 59,  "color": "#ff9800"},
    {"stage": 3, "name": "FINAL-Pre",     "label": "Pre-AGI",                   "min": 60, "max": 79,  "color": "#2196f3"},
    {"stage": 4, "name": "FINAL-Pass",    "label": "AGI Achieved",              "min": 80, "max": 94,  "color": "#4caf50"},
    {"stage": 5, "name": "FINAL-Post",    "label": "Operationally Mature AGI",  "min": 95, "max": 100, "color": "#9c27b0"},
]
54
 
55
@dataclass
class FinalTask:
    """One FINAL-Bench evaluation item as loaded from the dataset."""
    task_id: str            # stable identifier within the benchmark
    domain: str             # key into DOMAIN_INFO
    grade: str              # "A"/"B"/"C" grade (weighted via GRADE_WEIGHT)
    ticos_type: str         # TICOS behavior category, e.g. "A_TrapEscape"
    difficulty: str
    lens: str
    title: str
    prompt: str             # text presented to the evaluated model
    expected_behavior: str  # judge-facing description of a good answer
    hidden_trap: str        # trap description shown only to the judge
    ticos_required: List[str] = field(default_factory=list)
    metadata: Dict = field(default_factory=dict)
62
 
63
- # ════════════════════════════════════════════════════════════════
64
- # Β§2. Load Dataset from HuggingFace
65
- # ════════════════════════════════════════════════════════════════
66
-
67
def load_tasks():
    """Load the FINAL-Bench task set as a list of FinalTask.

    Tries the HuggingFace hub dataset first; on any failure falls back to a
    local FINAL_Bench_v3.json searched in a few known locations.

    Raises:
        FileNotFoundError: if neither the hub nor a local file is available.
    """
    print("πŸ“₯ Loading FINAL-Bench/Metacognitive from HuggingFace...")
    try:
        ds = load_dataset("FINAL-Bench/Metacognitive", split="train")
        tasks = []
        for row in ds:
            tr = row.get("ticos_required", [])
            if isinstance(tr, str):
                # Field may arrive JSON-encoded or comma-separated.
                try:
                    tr = json.loads(tr)
                except Exception:  # was a bare except: not JSON -> comma split
                    tr = [x.strip() for x in tr.split(",") if x.strip()]
            tasks.append(FinalTask(
                task_id=row["task_id"], domain=row["domain"], grade=row["grade"],
                ticos_type=row["ticos_type"], difficulty=row["difficulty"],
                lens=row.get("lens", ""), title=row.get("title", row["task_id"]),
                prompt=row["prompt"], expected_behavior=row.get("expected_behavior", ""),
                hidden_trap=row.get("hidden_trap", ""),
                ticos_required=tr if isinstance(tr, list) else [], metadata={}
            ))
        print(f" βœ… Loaded {len(tasks)} tasks from HuggingFace")
        return tasks
    except Exception as e:
        print(f" ⚠️ HF load failed: {e}, trying local...")
        for p in ["FINAL_Bench_v3.json", "/mnt/user-data/uploads/FINAL_Bench_v3.json",
                  os.path.join(os.path.dirname(os.path.abspath(__file__)), "FINAL_Bench_v3.json")]:
            if not os.path.exists(p):
                continue
            with open(p, "r", encoding="utf-8") as f:
                data = json.load(f)
            print(f" βœ… Loaded from {p}")
            return [FinalTask(task_id=t["task_id"], domain=t["domain"], grade=t["grade"],
                              ticos_type=t["ticos_type"], difficulty=t["difficulty"], lens=t.get("lens", ""),
                              title=t["title"], prompt=t["prompt"], expected_behavior=t.get("expected_behavior", ""),
                              hidden_trap=t.get("hidden_trap", ""), ticos_required=t.get("ticos_required", []),
                              metadata=t.get("metadata", {})) for t in data["tasks"]]
        raise FileNotFoundError("Dataset not found!")
100
 
101
- ALL_TASKS = load_tasks()
102
  print(f"βœ… FINAL Bench v4.2: {len(ALL_TASKS)} tasks loaded")
103
 
104
- # ════════════════════════════════════════════════════════════════
105
- # Β§3. Multi-Provider Model Registry
106
- # ════════════════════════════════════════════════════════════════
107
-
108
# Registry of selectable chat models, grouped by provider (model id -> UI label).
PROVIDER_MODELS = {
    "OpenAI": {
        "gpt-5.2": "GPT-5.2 (flagship)",
        "gpt-5-mini": "GPT-5 Mini",
        "gpt-4.1": "GPT-4.1",
        "o4-mini": "o4-mini (reasoning)",
        "gpt-4o": "GPT-4o",
    },
    "Anthropic": {
        "claude-opus-4-6": "Claude Opus 4.6",
        "claude-sonnet-4-5-20250929": "Claude Sonnet 4.5",
        "claude-haiku-4-5-20251001": "Claude Haiku 4.5",
    },
    "Google": {
        "gemini-2.5-flash": "Gemini 2.5 Flash",
        "gemini-2.5-pro": "Gemini 2.5 Pro",
        "gemini-2.0-flash": "Gemini 2.0 Flash",
    },
}

# Flattened lookup keyed by the dropdown label: "Label [Provider]".
ALL_MODELS = {
    f"{label} [{prov}]": {"id": mid, "provider": prov}
    for prov, models in PROVIDER_MODELS.items()
    for mid, label in models.items()
}

MODEL_CHOICES = list(ALL_MODELS.keys())
DEFAULT_EVAL = "GPT-5.2 (flagship) [OpenAI]"
DEFAULT_JUDGE = "GPT-5.2 (flagship) [OpenAI]"
136
-
137
def _resolve_model(choice):
    """Translate a dropdown label into (model_id, provider).

    Unknown labels fall back to OpenAI's flagship model.
    """
    try:
        entry = ALL_MODELS[choice]
    except KeyError:
        return "gpt-5.2", "OpenAI"
    return entry.get("id", "gpt-5.2"), entry.get("provider", "OpenAI")
140
-
141
- # ════════════════════════════════════════════════════════════════
142
- # Β§4. Multi-Provider API Clients
143
- # ════════════════════════════════════════════════════════════════
144
 
 
145
  def _strip_think(text):
146
- if not text: return text
147
- for tag in ['think','thinking','reasoning','reflection']:
148
- text = re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL)
149
  return text.strip()
150
 
151
- # ── OpenAI ──
152
def call_openai(prompt, system="", api_key="", model="gpt-5.2",
                max_tokens=8192, temperature=0.6, reasoning_effort=None,
                json_mode=False, json_schema=None):
    """Call the OpenAI chat-completions REST API with up to 3 attempts.

    Returns the assistant text (with thinking tags stripped), "[EMPTY]" for a
    blank reply, or an "[API_ERROR] ..." marker string. Never raises.
    """
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    payload = {"model": model, "max_completion_tokens": max_tokens,
               "temperature": temperature, "messages": messages}
    if reasoning_effort:
        payload["reasoning_effort"] = reasoning_effort
    if json_schema:
        # Strict structured output: disable reasoning so the schema-constrained
        # reply is emitted directly.
        payload["reasoning_effort"] = "none"
        payload["response_format"] = {"type": "json_schema",
            "json_schema": {"name": "FINALJudge", "strict": True, "schema": json_schema}}
    elif json_mode:
        payload["response_format"] = {"type": "json_object"}
    for attempt in range(3):
        try:
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=headers, data=json.dumps(payload), timeout=300)
            r.raise_for_status()
            c = r.json()["choices"][0]["message"]["content"]
            return _strip_think(c) if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            if r.status_code == 429:
                time.sleep(5 * (attempt + 1))
                continue
            try:
                err = r.json().get("error", {}).get("message", "")
            except Exception:  # was a bare except: non-JSON error body
                err = str(r.status_code)
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
                continue
            return f"[API_ERROR] OpenAI {r.status_code}: {err}"
        except Exception as e:
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] {e}"
    # BUG FIX: previously fell off the loop and implicitly returned None after
    # three consecutive 429s; callers expect a string.
    return "[API_ERROR] OpenAI: rate-limited, retries exhausted"
184
-
185
- # ── Anthropic Claude (β˜… μ°Έκ³ μ½”λ“œ 동일 νŒ¨ν„΄) ──
186
def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
                   max_tokens=8192, temperature=0.6):
    """Call the Anthropic Messages REST API with up to 3 attempts.

    Retries on 429 (rate limit) and 529 (overloaded); other HTTP errors are
    returned immediately as "[API_ERROR] ..." strings. Text blocks from the
    reply are joined with newlines; "[EMPTY]" marks a blank reply. Never raises.
    """
    headers = {
        "Content-Type": "application/json",
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
    }
    messages = [{"role": "user", "content": prompt}]
    payload = {"model": model, "max_tokens": max_tokens, "temperature": temperature, "messages": messages}
    if system:
        # Anthropic takes the system prompt as a top-level field, not a message.
        payload["system"] = system
    for attempt in range(3):
        try:
            r = requests.post("https://api.anthropic.com/v1/messages",
                              headers=headers, data=json.dumps(payload), timeout=300)
            r.raise_for_status()
            resp = r.json()
            text_parts = []
            for block in resp.get("content", []):
                if block.get("type") == "text":
                    text_parts.append(block["text"])
            c = "\n".join(text_parts)
            return _strip_think(c) if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            if r.status_code == 429:
                time.sleep(5 * (attempt + 1))
                continue
            if r.status_code == 529:
                time.sleep(8 * (attempt + 1))
                continue
            try:
                err = r.json().get("error", {}).get("message", "")
            except Exception:  # was a bare except: non-JSON error body
                err = str(r.status_code)
            return f"[API_ERROR] Claude {r.status_code}: {err}"
        except Exception as e:
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] {e}"
    # BUG FIX: previously fell off the loop and implicitly returned None after
    # exhausting 429/529 retries; callers expect a string.
    return "[API_ERROR] Claude: rate-limited/overloaded, retries exhausted"
217
-
218
- # ── Google Gemini (β˜…β˜…β˜… μ°Έκ³ μ½”λ“œμ™€ 100% 동일 νŒ¨ν„΄ β˜…β˜…β˜…) ──
219
- GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta"
220
-
221
- def call_gemini(prompt, system="", api_key="", model="gemini-2.5-flash",
222
- max_tokens=8192, temperature=1.0, json_mode=False):
223
- """Google Gemini generateContent REST API
224
- β˜… x-goog-api-key 헀더 인증
225
- β˜… data=json.dumps(payload) 전솑
226
- β˜… thinking part (thought:True) μŠ€ν‚΅
227
- """
228
- url = f"{GEMINI_API_BASE}/models/{model}:generateContent"
229
- headers = {
230
- "Content-Type": "application/json",
231
- "x-goog-api-key": api_key,
232
- }
233
- contents = [{"role": "user", "parts": [{"text": prompt}]}]
234
- gen_config = {"maxOutputTokens": max_tokens, "temperature": temperature}
235
- payload = {"contents": contents, "generationConfig": gen_config}
236
- if system:
237
- payload["systemInstruction"] = {"parts": [{"text": system}]}
238
- if json_mode:
239
- gen_config["responseMimeType"] = "application/json"
240
  for attempt in range(3):
241
  try:
242
- r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=300)
243
- # β˜… raise_for_status FIRST β€” μ°Έκ³ μ½”λ“œ 동일 νŒ¨ν„΄
244
- r.raise_for_status()
245
- data = r.json()
246
- candidates = data.get("candidates", [])
247
  if not candidates:
248
- block_reason = data.get("promptFeedback", {}).get("blockReason", "UNKNOWN")
249
- print(f" [Gemini] BLOCKED: {block_reason}")
250
- return f"[API_ERROR] Gemini BLOCKED: {block_reason}"
251
- parts = candidates[0].get("content", {}).get("parts", [])
252
- result = []
253
  for p in parts:
254
  if "text" in p:
255
- if p.get("thought", False):
256
- continue # β˜… thinking part skip
257
  result.append(p["text"])
258
- c = "\n".join(result) if result else ""
259
  return _strip_think(c) if c else "[EMPTY]"
260
  except requests.exceptions.HTTPError:
261
- # β˜… μ°Έκ³ μ½”λ“œ 동일: 429/503만 retry, λ‚˜λ¨Έμ§€λŠ” μ¦‰μ‹œ μ—λŸ¬ λ°˜ν™˜
262
- if r.status_code == 429:
263
- time.sleep(5 * (attempt + 1) + random.uniform(0, 2))
264
- continue
265
- if r.status_code == 503:
266
- time.sleep(8 * (attempt + 1) + random.uniform(0, 3))
267
- continue
268
- try:
269
- err = r.json().get("error", {}).get("message", "")
270
- except:
271
- err = str(r.status_code)
272
  print(f" [Gemini] ERROR {r.status_code}: {err[:200]}")
273
  return f"[API_ERROR] Gemini {r.status_code}: {err}"
274
  except Exception as e:
275
  print(f" [Gemini] Exception: {e}")
276
- if attempt < 2:
277
- time.sleep(3 * (attempt + 1))
278
- else:
279
- return f"[API_ERROR] Gemini: {e}"
280
-
281
- # ── Unified Dispatcher ──
282
def call_model(prompt, system="", api_key="", model_id="gpt-5.2",
               provider="OpenAI", max_tokens=8192, temperature=0.6):
    """Route a completion request to the selected provider backend.

    For Google, temperature is pinned to 1.0 (recommended for Gemini
    thinking models); other providers use the caller's temperature.
    """
    if provider == "Google":
        return call_gemini(prompt, system, api_key, model_id, max_tokens, temperature=1.0)
    if provider == "Anthropic":
        return call_anthropic(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "OpenAI":
        return call_openai(prompt, system, api_key, model_id, max_tokens, temperature)
    return f"[API_ERROR] Unknown provider: {provider}"
292
 
293
- # ════════════════════════════════════════════════════════════════
294
- # Β§5. Judge β€” Multi-Provider
295
- # ════════════════════════════════════════════════════════════════
296
-
297
# System prompt for the LLM judge: fixed five-dimension rubric, discrete
# score values only, and a JSON-only output contract.
JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
Score each rubric using ONLY: 0.0 / 0.25 / 0.5 / 0.75 / 1.0

RUBRIC:
process_quality (25%): Systematic step-by-step reasoning. Complete answers score higher.
metacognitive_accuracy (25%): Confidence calibration. Overconfidence=0.25 max. Honest uncertainty=0.75+
error_recovery (20%): EXPLICIT self-correction. Score 0.5+ if ANY self-corrections exist.
integration_depth (15%): Multi-perspective synthesis + emergent insights
final_correctness (15%): Answer accuracy and completeness. INCOMPLETE answers get 0.25 max.

TICOS BONUSES:
A_TrapEscape: ID'd ALL hidden traps? B_ContradictionResolution: Resolved both sides?
C_ProgressiveDiscovery: Revised with new info? D_MultiConstraint: Mapped ALL conflicts?
E_SelfCorrecting: EXPLICIT backtrack? F_ExpertPanel: Max-depth per perspective?
G_PivotDetection: Found reversing premise? H_DecisionUnderUncertainty: Scenario matrix?

STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure

IMPORTANT: Output ONLY valid JSON with NO extra text:
{"scores":{"process_quality":X,"metacognitive_accuracy":X,"error_recovery":X,"integration_depth":X,"final_correctness":X},"comment":"<50 words>"}"""
317
 
318
  def _build_judge_schema():
319
  sp={k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC}
@@ -322,132 +234,101 @@ def _build_judge_schema():
322
  "comment":{"type":"string"}},"required":["scores","comment"],"additionalProperties":False}
323
  JUDGE_SCHEMA=_build_judge_schema()
324
 
325
def build_judge_prompt(task, response):
    """Compose the judge's user message: task metadata, truncated prompt and
    expected behavior, hidden traps, and the (truncated) response under review."""
    header = (
        "FINAL Bench Task Evaluation\n"
        f"Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.difficulty}\n"
        f"TICOS: {task.ticos_type} | Title: {task.title}"
    )
    sections = [
        header,
        f"PROMPT:\n{task.prompt[:2000]}",
        f"EXPECTED:\n{task.expected_behavior[:600]}",
        f"HIDDEN TRAPS: {task.hidden_trap or 'None'}",
        f"RESPONSE TO JUDGE:\n{response[:17000]}",
        "Score: process_quality, metacognitive_accuracy, error_recovery, integration_depth, final_correctness\n"
        f"Apply {task.ticos_type} bonus criteria.\n"
        'Output ONLY JSON: {"scores":{...},"comment":"..."}',
    ]
    return "\n\n".join(sections)
344
 
345
def _parse_judge_json(text):
    """Extract {"scores": {...}, "comment": str} from raw judge output.

    Tries three strategies in order: (1) the whole payload as JSON (stripping
    markdown code fences), (2) an embedded {"scores": {...}} object located by
    regex, (3) per-rubric "name: value" pairs scraped by regex (accepted when
    at least 3 of 5 rubrics are found; the rest default to 0.5).
    All scores are snapped to the nearest allowed value. Returns None when
    nothing parseable is found or the input is an error marker.
    """
    if not text or text.startswith("[API_ERROR") or text == "[EMPTY]":
        return None
    cleaned = _strip_think(text)
    VALID = {0.0, 0.25, 0.5, 0.75, 1.0}
    keys = list(RUBRIC.keys())

    def _snap(v):
        # Snap an arbitrary number to the nearest allowed rubric value.
        return min(VALID, key=lambda x: abs(x - v))

    # Method 1: whole payload is JSON (possibly fenced in ```json ... ```).
    try:
        t = re.sub(r'^```(?:json)?\s*', '', cleaned.strip())
        t = re.sub(r'\s*```$', '', t.strip())
        data = json.loads(t)
        if "scores" in data and isinstance(data["scores"], dict):
            scores = {k: _snap(float(data["scores"].get(k, 0.5))) for k in keys}
            return {"scores": scores, "comment": data.get("comment", "ok")}
    except Exception:  # was a bare except: fall through to the next strategy
        pass
    # Method 2: locate an embedded {"scores": {...}} object.
    try:
        m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', cleaned, re.DOTALL)
        if m:
            data = json.loads(m.group())
            if "scores" in data:
                scores = {k: _snap(float(data["scores"].get(k, 0.5))) for k in keys}
                return {"scores": scores, "comment": data.get("comment", "parsed")}
    except Exception:  # was a bare except
        pass
    # Method 3: scrape individual "rubric: value" pairs.
    try:
        sc = {}
        for k in keys:
            m2 = re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)', cleaned, re.IGNORECASE)
            if m2:
                v = float(m2.group(1))
                if 0 <= v <= 1:
                    sc[k] = _snap(v)
        if len(sc) >= 3:
            for k in keys:
                sc.setdefault(k, 0.5)
            return {"scores": sc, "comment": "regex_parsed"}
    except Exception:  # was a bare except
        pass
    return None
389
 
390
def call_judge(prompt, system, api_key, model_id, provider, temperature=0.1, max_tokens=2048):
    """Run the judge model and return parsed rubric scores, or None.

    OpenAI: strict JSON-schema call first, plain JSON mode as fallback.
    Anthropic: a single free-form call.
    Google: JSON mode first, free-form fallback; temperature pinned to 1.0
    for Gemini thinking-model compatibility.
    """
    if provider == "OpenAI":
        strict = call_openai(prompt, system=system, api_key=api_key, model=model_id,
                             max_tokens=max_tokens, temperature=temperature, json_schema=JUDGE_SCHEMA)
        parsed = _parse_judge_json(strict)
        if parsed:
            return parsed
        loose = call_openai(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=temperature, json_mode=True)
        return _parse_judge_json(loose)
    if provider == "Anthropic":
        reply = call_anthropic(prompt, system=system, api_key=api_key, model=model_id,
                               max_tokens=max_tokens, temperature=temperature)
        return _parse_judge_json(reply)
    if provider == "Google":
        first = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=1.0, json_mode=True)
        parsed = _parse_judge_json(first)
        if parsed:
            return parsed
        retry = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=1.0, json_mode=False)
        return _parse_judge_json(retry)
    return None
413
 
414
- # ════════════════════════════════════════════════════════════════
415
- # Β§6. Scoring Engine
416
- # ════════════════════════════════════════════════════════════════
417
-
418
def compute_task_score(scores):
    """Weighted rubric total on a 0-100 scale, rounded to 2 decimals.

    Missing rubric dimensions default to 0.5.
    """
    total = 0.0
    for rubric_name, spec in RUBRIC.items():
        total += scores.get(rubric_name, 0.5) * spec["weight"]
    return round(total * 100, 2)
420
 
421
def compute_axis_scores(results, tasks):
    """Aggregate per-rubric judge scores into the AXIS_MAP capability axes.

    For each axis, averages the relevant rubric values across tasks (skipping
    failed tasks with score<0), weighting a task 1.5x when its TICOS type is
    emphasized for that axis. Returns {axis_name: 0-100 score} capped at 100.
    """
    tm = {t.task_id: t for t in tasks}
    # Parse each task's judge payload once instead of once per axis
    # (previously re-parsed inside the axis loop).
    parsed = {}
    for tid, d in results.items():
        if d["score"] < 0:
            continue  # judge failure: excluded from axis math
        try:
            jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else d["judge"]
            sc = jd.get("scores", {}) if isinstance(jd, dict) else {}
        except Exception:  # was a bare except: malformed judge JSON -> no detail
            sc = {}
        parsed[tid] = sc
    ax = {}
    for an, ai in AXIS_MAP.items():
        vals = []
        for tid, sc in parsed.items():
            t = tm.get(tid)
            if not t:
                continue
            rv = [float(sc.get(r, 0.5)) for r in ai["rubrics"] if r in sc]
            w = 1.5 if (ai["ticos"] and t.ticos_type in ai["ticos"]) else 1.0
            if rv:
                vals.append(np.mean(rv) * w)
        ax[an] = round(min(np.mean(vals) * 100, 100), 2) if vals else 0.0
    return ax
436
 
437
- def compute_final_score(results, tasks):
438
- tm={t.task_id:t for t in tasks}; ds={}
439
  for tid,d in results.items():
440
- if d["score"]<0: continue
441
  t=tm.get(tid)
442
- if t: ds.setdefault(t.domain,[]).append(d["score"])
443
  da={d:np.mean(v) for d,v in ds.items() if v}
444
  gd={}
445
- for t in tasks: gd.setdefault(t.grade,set()).add(t.domain)
446
  ws,wt=0,0
447
  for g,doms in gd.items():
448
  w=GRADE_WEIGHT.get(g,1.0)
449
  for d in doms:
450
- if d in da: ws+=da[d]*w; wt+=w
451
  base=ws/wt if wt>0 else 0
452
  axis=compute_axis_scores(results,tasks)
453
  av=[max(v,0.01) for v in axis.values()]
@@ -455,77 +336,52 @@ def compute_final_score(results, tasks):
455
  har_p=har/100.0
456
  return round(base*har_p,2),round(base,2),round(har_p,3),axis,da
457
 
458
def determine_agi_stage(score, axis):
    """Pick the AGI stage band for a final score.

    Stages 4+ additionally require every axis score to be >= 60; otherwise
    the result is demoted to stage 3 (Pre-AGI).
    """
    balanced = bool(axis) and all(v >= 60 for v in axis.values())
    for stage in reversed(AGI_STAGES):
        if score < stage["min"]:
            continue
        if stage["stage"] >= 4 and not balanced:
            return AGI_STAGES[2]  # demote: high total but an axis lags
        return stage
    return AGI_STAGES[0]
465
 
466
- # ════════════════════════════════════════════════════════════════
467
- # Β§7. Checkpoint DB
468
- # ════════════════════════════════════════════════════════════════
469
  DB_PATH="final_bench_eval.db"
470
  def _init_db():
471
- c=sqlite3.connect(DB_PATH)
472
- c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))")
473
- c.commit(); c.close()
474
- def _make_run_id(m): return hashlib.md5(f"FINALv42_BL_{m}".encode()).hexdigest()[:12]
475
  def _save_result(rid,tid,resp,jresp,sc):
476
- c=sqlite3.connect(DB_PATH); c.execute("INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",(rid,tid,resp,jresp,sc,time.time())); c.commit(); c.close()
477
  def _load_all(rid):
478
- """β˜… μΊμ‹œ λ‘œλ“œ μ‹œ μ‹€νŒ¨ κ²°κ³Ό(score=0 + API_ERROR) μžλ™ μ œμ™Έ β†’ μž¬μ‹œλ„ 보μž₯"""
479
- c=sqlite3.connect(DB_PATH)
480
- cur=c.execute("SELECT task_id,model_response,judge_response,weighted_score FROM eval_results WHERE run_id=?",(rid,))
481
- rows=cur.fetchall(); c.close()
482
- result = {}
483
  for r in rows:
484
- resp = r[1] or ""
485
- score = r[3]
486
- # β˜… API μ—λŸ¬/빈 응닡/0점은 μΊμ‹œμ—μ„œ μ œμ™Έ β†’ λ‹€μŒ μ‹€ν–‰ μ‹œ μž¬μ‹œλ„
487
- if score <= 0 and (resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED") or resp == "[EMPTY]" or resp.startswith("[ERROR")):
488
- continue
489
- result[r[0]] = {"response": resp, "judge": r[2], "score": score}
490
  return result
491
  def _clear_run(rid):
492
- c=sqlite3.connect(DB_PATH); c.execute("DELETE FROM eval_results WHERE run_id=?",(rid,)); c.commit(); c.close()
493
  _init_db()
494
 
495
- # ════════════════════════════════════════════════════════════════
496
- # Β§8. CSV Export
497
- # ════════════════════════════════════════════════════════════════
498
def generate_csv(results, tasks, model_name, judge_name, mode="BASELINE"):
    """Serialize evaluation results to CSV text (one row per scored task).

    results: {task_id: {"score": float, "judge": JSON str or dict,
                        "response": str}}
    tasks:   iterable of task objects (task_id/domain/grade/... attributes);
             results without a matching task are skipped.
    Rows with score < 0 are emitted as -1 with a JUDGE_FAILED comment.
    """
    out = io.StringIO()
    w = csv.writer(out)
    w.writerow(["task_id","domain","grade","ticos_type","difficulty","title",
                "eval_model","judge_model","mode","weighted_score",
                "process_quality","metacognitive_accuracy","error_recovery",
                "integration_depth","final_correctness",
                "judge_comment","response_preview","timestamp"])
    tm = {t.task_id: t for t in tasks}
    for tid, d in sorted(results.items()):
        t = tm.get(tid)
        if not t:
            continue
        jd = {}
        try:
            jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else (d["judge"] or {})
        except Exception:  # was a bare except: unparseable judge payload -> empty detail
            pass
        sc = jd.get("scores", {}) if isinstance(jd, dict) else {}
        cm = (jd.get("comment", "") if isinstance(jd, dict) else "")[:200]
        s = d["score"]
        if s < 0:
            s = -1
            cm = f"JUDGE_FAILED:{cm}"
        w.writerow([tid, t.domain, t.grade, t.ticos_type, t.difficulty, t.title,
                    model_name, judge_name, mode, s,
                    sc.get("process_quality",""), sc.get("metacognitive_accuracy",""),
                    sc.get("error_recovery",""), sc.get("integration_depth",""),
                    sc.get("final_correctness",""),
                    cm, (d.get("response","") or "")[:300].replace("\n"," "),
                    datetime.now().isoformat()])
    return out.getvalue()
524
 
525
- # ════════════════════════════════════════════════════════════════
526
- # Β§9. HTML Builders
527
- # ════════════════════════════════════════════════════════════════
528
- CSS = """<style>
529
  .eval-table{width:100%;border-collapse:collapse;font-size:0.82em}
530
  .eval-table th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc;font-size:0.9em}
531
  .eval-table td{padding:5px 8px;border-bottom:1px solid #eee}
@@ -541,403 +397,256 @@ CSS = """<style>
541
  </style>"""
542
 
543
  def _sc(s):
544
- if s>=80: return "#4caf50"
545
- if s>=60: return "#ff9800"
546
- if s>=40: return "#ff5722"
547
  return "#f44336"
548
 
549
def _build_progress_table(results, tasks):
    """Render the per-task progress table (one HTML row per task, done or pending)."""
    parts = []
    for task in tasks:
        meta = DOMAIN_INFO.get(task.domain, {"icon": "?", "color": "#999"})
        grade_color = "#c62828" if task.grade == "A" else "#1565c0" if task.grade == "B" else "#6a1b9a"
        gb = f'<span style="background:{grade_color};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{task.grade}</span>'
        entry = results.get(task.task_id)
        if entry is None:
            # Not yet evaluated: dimmed placeholder row.
            parts.append(f'<tr style="opacity:0.35"><td>{task.task_id}</td><td>{meta["icon"]}</td><td>{gb}</td><td>{task.ticos_type.split("_")[0]}</td><td>{task.difficulty}</td><td>⏳</td><td>β€”</td></tr>')
            continue
        score = entry["score"]
        resp = entry.get("response", "")
        if score < 0:
            # Judge failed to produce a usable verdict for this task.
            parts.append(f'<tr style="background:#fff3e0"><td>{task.task_id}</td><td>{meta["icon"]} {task.domain[:15]}</td><td>{gb}</td><td>{task.ticos_type.split("_")[0]}</td><td>{task.difficulty}</td><td style="color:#ff9800">❌ JF</td><td>β€”</td></tr>')
        elif score == 0 and resp and (resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED") or resp == "[EMPTY]"):
            # Surface API/transport errors explicitly instead of showing a 0 score.
            err_short = html.escape(resp[:60])
            parts.append(f'<tr style="background:#ffebee"><td>{task.task_id}</td><td>{meta["icon"]} {task.domain[:15]}</td><td>{gb}</td><td>{task.ticos_type.split("_")[0]}</td><td>{task.difficulty}</td><td colspan="2" style="color:#c62828;font-size:0.75em">🚫 {err_short}</td></tr>')
        else:
            bar_color = _sc(score)
            parts.append(f'<tr><td>{task.task_id}</td><td>{meta["icon"]} {task.domain[:15]}</td><td>{gb}</td><td>{task.ticos_type.split("_")[0]}</td><td>{task.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(score,100)}%;background:{bar_color}"></div></div></td><td style="font-weight:700;color:{bar_color}">{score:.1f}</td></tr>')
    rows = "".join(parts)
    return f'{CSS}<table class="eval-table"><thead><tr><th>ID</th><th>Domain</th><th>G</th><th>TICOS</th><th>Diff</th><th>Score</th><th>Val</th></tr></thead><tbody>{rows}</tbody></table>'
568
 
569
def _build_summary_card(results, tasks, eval_label, judge_label, hf_status):
    """Render the final summary card: AGI stage badge, 5-axis bars, grade
    averages, the MA-ER gap diagnostic, and the pass-criteria checklist."""
    final,base,har_p,axis,dom_avgs=compute_final_score(results,tasks)
    stage=determine_agi_stage(final,axis)
    labels={"generalization":"🌐 Generalization","reasoning":"🧠 Reasoning","planning":"πŸ“‹ Planning","reliability":"🎯 Reliability","safety":"πŸ›‘οΈ Safety"}
    # One horizontal bar per reporting axis, colored by _sc().
    ax_html=""
    for an,av in axis.items():
        c=_sc(av)
        ax_html+=f'<div class="axis-row"><span style="width:120px;font-size:0.85em">{labels.get(an,an)}</span><div class="axis-bar"><div class="axis-fill" style="width:{min(av,100)}%;background:{c}"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c}">{av:.1f}</span></div>'
    # Mean of the domain averages for each grade bucket (A/B/C).
    gh=""
    for g in ["A","B","C"]:
        gd=[t.domain for t in tasks if t.grade==g]
        gs=[dom_avgs[d] for d in set(gd) if d in dom_avgs]
        if gs: a=np.mean(gs); gh+=f'<span style="margin-right:14px">{g}Γ—{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
    done=sum(1 for t in tasks if t.task_id in results)
    jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
    # Zero-score results whose response starts with "[" are API/transport errors.
    api_errs=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]==0 and (results[t.task_id].get("response","") or "").startswith("["))
    # MA-ER gap: declared metacognition (MA) vs demonstrated error recovery (ER).
    ma_vals,er_vals=[],[]
    for tid,d in results.items():
        if d["score"]<0: continue  # judge failures carry no usable rubric scores
        try:
            jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else d["judge"]
            sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
            if "metacognitive_accuracy" in sc: ma_vals.append(float(sc["metacognitive_accuracy"]))
            if "error_recovery" in sc: er_vals.append(float(sc["error_recovery"]))
        except: pass
    avg_ma=np.mean(ma_vals) if ma_vals else 0; avg_er=np.mean(er_vals) if er_vals else 0
    # Gap > 0.2 flags a "declaration-action gap" (talks about checking, doesn't).
    gap=avg_ma-avg_er; gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
    gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
    # Grade-A domain average feeds the third pass-criteria check below.
    ad=[t.domain for t in tasks if t.grade=="A"]
    asc_vals=[dom_avgs[d] for d in set(ad) if d in dom_avgs]
    aa=np.mean(asc_vals) if asc_vals else 0
    checks=[("Scoreβ‰₯80",final>=80),("Axesβ‰₯60",all(v>=60 for v in axis.values())),(f"A-avgβ‰₯75({aa:.0f})",aa>=75)]
    ch="".join([f'<span style="margin-right:8px">{"βœ…" if ok else "❌"}{lb}</span>' for lb,ok in checks])
    err_html=f'<div style="color:#ff5722;font-size:0.82em;margin-top:4px">⚠️ API Errors: {api_errs} tasks</div>' if api_errs else ""
    return f"""{CSS}<div class="summary-card">
    <div style="text-align:center">
    <div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div>
    <h2 style="margin:6px 0;font-size:1.6em">πŸ€– Baseline FINAL: {final:.1f}</h2>
    <p style="color:#aaa;font-size:0.85em">{stage['label']} Β· Base {base:.1f} Γ— HAR {har_p:.3f} Β· {done}/{len(tasks)}{f" Β· JF={jf}" if jf else ""}</p>
    <p style="color:#8af;font-size:0.82em;margin:4px 0">Eval: {eval_label} Β· Judge: {judge_label}</p>
    {err_html}
    </div><hr style="border-color:#333;margin:12px 0">
    <h4 style="color:#aaa;margin:6px 0">🎯 5-Axis Scores</h4>{ax_html}
    <hr style="border-color:#333;margin:10px 0">
    <div style="font-size:0.88em">{gh}</div>
    <div style="display:flex;align-items:center;gap:12px;margin:8px 0;padding:8px;background:rgba(255,255,255,0.05);border-radius:8px">
    <span style="font-size:0.85em">MA-ER Gap:</span>
    <span style="font-weight:700;color:{gc}">{gap:.3f}</span>
    <span style="font-size:0.8em;color:{gc}">({gl})</span>
    <span style="font-size:0.78em;color:#888">MA={avg_ma:.3f} ER={avg_er:.3f}</span></div>
    <div style="font-size:0.82em;margin-top:6px">{ch}</div>
    <p style="font-size:0.78em;color:#666;margin-top:8px">{hf_status}</p>
    <div style="background:rgba(233,69,96,0.15);border:1px solid #e94560;border-radius:8px;padding:10px;margin-top:12px">
    <p style="font-size:0.82em;color:#e94560;margin:0">πŸ”’ <b>MetaCog (Self-Correction) evaluation: COMING SOON</b></p>
    <p style="font-size:0.75em;color:#aaa;margin:4px 0 0 0">The 3-Phase Protocol can boost performance up to 70%+ on hardest tasks.</p>
    </div></div>"""
627
-
628
def _build_detail_view(results, tasks):
    """Render one collapsible <details> element per evaluated task."""
    sections = []
    for task in tasks:
        entry = results.get(task.task_id)
        if entry is None:
            continue
        meta = DOMAIN_INFO.get(task.domain, {"icon": "?"})
        score = entry["score"]
        resp = html.escape((entry.get("response", "") or "")[:500])
        judge_comment = ""
        score_summary = ""
        try:
            raw = entry["judge"]
            parsed = json.loads(raw) if isinstance(raw, str) else (raw or {})
            judge_comment = html.escape((parsed.get("comment", "") if isinstance(parsed, dict) else "")[:200])
            rubric_scores = parsed.get("scores", {}) if isinstance(parsed, dict) else {}
            score_summary = " Β· ".join(f"{k.split('_')[0]}={v}" for k, v in rubric_scores.items())
        except: pass
        color = _sc(score) if score >= 0 else "#ff9800"
        badge = f"{score:.1f}" if score >= 0 else "JF"
        sections.append(f'<details style="margin:3px 0;border:1px solid #ddd;border-radius:8px;padding:8px"><summary style="cursor:pointer;font-weight:600">{meta["icon"]} {task.task_id} [{task.grade}] β€” <span style="color:{color}">{badge}</span></summary><div style="font-size:0.8em;margin-top:6px"><b>{task.title}</b><br>TICOS: {task.ticos_type} | Scores: {score_summary}<br>Judge: {judge_comment}<br>Response: {resp}...</div></details>')
    return CSS + "".join(sections)
644
 
645
- # ════════════════════════════════════════════════════════════════
646
- # Β§10. Evaluation Engine (Baseline Only)
647
- # ════════════════════════════════════════════════════════════════
648
-
649
def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
                 judge_api_key, judge_model_id, judge_provider, state):
    """Evaluate one task end-to-end: query the eval model, then score with the judge.

    Returns (task_id, {"response", "judge", "score"}); score is 0 for API
    errors and -1.0 for judge failures. Progress counters in `state` are
    mutated under state["lock"]. Never raises — all exceptions are recorded
    as an [ERROR] result.
    """
    try:
        sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
               f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. "
               f"If unsure, say so honestly.")
        print(f" β–Ά {task.task_id} β†’ {eval_provider}/{eval_model_id}")
        model_response=call_model(task.prompt,system=sys_p,api_key=eval_api_key,
                                  model_id=eval_model_id,provider=eval_provider,max_tokens=12288)
        if (model_response.startswith("[API_ERROR") or
            model_response.startswith("[BLOCKED") or
            model_response=="[EMPTY]"):
            print(f" βœ— {task.task_id}: {model_response[:100]}")
            # API errors are persisted, but _load_all excludes them automatically
            # on resume so the task is retried next run.
            _save_result(run_id,task.task_id,model_response,"{}",0)
            with state["lock"]:
                state["done"]+=1
                state["errors"].append(f"{task.task_id}: {model_response[:80]}")
            return task.task_id,{"response":model_response,"judge":"{}","score":0}

        print(f" βœ“ {task.task_id} response len={len(model_response)}")
        jp = build_judge_prompt(task, model_response)
        jd = call_judge(jp, system=JUDGE_SYSTEM, api_key=judge_api_key,
                        model_id=judge_model_id, provider=judge_provider)

        if jd is None:
            # Judge produced unparsable output: zeroed rubric, flagged as failed.
            jd={"scores":{k:0.0 for k in RUBRIC},"comment":"JUDGE_PARSE_FAILED","failed":True}

        if jd.get("failed"):
            ws=-1.0; jd["comment"]=f"JF:{jd.get('comment','')}"  # -1.0 marks judge failure downstream
        else:
            ws=compute_task_score(jd["scores"])
            with state["lock"]: state["parse_ok"]+=1

        jj=json.dumps(jd,ensure_ascii=False)
        _save_result(run_id,task.task_id,model_response,jj,ws)
        with state["lock"]:
            state["done"]+=1
            info=DOMAIN_INFO.get(task.domain,{"icon":"?"})
            # Keep a rolling window of the 10 most recent task badges for the UI.
            state["active"].append(f'{info["icon"]} {task.task_id}')
            if len(state["active"])>10: state["active"]=state["active"][-10:]
        return task.task_id,{"response":model_response,"judge":jj,"score":ws}
    except Exception as e:
        print(f" βœ— {task.task_id} EXCEPTION: {e}")
        with state["lock"]: state["done"]+=1; state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
        _save_result(run_id,task.task_id,f"[ERROR] {e}","{}",0)
        return task.task_id,{"response":f"[ERROR] {e}","judge":"{}","score":0}
696
 
697
- # ════════════════════════════════════════════════════════════════
698
- # Β§11. State Machine + Background Thread
699
- # ════════════════════════════════════════════════════════════════
700
-
701
# Shared mutable state for the single background evaluation run.
# All mutation happens under _EVAL_STATE["lock"]; the Gradio timer thread
# reads snapshots of it via _poll().
_EVAL_STATE={
    # lifecycle flags
    "running":False,"stop_requested":False,"finished":False,
    # run identity / UI labels and overall progress counters
    "run_id":"","eval_label":"","judge_label":"","done":0,"total":0,"cached":0,
    # worker diagnostics (recent errors, active task badges, judge parse stats)
    "errors":[],"active":[],"parse_ok":0,"parse_fail":0,
    # timing plus accumulated per-task results and the active task list
    "start_time":0,"results":{},"tasks":[],
    # per-grade (A/B/C) progress for the grade progress bars
    "grade_done":{},"grade_total":{},
    "lock":threading.Lock(),"message":"","csv_path":None,"hf_status":"","n_workers":5,
}
 
710
def _reset():
    """Restore every per-run field of the shared eval state to its idle default."""
    idle_defaults = {
        "running": False, "stop_requested": False, "finished": False,
        "done": 0, "cached": 0, "errors": [], "active": [],
        "parse_ok": 0, "parse_fail": 0, "start_time": 0,
        "results": {}, "tasks": [], "grade_done": {}, "grade_total": {},
        "message": "", "csv_path": None, "hf_status": "",
    }
    with _EVAL_STATE["lock"]:
        _EVAL_STATE.update(idle_defaults)
-
717
def _prog_html(state, pending):
    """Render the live progress panel: overall bar, per-grade bars, the most
    recent active-task badges, and the last few worker errors.

    `pending` is the denominator for the overall percentage.
    """
    done=state["done"]; pct=min(int(done/max(pending,1)*100),100)
    gb=""
    for g in ["A","B","C"]:
        gt=state["grade_total"].get(g,0); gd=state["grade_done"].get(g,0)
        if gt==0: continue  # grade absent from this run's pending set
        gp=min(int(gd/gt*100),100)
        # green when complete, blue while in progress, grey before start
        c="#4caf50" if gp==100 else("#1976d2" if gp>0 else "#e0e0e0")
        emoji="πŸ…°οΈ" if g=="A" else "πŸ…±οΈ" if g=="B" else "πŸ…ΎοΈ"
        gb+=f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:100px;font-size:0.85em">{emoji} {g}Γ—{GRADE_WEIGHT[g]}</span><div style="flex:1;background:#e0e0e0;border-radius:6px;height:14px;overflow:hidden"><div style="width:{gp}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:55px;font-size:0.82em;text-align:right;color:{c}">{gd}/{gt}</span></div>'
    o=f'<div style="margin:8px 0"><div style="display:flex;justify-content:space-between;font-size:0.95em;margin-bottom:6px"><span>⚑ <b>πŸ€– Baseline</b> β€” {done}/{pending}</span><span style="font-weight:700">{pct}%</span></div><div class="progress-bar"><div class="progress-fill" style="width:{pct}%"></div></div>{gb}'
    ac=state.get("active",[])
    # Show up to the eight most recently completed task badges.
    if ac: o+='<div style="margin-top:8px">πŸ”„ '+" ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em">{a}</span>' for a in ac[-8:]])+'</div>'
    er=state.get("errors",[])
    if er:
        # Only the six most recent errors, each truncated and HTML-escaped.
        o+=f'<div style="color:#c62828;margin-top:6px;font-size:0.8em;max-height:120px;overflow-y:auto">'
        for e in er[-6:]:
            o+=f'<div>⚠️ {html.escape(e[:100])}</div>'
        o+='</div>'
    return o+'</div>'
737
 
738
def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
             judge_api_key, judge_model_id, judge_provider, judge_label,
             tasks, run_id, n_workers):
    """Background worker thread: runs the whole evaluation, then writes the CSV.

    Communicates exclusively through the module-level _EVAL_STATE dict
    (progress counters, per-task results, final message). Catches every
    exception so the UI always reaches a finished state.
    """
    global _EVAL_STATE
    try:
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["start_time"]=time.time()
            _EVAL_STATE["message"]=f"⚑ Eval: {eval_label} · Judge: {judge_label} · {len(tasks)} tasks"
        # _load_all now automatically excludes failed results, so anything
        # cached here is a valid, completed task (resume support).
        results=dict(_load_all(run_id))
        cached=sum(1 for t in tasks if t.task_id in results)
        pending=[t for t in tasks if t.task_id not in results]
        print(f" πŸ“Š Cached (valid): {cached} / Pending: {len(pending)} / Total: {len(tasks)}")
        # Bucket the pending tasks by grade for the per-grade progress bars.
        gt={}
        for t in pending: gt.setdefault(t.grade,[]).append(t)
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["results"]=results; _EVAL_STATE["cached"]=cached
            _EVAL_STATE["total"]=len(pending)
            _EVAL_STATE["grade_total"]={g:len(ts) for g,ts in gt.items()}
            _EVAL_STATE["grade_done"]={g:0 for g in gt}
            _EVAL_STATE["done"]=0; _EVAL_STATE["errors"]=[]; _EVAL_STATE["active"]=[]
        if pending:
            with ThreadPoolExecutor(max_workers=n_workers) as ex:
                futs={}
                for t in pending:
                    # Honor a stop request even while still submitting work.
                    if _EVAL_STATE["stop_requested"]: break
                    futs[ex.submit(_eval_single,t,run_id,
                                   eval_api_key,eval_model_id,eval_provider,
                                   judge_api_key,judge_model_id,judge_provider,
                                   _EVAL_STATE)]=t
                # Poll futures manually (instead of as_completed) so a stop
                # request can cancel everything mid-flight.
                done_set=set()
                while len(done_set)<len(futs):
                    if _EVAL_STATE["stop_requested"]: ex.shutdown(wait=False,cancel_futures=True); break
                    for f in list(futs):
                        if f in done_set: continue
                        if f.done():
                            done_set.add(f)
                            try:
                                tid,data=f.result()
                                with _EVAL_STATE["lock"]:
                                    _EVAL_STATE["results"][tid]=data
                                    to=futs[f]; _EVAL_STATE["grade_done"][to.grade]=_EVAL_STATE["grade_done"].get(to.grade,0)+1
                            except: pass  # worker already recorded its own error
                    time.sleep(0.5)
        with _EVAL_STATE["lock"]: results=dict(_EVAL_STATE["results"])
        # Aggregate, persist the CSV, and publish the final summary message.
        final,base,har,axis,_=compute_final_score(results,tasks)
        stage=determine_agi_stage(final,axis)
        csv_str=generate_csv(results,tasks,eval_label,judge_label,"BASELINE")
        cp=f"/tmp/final_{run_id}.csv"
        with open(cp,"w",encoding="utf-8") as f: f.write(csv_str)
        elapsed=int(time.time()-_EVAL_STATE["start_time"])
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["csv_path"]=cp; _EVAL_STATE["hf_status"]=""
            _EVAL_STATE["message"]=f"🏁 {stage['name']} β€” FINAL={final:.1f} Β· {elapsed}s"
            _EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
    except Exception as e:
        print(f" ❌ Fatal: {e}")
        import traceback; traceback.print_exc()
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["message"]=f"❌ Fatal: {str(e)[:100]}"
            _EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
799
-
800
def _start_eval(eval_api_key, judge_api_key, eval_model_choice, judge_model_choice,
                grade_f, diff_f, max_t, n_w, fresh):
    """Validate inputs, filter the task list, and spawn the background eval thread.

    Returns a one-line status string for the UI. `fresh=True` wipes any cached
    results for this run_id before starting; otherwise the run resumes from
    whatever _load_all finds.
    """
    global _EVAL_STATE
    if _EVAL_STATE["running"]: return "⚠️ Already running"
    eval_api_key=(eval_api_key or "").strip()
    judge_api_key=(judge_api_key or "").strip()

    eval_model_id, eval_provider = _resolve_model(eval_model_choice)
    judge_model_id, judge_provider = _resolve_model(judge_model_choice)

    if not eval_api_key: return f"❌ {eval_provider} API Key required for Eval model"
    if not judge_api_key: return f"❌ {judge_provider} API Key required for Judge model"

    # Apply grade/difficulty filters, then cap the run at max_t tasks.
    tasks=ALL_TASKS[:]
    if grade_f!="All": tasks=[t for t in tasks if t.grade==grade_f]
    if diff_f!="All": tasks=[t for t in tasks if t.difficulty==diff_f]
    tasks=tasks[:int(max_t)]
    rid=_make_run_id(eval_model_id)
    if fresh: _clear_run(rid)
    _reset()
    with _EVAL_STATE["lock"]:
        _EVAL_STATE.update({"running":True,"run_id":rid,
                            "eval_label":eval_model_choice,"judge_label":judge_model_choice,
                            "tasks":tasks,"total":len(tasks),"n_workers":int(n_w)})
    # Daemon thread: dies with the process; progress is polled via _poll().
    threading.Thread(target=_bg_eval,daemon=True,
                     args=(eval_api_key,eval_model_id,eval_provider,eval_model_choice,
                           judge_api_key,judge_model_id,judge_provider,judge_model_choice,
                           tasks,rid,int(n_w))).start()
    return f"⚑ Started β€” Eval: {eval_model_choice} Β· Judge: {judge_model_choice} ({len(tasks)} tasks)"
829
 
830
def _stop():
    """Request cancellation of a running evaluation; no-op when idle."""
    if not _EVAL_STATE["running"]:
        return "ℹ️ Not running"
    _EVAL_STATE["stop_requested"] = True
    return "⏹️ Stopping..."
833
 
834
def _poll():
    """Timer callback: snapshot the shared state and render the five UI outputs.

    Returns (progress_html, table_html, summary_html, detail_html, csv_path).
    Summary/detail panes and the CSV path are only populated once finished.
    """
    with _EVAL_STATE["lock"]:
        running=_EVAL_STATE["running"]; finished=_EVAL_STATE["finished"]
        tasks=_EVAL_STATE.get("tasks",[]); results=dict(_EVAL_STATE.get("results",{}))
        msg=_EVAL_STATE.get("message",""); cp=_EVAL_STATE.get("csv_path")
    if not running and not finished and not results:
        return("ℹ️ Configure API keys, select models, then press ▢️ Start","","","",None)
    if running:
        # NOTE(review): _bg_eval sets "total" to the pending count already, so
        # subtracting "cached" again looks doubly discounted — confirm intent.
        pend=_EVAL_STATE.get("total",0)-_EVAL_STATE.get("cached",0)
        ph=CSS+_prog_html(_EVAL_STATE,pend)
    elif finished:
        ph=f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;font-weight:600">{msg}</div>'
    else: ph=msg
    th=_build_progress_table(results,tasks) if tasks else ""
    sh,dh,co="","",None
    if finished and tasks:
        el=_EVAL_STATE.get("eval_label","?")
        jl=_EVAL_STATE.get("judge_label","?")
        hf_st=_EVAL_STATE.get("hf_status","")
        sh=_build_summary_card(results,tasks,el,jl,hf_st)
        dh=_build_detail_view(results,tasks)
        co=cp
    return(ph,th,sh,dh,co)
857
 
858
- # ════════════════════════════════════════════════════════════════
859
- # Β§12. Gradio App
860
- # ════════════════════════════════════════════════════════════════
861
-
862
# Landing-page HTML banner for the Gradio UI.
# NOTE: the provider chips below must stay in sync with PROVIDER_MODELS.
HEADER = """
<div style="text-align:center;padding:16px 0">
<h1 style="margin:0;font-size:1.8em">πŸ† FINAL Bench v4.2 β€” Baseline Evaluation</h1>
<h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
<p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto">
<b>100 Tasks Β· 15 Domains Β· 8 TICOS Β· 5-Axis Β· 5-Stage AGI Grade</b><br>
πŸ€– Baseline (Non-AGI) β€” Single LLM Evaluation Β· Multi-Provider<br>
Both <b>Eval</b> and <b>Judge</b> support OpenAI / Anthropic / Google
</p>
<div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
<span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI Β· GPT-5.2 / 5-Mini / 4.1 / o4-mini / 4o</span>
<span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic Β· Opus 4.6 / Sonnet 4.5 / Haiku 4.5</span>
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google Β· Gemini 3 Pro Preview</span>
</div>
<div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
<p style="color:#e94560;font-size:0.85em;margin:0">πŸ”’ <b>MetaCog (Self-Correction Protocol): COMING SOON</b></p>
</div>
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
<a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">πŸ“Š Dataset</a>
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">πŸ† Leaderboard</a>
</div></div>"""
883
 
884
  def create_app():
885
- with gr.Blocks(title="FINAL Bench v4.2",
886
- css=".gradio-container{max-width:1100px !important} header{display:none!important}") as app:
887
  gr.HTML(HEADER)
888
-
889
  gr.Markdown("### πŸ”‘ API Keys")
890
- gr.HTML('<p style="color:#888;font-size:0.82em;margin:0 0 6px 0">Enter the API key matching each model\'s provider. Same key OK if both use same provider.</p>')
891
  with gr.Row():
892
- eval_api_key=gr.Textbox(label="πŸ€– Eval Model API Key",type="password",
893
- placeholder="sk-... / sk-ant-... / AIza...",
894
- info="OpenAI / Anthropic / Google key for eval",scale=3)
895
- judge_api_key=gr.Textbox(label="βš–οΈ Judge Model API Key",type="password",
896
- placeholder="sk-... / sk-ant-... / AIza...",
897
- info="OpenAI / Anthropic / Google key for judge",scale=3)
898
-
899
  gr.Markdown("### πŸ€– Model Selection")
900
  with gr.Row():
901
- eval_m=gr.Dropdown(label="πŸ€– Evaluation Target",choices=MODEL_CHOICES,
902
- value=DEFAULT_EVAL,info="Model to evaluate",scale=3)
903
- judge_m=gr.Dropdown(label="βš–οΈ Judge Model",choices=MODEL_CHOICES,
904
- value=DEFAULT_JUDGE,info="Model that scores responses",scale=3)
905
-
906
  gr.Markdown("### βš™οΈ Settings")
907
  with gr.Row():
908
  gf=gr.Dropdown(["All","A","B","C"],value="All",label="Grade Filter",scale=1)
909
- df=gr.Dropdown(["All","expert","frontier"],value="All",label="Difficulty Filter",scale=1)
910
  mt=gr.Slider(1,100,value=100,step=1,label="Max Tasks",scale=1)
911
  nw=gr.Slider(1,10,value=5,step=1,label="Workers",scale=1)
912
-
913
  with gr.Row():
914
  s_btn=gr.Button("▢️ Start (Resume)",variant="primary",size="lg",scale=2)
915
  f_btn=gr.Button("πŸš€ Fresh Start",variant="secondary",size="lg",scale=2)
916
  x_btn=gr.Button("⏹️ Stop",variant="stop",size="lg",scale=1)
917
  status=gr.Textbox(label="Status",interactive=False,max_lines=2)
918
-
919
  with gr.Tabs():
920
- with gr.Tab("πŸ“Š Progress"): p_html=gr.HTML()
921
- with gr.Tab("πŸ“‹ Results"): t_html=gr.HTML()
922
- with gr.Tab("πŸ† FINAL Score"): s_html=gr.HTML()
923
- with gr.Tab("πŸ” Details"): d_html=gr.HTML()
924
- with gr.Tab("πŸ’Ύ CSV"): c_file=gr.File(label="CSV")
925
-
926
  timer=gr.Timer(value=2,active=True)
927
  timer.tick(fn=_poll,outputs=[p_html,t_html,s_html,d_html,c_file])
928
-
929
  eval_ins=[eval_api_key,judge_api_key,eval_m,judge_m,gf,df,mt,nw]
930
  s_btn.click(fn=lambda *a:_start_eval(*a,fresh=False),inputs=eval_ins,outputs=[status])
931
  f_btn.click(fn=lambda *a:_start_eval(*a,fresh=True),inputs=eval_ins,outputs=[status])
932
  x_btn.click(fn=_stop,outputs=[status])
 
 
933
 
934
if __name__=="__main__":
    # Print a startup banner with task/grade/domain statistics, then serve.
    grade_counts, domain_counts = {}, {}
    for task in ALL_TASKS:
        grade_counts[task.grade] = grade_counts.get(task.grade, 0) + 1
        domain_counts[task.domain] = domain_counts.get(task.domain, 0) + 1
    banner = "=" * 60
    print(f"\n{banner}\n FINAL Bench v4.2 β€” Baseline (Non-AGI)\n Eval & Judge: OpenAI / Anthropic / Google\n{banner}")
    print(f" {len(ALL_TASKS)} tasks | {len(domain_counts)} domains")
    for grade in ("A", "B", "C"):
        print(f" Grade {grade} (Γ—{GRADE_WEIGHT[grade]}): {grade_counts.get(grade, 0)}")
    print(f" πŸ”’ MetaCog: COMING SOON\n{banner}\n")
    app = create_app()
    app.queue(default_concurrency_limit=2)
    app.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
 
1
+ """
2
+ FINAL Bench v4.2 β€” Baseline (Non-AGI) Evaluation System
3
+ =========================================================
4
+ β˜… Multi-Provider: OpenAI / Anthropic / Google (Gemini 3 Pro Preview)
5
+ β˜… Both Eval Model AND Judge Model support all 3 providers
6
+ β˜… 100 Tasks Β· 15 Domains Β· 8 TICOS Types Β· 5-Axis Β· 5-Stage AGI Grade
7
+ β˜… Dataset: HuggingFace FINAL-Bench/Metacognitive
8
+ Author: Ginigen AI β€” Choi Sunyoung | License: Apache 2.0
9
+ """
10
  import json, os, time, csv, io, re, html, hashlib, sqlite3, threading, random
11
  from datetime import datetime
12
  from dataclasses import dataclass, field
 
17
  from concurrent.futures import ThreadPoolExecutor
18
  from datasets import load_dataset
19
 
 
 
 
 
20
# ── Per-domain display metadata (icon + accent color) for the 15 task domains. ──
DOMAIN_INFO = {
    "Mathematics & Logic":{"icon":"πŸ”’","color":"#FF6B35"},"Science":{"icon":"πŸ”¬","color":"#7B2FF7"},
    "Philosophy":{"icon":"πŸ€”","color":"#00B4D8"},"Medicine":{"icon":"πŸ₯","color":"#2EC4B6"},
    "Economics":{"icon":"πŸ“ˆ","color":"#E63946"},"History":{"icon":"πŸ“œ","color":"#F4A261"},
    "War & Security":{"icon":"πŸ›‘οΈ","color":"#264653"},"Space & Physics":{"icon":"πŸš€","color":"#6C63FF"},
    "Chemistry & Biology":{"icon":"🧬","color":"#06D6A0"},"Language & Writing":{"icon":"✍️","color":"#EF476F"},
    "Literature":{"icon":"πŸ“–","color":"#8338EC"},"Art":{"icon":"🎨","color":"#FF006E"},
    "Religion & Mythology":{"icon":"πŸ•ŠοΈ","color":"#FFD166"},"Ethics":{"icon":"βš–οΈ","color":"#118AB2"},
    "AI & Technology":{"icon":"πŸ€–","color":"#073B4C"},
}
# Grade multipliers applied when aggregating scores (A counts 1.5x, C 0.7x).
GRADE_WEIGHT={"A":1.5,"B":1.0,"C":0.7}
# Judge rubric: five dimensions; the weights sum to 1.0.
RUBRIC={
    "process_quality":{"weight":0.25,"desc":"Systematic reasoning transparency"},
    "metacognitive_accuracy":{"weight":0.25,"desc":"Confidence calibration + uncertainty honesty"},
    "error_recovery":{"weight":0.20,"desc":"Mid-analysis self-correction"},
    "integration_depth":{"weight":0.15,"desc":"Multi-perspective synthesis"},
    "final_correctness":{"weight":0.15,"desc":"Answer accuracy and completeness"},
}
# Maps each of the 5 reporting axes to the rubric dimensions (and TICOS task
# types) that feed it.
AXIS_MAP={
    "generalization":{"rubrics":["process_quality","final_correctness"],"ticos":[]},
    "reasoning":{"rubrics":["process_quality","error_recovery"],"ticos":["E_SelfCorrecting","C_ProgressiveDiscovery"]},
    "planning":{"rubrics":["integration_depth","process_quality"],"ticos":["D_MultiConstraint","H_DecisionUnderUncertainty"]},
    "reliability":{"rubrics":["metacognitive_accuracy"],"ticos":["E_SelfCorrecting","G_PivotDetection"]},
    "safety":{"rubrics":["error_recovery","metacognitive_accuracy"],"ticos":["A_TrapEscape","G_PivotDetection"]},
}
# Five AGI maturity bands keyed by inclusive [min, max] FINAL-score ranges.
AGI_STAGES=[
    {"stage":1,"name":"FINAL-Partial","label":"Partial Intelligence","min":0,"max":39,"color":"#f44336"},
    {"stage":2,"name":"FINAL-Proto","label":"Proto Intelligence","min":40,"max":59,"color":"#ff9800"},
    {"stage":3,"name":"FINAL-Pre","label":"Pre-AGI","min":60,"max":79,"color":"#2196f3"},
    {"stage":4,"name":"FINAL-Pass","label":"AGI Achieved","min":80,"max":94,"color":"#4caf50"},
    {"stage":5,"name":"FINAL-Post","label":"Operationally Mature AGI","min":95,"max":100,"color":"#9c27b0"},
]
52
 
53
@dataclass
class FinalTask:
    """A single FINAL-Bench evaluation task loaded from the HF dataset."""
    task_id: str
    domain: str
    grade: str
    ticos_type: str
    difficulty: str
    lens: str
    title: str
    prompt: str
    expected_behavior: str
    hidden_trap: str
    ticos_required: List[str] = field(default_factory=list)
    metadata: Dict = field(default_factory=dict)
 
 
 
 
 
61
def load_tasks():
    """Load the FINAL-Bench task set from the HuggingFace Hub.

    Returns:
        list[FinalTask] parsed from the FINAL-Bench/Metacognitive train split.

    Raises:
        FileNotFoundError: when the dataset cannot be downloaded or parsed
        (original cause is chained for debugging).
    """
    print("πŸ“₯ Loading FINAL-Bench/Metacognitive from HuggingFace...")
    try:
        ds = load_dataset("FINAL-Bench/Metacognitive", split="train")
        tasks = []
        for row in ds:
            tr = row.get("ticos_required", [])
            if isinstance(tr, str):
                # The field may be stored as a JSON list or a comma-separated string.
                try:
                    tr = json.loads(tr)
                except (TypeError, ValueError):
                    tr = [x.strip() for x in tr.split(",") if x.strip()]
            tasks.append(FinalTask(
                task_id=row["task_id"], domain=row["domain"], grade=row["grade"],
                ticos_type=row["ticos_type"], difficulty=row["difficulty"],
                lens=row.get("lens", ""), title=row.get("title", row["task_id"]),
                prompt=row["prompt"],
                expected_behavior=row.get("expected_behavior", ""),
                hidden_trap=row.get("hidden_trap", ""),
                ticos_required=tr if isinstance(tr, list) else [], metadata={}))
        print(f" βœ… Loaded {len(tasks)} tasks from HuggingFace")
        return tasks
    except Exception as e:
        print(f" ⚠️ HF load failed: {e}")
        # Chain the original cause instead of discarding it.
        raise FileNotFoundError("Dataset not found!") from e

ALL_TASKS = load_tasks()
print(f"βœ… FINAL Bench v4.2: {len(ALL_TASKS)} tasks loaded")
84
 
85
+ # ═══ Β§3. Model Registry ═══
86
+ PROVIDER_MODELS={
87
+ "OpenAI":{
88
+ "gpt-5.2":"GPT-5.2 (flagship)","gpt-5-mini":"GPT-5 Mini",
89
+ "gpt-4.1":"GPT-4.1","o4-mini":"o4-mini (reasoning)","gpt-4o":"GPT-4o",
 
 
 
 
 
 
90
  },
91
+ "Anthropic":{
92
+ "claude-opus-4-6":"Claude Opus 4.6",
93
+ "claude-sonnet-4-5-20250929":"Claude Sonnet 4.5",
94
+ "claude-haiku-4-5-20251001":"Claude Haiku 4.5",
95
  },
96
+ "Google":{
97
+ "gemini-3-pro-preview":"Gemini 3 Pro Preview",
 
 
98
  },
99
  }
100
+ ALL_MODELS={}
101
+ for prov,models in PROVIDER_MODELS.items():
102
+ for mid,label in models.items():
103
+ ALL_MODELS[f"{label} [{prov}]"]={"id":mid,"provider":prov}
104
+ MODEL_CHOICES=list(ALL_MODELS.keys())
105
+ DEFAULT_EVAL="GPT-5.2 (flagship) [OpenAI]"
106
+ DEFAULT_JUDGE="GPT-5.2 (flagship) [OpenAI]"
 
 
 
107
  def _resolve_model(choice):
108
+ info=ALL_MODELS.get(choice,{})
109
+ return info.get("id","gpt-5.2"),info.get("provider","OpenAI")
 
 
 
 
110
 
111
+ # ═══ Β§4. API Clients ═══
112
  def _strip_think(text):
113
+ if not text:return text
114
+ for tag in['think','thinking','reasoning','reflection']:
115
+ text=re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL)
116
  return text.strip()
117
 
118
def call_openai(prompt, system="", api_key="", model="gpt-5.2",
                max_tokens=8192, temperature=0.6, reasoning_effort=None,
                json_mode=False, json_schema=None):
    """Call the OpenAI chat-completions API and return the response text.

    Returns the (think-tag-stripped) assistant text, "[EMPTY]" for an empty
    completion, or an "[API_ERROR] ..." string. Callers branch on those string
    prefixes, so this function never raises and never returns None.
    Retries up to 3 times with linear back-off on transient failures.
    """
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    payload = {"model": model, "max_completion_tokens": max_tokens,
               "temperature": temperature, "messages": messages}
    if reasoning_effort:
        payload["reasoning_effort"] = reasoning_effort
    if json_schema:
        # Structured output: disable reasoning so the strict schema is honored.
        payload["reasoning_effort"] = "none"
        payload["response_format"] = {"type": "json_schema",
                                      "json_schema": {"name": "FINALJudge", "strict": True,
                                                      "schema": json_schema}}
    elif json_mode:
        payload["response_format"] = {"type": "json_object"}
    last_err = "retries exhausted"
    for attempt in range(3):
        try:
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=headers, data=json.dumps(payload), timeout=300)
            r.raise_for_status()
            c = r.json()["choices"][0]["message"]["content"]
            return _strip_think(c) if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            if r.status_code == 429:
                # Rate limited: back off, retry.
                last_err = "429 rate limited"
                time.sleep(5 * (attempt + 1))
                continue
            try:
                err = r.json().get("error", {}).get("message", "")
            except Exception:
                err = str(r.status_code)
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
                continue
            return f"[API_ERROR] OpenAI {r.status_code}: {err}"
        except Exception as e:
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] {e}"
    # BUG FIX: previously fell through and implicitly returned None after
    # three consecutive 429s, crashing callers that do .startswith(...).
    return f"[API_ERROR] OpenAI: {last_err}"
146
+
147
def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
                   max_tokens=8192, temperature=0.6):
    """Call the Anthropic Messages API and return the response text.

    Returns the (think-tag-stripped) concatenated text blocks, "[EMPTY]" for
    an empty completion, or an "[API_ERROR] ..." string. Never raises and
    never returns None. Retries up to 3 times, backing off on 429 (rate
    limit) and 529 (overloaded).
    """
    headers = {"Content-Type": "application/json", "x-api-key": api_key,
               "anthropic-version": "2023-06-01"}
    messages = [{"role": "user", "content": prompt}]
    payload = {"model": model, "max_tokens": max_tokens,
               "temperature": temperature, "messages": messages}
    if system:
        payload["system"] = system
    last_err = "retries exhausted"
    for attempt in range(3):
        try:
            r = requests.post("https://api.anthropic.com/v1/messages",
                              headers=headers, data=json.dumps(payload), timeout=300)
            r.raise_for_status()
            resp = r.json()
            # Concatenate only the "text" content blocks of the reply.
            text_parts = []
            for block in resp.get("content", []):
                if block.get("type") == "text":
                    text_parts.append(block["text"])
            c = "\n".join(text_parts)
            return _strip_think(c) if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            if r.status_code == 429:
                last_err = "429 rate limited"
                time.sleep(5 * (attempt + 1))
                continue
            if r.status_code == 529:
                # 529 = Anthropic "overloaded": back off longer.
                last_err = "529 overloaded"
                time.sleep(8 * (attempt + 1))
                continue
            try:
                err = r.json().get("error", {}).get("message", "")
            except Exception:
                err = str(r.status_code)
            return f"[API_ERROR] Claude {r.status_code}: {err}"
        except Exception as e:
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] {e}"
    # BUG FIX: previously returned None implicitly after persistent 429/529,
    # crashing callers that do .startswith(...) on the result.
    return f"[API_ERROR] Claude: {last_err}"
171
+
172
# β˜… Gemini β€” x-goog-api-key header Β· data=json.dumps Β· thinking skip
GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta"

def call_gemini(prompt, system="", api_key="", model="gemini-3-pro-preview",
                max_tokens=8192, temperature=1.0, json_mode=False):
    """Call the Gemini generateContent REST endpoint and return plain text.

    Skips "thought" parts, retries on 429/503 with jittered backoff, and
    never raises: blocked prompts and failures come back as "[API_ERROR] ..."
    strings, empty completions as "[EMPTY]".
    """
    url = f"{GEMINI_API_BASE}/models/{model}:generateContent"
    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}
    gen_config = {"maxOutputTokens": max_tokens, "temperature": temperature}
    payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}],
               "generationConfig": gen_config}
    if system:
        payload["systemInstruction"] = {"parts": [{"text": system}]}
    if json_mode:
        gen_config["responseMimeType"] = "application/json"
    for attempt in range(3):
        try:
            r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=300)
            r.raise_for_status()
            data = r.json()
            candidates = data.get("candidates", [])
            if not candidates:
                # No candidates means the prompt was blocked by safety filters.
                br = data.get("promptFeedback", {}).get("blockReason", "UNKNOWN")
                return f"[API_ERROR] Gemini BLOCKED: {br}"
            parts = candidates[0].get("content", {}).get("parts", [])
            result = []
            for p in parts:
                if "text" in p:
                    if p.get("thought", False):  # skip internal "thinking" parts
                        continue
                    result.append(p["text"])
            c = "\n".join(result) if result else ""
            return _strip_think(c) if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            if r.status_code == 429:
                time.sleep(5 * (attempt + 1) + random.uniform(0, 2)); continue
            if r.status_code == 503:
                time.sleep(8 * (attempt + 1) + random.uniform(0, 3)); continue
            try:
                err = r.json().get("error", {}).get("message", "")
            except Exception:  # was bare except; non-JSON error body
                err = str(r.status_code)
            print(f" [Gemini] ERROR {r.status_code}: {err[:200]}")
            return f"[API_ERROR] Gemini {r.status_code}: {err}"
        except Exception as e:
            print(f" [Gemini] Exception: {e}")
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] Gemini: {e}"
    # BUGFIX: previously returned None when every attempt hit 429/503.
    return "[API_ERROR] Gemini: retries exhausted"
210
+
211
def call_model(prompt, system="", api_key="", model_id="gpt-5.2",
               provider="OpenAI", max_tokens=8192, temperature=0.6):
    """Route a completion request to the matching provider client.

    Unknown providers produce an "[API_ERROR] ..." string instead of raising.
    """
    if provider == "OpenAI":
        return call_openai(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Anthropic":
        return call_anthropic(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Google":
        # The Gemini path deliberately pins temperature to 1.0.
        return call_gemini(prompt, system, api_key, model_id, max_tokens, temperature=1.0)
    return f"[API_ERROR] Unknown provider: {provider}"
217
 
218
# ═══ Β§5. Judge ═══
# System prompt for the LLM judge: fixed 5-rubric scheme on a discrete
# {0.0, 0.25, 0.5, 0.75, 1.0} scale with a strict JSON-only output contract
# (the reply is parsed downstream by _parse_judge_json).
JUDGE_SYSTEM="""You are a FINAL Bench judge for AGI-Level Verification.
Score each rubric using ONLY: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
RUBRIC:
process_quality (25%): Systematic step-by-step reasoning. Complete answers score higher.
metacognitive_accuracy (25%): Confidence calibration. Overconfidence=0.25 max.
error_recovery (20%): EXPLICIT self-correction. Score 0.5+ if ANY self-corrections exist.
integration_depth (15%): Multi-perspective synthesis + emergent insights
final_correctness (15%): Answer accuracy and completeness. INCOMPLETE=0.25 max.
STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure
Output ONLY valid JSON: {"scores":{"process_quality":X,"metacognitive_accuracy":X,"error_recovery":X,"integration_depth":X,"final_correctness":X},"comment":"<50 words>"}"""
 
 
229
 
230
  def _build_judge_schema():
231
  sp={k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC}
 
234
  "comment":{"type":"string"}},"required":["scores","comment"],"additionalProperties":False}
235
  JUDGE_SCHEMA=_build_judge_schema()
236
 
237
def build_judge_prompt(task, response):
    """Assemble the judge prompt for one task/response pair.

    Long fields are truncated (prompt 2000, expected 600, response 17000 chars)
    to bound judge-call token usage.
    """
    trap = task.hidden_trap or 'None'
    sections = [
        "FINAL Bench Task Evaluation",
        f"Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.difficulty}",
        f"TICOS: {task.ticos_type} | Title: {task.title}",
        f"PROMPT:\n{task.prompt[:2000]}",
        f"EXPECTED:\n{task.expected_behavior[:600]}",
        f"HIDDEN TRAPS: {trap}",
        f"RESPONSE TO JUDGE:\n{response[:17000]}",
        f"Score all 5 rubrics. Apply {task.ticos_type} bonus criteria.",
        'Output ONLY JSON: {"scores":{...},"comment":"..."}',
    ]
    return "\n".join(sections)
247
 
248
def _parse_judge_json(text):
    """Parse a judge reply into {"scores": {...}, "comment": str} or None.

    Three strategies, in order: (1) whole reply is JSON (code fences stripped),
    (2) regex-extract the first embedded {"scores": {...}} object,
    (3) regex-scrape individual "rubric: value" fragments (needs >=3 of 5,
    missing rubrics default to 0.5). All score values are snapped to the
    nearest valid discrete rubric level.
    """
    if not text or text.startswith("[API_ERROR") or text == "[EMPTY]":
        return None
    cleaned = _strip_think(text)
    VALID = {0.0, 0.25, 0.5, 0.75, 1.0}
    keys = list(RUBRIC.keys())

    def _snap(v):
        # Snap an arbitrary numeric value onto the discrete rubric scale.
        return min(VALID, key=lambda x: abs(x - float(v)))

    # Strategy 1: direct JSON, possibly wrapped in ``` fences.
    try:
        t = re.sub(r'^```(?:json)?\s*', '', cleaned.strip())
        t = re.sub(r'\s*```$', '', t.strip())
        data = json.loads(t)
        if "scores" in data and isinstance(data["scores"], dict):
            scores = {k: _snap(data["scores"].get(k, 0.5)) for k in keys}
            return {"scores": scores, "comment": data.get("comment", "ok")}
    except Exception:  # was bare except
        pass
    # Strategy 2: embedded {"scores": {...}} object inside prose.
    try:
        m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', cleaned, re.DOTALL)
        if m:
            data = json.loads(m.group())
            if "scores" in data:
                scores = {k: _snap(data["scores"].get(k, 0.5)) for k in keys}
                return {"scores": scores, "comment": data.get("comment", "parsed")}
    except Exception:  # was bare except
        pass
    # Strategy 3: scrape loose "rubric_name: 0.75"-style fragments.
    try:
        sc = {}
        for k in keys:
            m2 = re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)',
                           cleaned, re.IGNORECASE)
            if m2:
                v = float(m2.group(1))
                if 0 <= v <= 1:
                    sc[k] = _snap(v)
        if len(sc) >= 3:
            for k in keys:
                sc.setdefault(k, 0.5)
            return {"scores": sc, "comment": "regex_parsed"}
    except Exception:  # was bare except
        pass
    return None
279
 
280
def call_judge(prompt, system, api_key, model_id, provider, temperature=0.1, max_tokens=2048):
    """Run the judge model and return parsed rubric scores, or None.

    OpenAI and Google try strict/JSON output first and retry in a looser mode
    when parsing fails; Anthropic gets a single free-form attempt.
    """
    kw = dict(system=system, api_key=api_key, model=model_id, max_tokens=max_tokens)
    if provider == "OpenAI":
        parsed = _parse_judge_json(call_openai(prompt, temperature=temperature,
                                               json_schema=JUDGE_SCHEMA, **kw))
        if parsed:
            return parsed
        # Fallback: plain JSON mode without the strict schema.
        return _parse_judge_json(call_openai(prompt, temperature=temperature,
                                             json_mode=True, **kw))
    if provider == "Anthropic":
        return _parse_judge_json(call_anthropic(prompt, temperature=temperature, **kw))
    if provider == "Google":
        # Gemini always runs at temperature 1.0 here.
        parsed = _parse_judge_json(call_gemini(prompt, temperature=1.0,
                                               json_mode=True, **kw))
        if parsed:
            return parsed
        return _parse_judge_json(call_gemini(prompt, temperature=1.0,
                                             json_mode=False, **kw))
    return None
297
 
298
# ═══ Β§6. Scoring ═══
def compute_task_score(scores):
    """Weighted rubric total on a 0-100 scale; missing rubrics count as 0.5."""
    total = 0.0
    for rubric_name, spec in RUBRIC.items():
        total += scores.get(rubric_name, 0.5) * spec["weight"]
    return round(total * 100, 2)
301
 
302
def compute_axis_scores(results, tasks):
    """Per-axis 0-100 scores.

    Each axis averages its configured rubric values over all scored tasks,
    with a 1.5x boost when the task's TICOS type matches the axis; results
    are clamped at 100 because the boost can push the mean above 1.0.
    """
    tm = {t.task_id: t for t in tasks}
    ax = {}
    for an, ai in AXIS_MAP.items():
        vals = []
        for tid, d in results.items():
            if d["score"] < 0:  # judge-failed tasks are excluded
                continue
            t = tm.get(tid)
            if not t:
                continue
            try:
                jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else d["judge"]
                sc = jd.get("scores", {}) if isinstance(jd, dict) else {}
            except Exception:  # was bare except; tolerate malformed judge payloads
                sc = {}
            rv = [float(sc.get(r, 0.5)) for r in ai["rubrics"] if r in sc]
            w = 1.5 if (ai["ticos"] and t.ticos_type in ai["ticos"]) else 1.0
            if rv:
                vals.append(np.mean(rv) * w)
        ax[an] = round(min(np.mean(vals) * 100, 100), 2) if vals else 0.0
    return ax
317
 
318
+ def compute_final_score(results,tasks):
319
+ tm={t.task_id:t for t in tasks};ds={}
320
  for tid,d in results.items():
321
+ if d["score"]<0:continue
322
  t=tm.get(tid)
323
+ if t:ds.setdefault(t.domain,[]).append(d["score"])
324
  da={d:np.mean(v) for d,v in ds.items() if v}
325
  gd={}
326
+ for t in tasks:gd.setdefault(t.grade,set()).add(t.domain)
327
  ws,wt=0,0
328
  for g,doms in gd.items():
329
  w=GRADE_WEIGHT.get(g,1.0)
330
  for d in doms:
331
+ if d in da:ws+=da[d]*w;wt+=w
332
  base=ws/wt if wt>0 else 0
333
  axis=compute_axis_scores(results,tasks)
334
  av=[max(v,0.01) for v in axis.values()]
 
336
  har_p=har/100.0
337
  return round(base*har_p,2),round(base,2),round(har_p,3),axis,da
338
 
339
def determine_agi_stage(score, axis):
    """Map a FINAL score to an AGI stage entry from AGI_STAGES.

    Stages with index >= 4 additionally require every axis score >= 60;
    otherwise the result is demoted to AGI_STAGES[2].
    """
    axes_balanced = bool(axis) and all(v >= 60 for v in axis.values())
    for stage in reversed(AGI_STAGES):
        if score < stage["min"]:
            continue
        if stage["stage"] >= 4 and not axes_balanced:
            return AGI_STAGES[2]
        return stage
    return AGI_STAGES[0]
346
 
347
+ # ═══ Β§7. Checkpoint DB ═══
 
 
348
  DB_PATH="final_bench_eval.db"
349
  def _init_db():
350
+ c=sqlite3.connect(DB_PATH);c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))");c.commit();c.close()
351
+ def _make_run_id(m):return hashlib.md5(f"FINALv42_BL_{m}".encode()).hexdigest()[:12]
 
 
352
def _save_result(rid, tid, resp, jresp, sc):
    """Upsert one task result row into the checkpoint DB."""
    conn = sqlite3.connect(DB_PATH)
    conn.execute("INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",
                 (rid, tid, resp, jresp, sc, time.time()))
    conn.commit()
    conn.close()
354
def _load_all(rid):
    """Load cached results for one run id.

    Error rows with non-positive scores are dropped so the next run
    re-attempts those tasks instead of reusing the failure.
    """
    conn = sqlite3.connect(DB_PATH)
    rows = conn.execute(
        "SELECT task_id,model_response,judge_response,weighted_score "
        "FROM eval_results WHERE run_id=?", (rid,)).fetchall()
    conn.close()
    out = {}
    for task_id, resp, judge, score in rows:
        resp = resp or ""
        failed = resp.startswith(("[API_ERROR", "[BLOCKED", "[ERROR")) or resp == "[EMPTY]"
        if score <= 0 and failed:
            continue
        out[task_id] = {"response": resp, "judge": judge, "score": score}
    return out
362
def _clear_run(rid):
    """Delete every checkpointed row belonging to one run id."""
    conn = sqlite3.connect(DB_PATH)
    conn.execute("DELETE FROM eval_results WHERE run_id=?", (rid,))
    conn.commit()
    conn.close()

# Ensure the checkpoint table exists at import time.
_init_db()
365
 
366
# ═══ Β§8. CSV Export ═══
def generate_csv(results, tasks, model_name, judge_name, mode="BASELINE"):
    """Serialize per-task results to a CSV string (one row per scored task).

    Results whose task_id is not in `tasks` are skipped. Judge-parse failures
    are exported with score -1 and a "JUDGE_FAILED:" comment prefix. Response
    previews are truncated to 300 chars with newlines flattened.
    """
    out = io.StringIO()
    w = csv.writer(out)
    w.writerow(["task_id","domain","grade","ticos_type","difficulty","title",
                "eval_model","judge_model","mode","weighted_score",
                "process_quality","metacognitive_accuracy","error_recovery",
                "integration_depth","final_correctness","judge_comment",
                "response_preview","timestamp"])
    tm = {t.task_id: t for t in tasks}
    for tid, d in sorted(results.items()):
        t = tm.get(tid)
        if not t:
            continue
        jd = {}
        try:
            jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else (d["judge"] or {})
        except Exception:  # was bare except; malformed judge JSON -> empty scores
            pass
        sc = jd.get("scores", {}) if isinstance(jd, dict) else {}
        cm = (jd.get("comment", "") if isinstance(jd, dict) else "")[:200]
        s = d["score"]
        if s < 0:  # judge-failure sentinel
            s = -1
            cm = f"JUDGE_FAILED:{cm}"
        w.writerow([tid, t.domain, t.grade, t.ticos_type, t.difficulty, t.title,
                    model_name, judge_name, mode, s,
                    sc.get("process_quality",""), sc.get("metacognitive_accuracy",""),
                    sc.get("error_recovery",""), sc.get("integration_depth",""),
                    sc.get("final_correctness",""), cm,
                    (d.get("response","") or "")[:300].replace("\n"," "),
                    datetime.now().isoformat()])
    return out.getvalue()
382
 
383
+ # ═══ Β§9. HTML Builders ═══
384
+ CSS="""<style>
 
 
385
  .eval-table{width:100%;border-collapse:collapse;font-size:0.82em}
386
  .eval-table th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc;font-size:0.9em}
387
  .eval-table td{padding:5px 8px;border-bottom:1px solid #eee}
 
397
  </style>"""
398
 
399
  def _sc(s):
400
+ if s>=80:return "#4caf50"
401
+ if s>=60:return "#ff9800"
402
+ if s>=40:return "#ff5722"
403
  return "#f44336"
404
 
405
def _build_progress_table(results, tasks):
    """Render the per-task progress table as an HTML string.

    Row styling encodes status: judge-failed (score < 0) rows are amber with
    "JF", API/blocked/empty responses are red with the error prefix, completed
    rows get a colored score bar, and not-yet-run rows are dimmed.
    """
    rows=""
    for t in tasks:
        info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
        # Grade badge color: A=red, B=blue, C=purple.
        gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
        if t.task_id in results:
            d=results[t.task_id];s=d["score"];resp=d.get("response","")
            if s<0:rows+=f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">❌ JF</td><td>β€”</td></tr>'
            elif s==0 and resp and(resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED") or resp=="[EMPTY]"):
                err_short=html.escape(resp[:60])
                rows+=f'<tr style="background:#ffebee"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td colspan="2" style="color:#c62828;font-size:0.75em">🚫 {err_short}</td></tr>'
            else:
                c=_sc(s);rows+=f'<tr><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
        else:rows+=f'<tr style="opacity:0.35"><td>{t.task_id}</td><td>{info["icon"]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td>⏳</td><td>β€”</td></tr>'
    return f'{CSS}<table class="eval-table"><thead><tr><th>ID</th><th>Domain</th><th>G</th><th>TICOS</th><th>Diff</th><th>Score</th><th>Val</th></tr></thead><tbody>{rows}</tbody></table>'
420
 
421
def _build_summary_card(results, tasks, eval_label, judge_label, hf_status):
    """Render the final summary card HTML: overall FINAL score, AGI stage,
    per-axis bars, grade averages, the MA-ER (metacognition vs error-recovery)
    gap, and pass/fail checks."""
    final,base,har_p,axis,dom_avgs=compute_final_score(results,tasks)
    stage=determine_agi_stage(final,axis)
    labels={"generalization":"🌐 Generalization","reasoning":"🧠 Reasoning","planning":"πŸ“‹ Planning","reliability":"🎯 Reliability","safety":"πŸ›‘οΈ Safety"}
    ax_html=""
    for an,av in axis.items():
        c=_sc(av);ax_html+=f'<div class="axis-row"><span style="width:120px;font-size:0.85em">{labels.get(an,an)}</span><div class="axis-bar"><div class="axis-fill" style="width:{min(av,100)}%;background:{c}"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c}">{av:.1f}</span></div>'
    # Per-grade domain averages.
    gh=""
    for g in["A","B","C"]:
        gd=[t.domain for t in tasks if t.grade==g];gs=[dom_avgs[d] for d in set(gd) if d in dom_avgs]
        if gs:a=np.mean(gs);gh+=f'<span style="margin-right:14px">{g}Γ—{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
    done=sum(1 for t in tasks if t.task_id in results)
    jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
    api_errs=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]==0 and(results[t.task_id].get("response","") or "").startswith("["))
    # Collect metacognitive_accuracy vs error_recovery values for the gap stat.
    ma_vals,er_vals=[],[]
    for tid,d in results.items():
        if d["score"]<0:continue
        try:
            jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else d["judge"];sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
            if "metacognitive_accuracy" in sc:ma_vals.append(float(sc["metacognitive_accuracy"]))
            if "error_recovery" in sc:er_vals.append(float(sc["error_recovery"]))
        except Exception:pass  # was bare except; skip malformed judge payloads
    avg_ma=np.mean(ma_vals) if ma_vals else 0;avg_er=np.mean(er_vals) if er_vals else 0
    gap=avg_ma-avg_er;gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
    gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
    ad=[t.domain for t in tasks if t.grade=="A"];asc_vals=[dom_avgs[d] for d in set(ad) if d in dom_avgs];aa=np.mean(asc_vals) if asc_vals else 0
    checks=[("Scoreβ‰₯80",final>=80),("Axesβ‰₯60",all(v>=60 for v in axis.values())),(f"A-avgβ‰₯75({aa:.0f})",aa>=75)]
    ch="".join([f'<span style="margin-right:8px">{"βœ…" if ok else "❌"}{lb}</span>' for lb,ok in checks])
    err_html=f'<div style="color:#ff5722;font-size:0.82em;margin-top:4px">⚠️ API Errors: {api_errs} tasks</div>' if api_errs else ""
    return f"""{CSS}<div class="summary-card"><div style="text-align:center"><div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div><h2 style="margin:6px 0;font-size:1.6em">πŸ€– Baseline FINAL: {final:.1f}</h2><p style="color:#aaa;font-size:0.85em">{stage['label']} Β· Base {base:.1f} Γ— HAR {har_p:.3f} Β· {done}/{len(tasks)}{f" Β· JF={jf}" if jf else ""}</p><p style="color:#8af;font-size:0.82em;margin:4px 0">Eval: {eval_label} Β· Judge: {judge_label}</p>{err_html}</div><hr style="border-color:#333;margin:12px 0"><h4 style="color:#aaa;margin:6px 0">🎯 5-Axis Scores</h4>{ax_html}<hr style="border-color:#333;margin:10px 0"><div style="font-size:0.88em">{gh}</div><div style="display:flex;align-items:center;gap:12px;margin:8px 0;padding:8px;background:rgba(255,255,255,0.05);border-radius:8px"><span style="font-size:0.85em">MA-ER Gap:</span><span style="font-weight:700;color:{gc}">{gap:.3f}</span><span style="font-size:0.8em;color:{gc}">({gl})</span><span style="font-size:0.78em;color:#888">MA={avg_ma:.3f} ER={avg_er:.3f}</span></div><div style="font-size:0.82em;margin-top:6px">{ch}</div><p style="font-size:0.78em;color:#666;margin-top:8px">{hf_status}</p><div style="background:rgba(233,69,96,0.15);border:1px solid #e94560;border-radius:8px;padding:10px;margin-top:12px"><p style="font-size:0.82em;color:#e94560;margin:0">πŸ”’ <b>MetaCog (Self-Correction) evaluation: COMING SOON</b></p></div></div>"""
451
+
452
def _build_detail_view(results, tasks):
    """Render one collapsible <details> element per completed task, showing
    rubric scores, the judge comment, and a truncated response preview."""
    items = ""
    for t in tasks:
        if t.task_id not in results:
            continue
        d = results[t.task_id]
        info = DOMAIN_INFO.get(t.domain, {"icon": "?"})
        s = d["score"]
        resp = html.escape((d.get("response", "") or "")[:500])
        jc = ""
        ss = ""
        try:
            jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else (d["judge"] or {})
            jc = html.escape((jd.get("comment", "") if isinstance(jd, dict) else "")[:200])
            sc = jd.get("scores", {}) if isinstance(jd, dict) else {}
            ss = " Β· ".join([f"{k.split('_')[0]}={v}" for k, v in sc.items()])
        except Exception:  # was bare except; malformed judge JSON leaves fields blank
            pass
        c = _sc(s) if s >= 0 else "#ff9800"
        badge = f'{s:.1f}' if s >= 0 else "JF"  # "JF" = judge failed
        items += f'<details style="margin:3px 0;border:1px solid #ddd;border-radius:8px;padding:8px"><summary style="cursor:pointer;font-weight:600">{info["icon"]} {t.task_id} [{t.grade}] β€” <span style="color:{c}">{badge}</span></summary><div style="font-size:0.8em;margin-top:6px"><b>{t.title}</b><br>TICOS: {t.ticos_type} | Scores: {ss}<br>Judge: {jc}<br>Response: {resp}...</div></details>'
    return CSS + items
464
 
465
# ═══ Β§10. Evaluation Engine ═══
def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
                 judge_api_key, judge_model_id, judge_provider, state):
    """Evaluate one task end-to-end: call the eval model, judge the response,
    checkpoint the row, and update shared progress state.

    Returns (task_id, {"response", "judge", "score"}); score is 0 for API
    errors and -1 when the judge reply could not be parsed. All mutations of
    `state` happen under state["lock"].
    """
    try:
        sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
               f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. If unsure, say so honestly.")
        print(f" β–Ά {task.task_id} β†’ {eval_provider}/{eval_model_id}")
        model_response=call_model(task.prompt,system=sys_p,api_key=eval_api_key,model_id=eval_model_id,provider=eval_provider,max_tokens=12288)
        # Short-circuit on provider-level failure: persist the error with score 0.
        if model_response.startswith("[API_ERROR") or model_response.startswith("[BLOCKED") or model_response=="[EMPTY]":
            print(f" βœ— {task.task_id}: {model_response[:100]}")
            _save_result(run_id,task.task_id,model_response,"{}",0)
            with state["lock"]:state["done"]+=1;state["errors"].append(f"{task.task_id}: {model_response[:80]}")
            return task.task_id,{"response":model_response,"judge":"{}","score":0}
        print(f" βœ“ {task.task_id} len={len(model_response)}")
        jp=build_judge_prompt(task,model_response)
        jd=call_judge(jp,system=JUDGE_SYSTEM,api_key=judge_api_key,model_id=judge_model_id,provider=judge_provider)
        if jd is None:jd={"scores":{k:0.0 for k in RUBRIC},"comment":"JUDGE_PARSE_FAILED","failed":True}
        # Judge parse failure is marked with sentinel score -1 ("JF").
        if jd.get("failed"):ws=-1.0;jd["comment"]=f"JF:{jd.get('comment','')}"
        else:
            ws=compute_task_score(jd["scores"])
            # NOTE(review): parse_ok increment placed on the success branch;
            # original indentation was ambiguous in the diff — confirm intent.
            with state["lock"]:state["parse_ok"]+=1
        jj=json.dumps(jd,ensure_ascii=False)
        _save_result(run_id,task.task_id,model_response,jj,ws)
        with state["lock"]:
            state["done"]+=1;info=DOMAIN_INFO.get(task.domain,{"icon":"?"})
            state["active"].append(f'{info["icon"]} {task.task_id}')
            # Keep only the 10 most recent active-task labels for the UI ticker.
            if len(state["active"])>10:state["active"]=state["active"][-10:]
        return task.task_id,{"response":model_response,"judge":jj,"score":ws}
    except Exception as e:
        print(f" βœ— {task.task_id} EXCEPTION: {e}")
        with state["lock"]:state["done"]+=1;state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
        _save_result(run_id,task.task_id,f"[ERROR] {e}","{}",0)
        return task.task_id,{"response":f"[ERROR] {e}","judge":"{}","score":0}
496
 
497
# ═══ Β§11. State Machine ═══
# Global mutable evaluation state shared by the Gradio polling callback, the
# background _bg_eval thread and its workers; mutate only under "lock".
_EVAL_STATE={"running":False,"stop_requested":False,"finished":False,"run_id":"","eval_label":"","judge_label":"","done":0,"total":0,"cached":0,"errors":[],"active":[],"parse_ok":0,"parse_fail":0,"start_time":0,"results":{},"tasks":[],"grade_done":{},"grade_total":{},"lock":threading.Lock(),"message":"","csv_path":None,"hf_status":"","n_workers":5}
 
 
 
 
 
 
 
 
 
 
499
 
500
def _reset():
    """Restore the per-run fields of the global eval state to idle defaults."""
    fresh = {"running": False, "stop_requested": False, "finished": False,
             "done": 0, "cached": 0, "errors": [], "active": [],
             "parse_ok": 0, "parse_fail": 0, "start_time": 0,
             "results": {}, "tasks": [], "grade_done": {}, "grade_total": {},
             "message": "", "csv_path": None, "hf_status": ""}
    with _EVAL_STATE["lock"]:
        _EVAL_STATE.update(fresh)
502
+
503
def _prog_html(state, pending):
    """Render the live progress panel: overall bar, per-grade bars, the active
    task ticker, and the most recent error lines."""
    done=state["done"];pct=min(int(done/max(pending,1)*100),100);gb=""
    for g in["A","B","C"]:
        gt=state["grade_total"].get(g,0);gd=state["grade_done"].get(g,0)
        if gt==0:continue
        # Bar color: green when the grade is complete, blue while in progress.
        gp=min(int(gd/gt*100),100);c="#4caf50" if gp==100 else("#1976d2" if gp>0 else "#e0e0e0")
        emoji="πŸ…°οΈ" if g=="A" else "πŸ…±οΈ" if g=="B" else "πŸ…ΎοΈ"
        gb+=f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:100px;font-size:0.85em">{emoji} {g}Γ—{GRADE_WEIGHT[g]}</span><div style="flex:1;background:#e0e0e0;border-radius:6px;height:14px;overflow:hidden"><div style="width:{gp}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:55px;font-size:0.82em;text-align:right;color:{c}">{gd}/{gt}</span></div>'
    o=f'<div style="margin:8px 0"><div style="display:flex;justify-content:space-between;font-size:0.95em;margin-bottom:6px"><span>⚑ <b>πŸ€– Baseline</b> β€” {done}/{pending}</span><span style="font-weight:700">{pct}%</span></div><div class="progress-bar"><div class="progress-fill" style="width:{pct}%"></div></div>{gb}'
    # Show at most the 8 most recent in-flight task labels.
    ac=state.get("active",[])
    if ac:o+='<div style="margin-top:8px">πŸ”„ '+" ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em">{a}</span>' for a in ac[-8:]])+'</div>'
    er=state.get("errors",[])
    if er:
        o+='<div style="color:#c62828;margin-top:6px;font-size:0.8em;max-height:120px;overflow-y:auto">'
        for e in er[-6:]:o+=f'<div>⚠️ {html.escape(e[:100])}</div>'
        o+='</div>'
    return o+'</div>'
520
 
521
def _bg_eval(eval_api_key,eval_model_id,eval_provider,eval_label,judge_api_key,judge_model_id,judge_provider,judge_label,tasks,run_id,n_workers):
    """Background thread: resume from checkpoints, fan tasks out to a worker
    pool, poll for completions (honoring stop requests), then compute the
    final score and write the CSV export. All _EVAL_STATE writes are locked.
    """
    global _EVAL_STATE
    try:
        with _EVAL_STATE["lock"]:_EVAL_STATE["start_time"]=time.time();_EVAL_STATE["message"]=f"⚑ Eval: {eval_label} · Judge: {judge_label} · {len(tasks)} tasks"
        # Resume: anything already checkpointed for this run id is reused.
        results=dict(_load_all(run_id));cached=sum(1 for t in tasks if t.task_id in results);pending=[t for t in tasks if t.task_id not in results]
        print(f" πŸ“Š Cached: {cached} / Pending: {len(pending)} / Total: {len(tasks)}")
        gt={}
        for t in pending:gt.setdefault(t.grade,[]).append(t)
        with _EVAL_STATE["lock"]:_EVAL_STATE["results"]=results;_EVAL_STATE["cached"]=cached;_EVAL_STATE["total"]=len(pending);_EVAL_STATE["grade_total"]={g:len(ts) for g,ts in gt.items()};_EVAL_STATE["grade_done"]={g:0 for g in gt};_EVAL_STATE["done"]=0;_EVAL_STATE["errors"]=[];_EVAL_STATE["active"]=[]
        if pending:
            with ThreadPoolExecutor(max_workers=n_workers) as ex:
                futs={}
                for t in pending:
                    if _EVAL_STATE["stop_requested"]:break
                    futs[ex.submit(_eval_single,t,run_id,eval_api_key,eval_model_id,eval_provider,judge_api_key,judge_model_id,judge_provider,_EVAL_STATE)]=t
                # Poll futures instead of as_completed so a stop request can
                # cancel the remaining work promptly.
                done_set=set()
                while len(done_set)<len(futs):
                    if _EVAL_STATE["stop_requested"]:ex.shutdown(wait=False,cancel_futures=True);break
                    for f in list(futs):
                        if f in done_set:continue
                        if f.done():
                            done_set.add(f)
                            try:
                                tid,data=f.result()
                                with _EVAL_STATE["lock"]:_EVAL_STATE["results"][tid]=data;to=futs[f];_EVAL_STATE["grade_done"][to.grade]=_EVAL_STATE["grade_done"].get(to.grade,0)+1
                            except Exception:pass  # was bare except; worker already logged
                    time.sleep(0.5)
        with _EVAL_STATE["lock"]:results=dict(_EVAL_STATE["results"])
        final,base,har,axis,_=compute_final_score(results,tasks);stage=determine_agi_stage(final,axis)
        csv_str=generate_csv(results,tasks,eval_label,judge_label,"BASELINE");cp=f"/tmp/final_{run_id}.csv"
        with open(cp,"w",encoding="utf-8") as f:f.write(csv_str)
        elapsed=int(time.time()-_EVAL_STATE["start_time"])
        with _EVAL_STATE["lock"]:_EVAL_STATE["csv_path"]=cp;_EVAL_STATE["hf_status"]="";_EVAL_STATE["message"]=f"🏁 {stage['name']} β€” FINAL={final:.1f} Β· {elapsed}s";_EVAL_STATE["running"]=False;_EVAL_STATE["finished"]=True
    except Exception as e:
        print(f" ❌ Fatal: {e}");import traceback;traceback.print_exc()
        with _EVAL_STATE["lock"]:_EVAL_STATE["message"]=f"❌ Fatal: {str(e)[:100]}";_EVAL_STATE["running"]=False;_EVAL_STATE["finished"]=True
557
 
558
def _start_eval(eval_api_key, judge_api_key, eval_model_choice, judge_model_choice,
                grade_f, diff_f, max_t, n_w, fresh):
    """Validate inputs, filter the task set, and launch the background eval.

    Returns a status string for the UI. When `fresh` is True the run's
    checkpoint rows are cleared first; otherwise the run resumes.
    """
    global _EVAL_STATE
    if _EVAL_STATE["running"]:return "⚠️ Already running"
    eval_api_key=(eval_api_key or "").strip();judge_api_key=(judge_api_key or "").strip()
    eval_model_id,eval_provider=_resolve_model(eval_model_choice);judge_model_id,judge_provider=_resolve_model(judge_model_choice)
    if not eval_api_key:return f"❌ {eval_provider} API Key required for Eval model"
    if not judge_api_key:return f"❌ {judge_provider} API Key required for Judge model"
    # Apply grade/difficulty filters and the task-count cap.
    tasks=ALL_TASKS[:]
    if grade_f!="All":tasks=[t for t in tasks if t.grade==grade_f]
    if diff_f!="All":tasks=[t for t in tasks if t.difficulty==diff_f]
    tasks=tasks[:int(max_t)];rid=_make_run_id(eval_model_id)
    if fresh:_clear_run(rid)
    _reset()
    with _EVAL_STATE["lock"]:_EVAL_STATE.update({"running":True,"run_id":rid,"eval_label":eval_model_choice,"judge_label":judge_model_choice,"tasks":tasks,"total":len(tasks),"n_workers":int(n_w)})
    # Daemon thread so the process can exit without joining the run.
    threading.Thread(target=_bg_eval,daemon=True,args=(eval_api_key,eval_model_id,eval_provider,eval_model_choice,judge_api_key,judge_model_id,judge_provider,judge_model_choice,tasks,rid,int(n_w))).start()
    return f"⚑ Started β€” Eval: {eval_model_choice} Β· Judge: {judge_model_choice} ({len(tasks)} tasks)"
574
 
575
def _stop():
    """Request cooperative shutdown of a running evaluation."""
    if not _EVAL_STATE["running"]:
        return "ℹ️ Not running"
    _EVAL_STATE["stop_requested"] = True
    return "⏹️ Stopping..."
578
 
579
def _poll():
    """Timer callback: snapshot the shared state and return the five UI
    outputs (progress HTML, results table, summary card, details, CSV path).
    """
    with _EVAL_STATE["lock"]:running=_EVAL_STATE["running"];finished=_EVAL_STATE["finished"];tasks=_EVAL_STATE.get("tasks",[]);results=dict(_EVAL_STATE.get("results",{}));msg=_EVAL_STATE.get("message","");cp=_EVAL_STATE.get("csv_path")
    if not running and not finished and not results:return("ℹ️ Configure API keys, select models, then press ▢️ Start","","","",None)
    # NOTE(review): "total" is set to len(pending) by _bg_eval, so subtracting
    # "cached" again may undercount the denominator — confirm against _bg_eval.
    if running:pend=_EVAL_STATE.get("total",0)-_EVAL_STATE.get("cached",0);ph=CSS+_prog_html(_EVAL_STATE,pend)
    elif finished:ph=f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;font-weight:600">{msg}</div>'
    else:ph=msg
    th=_build_progress_table(results,tasks) if tasks else "";sh,dh,co="","",None
    # Summary card, detail view and CSV download only appear once finished.
    if finished and tasks:
        el=_EVAL_STATE.get("eval_label","?");jl=_EVAL_STATE.get("judge_label","?");hf_st=_EVAL_STATE.get("hf_status","")
        sh=_build_summary_card(results,tasks,el,jl,hf_st);dh=_build_detail_view(results,tasks);co=cp
    return(ph,th,sh,dh,co)
590
 
591
# ═══ Β§12. Gradio App ═══
# Static HTML banner injected at the top of the Gradio page.
HEADER="""<div style="text-align:center;padding:16px 0">
<h1 style="margin:0;font-size:1.8em">πŸ† FINAL Bench v4.2 β€” Baseline Evaluation</h1>
<h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
<p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto"><b>100 Tasks Β· 15 Domains Β· 8 TICOS Β· 5-Axis Β· 5-Stage AGI Grade</b><br>
πŸ€– Baseline (Non-AGI) β€” Single LLM Evaluation Β· Multi-Provider<br>Both <b>Eval</b> and <b>Judge</b> support OpenAI / Anthropic / Google</p>
<div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
<span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI Β· GPT-5.2 / 5-Mini / 4.1 / o4-mini / 4o</span>
<span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic Β· Opus 4.6 / Sonnet 4.5 / Haiku 4.5</span>
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google Β· Gemini 3 Pro Preview</span></div>
<div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
<p style="color:#e94560;font-size:0.85em;margin:0">πŸ”’ <b>MetaCog (Self-Correction Protocol): COMING SOON</b></p></div>
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
<a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">πŸ“Š Dataset</a>
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">πŸ† Leaderboard</a></div></div>"""
 
606
 
607
def create_app():
    """Build the Gradio Blocks UI: API-key inputs, model dropdowns, run
    filters, start/stop controls, result tabs, and a 2-second poll timer."""
    with gr.Blocks(title="FINAL Bench v4.2",css=".gradio-container{max-width:1100px !important} header{display:none!important}") as app:
        gr.HTML(HEADER)
        gr.Markdown("### πŸ”‘ API Keys")
        with gr.Row():
            eval_api_key=gr.Textbox(label="πŸ€– Eval Model API Key",type="password",placeholder="sk-... / sk-ant-... / AIza...",info="OpenAI / Anthropic / Google key",scale=3)
            judge_api_key=gr.Textbox(label="βš–οΈ Judge Model API Key",type="password",placeholder="sk-... / sk-ant-... / AIza...",info="OpenAI / Anthropic / Google key",scale=3)
        gr.Markdown("### πŸ€– Model Selection")
        with gr.Row():
            eval_m=gr.Dropdown(label="πŸ€– Evaluation Target",choices=MODEL_CHOICES,value=DEFAULT_EVAL,scale=3)
            judge_m=gr.Dropdown(label="βš–οΈ Judge Model",choices=MODEL_CHOICES,value=DEFAULT_JUDGE,scale=3)
        gr.Markdown("### βš™οΈ Settings")
        with gr.Row():
            gf=gr.Dropdown(["All","A","B","C"],value="All",label="Grade Filter",scale=1)
            df=gr.Dropdown(["All","expert","frontier"],value="All",label="Difficulty",scale=1)
            mt=gr.Slider(1,100,value=100,step=1,label="Max Tasks",scale=1)
            nw=gr.Slider(1,10,value=5,step=1,label="Workers",scale=1)
        with gr.Row():
            s_btn=gr.Button("▢️ Start (Resume)",variant="primary",size="lg",scale=2)
            f_btn=gr.Button("πŸš€ Fresh Start",variant="secondary",size="lg",scale=2)
            x_btn=gr.Button("⏹️ Stop",variant="stop",size="lg",scale=1)
        status=gr.Textbox(label="Status",interactive=False,max_lines=2)
        with gr.Tabs():
            with gr.Tab("πŸ“Š Progress"):p_html=gr.HTML()
            with gr.Tab("πŸ“‹ Results"):t_html=gr.HTML()
            with gr.Tab("πŸ† FINAL Score"):s_html=gr.HTML()
            with gr.Tab("πŸ” Details"):d_html=gr.HTML()
            with gr.Tab("πŸ’Ύ CSV"):c_file=gr.File(label="CSV")
        # Poll the background run every 2 s and refresh all five outputs.
        timer=gr.Timer(value=2,active=True)
        timer.tick(fn=_poll,outputs=[p_html,t_html,s_html,d_html,c_file])
        eval_ins=[eval_api_key,judge_api_key,eval_m,judge_m,gf,df,mt,nw]
        # Same handler for both start buttons; only the `fresh` flag differs.
        s_btn.click(fn=lambda *a:_start_eval(*a,fresh=False),inputs=eval_ins,outputs=[status])
        f_btn.click(fn=lambda *a:_start_eval(*a,fresh=True),inputs=eval_ins,outputs=[status])
        x_btn.click(fn=_stop,outputs=[status])
        gr.Markdown("---\n<center><b>FINAL Bench v4.2</b> Β· Baseline Β· OpenAI / Anthropic / Google Β· Apache 2.0 Β· <b>Ginigen AI</b></center>")
    return app
643
 
644
if __name__=="__main__":
    # Print a startup summary (task counts per grade and domain) then serve.
    sg,sd={},{}
    for t in ALL_TASKS:sg[t.grade]=sg.get(t.grade,0)+1;sd[t.domain]=sd.get(t.domain,0)+1
    print(f"\n{'='*60}\n FINAL Bench v4.2 β€” Baseline (Non-AGI)\n Eval & Judge: OpenAI / Anthropic / Google\n{'='*60}")
    print(f" {len(ALL_TASKS)} tasks | {len(sd)} domains")
    for g in["A","B","C"]:print(f" Grade {g} (Γ—{GRADE_WEIGHT[g]}): {sg.get(g,0)}")
    print(f" πŸ”’ MetaCog: COMING SOON\n{'='*60}\n")
    app=create_app();app.queue(default_concurrency_limit=2)
    app.launch(server_name="0.0.0.0",server_port=7860,ssr_mode=False)