aiqtech commited on
Commit
b58e97e
Β·
verified Β·
1 Parent(s): 752c6b1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +820 -0
app.py ADDED
@@ -0,0 +1,820 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FINAL Bench v4.0 β€” Baseline (Non-AGI) Evaluation System
3
+ =========================================================
4
+ Frontier Intelligence Nexus for AGI-Level Verification
5
+
6
+ β˜… Baseline (Non-AGI) single-call evaluation
7
+ β˜… Multi-Provider: OpenAI / Anthropic / Google (Gemini)
8
+ β˜… 100 Tasks Β· 15 Domains Β· 8 TICOS Types Β· 5-Axis Β· 5-Stage AGI Grade
9
+ β˜… Judge: GPT-5.2 Structured Output (default)
10
+ β˜… Dataset: HuggingFace FINAL-Bench/Metacognitive
11
+
12
+ πŸ”’ MetaCog (Self-Correction Protocol) evaluation: COMING SOON
13
+ The 3-Phase Protocol (Initial β†’ Self-Review β†’ Correction) is the core
14
+ contribution of the paper and will be released in a future update.
15
+
16
+ Author: Ginigen AI β€” Choi Sunyoung
17
+ Paper: "FINAL Bench: Measuring Functional Metacognitive Reasoning in LLMs" (2026)
18
+ License: Apache 2.0
19
+ """
20
+
21
+ import json, os, time, csv, io, re, html, hashlib, sqlite3, threading
22
+ from datetime import datetime
23
+ from dataclasses import dataclass, field, asdict
24
+ from typing import List, Dict, Optional
25
+ import requests
26
+ import numpy as np
27
+ import gradio as gr
28
+ from concurrent.futures import ThreadPoolExecutor
29
+ from datasets import load_dataset
30
+
31
+ # ════════════════════════════════════════════════════════════════
32
+ # Β§1. Data Structures & Constants
33
+ # ════════════════════════════════════════════════════════════════
34
+
35
# Per-domain display metadata (emoji icon + accent color) consumed by the
# HTML builders in Β§8 (progress table, summary card, detail view).
DOMAIN_INFO = {
    "Mathematics & Logic": {"icon":"πŸ”’","color":"#FF6B35"},
    "Science": {"icon":"πŸ”¬","color":"#7B2FF7"},
    "Philosophy": {"icon":"πŸ€”","color":"#00B4D8"},
    "Medicine": {"icon":"πŸ₯","color":"#2EC4B6"},
    "Economics": {"icon":"πŸ“ˆ","color":"#E63946"},
    "History": {"icon":"πŸ“œ","color":"#F4A261"},
    "War & Security": {"icon":"πŸ›‘οΈ","color":"#264653"},
    "Space & Physics": {"icon":"πŸš€","color":"#6C63FF"},
    "Chemistry & Biology": {"icon":"🧬","color":"#06D6A0"},
    "Language & Writing": {"icon":"✍️","color":"#EF476F"},
    "Literature": {"icon":"πŸ“–","color":"#8338EC"},
    "Art": {"icon":"🎨","color":"#FF006E"},
    "Religion & Mythology": {"icon":"πŸ•ŠοΈ","color":"#FFD166"},
    "Ethics": {"icon":"βš–οΈ","color":"#118AB2"},
    "AI & Technology": {"icon":"πŸ€–","color":"#073B4C"},
}

# Per-grade weight applied when aggregating domain averages into the final
# score (Β§5, compute_final_score): grade-A domains count 1.5x, C only 0.7x.
GRADE_WEIGHT = {"A": 1.5, "B": 1.0, "C": 0.7}

# Judge rubric axes with their aggregation weights (weights sum to 1.0).
RUBRIC = {
    "process_quality": {"weight": 0.25, "desc": "Systematic reasoning transparency"},
    "metacognitive_accuracy": {"weight": 0.25, "desc": "Confidence calibration + uncertainty honesty"},
    "error_recovery": {"weight": 0.20, "desc": "Mid-analysis self-correction"},
    "integration_depth": {"weight": 0.15, "desc": "Multi-perspective synthesis"},
    "final_correctness": {"weight": 0.15, "desc": "Answer accuracy and completeness"},
}

# Maps each of the 5 reporting axes to the rubric keys it averages and the
# TICOS task types that receive a 1.5x weight boost on that axis
# (see compute_axis_scores in Β§5).
AXIS_MAP = {
    "generalization": {"rubrics": ["process_quality", "final_correctness"], "ticos": []},
    "reasoning": {"rubrics": ["process_quality", "error_recovery"], "ticos": ["E_SelfCorrecting", "C_ProgressiveDiscovery"]},
    "planning": {"rubrics": ["integration_depth", "process_quality"],"ticos": ["D_MultiConstraint", "H_DecisionUnderUncertainty"]},
    "reliability": {"rubrics": ["metacognitive_accuracy"], "ticos": ["E_SelfCorrecting", "G_PivotDetection"]},
    "safety": {"rubrics": ["error_recovery", "metacognitive_accuracy"], "ticos": ["A_TrapEscape", "G_PivotDetection"]},
}

# Score bands (inclusive min..max on the 0-100 final score) mapped to the
# 5-stage AGI grade labels; determine_agi_stage picks the highest matching
# band, with an extra all-axes>=60 gate for stages 4 and 5.
AGI_STAGES = [
    {"stage":1,"name":"FINAL-Partial","label":"Partial Intelligence", "min":0, "max":39, "color":"#f44336"},
    {"stage":2,"name":"FINAL-Proto", "label":"Proto Intelligence", "min":40,"max":59, "color":"#ff9800"},
    {"stage":3,"name":"FINAL-Pre", "label":"Pre-AGI", "min":60,"max":79, "color":"#2196f3"},
    {"stage":4,"name":"FINAL-Pass", "label":"AGI Achieved", "min":80,"max":94, "color":"#4caf50"},
    {"stage":5,"name":"FINAL-Post", "label":"Operationally Mature AGI", "min":95,"max":100,"color":"#9c27b0"},
]
78
+
79
@dataclass
class FinalTask:
    """A single FINAL-Bench task record loaded from the dataset."""
    task_id: str            # unique task identifier
    domain: str             # one of the 15 DOMAIN_INFO keys
    grade: str              # "A" / "B" / "C" (see GRADE_WEIGHT)
    ticos_type: str         # one of the 8 TICOS challenge types
    difficulty: str         # free-form difficulty label
    lens: str               # analytical lens / framing tag
    title: str              # short human-readable title
    prompt: str             # full task prompt sent to the evaluated model
    expected_behavior: str  # judge-facing description of the ideal response
    hidden_trap: str        # trap description shown to the judge ("" if none)
    # Per-instance mutable defaults via default_factory (never shared).
    ticos_required: List[str] = field(default_factory=list)
    metadata: Dict = field(default_factory=dict)
86
+
87
+ # ════════════════════════════════════════════════════════════════
88
+ # Β§2. Load Dataset from HuggingFace
89
+ # ════════════════════════════════════════════════════════════════
90
+
91
def load_tasks():
    """Load the FINAL-Bench task set.

    Tries the HuggingFace dataset ``FINAL-Bench/Metacognitive`` first; on any
    failure falls back to a local ``FINAL_Bench_v3.json`` in a few known
    locations.

    Returns:
        list[FinalTask]: all benchmark tasks.

    Raises:
        FileNotFoundError: when neither the HF dataset nor a local JSON file
            could be loaded.
    """
    print("πŸ“₯ Loading FINAL-Bench/Metacognitive from HuggingFace...")
    try:
        ds = load_dataset("FINAL-Bench/Metacognitive", split="train")
        tasks = []
        for row in ds:
            # `ticos_required` may arrive as a JSON-encoded string or a
            # comma-separated string; normalize it to a list either way.
            tr = row.get("ticos_required", [])
            if isinstance(tr, str):
                try:
                    tr = json.loads(tr)
                except (ValueError, TypeError):  # was a bare except: narrowed
                    tr = [x.strip() for x in tr.split(",") if x.strip()]
            tasks.append(FinalTask(
                task_id=row["task_id"], domain=row["domain"], grade=row["grade"],
                ticos_type=row["ticos_type"], difficulty=row["difficulty"],
                lens=row.get("lens", ""), title=row.get("title", row["task_id"]),
                prompt=row["prompt"], expected_behavior=row.get("expected_behavior", ""),
                hidden_trap=row.get("hidden_trap", ""),
                ticos_required=tr if isinstance(tr, list) else [], metadata={},
            ))
        print(f" βœ… Loaded {len(tasks)} tasks from HuggingFace")
        return tasks
    except Exception as e:
        print(f" ⚠️ HF load failed: {e}, trying local...")
    # Local fallback: the first readable copy wins.
    for p in ["FINAL_Bench_v3.json", "/mnt/user-data/uploads/FINAL_Bench_v3.json",
              os.path.join(os.path.dirname(os.path.abspath(__file__)), "FINAL_Bench_v3.json")]:
        if os.path.exists(p):
            with open(p, "r", encoding="utf-8") as f:
                data = json.load(f)
            print(f" βœ… Loaded from {p}")
            return [FinalTask(task_id=t["task_id"], domain=t["domain"], grade=t["grade"],
                              ticos_type=t["ticos_type"], difficulty=t["difficulty"], lens=t.get("lens", ""),
                              title=t["title"], prompt=t["prompt"], expected_behavior=t.get("expected_behavior", ""),
                              hidden_trap=t.get("hidden_trap", ""), ticos_required=t.get("ticos_required", []),
                              metadata=t.get("metadata", {})) for t in data["tasks"]]
    raise FileNotFoundError("Dataset not found!")
124
+
125
# Module-import side effect: downloads / reads the full task set once at
# startup. Fails loudly (FileNotFoundError) if no data source is available.
ALL_TASKS = load_tasks()
print(f"βœ… FINAL Bench v4.0: {len(ALL_TASKS)} tasks loaded")
127
+
128
+ # ════════════════════════════════════════════════════════════════
129
+ # Β§3. Multi-Provider API Layer
130
+ # ════════════════════════════════════════════════════════════════
131
+
132
# Provider -> {model_id: display label} for the models that can be evaluated.
PROVIDER_MODELS = {
    "OpenAI": {
        "gpt-5.2":"GPT-5.2 (flagship)","gpt-5-mini":"GPT-5 Mini",
        "gpt-4.1":"GPT-4.1","o4-mini":"o4-mini (reasoning)","gpt-4o":"GPT-4o",
    },
    "Anthropic": {
        "claude-opus-4-6":"Claude Opus 4.6","claude-sonnet-4-5-20250929":"Claude Sonnet 4.5",
        "claude-haiku-4-5-20251001":"Claude Haiku 4.5",
    },
    "Google": {
        "gemini-3-pro":"Gemini 3 Pro","gemini-2.5-pro":"Gemini 2.5 Pro",
        "gemini-2.5-flash":"Gemini 2.5 Flash",
    },
}

# Flattened "Label [Provider]" -> {"id", "provider"} mapping for the UI dropdown.
ALL_EVAL_MODELS = {}
for prov, models in PROVIDER_MODELS.items():
    for mid, label in models.items():
        ALL_EVAL_MODELS[f"{label} [{prov}]"] = {"id": mid, "provider": prov}

# Judge models are OpenAI-only (structured-output judging in Β§4).
JUDGE_MODELS = {"gpt-5.2":"GPT-5.2 (default)","gpt-4.1":"GPT-4.1","gpt-4o":"GPT-4o"}
153
+
154
+ def _strip_think(text):
155
+ if not text: return text
156
+ for tag in ['think','thinking','reasoning','reflection']:
157
+ text = re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL)
158
+ return text.strip()
159
+
160
def call_openai(prompt, system="", api_key="", model="gpt-5.2",
                max_tokens=8192, temperature=0.6, reasoning_effort=None):
    """Call the OpenAI chat-completions endpoint with retry on 429/transient errors.

    Returns the (think-stripped) completion text, "[EMPTY]" for an empty
    completion, or an "[API_ERROR] ..." string on failure. Never raises.
    """
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    payload = {"model": model, "max_completion_tokens": max_tokens,
               "temperature": temperature, "messages": messages}
    if reasoning_effort:
        payload["reasoning_effort"] = reasoning_effort
    for attempt in range(3):
        try:
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=headers, json=payload, timeout=300)
            r.raise_for_status()
            c = r.json()["choices"][0]["message"]["content"]
            return _strip_think(c) if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            # `r` is always bound here: raise_for_status() raised after assignment.
            if r.status_code == 429:
                time.sleep(5 * (attempt + 1))
                continue
            try:
                err = r.json().get("error", {}).get("message", "")
            except Exception:  # non-JSON error body
                err = str(r.status_code)
            return f"[API_ERROR] {err}"
        except Exception as e:  # network/timeout/parse errors: linear backoff
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] {e}"
    # BUG FIX: the original fell off the loop and returned None when all
    # three attempts hit 429; return an explicit error string instead.
    return "[API_ERROR] rate limited (retries exhausted)"
181
+
182
def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
                   max_tokens=8192, temperature=0.6):
    """Call the Anthropic Messages API with retry on 429/transient errors.

    Concatenates all text blocks of the response. Returns the completion
    text, "[EMPTY]", or an "[API_ERROR] ..." string. Never raises.
    """
    headers = {"Content-Type": "application/json", "x-api-key": api_key,
               "anthropic-version": "2023-06-01"}
    payload = {"model": model, "max_tokens": max_tokens, "temperature": temperature,
               "messages": [{"role": "user", "content": prompt}]}
    if system:
        payload["system"] = system  # Anthropic takes system as a top-level field
    for attempt in range(3):
        try:
            r = requests.post("https://api.anthropic.com/v1/messages",
                              headers=headers, json=payload, timeout=300)
            r.raise_for_status()
            data = r.json()
            text = "".join(b.get("text", "") for b in data.get("content", []) if b.get("type") == "text")
            return _strip_think(text) if text else "[EMPTY]"
        except requests.exceptions.HTTPError:
            if r.status_code == 429:
                time.sleep(5 * (attempt + 1))
                continue
            try:
                err = r.json().get("error", {}).get("message", "")
            except Exception:
                err = str(r.status_code)
            return f"[API_ERROR] {err}"
        except Exception as e:
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] {e}"
    # BUG FIX: previously returned None implicitly after three 429 responses.
    return "[API_ERROR] rate limited (retries exhausted)"
202
+
203
def call_gemini(prompt, system="", api_key="", model="gemini-3-pro",
                max_tokens=8192, temperature=0.6):
    """Call the Google Gemini generateContent API with retry on 429/transient errors.

    Joins all text parts of the first candidate. Returns the completion
    text, "[EMPTY]", or an "[API_ERROR] ..." string. Never raises.
    """
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
    headers = {"Content-Type": "application/json"}
    params = {"key": api_key}  # Gemini authenticates via query parameter
    payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}],
               "generationConfig": {"maxOutputTokens": max_tokens, "temperature": temperature}}
    if system:
        payload["systemInstruction"] = {"parts": [{"text": system}]}
    for attempt in range(3):
        try:
            r = requests.post(url, headers=headers, params=params, json=payload, timeout=300)
            r.raise_for_status()
            data = r.json()
            cands = data.get("candidates", [])
            if cands:
                text = "".join(p.get("text", "") for p in cands[0].get("content", {}).get("parts", []))
                return _strip_think(text) if text else "[EMPTY]"
            return "[EMPTY]"  # e.g. fully filtered response
        except requests.exceptions.HTTPError:
            if r.status_code == 429:
                time.sleep(5 * (attempt + 1))
                continue
            try:
                err = r.json().get("error", {}).get("message", "")
            except Exception:
                err = str(r.status_code)
            return f"[API_ERROR] {err}"
        except Exception as e:
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] {e}"
    # BUG FIX: previously returned None implicitly after three 429 responses.
    return "[API_ERROR] rate limited (retries exhausted)"
227
+
228
def call_model(prompt, system="", api_key="", model_id="gpt-5.2",
               provider="OpenAI", max_tokens=8192, temperature=0.6):
    """Route one completion request to the selected provider backend.

    Unknown providers produce an "[API_ERROR] ..." string rather than raising,
    matching the error convention of the per-provider callers.
    """
    if provider == "OpenAI":
        return call_openai(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Anthropic":
        return call_anthropic(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Google":
        return call_gemini(prompt, system, api_key, model_id, max_tokens, temperature)
    return f"[API_ERROR] Unknown provider: {provider}"
234
+
235
+ # ════════════════════════════════════════════════════════════════
236
+ # Β§4. Structured Judge (GPT-5.2)
237
+ # ════════════════════════════════════════════════════════════════
238
+
239
# System prompt for the structured judge (Β§4). The rubric percentages here
# mirror the RUBRIC weights in Β§1; scores are restricted to the five enum
# values enforced by JUDGE_SCHEMA. Runtime string — do not edit casually.
JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
Score each rubric using ONLY: 0.0 / 0.25 / 0.5 / 0.75 / 1.0

RUBRIC:
process_quality (25%): Systematic step-by-step reasoning. Complete answers score higher.
metacognitive_accuracy (25%): Confidence calibration. Overconfidence=0.25 max. Honest uncertainty=0.75+
error_recovery (20%): EXPLICIT self-correction. Score 0.5+ if ANY self-corrections exist.
integration_depth (15%): Multi-perspective synthesis + emergent insights
final_correctness (15%): Answer accuracy and completeness. INCOMPLETE answers get 0.25 max.

TICOS BONUSES:
A_TrapEscape: ID'd ALL hidden traps? B_ContradictionResolution: Resolved both sides?
C_ProgressiveDiscovery: Revised with new info? D_MultiConstraint: Mapped ALL conflicts?
E_SelfCorrecting: EXPLICIT backtrack? F_ExpertPanel: Max-depth per perspective?
G_PivotDetection: Found reversing premise? H_DecisionUnderUncertainty: Scenario matrix?

STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure
Output JSON: {"scores":{...},"comment":"<50 words>"}"""
257
+
258
def _build_judge_schema():
    """Build the strict JSON schema the judge's structured output must match.

    One enum-valued (0.0/0.25/0.5/0.75/1.0) number per RUBRIC key, all
    required, plus a free-form "comment" string.
    """
    score_props = {
        key: {"type": "number", "enum": [0.0, 0.25, 0.5, 0.75, 1.0]}
        for key in RUBRIC
    }
    scores_schema = {
        "type": "object",
        "properties": score_props,
        "required": list(RUBRIC.keys()),
        "additionalProperties": False,
    }
    return {
        "type": "object",
        "properties": {"scores": scores_schema, "comment": {"type": "string"}},
        "required": ["scores", "comment"],
        "additionalProperties": False,
    }
# Built once at import time; referenced by call_judge_structured.
JUDGE_SCHEMA = _build_judge_schema()
264
+
265
def call_judge_structured(prompt, system="", api_key="", model="gpt-5.2",
                          temperature=0.1, max_tokens=2048):
    """Call the OpenAI judge with a strict json_schema response format.

    Returns {"scores": {...}, "comment": str} on success, or None after 3
    failed attempts — callers then retry with a free-form completion and
    ``parse_judge_fallback``.
    """
    messages=[]
    if system: messages.append({"role":"system","content":system})
    messages.append({"role":"user","content":prompt})
    # Structured output: the response must validate against JUDGE_SCHEMA;
    # reasoning is disabled to keep the judge cheap and deterministic-ish.
    payload={"model":model,"max_completion_tokens":max_tokens,"temperature":temperature,
             "messages":messages,"reasoning_effort":"none",
             "response_format":{"type":"json_schema","json_schema":{"name":"FINALJudge","strict":True,"schema":JUDGE_SCHEMA}}}
    headers={"Content-Type":"application/json","Authorization":f"Bearer {api_key}"}
    for attempt in range(3):
        try:
            r=requests.post("https://api.openai.com/v1/chat/completions",headers=headers,json=payload,timeout=180)
            # Rate limited: linear backoff, retry the same attempt budget.
            if r.status_code==429: time.sleep(5*(attempt+1)); continue
            r.raise_for_status()
            content=_strip_think(r.json()["choices"][0]["message"]["content"] or "")
            if not content:
                if attempt<2: time.sleep(2); continue
                return None
            data=json.loads(content)
            if "scores" in data and isinstance(data["scores"],dict):
                # Backfill any rubric key the judge omitted with a neutral 0.5.
                for k in RUBRIC:
                    if k not in data["scores"]: data["scores"][k]=0.5
                return {"scores":data["scores"],"comment":data.get("comment","ok")}
        except json.JSONDecodeError:
            # Malformed JSON despite strict mode: short pause, then retry.
            if attempt<2: time.sleep(2); continue
            return None
        except:  # noqa: E722 — deliberately broad: any transport/parse error retries
            if attempt<2: time.sleep(3*(attempt+1)); continue
            return None
    return None
295
+
296
def build_judge_prompt(task, response):
    """Assemble the user-side judge prompt for one task/response pair.

    Long fields are truncated (prompt 2000, expected 600, response 17000
    chars) to keep the judge call within budget.
    """
    trap_text = task.hidden_trap or 'None'
    return f"""FINAL Bench Task Evaluation
Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.difficulty}
TICOS: {task.ticos_type} | Title: {task.title}

PROMPT:
{task.prompt[:2000]}

EXPECTED:
{task.expected_behavior[:600]}

HIDDEN TRAPS: {trap_text}

RESPONSE TO JUDGE:
{response[:17000]}

Score: process_quality, metacognitive_accuracy, error_recovery, integration_depth, final_correctness
Apply {task.ticos_type} bonus. Output ONLY JSON."""
314
+
315
def parse_judge_fallback(text, keys):
    """Best-effort parse of a free-form judge reply into rubric scores.

    Three stages, in order: (1) extract an embedded {"scores": {...}} JSON
    object; (2) regex-scan for individual `key: value` pairs, accepted only
    if at least 3 of the rubric keys are found; (3) give up with all-zero
    scores and "failed": True. All numeric values are snapped to the nearest
    valid enum value (0.0/0.25/0.5/0.75/1.0).
    """
    if not text or text.startswith("[API_ERROR"):
        return {"scores":{k:0.0 for k in keys},"comment":"API_ERROR","failed":True}
    cleaned=_strip_think(text); VALID={0.0,0.25,0.5,0.75,1.0}
    # Stage 1: a single flat JSON object containing a "scores" sub-object.
    try:
        m=re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}',cleaned,re.DOTALL)
        if m:
            d=json.loads(m.group())
            if "scores" in d:
                # Snap each value to the nearest allowed enum; missing keys -> 0.5.
                return {"scores":{k:min(VALID,key=lambda x:abs(x-float(d["scores"].get(k,0.5)))) for k in keys},"comment":d.get("comment","parsed")}
    except: pass
    # Stage 2: per-key regex scrape (handles "key: 0.75" / 'key'=0.75 forms).
    try:
        sc={}
        for k in keys:
            m2=re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)',cleaned,re.IGNORECASE)
            if m2:
                v=float(m2.group(1))
                if 0<=v<=1: sc[k]=min(VALID,key=lambda x:abs(x-v))
        # Require at least 3 recognized keys before trusting the scrape.
        if len(sc)>=3:
            for k in keys:
                if k not in sc: sc[k]=0.5
            return {"scores":sc,"comment":"regex"}
    except: pass
    # Stage 3: unparseable — zero everything and flag the failure.
    return {"scores":{k:0.0 for k in keys},"comment":"parse_failed","failed":True}
339
+
340
+ # ════════════════════════════════════════════════════════════════
341
+ # Β§5. Scoring Engine
342
+ # ════════════════════════════════════════════════════════════════
343
+
344
def compute_task_score(scores):
    """Weighted rubric average on a 0-100 scale; missing rubrics count as 0.5."""
    total = 0.0
    for key, spec in RUBRIC.items():
        total += scores.get(key, 0.5) * spec["weight"]
    return round(total * 100, 2)
346
+
347
def compute_axis_scores(results, tasks):
    """Aggregate per-task judge scores into the 5 AXIS_MAP axes (0-100 each).

    For each axis: averages that axis's rubric values across all scored
    tasks, with a 1.5x weight for tasks whose TICOS type is listed for the
    axis, then caps the result at 100. Judge-failed tasks (score < 0) are
    skipped entirely.
    """
    tm={t.task_id:t for t in tasks}; ax={}
    for an,ai in AXIS_MAP.items():
        vals=[]
        for tid,d in results.items():
            if d["score"]<0: continue  # judge failure — exclude from axes
            t=tm.get(tid);
            if not t: continue
            # d["judge"] may be a JSON string (from the DB) or a dict.
            try: jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else d["judge"]; sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
            except: sc={}
            rv=[float(sc.get(r,0.5)) for r in ai["rubrics"] if r in sc]
            # TICOS boost: axis-relevant task types count 1.5x.
            w=1.5 if(ai["ticos"] and t.ticos_type in ai["ticos"]) else 1.0
            if rv: vals.append(np.mean(rv)*w)
        ax[an]=round(min(np.mean(vals)*100,100),2) if vals else 0.0
    return ax
362
+
363
def compute_final_score(results, tasks):
    """Compute the headline FINAL score.

    Pipeline: per-domain averages -> grade-weighted base score (a domain is
    weighted once per grade it appears under, via GRADE_WEIGHT) -> multiplied
    by the harmonic mean of the 5 axis scores (as a 0-1 factor), so one weak
    axis drags the final score down.

    Returns:
        (final, base, har_factor, axis_scores, domain_averages)
    """
    tm={t.task_id:t for t in tasks}; ds={}
    for tid,d in results.items():
        if d["score"]<0: continue  # judge failures excluded
        t=tm.get(tid)
        if t: ds.setdefault(t.domain,[]).append(d["score"])
    da={d:np.mean(v) for d,v in ds.items() if v}
    # Which domains appear under which grade (a domain can repeat per grade).
    gd={}
    for t in tasks: gd.setdefault(t.grade,set()).add(t.domain)
    ws,wt=0,0
    for g,doms in gd.items():
        w=GRADE_WEIGHT.get(g,1.0)
        for d in doms:
            if d in da: ws+=da[d]*w; wt+=w
    base=ws/wt if wt>0 else 0
    axis=compute_axis_scores(results,tasks)
    # Harmonic mean penalizes imbalance; floor each axis at 0.01 to avoid
    # division by zero.
    av=[max(v,0.01) for v in axis.values()]
    har=(len(av)/sum(1.0/v for v in av)) if av else 50
    har_p=har/100.0
    return round(base*har_p,2),round(base,2),round(har_p,3),axis,da
383
+
384
def determine_agi_stage(score, axis):
    """Map a final score + axis scores to an AGI_STAGES entry.

    Picks the highest stage whose "min" the score reaches; stages 4-5
    additionally require every axis >= 60, otherwise the result is demoted
    to stage 3 (FINAL-Pre).
    """
    all60=all(v>=60 for v in axis.values()) if axis else False
    for s in reversed(AGI_STAGES):
        if score>=s["min"]:
            # Gate: "AGI Achieved"/"Post" demand balanced axes, not just total.
            if s["stage"]>=4 and not all60: return AGI_STAGES[2]
            return s
    return AGI_STAGES[0]
391
+
392
+ # ════════════════════════════════════════════════════════════════
393
+ # Β§6. Checkpoint DB
394
+ # ════════════════════════════════════════════════════════════════
395
# SQLite checkpoint file: lets an interrupted run resume without re-paying
# for completed API calls.
DB_PATH="final_bench_eval.db"
def _init_db():
    """Create the eval_results table if it does not exist yet."""
    c=sqlite3.connect(DB_PATH)
    # (run_id, task_id) is the primary key so re-runs upsert per task.
    c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))")
    c.commit(); c.close()
400
+ def _make_run_id(m): return hashlib.md5(f"FINALv40_BL_{m}".encode()).hexdigest()[:12]
401
def _save_result(rid,tid,resp,jresp,sc):
    """Upsert one task result (model response, judge JSON, weighted score)."""
    c=sqlite3.connect(DB_PATH); c.execute("INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",(rid,tid,resp,jresp,sc,time.time())); c.commit(); c.close()
def _load_all(rid):
    """Load every cached result for a run as {task_id: {response, judge, score}}."""
    c=sqlite3.connect(DB_PATH); cur=c.execute("SELECT task_id,model_response,judge_response,weighted_score FROM eval_results WHERE run_id=?",(rid,)); rows=cur.fetchall(); c.close()
    return {r[0]:{"response":r[1],"judge":r[2],"score":r[3]} for r in rows}
def _clear_run(rid):
    """Delete all cached results for a run (forces a full re-evaluation)."""
    c=sqlite3.connect(DB_PATH); c.execute("DELETE FROM eval_results WHERE run_id=?",(rid,)); c.commit(); c.close()
# Ensure the checkpoint table exists at import time.
_init_db()
409
+
410
+ # ════════════════════════════════════════════════════════════════
411
+ # Β§7. CSV Export
412
+ # ════════════════════════════════════════════════════════════════
413
def generate_csv(results, tasks, model_name, mode="BASELINE"):
    """Serialize per-task evaluation results to a CSV string.

    One row per scored task (sorted by task_id); judge-failed tasks are
    written with score -1 and a "JUDGE_FAILED:" comment prefix. Responses
    are previewed (first 300 chars, newlines flattened).
    """
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(["task_id","domain","grade","ticos_type","difficulty","title","model","mode","weighted_score",
                     "process_quality","metacognitive_accuracy","error_recovery","integration_depth","final_correctness",
                     "judge_comment","response_preview","timestamp"])
    by_id = {t.task_id: t for t in tasks}
    for tid, entry in sorted(results.items()):
        task = by_id.get(tid)
        if task is None:
            continue  # result for a task not in the current set
        judged = {}
        try:
            raw = entry["judge"]
            judged = json.loads(raw) if isinstance(raw, str) else (raw or {})
        except Exception:
            pass  # unparseable judge payload — emit blank score columns
        scores = judged.get("scores", {}) if isinstance(judged, dict) else {}
        comment = (judged.get("comment", "") if isinstance(judged, dict) else "")[:200]
        score = entry["score"]
        if score < 0:
            score = -1
            comment = f"JUDGE_FAILED:{comment}"
        preview = (entry.get("response", "") or "")[:300].replace("\n", " ")
        writer.writerow([tid, task.domain, task.grade, task.ticos_type, task.difficulty, task.title,
                         model_name, mode, score,
                         scores.get("process_quality", ""), scores.get("metacognitive_accuracy", ""),
                         scores.get("error_recovery", ""), scores.get("integration_depth", ""),
                         scores.get("final_correctness", ""),
                         comment, preview, datetime.now().isoformat()])
    return buf.getvalue()
434
+
435
+ # ════════════════════════════════════════════════════════════════
436
+ # Β§8. HTML Builders
437
+ # ════════════════════════════════════════════════════════════════
438
# Shared inline stylesheet prepended to every HTML fragment the builders
# return (Gradio renders these as raw HTML). Runtime string — byte-exact.
CSS = """<style>
.eval-table{width:100%;border-collapse:collapse;font-size:0.82em}
.eval-table th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc;font-size:0.9em}
.eval-table td{padding:5px 8px;border-bottom:1px solid #eee}
.score-bar{background:#e0e0e0;border-radius:8px;height:16px;overflow:hidden;min-width:70px}
.score-fill{height:100%;border-radius:8px;transition:width .4s}
.summary-card{background:linear-gradient(135deg,#0a0a1a,#1a1a3e);border-radius:16px;padding:24px;color:#fff;margin:8px 0}
.axis-row{display:flex;align-items:center;gap:10px;margin:5px 0}
.axis-bar{flex:1;background:#333;border-radius:6px;height:14px;overflow:hidden}
.axis-fill{height:100%;border-radius:6px}
.stage-badge{display:inline-block;padding:6px 16px;border-radius:20px;font-weight:700;font-size:1.1em;margin:8px 0}
.progress-bar{background:#e0e0e0;border-radius:8px;height:22px;margin:12px 0;overflow:hidden}
.progress-fill{height:100%;border-radius:8px;transition:width .4s;background:linear-gradient(90deg,#1565c0,#00c853)}
</style>"""
452
+
453
+ def _sc(s):
454
+ if s>=80: return "#4caf50"
455
+ if s>=60: return "#ff9800"
456
+ if s>=40: return "#ff5722"
457
+ return "#f44336"
458
+
459
def _build_progress_table(results, tasks):
    """Render the live per-task progress table as an HTML string.

    Row states: judge-failed (score < 0) -> amber ❌ row; scored -> color
    bar + value; not yet evaluated -> dimmed ⏳ row.
    """
    rows=""
    for t in tasks:
        info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
        # Grade badge colored red/blue/purple for A/B/C.
        gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
        if t.task_id in results:
            s=results[t.task_id]["score"]
            if s<0:
                # Judge failure: highlighted row, no score value.
                rows+=f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">❌</td><td>β€”</td></tr>'
            else:
                c=_sc(s)
                rows+=f'<tr><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
        else:
            # Pending task: dimmed placeholder row.
            rows+=f'<tr style="opacity:0.35"><td>{t.task_id}</td><td>{info["icon"]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td>⏳</td><td>β€”</td></tr>'
    return f'{CSS}<table class="eval-table"><thead><tr><th>ID</th><th>Domain</th><th>G</th><th>TICOS</th><th>Diff</th><th>Score</th><th>Val</th></tr></thead><tbody>{rows}</tbody></table>'
474
+
475
def _build_summary_card(results, tasks, model_name, hf_status):
    """Render the headline summary card (final score, stage, axes, gap) as HTML.

    Also computes the MA-ER gap (metacognitive_accuracy minus error_recovery
    averages) as a declaration-vs-action indicator, and the three AGI pass
    checks (score>=80, all axes>=60, A-grade domain average>=75).
    """
    final,base,har_p,axis,dom_avgs=compute_final_score(results,tasks)
    stage=determine_agi_stage(final,axis)
    labels={"generalization":"🌐 Generalization","reasoning":"🧠 Reasoning","planning":"πŸ“‹ Planning","reliability":"🎯 Reliability","safety":"πŸ›‘οΈ Safety"}
    # One colored bar per axis.
    ax_html=""
    for an,av in axis.items():
        c=_sc(av)
        ax_html+=f'<div class="axis-row"><span style="width:120px;font-size:0.85em">{labels.get(an,an)}</span><div class="axis-bar"><div class="axis-fill" style="width:{min(av,100)}%;background:{c}"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c}">{av:.1f}</span></div>'
    # Grade-level domain averages (AΓ—1.5 etc. per GRADE_WEIGHT).
    gh=""
    for g in ["A","B","C"]:
        gd=[t.domain for t in tasks if t.grade==g]
        gs=[dom_avgs[d] for d in set(gd) if d in dom_avgs]
        if gs: a=np.mean(gs); gh+=f'<span style="margin-right:14px">{g}Γ—{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
    done=sum(1 for t in tasks if t.task_id in results)
    jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
    # MA-ER Gap: average metacognitive_accuracy vs error_recovery over scored tasks.
    ma_vals,er_vals=[],[]
    for tid,d in results.items():
        if d["score"]<0: continue
        try:
            jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else d["judge"]
            sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
            if "metacognitive_accuracy" in sc: ma_vals.append(float(sc["metacognitive_accuracy"]))
            if "error_recovery" in sc: er_vals.append(float(sc["error_recovery"]))
        except: pass
    avg_ma=np.mean(ma_vals) if ma_vals else 0; avg_er=np.mean(er_vals) if er_vals else 0
    gap=avg_ma-avg_er
    # Gap > 0.2 means the model talks about uncertainty more than it acts on it.
    gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
    gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
    # Pass checks for the AGI stages (mirrors determine_agi_stage's gates).
    ad=[t.domain for t in tasks if t.grade=="A"]
    asc=[dom_avgs[d] for d in set(ad) if d in dom_avgs]
    aa=np.mean(asc) if asc else 0
    checks=[("Scoreβ‰₯80",final>=80),("Axesβ‰₯60",all(v>=60 for v in axis.values())),(f"A-avgβ‰₯75({aa:.0f})",aa>=75)]
    ch="".join([f'<span style="margin-right:8px">{"βœ…" if ok else "❌"}{lb}</span>' for lb,ok in checks])
    return f"""{CSS}<div class="summary-card">
<div style="text-align:center">
<div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div>
<h2 style="margin:6px 0;font-size:1.6em">πŸ€– Baseline FINAL: {final:.1f}</h2>
<p style="color:#aaa;font-size:0.85em">{stage['label']} Β· Base {base:.1f} Γ— HAR {har_p:.3f} Β· {model_name} Β· {done}/{len(tasks)}{f" Β· JF={jf}" if jf else ""}</p>
</div><hr style="border-color:#333;margin:12px 0">
<h4 style="color:#aaa;margin:6px 0">🎯 5-Axis Scores</h4>{ax_html}
<hr style="border-color:#333;margin:10px 0">
<div style="font-size:0.88em">{gh}</div>
<div style="display:flex;align-items:center;gap:12px;margin:8px 0;padding:8px;background:rgba(255,255,255,0.05);border-radius:8px">
<span style="font-size:0.85em">MA-ER Gap:</span>
<span style="font-weight:700;color:{gc}">{gap:.3f}</span>
<span style="font-size:0.8em;color:{gc}">({gl})</span>
<span style="font-size:0.78em;color:#888">MA={avg_ma:.3f} ER={avg_er:.3f}</span></div>
<div style="font-size:0.82em;margin-top:6px">{ch}</div>
<p style="font-size:0.78em;color:#666;margin-top:8px">{hf_status}</p>
<div style="background:rgba(233,69,96,0.15);border:1px solid #e94560;border-radius:8px;padding:10px;margin-top:12px">
<p style="font-size:0.82em;color:#e94560;margin:0">πŸ”’ <b>MetaCog (Self-Correction) evaluation: COMING SOON</b></p>
<p style="font-size:0.75em;color:#aaa;margin:4px 0 0 0">The 3-Phase Protocol can boost performance up to 70%+ on hardest tasks.</p>
</div></div>"""
530
+
531
def _build_detail_view(results, tasks):
    """Render one collapsible <details> element per evaluated task.

    Shows per-rubric scores, the judge comment, and a 500-char response
    preview. Model responses and judge comments are HTML-escaped.
    """
    items=""
    for t in tasks:
        if t.task_id not in results: continue  # only completed tasks
        d=results[t.task_id]; info=DOMAIN_INFO.get(t.domain,{"icon":"?"})
        s=d["score"]; resp=html.escape((d.get("response","") or "")[:500])
        jc=""; ss=""
        try:
            # d["judge"] may be a JSON string (from the DB) or a dict.
            jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else(d["judge"] or {})
            jc=html.escape((jd.get("comment","") if isinstance(jd,dict) else "")[:200])
            sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
            ss=" Β· ".join([f"{k.split('_')[0]}={v}" for k,v in sc.items()])
        except: pass
        # Judge failures (score < 0) show an amber "JF" badge instead of a value.
        c=_sc(s) if s>=0 else "#ff9800"; badge=f'{s:.1f}' if s>=0 else "JF"
        items+=f'<details style="margin:3px 0;border:1px solid #ddd;border-radius:8px;padding:8px"><summary style="cursor:pointer;font-weight:600">{info["icon"]} {t.task_id} [{t.grade}] β€” <span style="color:{c}">{badge}</span></summary><div style="font-size:0.8em;margin-top:6px"><b>{t.title}</b><br>TICOS: {t.ticos_type} | Scores: {ss}<br>Judge: {jc}<br>Response: {resp}...</div></details>'
    return CSS+items
547
+
548
+ # ════════════════════════════════════════════════════════════════
549
+ # Β§9. Evaluation Engine (Baseline Only)
550
+ # ════════════════════════════════════════════════════════════════
551
+
552
def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
                 judge_api_key, judge_model, state):
    """Evaluate one task end-to-end: model call -> judge -> checkpoint.

    Designed to run inside a ThreadPoolExecutor; all mutations of the shared
    ``state`` dict happen under state["lock"]. Never raises — every failure
    path is converted to a saved result so the run can continue.

    Returns:
        (task_id, {"response", "judge", "score"}) — score is -1.0 when the
        judge failed, 0 when the model call itself errored.
    """
    try:
        # Nudge the evaluated model toward the metacognitive behaviors the
        # rubric rewards (confidence statements, explicit backtracking).
        sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
               f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. "
               f"If unsure, say so honestly.")
        model_response=call_model(task.prompt,system=sys_p,api_key=eval_api_key,
                                  model_id=eval_model_id,provider=eval_provider,max_tokens=12288)
        if model_response.startswith("[API_ERROR") or model_response=="[EMPTY]":
            # Model call failed: record a zero score and move on.
            _save_result(run_id,task.task_id,model_response,"{}",0)
            with state["lock"]: state["done"]+=1; state["errors"].append(f"{task.task_id}: {model_response[:50]}")
            return task.task_id,{"response":model_response,"judge":"{}","score":0}
        jp=build_judge_prompt(task,model_response)
        # Primary: structured-output judge; fallback: free-form + regex parse.
        jd=call_judge_structured(jp,system=JUDGE_SYSTEM,api_key=judge_api_key,model=judge_model)
        if jd is None:
            jr=call_openai(jp,system=JUDGE_SYSTEM,api_key=judge_api_key,model=judge_model,max_tokens=2048,temperature=0.05)
            jd=parse_judge_fallback(jr,list(RUBRIC.keys()))
        if jd is None:
            jd={"scores":{k:0.0 for k in RUBRIC},"comment":"FAILURE","failed":True}
        if jd.get("failed"):
            # Sentinel score -1 marks a judge failure (excluded from aggregates).
            ws=-1.0; jd["comment"]=f"JF:{jd['comment']}"
        else:
            ws=compute_task_score(jd["scores"])
            with state["lock"]: state["parse_ok"]+=1
        jj=json.dumps(jd,ensure_ascii=False)
        _save_result(run_id,task.task_id,model_response,jj,ws)
        with state["lock"]:
            state["done"]+=1
            info=DOMAIN_INFO.get(task.domain,{"icon":"?"})
            # Keep a rolling window of the 10 most recently finished tasks.
            state["active"].append(f'{info["icon"]} {task.task_id}')
            if len(state["active"])>10: state["active"]=state["active"][-10:]
        return task.task_id,{"response":model_response,"judge":jj,"score":ws}
    except Exception as e:
        # Catch-all so one bad task never kills the worker thread.
        with state["lock"]: state["done"]+=1; state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
        _save_result(run_id,task.task_id,f"[ERROR] {e}","{}",0)
        return task.task_id,{"response":f"[ERROR] {e}","judge":"{}","score":0}
588
+
589
+ # ════════════════════════════════════════════════════════════════
590
+ # Β§10. State Machine + Background Thread
591
+ # ════════════════════════════════════════════════════════════════
592
+
593
# Global shared state for the single evaluation run. The Gradio UI thread
# (_start_eval/_stop/_poll) and the background worker (_bg_eval/_eval_single)
# both read and mutate it; all mutation must happen under the "lock" entry.
_EVAL_STATE={
    "running":False,"stop_requested":False,"finished":False,  # lifecycle flags
    "run_id":"","model":"","done":0,"total":0,"cached":0,     # run identity + progress counters
    "errors":[],"active":[],"parse_ok":0,"parse_fail":0,      # recent errors, active-task chips, judge-parse stats
    "start_time":0,"results":{},"tasks":[],                   # wall-clock start, per-task results, task list
    "grade_done":{},"grade_total":{},                         # per-grade (A/B/C) progress
    "lock":threading.Lock(),"message":"","csv_path":None,"hf_status":"","n_workers":5,  # sync, status text, CSV output, HF upload status, pool size
}
601
+
602
def _reset():
    """Restore per-run fields of the global eval state to their idle defaults.

    Leaves run identity ("run_id", "model"), "total", "n_workers", and the
    lock itself untouched, matching the keys set at module import time.
    """
    idle_defaults = {
        "running": False, "stop_requested": False, "finished": False,
        "done": 0, "cached": 0, "errors": [], "active": [],
        "parse_ok": 0, "parse_fail": 0, "start_time": 0,
        "results": {}, "tasks": [], "grade_done": {}, "grade_total": {},
        "message": "", "csv_path": None, "hf_status": "",
    }
    with _EVAL_STATE["lock"]:
        for field, value in idle_defaults.items():
            _EVAL_STATE[field] = value
608
+
609
+ def _prog_html(state, pending):
610
+ done=state["done"]; pct=min(int(done/max(pending,1)*100),100)
611
+ gb=""
612
+ for g in ["A","B","C"]:
613
+ gt=state["grade_total"].get(g,0); gd=state["grade_done"].get(g,0)
614
+ if gt==0: continue
615
+ gp=min(int(gd/gt*100),100)
616
+ c="#4caf50" if gp==100 else("#1976d2" if gp>0 else "#e0e0e0")
617
+ emoji="πŸ…°οΈ" if g=="A" else "πŸ…±οΈ" if g=="B" else "πŸ…ΎοΈ"
618
+ gb+=f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:100px;font-size:0.85em">{emoji} {g}Γ—{GRADE_WEIGHT[g]}</span><div style="flex:1;background:#e0e0e0;border-radius:6px;height:14px;overflow:hidden"><div style="width:{gp}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:55px;font-size:0.82em;text-align:right;color:{c}">{gd}/{gt}</span></div>'
619
+ o=f'<div style="margin:8px 0"><div style="display:flex;justify-content:space-between;font-size:0.95em;margin-bottom:6px"><span>⚑ <b>πŸ€– Baseline (Non-AGI)</b> β€” {done}/{pending}</span><span style="font-weight:700">{pct}%</span></div><div class="progress-bar"><div class="progress-fill" style="width:{pct}%"></div></div>{gb}'
620
+ ac=state.get("active",[])
621
+ if ac: o+='<div style="margin-top:8px">πŸ”„ '+" ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em">{a}</span>' for a in ac[-8:]])+'</div>'
622
+ er=state.get("errors",[])
623
+ if er: o+=f'<div style="color:#c62828;margin-top:6px;font-size:0.8em">{" · ".join(["⚠️"+html.escape(e[:30]) for e in er[-3:]])}</div>'
624
+ return o+'</div>'
625
+
626
def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
             judge_api_key, judge_model, tasks, run_id, n_workers):
    """Background worker: run the full baseline evaluation for one model.

    Resumes from cached per-task results for ``run_id``, fans pending tasks
    out to a thread pool, polls the futures so a stop request can cancel
    still-queued work, then computes the FINAL score and writes the CSV.
    Shared state in ``_EVAL_STATE`` is mutated only under its lock; any
    fatal error is surfaced via ``_EVAL_STATE["message"]``.

    Fix vs. previous version: the future-collection loop used a bare
    ``except: pass`` which also swallowed SystemExit/KeyboardInterrupt; it is
    now narrowed to ``except Exception``.
    """
    global _EVAL_STATE
    try:
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["start_time"]=time.time()
            _EVAL_STATE["message"]=f"⚑ Baseline β€” {eval_label} β€” {len(tasks)} tasks"
        # Resume support: skip tasks already persisted for this run id.
        results=dict(_load_all(run_id))
        cached=sum(1 for t in tasks if t.task_id in results)
        pending=[t for t in tasks if t.task_id not in results]
        gt={}
        for t in pending: gt.setdefault(t.grade,[]).append(t)
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["results"]=results; _EVAL_STATE["cached"]=cached
            _EVAL_STATE["total"]=len(pending)
            _EVAL_STATE["grade_total"]={g:len(ts) for g,ts in gt.items()}
            _EVAL_STATE["grade_done"]={g:0 for g in gt}
            _EVAL_STATE["done"]=0; _EVAL_STATE["errors"]=[]; _EVAL_STATE["active"]=[]
        if pending:
            with ThreadPoolExecutor(max_workers=n_workers) as ex:
                futs={}
                for t in pending:
                    if _EVAL_STATE["stop_requested"]: break
                    futs[ex.submit(_eval_single,t,run_id,eval_api_key,eval_model_id,
                                   eval_provider,judge_api_key,judge_model,_EVAL_STATE)]=t
                # Poll instead of as_completed() so a stop request is noticed
                # between completions and can cancel still-queued futures.
                done_set=set()
                while len(done_set)<len(futs):
                    if _EVAL_STATE["stop_requested"]:
                        ex.shutdown(wait=False,cancel_futures=True)
                        break
                    for f in list(futs):
                        if f in done_set or not f.done(): continue
                        done_set.add(f)
                        try:
                            tid,data=f.result()
                            with _EVAL_STATE["lock"]:
                                _EVAL_STATE["results"][tid]=data
                                to=futs[f]
                                _EVAL_STATE["grade_done"][to.grade]=_EVAL_STATE["grade_done"].get(to.grade,0)+1
                        except Exception:
                            # _eval_single records its own errors; narrowed from
                            # a bare except so KeyboardInterrupt/SystemExit are
                            # not silently swallowed.
                            pass
                    time.sleep(0.5)
        with _EVAL_STATE["lock"]: results=dict(_EVAL_STATE["results"])
        # Aggregate: FINAL score, AGI stage, and CSV export for download.
        final,base,har,axis,_=compute_final_score(results,tasks)
        stage=determine_agi_stage(final,axis)
        csv_str=generate_csv(results,tasks,eval_label,"BASELINE")
        cp=f"/tmp/final_{run_id}.csv"
        with open(cp,"w",encoding="utf-8") as f: f.write(csv_str)
        elapsed=int(time.time()-_EVAL_STATE["start_time"])
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["csv_path"]=cp; _EVAL_STATE["hf_status"]=""
            _EVAL_STATE["message"]=f"🏁 {stage['name']} β€” FINAL={final:.1f} Β· {elapsed}s"
            _EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
    except Exception as e:
        # Any uncaught failure ends the run with a visible fatal message.
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["message"]=f"❌ Fatal: {str(e)[:100]}"
            _EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
679
+
680
def _start_eval(eval_api_key, judge_api_key, eval_model_choice, judge_model,
                grade_f, diff_f, max_t, n_w, fresh):
    """Validate inputs, assemble the filtered task list, and launch a run.

    Returns a short status string for the UI. When ``fresh`` is true the
    cached results for this run id are cleared before starting.
    """
    global _EVAL_STATE
    # Guard clauses: refuse to start without a model and both API keys.
    if _EVAL_STATE["running"]: return "⚠️ Already running"
    eval_api_key = (eval_api_key or "").strip()
    judge_api_key = (judge_api_key or "").strip()
    model_info = ALL_EVAL_MODELS.get(eval_model_choice)
    if not model_info: return "❌ Select an evaluation model"
    eval_model_id = model_info["id"]
    eval_provider = model_info["provider"]
    if not eval_api_key: return f"❌ {eval_provider} API Key required for evaluation model"
    if not judge_api_key: return "❌ OpenAI API Key required for Judge"
    # Apply grade/difficulty filters and the task-count cap in one pass.
    selected = [
        t for t in ALL_TASKS
        if (grade_f == "All" or t.grade == grade_f)
        and (diff_f == "All" or t.difficulty == diff_f)
    ][: int(max_t)]
    rid = _make_run_id(eval_model_id)
    if fresh: _clear_run(rid)
    _reset()
    workers = int(n_w)
    with _EVAL_STATE["lock"]:
        _EVAL_STATE.update({"running": True, "run_id": rid, "model": eval_model_choice,
                            "tasks": selected, "total": len(selected), "n_workers": workers})
    # Fire-and-forget daemon thread; _poll() picks up progress from state.
    worker = threading.Thread(
        target=_bg_eval, daemon=True,
        args=(eval_api_key, eval_model_id, eval_provider, eval_model_choice,
              judge_api_key, judge_model, selected, rid, workers))
    worker.start()
    return f"⚑ Baseline Started β€” {eval_model_choice} ({len(selected)} tasks)"
705
+
706
def _stop():
    """Request a graceful stop of the current run; no-op when idle."""
    if not _EVAL_STATE["running"]:
        return "ℹ️ Not running"
    _EVAL_STATE["stop_requested"] = True
    return "⏹️ Stopping..."
709
+
710
def _poll():
    """Timer callback: render all UI panes from a consistent state snapshot.

    Returns a 5-tuple for the Gradio outputs:
    (progress_html, results_table_html, summary_html, detail_html, csv_path).

    Fix vs. previous version: all fields (including "total", "cached",
    "model", "hf_status") are now read in one shallow snapshot taken under
    the lock, and the renderers receive copies — previously several reads
    and the dict passed to _prog_html raced with the background worker.
    """
    with _EVAL_STATE["lock"]:
        # Shallow copy is sufficient: the renderers only read. Copy the
        # results dict too, since the worker inserts into it concurrently.
        snap = dict(_EVAL_STATE)
        snap["results"] = dict(_EVAL_STATE.get("results", {}))
    running = snap["running"]; finished = snap["finished"]
    tasks = snap.get("tasks", []); results = snap["results"]
    msg = snap.get("message", ""); cp = snap.get("csv_path")
    if not running and not finished and not results:
        return ("ℹ️ Configure API keys, select models, then press ▢️ Start", "", "", "", None)
    if running:
        pend = snap.get("total", 0) - snap.get("cached", 0)
        ph = CSS + _prog_html(snap, pend)
    elif finished:
        ph = f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;font-weight:600">{msg}</div>'
    else:
        ph = msg
    th = _build_progress_table(results, tasks) if tasks else ""
    sh, dh, co = "", "", None
    if finished and tasks:
        # Summary card, detail accordion, and CSV only once the run ends.
        sh = _build_summary_card(results, tasks, snap.get("model", "?"), snap.get("hf_status", ""))
        dh = _build_detail_view(results, tasks)
        co = cp
    return (ph, th, sh, dh, co)
732
+
733
+ # ════════════════════════════════════════════════════════════════
734
+ # Β§11. Gradio App
735
+ # ════════════════════════════════════════════════════════════════
736
+
737
+ HEADER = """
738
+ <div style="text-align:center;padding:16px 0">
739
+ <h1 style="margin:0;font-size:1.8em">πŸ† FINAL Bench v4.0 β€” Baseline Evaluation</h1>
740
+ <h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
741
+ <p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto">
742
+ <b>100 Tasks Β· 15 Domains Β· 8 TICOS Β· 5-Axis Β· 5-Stage AGI Grade</b><br>
743
+ πŸ€– Baseline (Non-AGI) β€” Single LLM Evaluation Β· Multi-Provider<br>
744
+ βš–οΈ Judge: GPT-5.2 Structured Output Β· πŸ“Š AΓ—1.5 BΓ—1.0 CΓ—0.7 Β· HAR Penalty
745
+ </p>
746
+ <div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
747
+ <span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI</span>
748
+ <span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic</span>
749
+ <span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google</span>
750
+ </div>
751
+ <div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
752
+ <p style="color:#e94560;font-size:0.85em;margin:0">πŸ”’ <b>MetaCog (Self-Correction Protocol) evaluation: COMING SOON</b></p>
753
+ <p style="color:#888;font-size:0.78em;margin:4px 0 0 0">
754
+ 3-Phase Protocol (Initial β†’ Self-Review β†’ Correction) β€” paper's core contribution.<br>
755
+ Performance boost up to 70%+ on hardest tasks. Future update.
756
+ </p></div>
757
+ <div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
758
+ <a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">πŸ“Š Dataset</a>
759
+ <a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">πŸ† Leaderboard</a>
760
+ </div></div>"""
761
+
762
def create_app():
    """Build and return the Gradio Blocks UI for the baseline benchmark.

    Layout: header, API-key inputs, model selection, run settings, start/stop
    controls, and tabbed output panes refreshed by a 2-second timer that
    polls the shared _EVAL_STATE via _poll().
    """
    with gr.Blocks(title="FINAL Bench v4.0",theme=gr.themes.Soft(),
        css=".gradio-container{max-width:1100px !important} header{display:none!important}") as app:
        gr.HTML(HEADER)
        # --- API key inputs (target model + judge) ---
        gr.Markdown("### πŸ”‘ API Keys")
        with gr.Row():
            eval_api_key=gr.Textbox(label="Evaluation Model API Key",type="password",
                placeholder="OpenAI (sk-...) / Anthropic (sk-ant-...) / Google (AIza...)",
                info="Enter the API key matching your selected evaluation model",scale=3)
            judge_api_key=gr.Textbox(label="Judge API Key (OpenAI required)",type="password",
                placeholder="sk-... (OpenAI key for GPT-5.2 judge)",
                info="Judge always uses OpenAI GPT-5.2 Structured Output",scale=3)
        # --- Model selection ---
        gr.Markdown("### πŸ€– Model Selection")
        with gr.Row():
            eval_m=gr.Dropdown(label="Evaluation Target",choices=list(ALL_EVAL_MODELS.keys()),
                value="GPT-5.2 (flagship) [OpenAI]",
                info="Select model to evaluate. Use matching API key above.",scale=4)
            judge_m=gr.Dropdown(label="βš–οΈ Judge Model",choices=list(JUDGE_MODELS.keys()),
                value="gpt-5.2",info="Uses OpenAI Structured Output",scale=2)
        # --- Run settings (filters, task cap, worker-pool size) ---
        gr.Markdown("### βš™οΈ Settings")
        with gr.Row():
            gf=gr.Dropdown(["All","A","B","C"],value="All",label="Grade Filter",scale=1)
            df=gr.Dropdown(["All","expert","frontier"],value="All",label="Difficulty Filter",scale=1)
            mt=gr.Slider(1,100,value=100,step=1,label="Max Tasks",scale=1)
            nw=gr.Slider(1,10,value=5,step=1,label="Workers",scale=1)
        # --- Controls: resume, fresh start (clears cache), stop ---
        with gr.Row():
            s_btn=gr.Button("▢️ Start (Resume)",variant="primary",size="lg",scale=2)
            f_btn=gr.Button("πŸš€ Fresh Start",variant="secondary",size="lg",scale=2)
            x_btn=gr.Button("⏹️ Stop",variant="stop",size="lg",scale=1)
        status=gr.Textbox(label="Status",interactive=False,max_lines=1)
        # --- Output panes, all refreshed together by the timer below ---
        with gr.Tabs():
            with gr.Tab("πŸ“Š Progress"): p_html=gr.HTML()
            with gr.Tab("πŸ“‹ Results"): t_html=gr.HTML()
            with gr.Tab("πŸ† FINAL Score"): s_html=gr.HTML()
            with gr.Tab("πŸ” Details"): d_html=gr.HTML()
            with gr.Tab("πŸ’Ύ CSV"): c_file=gr.File(label="CSV")
        # Poll the shared eval state every 2 seconds.
        timer=gr.Timer(value=2,active=True)
        timer.tick(fn=_poll,outputs=[p_html,t_html,s_html,d_html,c_file])
        eval_ins=[eval_api_key,judge_api_key,eval_m,judge_m,gf,df,mt,nw]
        # Both start buttons share _start_eval; only the `fresh` flag differs.
        s_btn.click(fn=lambda *a:_start_eval(*a,fresh=False),inputs=eval_ins,outputs=[status])
        f_btn.click(fn=lambda *a:_start_eval(*a,fresh=True),inputs=eval_ins,outputs=[status])
        x_btn.click(fn=_stop,outputs=[status])
        gr.Markdown("""---
<center><b>FINAL Bench v4.0</b> β€” Baseline (Non-AGI) Β· Multi-Provider Β· 100 Tasks Β· 5-Axis Β· 5-Stage<br>
πŸ”’ MetaCog (Self-Correction Protocol): <b>COMING SOON</b><br>
Apache 2.0 Β· <b>Ginigen AI</b> β€” Choi Sunyoung</center>""")
    return app
809
+
810
+ if __name__=="__main__":
811
+ sg,sd={},{}
812
+ for t in ALL_TASKS: sg[t.grade]=sg.get(t.grade,0)+1; sd[t.domain]=sd.get(t.domain,0)+1
813
+ print(f"\n{'='*60}\n FINAL Bench v4.0 β€” Baseline (Non-AGI)\n Multi-Provider: OpenAI / Anthropic / Google\n{'='*60}")
814
+ print(f" {len(ALL_TASKS)} tasks | {len(sd)} domains")
815
+ for g in ["A","B","C"]: print(f" Grade {g} (Γ—{GRADE_WEIGHT[g]}): {sg.get(g,0)}")
816
+ print(f" Judge: GPT-5.2 Structured Output")
817
+ print(f" πŸ”’ MetaCog: COMING SOON\n{'='*60}\n")
818
+ app=create_app()
819
+ app.queue(default_concurrency_limit=2)
820
+ app.launch(server_name="0.0.0.0",server_port=7860)