Update app.py
Browse files
app.py
CHANGED
|
@@ -1,3 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json, os, time, csv, io, re, html, hashlib, sqlite3, threading, random
|
| 2 |
from datetime import datetime
|
| 3 |
from dataclasses import dataclass, field
|
|
@@ -8,312 +17,215 @@ import gradio as gr
|
|
| 8 |
from concurrent.futures import ThreadPoolExecutor
|
| 9 |
from datasets import load_dataset
|
| 10 |
|
| 11 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 12 |
-
# Β§1. Data Structures & Constants
|
| 13 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 14 |
-
|
| 15 |
DOMAIN_INFO = {
|
| 16 |
-
"Mathematics & Logic":
|
| 17 |
-
"
|
| 18 |
-
"
|
| 19 |
-
"
|
| 20 |
-
"
|
| 21 |
-
"
|
| 22 |
-
"
|
| 23 |
-
"
|
| 24 |
-
"Chemistry & Biology": {"icon":"π§¬","color":"#06D6A0"},
|
| 25 |
-
"Language & Writing": {"icon":"βοΈ","color":"#EF476F"},
|
| 26 |
-
"Literature": {"icon":"π","color":"#8338EC"},
|
| 27 |
-
"Art": {"icon":"π¨","color":"#FF006E"},
|
| 28 |
-
"Religion & Mythology": {"icon":"ποΈ","color":"#FFD166"},
|
| 29 |
-
"Ethics": {"icon":"βοΈ","color":"#118AB2"},
|
| 30 |
-
"AI & Technology": {"icon":"π€","color":"#073B4C"},
|
| 31 |
}
|
| 32 |
-
GRADE_WEIGHT
|
| 33 |
-
RUBRIC
|
| 34 |
-
"process_quality":
|
| 35 |
-
"metacognitive_accuracy":
|
| 36 |
-
"error_recovery":
|
| 37 |
-
"integration_depth":
|
| 38 |
-
"final_correctness":
|
| 39 |
}
|
| 40 |
-
AXIS_MAP
|
| 41 |
-
"generalization":
|
| 42 |
-
"reasoning":
|
| 43 |
-
"planning":
|
| 44 |
-
"reliability":
|
| 45 |
-
"safety":
|
| 46 |
}
|
| 47 |
-
AGI_STAGES
|
| 48 |
-
{"stage":1,"name":"FINAL-Partial","label":"Partial Intelligence",
|
| 49 |
-
{"stage":2,"name":"FINAL-Proto",
|
| 50 |
-
{"stage":3,"name":"FINAL-Pre",
|
| 51 |
-
{"stage":4,"name":"FINAL-Pass",
|
| 52 |
-
{"stage":5,"name":"FINAL-Post",
|
| 53 |
]
|
| 54 |
|
| 55 |
@dataclass
|
| 56 |
class FinalTask:
|
| 57 |
-
task_id:str;
|
| 58 |
-
difficulty:str;
|
| 59 |
-
expected_behavior:str;
|
| 60 |
ticos_required:List[str]=field(default_factory=list)
|
| 61 |
metadata:Dict=field(default_factory=dict)
|
| 62 |
|
| 63 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 64 |
-
# Β§2. Load Dataset from HuggingFace
|
| 65 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 66 |
-
|
| 67 |
def load_tasks():
|
| 68 |
print("π₯ Loading FINAL-Bench/Metacognitive from HuggingFace...")
|
| 69 |
try:
|
| 70 |
-
ds
|
| 71 |
-
tasks
|
| 72 |
for row in ds:
|
| 73 |
-
tr
|
| 74 |
-
if isinstance(tr,
|
| 75 |
-
try:
|
| 76 |
-
except:
|
| 77 |
-
tasks.append(FinalTask(
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
hidden_trap=row.get("hidden_trap",""),
|
| 83 |
-
ticos_required=tr if isinstance(tr, list) else [], metadata={}
|
| 84 |
-
))
|
| 85 |
print(f" β
Loaded {len(tasks)} tasks from HuggingFace")
|
| 86 |
return tasks
|
| 87 |
except Exception as e:
|
| 88 |
-
print(f" β οΈ HF load failed: {e}
|
| 89 |
-
for p in ["FINAL_Bench_v3.json","/mnt/user-data/uploads/FINAL_Bench_v3.json",
|
| 90 |
-
os.path.join(os.path.dirname(os.path.abspath(__file__)),"FINAL_Bench_v3.json")]:
|
| 91 |
-
if os.path.exists(p):
|
| 92 |
-
with open(p,"r",encoding="utf-8") as f: data=json.load(f)
|
| 93 |
-
print(f" β
Loaded from {p}")
|
| 94 |
-
return [FinalTask(task_id=t["task_id"],domain=t["domain"],grade=t["grade"],
|
| 95 |
-
ticos_type=t["ticos_type"],difficulty=t["difficulty"],lens=t.get("lens",""),
|
| 96 |
-
title=t["title"],prompt=t["prompt"],expected_behavior=t.get("expected_behavior",""),
|
| 97 |
-
hidden_trap=t.get("hidden_trap",""),ticos_required=t.get("ticos_required",[]),
|
| 98 |
-
metadata=t.get("metadata",{})) for t in data["tasks"]]
|
| 99 |
raise FileNotFoundError("Dataset not found!")
|
| 100 |
|
| 101 |
-
ALL_TASKS
|
| 102 |
print(f"β
FINAL Bench v4.2: {len(ALL_TASKS)} tasks loaded")
|
| 103 |
|
| 104 |
-
# ββββββ
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
"OpenAI": {
|
| 110 |
-
"gpt-5.2": "GPT-5.2 (flagship)",
|
| 111 |
-
"gpt-5-mini": "GPT-5 Mini",
|
| 112 |
-
"gpt-4.1": "GPT-4.1",
|
| 113 |
-
"o4-mini": "o4-mini (reasoning)",
|
| 114 |
-
"gpt-4o": "GPT-4o",
|
| 115 |
},
|
| 116 |
-
"Anthropic":
|
| 117 |
-
"claude-opus-4-6":
|
| 118 |
-
"claude-sonnet-4-5-20250929":
|
| 119 |
-
"claude-haiku-4-5-20251001":
|
| 120 |
},
|
| 121 |
-
"Google":
|
| 122 |
-
"gemini-
|
| 123 |
-
"gemini-2.5-pro": "Gemini 2.5 Pro",
|
| 124 |
-
"gemini-2.0-flash": "Gemini 2.0 Flash",
|
| 125 |
},
|
| 126 |
}
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
for
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
DEFAULT_EVAL = "GPT-5.2 (flagship) [OpenAI]"
|
| 135 |
-
DEFAULT_JUDGE = "GPT-5.2 (flagship) [OpenAI]"
|
| 136 |
-
|
| 137 |
def _resolve_model(choice):
|
| 138 |
-
info
|
| 139 |
-
return info.get("id",
|
| 140 |
-
|
| 141 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 142 |
-
# Β§4. Multi-Provider API Clients
|
| 143 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 144 |
|
|
|
|
| 145 |
def _strip_think(text):
|
| 146 |
-
if not text:
|
| 147 |
-
for tag in
|
| 148 |
-
text
|
| 149 |
return text.strip()
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
json_mode=False, json_schema=None):
|
| 155 |
headers={"Content-Type":"application/json","Authorization":f"Bearer {api_key}"}
|
| 156 |
messages=[]
|
| 157 |
-
if system:
|
| 158 |
messages.append({"role":"user","content":prompt})
|
| 159 |
-
payload={"model":model,"max_completion_tokens":max_tokens,
|
| 160 |
-
|
| 161 |
-
if reasoning_effort: payload["reasoning_effort"]=reasoning_effort
|
| 162 |
if json_schema:
|
| 163 |
payload["reasoning_effort"]="none"
|
| 164 |
-
payload["response_format"]={"type":"json_schema",
|
| 165 |
-
"json_schema":{"name":"FINALJudge","strict":True,"schema":json_schema}}
|
| 166 |
elif json_mode:
|
| 167 |
payload["response_format"]={"type":"json_object"}
|
| 168 |
for attempt in range(3):
|
| 169 |
try:
|
| 170 |
-
r=requests.post("https://api.openai.com/v1/chat/completions",
|
| 171 |
-
|
| 172 |
-
r.raise_for_status()
|
| 173 |
-
c=r.json()["choices"][0]["message"]["content"]
|
| 174 |
return _strip_think(c) if c else "[EMPTY]"
|
| 175 |
except requests.exceptions.HTTPError:
|
| 176 |
-
if r.status_code==429:
|
| 177 |
-
try:
|
| 178 |
-
except:
|
| 179 |
-
if attempt<2:
|
| 180 |
return f"[API_ERROR] OpenAI {r.status_code}: {err}"
|
| 181 |
except Exception as e:
|
| 182 |
-
if attempt<2:
|
| 183 |
-
else:
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
headers={
|
| 189 |
-
"Content-Type":"application/json",
|
| 190 |
-
"x-api-key":api_key,
|
| 191 |
-
"anthropic-version":"2023-06-01"
|
| 192 |
-
}
|
| 193 |
messages=[{"role":"user","content":prompt}]
|
| 194 |
payload={"model":model,"max_tokens":max_tokens,"temperature":temperature,"messages":messages}
|
| 195 |
-
if system:
|
| 196 |
for attempt in range(3):
|
| 197 |
try:
|
| 198 |
-
r=requests.post("https://api.anthropic.com/v1/messages",
|
| 199 |
-
|
| 200 |
-
r.raise_for_status()
|
| 201 |
-
resp=r.json()
|
| 202 |
text_parts=[]
|
| 203 |
for block in resp.get("content",[]):
|
| 204 |
-
if block.get("type")=="text":
|
| 205 |
-
text_parts.append(block["text"])
|
| 206 |
c="\n".join(text_parts)
|
| 207 |
return _strip_think(c) if c else "[EMPTY]"
|
| 208 |
except requests.exceptions.HTTPError:
|
| 209 |
-
if r.status_code==429:
|
| 210 |
-
if r.status_code==529:
|
| 211 |
-
try:
|
| 212 |
-
except:
|
| 213 |
return f"[API_ERROR] Claude {r.status_code}: {err}"
|
| 214 |
except Exception as e:
|
| 215 |
-
if attempt<2:
|
| 216 |
-
else:
|
| 217 |
-
|
| 218 |
-
#
|
| 219 |
-
GEMINI_API_BASE
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
"""
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
"""
|
| 228 |
-
|
| 229 |
-
headers = {
|
| 230 |
-
"Content-Type": "application/json",
|
| 231 |
-
"x-goog-api-key": api_key,
|
| 232 |
-
}
|
| 233 |
-
contents = [{"role": "user", "parts": [{"text": prompt}]}]
|
| 234 |
-
gen_config = {"maxOutputTokens": max_tokens, "temperature": temperature}
|
| 235 |
-
payload = {"contents": contents, "generationConfig": gen_config}
|
| 236 |
-
if system:
|
| 237 |
-
payload["systemInstruction"] = {"parts": [{"text": system}]}
|
| 238 |
-
if json_mode:
|
| 239 |
-
gen_config["responseMimeType"] = "application/json"
|
| 240 |
for attempt in range(3):
|
| 241 |
try:
|
| 242 |
-
r
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
data = r.json()
|
| 246 |
-
candidates = data.get("candidates", [])
|
| 247 |
if not candidates:
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
result = []
|
| 253 |
for p in parts:
|
| 254 |
if "text" in p:
|
| 255 |
-
if p.get("thought",
|
| 256 |
-
continue # β
thinking part skip
|
| 257 |
result.append(p["text"])
|
| 258 |
-
c
|
| 259 |
return _strip_think(c) if c else "[EMPTY]"
|
| 260 |
except requests.exceptions.HTTPError:
|
| 261 |
-
|
| 262 |
-
if r.status_code
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
if r.status_code == 503:
|
| 266 |
-
time.sleep(8 * (attempt + 1) + random.uniform(0, 3))
|
| 267 |
-
continue
|
| 268 |
-
try:
|
| 269 |
-
err = r.json().get("error", {}).get("message", "")
|
| 270 |
-
except:
|
| 271 |
-
err = str(r.status_code)
|
| 272 |
print(f" [Gemini] ERROR {r.status_code}: {err[:200]}")
|
| 273 |
return f"[API_ERROR] Gemini {r.status_code}: {err}"
|
| 274 |
except Exception as e:
|
| 275 |
print(f" [Gemini] Exception: {e}")
|
| 276 |
-
if attempt
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
if provider == "OpenAI":
|
| 285 |
-
return call_openai(prompt, system, api_key, model_id, max_tokens, temperature)
|
| 286 |
-
elif provider == "Anthropic":
|
| 287 |
-
return call_anthropic(prompt, system, api_key, model_id, max_tokens, temperature)
|
| 288 |
-
elif provider == "Google":
|
| 289 |
-
# β
Geminiλ temperature=1.0 κΆμ₯ (thinking λͺ¨λΈ)
|
| 290 |
-
return call_gemini(prompt, system, api_key, model_id, max_tokens, temperature=1.0)
|
| 291 |
return f"[API_ERROR] Unknown provider: {provider}"
|
| 292 |
|
| 293 |
-
# ββββββ
|
| 294 |
-
|
| 295 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 296 |
-
|
| 297 |
-
JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
|
| 298 |
Score each rubric using ONLY: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
|
| 299 |
-
|
| 300 |
RUBRIC:
|
| 301 |
process_quality (25%): Systematic step-by-step reasoning. Complete answers score higher.
|
| 302 |
-
metacognitive_accuracy (25%): Confidence calibration. Overconfidence=0.25 max.
|
| 303 |
error_recovery (20%): EXPLICIT self-correction. Score 0.5+ if ANY self-corrections exist.
|
| 304 |
integration_depth (15%): Multi-perspective synthesis + emergent insights
|
| 305 |
-
final_correctness (15%): Answer accuracy and completeness. INCOMPLETE
|
| 306 |
-
|
| 307 |
-
TICOS BONUSES:
|
| 308 |
-
A_TrapEscape: ID'd ALL hidden traps? B_ContradictionResolution: Resolved both sides?
|
| 309 |
-
C_ProgressiveDiscovery: Revised with new info? D_MultiConstraint: Mapped ALL conflicts?
|
| 310 |
-
E_SelfCorrecting: EXPLICIT backtrack? F_ExpertPanel: Max-depth per perspective?
|
| 311 |
-
G_PivotDetection: Found reversing premise? H_DecisionUnderUncertainty: Scenario matrix?
|
| 312 |
-
|
| 313 |
STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure
|
| 314 |
-
|
| 315 |
-
IMPORTANT: Output ONLY valid JSON with NO extra text:
|
| 316 |
-
{"scores":{"process_quality":X,"metacognitive_accuracy":X,"error_recovery":X,"integration_depth":X,"final_correctness":X},"comment":"<50 words>"}"""
|
| 317 |
|
| 318 |
def _build_judge_schema():
|
| 319 |
sp={k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC}
|
|
@@ -322,132 +234,101 @@ def _build_judge_schema():
|
|
| 322 |
"comment":{"type":"string"}},"required":["scores","comment"],"additionalProperties":False}
|
| 323 |
JUDGE_SCHEMA=_build_judge_schema()
|
| 324 |
|
| 325 |
-
def build_judge_prompt(task,
|
| 326 |
return f"""FINAL Bench Task Evaluation
|
| 327 |
Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.difficulty}
|
| 328 |
TICOS: {task.ticos_type} | Title: {task.title}
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
{task.prompt[:2000]}
|
| 332 |
-
|
| 333 |
-
EXPECTED:
|
| 334 |
-
{task.expected_behavior[:600]}
|
| 335 |
-
|
| 336 |
HIDDEN TRAPS: {task.hidden_trap or 'None'}
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
{response[:17000]}
|
| 340 |
-
|
| 341 |
-
Score: process_quality, metacognitive_accuracy, error_recovery, integration_depth, final_correctness
|
| 342 |
-
Apply {task.ticos_type} bonus criteria.
|
| 343 |
Output ONLY JSON: {{"scores":{{...}},"comment":"..."}}"""
|
| 344 |
|
| 345 |
def _parse_judge_json(text):
|
| 346 |
-
if not text or text.startswith("[API_ERROR") or text=="[EMPTY]":
|
| 347 |
-
|
| 348 |
-
cleaned = _strip_think(text)
|
| 349 |
-
VALID = {0.0, 0.25, 0.5, 0.75, 1.0}
|
| 350 |
-
keys = list(RUBRIC.keys())
|
| 351 |
-
# Method 1: Direct JSON
|
| 352 |
try:
|
| 353 |
-
t
|
| 354 |
-
|
| 355 |
-
data
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
v = float(data["scores"].get(k, 0.5))
|
| 360 |
-
scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 361 |
-
return {"scores": scores, "comment": data.get("comment", "ok")}
|
| 362 |
-
except: pass
|
| 363 |
-
# Method 2: Search JSON
|
| 364 |
try:
|
| 365 |
-
m
|
| 366 |
if m:
|
| 367 |
-
data
|
| 368 |
if "scores" in data:
|
| 369 |
-
scores =
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 373 |
-
return {"scores": scores, "comment": data.get("comment", "parsed")}
|
| 374 |
-
except: pass
|
| 375 |
-
# Method 3: Regex
|
| 376 |
try:
|
| 377 |
-
sc
|
| 378 |
for k in keys:
|
| 379 |
-
m2
|
| 380 |
if m2:
|
| 381 |
-
v
|
| 382 |
-
if 0
|
| 383 |
-
if len(sc)
|
| 384 |
for k in keys:
|
| 385 |
-
if k not in sc:
|
| 386 |
-
return {"scores":
|
| 387 |
-
except:
|
| 388 |
return None
|
| 389 |
|
| 390 |
-
def call_judge(prompt,
|
| 391 |
-
if provider
|
| 392 |
-
raw
|
| 393 |
-
|
| 394 |
-
result
|
| 395 |
-
|
| 396 |
-
raw2 = call_openai(prompt, system=system, api_key=api_key, model=model_id,
|
| 397 |
-
max_tokens=max_tokens, temperature=temperature, json_mode=True)
|
| 398 |
return _parse_judge_json(raw2)
|
| 399 |
-
elif provider
|
| 400 |
-
raw
|
| 401 |
-
max_tokens=max_tokens, temperature=temperature)
|
| 402 |
return _parse_judge_json(raw)
|
| 403 |
-
elif provider
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
if result: return result
|
| 409 |
-
raw2 = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
|
| 410 |
-
max_tokens=max_tokens, temperature=1.0, json_mode=False)
|
| 411 |
return _parse_judge_json(raw2)
|
| 412 |
return None
|
| 413 |
|
| 414 |
-
# ββββββ
|
| 415 |
-
# Β§6. Scoring Engine
|
| 416 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 417 |
-
|
| 418 |
def compute_task_score(scores):
    """Return the rubric-weighted score for one task on a 0-100 scale.

    Each rubric dimension contributes ``score * weight``; dimensions missing
    from *scores* fall back to a neutral 0.5.  Weights come from the
    module-level RUBRIC table, and the weighted sum is scaled to percent and
    rounded to two decimals.
    """
    weighted = 0.0
    for dimension, cfg in RUBRIC.items():
        # Neutral midpoint default keeps a partially-parsed judge result usable.
        weighted += scores.get(dimension, 0.5) * cfg["weight"]
    return round(weighted * 100, 2)
|
| 420 |
|
| 421 |
-
def compute_axis_scores(results,
|
| 422 |
-
tm={t.task_id:t for t in tasks};
|
| 423 |
for an,ai in AXIS_MAP.items():
|
| 424 |
vals=[]
|
| 425 |
for tid,d in results.items():
|
| 426 |
-
if d["score"]<0:
|
| 427 |
t=tm.get(tid)
|
| 428 |
-
if not t:
|
| 429 |
-
try:
|
| 430 |
-
except:
|
| 431 |
rv=[float(sc.get(r,0.5)) for r in ai["rubrics"] if r in sc]
|
| 432 |
w=1.5 if(ai["ticos"] and t.ticos_type in ai["ticos"]) else 1.0
|
| 433 |
-
if rv:
|
| 434 |
ax[an]=round(min(np.mean(vals)*100,100),2) if vals else 0.0
|
| 435 |
return ax
|
| 436 |
|
| 437 |
-
def compute_final_score(results,
|
| 438 |
-
tm={t.task_id:t for t in tasks};
|
| 439 |
for tid,d in results.items():
|
| 440 |
-
if d["score"]<0:
|
| 441 |
t=tm.get(tid)
|
| 442 |
-
if t:
|
| 443 |
da={d:np.mean(v) for d,v in ds.items() if v}
|
| 444 |
gd={}
|
| 445 |
-
for t in tasks:
|
| 446 |
ws,wt=0,0
|
| 447 |
for g,doms in gd.items():
|
| 448 |
w=GRADE_WEIGHT.get(g,1.0)
|
| 449 |
for d in doms:
|
| 450 |
-
if d in da:
|
| 451 |
base=ws/wt if wt>0 else 0
|
| 452 |
axis=compute_axis_scores(results,tasks)
|
| 453 |
av=[max(v,0.01) for v in axis.values()]
|
|
@@ -455,77 +336,52 @@ def compute_final_score(results, tasks):
|
|
| 455 |
har_p=har/100.0
|
| 456 |
return round(base*har_p,2),round(base,2),round(har_p,3),axis,da
|
| 457 |
|
| 458 |
-
def determine_agi_stage(score,
|
| 459 |
all60=all(v>=60 for v in axis.values()) if axis else False
|
| 460 |
for s in reversed(AGI_STAGES):
|
| 461 |
if score>=s["min"]:
|
| 462 |
-
if s["stage"]>=4 and not all60:
|
| 463 |
return s
|
| 464 |
return AGI_STAGES[0]
|
| 465 |
|
| 466 |
-
# ββββββ
|
| 467 |
-
# Β§7. Checkpoint DB
|
| 468 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 469 |
DB_PATH="final_bench_eval.db"
|
| 470 |
def _init_db():
|
| 471 |
-
c=sqlite3.connect(DB_PATH)
|
| 472 |
-
|
| 473 |
-
c.commit(); c.close()
|
| 474 |
-
def _make_run_id(m): return hashlib.md5(f"FINALv42_BL_{m}".encode()).hexdigest()[:12]
|
| 475 |
def _save_result(rid,tid,resp,jresp,sc):
|
| 476 |
-
c=sqlite3.connect(DB_PATH);
|
| 477 |
def _load_all(rid):
|
| 478 |
-
"
|
| 479 |
-
|
| 480 |
-
cur=c.execute("SELECT task_id,model_response,judge_response,weighted_score FROM eval_results WHERE run_id=?",(rid,))
|
| 481 |
-
rows=cur.fetchall(); c.close()
|
| 482 |
-
result = {}
|
| 483 |
for r in rows:
|
| 484 |
-
resp
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
if score <= 0 and (resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED") or resp == "[EMPTY]" or resp.startswith("[ERROR")):
|
| 488 |
-
continue
|
| 489 |
-
result[r[0]] = {"response": resp, "judge": r[2], "score": score}
|
| 490 |
return result
|
| 491 |
def _clear_run(rid):
|
| 492 |
-
c=sqlite3.connect(DB_PATH);
|
| 493 |
_init_db()
|
| 494 |
|
| 495 |
-
# ββββββ
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
out=io.StringIO(); w=csv.writer(out)
|
| 500 |
-
w.writerow(["task_id","domain","grade","ticos_type","difficulty","title",
|
| 501 |
-
"eval_model","judge_model","mode","weighted_score",
|
| 502 |
-
"process_quality","metacognitive_accuracy","error_recovery",
|
| 503 |
-
"integration_depth","final_correctness",
|
| 504 |
-
"judge_comment","response_preview","timestamp"])
|
| 505 |
tm={t.task_id:t for t in tasks}
|
| 506 |
for tid,d in sorted(results.items()):
|
| 507 |
t=tm.get(tid)
|
| 508 |
-
if not t:
|
| 509 |
jd={}
|
| 510 |
-
try:
|
| 511 |
-
except:
|
| 512 |
sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
|
| 513 |
-
cm=(jd.get("comment","") if isinstance(jd,dict) else "")[:200]
|
| 514 |
-
s=
|
| 515 |
-
|
| 516 |
-
w.writerow([tid,t.domain,t.grade,t.ticos_type,t.difficulty,t.title,
|
| 517 |
-
model_name,judge_name,mode,s,
|
| 518 |
-
sc.get("process_quality",""),sc.get("metacognitive_accuracy",""),
|
| 519 |
-
sc.get("error_recovery",""),sc.get("integration_depth",""),
|
| 520 |
-
sc.get("final_correctness",""),
|
| 521 |
-
cm,(d.get("response","") or "")[:300].replace("\n"," "),
|
| 522 |
-
datetime.now().isoformat()])
|
| 523 |
return out.getvalue()
|
| 524 |
|
| 525 |
-
# ββββββ
|
| 526 |
-
|
| 527 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 528 |
-
CSS = """<style>
|
| 529 |
.eval-table{width:100%;border-collapse:collapse;font-size:0.82em}
|
| 530 |
.eval-table th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc;font-size:0.9em}
|
| 531 |
.eval-table td{padding:5px 8px;border-bottom:1px solid #eee}
|
|
@@ -541,403 +397,256 @@ CSS = """<style>
|
|
| 541 |
</style>"""
|
| 542 |
|
| 543 |
def _sc(s):
|
| 544 |
-
if s>=80:
|
| 545 |
-
if s>=60:
|
| 546 |
-
if s>=40:
|
| 547 |
return "#f44336"
|
| 548 |
|
| 549 |
-
def _build_progress_table(results,
|
| 550 |
rows=""
|
| 551 |
for t in tasks:
|
| 552 |
info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
|
| 553 |
gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
|
| 554 |
if t.task_id in results:
|
| 555 |
-
d=results[t.task_id];
|
| 556 |
-
if s<0:
|
| 557 |
-
|
| 558 |
-
elif s==0 and resp and (resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED") or resp=="[EMPTY]"):
|
| 559 |
-
# β
API μλ¬λ₯Ό λͺ
ννκ² νμ
|
| 560 |
err_short=html.escape(resp[:60])
|
| 561 |
rows+=f'<tr style="background:#ffebee"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td colspan="2" style="color:#c62828;font-size:0.75em">π« {err_short}</td></tr>'
|
| 562 |
else:
|
| 563 |
-
c=_sc(s)
|
| 564 |
-
|
| 565 |
-
else:
|
| 566 |
-
rows+=f'<tr style="opacity:0.35"><td>{t.task_id}</td><td>{info["icon"]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td>β³</td><td>β</td></tr>'
|
| 567 |
return f'{CSS}<table class="eval-table"><thead><tr><th>ID</th><th>Domain</th><th>G</th><th>TICOS</th><th>Diff</th><th>Score</th><th>Val</th></tr></thead><tbody>{rows}</tbody></table>'
|
| 568 |
|
| 569 |
-
def _build_summary_card(results,
|
| 570 |
final,base,har_p,axis,dom_avgs=compute_final_score(results,tasks)
|
| 571 |
stage=determine_agi_stage(final,axis)
|
| 572 |
labels={"generalization":"π Generalization","reasoning":"π§ Reasoning","planning":"π Planning","reliability":"π― Reliability","safety":"π‘οΈ Safety"}
|
| 573 |
ax_html=""
|
| 574 |
for an,av in axis.items():
|
| 575 |
-
c=_sc(av)
|
| 576 |
-
ax_html+=f'<div class="axis-row"><span style="width:120px;font-size:0.85em">{labels.get(an,an)}</span><div class="axis-bar"><div class="axis-fill" style="width:{min(av,100)}%;background:{c}"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c}">{av:.1f}</span></div>'
|
| 577 |
gh=""
|
| 578 |
-
for g in
|
| 579 |
-
gd=[t.domain for t in tasks if t.grade==g]
|
| 580 |
-
gs=[
|
| 581 |
-
if gs: a=np.mean(gs); gh+=f'<span style="margin-right:14px">{g}Γ{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
|
| 582 |
done=sum(1 for t in tasks if t.task_id in results)
|
| 583 |
jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
|
| 584 |
-
|
| 585 |
-
api_errs=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]==0 and (results[t.task_id].get("response","") or "").startswith("["))
|
| 586 |
-
# MA-ER Gap
|
| 587 |
ma_vals,er_vals=[],[]
|
| 588 |
for tid,d in results.items():
|
| 589 |
-
if d["score"]<0:
|
| 590 |
try:
|
| 591 |
-
jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else d["judge"]
|
| 592 |
-
sc
|
| 593 |
-
if "
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
avg_ma=
|
| 597 |
-
gap=avg_ma-avg_er; gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
|
| 598 |
gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
|
| 599 |
-
ad=[t.domain for t in tasks if t.grade=="A"]
|
| 600 |
-
asc_vals=[dom_avgs[d] for d in set(ad) if d in dom_avgs]
|
| 601 |
-
aa=np.mean(asc_vals) if asc_vals else 0
|
| 602 |
checks=[("Scoreβ₯80",final>=80),("Axesβ₯60",all(v>=60 for v in axis.values())),(f"A-avgβ₯75({aa:.0f})",aa>=75)]
|
| 603 |
ch="".join([f'<span style="margin-right:8px">{"β
" if ok else "β"}{lb}</span>' for lb,ok in checks])
|
| 604 |
err_html=f'<div style="color:#ff5722;font-size:0.82em;margin-top:4px">β οΈ API Errors: {api_errs} tasks</div>' if api_errs else ""
|
| 605 |
-
return f"""{CSS}<div class="summary-card">
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
<h2 style="margin:6px 0;font-size:1.6em">π€ Baseline FINAL: {final:.1f}</h2>
|
| 609 |
-
<p style="color:#aaa;font-size:0.85em">{stage['label']} Β· Base {base:.1f} Γ HAR {har_p:.3f} Β· {done}/{len(tasks)}{f" Β· JF={jf}" if jf else ""}</p>
|
| 610 |
-
<p style="color:#8af;font-size:0.82em;margin:4px 0">Eval: {eval_label} Β· Judge: {judge_label}</p>
|
| 611 |
-
{err_html}
|
| 612 |
-
</div><hr style="border-color:#333;margin:12px 0">
|
| 613 |
-
<h4 style="color:#aaa;margin:6px 0">π― 5-Axis Scores</h4>{ax_html}
|
| 614 |
-
<hr style="border-color:#333;margin:10px 0">
|
| 615 |
-
<div style="font-size:0.88em">{gh}</div>
|
| 616 |
-
<div style="display:flex;align-items:center;gap:12px;margin:8px 0;padding:8px;background:rgba(255,255,255,0.05);border-radius:8px">
|
| 617 |
-
<span style="font-size:0.85em">MA-ER Gap:</span>
|
| 618 |
-
<span style="font-weight:700;color:{gc}">{gap:.3f}</span>
|
| 619 |
-
<span style="font-size:0.8em;color:{gc}">({gl})</span>
|
| 620 |
-
<span style="font-size:0.78em;color:#888">MA={avg_ma:.3f} ER={avg_er:.3f}</span></div>
|
| 621 |
-
<div style="font-size:0.82em;margin-top:6px">{ch}</div>
|
| 622 |
-
<p style="font-size:0.78em;color:#666;margin-top:8px">{hf_status}</p>
|
| 623 |
-
<div style="background:rgba(233,69,96,0.15);border:1px solid #e94560;border-radius:8px;padding:10px;margin-top:12px">
|
| 624 |
-
<p style="font-size:0.82em;color:#e94560;margin:0">π <b>MetaCog (Self-Correction) evaluation: COMING SOON</b></p>
|
| 625 |
-
<p style="font-size:0.75em;color:#aaa;margin:4px 0 0 0">The 3-Phase Protocol can boost performance up to 70%+ on hardest tasks.</p>
|
| 626 |
-
</div></div>"""
|
| 627 |
-
|
| 628 |
-
def _build_detail_view(results, tasks):
|
| 629 |
items=""
|
| 630 |
for t in tasks:
|
| 631 |
-
if t.task_id not in results:
|
| 632 |
-
d=results[t.task_id];
|
| 633 |
-
|
| 634 |
-
jc=""; ss=""
|
| 635 |
try:
|
| 636 |
-
jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else(d["judge"] or {})
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
ss=" Β· ".join([f"{k.split('_')[0]}={v}" for k,v in sc.items()])
|
| 640 |
-
except: pass
|
| 641 |
-
c=_sc(s) if s>=0 else "#ff9800"; badge=f'{s:.1f}' if s>=0 else "JF"
|
| 642 |
items+=f'<details style="margin:3px 0;border:1px solid #ddd;border-radius:8px;padding:8px"><summary style="cursor:pointer;font-weight:600">{info["icon"]} {t.task_id} [{t.grade}] β <span style="color:{c}">{badge}</span></summary><div style="font-size:0.8em;margin-top:6px"><b>{t.title}</b><br>TICOS: {t.ticos_type} | Scores: {ss}<br>Judge: {jc}<br>Response: {resp}...</div></details>'
|
| 643 |
return CSS+items
|
| 644 |
|
| 645 |
-
# ββββββ
|
| 646 |
-
|
| 647 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 648 |
-
|
| 649 |
-
def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
|
| 650 |
-
judge_api_key, judge_model_id, judge_provider, state):
|
| 651 |
try:
|
| 652 |
sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
|
| 653 |
-
f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. "
|
| 654 |
-
f"If unsure, say so honestly.")
|
| 655 |
print(f" βΆ {task.task_id} β {eval_provider}/{eval_model_id}")
|
| 656 |
-
model_response=call_model(task.prompt,system=sys_p,api_key=eval_api_key,
|
| 657 |
-
|
| 658 |
-
if (model_response.startswith("[API_ERROR") or
|
| 659 |
-
model_response.startswith("[BLOCKED") or
|
| 660 |
-
model_response=="[EMPTY]"):
|
| 661 |
print(f" β {task.task_id}: {model_response[:100]}")
|
| 662 |
-
# β
API μλ¬λ μ μ₯νλ, _load_allμμ μλ μ μΈλ¨
|
| 663 |
_save_result(run_id,task.task_id,model_response,"{}",0)
|
| 664 |
-
with state["lock"]:
|
| 665 |
-
state["done"]+=1
|
| 666 |
-
state["errors"].append(f"{task.task_id}: {model_response[:80]}")
|
| 667 |
return task.task_id,{"response":model_response,"judge":"{}","score":0}
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
jd =
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
jd={"scores":{k:0.0 for k in RUBRIC},"comment":"JUDGE_PARSE_FAILED","failed":True}
|
| 676 |
-
|
| 677 |
-
if jd.get("failed"):
|
| 678 |
-
ws=-1.0; jd["comment"]=f"JF:{jd.get('comment','')}"
|
| 679 |
-
else:
|
| 680 |
-
ws=compute_task_score(jd["scores"])
|
| 681 |
-
with state["lock"]: state["parse_ok"]+=1
|
| 682 |
-
|
| 683 |
jj=json.dumps(jd,ensure_ascii=False)
|
| 684 |
_save_result(run_id,task.task_id,model_response,jj,ws)
|
| 685 |
with state["lock"]:
|
| 686 |
-
state["done"]+=1
|
| 687 |
-
info=DOMAIN_INFO.get(task.domain,{"icon":"?"})
|
| 688 |
state["active"].append(f'{info["icon"]} {task.task_id}')
|
| 689 |
-
if len(state["active"])>10:
|
| 690 |
return task.task_id,{"response":model_response,"judge":jj,"score":ws}
|
| 691 |
except Exception as e:
|
| 692 |
print(f" β {task.task_id} EXCEPTION: {e}")
|
| 693 |
-
with state["lock"]:
|
| 694 |
_save_result(run_id,task.task_id,f"[ERROR] {e}","{}",0)
|
| 695 |
return task.task_id,{"response":f"[ERROR] {e}","judge":"{}","score":0}
|
| 696 |
|
| 697 |
-
# ββββββ
|
| 698 |
-
|
| 699 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 700 |
-
|
| 701 |
-
_EVAL_STATE={
|
| 702 |
-
"running":False,"stop_requested":False,"finished":False,
|
| 703 |
-
"run_id":"","eval_label":"","judge_label":"","done":0,"total":0,"cached":0,
|
| 704 |
-
"errors":[],"active":[],"parse_ok":0,"parse_fail":0,
|
| 705 |
-
"start_time":0,"results":{},"tasks":[],
|
| 706 |
-
"grade_done":{},"grade_total":{},
|
| 707 |
-
"lock":threading.Lock(),"message":"","csv_path":None,"hf_status":"","n_workers":5,
|
| 708 |
-
}
|
| 709 |
|
| 710 |
def _reset():
|
| 711 |
-
with _EVAL_STATE["lock"]:
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
gb=""
|
| 720 |
-
for g in ["A","B","C"]:
|
| 721 |
-
gt=state["grade_total"].get(g,0); gd=state["grade_done"].get(g,0)
|
| 722 |
-
if gt==0: continue
|
| 723 |
-
gp=min(int(gd/gt*100),100)
|
| 724 |
-
c="#4caf50" if gp==100 else("#1976d2" if gp>0 else "#e0e0e0")
|
| 725 |
emoji="π
°οΈ" if g=="A" else "π
±οΈ" if g=="B" else "π
ΎοΈ"
|
| 726 |
gb+=f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:100px;font-size:0.85em">{emoji} {g}Γ{GRADE_WEIGHT[g]}</span><div style="flex:1;background:#e0e0e0;border-radius:6px;height:14px;overflow:hidden"><div style="width:{gp}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:55px;font-size:0.82em;text-align:right;color:{c}">{gd}/{gt}</span></div>'
|
| 727 |
o=f'<div style="margin:8px 0"><div style="display:flex;justify-content:space-between;font-size:0.95em;margin-bottom:6px"><span>β‘ <b>π€ Baseline</b> β {done}/{pending}</span><span style="font-weight:700">{pct}%</span></div><div class="progress-bar"><div class="progress-fill" style="width:{pct}%"></div></div>{gb}'
|
| 728 |
ac=state.get("active",[])
|
| 729 |
-
if ac:
|
| 730 |
er=state.get("errors",[])
|
| 731 |
if er:
|
| 732 |
-
o+=
|
| 733 |
-
for e in er[-6:]:
|
| 734 |
-
o+=f'<div>β οΈ {html.escape(e[:100])}</div>'
|
| 735 |
o+='</div>'
|
| 736 |
return o+'</div>'
|
| 737 |
|
| 738 |
-
def _bg_eval(eval_api_key,
|
| 739 |
-
judge_api_key, judge_model_id, judge_provider, judge_label,
|
| 740 |
-
tasks, run_id, n_workers):
|
| 741 |
global _EVAL_STATE
|
| 742 |
try:
|
| 743 |
-
with _EVAL_STATE["lock"]:
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
cached=
|
| 749 |
-
pending=[t for t in tasks if t.task_id not in results]
|
| 750 |
-
print(f" π Cached (valid): {cached} / Pending: {len(pending)} / Total: {len(tasks)}")
|
| 751 |
-
gt={}
|
| 752 |
-
for t in pending: gt.setdefault(t.grade,[]).append(t)
|
| 753 |
-
with _EVAL_STATE["lock"]:
|
| 754 |
-
_EVAL_STATE["results"]=results; _EVAL_STATE["cached"]=cached
|
| 755 |
-
_EVAL_STATE["total"]=len(pending)
|
| 756 |
-
_EVAL_STATE["grade_total"]={g:len(ts) for g,ts in gt.items()}
|
| 757 |
-
_EVAL_STATE["grade_done"]={g:0 for g in gt}
|
| 758 |
-
_EVAL_STATE["done"]=0; _EVAL_STATE["errors"]=[]; _EVAL_STATE["active"]=[]
|
| 759 |
if pending:
|
| 760 |
with ThreadPoolExecutor(max_workers=n_workers) as ex:
|
| 761 |
futs={}
|
| 762 |
for t in pending:
|
| 763 |
-
if _EVAL_STATE["stop_requested"]:
|
| 764 |
-
futs[ex.submit(_eval_single,t,run_id,
|
| 765 |
-
eval_api_key,eval_model_id,eval_provider,
|
| 766 |
-
judge_api_key,judge_model_id,judge_provider,
|
| 767 |
-
_EVAL_STATE)]=t
|
| 768 |
done_set=set()
|
| 769 |
while len(done_set)<len(futs):
|
| 770 |
-
if _EVAL_STATE["stop_requested"]:
|
| 771 |
for f in list(futs):
|
| 772 |
-
if f in done_set:
|
| 773 |
if f.done():
|
| 774 |
done_set.add(f)
|
| 775 |
try:
|
| 776 |
tid,data=f.result()
|
| 777 |
-
with _EVAL_STATE["lock"]:
|
| 778 |
-
|
| 779 |
-
to=futs[f]; _EVAL_STATE["grade_done"][to.grade]=_EVAL_STATE["grade_done"].get(to.grade,0)+1
|
| 780 |
-
except: pass
|
| 781 |
time.sleep(0.5)
|
| 782 |
-
with _EVAL_STATE["lock"]:
|
| 783 |
-
final,base,har,axis,_=compute_final_score(results,tasks)
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
cp=f"/tmp/final_{run_id}.csv"
|
| 787 |
-
with open(cp,"w",encoding="utf-8") as f: f.write(csv_str)
|
| 788 |
elapsed=int(time.time()-_EVAL_STATE["start_time"])
|
| 789 |
-
with _EVAL_STATE["lock"]:
|
| 790 |
-
_EVAL_STATE["csv_path"]=cp; _EVAL_STATE["hf_status"]=""
|
| 791 |
-
_EVAL_STATE["message"]=f"π {stage['name']} β FINAL={final:.1f} Β· {elapsed}s"
|
| 792 |
-
_EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
|
| 793 |
except Exception as e:
|
| 794 |
-
print(f" β Fatal: {e}")
|
| 795 |
-
|
| 796 |
-
with _EVAL_STATE["lock"]:
|
| 797 |
-
_EVAL_STATE["message"]=f"β Fatal: {str(e)[:100]}"
|
| 798 |
-
_EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
|
| 799 |
-
|
| 800 |
-
def _start_eval(eval_api_key, judge_api_key, eval_model_choice, judge_model_choice,
|
| 801 |
-
grade_f, diff_f, max_t, n_w, fresh):
|
| 802 |
-
global _EVAL_STATE
|
| 803 |
-
if _EVAL_STATE["running"]: return "β οΈ Already running"
|
| 804 |
-
eval_api_key=(eval_api_key or "").strip()
|
| 805 |
-
judge_api_key=(judge_api_key or "").strip()
|
| 806 |
-
|
| 807 |
-
eval_model_id, eval_provider = _resolve_model(eval_model_choice)
|
| 808 |
-
judge_model_id, judge_provider = _resolve_model(judge_model_choice)
|
| 809 |
-
|
| 810 |
-
if not eval_api_key: return f"β {eval_provider} API Key required for Eval model"
|
| 811 |
-
if not judge_api_key: return f"β {judge_provider} API Key required for Judge model"
|
| 812 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 813 |
tasks=ALL_TASKS[:]
|
| 814 |
-
if grade_f!="All":
|
| 815 |
-
if diff_f!="All":
|
| 816 |
-
tasks=tasks[:int(max_t)]
|
| 817 |
-
|
| 818 |
-
if fresh: _clear_run(rid)
|
| 819 |
_reset()
|
| 820 |
-
with _EVAL_STATE["lock"]:
|
| 821 |
-
|
| 822 |
-
"eval_label":eval_model_choice,"judge_label":judge_model_choice,
|
| 823 |
-
"tasks":tasks,"total":len(tasks),"n_workers":int(n_w)})
|
| 824 |
-
threading.Thread(target=_bg_eval,daemon=True,
|
| 825 |
-
args=(eval_api_key,eval_model_id,eval_provider,eval_model_choice,
|
| 826 |
-
judge_api_key,judge_model_id,judge_provider,judge_model_choice,
|
| 827 |
-
tasks,rid,int(n_w))).start()
|
| 828 |
return f"β‘ Started β Eval: {eval_model_choice} Β· Judge: {judge_model_choice} ({len(tasks)} tasks)"
|
| 829 |
|
| 830 |
def _stop():
|
| 831 |
-
if _EVAL_STATE["running"]:
|
| 832 |
return "βΉοΈ Not running"
|
| 833 |
|
| 834 |
def _poll():
|
| 835 |
-
with _EVAL_STATE["lock"]:
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
if running:
|
| 842 |
-
pend=_EVAL_STATE.get("total",0)-_EVAL_STATE.get("cached",0)
|
| 843 |
-
ph=CSS+_prog_html(_EVAL_STATE,pend)
|
| 844 |
-
elif finished:
|
| 845 |
-
ph=f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;font-weight:600">{msg}</div>'
|
| 846 |
-
else: ph=msg
|
| 847 |
-
th=_build_progress_table(results,tasks) if tasks else ""
|
| 848 |
-
sh,dh,co="","",None
|
| 849 |
if finished and tasks:
|
| 850 |
-
el=_EVAL_STATE.get("eval_label","?")
|
| 851 |
-
jl=
|
| 852 |
-
hf_st=_EVAL_STATE.get("hf_status","")
|
| 853 |
-
sh=_build_summary_card(results,tasks,el,jl,hf_st)
|
| 854 |
-
dh=_build_detail_view(results,tasks)
|
| 855 |
-
co=cp
|
| 856 |
return(ph,th,sh,dh,co)
|
| 857 |
|
| 858 |
-
# ββββββ
|
| 859 |
-
|
| 860 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 861 |
-
|
| 862 |
-
HEADER = """
|
| 863 |
-
<div style="text-align:center;padding:16px 0">
|
| 864 |
<h1 style="margin:0;font-size:1.8em">π FINAL Bench v4.2 β Baseline Evaluation</h1>
|
| 865 |
<h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
|
| 866 |
-
<p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto">
|
| 867 |
-
|
| 868 |
-
π€ Baseline (Non-AGI) β Single LLM Evaluation Β· Multi-Provider<br>
|
| 869 |
-
Both <b>Eval</b> and <b>Judge</b> support OpenAI / Anthropic / Google
|
| 870 |
-
</p>
|
| 871 |
<div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
|
| 872 |
<span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI Β· GPT-5.2 / 5-Mini / 4.1 / o4-mini / 4o</span>
|
| 873 |
<span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic Β· Opus 4.6 / Sonnet 4.5 / Haiku 4.5</span>
|
| 874 |
-
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google Β· Gemini
|
| 875 |
-
</div>
|
| 876 |
<div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
|
| 877 |
-
<p style="color:#e94560;font-size:0.85em;margin:0">π <b>MetaCog (Self-Correction Protocol): COMING SOON</b></p>
|
| 878 |
-
</div>
|
| 879 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
|
| 880 |
<a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">π Dataset</a>
|
| 881 |
-
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">π Leaderboard</a>
|
| 882 |
-
</div></div>"""
|
| 883 |
|
| 884 |
def create_app():
|
| 885 |
-
with gr.Blocks(title="FINAL Bench v4.2",
|
| 886 |
-
css=".gradio-container{max-width:1100px !important} header{display:none!important}") as app:
|
| 887 |
gr.HTML(HEADER)
|
| 888 |
-
|
| 889 |
gr.Markdown("### π API Keys")
|
| 890 |
-
gr.HTML('<p style="color:#888;font-size:0.82em;margin:0 0 6px 0">Enter the API key matching each model\'s provider. Same key OK if both use same provider.</p>')
|
| 891 |
with gr.Row():
|
| 892 |
-
eval_api_key=gr.Textbox(label="π€ Eval Model API Key",type="password",
|
| 893 |
-
|
| 894 |
-
info="OpenAI / Anthropic / Google key for eval",scale=3)
|
| 895 |
-
judge_api_key=gr.Textbox(label="βοΈ Judge Model API Key",type="password",
|
| 896 |
-
placeholder="sk-... / sk-ant-... / AIza...",
|
| 897 |
-
info="OpenAI / Anthropic / Google key for judge",scale=3)
|
| 898 |
-
|
| 899 |
gr.Markdown("### π€ Model Selection")
|
| 900 |
with gr.Row():
|
| 901 |
-
eval_m=gr.Dropdown(label="π€ Evaluation Target",choices=MODEL_CHOICES,
|
| 902 |
-
|
| 903 |
-
judge_m=gr.Dropdown(label="βοΈ Judge Model",choices=MODEL_CHOICES,
|
| 904 |
-
value=DEFAULT_JUDGE,info="Model that scores responses",scale=3)
|
| 905 |
-
|
| 906 |
gr.Markdown("### βοΈ Settings")
|
| 907 |
with gr.Row():
|
| 908 |
gf=gr.Dropdown(["All","A","B","C"],value="All",label="Grade Filter",scale=1)
|
| 909 |
-
df=gr.Dropdown(["All","expert","frontier"],value="All",label="Difficulty
|
| 910 |
mt=gr.Slider(1,100,value=100,step=1,label="Max Tasks",scale=1)
|
| 911 |
nw=gr.Slider(1,10,value=5,step=1,label="Workers",scale=1)
|
| 912 |
-
|
| 913 |
with gr.Row():
|
| 914 |
s_btn=gr.Button("βΆοΈ Start (Resume)",variant="primary",size="lg",scale=2)
|
| 915 |
f_btn=gr.Button("π Fresh Start",variant="secondary",size="lg",scale=2)
|
| 916 |
x_btn=gr.Button("βΉοΈ Stop",variant="stop",size="lg",scale=1)
|
| 917 |
status=gr.Textbox(label="Status",interactive=False,max_lines=2)
|
| 918 |
-
|
| 919 |
with gr.Tabs():
|
| 920 |
-
with gr.Tab("π Progress"):
|
| 921 |
-
with gr.Tab("π Results"):
|
| 922 |
-
with gr.Tab("π FINAL Score"):
|
| 923 |
-
with gr.Tab("π Details"):
|
| 924 |
-
with gr.Tab("πΎ CSV"):
|
| 925 |
-
|
| 926 |
timer=gr.Timer(value=2,active=True)
|
| 927 |
timer.tick(fn=_poll,outputs=[p_html,t_html,s_html,d_html,c_file])
|
| 928 |
-
|
| 929 |
eval_ins=[eval_api_key,judge_api_key,eval_m,judge_m,gf,df,mt,nw]
|
| 930 |
s_btn.click(fn=lambda *a:_start_eval(*a,fresh=False),inputs=eval_ins,outputs=[status])
|
| 931 |
f_btn.click(fn=lambda *a:_start_eval(*a,fresh=True),inputs=eval_ins,outputs=[status])
|
| 932 |
x_btn.click(fn=_stop,outputs=[status])
|
|
|
|
|
|
|
| 933 |
|
| 934 |
if __name__=="__main__":
|
| 935 |
sg,sd={},{}
|
| 936 |
-
for t in ALL_TASKS:
|
| 937 |
print(f"\n{'='*60}\n FINAL Bench v4.2 β Baseline (Non-AGI)\n Eval & Judge: OpenAI / Anthropic / Google\n{'='*60}")
|
| 938 |
print(f" {len(ALL_TASKS)} tasks | {len(sd)} domains")
|
| 939 |
-
for g in
|
| 940 |
print(f" π MetaCog: COMING SOON\n{'='*60}\n")
|
| 941 |
-
app=create_app()
|
| 942 |
-
app.
|
| 943 |
-
app.launch(server_name="0.0.0.0",server_port=7860,ssr_mode=False)
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FINAL Bench v4.2 β Baseline (Non-AGI) Evaluation System
|
| 3 |
+
=========================================================
|
| 4 |
+
β
Multi-Provider: OpenAI / Anthropic / Google (Gemini 3 Pro Preview)
|
| 5 |
+
β
Both Eval Model AND Judge Model support all 3 providers
|
| 6 |
+
β
100 Tasks Β· 15 Domains Β· 8 TICOS Types Β· 5-Axis Β· 5-Stage AGI Grade
|
| 7 |
+
β
Dataset: HuggingFace FINAL-Bench/Metacognitive
|
| 8 |
+
Author: Ginigen AI β Choi Sunyoung | License: Apache 2.0
|
| 9 |
+
"""
|
| 10 |
import json, os, time, csv, io, re, html, hashlib, sqlite3, threading, random
|
| 11 |
from datetime import datetime
|
| 12 |
from dataclasses import dataclass, field
|
|
|
|
| 17 |
from concurrent.futures import ThreadPoolExecutor
|
| 18 |
from datasets import load_dataset
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
DOMAIN_INFO = {
|
| 21 |
+
"Mathematics & Logic":{"icon":"π’","color":"#FF6B35"},"Science":{"icon":"π¬","color":"#7B2FF7"},
|
| 22 |
+
"Philosophy":{"icon":"π€","color":"#00B4D8"},"Medicine":{"icon":"π₯","color":"#2EC4B6"},
|
| 23 |
+
"Economics":{"icon":"π","color":"#E63946"},"History":{"icon":"π","color":"#F4A261"},
|
| 24 |
+
"War & Security":{"icon":"π‘οΈ","color":"#264653"},"Space & Physics":{"icon":"π","color":"#6C63FF"},
|
| 25 |
+
"Chemistry & Biology":{"icon":"π§¬","color":"#06D6A0"},"Language & Writing":{"icon":"βοΈ","color":"#EF476F"},
|
| 26 |
+
"Literature":{"icon":"π","color":"#8338EC"},"Art":{"icon":"π¨","color":"#FF006E"},
|
| 27 |
+
"Religion & Mythology":{"icon":"ποΈ","color":"#FFD166"},"Ethics":{"icon":"βοΈ","color":"#118AB2"},
|
| 28 |
+
"AI & Technology":{"icon":"π€","color":"#073B4C"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
}
|
| 30 |
+
GRADE_WEIGHT={"A":1.5,"B":1.0,"C":0.7}
|
| 31 |
+
RUBRIC={
|
| 32 |
+
"process_quality":{"weight":0.25,"desc":"Systematic reasoning transparency"},
|
| 33 |
+
"metacognitive_accuracy":{"weight":0.25,"desc":"Confidence calibration + uncertainty honesty"},
|
| 34 |
+
"error_recovery":{"weight":0.20,"desc":"Mid-analysis self-correction"},
|
| 35 |
+
"integration_depth":{"weight":0.15,"desc":"Multi-perspective synthesis"},
|
| 36 |
+
"final_correctness":{"weight":0.15,"desc":"Answer accuracy and completeness"},
|
| 37 |
}
|
| 38 |
+
AXIS_MAP={
|
| 39 |
+
"generalization":{"rubrics":["process_quality","final_correctness"],"ticos":[]},
|
| 40 |
+
"reasoning":{"rubrics":["process_quality","error_recovery"],"ticos":["E_SelfCorrecting","C_ProgressiveDiscovery"]},
|
| 41 |
+
"planning":{"rubrics":["integration_depth","process_quality"],"ticos":["D_MultiConstraint","H_DecisionUnderUncertainty"]},
|
| 42 |
+
"reliability":{"rubrics":["metacognitive_accuracy"],"ticos":["E_SelfCorrecting","G_PivotDetection"]},
|
| 43 |
+
"safety":{"rubrics":["error_recovery","metacognitive_accuracy"],"ticos":["A_TrapEscape","G_PivotDetection"]},
|
| 44 |
}
|
| 45 |
+
AGI_STAGES=[
|
| 46 |
+
{"stage":1,"name":"FINAL-Partial","label":"Partial Intelligence","min":0,"max":39,"color":"#f44336"},
|
| 47 |
+
{"stage":2,"name":"FINAL-Proto","label":"Proto Intelligence","min":40,"max":59,"color":"#ff9800"},
|
| 48 |
+
{"stage":3,"name":"FINAL-Pre","label":"Pre-AGI","min":60,"max":79,"color":"#2196f3"},
|
| 49 |
+
{"stage":4,"name":"FINAL-Pass","label":"AGI Achieved","min":80,"max":94,"color":"#4caf50"},
|
| 50 |
+
{"stage":5,"name":"FINAL-Post","label":"Operationally Mature AGI","min":95,"max":100,"color":"#9c27b0"},
|
| 51 |
]
|
| 52 |
|
| 53 |
@dataclass
|
| 54 |
class FinalTask:
|
| 55 |
+
task_id:str;domain:str;grade:str;ticos_type:str
|
| 56 |
+
difficulty:str;lens:str;title:str;prompt:str
|
| 57 |
+
expected_behavior:str;hidden_trap:str
|
| 58 |
ticos_required:List[str]=field(default_factory=list)
|
| 59 |
metadata:Dict=field(default_factory=dict)
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
def load_tasks():
|
| 62 |
print("π₯ Loading FINAL-Bench/Metacognitive from HuggingFace...")
|
| 63 |
try:
|
| 64 |
+
ds=load_dataset("FINAL-Bench/Metacognitive",split="train")
|
| 65 |
+
tasks=[]
|
| 66 |
for row in ds:
|
| 67 |
+
tr=row.get("ticos_required",[])
|
| 68 |
+
if isinstance(tr,str):
|
| 69 |
+
try:tr=json.loads(tr)
|
| 70 |
+
except:tr=[x.strip() for x in tr.split(",") if x.strip()]
|
| 71 |
+
tasks.append(FinalTask(task_id=row["task_id"],domain=row["domain"],grade=row["grade"],
|
| 72 |
+
ticos_type=row["ticos_type"],difficulty=row["difficulty"],lens=row.get("lens",""),
|
| 73 |
+
title=row.get("title",row["task_id"]),prompt=row["prompt"],
|
| 74 |
+
expected_behavior=row.get("expected_behavior",""),hidden_trap=row.get("hidden_trap",""),
|
| 75 |
+
ticos_required=tr if isinstance(tr,list) else [],metadata={}))
|
|
|
|
|
|
|
|
|
|
| 76 |
print(f" β
Loaded {len(tasks)} tasks from HuggingFace")
|
| 77 |
return tasks
|
| 78 |
except Exception as e:
|
| 79 |
+
print(f" β οΈ HF load failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
raise FileNotFoundError("Dataset not found!")
|
| 81 |
|
| 82 |
+
ALL_TASKS=load_tasks()
|
| 83 |
print(f"β
FINAL Bench v4.2: {len(ALL_TASKS)} tasks loaded")
|
| 84 |
|
| 85 |
+
# βββ Β§3. Model Registry βββ
|
| 86 |
+
PROVIDER_MODELS={
|
| 87 |
+
"OpenAI":{
|
| 88 |
+
"gpt-5.2":"GPT-5.2 (flagship)","gpt-5-mini":"GPT-5 Mini",
|
| 89 |
+
"gpt-4.1":"GPT-4.1","o4-mini":"o4-mini (reasoning)","gpt-4o":"GPT-4o",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
},
|
| 91 |
+
"Anthropic":{
|
| 92 |
+
"claude-opus-4-6":"Claude Opus 4.6",
|
| 93 |
+
"claude-sonnet-4-5-20250929":"Claude Sonnet 4.5",
|
| 94 |
+
"claude-haiku-4-5-20251001":"Claude Haiku 4.5",
|
| 95 |
},
|
| 96 |
+
"Google":{
|
| 97 |
+
"gemini-3-pro-preview":"Gemini 3 Pro Preview",
|
|
|
|
|
|
|
| 98 |
},
|
| 99 |
}
|
| 100 |
+
ALL_MODELS={}
|
| 101 |
+
for prov,models in PROVIDER_MODELS.items():
|
| 102 |
+
for mid,label in models.items():
|
| 103 |
+
ALL_MODELS[f"{label} [{prov}]"]={"id":mid,"provider":prov}
|
| 104 |
+
MODEL_CHOICES=list(ALL_MODELS.keys())
|
| 105 |
+
DEFAULT_EVAL="GPT-5.2 (flagship) [OpenAI]"
|
| 106 |
+
DEFAULT_JUDGE="GPT-5.2 (flagship) [OpenAI]"
|
|
|
|
|
|
|
|
|
|
| 107 |
def _resolve_model(choice):
    """Map a UI dropdown label to its (model_id, provider) pair.

    Unknown labels fall back to the default OpenAI flagship model.
    """
    entry = ALL_MODELS.get(choice, {})
    model_id = entry.get("id", "gpt-5.2")
    provider = entry.get("provider", "OpenAI")
    return model_id, provider
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
+
# βββ Β§4. API Clients βββ
|
| 112 |
def _strip_think(text):
|
| 113 |
+
if not text:return text
|
| 114 |
+
for tag in['think','thinking','reasoning','reflection']:
|
| 115 |
+
text=re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL)
|
| 116 |
return text.strip()
|
| 117 |
|
| 118 |
+
def call_openai(prompt,system="",api_key="",model="gpt-5.2",
|
| 119 |
+
max_tokens=8192,temperature=0.6,reasoning_effort=None,
|
| 120 |
+
json_mode=False,json_schema=None):
|
|
|
|
| 121 |
headers={"Content-Type":"application/json","Authorization":f"Bearer {api_key}"}
|
| 122 |
messages=[]
|
| 123 |
+
if system:messages.append({"role":"system","content":system})
|
| 124 |
messages.append({"role":"user","content":prompt})
|
| 125 |
+
payload={"model":model,"max_completion_tokens":max_tokens,"temperature":temperature,"messages":messages}
|
| 126 |
+
if reasoning_effort:payload["reasoning_effort"]=reasoning_effort
|
|
|
|
| 127 |
if json_schema:
|
| 128 |
payload["reasoning_effort"]="none"
|
| 129 |
+
payload["response_format"]={"type":"json_schema","json_schema":{"name":"FINALJudge","strict":True,"schema":json_schema}}
|
|
|
|
| 130 |
elif json_mode:
|
| 131 |
payload["response_format"]={"type":"json_object"}
|
| 132 |
for attempt in range(3):
|
| 133 |
try:
|
| 134 |
+
r=requests.post("https://api.openai.com/v1/chat/completions",headers=headers,data=json.dumps(payload),timeout=300)
|
| 135 |
+
r.raise_for_status();c=r.json()["choices"][0]["message"]["content"]
|
|
|
|
|
|
|
| 136 |
return _strip_think(c) if c else "[EMPTY]"
|
| 137 |
except requests.exceptions.HTTPError:
|
| 138 |
+
if r.status_code==429:time.sleep(5*(attempt+1));continue
|
| 139 |
+
try:err=r.json().get("error",{}).get("message","")
|
| 140 |
+
except:err=str(r.status_code)
|
| 141 |
+
if attempt<2:time.sleep(3*(attempt+1));continue
|
| 142 |
return f"[API_ERROR] OpenAI {r.status_code}: {err}"
|
| 143 |
except Exception as e:
|
| 144 |
+
if attempt<2:time.sleep(3*(attempt+1))
|
| 145 |
+
else:return f"[API_ERROR] {e}"
|
| 146 |
+
|
| 147 |
+
def call_anthropic(prompt,system="",api_key="",model="claude-opus-4-6",
|
| 148 |
+
max_tokens=8192,temperature=0.6):
|
| 149 |
+
headers={"Content-Type":"application/json","x-api-key":api_key,"anthropic-version":"2023-06-01"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
messages=[{"role":"user","content":prompt}]
|
| 151 |
payload={"model":model,"max_tokens":max_tokens,"temperature":temperature,"messages":messages}
|
| 152 |
+
if system:payload["system"]=system
|
| 153 |
for attempt in range(3):
|
| 154 |
try:
|
| 155 |
+
r=requests.post("https://api.anthropic.com/v1/messages",headers=headers,data=json.dumps(payload),timeout=300)
|
| 156 |
+
r.raise_for_status();resp=r.json()
|
|
|
|
|
|
|
| 157 |
text_parts=[]
|
| 158 |
for block in resp.get("content",[]):
|
| 159 |
+
if block.get("type")=="text":text_parts.append(block["text"])
|
|
|
|
| 160 |
c="\n".join(text_parts)
|
| 161 |
return _strip_think(c) if c else "[EMPTY]"
|
| 162 |
except requests.exceptions.HTTPError:
|
| 163 |
+
if r.status_code==429:time.sleep(5*(attempt+1));continue
|
| 164 |
+
if r.status_code==529:time.sleep(8*(attempt+1));continue
|
| 165 |
+
try:err=r.json().get("error",{}).get("message","")
|
| 166 |
+
except:err=str(r.status_code)
|
| 167 |
return f"[API_ERROR] Claude {r.status_code}: {err}"
|
| 168 |
except Exception as e:
|
| 169 |
+
if attempt<2:time.sleep(3*(attempt+1))
|
| 170 |
+
else:return f"[API_ERROR] {e}"
|
| 171 |
+
|
| 172 |
+
# β
Gemini β x-goog-api-key header Β· data=json.dumps Β· thinking skip
|
| 173 |
+
GEMINI_API_BASE="https://generativelanguage.googleapis.com/v1beta"
|
| 174 |
+
def call_gemini(prompt,system="",api_key="",model="gemini-3-pro-preview",
|
| 175 |
+
max_tokens=8192,temperature=1.0,json_mode=False):
|
| 176 |
+
url=f"{GEMINI_API_BASE}/models/{model}:generateContent"
|
| 177 |
+
headers={"Content-Type":"application/json","x-goog-api-key":api_key}
|
| 178 |
+
contents=[{"role":"user","parts":[{"text":prompt}]}]
|
| 179 |
+
gen_config={"maxOutputTokens":max_tokens,"temperature":temperature}
|
| 180 |
+
payload={"contents":contents,"generationConfig":gen_config}
|
| 181 |
+
if system:payload["systemInstruction"]={"parts":[{"text":system}]}
|
| 182 |
+
if json_mode:gen_config["responseMimeType"]="application/json"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
for attempt in range(3):
|
| 184 |
try:
|
| 185 |
+
r=requests.post(url,headers=headers,data=json.dumps(payload),timeout=300)
|
| 186 |
+
r.raise_for_status();data=r.json()
|
| 187 |
+
candidates=data.get("candidates",[])
|
|
|
|
|
|
|
| 188 |
if not candidates:
|
| 189 |
+
br=data.get("promptFeedback",{}).get("blockReason","UNKNOWN")
|
| 190 |
+
return f"[API_ERROR] Gemini BLOCKED: {br}"
|
| 191 |
+
parts=candidates[0].get("content",{}).get("parts",[])
|
| 192 |
+
result=[]
|
|
|
|
| 193 |
for p in parts:
|
| 194 |
if "text" in p:
|
| 195 |
+
if p.get("thought",False):continue
|
|
|
|
| 196 |
result.append(p["text"])
|
| 197 |
+
c="\n".join(result) if result else ""
|
| 198 |
return _strip_think(c) if c else "[EMPTY]"
|
| 199 |
except requests.exceptions.HTTPError:
|
| 200 |
+
if r.status_code==429:time.sleep(5*(attempt+1)+random.uniform(0,2));continue
|
| 201 |
+
if r.status_code==503:time.sleep(8*(attempt+1)+random.uniform(0,3));continue
|
| 202 |
+
try:err=r.json().get("error",{}).get("message","")
|
| 203 |
+
except:err=str(r.status_code)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
print(f" [Gemini] ERROR {r.status_code}: {err[:200]}")
|
| 205 |
return f"[API_ERROR] Gemini {r.status_code}: {err}"
|
| 206 |
except Exception as e:
|
| 207 |
print(f" [Gemini] Exception: {e}")
|
| 208 |
+
if attempt<2:time.sleep(3*(attempt+1))
|
| 209 |
+
else:return f"[API_ERROR] Gemini: {e}"
|
| 210 |
+
|
| 211 |
+
def call_model(prompt,system="",api_key="",model_id="gpt-5.2",
               provider="OpenAI",max_tokens=8192,temperature=0.6):
    """Provider-agnostic LLM call dispatcher.

    Routes to the matching provider client; returns the response text or an
    "[API_ERROR] ..." string for an unrecognized provider.
    """
    if provider == "OpenAI":
        return call_openai(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Anthropic":
        return call_anthropic(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Google":
        # Gemini deliberately ignores the caller's temperature; 1.0 is forced.
        return call_gemini(prompt, system, api_key, model_id, max_tokens, temperature=1.0)
    return f"[API_ERROR] Unknown provider: {provider}"
|
| 217 |
|
| 218 |
+
# βββ Β§5. Judge βββ
|
| 219 |
+
JUDGE_SYSTEM="""You are a FINAL Bench judge for AGI-Level Verification.
|
|
|
|
|
|
|
|
|
|
| 220 |
Score each rubric using ONLY: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
|
|
|
|
| 221 |
RUBRIC:
|
| 222 |
process_quality (25%): Systematic step-by-step reasoning. Complete answers score higher.
|
| 223 |
+
metacognitive_accuracy (25%): Confidence calibration. Overconfidence=0.25 max.
|
| 224 |
error_recovery (20%): EXPLICIT self-correction. Score 0.5+ if ANY self-corrections exist.
|
| 225 |
integration_depth (15%): Multi-perspective synthesis + emergent insights
|
| 226 |
+
final_correctness (15%): Answer accuracy and completeness. INCOMPLETE=0.25 max.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure
|
| 228 |
+
Output ONLY valid JSON: {"scores":{"process_quality":X,"metacognitive_accuracy":X,"error_recovery":X,"integration_depth":X,"final_correctness":X},"comment":"<50 words>"}"""
|
|
|
|
|
|
|
| 229 |
|
| 230 |
def _build_judge_schema():
|
| 231 |
sp={k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC}
|
|
|
|
| 234 |
"comment":{"type":"string"}},"required":["scores","comment"],"additionalProperties":False}
|
| 235 |
JUDGE_SCHEMA=_build_judge_schema()
|
| 236 |
|
| 237 |
+
def build_judge_prompt(task,response):
|
| 238 |
return f"""FINAL Bench Task Evaluation
|
| 239 |
Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.difficulty}
|
| 240 |
TICOS: {task.ticos_type} | Title: {task.title}
|
| 241 |
+
PROMPT:\n{task.prompt[:2000]}
|
| 242 |
+
EXPECTED:\n{task.expected_behavior[:600]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
HIDDEN TRAPS: {task.hidden_trap or 'None'}
|
| 244 |
+
RESPONSE TO JUDGE:\n{response[:17000]}
|
| 245 |
+
Score all 5 rubrics. Apply {task.ticos_type} bonus criteria.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
Output ONLY JSON: {{"scores":{{...}},"comment":"..."}}"""
|
| 247 |
|
| 248 |
def _parse_judge_json(text):
|
| 249 |
+
if not text or text.startswith("[API_ERROR") or text=="[EMPTY]":return None
|
| 250 |
+
cleaned=_strip_think(text);VALID={0.0,0.25,0.5,0.75,1.0};keys=list(RUBRIC.keys())
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
try:
|
| 252 |
+
t=re.sub(r'^```(?:json)?\s*','',cleaned.strip());t=re.sub(r'\s*```$','',t.strip())
|
| 253 |
+
data=json.loads(t)
|
| 254 |
+
if "scores" in data and isinstance(data["scores"],dict):
|
| 255 |
+
scores={k:min(VALID,key=lambda x,v=float(data["scores"].get(k,0.5)):abs(x-v)) for k in keys}
|
| 256 |
+
return {"scores":scores,"comment":data.get("comment","ok")}
|
| 257 |
+
except:pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
try:
|
| 259 |
+
m=re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}',cleaned,re.DOTALL)
|
| 260 |
if m:
|
| 261 |
+
data=json.loads(m.group())
|
| 262 |
if "scores" in data:
|
| 263 |
+
scores={k:min(VALID,key=lambda x,v=float(data["scores"].get(k,0.5)):abs(x-v)) for k in keys}
|
| 264 |
+
return {"scores":scores,"comment":data.get("comment","parsed")}
|
| 265 |
+
except:pass
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
try:
|
| 267 |
+
sc={}
|
| 268 |
for k in keys:
|
| 269 |
+
m2=re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)',cleaned,re.IGNORECASE)
|
| 270 |
if m2:
|
| 271 |
+
v=float(m2.group(1))
|
| 272 |
+
if 0<=v<=1:sc[k]=min(VALID,key=lambda x,v=v:abs(x-v))
|
| 273 |
+
if len(sc)>=3:
|
| 274 |
for k in keys:
|
| 275 |
+
if k not in sc:sc[k]=0.5
|
| 276 |
+
return {"scores":sc,"comment":"regex_parsed"}
|
| 277 |
+
except:pass
|
| 278 |
return None
|
| 279 |
|
| 280 |
+
def call_judge(prompt,system,api_key,model_id,provider,temperature=0.1,max_tokens=2048):
|
| 281 |
+
if provider=="OpenAI":
|
| 282 |
+
raw=call_openai(prompt,system=system,api_key=api_key,model=model_id,max_tokens=max_tokens,temperature=temperature,json_schema=JUDGE_SCHEMA)
|
| 283 |
+
result=_parse_judge_json(raw)
|
| 284 |
+
if result:return result
|
| 285 |
+
raw2=call_openai(prompt,system=system,api_key=api_key,model=model_id,max_tokens=max_tokens,temperature=temperature,json_mode=True)
|
|
|
|
|
|
|
| 286 |
return _parse_judge_json(raw2)
|
| 287 |
+
elif provider=="Anthropic":
|
| 288 |
+
raw=call_anthropic(prompt,system=system,api_key=api_key,model=model_id,max_tokens=max_tokens,temperature=temperature)
|
|
|
|
| 289 |
return _parse_judge_json(raw)
|
| 290 |
+
elif provider=="Google":
|
| 291 |
+
raw=call_gemini(prompt,system=system,api_key=api_key,model=model_id,max_tokens=max_tokens,temperature=1.0,json_mode=True)
|
| 292 |
+
result=_parse_judge_json(raw)
|
| 293 |
+
if result:return result
|
| 294 |
+
raw2=call_gemini(prompt,system=system,api_key=api_key,model=model_id,max_tokens=max_tokens,temperature=1.0,json_mode=False)
|
|
|
|
|
|
|
|
|
|
| 295 |
return _parse_judge_json(raw2)
|
| 296 |
return None
|
| 297 |
|
| 298 |
+
# βββ Β§6. Scoring βββ
|
|
|
|
|
|
|
|
|
|
| 299 |
def compute_task_score(scores):
    """Weighted rubric sum on a 0-100 scale, rounded to two decimals.

    Rubric keys absent from *scores* default to 0.5 (neutral credit).
    """
    total = 0.0
    for key, info in RUBRIC.items():
        total += scores.get(key, 0.5) * info["weight"]
    return round(total * 100, 2)
|
| 301 |
|
| 302 |
+
def compute_axis_scores(results, tasks):
    """Compute the five AGI-axis scores (0-100) from per-task judge rubrics.

    For each axis in AXIS_MAP, averages the axis-relevant rubric values over
    all successfully judged tasks, applying a 1.5x weight when the task's
    TICOS type is listed for that axis. Tasks with score < 0 (judge failure)
    or with no matching task record are skipped.

    Args:
        results: dict mapping task_id -> {"score": float, "judge": JSON str
            or dict, ...}.
        tasks: iterable of task objects exposing .task_id and .ticos_type.

    Returns:
        dict mapping axis name -> rounded score capped at 100; 0.0 when an
        axis has no usable data.
    """
    task_map = {t.task_id: t for t in tasks}
    axis_scores = {}
    for axis_name, axis_info in AXIS_MAP.items():
        vals = []
        for tid, d in results.items():
            if d["score"] < 0:  # judge failed on this task
                continue
            task = task_map.get(tid)
            if not task:
                continue
            try:
                jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else d["judge"]
                sc = jd.get("scores", {}) if isinstance(jd, dict) else {}
            except Exception:
                # FIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; narrowed to Exception.
                sc = {}
            rubric_vals = [float(sc.get(r, 0.5)) for r in axis_info["rubrics"] if r in sc]
            weight = 1.5 if (axis_info["ticos"] and task.ticos_type in axis_info["ticos"]) else 1.0
            if rubric_vals:
                vals.append(np.mean(rubric_vals) * weight)
        # TICOS weighting can push the mean above 1.0, so cap at 100.
        axis_scores[axis_name] = round(min(np.mean(vals) * 100, 100), 2) if vals else 0.0
    return axis_scores
|
| 317 |
|
| 318 |
+
def compute_final_score(results,tasks):
|
| 319 |
+
tm={t.task_id:t for t in tasks};ds={}
|
| 320 |
for tid,d in results.items():
|
| 321 |
+
if d["score"]<0:continue
|
| 322 |
t=tm.get(tid)
|
| 323 |
+
if t:ds.setdefault(t.domain,[]).append(d["score"])
|
| 324 |
da={d:np.mean(v) for d,v in ds.items() if v}
|
| 325 |
gd={}
|
| 326 |
+
for t in tasks:gd.setdefault(t.grade,set()).add(t.domain)
|
| 327 |
ws,wt=0,0
|
| 328 |
for g,doms in gd.items():
|
| 329 |
w=GRADE_WEIGHT.get(g,1.0)
|
| 330 |
for d in doms:
|
| 331 |
+
if d in da:ws+=da[d]*w;wt+=w
|
| 332 |
base=ws/wt if wt>0 else 0
|
| 333 |
axis=compute_axis_scores(results,tasks)
|
| 334 |
av=[max(v,0.01) for v in axis.values()]
|
|
|
|
| 336 |
har_p=har/100.0
|
| 337 |
return round(base*har_p,2),round(base,2),round(har_p,3),axis,da
|
| 338 |
|
| 339 |
+
def determine_agi_stage(score, axis):
    """Map a FINAL score plus axis profile to an AGI stage descriptor.

    Stages 4 and above additionally require every axis score >= 60; when
    that gate fails, the result is demoted to stage 3 (Pre-AGI).
    """
    axes_ok = bool(axis) and all(v >= 60 for v in axis.values())
    for stage in reversed(AGI_STAGES):
        if score < stage["min"]:
            continue
        if stage["stage"] >= 4 and not axes_ok:
            return AGI_STAGES[2]
        return stage
    return AGI_STAGES[0]
|
| 346 |
|
| 347 |
+
# βββ Β§7. Checkpoint DB βββ
|
|
|
|
|
|
|
| 348 |
DB_PATH="final_bench_eval.db"
|
| 349 |
def _init_db():
    """Create the checkpoint table (keyed on run_id+task_id) if missing."""
    conn = sqlite3.connect(DB_PATH)
    conn.execute(
        "CREATE TABLE IF NOT EXISTS eval_results("
        "run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,"
        "weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))"
    )
    conn.commit()
    conn.close()
|
| 351 |
+
def _make_run_id(m):return hashlib.md5(f"FINALv42_BL_{m}".encode()).hexdigest()[:12]
|
|
|
|
|
|
|
| 352 |
def _save_result(rid, tid, resp, jresp, sc):
    """Upsert one task result (response, judge output, score) into the checkpoint DB."""
    conn = sqlite3.connect(DB_PATH)
    conn.execute(
        "INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",
        (rid, tid, resp, jresp, sc, time.time()),
    )
    conn.commit()
    conn.close()
|
| 354 |
def _load_all(rid):
    """Load cached results for a run from the checkpoint DB.

    Rows whose response is a failure marker ([API_ERROR..., [BLOCKED...,
    [EMPTY], [ERROR...) with a non-positive score are dropped so that those
    tasks are retried on resume.
    """
    conn = sqlite3.connect(DB_PATH)
    cur = conn.execute(
        "SELECT task_id,model_response,judge_response,weighted_score "
        "FROM eval_results WHERE run_id=?",
        (rid,),
    )
    rows = cur.fetchall()
    conn.close()
    loaded = {}
    for task_id, resp, judge, score in rows:
        resp = resp or ""
        failed = (resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED")
                  or resp == "[EMPTY]" or resp.startswith("[ERROR"))
        if score <= 0 and failed:
            continue
        loaded[task_id] = {"response": resp, "judge": judge, "score": score}
    return loaded
|
| 362 |
def _clear_run(rid):
    """Delete every checkpointed result belonging to the given run id."""
    conn = sqlite3.connect(DB_PATH)
    conn.execute("DELETE FROM eval_results WHERE run_id=?", (rid,))
    conn.commit()
    conn.close()
|
| 364 |
_init_db()
|
| 365 |
|
| 366 |
+
# βββ Β§8. CSV Export βββ
|
| 367 |
+
def generate_csv(results,tasks,model_name,judge_name,mode="BASELINE"):
|
| 368 |
+
out=io.StringIO();w=csv.writer(out)
|
| 369 |
+
w.writerow(["task_id","domain","grade","ticos_type","difficulty","title","eval_model","judge_model","mode","weighted_score","process_quality","metacognitive_accuracy","error_recovery","integration_depth","final_correctness","judge_comment","response_preview","timestamp"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
tm={t.task_id:t for t in tasks}
|
| 371 |
for tid,d in sorted(results.items()):
|
| 372 |
t=tm.get(tid)
|
| 373 |
+
if not t:continue
|
| 374 |
jd={}
|
| 375 |
+
try:jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else(d["judge"] or {})
|
| 376 |
+
except:pass
|
| 377 |
sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
|
| 378 |
+
cm=(jd.get("comment","") if isinstance(jd,dict) else "")[:200];s=d["score"]
|
| 379 |
+
if s<0:s=-1;cm=f"JUDGE_FAILED:{cm}"
|
| 380 |
+
w.writerow([tid,t.domain,t.grade,t.ticos_type,t.difficulty,t.title,model_name,judge_name,mode,s,sc.get("process_quality",""),sc.get("metacognitive_accuracy",""),sc.get("error_recovery",""),sc.get("integration_depth",""),sc.get("final_correctness",""),cm,(d.get("response","") or "")[:300].replace("\n"," "),datetime.now().isoformat()])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
return out.getvalue()
|
| 382 |
|
| 383 |
+
# βββ Β§9. HTML Builders βββ
|
| 384 |
+
CSS="""<style>
|
|
|
|
|
|
|
| 385 |
.eval-table{width:100%;border-collapse:collapse;font-size:0.82em}
|
| 386 |
.eval-table th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc;font-size:0.9em}
|
| 387 |
.eval-table td{padding:5px 8px;border-bottom:1px solid #eee}
|
|
|
|
| 397 |
</style>"""
|
| 398 |
|
| 399 |
def _sc(s):
|
| 400 |
+
if s>=80:return "#4caf50"
|
| 401 |
+
if s>=60:return "#ff9800"
|
| 402 |
+
if s>=40:return "#ff5722"
|
| 403 |
return "#f44336"
|
| 404 |
|
| 405 |
+
def _build_progress_table(results,tasks):
|
| 406 |
rows=""
|
| 407 |
for t in tasks:
|
| 408 |
info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
|
| 409 |
gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
|
| 410 |
if t.task_id in results:
|
| 411 |
+
d=results[t.task_id];s=d["score"];resp=d.get("response","")
|
| 412 |
+
if s<0:rows+=f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">β JF</td><td>β</td></tr>'
|
| 413 |
+
elif s==0 and resp and(resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED") or resp=="[EMPTY]"):
|
|
|
|
|
|
|
| 414 |
err_short=html.escape(resp[:60])
|
| 415 |
rows+=f'<tr style="background:#ffebee"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td colspan="2" style="color:#c62828;font-size:0.75em">π« {err_short}</td></tr>'
|
| 416 |
else:
|
| 417 |
+
c=_sc(s);rows+=f'<tr><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
|
| 418 |
+
else:rows+=f'<tr style="opacity:0.35"><td>{t.task_id}</td><td>{info["icon"]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td>β³</td><td>β</td></tr>'
|
|
|
|
|
|
|
| 419 |
return f'{CSS}<table class="eval-table"><thead><tr><th>ID</th><th>Domain</th><th>G</th><th>TICOS</th><th>Diff</th><th>Score</th><th>Val</th></tr></thead><tbody>{rows}</tbody></table>'
|
| 420 |
|
| 421 |
+
def _build_summary_card(results,tasks,eval_label,judge_label,hf_status):
    """Render the final FINAL-score summary card (HTML string).

    Combines the aggregate score, AGI stage badge, 5-axis bars, per-grade
    averages, the MA-ER (metacognitive-accuracy vs error-recovery) gap, and
    pass/fail checks into one card. `hf_status` is a free-form footer line.
    """
    final,base,har_p,axis,dom_avgs=compute_final_score(results,tasks)
    stage=determine_agi_stage(final,axis)
    labels={"generalization":"π Generalization","reasoning":"π§ Reasoning","planning":"π Planning","reliability":"π― Reliability","safety":"π‘οΈ Safety"}
    ax_html=""
    for an,av in axis.items():
        # One horizontal bar per axis, capped at 100% width.
        c=_sc(av);ax_html+=f'<div class="axis-row"><span style="width:120px;font-size:0.85em">{labels.get(an,an)}</span><div class="axis-bar"><div class="axis-fill" style="width:{min(av,100)}%;background:{c}"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c}">{av:.1f}</span></div>'
    gh=""
    for g in["A","B","C"]:
        # Average the domain-level scores for the domains carrying this grade.
        gd=[t.domain for t in tasks if t.grade==g];gs=[dom_avgs[d] for d in set(gd) if d in dom_avgs]
        if gs:a=np.mean(gs);gh+=f'<span style="margin-right:14px">{g}Γ{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
    done=sum(1 for t in tasks if t.task_id in results)
    # score<0 is the judge-failed sentinel; score==0 with a "[..." response is an API error.
    jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
    api_errs=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]==0 and(results[t.task_id].get("response","") or "").startswith("["))
    ma_vals,er_vals=[],[]
    for tid,d in results.items():
        if d["score"]<0:continue
        try:
            jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else d["judge"];sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
            if "metacognitive_accuracy" in sc:ma_vals.append(float(sc["metacognitive_accuracy"]))
            if "error_recovery" in sc:er_vals.append(float(sc["error_recovery"]))
        # Fix: was a bare `except:` which also swallowed SystemExit/KeyboardInterrupt.
        # Malformed judge JSON is expected occasionally; skip it and keep aggregating.
        except Exception:pass
    avg_ma=np.mean(ma_vals) if ma_vals else 0;avg_er=np.mean(er_vals) if er_vals else 0
    # Large positive gap => model *claims* metacognition more than it *acts* on it.
    gap=avg_ma-avg_er;gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
    gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
    ad=[t.domain for t in tasks if t.grade=="A"];asc_vals=[dom_avgs[d] for d in set(ad) if d in dom_avgs];aa=np.mean(asc_vals) if asc_vals else 0
    checks=[("Scoreβ₯80",final>=80),("Axesβ₯60",all(v>=60 for v in axis.values())),(f"A-avgβ₯75({aa:.0f})",aa>=75)]
    ch="".join([f'<span style="margin-right:8px">{"β " if ok else "β"}{lb}</span>' for lb,ok in checks])
    err_html=f'<div style="color:#ff5722;font-size:0.82em;margin-top:4px">β οΈ API Errors: {api_errs} tasks</div>' if api_errs else ""
    return f"""{CSS}<div class="summary-card"><div style="text-align:center"><div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div><h2 style="margin:6px 0;font-size:1.6em">π€ Baseline FINAL: {final:.1f}</h2><p style="color:#aaa;font-size:0.85em">{stage['label']} Β· Base {base:.1f} Γ HAR {har_p:.3f} Β· {done}/{len(tasks)}{f" Β· JF={jf}" if jf else ""}</p><p style="color:#8af;font-size:0.82em;margin:4px 0">Eval: {eval_label} Β· Judge: {judge_label}</p>{err_html}</div><hr style="border-color:#333;margin:12px 0"><h4 style="color:#aaa;margin:6px 0">π― 5-Axis Scores</h4>{ax_html}<hr style="border-color:#333;margin:10px 0"><div style="font-size:0.88em">{gh}</div><div style="display:flex;align-items:center;gap:12px;margin:8px 0;padding:8px;background:rgba(255,255,255,0.05);border-radius:8px"><span style="font-size:0.85em">MA-ER Gap:</span><span style="font-weight:700;color:{gc}">{gap:.3f}</span><span style="font-size:0.8em;color:{gc}">({gl})</span><span style="font-size:0.78em;color:#888">MA={avg_ma:.3f} ER={avg_er:.3f}</span></div><div style="font-size:0.82em;margin-top:6px">{ch}</div><p style="font-size:0.78em;color:#666;margin-top:8px">{hf_status}</p><div style="background:rgba(233,69,96,0.15);border:1px solid #e94560;border-radius:8px;padding:10px;margin-top:12px"><p style="font-size:0.82em;color:#e94560;margin:0">π <b>MetaCog (Self-Correction) evaluation: COMING SOON</b></p></div></div>"""
|
| 451 |
+
|
| 452 |
+
def _build_detail_view(results,tasks):
    """Render a collapsible per-task detail list (HTML string).

    For each evaluated task: a <details> element with the grade, score (or
    "JF" for judge failure), per-rubric judge scores, the judge's comment,
    and the first 500 chars of the model response (HTML-escaped).
    """
    items=""
    for t in tasks:
        if t.task_id not in results:continue
        d=results[t.task_id];info=DOMAIN_INFO.get(t.domain,{"icon":"?"});s=d["score"];resp=html.escape((d.get("response","") or "")[:500])
        jc="";ss=""
        try:
            # Judge payload may be a JSON string or an already-parsed dict (or None).
            jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else(d["judge"] or {});jc=html.escape((jd.get("comment","") if isinstance(jd,dict) else "")[:200]);sc=jd.get("scores",{}) if isinstance(jd,dict) else {};ss=" Β· ".join([f"{k.split('_')[0]}={v}" for k,v in sc.items()])
        # Fix: was a bare `except:` which also swallowed SystemExit/KeyboardInterrupt.
        # A malformed judge payload just leaves the comment/scores empty.
        except Exception:pass
        # Negative score is the judge-failed sentinel -> orange "JF" badge.
        c=_sc(s) if s>=0 else "#ff9800";badge=f'{s:.1f}' if s>=0 else "JF"
        items+=f'<details style="margin:3px 0;border:1px solid #ddd;border-radius:8px;padding:8px"><summary style="cursor:pointer;font-weight:600">{info["icon"]} {t.task_id} [{t.grade}] β <span style="color:{c}">{badge}</span></summary><div style="font-size:0.8em;margin-top:6px"><b>{t.title}</b><br>TICOS: {t.ticos_type} | Scores: {ss}<br>Judge: {jc}<br>Response: {resp}...</div></details>'
    return CSS+items
|
| 464 |
|
| 465 |
+
# βββ Β§10. Evaluation Engine βββ
|
| 466 |
+
def _eval_single(task,run_id,eval_api_key,eval_model_id,eval_provider,judge_api_key,judge_model_id,judge_provider,state):
    """Evaluate one task end-to-end: call the eval model, then the judge.

    Returns (task_id, {"response", "judge", "score"}). Scores: weighted rubric
    score on success, -1.0 when the judge output could not be used ("JF"),
    0 on API error or exception. Progress counters in the shared `state`
    dict are updated under state["lock"]; results are persisted via
    _save_result so interrupted runs can resume.
    """
    try:
        sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
               f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. If unsure, say so honestly.")
        print(f" βΆ {task.task_id} β {eval_provider}/{eval_model_id}")
        model_response=call_model(task.prompt,system=sys_p,api_key=eval_api_key,model_id=eval_model_id,provider=eval_provider,max_tokens=12288)
        # call_model signals failure in-band via sentinel prefixes rather than raising.
        if model_response.startswith("[API_ERROR") or model_response.startswith("[BLOCKED") or model_response=="[EMPTY]":
            print(f" β {task.task_id}: {model_response[:100]}")
            _save_result(run_id,task.task_id,model_response,"{}",0)
            with state["lock"]:state["done"]+=1;state["errors"].append(f"{task.task_id}: {model_response[:80]}")
            return task.task_id,{"response":model_response,"judge":"{}","score":0}
        print(f" β {task.task_id} len={len(model_response)}")
        jp=build_judge_prompt(task,model_response)
        jd=call_judge(jp,system=JUDGE_SYSTEM,api_key=judge_api_key,model_id=judge_model_id,provider=judge_provider)
        # None means the judge reply could not be parsed at all; mark as failed.
        if jd is None:jd={"scores":{k:0.0 for k in RUBRIC},"comment":"JUDGE_PARSE_FAILED","failed":True}
        # -1.0 is the "judge failed" (JF) sentinel consumed by the UI/aggregation.
        if jd.get("failed"):ws=-1.0;jd["comment"]=f"JF:{jd.get('comment','')}"
        else:ws=compute_task_score(jd["scores"]);
        # NOTE(review): parse_ok is incremented even on the judge-failed branch,
        # and state["parse_fail"] is never incremented anywhere — confirm whether
        # this counter placement is intended.
        with state["lock"]:state["parse_ok"]+=1
        jj=json.dumps(jd,ensure_ascii=False)
        _save_result(run_id,task.task_id,model_response,jj,ws)
        with state["lock"]:
            state["done"]+=1;info=DOMAIN_INFO.get(task.domain,{"icon":"?"})
            # Keep only the 10 most recent task badges for the "active" ticker.
            state["active"].append(f'{info["icon"]} {task.task_id}')
            if len(state["active"])>10:state["active"]=state["active"][-10:]
        return task.task_id,{"response":model_response,"judge":jj,"score":ws}
    except Exception as e:
        # Any unexpected failure is recorded as a zero-score [ERROR] result so the
        # worker pool keeps going and the run can still complete.
        print(f" β {task.task_id} EXCEPTION: {e}")
        with state["lock"]:state["done"]+=1;state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
        _save_result(run_id,task.task_id,f"[ERROR] {e}","{}",0)
        return task.task_id,{"response":f"[ERROR] {e}","judge":"{}","score":0}
|
| 496 |
|
| 497 |
+
# βββ Β§11. State Machine βββ
|
| 498 |
+
# Shared mutable state for the single in-process evaluation run; all cross-thread
# mutations go through _EVAL_STATE["lock"]. Read by the UI poller (_poll) and
# written by the background worker (_bg_eval) and per-task workers (_eval_single).
_EVAL_STATE={"running":False,"stop_requested":False,"finished":False,"run_id":"","eval_label":"","judge_label":"","done":0,"total":0,"cached":0,"errors":[],"active":[],"parse_ok":0,"parse_fail":0,"start_time":0,"results":{},"tasks":[],"grade_done":{},"grade_total":{},"lock":threading.Lock(),"message":"","csv_path":None,"hf_status":"","n_workers":5}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
|
| 500 |
def _reset():
    """Return the shared evaluation state to its idle defaults.

    Run identity/configuration keys (run_id, labels, lock, n_workers, total)
    are deliberately left untouched; only progress/result fields are cleared.
    """
    idle_defaults = {
        "running": False, "stop_requested": False, "finished": False,
        "done": 0, "cached": 0, "errors": [], "active": [],
        "parse_ok": 0, "parse_fail": 0, "start_time": 0,
        "results": {}, "tasks": [], "grade_done": {}, "grade_total": {},
        "message": "", "csv_path": None, "hf_status": "",
    }
    with _EVAL_STATE["lock"]:
        _EVAL_STATE.update(idle_defaults)
|
| 502 |
+
|
| 503 |
+
def _prog_html(state,pending):
    """Render the live progress panel (HTML string).

    state: the shared _EVAL_STATE-style dict; pending: number of tasks
    actually being run this session (total minus cached). Shows an overall
    bar, one bar per grade, the recently-active task badges, and the last
    few error lines.
    """
    done=state["done"];pct=min(int(done/max(pending,1)*100),100);gb=""
    for g in["A","B","C"]:
        gt=state["grade_total"].get(g,0);gd=state["grade_done"].get(g,0)
        # Skip grades with no pending tasks in this run.
        if gt==0:continue
        # Green when complete, blue while in progress, grey when untouched.
        gp=min(int(gd/gt*100),100);c="#4caf50" if gp==100 else("#1976d2" if gp>0 else "#e0e0e0")
        emoji="π°οΈ" if g=="A" else "π±οΈ" if g=="B" else "πΎοΈ"
        gb+=f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:100px;font-size:0.85em">{emoji} {g}Γ{GRADE_WEIGHT[g]}</span><div style="flex:1;background:#e0e0e0;border-radius:6px;height:14px;overflow:hidden"><div style="width:{gp}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:55px;font-size:0.82em;text-align:right;color:{c}">{gd}/{gt}</span></div>'
    o=f'<div style="margin:8px 0"><div style="display:flex;justify-content:space-between;font-size:0.95em;margin-bottom:6px"><span>β‘ <b>π€ Baseline</b> β {done}/{pending}</span><span style="font-weight:700">{pct}%</span></div><div class="progress-bar"><div class="progress-fill" style="width:{pct}%"></div></div>{gb}'
    ac=state.get("active",[])
    # Badge strip of the (up to 8) most recently active tasks.
    if ac:o+='<div style="margin-top:8px">π '+" ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em">{a}</span>' for a in ac[-8:]])+'</div>'
    er=state.get("errors",[])
    if er:
        # Scrollable box with the last 6 error lines, escaped for safety.
        o+='<div style="color:#c62828;margin-top:6px;font-size:0.8em;max-height:120px;overflow-y:auto">'
        for e in er[-6:]:o+=f'<div>β οΈ {html.escape(e[:100])}</div>'
        o+='</div>'
    return o+'</div>'
|
| 520 |
|
| 521 |
+
def _bg_eval(eval_api_key,eval_model_id,eval_provider,eval_label,judge_api_key,judge_model_id,judge_provider,judge_label,tasks,run_id,n_workers):
    """Background worker driving a full evaluation run.

    Loads any cached results for `run_id` (resume support), fans the pending
    tasks out over a ThreadPoolExecutor, polls futures until done or a stop
    is requested, then computes the final score, writes the CSV, and flips
    _EVAL_STATE to finished. All shared-state writes hold _EVAL_STATE["lock"].
    """
    global _EVAL_STATE
    try:
        with _EVAL_STATE["lock"]:_EVAL_STATE["start_time"]=time.time();_EVAL_STATE["message"]=f"β‘ Eval: {eval_label} Β· Judge: {judge_label} Β· {len(tasks)} tasks"
        # Resume: anything already persisted for this run_id is reused as-is.
        results=dict(_load_all(run_id));cached=sum(1 for t in tasks if t.task_id in results);pending=[t for t in tasks if t.task_id not in results]
        print(f" π Cached: {cached} / Pending: {len(pending)} / Total: {len(tasks)}")
        # Group pending tasks by grade for the per-grade progress bars.
        gt={};
        for t in pending:gt.setdefault(t.grade,[]).append(t)
        with _EVAL_STATE["lock"]:_EVAL_STATE["results"]=results;_EVAL_STATE["cached"]=cached;_EVAL_STATE["total"]=len(pending);_EVAL_STATE["grade_total"]={g:len(ts) for g,ts in gt.items()};_EVAL_STATE["grade_done"]={g:0 for g in gt};_EVAL_STATE["done"]=0;_EVAL_STATE["errors"]=[];_EVAL_STATE["active"]=[]
        if pending:
            with ThreadPoolExecutor(max_workers=n_workers) as ex:
                futs={}
                for t in pending:
                    if _EVAL_STATE["stop_requested"]:break
                    futs[ex.submit(_eval_single,t,run_id,eval_api_key,eval_model_id,eval_provider,judge_api_key,judge_model_id,judge_provider,_EVAL_STATE)]=t
                # Poll futures manually (rather than as_completed) so a stop
                # request can cancel outstanding work between sweeps.
                done_set=set()
                while len(done_set)<len(futs):
                    if _EVAL_STATE["stop_requested"]:ex.shutdown(wait=False,cancel_futures=True);break
                    for f in list(futs):
                        if f in done_set:continue
                        if f.done():
                            done_set.add(f)
                            try:
                                tid,data=f.result()
                                with _EVAL_STATE["lock"]:_EVAL_STATE["results"][tid]=data;to=futs[f];_EVAL_STATE["grade_done"][to.grade]=_EVAL_STATE["grade_done"].get(to.grade,0)+1
                            # NOTE(review): bare except — it silences both task
                            # exceptions and CancelledError from cancelled futures;
                            # narrowing it could change stop/cancel behavior, so it
                            # is left as-is. Consider logging here. TODO confirm.
                            except:pass
                    time.sleep(0.5)
        with _EVAL_STATE["lock"]:results=dict(_EVAL_STATE["results"])
        final,base,har,axis,_=compute_final_score(results,tasks);stage=determine_agi_stage(final,axis)
        csv_str=generate_csv(results,tasks,eval_label,judge_label,"BASELINE");cp=f"/tmp/final_{run_id}.csv"
        with open(cp,"w",encoding="utf-8") as f:f.write(csv_str)
        elapsed=int(time.time()-_EVAL_STATE["start_time"])
        with _EVAL_STATE["lock"]:_EVAL_STATE["csv_path"]=cp;_EVAL_STATE["hf_status"]="";_EVAL_STATE["message"]=f"π {stage['name']} β FINAL={final:.1f} Β· {elapsed}s";_EVAL_STATE["running"]=False;_EVAL_STATE["finished"]=True
    except Exception as e:
        # Fatal path: surface the error in the UI and mark the run finished so
        # the poller stops showing a progress bar.
        print(f" β Fatal: {e}");import traceback;traceback.print_exc()
        with _EVAL_STATE["lock"]:_EVAL_STATE["message"]=f"β Fatal: {str(e)[:100]}";_EVAL_STATE["running"]=False;_EVAL_STATE["finished"]=True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
|
| 558 |
+
def _start_eval(eval_api_key,judge_api_key,eval_model_choice,judge_model_choice,grade_f,diff_f,max_t,n_w,fresh):
    """Validate inputs, apply task filters, and launch the background run.

    Returns a status string for the UI. `fresh=True` clears any cached
    results for this run id before starting (Fresh Start button); otherwise
    cached results are resumed. The actual work happens in a daemon thread
    running _bg_eval.
    """
    global _EVAL_STATE
    # NOTE(review): this read is outside the lock — a benign race for a UI
    # guard, but worth confirming it's intentional.
    if _EVAL_STATE["running"]:return "β οΈ Already running"
    eval_api_key=(eval_api_key or "").strip();judge_api_key=(judge_api_key or "").strip()
    # Dropdown label -> (model_id, provider); provider decides which key format is needed.
    eval_model_id,eval_provider=_resolve_model(eval_model_choice);judge_model_id,judge_provider=_resolve_model(judge_model_choice)
    if not eval_api_key:return f"β {eval_provider} API Key required for Eval model"
    if not judge_api_key:return f"β {judge_provider} API Key required for Judge model"
    tasks=ALL_TASKS[:]
    if grade_f!="All":tasks=[t for t in tasks if t.grade==grade_f]
    if diff_f!="All":tasks=[t for t in tasks if t.difficulty==diff_f]
    tasks=tasks[:int(max_t)];rid=_make_run_id(eval_model_id)
    # Fresh Start: discard persisted results so every task is re-run.
    if fresh:_clear_run(rid)
    _reset()
    with _EVAL_STATE["lock"]:_EVAL_STATE.update({"running":True,"run_id":rid,"eval_label":eval_model_choice,"judge_label":judge_model_choice,"tasks":tasks,"total":len(tasks),"n_workers":int(n_w)})
    threading.Thread(target=_bg_eval,daemon=True,args=(eval_api_key,eval_model_id,eval_provider,eval_model_choice,judge_api_key,judge_model_id,judge_provider,judge_model_choice,tasks,rid,int(n_w))).start()
    return f"β‘ Started β Eval: {eval_model_choice} Β· Judge: {judge_model_choice} ({len(tasks)} tasks)"
|
| 574 |
|
| 575 |
def _stop():
    """Request cancellation of the in-flight run; returns a UI status string.

    Only sets the stop flag — _bg_eval notices it on its next poll sweep and
    shuts the worker pool down.
    """
    if not _EVAL_STATE["running"]:
        return "βΉοΈ Not running"
    _EVAL_STATE["stop_requested"] = True
    return "βΉοΈ Stopping..."
|
| 578 |
|
| 579 |
def _poll():
    """Timer callback: snapshot shared state and return the five UI outputs.

    Returns (progress_html, table_html, summary_html, detail_html, csv_path).
    Takes a consistent snapshot under the lock, then builds the expensive
    summary/detail views only once the run has finished.
    """
    with _EVAL_STATE["lock"]:running=_EVAL_STATE["running"];finished=_EVAL_STATE["finished"];tasks=_EVAL_STATE.get("tasks",[]);results=dict(_EVAL_STATE.get("results",{}));msg=_EVAL_STATE.get("message","");cp=_EVAL_STATE.get("csv_path")
    # Idle app (no run started yet): show only the hint text.
    if not running and not finished and not results:return("βΉοΈ Configure API keys, select models, then press βΆοΈ Start","","","",None)
    # "total" was set to the pending count in _bg_eval; subtract cached again defensively.
    if running:pend=_EVAL_STATE.get("total",0)-_EVAL_STATE.get("cached",0);ph=CSS+_prog_html(_EVAL_STATE,pend)
    elif finished:ph=f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;font-weight:600">{msg}</div>'
    else:ph=msg
    th=_build_progress_table(results,tasks) if tasks else "";sh,dh,co="","",None
    if finished and tasks:
        el=_EVAL_STATE.get("eval_label","?");jl=_EVAL_STATE.get("judge_label","?");hf_st=_EVAL_STATE.get("hf_status","")
        sh=_build_summary_card(results,tasks,el,jl,hf_st);dh=_build_detail_view(results,tasks);co=cp
    return(ph,th,sh,dh,co)
|
| 590 |
|
| 591 |
+
# βββ Β§12. Gradio App βββ
|
| 592 |
+
HEADER="""<div style="text-align:center;padding:16px 0">
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
<h1 style="margin:0;font-size:1.8em">π FINAL Bench v4.2 β Baseline Evaluation</h1>
|
| 594 |
<h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
|
| 595 |
+
<p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto"><b>100 Tasks Β· 15 Domains Β· 8 TICOS Β· 5-Axis Β· 5-Stage AGI Grade</b><br>
|
| 596 |
+
π€ Baseline (Non-AGI) β Single LLM Evaluation Β· Multi-Provider<br>Both <b>Eval</b> and <b>Judge</b> support OpenAI / Anthropic / Google</p>
|
|
|
|
|
|
|
|
|
|
| 597 |
<div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
|
| 598 |
<span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI Β· GPT-5.2 / 5-Mini / 4.1 / o4-mini / 4o</span>
|
| 599 |
<span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic Β· Opus 4.6 / Sonnet 4.5 / Haiku 4.5</span>
|
| 600 |
+
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google Β· Gemini 3 Pro Preview</span></div>
|
|
|
|
| 601 |
<div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
|
| 602 |
+
<p style="color:#e94560;font-size:0.85em;margin:0">π <b>MetaCog (Self-Correction Protocol): COMING SOON</b></p></div>
|
|
|
|
| 603 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
|
| 604 |
<a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">π Dataset</a>
|
| 605 |
+
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">π Leaderboard</a></div></div>"""
|
|
|
|
| 606 |
|
| 607 |
def create_app():
    """Build and return the Gradio Blocks app.

    Layout: header, API-key row, model-selection row, settings row,
    start/fresh/stop buttons, then a tab set (progress / results / score /
    details / CSV) refreshed every 2 s by a Timer bound to _poll.
    """
    with gr.Blocks(title="FINAL Bench v4.2",css=".gradio-container{max-width:1100px !important} header{display:none!important}") as app:
        gr.HTML(HEADER)
        gr.Markdown("### π API Keys")
        with gr.Row():
            eval_api_key=gr.Textbox(label="π€ Eval Model API Key",type="password",placeholder="sk-... / sk-ant-... / AIza...",info="OpenAI / Anthropic / Google key",scale=3)
            judge_api_key=gr.Textbox(label="βοΈ Judge Model API Key",type="password",placeholder="sk-... / sk-ant-... / AIza...",info="OpenAI / Anthropic / Google key",scale=3)
        gr.Markdown("### π€ Model Selection")
        with gr.Row():
            eval_m=gr.Dropdown(label="π€ Evaluation Target",choices=MODEL_CHOICES,value=DEFAULT_EVAL,scale=3)
            judge_m=gr.Dropdown(label="βοΈ Judge Model",choices=MODEL_CHOICES,value=DEFAULT_JUDGE,scale=3)
        gr.Markdown("### βοΈ Settings")
        with gr.Row():
            gf=gr.Dropdown(["All","A","B","C"],value="All",label="Grade Filter",scale=1)
            df=gr.Dropdown(["All","expert","frontier"],value="All",label="Difficulty",scale=1)
            mt=gr.Slider(1,100,value=100,step=1,label="Max Tasks",scale=1)
            nw=gr.Slider(1,10,value=5,step=1,label="Workers",scale=1)
        with gr.Row():
            s_btn=gr.Button("βΆοΈ Start (Resume)",variant="primary",size="lg",scale=2)
            f_btn=gr.Button("π Fresh Start",variant="secondary",size="lg",scale=2)
            x_btn=gr.Button("βΉοΈ Stop",variant="stop",size="lg",scale=1)
        status=gr.Textbox(label="Status",interactive=False,max_lines=2)
        with gr.Tabs():
            with gr.Tab("π Progress"):p_html=gr.HTML()
            with gr.Tab("π Results"):t_html=gr.HTML()
            with gr.Tab("π FINAL Score"):s_html=gr.HTML()
            with gr.Tab("π Details"):d_html=gr.HTML()
            with gr.Tab("πΎ CSV"):c_file=gr.File(label="CSV")
        # Poll the shared state every 2 seconds to refresh all tabs.
        timer=gr.Timer(value=2,active=True)
        timer.tick(fn=_poll,outputs=[p_html,t_html,s_html,d_html,c_file])
        # Start/Fresh share the same inputs; only the `fresh` flag differs.
        eval_ins=[eval_api_key,judge_api_key,eval_m,judge_m,gf,df,mt,nw]
        s_btn.click(fn=lambda *a:_start_eval(*a,fresh=False),inputs=eval_ins,outputs=[status])
        f_btn.click(fn=lambda *a:_start_eval(*a,fresh=True),inputs=eval_ins,outputs=[status])
        x_btn.click(fn=_stop,outputs=[status])
        gr.Markdown("---\n<center><b>FINAL Bench v4.2</b> Β· Baseline Β· OpenAI / Anthropic / Google Β· Apache 2.0 Β· <b>Ginigen AI</b></center>")
    return app
|
| 643 |
|
| 644 |
if __name__=="__main__":
    # Print a startup banner with per-grade and per-domain task counts,
    # then launch the Gradio app on 0.0.0.0:7860.
    sg,sd={},{}
    for t in ALL_TASKS:sg[t.grade]=sg.get(t.grade,0)+1;sd[t.domain]=sd.get(t.domain,0)+1
    print(f"\n{'='*60}\n FINAL Bench v4.2 β Baseline (Non-AGI)\n Eval & Judge: OpenAI / Anthropic / Google\n{'='*60}")
    print(f" {len(ALL_TASKS)} tasks | {len(sd)} domains")
    for g in["A","B","C"]:print(f" Grade {g} (Γ{GRADE_WEIGHT[g]}): {sg.get(g,0)}")
    print(f" π MetaCog: COMING SOON\n{'='*60}\n")
    # Limit concurrent event handlers so heavy eval runs don't starve the UI.
    app=create_app();app.queue(default_concurrency_limit=2)
    app.launch(server_name="0.0.0.0",server_port=7860,ssr_mode=False)
|
|
|