Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import json, os, time, csv, io, re, html, hashlib, sqlite3, threading
|
| 2 |
from datetime import datetime
|
| 3 |
from dataclasses import dataclass, field
|
| 4 |
from typing import List, Dict
|
|
@@ -99,10 +99,10 @@ def load_tasks():
|
|
| 99 |
raise FileNotFoundError("Dataset not found!")
|
| 100 |
|
| 101 |
ALL_TASKS = load_tasks()
|
| 102 |
-
print(f"โ
FINAL Bench v4.
|
| 103 |
|
| 104 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 105 |
-
# ยง3. Multi-Provider Model Registry
|
| 106 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 107 |
|
| 108 |
PROVIDER_MODELS = {
|
|
@@ -119,13 +119,12 @@ PROVIDER_MODELS = {
|
|
| 119 |
"claude-haiku-4-5-20251001": "Claude Haiku 4.5",
|
| 120 |
},
|
| 121 |
"Google": {
|
| 122 |
-
"gemini-
|
| 123 |
-
"gemini-2.5-pro":
|
| 124 |
-
"gemini-2.
|
| 125 |
},
|
| 126 |
}
|
| 127 |
|
| 128 |
-
# Build unified model list โ used for BOTH eval and judge dropdowns
|
| 129 |
ALL_MODELS = {}
|
| 130 |
for prov, models in PROVIDER_MODELS.items():
|
| 131 |
for mid, label in models.items():
|
|
@@ -136,7 +135,6 @@ DEFAULT_EVAL = "GPT-5.2 (flagship) [OpenAI]"
|
|
| 136 |
DEFAULT_JUDGE = "GPT-5.2 (flagship) [OpenAI]"
|
| 137 |
|
| 138 |
def _resolve_model(choice):
|
| 139 |
-
"""Resolve dropdown choice โ (model_id, provider)"""
|
| 140 |
info = ALL_MODELS.get(choice, {})
|
| 141 |
return info.get("id", "gpt-5.2"), info.get("provider", "OpenAI")
|
| 142 |
|
|
@@ -150,7 +148,7 @@ def _strip_think(text):
|
|
| 150 |
text = re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL)
|
| 151 |
return text.strip()
|
| 152 |
|
| 153 |
-
#
|
| 154 |
def call_openai(prompt, system="", api_key="", model="gpt-5.2",
|
| 155 |
max_tokens=8192, temperature=0.6, reasoning_effort=None,
|
| 156 |
json_mode=False, json_schema=None):
|
|
@@ -171,19 +169,20 @@ def call_openai(prompt, system="", api_key="", model="gpt-5.2",
|
|
| 171 |
try:
|
| 172 |
r=requests.post("https://api.openai.com/v1/chat/completions",
|
| 173 |
headers=headers,data=json.dumps(payload),timeout=300)
|
| 174 |
-
|
| 175 |
-
|
| 176 |
return _strip_think(c) if c else "[EMPTY]"
|
| 177 |
except requests.exceptions.HTTPError:
|
|
|
|
| 178 |
try: err=r.json().get("error",{}).get("message","")
|
| 179 |
except: err=str(r.status_code)
|
| 180 |
if attempt<2: time.sleep(3*(attempt+1)); continue
|
| 181 |
-
return f"[API_ERROR] {err}"
|
| 182 |
except Exception as e:
|
| 183 |
if attempt<2: time.sleep(3*(attempt+1))
|
| 184 |
else: return f"[API_ERROR] {e}"
|
| 185 |
|
| 186 |
-
#
|
| 187 |
def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
|
| 188 |
max_tokens=8192, temperature=0.6):
|
| 189 |
headers={
|
|
@@ -198,8 +197,6 @@ def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
|
|
| 198 |
try:
|
| 199 |
r=requests.post("https://api.anthropic.com/v1/messages",
|
| 200 |
headers=headers,data=json.dumps(payload),timeout=300)
|
| 201 |
-
if r.status_code==429: time.sleep(5*(attempt+1)); continue
|
| 202 |
-
if r.status_code==529: time.sleep(8*(attempt+1)); continue
|
| 203 |
r.raise_for_status()
|
| 204 |
resp=r.json()
|
| 205 |
text_parts=[]
|
|
@@ -209,69 +206,92 @@ def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
|
|
| 209 |
c="\n".join(text_parts)
|
| 210 |
return _strip_think(c) if c else "[EMPTY]"
|
| 211 |
except requests.exceptions.HTTPError:
|
|
|
|
|
|
|
| 212 |
try: err=r.json().get("error",{}).get("message","")
|
| 213 |
except: err=str(r.status_code)
|
| 214 |
-
|
| 215 |
-
return f"[API_ERROR] {err}"
|
| 216 |
except Exception as e:
|
| 217 |
if attempt<2: time.sleep(3*(attempt+1))
|
| 218 |
else: return f"[API_ERROR] {e}"
|
| 219 |
|
| 220 |
-
#
|
| 221 |
GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta"
|
| 222 |
|
| 223 |
-
def call_gemini(prompt, system="", api_key="", model="gemini-
|
| 224 |
max_tokens=8192, temperature=1.0, json_mode=False):
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
}
|
| 230 |
-
contents=[{"role":"user","parts":[{"text":prompt}]}]
|
| 231 |
-
gen_config={"maxOutputTokens":max_tokens,"temperature":temperature}
|
| 232 |
-
payload={"contents":contents,"generationConfig":gen_config}
|
| 233 |
if system:
|
| 234 |
-
payload["systemInstruction"]={"parts":[{"text":system}]}
|
| 235 |
if json_mode:
|
| 236 |
-
gen_config["responseMimeType"]="application/json"
|
| 237 |
for attempt in range(3):
|
| 238 |
try:
|
| 239 |
-
r=requests.post(url,headers=headers,data=json.dumps(payload),timeout=300)
|
| 240 |
-
|
| 241 |
-
if r.status_code==503: time.sleep(8*(attempt+1)); continue
|
| 242 |
r.raise_for_status()
|
| 243 |
-
data=r.json()
|
| 244 |
-
candidates=data.get("candidates",[])
|
| 245 |
if not candidates:
|
| 246 |
-
block_reason=data.get("promptFeedback",{}).get("blockReason","UNKNOWN")
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
| 250 |
for p in parts:
|
| 251 |
if "text" in p:
|
| 252 |
-
if p.get("thought",False):
|
|
|
|
| 253 |
result.append(p["text"])
|
| 254 |
-
c="\n".join(result) if result else ""
|
| 255 |
return _strip_think(c) if c else "[EMPTY]"
|
| 256 |
except requests.exceptions.HTTPError:
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
return f"[API_ERROR] Gemini {r.status_code}: {err}"
|
| 261 |
except Exception as e:
|
| 262 |
-
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
-
#
|
| 266 |
def call_model(prompt, system="", api_key="", model_id="gpt-5.2",
|
| 267 |
provider="OpenAI", max_tokens=8192, temperature=0.6):
|
| 268 |
-
if provider=="OpenAI":
|
| 269 |
-
|
| 270 |
-
elif provider=="
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
return f"[API_ERROR] Unknown provider: {provider}"
|
| 272 |
|
| 273 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 274 |
-
# ยง5. Judge โ Multi-Provider
|
| 275 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 276 |
|
| 277 |
JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
|
|
@@ -292,9 +312,8 @@ G_PivotDetection: Found reversing premise? H_DecisionUnderUncertainty: Scenario
|
|
| 292 |
|
| 293 |
STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure
|
| 294 |
|
| 295 |
-
IMPORTANT: Output ONLY valid JSON with NO extra text
|
| 296 |
-
{"scores":{"process_quality":X,"metacognitive_accuracy":X,"error_recovery":X,"integration_depth":X,"final_correctness":X},"comment":"<50 words>"}
|
| 297 |
-
Where each X is one of: 0.0, 0.25, 0.5, 0.75, 1.0"""
|
| 298 |
|
| 299 |
def _build_judge_schema():
|
| 300 |
sp={k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC}
|
|
@@ -323,18 +342,14 @@ Score: process_quality, metacognitive_accuracy, error_recovery, integration_dept
|
|
| 323 |
Apply {task.ticos_type} bonus criteria.
|
| 324 |
Output ONLY JSON: {{"scores":{{...}},"comment":"..."}}"""
|
| 325 |
|
| 326 |
-
|
| 327 |
def _parse_judge_json(text):
|
| 328 |
-
"""Parse judge response โ dict with scores, works for all providers"""
|
| 329 |
if not text or text.startswith("[API_ERROR") or text=="[EMPTY]":
|
| 330 |
return None
|
| 331 |
cleaned = _strip_think(text)
|
| 332 |
VALID = {0.0, 0.25, 0.5, 0.75, 1.0}
|
| 333 |
keys = list(RUBRIC.keys())
|
| 334 |
-
|
| 335 |
-
# Method 1: Direct JSON parse
|
| 336 |
try:
|
| 337 |
-
# Strip markdown fences
|
| 338 |
t = re.sub(r'^```(?:json)?\s*', '', cleaned.strip())
|
| 339 |
t = re.sub(r'\s*```$', '', t.strip())
|
| 340 |
data = json.loads(t)
|
|
@@ -344,10 +359,8 @@ def _parse_judge_json(text):
|
|
| 344 |
v = float(data["scores"].get(k, 0.5))
|
| 345 |
scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 346 |
return {"scores": scores, "comment": data.get("comment", "ok")}
|
| 347 |
-
except:
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
# Method 2: Find JSON object in text
|
| 351 |
try:
|
| 352 |
m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', cleaned, re.DOTALL)
|
| 353 |
if m:
|
|
@@ -358,62 +371,44 @@ def _parse_judge_json(text):
|
|
| 358 |
v = float(data["scores"].get(k, 0.5))
|
| 359 |
scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 360 |
return {"scores": scores, "comment": data.get("comment", "parsed")}
|
| 361 |
-
except:
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
# Method 3: Regex extraction
|
| 365 |
try:
|
| 366 |
sc = {}
|
| 367 |
for k in keys:
|
| 368 |
m2 = re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)', cleaned, re.IGNORECASE)
|
| 369 |
if m2:
|
| 370 |
v = float(m2.group(1))
|
| 371 |
-
if 0 <= v <= 1:
|
| 372 |
-
sc[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 373 |
if len(sc) >= 3:
|
| 374 |
for k in keys:
|
| 375 |
if k not in sc: sc[k] = 0.5
|
| 376 |
return {"scores": sc, "comment": "regex_parsed"}
|
| 377 |
-
except:
|
| 378 |
-
pass
|
| 379 |
-
|
| 380 |
return None
|
| 381 |
|
| 382 |
-
|
| 383 |
def call_judge(prompt, system, api_key, model_id, provider, temperature=0.1, max_tokens=2048):
|
| 384 |
-
"""โ
Universal Judge โ routes to correct provider with JSON enforcement"""
|
| 385 |
-
|
| 386 |
if provider == "OpenAI":
|
| 387 |
-
# OpenAI: use structured output (best quality)
|
| 388 |
raw = call_openai(prompt, system=system, api_key=api_key, model=model_id,
|
| 389 |
-
max_tokens=max_tokens, temperature=temperature,
|
| 390 |
-
json_schema=JUDGE_SCHEMA)
|
| 391 |
result = _parse_judge_json(raw)
|
| 392 |
-
if result:
|
| 393 |
-
return result
|
| 394 |
-
# Fallback: try without structured output
|
| 395 |
raw2 = call_openai(prompt, system=system, api_key=api_key, model=model_id,
|
| 396 |
max_tokens=max_tokens, temperature=temperature, json_mode=True)
|
| 397 |
return _parse_judge_json(raw2)
|
| 398 |
-
|
| 399 |
elif provider == "Anthropic":
|
| 400 |
-
# Anthropic: prompt-based JSON enforcement
|
| 401 |
raw = call_anthropic(prompt, system=system, api_key=api_key, model=model_id,
|
| 402 |
max_tokens=max_tokens, temperature=temperature)
|
| 403 |
return _parse_judge_json(raw)
|
| 404 |
-
|
| 405 |
elif provider == "Google":
|
| 406 |
-
#
|
| 407 |
raw = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
|
| 408 |
-
max_tokens=max_tokens, temperature=
|
| 409 |
result = _parse_judge_json(raw)
|
| 410 |
-
if result:
|
| 411 |
-
return result
|
| 412 |
-
# Fallback without json_mode
|
| 413 |
raw2 = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
|
| 414 |
-
max_tokens=max_tokens, temperature=
|
| 415 |
return _parse_judge_json(raw2)
|
| 416 |
-
|
| 417 |
return None
|
| 418 |
|
| 419 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
@@ -476,12 +471,23 @@ def _init_db():
|
|
| 476 |
c=sqlite3.connect(DB_PATH)
|
| 477 |
c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))")
|
| 478 |
c.commit(); c.close()
|
| 479 |
-
def _make_run_id(m): return hashlib.md5(f"
|
| 480 |
def _save_result(rid, tid, resp, jresp, sc):
    """Upsert one task result for a run into the eval_results table.

    Parameters: run id, task id, model response text, judge response JSON
    string, and the weighted score; the row timestamp is time.time().
    """
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute(
            "INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",
            (rid, tid, resp, jresp, sc, time.time()),
        )
        c.commit()
    finally:
        # BUGFIX: previously the connection leaked if execute/commit raised.
        c.close()
|
| 482 |
def _load_all(rid):
|
| 483 |
-
|
| 484 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
def _clear_run(rid):
    """Delete every cached result row belonging to run id *rid*."""
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute("DELETE FROM eval_results WHERE run_id=?", (rid,))
        c.commit()
    finally:
        # BUGFIX: previously the connection leaked if execute/commit raised.
        c.close()
|
| 487 |
_init_db()
|
|
@@ -546,9 +552,13 @@ def _build_progress_table(results, tasks):
|
|
| 546 |
info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
|
| 547 |
gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
|
| 548 |
if t.task_id in results:
|
| 549 |
-
|
| 550 |
if s<0:
|
| 551 |
-
rows+=f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">โ</td><td>โ</td></tr>'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
else:
|
| 553 |
c=_sc(s)
|
| 554 |
rows+=f'<tr><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
|
|
@@ -571,6 +581,8 @@ def _build_summary_card(results, tasks, eval_label, judge_label, hf_status):
|
|
| 571 |
if gs: a=np.mean(gs); gh+=f'<span style="margin-right:14px">{g}ร{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
|
| 572 |
done=sum(1 for t in tasks if t.task_id in results)
|
| 573 |
jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
|
|
|
|
|
|
|
| 574 |
# MA-ER Gap
|
| 575 |
ma_vals,er_vals=[],[]
|
| 576 |
for tid,d in results.items():
|
|
@@ -582,21 +594,21 @@ def _build_summary_card(results, tasks, eval_label, judge_label, hf_status):
|
|
| 582 |
if "error_recovery" in sc: er_vals.append(float(sc["error_recovery"]))
|
| 583 |
except: pass
|
| 584 |
avg_ma=np.mean(ma_vals) if ma_vals else 0; avg_er=np.mean(er_vals) if er_vals else 0
|
| 585 |
-
gap=avg_ma-avg_er
|
| 586 |
-
gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
|
| 587 |
gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
|
| 588 |
-
# Pass checks
|
| 589 |
ad=[t.domain for t in tasks if t.grade=="A"]
|
| 590 |
asc_vals=[dom_avgs[d] for d in set(ad) if d in dom_avgs]
|
| 591 |
aa=np.mean(asc_vals) if asc_vals else 0
|
| 592 |
checks=[("Scoreโฅ80",final>=80),("Axesโฅ60",all(v>=60 for v in axis.values())),(f"A-avgโฅ75({aa:.0f})",aa>=75)]
|
| 593 |
ch="".join([f'<span style="margin-right:8px">{"โ
" if ok else "โ"}{lb}</span>' for lb,ok in checks])
|
|
|
|
| 594 |
return f"""{CSS}<div class="summary-card">
|
| 595 |
<div style="text-align:center">
|
| 596 |
<div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div>
|
| 597 |
<h2 style="margin:6px 0;font-size:1.6em">๐ค Baseline FINAL: {final:.1f}</h2>
|
| 598 |
<p style="color:#aaa;font-size:0.85em">{stage['label']} ยท Base {base:.1f} ร HAR {har_p:.3f} ยท {done}/{len(tasks)}{f" ยท JF={jf}" if jf else ""}</p>
|
| 599 |
<p style="color:#8af;font-size:0.82em;margin:4px 0">Eval: {eval_label} ยท Judge: {judge_label}</p>
|
|
|
|
| 600 |
</div><hr style="border-color:#333;margin:12px 0">
|
| 601 |
<h4 style="color:#aaa;margin:6px 0">๐ฏ 5-Axis Scores</h4>{ax_html}
|
| 602 |
<hr style="border-color:#333;margin:10px 0">
|
|
@@ -637,18 +649,24 @@ def _build_detail_view(results, tasks):
|
|
| 637 |
def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
|
| 638 |
judge_api_key, judge_model_id, judge_provider, state):
|
| 639 |
try:
|
| 640 |
-
# 1) Eval model call
|
| 641 |
sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
|
| 642 |
f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. "
|
| 643 |
f"If unsure, say so honestly.")
|
|
|
|
| 644 |
model_response=call_model(task.prompt,system=sys_p,api_key=eval_api_key,
|
| 645 |
model_id=eval_model_id,provider=eval_provider,max_tokens=12288)
|
| 646 |
-
if model_response.startswith("[API_ERROR") or
|
|
|
|
|
|
|
|
|
|
|
|
|
| 647 |
_save_result(run_id,task.task_id,model_response,"{}",0)
|
| 648 |
-
with state["lock"]:
|
|
|
|
|
|
|
| 649 |
return task.task_id,{"response":model_response,"judge":"{}","score":0}
|
| 650 |
|
| 651 |
-
|
| 652 |
jp = build_judge_prompt(task, model_response)
|
| 653 |
jd = call_judge(jp, system=JUDGE_SYSTEM, api_key=judge_api_key,
|
| 654 |
model_id=judge_model_id, provider=judge_provider)
|
|
@@ -671,6 +689,7 @@ def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
|
|
| 671 |
if len(state["active"])>10: state["active"]=state["active"][-10:]
|
| 672 |
return task.task_id,{"response":model_response,"judge":jj,"score":ws}
|
| 673 |
except Exception as e:
|
|
|
|
| 674 |
with state["lock"]: state["done"]+=1; state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
|
| 675 |
_save_result(run_id,task.task_id,f"[ERROR] {e}","{}",0)
|
| 676 |
return task.task_id,{"response":f"[ERROR] {e}","judge":"{}","score":0}
|
|
@@ -709,7 +728,11 @@ def _prog_html(state, pending):
|
|
| 709 |
ac=state.get("active",[])
|
| 710 |
if ac: o+='<div style="margin-top:8px">๐ '+" ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em">{a}</span>' for a in ac[-8:]])+'</div>'
|
| 711 |
er=state.get("errors",[])
|
| 712 |
-
if er:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 713 |
return o+'</div>'
|
| 714 |
|
| 715 |
def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
|
|
@@ -720,9 +743,11 @@ def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
|
|
| 720 |
with _EVAL_STATE["lock"]:
|
| 721 |
_EVAL_STATE["start_time"]=time.time()
|
| 722 |
_EVAL_STATE["message"]=f"โก Eval: {eval_label} ยท Judge: {judge_label} ยท {len(tasks)} tasks"
|
|
|
|
| 723 |
results=dict(_load_all(run_id))
|
| 724 |
cached=sum(1 for t in tasks if t.task_id in results)
|
| 725 |
pending=[t for t in tasks if t.task_id not in results]
|
|
|
|
| 726 |
gt={}
|
| 727 |
for t in pending: gt.setdefault(t.grade,[]).append(t)
|
| 728 |
with _EVAL_STATE["lock"]:
|
|
@@ -766,6 +791,8 @@ def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
|
|
| 766 |
_EVAL_STATE["message"]=f"๐ {stage['name']} โ FINAL={final:.1f} ยท {elapsed}s"
|
| 767 |
_EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
|
| 768 |
except Exception as e:
|
|
|
|
|
|
|
| 769 |
with _EVAL_STATE["lock"]:
|
| 770 |
_EVAL_STATE["message"]=f"โ Fatal: {str(e)[:100]}"
|
| 771 |
_EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
|
|
@@ -834,7 +861,7 @@ def _poll():
|
|
| 834 |
|
| 835 |
HEADER = """
|
| 836 |
<div style="text-align:center;padding:16px 0">
|
| 837 |
-
<h1 style="margin:0;font-size:1.8em">๐ FINAL Bench v4.
|
| 838 |
<h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
|
| 839 |
<p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto">
|
| 840 |
<b>100 Tasks ยท 15 Domains ยท 8 TICOS ยท 5-Axis ยท 5-Stage AGI Grade</b><br>
|
|
@@ -844,45 +871,38 @@ Both <b>Eval</b> and <b>Judge</b> support OpenAI / Anthropic / Google
|
|
| 844 |
<div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
|
| 845 |
<span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI ยท GPT-5.2 / 5-Mini / 4.1 / o4-mini / 4o</span>
|
| 846 |
<span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic ยท Opus 4.6 / Sonnet 4.5 / Haiku 4.5</span>
|
| 847 |
-
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google ยท Gemini
|
| 848 |
</div>
|
| 849 |
<div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
|
| 850 |
<p style="color:#e94560;font-size:0.85em;margin:0">๐ <b>MetaCog (Self-Correction Protocol): COMING SOON</b></p>
|
| 851 |
-
<
|
| 852 |
-
3-Phase Protocol (Initial โ Self-Review โ Correction) โ paper's core contribution.
|
| 853 |
-
</p></div>
|
| 854 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
|
| 855 |
<a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐ Dataset</a>
|
| 856 |
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐ Leaderboard</a>
|
| 857 |
</div></div>"""
|
| 858 |
|
| 859 |
def create_app():
|
| 860 |
-
with gr.Blocks(title="FINAL Bench v4.
|
| 861 |
css=".gradio-container{max-width:1100px !important} header{display:none!important}") as app:
|
| 862 |
gr.HTML(HEADER)
|
| 863 |
|
| 864 |
-
# --- API Keys ---
|
| 865 |
gr.Markdown("### ๐ API Keys")
|
| 866 |
-
gr.HTML('<p style="color:#888;font-size:0.82em;margin:0 0 6px 0">Enter the API key matching each model\'s provider.
|
| 867 |
with gr.Row():
|
| 868 |
eval_api_key=gr.Textbox(label="๐ค Eval Model API Key",type="password",
|
| 869 |
placeholder="sk-... / sk-ant-... / AIza...",
|
| 870 |
-
info="OpenAI / Anthropic / Google key for
|
| 871 |
judge_api_key=gr.Textbox(label="โ๏ธ Judge Model API Key",type="password",
|
| 872 |
placeholder="sk-... / sk-ant-... / AIza...",
|
| 873 |
-
info="OpenAI / Anthropic / Google key for
|
| 874 |
|
| 875 |
-
# --- Model Selection (SAME choices for both) ---
|
| 876 |
gr.Markdown("### ๐ค Model Selection")
|
| 877 |
with gr.Row():
|
| 878 |
eval_m=gr.Dropdown(label="๐ค Evaluation Target",choices=MODEL_CHOICES,
|
| 879 |
-
value=DEFAULT_EVAL,
|
| 880 |
-
info="Model to be evaluated on FINAL Bench tasks",scale=3)
|
| 881 |
judge_m=gr.Dropdown(label="โ๏ธ Judge Model",choices=MODEL_CHOICES,
|
| 882 |
-
value=DEFAULT_JUDGE,
|
| 883 |
-
info="Model that scores the evaluation responses",scale=3)
|
| 884 |
|
| 885 |
-
# --- Settings ---
|
| 886 |
gr.Markdown("### โ๏ธ Settings")
|
| 887 |
with gr.Row():
|
| 888 |
gf=gr.Dropdown(["All","A","B","C"],value="All",label="Grade Filter",scale=1)
|
|
@@ -890,14 +910,12 @@ def create_app():
|
|
| 890 |
mt=gr.Slider(1,100,value=100,step=1,label="Max Tasks",scale=1)
|
| 891 |
nw=gr.Slider(1,10,value=5,step=1,label="Workers",scale=1)
|
| 892 |
|
| 893 |
-
# --- Buttons ---
|
| 894 |
with gr.Row():
|
| 895 |
s_btn=gr.Button("โถ๏ธ Start (Resume)",variant="primary",size="lg",scale=2)
|
| 896 |
f_btn=gr.Button("๐ Fresh Start",variant="secondary",size="lg",scale=2)
|
| 897 |
x_btn=gr.Button("โน๏ธ Stop",variant="stop",size="lg",scale=1)
|
| 898 |
-
status=gr.Textbox(label="Status",interactive=False,max_lines=
|
| 899 |
|
| 900 |
-
# --- Results ---
|
| 901 |
with gr.Tabs():
|
| 902 |
with gr.Tab("๐ Progress"): p_html=gr.HTML()
|
| 903 |
with gr.Tab("๐ Results"): t_html=gr.HTML()
|
|
@@ -913,17 +931,10 @@ def create_app():
|
|
| 913 |
f_btn.click(fn=lambda *a:_start_eval(*a,fresh=True),inputs=eval_ins,outputs=[status])
|
| 914 |
x_btn.click(fn=_stop,outputs=[status])
|
| 915 |
|
| 916 |
-
gr.Markdown("""---
|
| 917 |
-
<center><b>FINAL Bench v4.1</b> โ Baseline (Non-AGI) ยท Multi-Provider Eval & Judge<br>
|
| 918 |
-
100 Tasks ยท 5-Axis ยท 5-Stage ยท OpenAI / Anthropic / Google<br>
|
| 919 |
-
๐ MetaCog (Self-Correction Protocol): <b>COMING SOON</b><br>
|
| 920 |
-
Apache 2.0 ยท <b>Ginigen AI</b> โ Choi Sunyoung</center>""")
|
| 921 |
-
return app
|
| 922 |
-
|
| 923 |
if __name__=="__main__":
|
| 924 |
sg,sd={},{}
|
| 925 |
for t in ALL_TASKS: sg[t.grade]=sg.get(t.grade,0)+1; sd[t.domain]=sd.get(t.domain,0)+1
|
| 926 |
-
print(f"\n{'='*60}\n FINAL Bench v4.
|
| 927 |
print(f" {len(ALL_TASKS)} tasks | {len(sd)} domains")
|
| 928 |
for g in ["A","B","C"]: print(f" Grade {g} (ร{GRADE_WEIGHT[g]}): {sg.get(g,0)}")
|
| 929 |
print(f" ๐ MetaCog: COMING SOON\n{'='*60}\n")
|
|
|
|
| 1 |
+
import json, os, time, csv, io, re, html, hashlib, sqlite3, threading, random
|
| 2 |
from datetime import datetime
|
| 3 |
from dataclasses import dataclass, field
|
| 4 |
from typing import List, Dict
|
|
|
|
| 99 |
raise FileNotFoundError("Dataset not found!")
|
| 100 |
|
| 101 |
ALL_TASKS = load_tasks()
|
| 102 |
+
print(f"โ
FINAL Bench v4.2: {len(ALL_TASKS)} tasks loaded")
|
| 103 |
|
| 104 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 105 |
+
# ยง3. Multi-Provider Model Registry
|
| 106 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 107 |
|
| 108 |
PROVIDER_MODELS = {
|
|
|
|
| 119 |
"claude-haiku-4-5-20251001": "Claude Haiku 4.5",
|
| 120 |
},
|
| 121 |
"Google": {
|
| 122 |
+
"gemini-2.5-flash": "Gemini 2.5 Flash",
|
| 123 |
+
"gemini-2.5-pro": "Gemini 2.5 Pro",
|
| 124 |
+
"gemini-2.0-flash": "Gemini 2.0 Flash",
|
| 125 |
},
|
| 126 |
}
|
| 127 |
|
|
|
|
| 128 |
ALL_MODELS = {}
|
| 129 |
for prov, models in PROVIDER_MODELS.items():
|
| 130 |
for mid, label in models.items():
|
|
|
|
| 135 |
DEFAULT_JUDGE = "GPT-5.2 (flagship) [OpenAI]"
|
| 136 |
|
| 137 |
def _resolve_model(choice):
    """Resolve a model-dropdown choice label to a (model_id, provider) pair.

    Falls back to ("gpt-5.2", "OpenAI") when *choice* is not a known
    ALL_MODELS key (e.g. an empty or stale dropdown value).
    """
    info = ALL_MODELS.get(choice, {})
    return info.get("id", "gpt-5.2"), info.get("provider", "OpenAI")
|
| 140 |
|
|
|
|
| 148 |
text = re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL)
|
| 149 |
return text.strip()
|
| 150 |
|
| 151 |
+
# โโ OpenAI โโ
|
| 152 |
def call_openai(prompt, system="", api_key="", model="gpt-5.2",
|
| 153 |
max_tokens=8192, temperature=0.6, reasoning_effort=None,
|
| 154 |
json_mode=False, json_schema=None):
|
|
|
|
| 169 |
try:
|
| 170 |
r=requests.post("https://api.openai.com/v1/chat/completions",
|
| 171 |
headers=headers,data=json.dumps(payload),timeout=300)
|
| 172 |
+
r.raise_for_status()
|
| 173 |
+
c=r.json()["choices"][0]["message"]["content"]
|
| 174 |
return _strip_think(c) if c else "[EMPTY]"
|
| 175 |
except requests.exceptions.HTTPError:
|
| 176 |
+
if r.status_code==429: time.sleep(5*(attempt+1)); continue
|
| 177 |
try: err=r.json().get("error",{}).get("message","")
|
| 178 |
except: err=str(r.status_code)
|
| 179 |
if attempt<2: time.sleep(3*(attempt+1)); continue
|
| 180 |
+
return f"[API_ERROR] OpenAI {r.status_code}: {err}"
|
| 181 |
except Exception as e:
|
| 182 |
if attempt<2: time.sleep(3*(attempt+1))
|
| 183 |
else: return f"[API_ERROR] {e}"
|
| 184 |
|
| 185 |
# ── Anthropic Claude (same request/retry pattern as the reference code) ──
|
| 186 |
def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
|
| 187 |
max_tokens=8192, temperature=0.6):
|
| 188 |
headers={
|
|
|
|
| 197 |
try:
|
| 198 |
r=requests.post("https://api.anthropic.com/v1/messages",
|
| 199 |
headers=headers,data=json.dumps(payload),timeout=300)
|
|
|
|
|
|
|
| 200 |
r.raise_for_status()
|
| 201 |
resp=r.json()
|
| 202 |
text_parts=[]
|
|
|
|
| 206 |
c="\n".join(text_parts)
|
| 207 |
return _strip_think(c) if c else "[EMPTY]"
|
| 208 |
except requests.exceptions.HTTPError:
|
| 209 |
+
if r.status_code==429: time.sleep(5*(attempt+1)); continue
|
| 210 |
+
if r.status_code==529: time.sleep(8*(attempt+1)); continue
|
| 211 |
try: err=r.json().get("error",{}).get("message","")
|
| 212 |
except: err=str(r.status_code)
|
| 213 |
+
return f"[API_ERROR] Claude {r.status_code}: {err}"
|
|
|
|
| 214 |
except Exception as e:
|
| 215 |
if attempt<2: time.sleep(3*(attempt+1))
|
| 216 |
else: return f"[API_ERROR] {e}"
|
| 217 |
|
| 218 |
+
# ── Google Gemini (100% identical pattern to the reference code) ──
|
| 219 |
GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta"
|
| 220 |
|
| 221 |
+
def call_gemini(prompt, system="", api_key="", model="gemini-2.5-flash",
                max_tokens=8192, temperature=1.0, json_mode=False):
    """Call the Google Gemini generateContent REST API.

    Authenticates with the ``x-goog-api-key`` header, sends the payload as
    ``data=json.dumps(payload)``, and skips "thinking" parts (parts flagged
    ``thought: True``) when assembling the reply text.

    Returns the model text, "[EMPTY]" when no text came back, or an
    "[API_ERROR] ..." string on failure.
    """
    url = f"{GEMINI_API_BASE}/models/{model}:generateContent"
    headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": api_key,
    }
    contents = [{"role": "user", "parts": [{"text": prompt}]}]
    gen_config = {"maxOutputTokens": max_tokens, "temperature": temperature}
    payload = {"contents": contents, "generationConfig": gen_config}
    if system:
        payload["systemInstruction"] = {"parts": [{"text": system}]}
    if json_mode:
        # gen_config is referenced by payload, so this mutation is included.
        gen_config["responseMimeType"] = "application/json"
    for attempt in range(3):
        try:
            r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=300)
            # raise_for_status FIRST; retryable statuses are handled in the
            # HTTPError branch below.
            r.raise_for_status()
            data = r.json()
            candidates = data.get("candidates", [])
            if not candidates:
                block_reason = data.get("promptFeedback", {}).get("blockReason", "UNKNOWN")
                print(f" [Gemini] BLOCKED: {block_reason}")
                return f"[API_ERROR] Gemini BLOCKED: {block_reason}"
            parts = candidates[0].get("content", {}).get("parts", [])
            result = []
            for p in parts:
                if "text" in p:
                    if p.get("thought", False):
                        continue  # skip internal "thinking" parts
                    result.append(p["text"])
            c = "\n".join(result) if result else ""
            return _strip_think(c) if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            # Only 429 (rate limit) / 503 (overloaded) are retried, with
            # jittered backoff; anything else returns an error immediately.
            if r.status_code == 429:
                time.sleep(5 * (attempt + 1) + random.uniform(0, 2))
                continue
            if r.status_code == 503:
                time.sleep(8 * (attempt + 1) + random.uniform(0, 3))
                continue
            try:
                err = r.json().get("error", {}).get("message", "")
            except Exception:  # body may not be JSON
                err = str(r.status_code)
            print(f" [Gemini] ERROR {r.status_code}: {err[:200]}")
            return f"[API_ERROR] Gemini {r.status_code}: {err}"
        except Exception as e:
            print(f" [Gemini] Exception: {e}")
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] Gemini: {e}"
    # BUGFIX: previously fell through and returned None after exhausting all
    # retries on 429/503 — callers expect a string result.
    return "[API_ERROR] Gemini: retries exhausted (rate-limited/overloaded)"
|
| 280 |
|
| 281 |
+
# โโ Unified Dispatcher โโ
|
| 282 |
def call_model(prompt, system="", api_key="", model_id="gpt-5.2",
               provider="OpenAI", max_tokens=8192, temperature=0.6):
    """Dispatch one completion request to the matching provider client.

    Returns the provider's text response, or an "[API_ERROR] ..." string
    when the provider name is not recognized.
    """
    if provider == "OpenAI":
        return call_openai(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Anthropic":
        return call_anthropic(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Google":
        # Gemini "thinking" models are recommended at temperature=1.0, so
        # the caller-supplied temperature is deliberately overridden here.
        return call_gemini(prompt, system, api_key, model_id, max_tokens, temperature=1.0)
    return f"[API_ERROR] Unknown provider: {provider}"
|
| 292 |
|
| 293 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 294 |
+
# ยง5. Judge โ Multi-Provider
|
| 295 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 296 |
|
| 297 |
JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
|
|
|
|
| 312 |
|
| 313 |
STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure
|
| 314 |
|
| 315 |
+
IMPORTANT: Output ONLY valid JSON with NO extra text:
|
| 316 |
+
{"scores":{"process_quality":X,"metacognitive_accuracy":X,"error_recovery":X,"integration_depth":X,"final_correctness":X},"comment":"<50 words>"}"""
|
|
|
|
| 317 |
|
| 318 |
def _build_judge_schema():
|
| 319 |
sp={k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC}
|
|
|
|
| 342 |
Apply {task.ticos_type} bonus criteria.
|
| 343 |
Output ONLY JSON: {{"scores":{{...}},"comment":"..."}}"""
|
| 344 |
|
|
|
|
| 345 |
def _parse_judge_json(text):
|
|
|
|
| 346 |
if not text or text.startswith("[API_ERROR") or text=="[EMPTY]":
|
| 347 |
return None
|
| 348 |
cleaned = _strip_think(text)
|
| 349 |
VALID = {0.0, 0.25, 0.5, 0.75, 1.0}
|
| 350 |
keys = list(RUBRIC.keys())
|
| 351 |
+
# Method 1: Direct JSON
|
|
|
|
| 352 |
try:
|
|
|
|
| 353 |
t = re.sub(r'^```(?:json)?\s*', '', cleaned.strip())
|
| 354 |
t = re.sub(r'\s*```$', '', t.strip())
|
| 355 |
data = json.loads(t)
|
|
|
|
| 359 |
v = float(data["scores"].get(k, 0.5))
|
| 360 |
scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 361 |
return {"scores": scores, "comment": data.get("comment", "ok")}
|
| 362 |
+
except: pass
|
| 363 |
+
# Method 2: Search JSON
|
|
|
|
|
|
|
| 364 |
try:
|
| 365 |
m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', cleaned, re.DOTALL)
|
| 366 |
if m:
|
|
|
|
| 371 |
v = float(data["scores"].get(k, 0.5))
|
| 372 |
scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 373 |
return {"scores": scores, "comment": data.get("comment", "parsed")}
|
| 374 |
+
except: pass
|
| 375 |
+
# Method 3: Regex
|
|
|
|
|
|
|
| 376 |
try:
|
| 377 |
sc = {}
|
| 378 |
for k in keys:
|
| 379 |
m2 = re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)', cleaned, re.IGNORECASE)
|
| 380 |
if m2:
|
| 381 |
v = float(m2.group(1))
|
| 382 |
+
if 0 <= v <= 1: sc[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
|
|
|
| 383 |
if len(sc) >= 3:
|
| 384 |
for k in keys:
|
| 385 |
if k not in sc: sc[k] = 0.5
|
| 386 |
return {"scores": sc, "comment": "regex_parsed"}
|
| 387 |
+
except: pass
|
|
|
|
|
|
|
| 388 |
return None
|
| 389 |
|
|
|
|
| 390 |
def call_judge(prompt, system, api_key, model_id, provider, temperature=0.1, max_tokens=2048):
    """Ask the judge model to score a response; return the parsed dict or None.

    OpenAI: structured-output (JSON schema) first, plain JSON mode as fallback.
    Anthropic: single free-form call.
    Google: JSON mode first, free-form fallback; temperature pinned to 1.0
    for thinking-model compatibility.
    """
    if provider == "OpenAI":
        first = call_openai(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=temperature,
                            json_schema=JUDGE_SCHEMA)
        parsed = _parse_judge_json(first)
        if parsed:
            return parsed
        # Schema call failed to parse — retry with plain JSON mode.
        retry = call_openai(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=temperature,
                            json_mode=True)
        return _parse_judge_json(retry)
    if provider == "Anthropic":
        raw = call_anthropic(prompt, system=system, api_key=api_key, model=model_id,
                             max_tokens=max_tokens, temperature=temperature)
        return _parse_judge_json(raw)
    if provider == "Google":
        # Gemini judge is pinned to temperature=1.0 (thinking-model compatible).
        first = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=1.0, json_mode=True)
        parsed = _parse_judge_json(first)
        if parsed:
            return parsed
        retry = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=1.0, json_mode=False)
        return _parse_judge_json(retry)
    return None
|
| 413 |
|
| 414 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
|
|
| 471 |
c=sqlite3.connect(DB_PATH)
|
| 472 |
c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))")
|
| 473 |
c.commit(); c.close()
|
| 474 |
+
def _make_run_id(m):
    """Derive a stable 12-character run id from the model identifier."""
    seed = f"FINALv42_BL_{m}".encode()
    return hashlib.md5(seed).hexdigest()[:12]
|
| 475 |
def _save_result(rid, tid, resp, jresp, sc):
    """Upsert one task result row keyed on (run_id, task_id).

    Fix: close the sqlite connection in a ``finally`` block — the original
    one-liner leaked the handle if execute() or commit() raised.
    """
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute("INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",
                  (rid, tid, resp, jresp, sc, time.time()))
        c.commit()
    finally:
        c.close()
|
| 477 |
def _load_all(rid):
    """Load cached results for a run, auto-excluding failed entries.

    Rows whose score is <= 0 AND whose response is an error marker
    (``[API_ERROR``, ``[BLOCKED``, ``[ERROR`` prefix, or exactly ``[EMPTY]``)
    are skipped so those tasks are retried on the next run.

    Returns:
        dict mapping task_id -> {"response", "judge", "score"}.

    Fix: the sqlite connection is now closed in a ``finally`` block — the
    original leaked the handle if the SELECT raised.
    """
    c = sqlite3.connect(DB_PATH)
    try:
        cur = c.execute(
            "SELECT task_id,model_response,judge_response,weighted_score "
            "FROM eval_results WHERE run_id=?", (rid,))
        rows = cur.fetchall()
    finally:
        c.close()
    result = {}
    for task_id, resp, judge, score in rows:
        resp = resp or ""
        # Exclude errored/empty zero-score rows so they get retried later.
        if score <= 0 and (resp == "[EMPTY]" or
                           resp.startswith(("[API_ERROR", "[BLOCKED", "[ERROR"))):
            continue
        result[task_id] = {"response": resp, "judge": judge, "score": score}
    return result
|
| 491 |
def _clear_run(rid):
    """Delete all cached rows for a run id (used by "Fresh Start").

    Fix: close the connection in a ``finally`` block — the original leaked
    the sqlite handle if the DELETE raised.
    """
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute("DELETE FROM eval_results WHERE run_id=?", (rid,))
        c.commit()
    finally:
        c.close()
|
| 493 |
_init_db()
|
|
|
|
| 552 |
info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
|
| 553 |
gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
|
| 554 |
if t.task_id in results:
|
| 555 |
+
d=results[t.task_id]; s=d["score"]; resp=d.get("response","")
|
| 556 |
if s<0:
|
| 557 |
+
rows+=f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">โ JF</td><td>โ</td></tr>'
|
| 558 |
+
elif s==0 and resp and (resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED") or resp=="[EMPTY]"):
|
| 559 |
+
# โ
API ์๋ฌ๋ฅผ ๋ช
ํํ๊ฒ ํ์
|
| 560 |
+
err_short=html.escape(resp[:60])
|
| 561 |
+
rows+=f'<tr style="background:#ffebee"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td colspan="2" style="color:#c62828;font-size:0.75em">๐ซ {err_short}</td></tr>'
|
| 562 |
else:
|
| 563 |
c=_sc(s)
|
| 564 |
rows+=f'<tr><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
|
|
|
|
| 581 |
if gs: a=np.mean(gs); gh+=f'<span style="margin-right:14px">{g}ร{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
|
| 582 |
done=sum(1 for t in tasks if t.task_id in results)
|
| 583 |
jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
|
| 584 |
+
# API errors
|
| 585 |
+
api_errs=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]==0 and (results[t.task_id].get("response","") or "").startswith("["))
|
| 586 |
# MA-ER Gap
|
| 587 |
ma_vals,er_vals=[],[]
|
| 588 |
for tid,d in results.items():
|
|
|
|
| 594 |
if "error_recovery" in sc: er_vals.append(float(sc["error_recovery"]))
|
| 595 |
except: pass
|
| 596 |
avg_ma=np.mean(ma_vals) if ma_vals else 0; avg_er=np.mean(er_vals) if er_vals else 0
|
| 597 |
+
gap=avg_ma-avg_er; gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
|
|
|
|
| 598 |
gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
|
|
|
|
| 599 |
ad=[t.domain for t in tasks if t.grade=="A"]
|
| 600 |
asc_vals=[dom_avgs[d] for d in set(ad) if d in dom_avgs]
|
| 601 |
aa=np.mean(asc_vals) if asc_vals else 0
|
| 602 |
checks=[("Scoreโฅ80",final>=80),("Axesโฅ60",all(v>=60 for v in axis.values())),(f"A-avgโฅ75({aa:.0f})",aa>=75)]
|
| 603 |
ch="".join([f'<span style="margin-right:8px">{"โ
" if ok else "โ"}{lb}</span>' for lb,ok in checks])
|
| 604 |
+
err_html=f'<div style="color:#ff5722;font-size:0.82em;margin-top:4px">โ ๏ธ API Errors: {api_errs} tasks</div>' if api_errs else ""
|
| 605 |
return f"""{CSS}<div class="summary-card">
|
| 606 |
<div style="text-align:center">
|
| 607 |
<div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div>
|
| 608 |
<h2 style="margin:6px 0;font-size:1.6em">๐ค Baseline FINAL: {final:.1f}</h2>
|
| 609 |
<p style="color:#aaa;font-size:0.85em">{stage['label']} ยท Base {base:.1f} ร HAR {har_p:.3f} ยท {done}/{len(tasks)}{f" ยท JF={jf}" if jf else ""}</p>
|
| 610 |
<p style="color:#8af;font-size:0.82em;margin:4px 0">Eval: {eval_label} ยท Judge: {judge_label}</p>
|
| 611 |
+
{err_html}
|
| 612 |
</div><hr style="border-color:#333;margin:12px 0">
|
| 613 |
<h4 style="color:#aaa;margin:6px 0">๐ฏ 5-Axis Scores</h4>{ax_html}
|
| 614 |
<hr style="border-color:#333;margin:10px 0">
|
|
|
|
| 649 |
def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
|
| 650 |
judge_api_key, judge_model_id, judge_provider, state):
|
| 651 |
try:
|
|
|
|
| 652 |
sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
|
| 653 |
f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. "
|
| 654 |
f"If unsure, say so honestly.")
|
| 655 |
+
print(f" โถ {task.task_id} โ {eval_provider}/{eval_model_id}")
|
| 656 |
model_response=call_model(task.prompt,system=sys_p,api_key=eval_api_key,
|
| 657 |
model_id=eval_model_id,provider=eval_provider,max_tokens=12288)
|
| 658 |
+
if (model_response.startswith("[API_ERROR") or
|
| 659 |
+
model_response.startswith("[BLOCKED") or
|
| 660 |
+
model_response=="[EMPTY]"):
|
| 661 |
+
print(f" โ {task.task_id}: {model_response[:100]}")
|
| 662 |
+
# โ
API ์๋ฌ๋ ์ ์ฅํ๋, _load_all์์ ์๋ ์ ์ธ๋จ
|
| 663 |
_save_result(run_id,task.task_id,model_response,"{}",0)
|
| 664 |
+
with state["lock"]:
|
| 665 |
+
state["done"]+=1
|
| 666 |
+
state["errors"].append(f"{task.task_id}: {model_response[:80]}")
|
| 667 |
return task.task_id,{"response":model_response,"judge":"{}","score":0}
|
| 668 |
|
| 669 |
+
print(f" โ {task.task_id} response len={len(model_response)}")
|
| 670 |
jp = build_judge_prompt(task, model_response)
|
| 671 |
jd = call_judge(jp, system=JUDGE_SYSTEM, api_key=judge_api_key,
|
| 672 |
model_id=judge_model_id, provider=judge_provider)
|
|
|
|
| 689 |
if len(state["active"])>10: state["active"]=state["active"][-10:]
|
| 690 |
return task.task_id,{"response":model_response,"judge":jj,"score":ws}
|
| 691 |
except Exception as e:
|
| 692 |
+
print(f" โ {task.task_id} EXCEPTION: {e}")
|
| 693 |
with state["lock"]: state["done"]+=1; state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
|
| 694 |
_save_result(run_id,task.task_id,f"[ERROR] {e}","{}",0)
|
| 695 |
return task.task_id,{"response":f"[ERROR] {e}","judge":"{}","score":0}
|
|
|
|
| 728 |
ac=state.get("active",[])
|
| 729 |
if ac: o+='<div style="margin-top:8px">๐ '+" ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em">{a}</span>' for a in ac[-8:]])+'</div>'
|
| 730 |
er=state.get("errors",[])
|
| 731 |
+
if er:
|
| 732 |
+
o+=f'<div style="color:#c62828;margin-top:6px;font-size:0.8em;max-height:120px;overflow-y:auto">'
|
| 733 |
+
for e in er[-6:]:
|
| 734 |
+
o+=f'<div>โ ๏ธ {html.escape(e[:100])}</div>'
|
| 735 |
+
o+='</div>'
|
| 736 |
return o+'</div>'
|
| 737 |
|
| 738 |
def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
|
|
|
|
| 743 |
with _EVAL_STATE["lock"]:
|
| 744 |
_EVAL_STATE["start_time"]=time.time()
|
| 745 |
_EVAL_STATE["message"]=f"โก Eval: {eval_label} ยท Judge: {judge_label} ยท {len(tasks)} tasks"
|
| 746 |
+
# โ
_load_all์ ์ด์ ์คํจ ๊ฒฐ๊ณผ๋ฅผ ์๋ ์ ์ธํจ
|
| 747 |
results=dict(_load_all(run_id))
|
| 748 |
cached=sum(1 for t in tasks if t.task_id in results)
|
| 749 |
pending=[t for t in tasks if t.task_id not in results]
|
| 750 |
+
print(f" ๐ Cached (valid): {cached} / Pending: {len(pending)} / Total: {len(tasks)}")
|
| 751 |
gt={}
|
| 752 |
for t in pending: gt.setdefault(t.grade,[]).append(t)
|
| 753 |
with _EVAL_STATE["lock"]:
|
|
|
|
| 791 |
_EVAL_STATE["message"]=f"๐ {stage['name']} โ FINAL={final:.1f} ยท {elapsed}s"
|
| 792 |
_EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
|
| 793 |
except Exception as e:
|
| 794 |
+
print(f" โ Fatal: {e}")
|
| 795 |
+
import traceback; traceback.print_exc()
|
| 796 |
with _EVAL_STATE["lock"]:
|
| 797 |
_EVAL_STATE["message"]=f"โ Fatal: {str(e)[:100]}"
|
| 798 |
_EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
|
|
|
|
| 861 |
|
| 862 |
HEADER = """
|
| 863 |
<div style="text-align:center;padding:16px 0">
|
| 864 |
+
<h1 style="margin:0;font-size:1.8em">๐ FINAL Bench v4.2 โ Baseline Evaluation</h1>
|
| 865 |
<h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
|
| 866 |
<p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto">
|
| 867 |
<b>100 Tasks ยท 15 Domains ยท 8 TICOS ยท 5-Axis ยท 5-Stage AGI Grade</b><br>
|
|
|
|
| 871 |
<div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
|
| 872 |
<span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI ยท GPT-5.2 / 5-Mini / 4.1 / o4-mini / 4o</span>
|
| 873 |
<span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic ยท Opus 4.6 / Sonnet 4.5 / Haiku 4.5</span>
|
| 874 |
+
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google ยท Gemini 2.5 Flash / 2.5 Pro / 2.0 Flash</span>
|
| 875 |
</div>
|
| 876 |
<div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
|
| 877 |
<p style="color:#e94560;font-size:0.85em;margin:0">๐ <b>MetaCog (Self-Correction Protocol): COMING SOON</b></p>
|
| 878 |
+
</div>
|
|
|
|
|
|
|
| 879 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
|
| 880 |
<a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐ Dataset</a>
|
| 881 |
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐ Leaderboard</a>
|
| 882 |
</div></div>"""
|
| 883 |
|
| 884 |
def create_app():
|
| 885 |
+
with gr.Blocks(title="FINAL Bench v4.2",
|
| 886 |
css=".gradio-container{max-width:1100px !important} header{display:none!important}") as app:
|
| 887 |
gr.HTML(HEADER)
|
| 888 |
|
|
|
|
| 889 |
gr.Markdown("### ๐ API Keys")
|
| 890 |
+
gr.HTML('<p style="color:#888;font-size:0.82em;margin:0 0 6px 0">Enter the API key matching each model\'s provider. Same key OK if both use same provider.</p>')
|
| 891 |
with gr.Row():
|
| 892 |
eval_api_key=gr.Textbox(label="๐ค Eval Model API Key",type="password",
|
| 893 |
placeholder="sk-... / sk-ant-... / AIza...",
|
| 894 |
+
info="OpenAI / Anthropic / Google key for eval",scale=3)
|
| 895 |
judge_api_key=gr.Textbox(label="โ๏ธ Judge Model API Key",type="password",
|
| 896 |
placeholder="sk-... / sk-ant-... / AIza...",
|
| 897 |
+
info="OpenAI / Anthropic / Google key for judge",scale=3)
|
| 898 |
|
|
|
|
| 899 |
gr.Markdown("### ๐ค Model Selection")
|
| 900 |
with gr.Row():
|
| 901 |
eval_m=gr.Dropdown(label="๐ค Evaluation Target",choices=MODEL_CHOICES,
|
| 902 |
+
value=DEFAULT_EVAL,info="Model to evaluate",scale=3)
|
|
|
|
| 903 |
judge_m=gr.Dropdown(label="โ๏ธ Judge Model",choices=MODEL_CHOICES,
|
| 904 |
+
value=DEFAULT_JUDGE,info="Model that scores responses",scale=3)
|
|
|
|
| 905 |
|
|
|
|
| 906 |
gr.Markdown("### โ๏ธ Settings")
|
| 907 |
with gr.Row():
|
| 908 |
gf=gr.Dropdown(["All","A","B","C"],value="All",label="Grade Filter",scale=1)
|
|
|
|
| 910 |
mt=gr.Slider(1,100,value=100,step=1,label="Max Tasks",scale=1)
|
| 911 |
nw=gr.Slider(1,10,value=5,step=1,label="Workers",scale=1)
|
| 912 |
|
|
|
|
| 913 |
with gr.Row():
|
| 914 |
s_btn=gr.Button("โถ๏ธ Start (Resume)",variant="primary",size="lg",scale=2)
|
| 915 |
f_btn=gr.Button("๐ Fresh Start",variant="secondary",size="lg",scale=2)
|
| 916 |
x_btn=gr.Button("โน๏ธ Stop",variant="stop",size="lg",scale=1)
|
| 917 |
+
status=gr.Textbox(label="Status",interactive=False,max_lines=2)
|
| 918 |
|
|
|
|
| 919 |
with gr.Tabs():
|
| 920 |
with gr.Tab("๐ Progress"): p_html=gr.HTML()
|
| 921 |
with gr.Tab("๐ Results"): t_html=gr.HTML()
|
|
|
|
| 931 |
f_btn.click(fn=lambda *a:_start_eval(*a,fresh=True),inputs=eval_ins,outputs=[status])
|
| 932 |
x_btn.click(fn=_stop,outputs=[status])
|
| 933 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 934 |
if __name__ == "__main__":
    # Tally task counts per grade and per domain for the startup banner.
    grade_counts, domain_counts = {}, {}
    for task in ALL_TASKS:
        grade_counts[task.grade] = grade_counts.get(task.grade, 0) + 1
        domain_counts[task.domain] = domain_counts.get(task.domain, 0) + 1
    bar = "=" * 60
    print(f"\n{bar}\n FINAL Bench v4.2 — Baseline (Non-AGI)\n Eval & Judge: OpenAI / Anthropic / Google\n{bar}")
    print(f" {len(ALL_TASKS)} tasks | {len(domain_counts)} domains")
    for grade in ("A", "B", "C"):
        print(f" Grade {grade} (×{GRADE_WEIGHT[grade]}): {grade_counts.get(grade, 0)}")
    print(f" 🔒 MetaCog: COMING SOON\n{bar}\n")
|