aiqtech commited on
Commit
ebaf6e8
ยท
verified ยท
1 Parent(s): 0ab3bed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -129
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import json, os, time, csv, io, re, html, hashlib, sqlite3, threading
2
  from datetime import datetime
3
  from dataclasses import dataclass, field
4
  from typing import List, Dict
@@ -99,10 +99,10 @@ def load_tasks():
99
  raise FileNotFoundError("Dataset not found!")
100
 
101
  ALL_TASKS = load_tasks()
102
- print(f"โœ… FINAL Bench v4.1: {len(ALL_TASKS)} tasks loaded")
103
 
104
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
105
- # ยง3. Multi-Provider Model Registry (Eval + Judge SAME)
106
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
107
 
108
  PROVIDER_MODELS = {
@@ -119,13 +119,12 @@ PROVIDER_MODELS = {
119
  "claude-haiku-4-5-20251001": "Claude Haiku 4.5",
120
  },
121
  "Google": {
122
- "gemini-3-pro": "Gemini 3 Pro",
123
- "gemini-2.5-pro": "Gemini 2.5 Pro",
124
- "gemini-2.5-flash": "Gemini 2.5 Flash",
125
  },
126
  }
127
 
128
- # Build unified model list โ€” used for BOTH eval and judge dropdowns
129
  ALL_MODELS = {}
130
  for prov, models in PROVIDER_MODELS.items():
131
  for mid, label in models.items():
@@ -136,7 +135,6 @@ DEFAULT_EVAL = "GPT-5.2 (flagship) [OpenAI]"
136
  DEFAULT_JUDGE = "GPT-5.2 (flagship) [OpenAI]"
137
 
138
  def _resolve_model(choice):
139
- """Resolve dropdown choice โ†’ (model_id, provider)"""
140
  info = ALL_MODELS.get(choice, {})
141
  return info.get("id", "gpt-5.2"), info.get("provider", "OpenAI")
142
 
@@ -150,7 +148,7 @@ def _strip_think(text):
150
  text = re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL)
151
  return text.strip()
152
 
153
- # --- OpenAI ---
154
  def call_openai(prompt, system="", api_key="", model="gpt-5.2",
155
  max_tokens=8192, temperature=0.6, reasoning_effort=None,
156
  json_mode=False, json_schema=None):
@@ -171,19 +169,20 @@ def call_openai(prompt, system="", api_key="", model="gpt-5.2",
171
  try:
172
  r=requests.post("https://api.openai.com/v1/chat/completions",
173
  headers=headers,data=json.dumps(payload),timeout=300)
174
- if r.status_code==429: time.sleep(5*(attempt+1)); continue
175
- r.raise_for_status(); c=r.json()["choices"][0]["message"]["content"]
176
  return _strip_think(c) if c else "[EMPTY]"
177
  except requests.exceptions.HTTPError:
 
178
  try: err=r.json().get("error",{}).get("message","")
179
  except: err=str(r.status_code)
180
  if attempt<2: time.sleep(3*(attempt+1)); continue
181
- return f"[API_ERROR] {err}"
182
  except Exception as e:
183
  if attempt<2: time.sleep(3*(attempt+1))
184
  else: return f"[API_ERROR] {e}"
185
 
186
- # --- Anthropic (โ˜… data=json.dumps, 429+529 retry) ---
187
  def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
188
  max_tokens=8192, temperature=0.6):
189
  headers={
@@ -198,8 +197,6 @@ def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
198
  try:
199
  r=requests.post("https://api.anthropic.com/v1/messages",
200
  headers=headers,data=json.dumps(payload),timeout=300)
201
- if r.status_code==429: time.sleep(5*(attempt+1)); continue
202
- if r.status_code==529: time.sleep(8*(attempt+1)); continue
203
  r.raise_for_status()
204
  resp=r.json()
205
  text_parts=[]
@@ -209,69 +206,92 @@ def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
209
  c="\n".join(text_parts)
210
  return _strip_think(c) if c else "[EMPTY]"
211
  except requests.exceptions.HTTPError:
 
 
212
  try: err=r.json().get("error",{}).get("message","")
213
  except: err=str(r.status_code)
214
- if attempt<2: time.sleep(3*(attempt+1)); continue
215
- return f"[API_ERROR] {err}"
216
  except Exception as e:
217
  if attempt<2: time.sleep(3*(attempt+1))
218
  else: return f"[API_ERROR] {e}"
219
 
220
- # --- Google Gemini (โ˜… x-goog-api-key header, data=json.dumps, thinking filter) ---
221
  GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta"
222
 
223
- def call_gemini(prompt, system="", api_key="", model="gemini-3-pro",
224
  max_tokens=8192, temperature=1.0, json_mode=False):
225
- url=f"{GEMINI_API_BASE}/models/{model}:generateContent"
226
- headers={
227
- "Content-Type":"application/json",
228
- "x-goog-api-key":api_key,
 
 
 
 
 
229
  }
230
- contents=[{"role":"user","parts":[{"text":prompt}]}]
231
- gen_config={"maxOutputTokens":max_tokens,"temperature":temperature}
232
- payload={"contents":contents,"generationConfig":gen_config}
233
  if system:
234
- payload["systemInstruction"]={"parts":[{"text":system}]}
235
  if json_mode:
236
- gen_config["responseMimeType"]="application/json"
237
  for attempt in range(3):
238
  try:
239
- r=requests.post(url,headers=headers,data=json.dumps(payload),timeout=300)
240
- if r.status_code==429: time.sleep(5*(attempt+1)); continue
241
- if r.status_code==503: time.sleep(8*(attempt+1)); continue
242
  r.raise_for_status()
243
- data=r.json()
244
- candidates=data.get("candidates",[])
245
  if not candidates:
246
- block_reason=data.get("promptFeedback",{}).get("blockReason","UNKNOWN")
247
- return f"[BLOCKED] Gemini blocked response: {block_reason}"
248
- parts=candidates[0].get("content",{}).get("parts",[])
249
- result=[]
 
250
  for p in parts:
251
  if "text" in p:
252
- if p.get("thought",False): continue # skip thinking parts
 
253
  result.append(p["text"])
254
- c="\n".join(result) if result else ""
255
  return _strip_think(c) if c else "[EMPTY]"
256
  except requests.exceptions.HTTPError:
257
- try: err=r.json().get("error",{}).get("message","")
258
- except: err=str(r.status_code)
259
- if attempt<2: time.sleep(3*(attempt+1)); continue
 
 
 
 
 
 
 
 
 
260
  return f"[API_ERROR] Gemini {r.status_code}: {err}"
261
  except Exception as e:
262
- if attempt<2: time.sleep(3*(attempt+1))
263
- else: return f"[API_ERROR] {e}"
 
 
 
264
 
265
- # --- Unified Dispatcher ---
266
  def call_model(prompt, system="", api_key="", model_id="gpt-5.2",
267
  provider="OpenAI", max_tokens=8192, temperature=0.6):
268
- if provider=="OpenAI": return call_openai(prompt,system,api_key,model_id,max_tokens,temperature)
269
- elif provider=="Anthropic": return call_anthropic(prompt,system,api_key,model_id,max_tokens,temperature)
270
- elif provider=="Google": return call_gemini(prompt,system,api_key,model_id,max_tokens,temperature)
 
 
 
 
271
  return f"[API_ERROR] Unknown provider: {provider}"
272
 
273
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
274
- # ยง5. Judge โ€” Multi-Provider (OpenAI / Anthropic / Google)
275
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
276
 
277
  JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
@@ -292,9 +312,8 @@ G_PivotDetection: Found reversing premise? H_DecisionUnderUncertainty: Scenario
292
 
293
  STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure
294
 
295
- IMPORTANT: Output ONLY valid JSON with NO extra text before or after:
296
- {"scores":{"process_quality":X,"metacognitive_accuracy":X,"error_recovery":X,"integration_depth":X,"final_correctness":X},"comment":"<50 words>"}
297
- Where each X is one of: 0.0, 0.25, 0.5, 0.75, 1.0"""
298
 
299
  def _build_judge_schema():
300
  sp={k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC}
@@ -323,18 +342,14 @@ Score: process_quality, metacognitive_accuracy, error_recovery, integration_dept
323
  Apply {task.ticos_type} bonus criteria.
324
  Output ONLY JSON: {{"scores":{{...}},"comment":"..."}}"""
325
 
326
-
327
  def _parse_judge_json(text):
328
- """Parse judge response โ†’ dict with scores, works for all providers"""
329
  if not text or text.startswith("[API_ERROR") or text=="[EMPTY]":
330
  return None
331
  cleaned = _strip_think(text)
332
  VALID = {0.0, 0.25, 0.5, 0.75, 1.0}
333
  keys = list(RUBRIC.keys())
334
-
335
- # Method 1: Direct JSON parse
336
  try:
337
- # Strip markdown fences
338
  t = re.sub(r'^```(?:json)?\s*', '', cleaned.strip())
339
  t = re.sub(r'\s*```$', '', t.strip())
340
  data = json.loads(t)
@@ -344,10 +359,8 @@ def _parse_judge_json(text):
344
  v = float(data["scores"].get(k, 0.5))
345
  scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
346
  return {"scores": scores, "comment": data.get("comment", "ok")}
347
- except:
348
- pass
349
-
350
- # Method 2: Find JSON object in text
351
  try:
352
  m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', cleaned, re.DOTALL)
353
  if m:
@@ -358,62 +371,44 @@ def _parse_judge_json(text):
358
  v = float(data["scores"].get(k, 0.5))
359
  scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
360
  return {"scores": scores, "comment": data.get("comment", "parsed")}
361
- except:
362
- pass
363
-
364
- # Method 3: Regex extraction
365
  try:
366
  sc = {}
367
  for k in keys:
368
  m2 = re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)', cleaned, re.IGNORECASE)
369
  if m2:
370
  v = float(m2.group(1))
371
- if 0 <= v <= 1:
372
- sc[k] = min(VALID, key=lambda x, v=v: abs(x - v))
373
  if len(sc) >= 3:
374
  for k in keys:
375
  if k not in sc: sc[k] = 0.5
376
  return {"scores": sc, "comment": "regex_parsed"}
377
- except:
378
- pass
379
-
380
  return None
381
 
382
-
383
  def call_judge(prompt, system, api_key, model_id, provider, temperature=0.1, max_tokens=2048):
384
- """โ˜… Universal Judge โ€” routes to correct provider with JSON enforcement"""
385
-
386
  if provider == "OpenAI":
387
- # OpenAI: use structured output (best quality)
388
  raw = call_openai(prompt, system=system, api_key=api_key, model=model_id,
389
- max_tokens=max_tokens, temperature=temperature,
390
- json_schema=JUDGE_SCHEMA)
391
  result = _parse_judge_json(raw)
392
- if result:
393
- return result
394
- # Fallback: try without structured output
395
  raw2 = call_openai(prompt, system=system, api_key=api_key, model=model_id,
396
  max_tokens=max_tokens, temperature=temperature, json_mode=True)
397
  return _parse_judge_json(raw2)
398
-
399
  elif provider == "Anthropic":
400
- # Anthropic: prompt-based JSON enforcement
401
  raw = call_anthropic(prompt, system=system, api_key=api_key, model=model_id,
402
  max_tokens=max_tokens, temperature=temperature)
403
  return _parse_judge_json(raw)
404
-
405
  elif provider == "Google":
406
- # Google: JSON mode supported
407
  raw = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
408
- max_tokens=max_tokens, temperature=temperature, json_mode=True)
409
  result = _parse_judge_json(raw)
410
- if result:
411
- return result
412
- # Fallback without json_mode
413
  raw2 = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
414
- max_tokens=max_tokens, temperature=temperature, json_mode=False)
415
  return _parse_judge_json(raw2)
416
-
417
  return None
418
 
419
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
@@ -476,12 +471,23 @@ def _init_db():
476
  c=sqlite3.connect(DB_PATH)
477
  c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))")
478
  c.commit(); c.close()
479
- def _make_run_id(m): return hashlib.md5(f"FINALv41_BL_{m}".encode()).hexdigest()[:12]
480
  def _save_result(rid,tid,resp,jresp,sc):
481
  c=sqlite3.connect(DB_PATH); c.execute("INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",(rid,tid,resp,jresp,sc,time.time())); c.commit(); c.close()
482
  def _load_all(rid):
483
- c=sqlite3.connect(DB_PATH); cur=c.execute("SELECT task_id,model_response,judge_response,weighted_score FROM eval_results WHERE run_id=?",(rid,)); rows=cur.fetchall(); c.close()
484
- return {r[0]:{"response":r[1],"judge":r[2],"score":r[3]} for r in rows}
 
 
 
 
 
 
 
 
 
 
 
485
  def _clear_run(rid):
486
  c=sqlite3.connect(DB_PATH); c.execute("DELETE FROM eval_results WHERE run_id=?",(rid,)); c.commit(); c.close()
487
  _init_db()
@@ -546,9 +552,13 @@ def _build_progress_table(results, tasks):
546
  info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
547
  gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
548
  if t.task_id in results:
549
- s=results[t.task_id]["score"]
550
  if s<0:
551
- rows+=f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">โŒ</td><td>โ€”</td></tr>'
 
 
 
 
552
  else:
553
  c=_sc(s)
554
  rows+=f'<tr><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
@@ -571,6 +581,8 @@ def _build_summary_card(results, tasks, eval_label, judge_label, hf_status):
571
  if gs: a=np.mean(gs); gh+=f'<span style="margin-right:14px">{g}ร—{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
572
  done=sum(1 for t in tasks if t.task_id in results)
573
  jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
 
 
574
  # MA-ER Gap
575
  ma_vals,er_vals=[],[]
576
  for tid,d in results.items():
@@ -582,21 +594,21 @@ def _build_summary_card(results, tasks, eval_label, judge_label, hf_status):
582
  if "error_recovery" in sc: er_vals.append(float(sc["error_recovery"]))
583
  except: pass
584
  avg_ma=np.mean(ma_vals) if ma_vals else 0; avg_er=np.mean(er_vals) if er_vals else 0
585
- gap=avg_ma-avg_er
586
- gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
587
  gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
588
- # Pass checks
589
  ad=[t.domain for t in tasks if t.grade=="A"]
590
  asc_vals=[dom_avgs[d] for d in set(ad) if d in dom_avgs]
591
  aa=np.mean(asc_vals) if asc_vals else 0
592
  checks=[("Scoreโ‰ฅ80",final>=80),("Axesโ‰ฅ60",all(v>=60 for v in axis.values())),(f"A-avgโ‰ฅ75({aa:.0f})",aa>=75)]
593
  ch="".join([f'<span style="margin-right:8px">{"โœ…" if ok else "โŒ"}{lb}</span>' for lb,ok in checks])
 
594
  return f"""{CSS}<div class="summary-card">
595
  <div style="text-align:center">
596
  <div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div>
597
  <h2 style="margin:6px 0;font-size:1.6em">๐Ÿค– Baseline FINAL: {final:.1f}</h2>
598
  <p style="color:#aaa;font-size:0.85em">{stage['label']} ยท Base {base:.1f} ร— HAR {har_p:.3f} ยท {done}/{len(tasks)}{f" ยท JF={jf}" if jf else ""}</p>
599
  <p style="color:#8af;font-size:0.82em;margin:4px 0">Eval: {eval_label} ยท Judge: {judge_label}</p>
 
600
  </div><hr style="border-color:#333;margin:12px 0">
601
  <h4 style="color:#aaa;margin:6px 0">๐ŸŽฏ 5-Axis Scores</h4>{ax_html}
602
  <hr style="border-color:#333;margin:10px 0">
@@ -637,18 +649,24 @@ def _build_detail_view(results, tasks):
637
  def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
638
  judge_api_key, judge_model_id, judge_provider, state):
639
  try:
640
- # 1) Eval model call
641
  sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
642
  f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. "
643
  f"If unsure, say so honestly.")
 
644
  model_response=call_model(task.prompt,system=sys_p,api_key=eval_api_key,
645
  model_id=eval_model_id,provider=eval_provider,max_tokens=12288)
646
- if model_response.startswith("[API_ERROR") or model_response=="[EMPTY]":
 
 
 
 
647
  _save_result(run_id,task.task_id,model_response,"{}",0)
648
- with state["lock"]: state["done"]+=1; state["errors"].append(f"{task.task_id}: {model_response[:50]}")
 
 
649
  return task.task_id,{"response":model_response,"judge":"{}","score":0}
650
 
651
- # 2) Judge call โ€” any provider
652
  jp = build_judge_prompt(task, model_response)
653
  jd = call_judge(jp, system=JUDGE_SYSTEM, api_key=judge_api_key,
654
  model_id=judge_model_id, provider=judge_provider)
@@ -671,6 +689,7 @@ def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
671
  if len(state["active"])>10: state["active"]=state["active"][-10:]
672
  return task.task_id,{"response":model_response,"judge":jj,"score":ws}
673
  except Exception as e:
 
674
  with state["lock"]: state["done"]+=1; state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
675
  _save_result(run_id,task.task_id,f"[ERROR] {e}","{}",0)
676
  return task.task_id,{"response":f"[ERROR] {e}","judge":"{}","score":0}
@@ -709,7 +728,11 @@ def _prog_html(state, pending):
709
  ac=state.get("active",[])
710
  if ac: o+='<div style="margin-top:8px">๐Ÿ”„ '+" ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em">{a}</span>' for a in ac[-8:]])+'</div>'
711
  er=state.get("errors",[])
712
- if er: o+=f'<div style="color:#c62828;margin-top:6px;font-size:0.8em">{" ยท ".join(["โš ๏ธ"+html.escape(e[:30]) for e in er[-3:]])}</div>'
 
 
 
 
713
  return o+'</div>'
714
 
715
  def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
@@ -720,9 +743,11 @@ def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
720
  with _EVAL_STATE["lock"]:
721
  _EVAL_STATE["start_time"]=time.time()
722
  _EVAL_STATE["message"]=f"โšก Eval: {eval_label} ยท Judge: {judge_label} ยท {len(tasks)} tasks"
 
723
  results=dict(_load_all(run_id))
724
  cached=sum(1 for t in tasks if t.task_id in results)
725
  pending=[t for t in tasks if t.task_id not in results]
 
726
  gt={}
727
  for t in pending: gt.setdefault(t.grade,[]).append(t)
728
  with _EVAL_STATE["lock"]:
@@ -766,6 +791,8 @@ def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
766
  _EVAL_STATE["message"]=f"๐Ÿ {stage['name']} โ€” FINAL={final:.1f} ยท {elapsed}s"
767
  _EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
768
  except Exception as e:
 
 
769
  with _EVAL_STATE["lock"]:
770
  _EVAL_STATE["message"]=f"โŒ Fatal: {str(e)[:100]}"
771
  _EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
@@ -834,7 +861,7 @@ def _poll():
834
 
835
  HEADER = """
836
  <div style="text-align:center;padding:16px 0">
837
- <h1 style="margin:0;font-size:1.8em">๐Ÿ† FINAL Bench v4.1 โ€” Baseline Evaluation</h1>
838
  <h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
839
  <p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto">
840
  <b>100 Tasks ยท 15 Domains ยท 8 TICOS ยท 5-Axis ยท 5-Stage AGI Grade</b><br>
@@ -844,45 +871,38 @@ Both <b>Eval</b> and <b>Judge</b> support OpenAI / Anthropic / Google
844
  <div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
845
  <span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI ยท GPT-5.2 / 5-Mini / 4.1 / o4-mini / 4o</span>
846
  <span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic ยท Opus 4.6 / Sonnet 4.5 / Haiku 4.5</span>
847
- <span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google ยท Gemini 3 Pro / 2.5 Pro / 2.5 Flash</span>
848
  </div>
849
  <div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
850
  <p style="color:#e94560;font-size:0.85em;margin:0">๐Ÿ”’ <b>MetaCog (Self-Correction Protocol): COMING SOON</b></p>
851
- <p style="color:#888;font-size:0.78em;margin:4px 0 0 0">
852
- 3-Phase Protocol (Initial โ†’ Self-Review โ†’ Correction) โ€” paper's core contribution.
853
- </p></div>
854
  <div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
855
  <a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐Ÿ“Š Dataset</a>
856
  <a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐Ÿ† Leaderboard</a>
857
  </div></div>"""
858
 
859
  def create_app():
860
- with gr.Blocks(title="FINAL Bench v4.1",theme=gr.themes.Soft(),
861
  css=".gradio-container{max-width:1100px !important} header{display:none!important}") as app:
862
  gr.HTML(HEADER)
863
 
864
- # --- API Keys ---
865
  gr.Markdown("### ๐Ÿ”‘ API Keys")
866
- gr.HTML('<p style="color:#888;font-size:0.82em;margin:0 0 6px 0">Enter the API key matching each model\'s provider. If both models use the same provider, you can enter the same key.</p>')
867
  with gr.Row():
868
  eval_api_key=gr.Textbox(label="๐Ÿค– Eval Model API Key",type="password",
869
  placeholder="sk-... / sk-ant-... / AIza...",
870
- info="OpenAI / Anthropic / Google key for the evaluation target",scale=3)
871
  judge_api_key=gr.Textbox(label="โš–๏ธ Judge Model API Key",type="password",
872
  placeholder="sk-... / sk-ant-... / AIza...",
873
- info="OpenAI / Anthropic / Google key for the judge",scale=3)
874
 
875
- # --- Model Selection (SAME choices for both) ---
876
  gr.Markdown("### ๐Ÿค– Model Selection")
877
  with gr.Row():
878
  eval_m=gr.Dropdown(label="๐Ÿค– Evaluation Target",choices=MODEL_CHOICES,
879
- value=DEFAULT_EVAL,
880
- info="Model to be evaluated on FINAL Bench tasks",scale=3)
881
  judge_m=gr.Dropdown(label="โš–๏ธ Judge Model",choices=MODEL_CHOICES,
882
- value=DEFAULT_JUDGE,
883
- info="Model that scores the evaluation responses",scale=3)
884
 
885
- # --- Settings ---
886
  gr.Markdown("### โš™๏ธ Settings")
887
  with gr.Row():
888
  gf=gr.Dropdown(["All","A","B","C"],value="All",label="Grade Filter",scale=1)
@@ -890,14 +910,12 @@ def create_app():
890
  mt=gr.Slider(1,100,value=100,step=1,label="Max Tasks",scale=1)
891
  nw=gr.Slider(1,10,value=5,step=1,label="Workers",scale=1)
892
 
893
- # --- Buttons ---
894
  with gr.Row():
895
  s_btn=gr.Button("โ–ถ๏ธ Start (Resume)",variant="primary",size="lg",scale=2)
896
  f_btn=gr.Button("๐Ÿš€ Fresh Start",variant="secondary",size="lg",scale=2)
897
  x_btn=gr.Button("โน๏ธ Stop",variant="stop",size="lg",scale=1)
898
- status=gr.Textbox(label="Status",interactive=False,max_lines=1)
899
 
900
- # --- Results ---
901
  with gr.Tabs():
902
  with gr.Tab("๐Ÿ“Š Progress"): p_html=gr.HTML()
903
  with gr.Tab("๐Ÿ“‹ Results"): t_html=gr.HTML()
@@ -913,17 +931,10 @@ def create_app():
913
  f_btn.click(fn=lambda *a:_start_eval(*a,fresh=True),inputs=eval_ins,outputs=[status])
914
  x_btn.click(fn=_stop,outputs=[status])
915
 
916
- gr.Markdown("""---
917
- <center><b>FINAL Bench v4.1</b> โ€” Baseline (Non-AGI) ยท Multi-Provider Eval & Judge<br>
918
- 100 Tasks ยท 5-Axis ยท 5-Stage ยท OpenAI / Anthropic / Google<br>
919
- ๐Ÿ”’ MetaCog (Self-Correction Protocol): <b>COMING SOON</b><br>
920
- Apache 2.0 ยท <b>Ginigen AI</b> โ€” Choi Sunyoung</center>""")
921
- return app
922
-
923
  if __name__=="__main__":
924
  sg,sd={},{}
925
  for t in ALL_TASKS: sg[t.grade]=sg.get(t.grade,0)+1; sd[t.domain]=sd.get(t.domain,0)+1
926
- print(f"\n{'='*60}\n FINAL Bench v4.1 โ€” Baseline (Non-AGI)\n Eval & Judge: OpenAI / Anthropic / Google\n{'='*60}")
927
  print(f" {len(ALL_TASKS)} tasks | {len(sd)} domains")
928
  for g in ["A","B","C"]: print(f" Grade {g} (ร—{GRADE_WEIGHT[g]}): {sg.get(g,0)}")
929
  print(f" ๐Ÿ”’ MetaCog: COMING SOON\n{'='*60}\n")
 
1
+ import json, os, time, csv, io, re, html, hashlib, sqlite3, threading, random
2
  from datetime import datetime
3
  from dataclasses import dataclass, field
4
  from typing import List, Dict
 
99
  raise FileNotFoundError("Dataset not found!")
100
 
101
  ALL_TASKS = load_tasks()
102
+ print(f"โœ… FINAL Bench v4.2: {len(ALL_TASKS)} tasks loaded")
103
 
104
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
105
+ # ยง3. Multi-Provider Model Registry
106
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
107
 
108
  PROVIDER_MODELS = {
 
119
  "claude-haiku-4-5-20251001": "Claude Haiku 4.5",
120
  },
121
  "Google": {
122
+ "gemini-2.5-flash": "Gemini 2.5 Flash",
123
+ "gemini-2.5-pro": "Gemini 2.5 Pro",
124
+ "gemini-2.0-flash": "Gemini 2.0 Flash",
125
  },
126
  }
127
 
 
128
  ALL_MODELS = {}
129
  for prov, models in PROVIDER_MODELS.items():
130
  for mid, label in models.items():
 
135
  DEFAULT_JUDGE = "GPT-5.2 (flagship) [OpenAI]"
136
 
137
  def _resolve_model(choice):
 
138
  info = ALL_MODELS.get(choice, {})
139
  return info.get("id", "gpt-5.2"), info.get("provider", "OpenAI")
140
 
 
148
  text = re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL)
149
  return text.strip()
150
 
151
+ # โ”€โ”€ OpenAI โ”€โ”€
152
  def call_openai(prompt, system="", api_key="", model="gpt-5.2",
153
  max_tokens=8192, temperature=0.6, reasoning_effort=None,
154
  json_mode=False, json_schema=None):
 
169
  try:
170
  r=requests.post("https://api.openai.com/v1/chat/completions",
171
  headers=headers,data=json.dumps(payload),timeout=300)
172
+ r.raise_for_status()
173
+ c=r.json()["choices"][0]["message"]["content"]
174
  return _strip_think(c) if c else "[EMPTY]"
175
  except requests.exceptions.HTTPError:
176
+ if r.status_code==429: time.sleep(5*(attempt+1)); continue
177
  try: err=r.json().get("error",{}).get("message","")
178
  except: err=str(r.status_code)
179
  if attempt<2: time.sleep(3*(attempt+1)); continue
180
+ return f"[API_ERROR] OpenAI {r.status_code}: {err}"
181
  except Exception as e:
182
  if attempt<2: time.sleep(3*(attempt+1))
183
  else: return f"[API_ERROR] {e}"
184
 
185
+ # โ”€โ”€ Anthropic Claude (โ˜… ์ฐธ๊ณ ์ฝ”๋“œ ๋™์ผ ํŒจํ„ด) โ”€โ”€
186
  def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
187
  max_tokens=8192, temperature=0.6):
188
  headers={
 
197
  try:
198
  r=requests.post("https://api.anthropic.com/v1/messages",
199
  headers=headers,data=json.dumps(payload),timeout=300)
 
 
200
  r.raise_for_status()
201
  resp=r.json()
202
  text_parts=[]
 
206
  c="\n".join(text_parts)
207
  return _strip_think(c) if c else "[EMPTY]"
208
  except requests.exceptions.HTTPError:
209
+ if r.status_code==429: time.sleep(5*(attempt+1)); continue
210
+ if r.status_code==529: time.sleep(8*(attempt+1)); continue
211
  try: err=r.json().get("error",{}).get("message","")
212
  except: err=str(r.status_code)
213
+ return f"[API_ERROR] Claude {r.status_code}: {err}"
 
214
  except Exception as e:
215
  if attempt<2: time.sleep(3*(attempt+1))
216
  else: return f"[API_ERROR] {e}"
217
 
218
+ # โ”€โ”€ Google Gemini (โ˜…โ˜…โ˜… ์ฐธ๊ณ ์ฝ”๋“œ์™€ 100% ๋™์ผ ํŒจํ„ด โ˜…โ˜…โ˜…) โ”€โ”€
219
  GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta"
220
 
221
+ def call_gemini(prompt, system="", api_key="", model="gemini-2.5-flash",
222
  max_tokens=8192, temperature=1.0, json_mode=False):
223
+ """Google Gemini generateContent REST API
224
+ โ˜… x-goog-api-key ํ—ค๋” ์ธ์ฆ
225
+ โ˜… data=json.dumps(payload) ์ „์†ก
226
+ โ˜… thinking part (thought:True) ์Šคํ‚ต
227
+ """
228
+ url = f"{GEMINI_API_BASE}/models/{model}:generateContent"
229
+ headers = {
230
+ "Content-Type": "application/json",
231
+ "x-goog-api-key": api_key,
232
  }
233
+ contents = [{"role": "user", "parts": [{"text": prompt}]}]
234
+ gen_config = {"maxOutputTokens": max_tokens, "temperature": temperature}
235
+ payload = {"contents": contents, "generationConfig": gen_config}
236
  if system:
237
+ payload["systemInstruction"] = {"parts": [{"text": system}]}
238
  if json_mode:
239
+ gen_config["responseMimeType"] = "application/json"
240
  for attempt in range(3):
241
  try:
242
+ r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=300)
243
+ # โ˜… raise_for_status FIRST โ€” ์ฐธ๊ณ ์ฝ”๋“œ ๋™์ผ ํŒจํ„ด
 
244
  r.raise_for_status()
245
+ data = r.json()
246
+ candidates = data.get("candidates", [])
247
  if not candidates:
248
+ block_reason = data.get("promptFeedback", {}).get("blockReason", "UNKNOWN")
249
+ print(f" [Gemini] BLOCKED: {block_reason}")
250
+ return f"[API_ERROR] Gemini BLOCKED: {block_reason}"
251
+ parts = candidates[0].get("content", {}).get("parts", [])
252
+ result = []
253
  for p in parts:
254
  if "text" in p:
255
+ if p.get("thought", False):
256
+ continue # โ˜… thinking part skip
257
  result.append(p["text"])
258
+ c = "\n".join(result) if result else ""
259
  return _strip_think(c) if c else "[EMPTY]"
260
  except requests.exceptions.HTTPError:
261
+ # โ˜… ์ฐธ๊ณ ์ฝ”๋“œ ๋™์ผ: 429/503๋งŒ retry, ๋‚˜๋จธ์ง€๋Š” ์ฆ‰์‹œ ์—๋Ÿฌ ๋ฐ˜ํ™˜
262
+ if r.status_code == 429:
263
+ time.sleep(5 * (attempt + 1) + random.uniform(0, 2))
264
+ continue
265
+ if r.status_code == 503:
266
+ time.sleep(8 * (attempt + 1) + random.uniform(0, 3))
267
+ continue
268
+ try:
269
+ err = r.json().get("error", {}).get("message", "")
270
+ except:
271
+ err = str(r.status_code)
272
+ print(f" [Gemini] ERROR {r.status_code}: {err[:200]}")
273
  return f"[API_ERROR] Gemini {r.status_code}: {err}"
274
  except Exception as e:
275
+ print(f" [Gemini] Exception: {e}")
276
+ if attempt < 2:
277
+ time.sleep(3 * (attempt + 1))
278
+ else:
279
+ return f"[API_ERROR] Gemini: {e}"
280
 
281
+ # โ”€โ”€ Unified Dispatcher โ”€โ”€
282
  def call_model(prompt, system="", api_key="", model_id="gpt-5.2",
283
  provider="OpenAI", max_tokens=8192, temperature=0.6):
284
+ if provider == "OpenAI":
285
+ return call_openai(prompt, system, api_key, model_id, max_tokens, temperature)
286
+ elif provider == "Anthropic":
287
+ return call_anthropic(prompt, system, api_key, model_id, max_tokens, temperature)
288
+ elif provider == "Google":
289
+ # โ˜… Gemini๋Š” temperature=1.0 ๊ถŒ์žฅ (thinking ๋ชจ๋ธ)
290
+ return call_gemini(prompt, system, api_key, model_id, max_tokens, temperature=1.0)
291
  return f"[API_ERROR] Unknown provider: {provider}"
292
 
293
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
294
+ # ยง5. Judge โ€” Multi-Provider
295
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
296
 
297
  JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
 
312
 
313
  STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure
314
 
315
+ IMPORTANT: Output ONLY valid JSON with NO extra text:
316
+ {"scores":{"process_quality":X,"metacognitive_accuracy":X,"error_recovery":X,"integration_depth":X,"final_correctness":X},"comment":"<50 words>"}"""
 
317
 
318
  def _build_judge_schema():
319
  sp={k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC}
 
342
  Apply {task.ticos_type} bonus criteria.
343
  Output ONLY JSON: {{"scores":{{...}},"comment":"..."}}"""
344
 
 
345
  def _parse_judge_json(text):
 
346
  if not text or text.startswith("[API_ERROR") or text=="[EMPTY]":
347
  return None
348
  cleaned = _strip_think(text)
349
  VALID = {0.0, 0.25, 0.5, 0.75, 1.0}
350
  keys = list(RUBRIC.keys())
351
+ # Method 1: Direct JSON
 
352
  try:
 
353
  t = re.sub(r'^```(?:json)?\s*', '', cleaned.strip())
354
  t = re.sub(r'\s*```$', '', t.strip())
355
  data = json.loads(t)
 
359
  v = float(data["scores"].get(k, 0.5))
360
  scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
361
  return {"scores": scores, "comment": data.get("comment", "ok")}
362
+ except: pass
363
+ # Method 2: Search JSON
 
 
364
  try:
365
  m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', cleaned, re.DOTALL)
366
  if m:
 
371
  v = float(data["scores"].get(k, 0.5))
372
  scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
373
  return {"scores": scores, "comment": data.get("comment", "parsed")}
374
+ except: pass
375
+ # Method 3: Regex
 
 
376
  try:
377
  sc = {}
378
  for k in keys:
379
  m2 = re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)', cleaned, re.IGNORECASE)
380
  if m2:
381
  v = float(m2.group(1))
382
+ if 0 <= v <= 1: sc[k] = min(VALID, key=lambda x, v=v: abs(x - v))
 
383
  if len(sc) >= 3:
384
  for k in keys:
385
  if k not in sc: sc[k] = 0.5
386
  return {"scores": sc, "comment": "regex_parsed"}
387
+ except: pass
 
 
388
  return None
389
 
 
390
def call_judge(prompt, system, api_key, model_id, provider, temperature=0.1, max_tokens=2048):
    """Ask the judge model to score a response and parse its JSON verdict.

    Strategy per provider:
      * OpenAI    — structured output (JSON schema) first, then a plain
                    json_mode retry if parsing fails.
      * Anthropic — single plain call.
      * Google    — json_mode first, free-form retry as fallback; the
                    temperature argument is ignored and pinned to 1.0
                    for thinking-model compatibility.

    Returns the dict produced by _parse_judge_json, or None on parse
    failure / unknown provider.
    """
    if provider == "OpenAI":
        strict = call_openai(prompt, system=system, api_key=api_key, model=model_id,
                             max_tokens=max_tokens, temperature=temperature,
                             json_schema=JUDGE_SCHEMA)
        verdict = _parse_judge_json(strict)
        if verdict:
            return verdict
        loose = call_openai(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=temperature,
                            json_mode=True)
        return _parse_judge_json(loose)

    if provider == "Anthropic":
        reply = call_anthropic(prompt, system=system, api_key=api_key, model=model_id,
                               max_tokens=max_tokens, temperature=temperature)
        return _parse_judge_json(reply)

    if provider == "Google":
        # Gemini judge also runs at temperature=1.0 (thinking-model compatible).
        structured = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
                                 max_tokens=max_tokens, temperature=1.0, json_mode=True)
        verdict = _parse_judge_json(structured)
        if verdict:
            return verdict
        freeform = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
                               max_tokens=max_tokens, temperature=1.0, json_mode=False)
        return _parse_judge_json(freeform)

    return None
413
 
414
  # โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
 
471
  c=sqlite3.connect(DB_PATH)
472
  c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))")
473
  c.commit(); c.close()
474
+ def _make_run_id(m): return hashlib.md5(f"FINALv42_BL_{m}".encode()).hexdigest()[:12]
475
def _save_result(rid, tid, resp, jresp, sc):
    """Upsert one task result row (run_id, task_id) into eval_results.

    Fix: the original leaked the sqlite3 connection if execute/commit
    raised; the connection is now always closed via try/finally.
    """
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute(
            "INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",
            (rid, tid, resp, jresp, sc, time.time()),
        )
        c.commit()
    finally:
        c.close()
477
def _load_all(rid):
    """Load cached results for a run id, dropping failed rows.

    Rows with score <= 0 whose response is an API error / blocked /
    empty / internal-error marker are excluded so those tasks are
    retried on the next run instead of being served from cache.

    Fix: the original leaked the sqlite3 connection if the SELECT
    raised; the connection is now closed via try/finally.

    Returns: {task_id: {"response": str, "judge": str, "score": float}}
    """
    c = sqlite3.connect(DB_PATH)
    try:
        cur = c.execute(
            "SELECT task_id,model_response,judge_response,weighted_score "
            "FROM eval_results WHERE run_id=?", (rid,))
        rows = cur.fetchall()
    finally:
        c.close()
    failure_prefixes = ("[API_ERROR", "[BLOCKED", "[ERROR")
    result = {}
    for task_id, resp, judge, score in rows:
        resp = resp or ""
        # Failed attempts are skipped here so they get retried later.
        if score <= 0 and (resp.startswith(failure_prefixes) or resp == "[EMPTY]"):
            continue
        result[task_id] = {"response": resp, "judge": judge, "score": score}
    return result
491
def _clear_run(rid):
    """Delete every cached eval_results row belonging to the given run id.

    Fix: the original leaked the sqlite3 connection if execute/commit
    raised; the connection is now always closed via try/finally.
    """
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute("DELETE FROM eval_results WHERE run_id=?", (rid,))
        c.commit()
    finally:
        c.close()
493
  _init_db()
 
552
  info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
553
  gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
554
  if t.task_id in results:
555
+ d=results[t.task_id]; s=d["score"]; resp=d.get("response","")
556
  if s<0:
557
+ rows+=f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">โŒ JF</td><td>โ€”</td></tr>'
558
+ elif s==0 and resp and (resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED") or resp=="[EMPTY]"):
559
+ # โ˜… API ์—๋Ÿฌ๋ฅผ ๋ช…ํ™•ํ•˜๊ฒŒ ํ‘œ์‹œ
560
+ err_short=html.escape(resp[:60])
561
+ rows+=f'<tr style="background:#ffebee"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td colspan="2" style="color:#c62828;font-size:0.75em">๐Ÿšซ {err_short}</td></tr>'
562
  else:
563
  c=_sc(s)
564
  rows+=f'<tr><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
 
581
  if gs: a=np.mean(gs); gh+=f'<span style="margin-right:14px">{g}ร—{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
582
  done=sum(1 for t in tasks if t.task_id in results)
583
  jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
584
+ # API errors
585
+ api_errs=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]==0 and (results[t.task_id].get("response","") or "").startswith("["))
586
  # MA-ER Gap
587
  ma_vals,er_vals=[],[]
588
  for tid,d in results.items():
 
594
  if "error_recovery" in sc: er_vals.append(float(sc["error_recovery"]))
595
  except: pass
596
  avg_ma=np.mean(ma_vals) if ma_vals else 0; avg_er=np.mean(er_vals) if er_vals else 0
597
+ gap=avg_ma-avg_er; gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
 
598
  gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
 
599
  ad=[t.domain for t in tasks if t.grade=="A"]
600
  asc_vals=[dom_avgs[d] for d in set(ad) if d in dom_avgs]
601
  aa=np.mean(asc_vals) if asc_vals else 0
602
  checks=[("Scoreโ‰ฅ80",final>=80),("Axesโ‰ฅ60",all(v>=60 for v in axis.values())),(f"A-avgโ‰ฅ75({aa:.0f})",aa>=75)]
603
  ch="".join([f'<span style="margin-right:8px">{"โœ…" if ok else "โŒ"}{lb}</span>' for lb,ok in checks])
604
+ err_html=f'<div style="color:#ff5722;font-size:0.82em;margin-top:4px">โš ๏ธ API Errors: {api_errs} tasks</div>' if api_errs else ""
605
  return f"""{CSS}<div class="summary-card">
606
  <div style="text-align:center">
607
  <div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div>
608
  <h2 style="margin:6px 0;font-size:1.6em">๐Ÿค– Baseline FINAL: {final:.1f}</h2>
609
  <p style="color:#aaa;font-size:0.85em">{stage['label']} ยท Base {base:.1f} ร— HAR {har_p:.3f} ยท {done}/{len(tasks)}{f" ยท JF={jf}" if jf else ""}</p>
610
  <p style="color:#8af;font-size:0.82em;margin:4px 0">Eval: {eval_label} ยท Judge: {judge_label}</p>
611
+ {err_html}
612
  </div><hr style="border-color:#333;margin:12px 0">
613
  <h4 style="color:#aaa;margin:6px 0">๐ŸŽฏ 5-Axis Scores</h4>{ax_html}
614
  <hr style="border-color:#333;margin:10px 0">
 
649
  def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
650
  judge_api_key, judge_model_id, judge_provider, state):
651
  try:
 
652
  sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
653
  f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. "
654
  f"If unsure, say so honestly.")
655
+ print(f" โ–ถ {task.task_id} โ†’ {eval_provider}/{eval_model_id}")
656
  model_response=call_model(task.prompt,system=sys_p,api_key=eval_api_key,
657
  model_id=eval_model_id,provider=eval_provider,max_tokens=12288)
658
+ if (model_response.startswith("[API_ERROR") or
659
+ model_response.startswith("[BLOCKED") or
660
+ model_response=="[EMPTY]"):
661
+ print(f" โœ— {task.task_id}: {model_response[:100]}")
662
+ # โ˜… API ์—๋Ÿฌ๋Š” ์ €์žฅํ•˜๋˜, _load_all์—์„œ ์ž๋™ ์ œ์™ธ๋จ
663
  _save_result(run_id,task.task_id,model_response,"{}",0)
664
+ with state["lock"]:
665
+ state["done"]+=1
666
+ state["errors"].append(f"{task.task_id}: {model_response[:80]}")
667
  return task.task_id,{"response":model_response,"judge":"{}","score":0}
668
 
669
+ print(f" โœ“ {task.task_id} response len={len(model_response)}")
670
  jp = build_judge_prompt(task, model_response)
671
  jd = call_judge(jp, system=JUDGE_SYSTEM, api_key=judge_api_key,
672
  model_id=judge_model_id, provider=judge_provider)
 
689
  if len(state["active"])>10: state["active"]=state["active"][-10:]
690
  return task.task_id,{"response":model_response,"judge":jj,"score":ws}
691
  except Exception as e:
692
+ print(f" โœ— {task.task_id} EXCEPTION: {e}")
693
  with state["lock"]: state["done"]+=1; state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
694
  _save_result(run_id,task.task_id,f"[ERROR] {e}","{}",0)
695
  return task.task_id,{"response":f"[ERROR] {e}","judge":"{}","score":0}
 
728
  ac=state.get("active",[])
729
  if ac: o+='<div style="margin-top:8px">๐Ÿ”„ '+" ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em">{a}</span>' for a in ac[-8:]])+'</div>'
730
  er=state.get("errors",[])
731
+ if er:
732
+ o+=f'<div style="color:#c62828;margin-top:6px;font-size:0.8em;max-height:120px;overflow-y:auto">'
733
+ for e in er[-6:]:
734
+ o+=f'<div>โš ๏ธ {html.escape(e[:100])}</div>'
735
+ o+='</div>'
736
  return o+'</div>'
737
 
738
  def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
 
743
  with _EVAL_STATE["lock"]:
744
  _EVAL_STATE["start_time"]=time.time()
745
  _EVAL_STATE["message"]=f"โšก Eval: {eval_label} ยท Judge: {judge_label} ยท {len(tasks)} tasks"
746
+ # โ˜… _load_all์€ ์ด์ œ ์‹คํŒจ ๊ฒฐ๊ณผ๋ฅผ ์ž๋™ ์ œ์™ธํ•จ
747
  results=dict(_load_all(run_id))
748
  cached=sum(1 for t in tasks if t.task_id in results)
749
  pending=[t for t in tasks if t.task_id not in results]
750
+ print(f" ๐Ÿ“Š Cached (valid): {cached} / Pending: {len(pending)} / Total: {len(tasks)}")
751
  gt={}
752
  for t in pending: gt.setdefault(t.grade,[]).append(t)
753
  with _EVAL_STATE["lock"]:
 
791
  _EVAL_STATE["message"]=f"๐Ÿ {stage['name']} โ€” FINAL={final:.1f} ยท {elapsed}s"
792
  _EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
793
  except Exception as e:
794
+ print(f" โŒ Fatal: {e}")
795
+ import traceback; traceback.print_exc()
796
  with _EVAL_STATE["lock"]:
797
  _EVAL_STATE["message"]=f"โŒ Fatal: {str(e)[:100]}"
798
  _EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
 
861
 
862
  HEADER = """
863
  <div style="text-align:center;padding:16px 0">
864
+ <h1 style="margin:0;font-size:1.8em">๐Ÿ† FINAL Bench v4.2 โ€” Baseline Evaluation</h1>
865
  <h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
866
  <p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto">
867
  <b>100 Tasks ยท 15 Domains ยท 8 TICOS ยท 5-Axis ยท 5-Stage AGI Grade</b><br>
 
871
  <div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
872
  <span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI ยท GPT-5.2 / 5-Mini / 4.1 / o4-mini / 4o</span>
873
  <span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic ยท Opus 4.6 / Sonnet 4.5 / Haiku 4.5</span>
874
+ <span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google ยท Gemini 2.5 Flash / 2.5 Pro / 2.0 Flash</span>
875
  </div>
876
  <div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
877
  <p style="color:#e94560;font-size:0.85em;margin:0">๐Ÿ”’ <b>MetaCog (Self-Correction Protocol): COMING SOON</b></p>
878
+ </div>
 
 
879
  <div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
880
  <a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐Ÿ“Š Dataset</a>
881
  <a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐Ÿ† Leaderboard</a>
882
  </div></div>"""
883
 
884
  def create_app():
885
+ with gr.Blocks(title="FINAL Bench v4.2",
886
  css=".gradio-container{max-width:1100px !important} header{display:none!important}") as app:
887
  gr.HTML(HEADER)
888
 
 
889
  gr.Markdown("### ๐Ÿ”‘ API Keys")
890
+ gr.HTML('<p style="color:#888;font-size:0.82em;margin:0 0 6px 0">Enter the API key matching each model\'s provider. Same key OK if both use same provider.</p>')
891
  with gr.Row():
892
  eval_api_key=gr.Textbox(label="๐Ÿค– Eval Model API Key",type="password",
893
  placeholder="sk-... / sk-ant-... / AIza...",
894
+ info="OpenAI / Anthropic / Google key for eval",scale=3)
895
  judge_api_key=gr.Textbox(label="โš–๏ธ Judge Model API Key",type="password",
896
  placeholder="sk-... / sk-ant-... / AIza...",
897
+ info="OpenAI / Anthropic / Google key for judge",scale=3)
898
 
 
899
  gr.Markdown("### ๐Ÿค– Model Selection")
900
  with gr.Row():
901
  eval_m=gr.Dropdown(label="๐Ÿค– Evaluation Target",choices=MODEL_CHOICES,
902
+ value=DEFAULT_EVAL,info="Model to evaluate",scale=3)
 
903
  judge_m=gr.Dropdown(label="โš–๏ธ Judge Model",choices=MODEL_CHOICES,
904
+ value=DEFAULT_JUDGE,info="Model that scores responses",scale=3)
 
905
 
 
906
  gr.Markdown("### โš™๏ธ Settings")
907
  with gr.Row():
908
  gf=gr.Dropdown(["All","A","B","C"],value="All",label="Grade Filter",scale=1)
 
910
  mt=gr.Slider(1,100,value=100,step=1,label="Max Tasks",scale=1)
911
  nw=gr.Slider(1,10,value=5,step=1,label="Workers",scale=1)
912
 
 
913
  with gr.Row():
914
  s_btn=gr.Button("โ–ถ๏ธ Start (Resume)",variant="primary",size="lg",scale=2)
915
  f_btn=gr.Button("๐Ÿš€ Fresh Start",variant="secondary",size="lg",scale=2)
916
  x_btn=gr.Button("โน๏ธ Stop",variant="stop",size="lg",scale=1)
917
+ status=gr.Textbox(label="Status",interactive=False,max_lines=2)
918
 
 
919
  with gr.Tabs():
920
  with gr.Tab("๐Ÿ“Š Progress"): p_html=gr.HTML()
921
  with gr.Tab("๐Ÿ“‹ Results"): t_html=gr.HTML()
 
931
  f_btn.click(fn=lambda *a:_start_eval(*a,fresh=True),inputs=eval_ins,outputs=[status])
932
  x_btn.click(fn=_stop,outputs=[status])
933
 
 
 
 
 
 
 
 
934
  if __name__=="__main__":
935
  sg,sd={},{}
936
  for t in ALL_TASKS: sg[t.grade]=sg.get(t.grade,0)+1; sd[t.domain]=sd.get(t.domain,0)+1
937
+ print(f"\n{'='*60}\n FINAL Bench v4.2 โ€” Baseline (Non-AGI)\n Eval & Judge: OpenAI / Anthropic / Google\n{'='*60}")
938
  print(f" {len(ALL_TASKS)} tasks | {len(sd)} domains")
939
  for g in ["A","B","C"]: print(f" Grade {g} (ร—{GRADE_WEIGHT[g]}): {sg.get(g,0)}")
940
  print(f" ๐Ÿ”’ MetaCog: COMING SOON\n{'='*60}\n")