Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import json, os, time, csv, io, re, html, hashlib, sqlite3, threading
|
| 2 |
from datetime import datetime
|
| 3 |
from dataclasses import dataclass, field
|
| 4 |
from typing import List, Dict
|
|
@@ -99,10 +99,10 @@ def load_tasks():
|
|
| 99 |
raise FileNotFoundError("Dataset not found!")
|
| 100 |
|
| 101 |
ALL_TASKS = load_tasks()
|
| 102 |
-
print(f"โ
FINAL Bench v4.
|
| 103 |
|
| 104 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 105 |
-
# ยง3. Multi-Provider Model Registry
|
| 106 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 107 |
|
| 108 |
PROVIDER_MODELS = {
|
|
@@ -119,13 +119,12 @@ PROVIDER_MODELS = {
|
|
| 119 |
"claude-haiku-4-5-20251001": "Claude Haiku 4.5",
|
| 120 |
},
|
| 121 |
"Google": {
|
| 122 |
-
"gemini-
|
| 123 |
-
"gemini-2.5-pro":
|
| 124 |
-
"gemini-2.
|
| 125 |
},
|
| 126 |
}
|
| 127 |
|
| 128 |
-
# Build unified model list โ used for BOTH eval and judge dropdowns
|
| 129 |
ALL_MODELS = {}
|
| 130 |
for prov, models in PROVIDER_MODELS.items():
|
| 131 |
for mid, label in models.items():
|
|
@@ -136,7 +135,6 @@ DEFAULT_EVAL = "GPT-5.2 (flagship) [OpenAI]"
|
|
| 136 |
DEFAULT_JUDGE = "GPT-5.2 (flagship) [OpenAI]"
|
| 137 |
|
| 138 |
def _resolve_model(choice):
|
| 139 |
-
"""Resolve dropdown choice โ (model_id, provider)"""
|
| 140 |
info = ALL_MODELS.get(choice, {})
|
| 141 |
return info.get("id", "gpt-5.2"), info.get("provider", "OpenAI")
|
| 142 |
|
|
@@ -150,7 +148,7 @@ def _strip_think(text):
|
|
| 150 |
text = re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL)
|
| 151 |
return text.strip()
|
| 152 |
|
| 153 |
-
#
|
| 154 |
def call_openai(prompt, system="", api_key="", model="gpt-5.2",
|
| 155 |
max_tokens=8192, temperature=0.6, reasoning_effort=None,
|
| 156 |
json_mode=False, json_schema=None):
|
|
@@ -171,19 +169,20 @@ def call_openai(prompt, system="", api_key="", model="gpt-5.2",
|
|
| 171 |
try:
|
| 172 |
r=requests.post("https://api.openai.com/v1/chat/completions",
|
| 173 |
headers=headers,data=json.dumps(payload),timeout=300)
|
| 174 |
-
|
| 175 |
-
|
| 176 |
return _strip_think(c) if c else "[EMPTY]"
|
| 177 |
except requests.exceptions.HTTPError:
|
|
|
|
| 178 |
try: err=r.json().get("error",{}).get("message","")
|
| 179 |
except: err=str(r.status_code)
|
| 180 |
if attempt<2: time.sleep(3*(attempt+1)); continue
|
| 181 |
-
return f"[API_ERROR] {err}"
|
| 182 |
except Exception as e:
|
| 183 |
if attempt<2: time.sleep(3*(attempt+1))
|
| 184 |
else: return f"[API_ERROR] {e}"
|
| 185 |
|
| 186 |
-
#
|
| 187 |
def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
|
| 188 |
max_tokens=8192, temperature=0.6):
|
| 189 |
headers={
|
|
@@ -198,8 +197,6 @@ def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
|
|
| 198 |
try:
|
| 199 |
r=requests.post("https://api.anthropic.com/v1/messages",
|
| 200 |
headers=headers,data=json.dumps(payload),timeout=300)
|
| 201 |
-
if r.status_code==429: time.sleep(5*(attempt+1)); continue
|
| 202 |
-
if r.status_code==529: time.sleep(8*(attempt+1)); continue
|
| 203 |
r.raise_for_status()
|
| 204 |
resp=r.json()
|
| 205 |
text_parts=[]
|
|
@@ -209,69 +206,92 @@ def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
|
|
| 209 |
c="\n".join(text_parts)
|
| 210 |
return _strip_think(c) if c else "[EMPTY]"
|
| 211 |
except requests.exceptions.HTTPError:
|
|
|
|
|
|
|
| 212 |
try: err=r.json().get("error",{}).get("message","")
|
| 213 |
except: err=str(r.status_code)
|
| 214 |
-
|
| 215 |
-
return f"[API_ERROR] {err}"
|
| 216 |
except Exception as e:
|
| 217 |
if attempt<2: time.sleep(3*(attempt+1))
|
| 218 |
else: return f"[API_ERROR] {e}"
|
| 219 |
|
| 220 |
-
#
|
| 221 |
GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta"
|
| 222 |
|
| 223 |
-
def call_gemini(prompt, system="", api_key="", model="gemini-
|
| 224 |
max_tokens=8192, temperature=1.0, json_mode=False):
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
}
|
| 230 |
-
contents=[{"role":"user","parts":[{"text":prompt}]}]
|
| 231 |
-
gen_config={"maxOutputTokens":max_tokens,"temperature":temperature}
|
| 232 |
-
payload={"contents":contents,"generationConfig":gen_config}
|
| 233 |
if system:
|
| 234 |
-
payload["systemInstruction"]={"parts":[{"text":system}]}
|
| 235 |
if json_mode:
|
| 236 |
-
gen_config["responseMimeType"]="application/json"
|
| 237 |
for attempt in range(3):
|
| 238 |
try:
|
| 239 |
-
r=requests.post(url,headers=headers,data=json.dumps(payload),timeout=300)
|
| 240 |
-
|
| 241 |
-
if r.status_code==503: time.sleep(8*(attempt+1)); continue
|
| 242 |
r.raise_for_status()
|
| 243 |
-
data=r.json()
|
| 244 |
-
candidates=data.get("candidates",[])
|
| 245 |
if not candidates:
|
| 246 |
-
block_reason=data.get("promptFeedback",{}).get("blockReason","UNKNOWN")
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
| 250 |
for p in parts:
|
| 251 |
if "text" in p:
|
| 252 |
-
if p.get("thought",False):
|
|
|
|
| 253 |
result.append(p["text"])
|
| 254 |
-
c="\n".join(result) if result else ""
|
| 255 |
return _strip_think(c) if c else "[EMPTY]"
|
| 256 |
except requests.exceptions.HTTPError:
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
return f"[API_ERROR] Gemini {r.status_code}: {err}"
|
| 261 |
except Exception as e:
|
| 262 |
-
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
-
#
|
| 266 |
def call_model(prompt, system="", api_key="", model_id="gpt-5.2",
|
| 267 |
provider="OpenAI", max_tokens=8192, temperature=0.6):
|
| 268 |
-
if provider=="OpenAI":
|
| 269 |
-
|
| 270 |
-
elif provider=="
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
return f"[API_ERROR] Unknown provider: {provider}"
|
| 272 |
|
| 273 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 274 |
-
# ยง5. Judge โ Multi-Provider
|
| 275 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 276 |
|
| 277 |
JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
|
|
@@ -292,9 +312,8 @@ G_PivotDetection: Found reversing premise? H_DecisionUnderUncertainty: Scenario
|
|
| 292 |
|
| 293 |
STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure
|
| 294 |
|
| 295 |
-
IMPORTANT: Output ONLY valid JSON with NO extra text
|
| 296 |
-
{"scores":{"process_quality":X,"metacognitive_accuracy":X,"error_recovery":X,"integration_depth":X,"final_correctness":X},"comment":"<50 words>"}
|
| 297 |
-
Where each X is one of: 0.0, 0.25, 0.5, 0.75, 1.0"""
|
| 298 |
|
| 299 |
def _build_judge_schema():
|
| 300 |
sp={k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC}
|
|
@@ -323,18 +342,14 @@ Score: process_quality, metacognitive_accuracy, error_recovery, integration_dept
|
|
| 323 |
Apply {task.ticos_type} bonus criteria.
|
| 324 |
Output ONLY JSON: {{"scores":{{...}},"comment":"..."}}"""
|
| 325 |
|
| 326 |
-
|
| 327 |
def _parse_judge_json(text):
|
| 328 |
-
"""Parse judge response โ dict with scores, works for all providers"""
|
| 329 |
if not text or text.startswith("[API_ERROR") or text=="[EMPTY]":
|
| 330 |
return None
|
| 331 |
cleaned = _strip_think(text)
|
| 332 |
VALID = {0.0, 0.25, 0.5, 0.75, 1.0}
|
| 333 |
keys = list(RUBRIC.keys())
|
| 334 |
-
|
| 335 |
-
# Method 1: Direct JSON parse
|
| 336 |
try:
|
| 337 |
-
# Strip markdown fences
|
| 338 |
t = re.sub(r'^```(?:json)?\s*', '', cleaned.strip())
|
| 339 |
t = re.sub(r'\s*```$', '', t.strip())
|
| 340 |
data = json.loads(t)
|
|
@@ -344,10 +359,8 @@ def _parse_judge_json(text):
|
|
| 344 |
v = float(data["scores"].get(k, 0.5))
|
| 345 |
scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 346 |
return {"scores": scores, "comment": data.get("comment", "ok")}
|
| 347 |
-
except:
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
# Method 2: Find JSON object in text
|
| 351 |
try:
|
| 352 |
m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', cleaned, re.DOTALL)
|
| 353 |
if m:
|
|
@@ -358,62 +371,44 @@ def _parse_judge_json(text):
|
|
| 358 |
v = float(data["scores"].get(k, 0.5))
|
| 359 |
scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 360 |
return {"scores": scores, "comment": data.get("comment", "parsed")}
|
| 361 |
-
except:
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
# Method 3: Regex extraction
|
| 365 |
try:
|
| 366 |
sc = {}
|
| 367 |
for k in keys:
|
| 368 |
m2 = re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)', cleaned, re.IGNORECASE)
|
| 369 |
if m2:
|
| 370 |
v = float(m2.group(1))
|
| 371 |
-
if 0 <= v <= 1:
|
| 372 |
-
sc[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 373 |
if len(sc) >= 3:
|
| 374 |
for k in keys:
|
| 375 |
if k not in sc: sc[k] = 0.5
|
| 376 |
return {"scores": sc, "comment": "regex_parsed"}
|
| 377 |
-
except:
|
| 378 |
-
pass
|
| 379 |
-
|
| 380 |
return None
|
| 381 |
|
| 382 |
-
|
| 383 |
def call_judge(prompt, system, api_key, model_id, provider, temperature=0.1, max_tokens=2048):
|
| 384 |
-
"""โ
Universal Judge โ routes to correct provider with JSON enforcement"""
|
| 385 |
-
|
| 386 |
if provider == "OpenAI":
|
| 387 |
-
# OpenAI: use structured output (best quality)
|
| 388 |
raw = call_openai(prompt, system=system, api_key=api_key, model=model_id,
|
| 389 |
-
max_tokens=max_tokens, temperature=temperature,
|
| 390 |
-
json_schema=JUDGE_SCHEMA)
|
| 391 |
result = _parse_judge_json(raw)
|
| 392 |
-
if result:
|
| 393 |
-
return result
|
| 394 |
-
# Fallback: try without structured output
|
| 395 |
raw2 = call_openai(prompt, system=system, api_key=api_key, model=model_id,
|
| 396 |
max_tokens=max_tokens, temperature=temperature, json_mode=True)
|
| 397 |
return _parse_judge_json(raw2)
|
| 398 |
-
|
| 399 |
elif provider == "Anthropic":
|
| 400 |
-
# Anthropic: prompt-based JSON enforcement
|
| 401 |
raw = call_anthropic(prompt, system=system, api_key=api_key, model=model_id,
|
| 402 |
max_tokens=max_tokens, temperature=temperature)
|
| 403 |
return _parse_judge_json(raw)
|
| 404 |
-
|
| 405 |
elif provider == "Google":
|
| 406 |
-
#
|
| 407 |
raw = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
|
| 408 |
-
max_tokens=max_tokens, temperature=
|
| 409 |
result = _parse_judge_json(raw)
|
| 410 |
-
if result:
|
| 411 |
-
return result
|
| 412 |
-
# Fallback without json_mode
|
| 413 |
raw2 = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
|
| 414 |
-
max_tokens=max_tokens, temperature=
|
| 415 |
return _parse_judge_json(raw2)
|
| 416 |
-
|
| 417 |
return None
|
| 418 |
|
| 419 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
@@ -476,12 +471,23 @@ def _init_db():
|
|
| 476 |
c=sqlite3.connect(DB_PATH)
|
| 477 |
c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))")
|
| 478 |
c.commit(); c.close()
|
| 479 |
-
def _make_run_id(m): return hashlib.md5(f"
|
| 480 |
def _save_result(rid, tid, resp, jresp, sc):
    """Upsert one task result for a run into the eval_results table.

    Parameters: run id, task id, model response text, judge response JSON
    string, and the weighted score; the row timestamp is time.time().
    """
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute(
            "INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",
            (rid, tid, resp, jresp, sc, time.time()),
        )
        c.commit()
    finally:
        # BUGFIX: previously the connection leaked if execute/commit raised.
        c.close()
|
| 482 |
def _load_all(rid):
|
| 483 |
-
|
| 484 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
def _clear_run(rid):
    """Delete every cached result row belonging to run id *rid*."""
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute("DELETE FROM eval_results WHERE run_id=?", (rid,))
        c.commit()
    finally:
        # BUGFIX: previously the connection leaked if execute/commit raised.
        c.close()
|
| 487 |
_init_db()
|
|
@@ -546,9 +552,13 @@ def _build_progress_table(results, tasks):
|
|
| 546 |
info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
|
| 547 |
gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
|
| 548 |
if t.task_id in results:
|
| 549 |
-
|
| 550 |
if s<0:
|
| 551 |
-
rows+=f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">โ</td><td>โ</td></tr>'
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
else:
|
| 553 |
c=_sc(s)
|
| 554 |
rows+=f'<tr><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
|
|
@@ -571,6 +581,8 @@ def _build_summary_card(results, tasks, eval_label, judge_label, hf_status):
|
|
| 571 |
if gs: a=np.mean(gs); gh+=f'<span style="margin-right:14px">{g}ร{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
|
| 572 |
done=sum(1 for t in tasks if t.task_id in results)
|
| 573 |
jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
|
|
|
|
|
|
|
| 574 |
# MA-ER Gap
|
| 575 |
ma_vals,er_vals=[],[]
|
| 576 |
for tid,d in results.items():
|
|
@@ -582,21 +594,21 @@ def _build_summary_card(results, tasks, eval_label, judge_label, hf_status):
|
|
| 582 |
if "error_recovery" in sc: er_vals.append(float(sc["error_recovery"]))
|
| 583 |
except: pass
|
| 584 |
avg_ma=np.mean(ma_vals) if ma_vals else 0; avg_er=np.mean(er_vals) if er_vals else 0
|
| 585 |
-
gap=avg_ma-avg_er
|
| 586 |
-
gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
|
| 587 |
gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
|
| 588 |
-
# Pass checks
|
| 589 |
ad=[t.domain for t in tasks if t.grade=="A"]
|
| 590 |
asc_vals=[dom_avgs[d] for d in set(ad) if d in dom_avgs]
|
| 591 |
aa=np.mean(asc_vals) if asc_vals else 0
|
| 592 |
checks=[("Scoreโฅ80",final>=80),("Axesโฅ60",all(v>=60 for v in axis.values())),(f"A-avgโฅ75({aa:.0f})",aa>=75)]
|
| 593 |
ch="".join([f'<span style="margin-right:8px">{"โ
" if ok else "โ"}{lb}</span>' for lb,ok in checks])
|
|
|
|
| 594 |
return f"""{CSS}<div class="summary-card">
|
| 595 |
<div style="text-align:center">
|
| 596 |
<div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div>
|
| 597 |
<h2 style="margin:6px 0;font-size:1.6em">๐ค Baseline FINAL: {final:.1f}</h2>
|
| 598 |
<p style="color:#aaa;font-size:0.85em">{stage['label']} ยท Base {base:.1f} ร HAR {har_p:.3f} ยท {done}/{len(tasks)}{f" ยท JF={jf}" if jf else ""}</p>
|
| 599 |
<p style="color:#8af;font-size:0.82em;margin:4px 0">Eval: {eval_label} ยท Judge: {judge_label}</p>
|
|
|
|
| 600 |
</div><hr style="border-color:#333;margin:12px 0">
|
| 601 |
<h4 style="color:#aaa;margin:6px 0">๐ฏ 5-Axis Scores</h4>{ax_html}
|
| 602 |
<hr style="border-color:#333;margin:10px 0">
|
|
@@ -637,18 +649,24 @@ def _build_detail_view(results, tasks):
|
|
| 637 |
def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
|
| 638 |
judge_api_key, judge_model_id, judge_provider, state):
|
| 639 |
try:
|
| 640 |
-
# 1) Eval model call
|
| 641 |
sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
|
| 642 |
f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. "
|
| 643 |
f"If unsure, say so honestly.")
|
|
|
|
| 644 |
model_response=call_model(task.prompt,system=sys_p,api_key=eval_api_key,
|
| 645 |
model_id=eval_model_id,provider=eval_provider,max_tokens=12288)
|
| 646 |
-
if model_response.startswith("[API_ERROR") or
|
|
|
|
|
|
|
|
|
|
|
|
|
| 647 |
_save_result(run_id,task.task_id,model_response,"{}",0)
|
| 648 |
-
with state["lock"]:
|
|
|
|
|
|
|
| 649 |
return task.task_id,{"response":model_response,"judge":"{}","score":0}
|
| 650 |
|
| 651 |
-
|
| 652 |
jp = build_judge_prompt(task, model_response)
|
| 653 |
jd = call_judge(jp, system=JUDGE_SYSTEM, api_key=judge_api_key,
|
| 654 |
model_id=judge_model_id, provider=judge_provider)
|
|
@@ -671,6 +689,7 @@ def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
|
|
| 671 |
if len(state["active"])>10: state["active"]=state["active"][-10:]
|
| 672 |
return task.task_id,{"response":model_response,"judge":jj,"score":ws}
|
| 673 |
except Exception as e:
|
|
|
|
| 674 |
with state["lock"]: state["done"]+=1; state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
|
| 675 |
_save_result(run_id,task.task_id,f"[ERROR] {e}","{}",0)
|
| 676 |
return task.task_id,{"response":f"[ERROR] {e}","judge":"{}","score":0}
|
|
@@ -709,7 +728,11 @@ def _prog_html(state, pending):
|
|
| 709 |
ac=state.get("active",[])
|
| 710 |
if ac: o+='<div style="margin-top:8px">๐ '+" ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em">{a}</span>' for a in ac[-8:]])+'</div>'
|
| 711 |
er=state.get("errors",[])
|
| 712 |
-
if er:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 713 |
return o+'</div>'
|
| 714 |
|
| 715 |
def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
|
|
@@ -720,9 +743,11 @@ def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
|
|
| 720 |
with _EVAL_STATE["lock"]:
|
| 721 |
_EVAL_STATE["start_time"]=time.time()
|
| 722 |
_EVAL_STATE["message"]=f"โก Eval: {eval_label} ยท Judge: {judge_label} ยท {len(tasks)} tasks"
|
|
|
|
| 723 |
results=dict(_load_all(run_id))
|
| 724 |
cached=sum(1 for t in tasks if t.task_id in results)
|
| 725 |
pending=[t for t in tasks if t.task_id not in results]
|
|
|
|
| 726 |
gt={}
|
| 727 |
for t in pending: gt.setdefault(t.grade,[]).append(t)
|
| 728 |
with _EVAL_STATE["lock"]:
|
|
@@ -766,6 +791,8 @@ def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
|
|
| 766 |
_EVAL_STATE["message"]=f"๐ {stage['name']} โ FINAL={final:.1f} ยท {elapsed}s"
|
| 767 |
_EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
|
| 768 |
except Exception as e:
|
|
|
|
|
|
|
| 769 |
with _EVAL_STATE["lock"]:
|
| 770 |
_EVAL_STATE["message"]=f"โ Fatal: {str(e)[:100]}"
|
| 771 |
_EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
|
|
@@ -834,7 +861,7 @@ def _poll():
|
|
| 834 |
|
| 835 |
HEADER = """
|
| 836 |
<div style="text-align:center;padding:16px 0">
|
| 837 |
-
<h1 style="margin:0;font-size:1.8em">๐ FINAL Bench v4.
|
| 838 |
<h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
|
| 839 |
<p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto">
|
| 840 |
<b>100 Tasks ยท 15 Domains ยท 8 TICOS ยท 5-Axis ยท 5-Stage AGI Grade</b><br>
|
|
@@ -844,45 +871,38 @@ Both <b>Eval</b> and <b>Judge</b> support OpenAI / Anthropic / Google
|
|
| 844 |
<div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
|
| 845 |
<span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI ยท GPT-5.2 / 5-Mini / 4.1 / o4-mini / 4o</span>
|
| 846 |
<span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic ยท Opus 4.6 / Sonnet 4.5 / Haiku 4.5</span>
|
| 847 |
-
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google ยท Gemini
|
| 848 |
</div>
|
| 849 |
<div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
|
| 850 |
<p style="color:#e94560;font-size:0.85em;margin:0">๐ <b>MetaCog (Self-Correction Protocol): COMING SOON</b></p>
|
| 851 |
-
<
|
| 852 |
-
3-Phase Protocol (Initial โ Self-Review โ Correction) โ paper's core contribution.
|
| 853 |
-
</p></div>
|
| 854 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
|
| 855 |
<a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐ Dataset</a>
|
| 856 |
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐ Leaderboard</a>
|
| 857 |
</div></div>"""
|
| 858 |
|
| 859 |
def create_app():
|
| 860 |
-
with gr.Blocks(title="FINAL Bench v4.
|
| 861 |
css=".gradio-container{max-width:1100px !important} header{display:none!important}") as app:
|
| 862 |
gr.HTML(HEADER)
|
| 863 |
|
| 864 |
-
# --- API Keys ---
|
| 865 |
gr.Markdown("### ๐ API Keys")
|
| 866 |
-
gr.HTML('<p style="color:#888;font-size:0.82em;margin:0 0 6px 0">Enter the API key matching each model\'s provider.
|
| 867 |
with gr.Row():
|
| 868 |
eval_api_key=gr.Textbox(label="๐ค Eval Model API Key",type="password",
|
| 869 |
placeholder="sk-... / sk-ant-... / AIza...",
|
| 870 |
-
info="OpenAI / Anthropic / Google key for
|
| 871 |
judge_api_key=gr.Textbox(label="โ๏ธ Judge Model API Key",type="password",
|
| 872 |
placeholder="sk-... / sk-ant-... / AIza...",
|
| 873 |
-
info="OpenAI / Anthropic / Google key for
|
| 874 |
|
| 875 |
-
# --- Model Selection (SAME choices for both) ---
|
| 876 |
gr.Markdown("### ๐ค Model Selection")
|
| 877 |
with gr.Row():
|
| 878 |
eval_m=gr.Dropdown(label="๐ค Evaluation Target",choices=MODEL_CHOICES,
|
| 879 |
-
value=DEFAULT_EVAL,
|
| 880 |
-
info="Model to be evaluated on FINAL Bench tasks",scale=3)
|
| 881 |
judge_m=gr.Dropdown(label="โ๏ธ Judge Model",choices=MODEL_CHOICES,
|
| 882 |
-
value=DEFAULT_JUDGE,
|
| 883 |
-
info="Model that scores the evaluation responses",scale=3)
|
| 884 |
|
| 885 |
-
# --- Settings ---
|
| 886 |
gr.Markdown("### โ๏ธ Settings")
|
| 887 |
with gr.Row():
|
| 888 |
gf=gr.Dropdown(["All","A","B","C"],value="All",label="Grade Filter",scale=1)
|
|
@@ -890,14 +910,12 @@ def create_app():
|
|
| 890 |
mt=gr.Slider(1,100,value=100,step=1,label="Max Tasks",scale=1)
|
| 891 |
nw=gr.Slider(1,10,value=5,step=1,label="Workers",scale=1)
|
| 892 |
|
| 893 |
-
# --- Buttons ---
|
| 894 |
with gr.Row():
|
| 895 |
s_btn=gr.Button("โถ๏ธ Start (Resume)",variant="primary",size="lg",scale=2)
|
| 896 |
f_btn=gr.Button("๐ Fresh Start",variant="secondary",size="lg",scale=2)
|
| 897 |
x_btn=gr.Button("โน๏ธ Stop",variant="stop",size="lg",scale=1)
|
| 898 |
-
status=gr.Textbox(label="Status",interactive=False,max_lines=
|
| 899 |
|
| 900 |
-
# --- Results ---
|
| 901 |
with gr.Tabs():
|
| 902 |
with gr.Tab("๐ Progress"): p_html=gr.HTML()
|
| 903 |
with gr.Tab("๐ Results"): t_html=gr.HTML()
|
|
@@ -913,17 +931,10 @@ def create_app():
|
|
| 913 |
f_btn.click(fn=lambda *a:_start_eval(*a,fresh=True),inputs=eval_ins,outputs=[status])
|
| 914 |
x_btn.click(fn=_stop,outputs=[status])
|
| 915 |
|
| 916 |
-
gr.Markdown("""---
|
| 917 |
-
<center><b>FINAL Bench v4.1</b> โ Baseline (Non-AGI) ยท Multi-Provider Eval & Judge<br>
|
| 918 |
-
100 Tasks ยท 5-Axis ยท 5-Stage ยท OpenAI / Anthropic / Google<br>
|
| 919 |
-
๐ MetaCog (Self-Correction Protocol): <b>COMING SOON</b><br>
|
| 920 |
-
Apache 2.0 ยท <b>Ginigen AI</b> โ Choi Sunyoung</center>""")
|
| 921 |
-
return app
|
| 922 |
-
|
| 923 |
if __name__=="__main__":
|
| 924 |
sg,sd={},{}
|
| 925 |
for t in ALL_TASKS: sg[t.grade]=sg.get(t.grade,0)+1; sd[t.domain]=sd.get(t.domain,0)+1
|
| 926 |
-
print(f"\n{'='*60}\n FINAL Bench v4.
|
| 927 |
print(f" {len(ALL_TASKS)} tasks | {len(sd)} domains")
|
| 928 |
for g in ["A","B","C"]: print(f" Grade {g} (ร{GRADE_WEIGHT[g]}): {sg.get(g,0)}")
|
| 929 |
print(f" ๐ MetaCog: COMING SOON\n{'='*60}\n")
|
|
|
|
| 1 |
+
import json, os, time, csv, io, re, html, hashlib, sqlite3, threading, random
|
| 2 |
from datetime import datetime
|
| 3 |
from dataclasses import dataclass, field
|
| 4 |
from typing import List, Dict
|
|
|
|
| 99 |
raise FileNotFoundError("Dataset not found!")
|
| 100 |
|
| 101 |
ALL_TASKS = load_tasks()
|
| 102 |
+
print(f"โ
FINAL Bench v4.2: {len(ALL_TASKS)} tasks loaded")
|
| 103 |
|
| 104 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 105 |
+
# ยง3. Multi-Provider Model Registry
|
| 106 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 107 |
|
| 108 |
PROVIDER_MODELS = {
|
|
|
|
| 119 |
"claude-haiku-4-5-20251001": "Claude Haiku 4.5",
|
| 120 |
},
|
| 121 |
"Google": {
|
| 122 |
+
"gemini-2.5-flash": "Gemini 2.5 Flash",
|
| 123 |
+
"gemini-2.5-pro": "Gemini 2.5 Pro",
|
| 124 |
+
"gemini-2.0-flash": "Gemini 2.0 Flash",
|
| 125 |
},
|
| 126 |
}
|
| 127 |
|
|
|
|
| 128 |
ALL_MODELS = {}
|
| 129 |
for prov, models in PROVIDER_MODELS.items():
|
| 130 |
for mid, label in models.items():
|
|
|
|
| 135 |
DEFAULT_JUDGE = "GPT-5.2 (flagship) [OpenAI]"
|
| 136 |
|
| 137 |
def _resolve_model(choice):
    """Resolve a model-dropdown choice label to a (model_id, provider) pair.

    Falls back to ("gpt-5.2", "OpenAI") when *choice* is not a known
    ALL_MODELS key (e.g. an empty or stale dropdown value).
    """
    info = ALL_MODELS.get(choice, {})
    return info.get("id", "gpt-5.2"), info.get("provider", "OpenAI")
|
| 140 |
|
|
|
|
| 148 |
text = re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL)
|
| 149 |
return text.strip()
|
| 150 |
|
| 151 |
+
# โโ OpenAI โโ
|
| 152 |
def call_openai(prompt, system="", api_key="", model="gpt-5.2",
|
| 153 |
max_tokens=8192, temperature=0.6, reasoning_effort=None,
|
| 154 |
json_mode=False, json_schema=None):
|
|
|
|
| 169 |
try:
|
| 170 |
r=requests.post("https://api.openai.com/v1/chat/completions",
|
| 171 |
headers=headers,data=json.dumps(payload),timeout=300)
|
| 172 |
+
r.raise_for_status()
|
| 173 |
+
c=r.json()["choices"][0]["message"]["content"]
|
| 174 |
return _strip_think(c) if c else "[EMPTY]"
|
| 175 |
except requests.exceptions.HTTPError:
|
| 176 |
+
if r.status_code==429: time.sleep(5*(attempt+1)); continue
|
| 177 |
try: err=r.json().get("error",{}).get("message","")
|
| 178 |
except: err=str(r.status_code)
|
| 179 |
if attempt<2: time.sleep(3*(attempt+1)); continue
|
| 180 |
+
return f"[API_ERROR] OpenAI {r.status_code}: {err}"
|
| 181 |
except Exception as e:
|
| 182 |
if attempt<2: time.sleep(3*(attempt+1))
|
| 183 |
else: return f"[API_ERROR] {e}"
|
| 184 |
|
| 185 |
# ── Anthropic Claude (same request/retry pattern as the reference code) ──
|
| 186 |
def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
|
| 187 |
max_tokens=8192, temperature=0.6):
|
| 188 |
headers={
|
|
|
|
| 197 |
try:
|
| 198 |
r=requests.post("https://api.anthropic.com/v1/messages",
|
| 199 |
headers=headers,data=json.dumps(payload),timeout=300)
|
|
|
|
|
|
|
| 200 |
r.raise_for_status()
|
| 201 |
resp=r.json()
|
| 202 |
text_parts=[]
|
|
|
|
| 206 |
c="\n".join(text_parts)
|
| 207 |
return _strip_think(c) if c else "[EMPTY]"
|
| 208 |
except requests.exceptions.HTTPError:
|
| 209 |
+
if r.status_code==429: time.sleep(5*(attempt+1)); continue
|
| 210 |
+
if r.status_code==529: time.sleep(8*(attempt+1)); continue
|
| 211 |
try: err=r.json().get("error",{}).get("message","")
|
| 212 |
except: err=str(r.status_code)
|
| 213 |
+
return f"[API_ERROR] Claude {r.status_code}: {err}"
|
|
|
|
| 214 |
except Exception as e:
|
| 215 |
if attempt<2: time.sleep(3*(attempt+1))
|
| 216 |
else: return f"[API_ERROR] {e}"
|
| 217 |
|
| 218 |
+
# ── Google Gemini (100% identical pattern to the reference code) ──
|
| 219 |
GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta"
|
| 220 |
|
| 221 |
+
def call_gemini(prompt, system="", api_key="", model="gemini-2.5-flash",
                max_tokens=8192, temperature=1.0, json_mode=False):
    """Call the Google Gemini generateContent REST API.

    Authenticates with the ``x-goog-api-key`` header, sends the payload as
    ``data=json.dumps(payload)``, and skips "thinking" parts (parts flagged
    ``thought: True``) when assembling the reply text.

    Returns the model text, "[EMPTY]" when no text came back, or an
    "[API_ERROR] ..." string on failure.
    """
    url = f"{GEMINI_API_BASE}/models/{model}:generateContent"
    headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": api_key,
    }
    contents = [{"role": "user", "parts": [{"text": prompt}]}]
    gen_config = {"maxOutputTokens": max_tokens, "temperature": temperature}
    payload = {"contents": contents, "generationConfig": gen_config}
    if system:
        payload["systemInstruction"] = {"parts": [{"text": system}]}
    if json_mode:
        # gen_config is referenced by payload, so this mutation is included.
        gen_config["responseMimeType"] = "application/json"
    for attempt in range(3):
        try:
            r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=300)
            # raise_for_status FIRST; retryable statuses are handled in the
            # HTTPError branch below.
            r.raise_for_status()
            data = r.json()
            candidates = data.get("candidates", [])
            if not candidates:
                block_reason = data.get("promptFeedback", {}).get("blockReason", "UNKNOWN")
                print(f" [Gemini] BLOCKED: {block_reason}")
                return f"[API_ERROR] Gemini BLOCKED: {block_reason}"
            parts = candidates[0].get("content", {}).get("parts", [])
            result = []
            for p in parts:
                if "text" in p:
                    if p.get("thought", False):
                        continue  # skip internal "thinking" parts
                    result.append(p["text"])
            c = "\n".join(result) if result else ""
            return _strip_think(c) if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            # Only 429 (rate limit) / 503 (overloaded) are retried, with
            # jittered backoff; anything else returns an error immediately.
            if r.status_code == 429:
                time.sleep(5 * (attempt + 1) + random.uniform(0, 2))
                continue
            if r.status_code == 503:
                time.sleep(8 * (attempt + 1) + random.uniform(0, 3))
                continue
            try:
                err = r.json().get("error", {}).get("message", "")
            except Exception:  # body may not be JSON
                err = str(r.status_code)
            print(f" [Gemini] ERROR {r.status_code}: {err[:200]}")
            return f"[API_ERROR] Gemini {r.status_code}: {err}"
        except Exception as e:
            print(f" [Gemini] Exception: {e}")
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] Gemini: {e}"
    # BUGFIX: previously fell through and returned None after exhausting all
    # retries on 429/503 — callers expect a string result.
    return "[API_ERROR] Gemini: retries exhausted (rate-limited/overloaded)"
|
| 280 |
|
| 281 |
+
# โโ Unified Dispatcher โโ
|
| 282 |
def call_model(prompt, system="", api_key="", model_id="gpt-5.2",
               provider="OpenAI", max_tokens=8192, temperature=0.6):
    """Dispatch one completion request to the matching provider client.

    Returns the provider's text response, or an "[API_ERROR] ..." string
    when the provider name is not recognized.
    """
    if provider == "OpenAI":
        return call_openai(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Anthropic":
        return call_anthropic(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Google":
        # Gemini "thinking" models are recommended at temperature=1.0, so
        # the caller-supplied temperature is deliberately overridden here.
        return call_gemini(prompt, system, api_key, model_id, max_tokens, temperature=1.0)
    return f"[API_ERROR] Unknown provider: {provider}"
|
| 292 |
|
| 293 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 294 |
+
# ยง5. Judge โ Multi-Provider
|
| 295 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 296 |
|
| 297 |
JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
|
|
|
|
| 312 |
|
| 313 |
STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure
|
| 314 |
|
| 315 |
+
IMPORTANT: Output ONLY valid JSON with NO extra text:
|
| 316 |
+
{"scores":{"process_quality":X,"metacognitive_accuracy":X,"error_recovery":X,"integration_depth":X,"final_correctness":X},"comment":"<50 words>"}"""
|
|
|
|
| 317 |
|
| 318 |
def _build_judge_schema():
|
| 319 |
sp={k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC}
|
|
|
|
| 342 |
Apply {task.ticos_type} bonus criteria.
|
| 343 |
Output ONLY JSON: {{"scores":{{...}},"comment":"..."}}"""
|
| 344 |
|
|
|
|
| 345 |
def _parse_judge_json(text):
|
|
|
|
| 346 |
if not text or text.startswith("[API_ERROR") or text=="[EMPTY]":
|
| 347 |
return None
|
| 348 |
cleaned = _strip_think(text)
|
| 349 |
VALID = {0.0, 0.25, 0.5, 0.75, 1.0}
|
| 350 |
keys = list(RUBRIC.keys())
|
| 351 |
+
# Method 1: Direct JSON
|
|
|
|
| 352 |
try:
|
|
|
|
| 353 |
t = re.sub(r'^```(?:json)?\s*', '', cleaned.strip())
|
| 354 |
t = re.sub(r'\s*```$', '', t.strip())
|
| 355 |
data = json.loads(t)
|
|
|
|
| 359 |
v = float(data["scores"].get(k, 0.5))
|
| 360 |
scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 361 |
return {"scores": scores, "comment": data.get("comment", "ok")}
|
| 362 |
+
except: pass
|
| 363 |
+
# Method 2: Search JSON
|
|
|
|
|
|
|
| 364 |
try:
|
| 365 |
m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', cleaned, re.DOTALL)
|
| 366 |
if m:
|
|
|
|
| 371 |
v = float(data["scores"].get(k, 0.5))
|
| 372 |
scores[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
| 373 |
return {"scores": scores, "comment": data.get("comment", "parsed")}
|
| 374 |
+
except: pass
|
| 375 |
+
# Method 3: Regex
|
|
|
|
|
|
|
| 376 |
try:
|
| 377 |
sc = {}
|
| 378 |
for k in keys:
|
| 379 |
m2 = re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)', cleaned, re.IGNORECASE)
|
| 380 |
if m2:
|
| 381 |
v = float(m2.group(1))
|
| 382 |
+
if 0 <= v <= 1: sc[k] = min(VALID, key=lambda x, v=v: abs(x - v))
|
|
|
|
| 383 |
if len(sc) >= 3:
|
| 384 |
for k in keys:
|
| 385 |
if k not in sc: sc[k] = 0.5
|
| 386 |
return {"scores": sc, "comment": "regex_parsed"}
|
| 387 |
+
except: pass
|
|
|
|
|
|
|
| 388 |
return None
|
| 389 |
|
|
|
|
| 390 |
def call_judge(prompt, system, api_key, model_id, provider, temperature=0.1, max_tokens=2048):
    """Ask the judge model to score a response; return the parsed dict or None.

    OpenAI: structured-output (JSON schema) first, plain JSON mode as fallback.
    Anthropic: single free-form call.
    Google: JSON mode first, free-form fallback; temperature pinned to 1.0
    for thinking-model compatibility.
    """
    if provider == "OpenAI":
        first = call_openai(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=temperature,
                            json_schema=JUDGE_SCHEMA)
        parsed = _parse_judge_json(first)
        if parsed:
            return parsed
        # Schema call failed to parse — retry with plain JSON mode.
        retry = call_openai(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=temperature,
                            json_mode=True)
        return _parse_judge_json(retry)
    if provider == "Anthropic":
        raw = call_anthropic(prompt, system=system, api_key=api_key, model=model_id,
                             max_tokens=max_tokens, temperature=temperature)
        return _parse_judge_json(raw)
    if provider == "Google":
        # Gemini judge is pinned to temperature=1.0 (thinking-model compatible).
        first = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=1.0, json_mode=True)
        parsed = _parse_judge_json(first)
        if parsed:
            return parsed
        retry = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=1.0, json_mode=False)
        return _parse_judge_json(retry)
    return None
|
| 413 |
|
| 414 |
# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
|
|
|
| 471 |
c=sqlite3.connect(DB_PATH)
|
| 472 |
c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))")
|
| 473 |
c.commit(); c.close()
|
| 474 |
+
def _make_run_id(m):
    """Derive a stable 12-character run id from the model identifier."""
    seed = f"FINALv42_BL_{m}".encode()
    return hashlib.md5(seed).hexdigest()[:12]
|
| 475 |
def _save_result(rid, tid, resp, jresp, sc):
    """Upsert one task result row keyed on (run_id, task_id).

    Fix: close the sqlite connection in a ``finally`` block — the original
    one-liner leaked the handle if execute() or commit() raised.
    """
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute("INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",
                  (rid, tid, resp, jresp, sc, time.time()))
        c.commit()
    finally:
        c.close()
|
| 477 |
def _load_all(rid):
    """Load cached results for a run, auto-excluding failed entries.

    Rows whose score is <= 0 AND whose response is an error marker
    (``[API_ERROR``, ``[BLOCKED``, ``[ERROR`` prefix, or exactly ``[EMPTY]``)
    are skipped so those tasks are retried on the next run.

    Returns:
        dict mapping task_id -> {"response", "judge", "score"}.

    Fix: the sqlite connection is now closed in a ``finally`` block — the
    original leaked the handle if the SELECT raised.
    """
    c = sqlite3.connect(DB_PATH)
    try:
        cur = c.execute(
            "SELECT task_id,model_response,judge_response,weighted_score "
            "FROM eval_results WHERE run_id=?", (rid,))
        rows = cur.fetchall()
    finally:
        c.close()
    result = {}
    for task_id, resp, judge, score in rows:
        resp = resp or ""
        # Exclude errored/empty zero-score rows so they get retried later.
        if score <= 0 and (resp == "[EMPTY]" or
                           resp.startswith(("[API_ERROR", "[BLOCKED", "[ERROR"))):
            continue
        result[task_id] = {"response": resp, "judge": judge, "score": score}
    return result
|
| 491 |
def _clear_run(rid):
    """Delete all cached rows for a run id (used by "Fresh Start").

    Fix: close the connection in a ``finally`` block — the original leaked
    the sqlite handle if the DELETE raised.
    """
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute("DELETE FROM eval_results WHERE run_id=?", (rid,))
        c.commit()
    finally:
        c.close()
|
| 493 |
_init_db()
|
|
|
|
| 552 |
info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
|
| 553 |
gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
|
| 554 |
if t.task_id in results:
|
| 555 |
+
d=results[t.task_id]; s=d["score"]; resp=d.get("response","")
|
| 556 |
if s<0:
|
| 557 |
+
rows+=f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">โ JF</td><td>โ</td></tr>'
|
| 558 |
+
elif s==0 and resp and (resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED") or resp=="[EMPTY]"):
|
| 559 |
+
# โ
API ์๋ฌ๋ฅผ ๋ช
ํํ๊ฒ ํ์
|
| 560 |
+
err_short=html.escape(resp[:60])
|
| 561 |
+
rows+=f'<tr style="background:#ffebee"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td colspan="2" style="color:#c62828;font-size:0.75em">๐ซ {err_short}</td></tr>'
|
| 562 |
else:
|
| 563 |
c=_sc(s)
|
| 564 |
rows+=f'<tr><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
|
|
|
|
| 581 |
if gs: a=np.mean(gs); gh+=f'<span style="margin-right:14px">{g}ร{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
|
| 582 |
done=sum(1 for t in tasks if t.task_id in results)
|
| 583 |
jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
|
| 584 |
+
# API errors
|
| 585 |
+
api_errs=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]==0 and (results[t.task_id].get("response","") or "").startswith("["))
|
| 586 |
# MA-ER Gap
|
| 587 |
ma_vals,er_vals=[],[]
|
| 588 |
for tid,d in results.items():
|
|
|
|
| 594 |
if "error_recovery" in sc: er_vals.append(float(sc["error_recovery"]))
|
| 595 |
except: pass
|
| 596 |
avg_ma=np.mean(ma_vals) if ma_vals else 0; avg_er=np.mean(er_vals) if er_vals else 0
|
| 597 |
+
gap=avg_ma-avg_er; gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
|
|
|
|
| 598 |
gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
|
|
|
|
| 599 |
ad=[t.domain for t in tasks if t.grade=="A"]
|
| 600 |
asc_vals=[dom_avgs[d] for d in set(ad) if d in dom_avgs]
|
| 601 |
aa=np.mean(asc_vals) if asc_vals else 0
|
| 602 |
checks=[("Scoreโฅ80",final>=80),("Axesโฅ60",all(v>=60 for v in axis.values())),(f"A-avgโฅ75({aa:.0f})",aa>=75)]
|
| 603 |
ch="".join([f'<span style="margin-right:8px">{"โ
" if ok else "โ"}{lb}</span>' for lb,ok in checks])
|
| 604 |
+
err_html=f'<div style="color:#ff5722;font-size:0.82em;margin-top:4px">โ ๏ธ API Errors: {api_errs} tasks</div>' if api_errs else ""
|
| 605 |
return f"""{CSS}<div class="summary-card">
|
| 606 |
<div style="text-align:center">
|
| 607 |
<div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div>
|
| 608 |
<h2 style="margin:6px 0;font-size:1.6em">๐ค Baseline FINAL: {final:.1f}</h2>
|
| 609 |
<p style="color:#aaa;font-size:0.85em">{stage['label']} ยท Base {base:.1f} ร HAR {har_p:.3f} ยท {done}/{len(tasks)}{f" ยท JF={jf}" if jf else ""}</p>
|
| 610 |
<p style="color:#8af;font-size:0.82em;margin:4px 0">Eval: {eval_label} ยท Judge: {judge_label}</p>
|
| 611 |
+
{err_html}
|
| 612 |
</div><hr style="border-color:#333;margin:12px 0">
|
| 613 |
<h4 style="color:#aaa;margin:6px 0">๐ฏ 5-Axis Scores</h4>{ax_html}
|
| 614 |
<hr style="border-color:#333;margin:10px 0">
|
|
|
|
| 649 |
def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
|
| 650 |
judge_api_key, judge_model_id, judge_provider, state):
|
| 651 |
try:
|
|
|
|
| 652 |
sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
|
| 653 |
f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. "
|
| 654 |
f"If unsure, say so honestly.")
|
| 655 |
+
print(f" โถ {task.task_id} โ {eval_provider}/{eval_model_id}")
|
| 656 |
model_response=call_model(task.prompt,system=sys_p,api_key=eval_api_key,
|
| 657 |
model_id=eval_model_id,provider=eval_provider,max_tokens=12288)
|
| 658 |
+
if (model_response.startswith("[API_ERROR") or
|
| 659 |
+
model_response.startswith("[BLOCKED") or
|
| 660 |
+
model_response=="[EMPTY]"):
|
| 661 |
+
print(f" โ {task.task_id}: {model_response[:100]}")
|
| 662 |
+
# โ
API ์๋ฌ๋ ์ ์ฅํ๋, _load_all์์ ์๋ ์ ์ธ๋จ
|
| 663 |
_save_result(run_id,task.task_id,model_response,"{}",0)
|
| 664 |
+
with state["lock"]:
|
| 665 |
+
state["done"]+=1
|
| 666 |
+
state["errors"].append(f"{task.task_id}: {model_response[:80]}")
|
| 667 |
return task.task_id,{"response":model_response,"judge":"{}","score":0}
|
| 668 |
|
| 669 |
+
print(f" โ {task.task_id} response len={len(model_response)}")
|
| 670 |
jp = build_judge_prompt(task, model_response)
|
| 671 |
jd = call_judge(jp, system=JUDGE_SYSTEM, api_key=judge_api_key,
|
| 672 |
model_id=judge_model_id, provider=judge_provider)
|
|
|
|
| 689 |
if len(state["active"])>10: state["active"]=state["active"][-10:]
|
| 690 |
return task.task_id,{"response":model_response,"judge":jj,"score":ws}
|
| 691 |
except Exception as e:
|
| 692 |
+
print(f" โ {task.task_id} EXCEPTION: {e}")
|
| 693 |
with state["lock"]: state["done"]+=1; state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
|
| 694 |
_save_result(run_id,task.task_id,f"[ERROR] {e}","{}",0)
|
| 695 |
return task.task_id,{"response":f"[ERROR] {e}","judge":"{}","score":0}
|
|
|
|
| 728 |
ac=state.get("active",[])
|
| 729 |
if ac: o+='<div style="margin-top:8px">๐ '+" ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em">{a}</span>' for a in ac[-8:]])+'</div>'
|
| 730 |
er=state.get("errors",[])
|
| 731 |
+
if er:
|
| 732 |
+
o+=f'<div style="color:#c62828;margin-top:6px;font-size:0.8em;max-height:120px;overflow-y:auto">'
|
| 733 |
+
for e in er[-6:]:
|
| 734 |
+
o+=f'<div>โ ๏ธ {html.escape(e[:100])}</div>'
|
| 735 |
+
o+='</div>'
|
| 736 |
return o+'</div>'
|
| 737 |
|
| 738 |
def _bg_eval(eval_api_key, eval_model_id, eval_provider, eval_label,
|
|
|
|
| 743 |
with _EVAL_STATE["lock"]:
|
| 744 |
_EVAL_STATE["start_time"]=time.time()
|
| 745 |
_EVAL_STATE["message"]=f"โก Eval: {eval_label} ยท Judge: {judge_label} ยท {len(tasks)} tasks"
|
| 746 |
+
# โ
_load_all์ ์ด์ ์คํจ ๊ฒฐ๊ณผ๋ฅผ ์๋ ์ ์ธํจ
|
| 747 |
results=dict(_load_all(run_id))
|
| 748 |
cached=sum(1 for t in tasks if t.task_id in results)
|
| 749 |
pending=[t for t in tasks if t.task_id not in results]
|
| 750 |
+
print(f" ๐ Cached (valid): {cached} / Pending: {len(pending)} / Total: {len(tasks)}")
|
| 751 |
gt={}
|
| 752 |
for t in pending: gt.setdefault(t.grade,[]).append(t)
|
| 753 |
with _EVAL_STATE["lock"]:
|
|
|
|
| 791 |
_EVAL_STATE["message"]=f"๐ {stage['name']} โ FINAL={final:.1f} ยท {elapsed}s"
|
| 792 |
_EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
|
| 793 |
except Exception as e:
|
| 794 |
+
print(f" โ Fatal: {e}")
|
| 795 |
+
import traceback; traceback.print_exc()
|
| 796 |
with _EVAL_STATE["lock"]:
|
| 797 |
_EVAL_STATE["message"]=f"โ Fatal: {str(e)[:100]}"
|
| 798 |
_EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
|
|
|
|
| 861 |
|
| 862 |
HEADER = """
|
| 863 |
<div style="text-align:center;padding:16px 0">
|
| 864 |
+
<h1 style="margin:0;font-size:1.8em">๐ FINAL Bench v4.2 โ Baseline Evaluation</h1>
|
| 865 |
<h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
|
| 866 |
<p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto">
|
| 867 |
<b>100 Tasks ยท 15 Domains ยท 8 TICOS ยท 5-Axis ยท 5-Stage AGI Grade</b><br>
|
|
|
|
| 871 |
<div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
|
| 872 |
<span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI ยท GPT-5.2 / 5-Mini / 4.1 / o4-mini / 4o</span>
|
| 873 |
<span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic ยท Opus 4.6 / Sonnet 4.5 / Haiku 4.5</span>
|
| 874 |
+
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google ยท Gemini 2.5 Flash / 2.5 Pro / 2.0 Flash</span>
|
| 875 |
</div>
|
| 876 |
<div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
|
| 877 |
<p style="color:#e94560;font-size:0.85em;margin:0">๐ <b>MetaCog (Self-Correction Protocol): COMING SOON</b></p>
|
| 878 |
+
</div>
|
|
|
|
|
|
|
| 879 |
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
|
| 880 |
<a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐ Dataset</a>
|
| 881 |
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">๐ Leaderboard</a>
|
| 882 |
</div></div>"""
|
| 883 |
|
| 884 |
def create_app():
|
| 885 |
+
with gr.Blocks(title="FINAL Bench v4.2",
|
| 886 |
css=".gradio-container{max-width:1100px !important} header{display:none!important}") as app:
|
| 887 |
gr.HTML(HEADER)
|
| 888 |
|
|
|
|
| 889 |
gr.Markdown("### ๐ API Keys")
|
| 890 |
+
gr.HTML('<p style="color:#888;font-size:0.82em;margin:0 0 6px 0">Enter the API key matching each model\'s provider. Same key OK if both use same provider.</p>')
|
| 891 |
with gr.Row():
|
| 892 |
eval_api_key=gr.Textbox(label="๐ค Eval Model API Key",type="password",
|
| 893 |
placeholder="sk-... / sk-ant-... / AIza...",
|
| 894 |
+
info="OpenAI / Anthropic / Google key for eval",scale=3)
|
| 895 |
judge_api_key=gr.Textbox(label="โ๏ธ Judge Model API Key",type="password",
|
| 896 |
placeholder="sk-... / sk-ant-... / AIza...",
|
| 897 |
+
info="OpenAI / Anthropic / Google key for judge",scale=3)
|
| 898 |
|
|
|
|
| 899 |
gr.Markdown("### ๐ค Model Selection")
|
| 900 |
with gr.Row():
|
| 901 |
eval_m=gr.Dropdown(label="๐ค Evaluation Target",choices=MODEL_CHOICES,
|
| 902 |
+
value=DEFAULT_EVAL,info="Model to evaluate",scale=3)
|
|
|
|
| 903 |
judge_m=gr.Dropdown(label="โ๏ธ Judge Model",choices=MODEL_CHOICES,
|
| 904 |
+
value=DEFAULT_JUDGE,info="Model that scores responses",scale=3)
|
|
|
|
| 905 |
|
|
|
|
| 906 |
gr.Markdown("### โ๏ธ Settings")
|
| 907 |
with gr.Row():
|
| 908 |
gf=gr.Dropdown(["All","A","B","C"],value="All",label="Grade Filter",scale=1)
|
|
|
|
| 910 |
mt=gr.Slider(1,100,value=100,step=1,label="Max Tasks",scale=1)
|
| 911 |
nw=gr.Slider(1,10,value=5,step=1,label="Workers",scale=1)
|
| 912 |
|
|
|
|
| 913 |
with gr.Row():
|
| 914 |
s_btn=gr.Button("โถ๏ธ Start (Resume)",variant="primary",size="lg",scale=2)
|
| 915 |
f_btn=gr.Button("๐ Fresh Start",variant="secondary",size="lg",scale=2)
|
| 916 |
x_btn=gr.Button("โน๏ธ Stop",variant="stop",size="lg",scale=1)
|
| 917 |
+
status=gr.Textbox(label="Status",interactive=False,max_lines=2)
|
| 918 |
|
|
|
|
| 919 |
with gr.Tabs():
|
| 920 |
with gr.Tab("๐ Progress"): p_html=gr.HTML()
|
| 921 |
with gr.Tab("๐ Results"): t_html=gr.HTML()
|
|
|
|
| 931 |
f_btn.click(fn=lambda *a:_start_eval(*a,fresh=True),inputs=eval_ins,outputs=[status])
|
| 932 |
x_btn.click(fn=_stop,outputs=[status])
|
| 933 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 934 |
if __name__ == "__main__":
    # Tally task counts per grade and per domain for the startup banner.
    grade_counts, domain_counts = {}, {}
    for task in ALL_TASKS:
        grade_counts[task.grade] = grade_counts.get(task.grade, 0) + 1
        domain_counts[task.domain] = domain_counts.get(task.domain, 0) + 1
    bar = "=" * 60
    print(f"\n{bar}\n FINAL Bench v4.2 — Baseline (Non-AGI)\n Eval & Judge: OpenAI / Anthropic / Google\n{bar}")
    print(f" {len(ALL_TASKS)} tasks | {len(domain_counts)} domains")
    for grade in ("A", "B", "C"):
        print(f" Grade {grade} (×{GRADE_WEIGHT[grade]}): {grade_counts.get(grade, 0)}")
    print(f" 🔒 MetaCog: COMING SOON\n{bar}\n")
|