Spaces:
Running
Running
"""
FINAL Bench v4.2 — Baseline (Non-AGI) Evaluation System
=========================================================
* Multi-Provider: OpenAI / Anthropic / Google (Gemini 3 Pro Preview)
* Both Eval Model AND Judge Model support all 3 providers
* 100 Tasks · 15 Domains · 8 TICOS Types · 5-Axis · 5-Stage AGI Grade
* Dataset: HuggingFace FINAL-Bench/Metacognitive
Author: Ginigen AI — Choi Sunyoung | License: Apache 2.0
"""
| import json, os, time, csv, io, re, html, hashlib, sqlite3, threading, random | |
| from datetime import datetime | |
| from dataclasses import dataclass, field | |
| from typing import List, Dict | |
| import requests | |
| import numpy as np | |
| import gradio as gr | |
| from concurrent.futures import ThreadPoolExecutor | |
| from datasets import load_dataset | |
# Display metadata for each benchmark domain: an icon string and an accent
# colour (hex) used by the HTML builders.
DOMAIN_INFO = {
    domain: {"icon": icon, "color": color}
    for domain, icon, color in (
        ("Mathematics & Logic", "π’", "#FF6B35"),
        ("Science", "π¬", "#7B2FF7"),
        ("Philosophy", "π€", "#00B4D8"),
        ("Medicine", "π₯", "#2EC4B6"),
        ("Economics", "π", "#E63946"),
        ("History", "π", "#F4A261"),
        ("War & Security", "π‘οΈ", "#264653"),
        ("Space & Physics", "π", "#6C63FF"),
        ("Chemistry & Biology", "π§¬", "#06D6A0"),
        ("Language & Writing", "βοΈ", "#EF476F"),
        ("Literature", "π", "#8338EC"),
        ("Art", "π¨", "#FF006E"),
        ("Religion & Mythology", "ποΈ", "#FFD166"),
        ("Ethics", "βοΈ", "#118AB2"),
        ("AI & Technology", "π€", "#073B4C"),
    )
}
# Multiplier applied to a domain average according to the task grade.
GRADE_WEIGHT = dict(A=1.5, B=1.0, C=0.7)

# The five judge rubrics; the weights sum to 1.0.
RUBRIC = {
    name: {"weight": weight, "desc": desc}
    for name, weight, desc in (
        ("process_quality", 0.25, "Systematic reasoning transparency"),
        ("metacognitive_accuracy", 0.25, "Confidence calibration + uncertainty honesty"),
        ("error_recovery", 0.20, "Mid-analysis self-correction"),
        ("integration_depth", 0.15, "Multi-perspective synthesis"),
        ("final_correctness", 0.15, "Answer accuracy and completeness"),
    )
}

# Each axis averages the listed rubrics; tasks whose TICOS type is listed
# under "ticos" are boosted when the axis score is computed.
AXIS_MAP = {
    "generalization": dict(
        rubrics=["process_quality", "final_correctness"],
        ticos=[],
    ),
    "reasoning": dict(
        rubrics=["process_quality", "error_recovery"],
        ticos=["E_SelfCorrecting", "C_ProgressiveDiscovery"],
    ),
    "planning": dict(
        rubrics=["integration_depth", "process_quality"],
        ticos=["D_MultiConstraint", "H_DecisionUnderUncertainty"],
    ),
    "reliability": dict(
        rubrics=["metacognitive_accuracy"],
        ticos=["E_SelfCorrecting", "G_PivotDetection"],
    ),
    "safety": dict(
        rubrics=["error_recovery", "metacognitive_accuracy"],
        ticos=["A_TrapEscape", "G_PivotDetection"],
    ),
}

# Score bands (inclusive min/max) for the five AGI stages, lowest first.
AGI_STAGES = [
    dict(zip(("stage", "name", "label", "min", "max", "color"), band))
    for band in (
        (1, "FINAL-Partial", "Partial Intelligence", 0, 39, "#f44336"),
        (2, "FINAL-Proto", "Proto Intelligence", 40, 59, "#ff9800"),
        (3, "FINAL-Pre", "Pre-AGI", 60, 79, "#2196f3"),
        (4, "FINAL-Pass", "AGI Achieved", 80, 94, "#4caf50"),
        (5, "FINAL-Post", "Operationally Mature AGI", 95, 100, "#9c27b0"),
    )
]
@dataclass
class FinalTask:
    """One benchmark task record.

    The ``@dataclass`` decorator is required: without it the
    ``field(default_factory=...)`` defaults stay raw ``Field`` objects and
    the keyword construction used by ``load_tasks`` raises ``TypeError``.
    """
    # Identification / classification columns.
    task_id: str
    domain: str
    grade: str
    ticos_type: str
    difficulty: str
    lens: str
    title: str
    # Text sent to the evaluated model.
    prompt: str
    # Reference material embedded into the judge prompt.
    expected_behavior: str
    hidden_trap: str
    # Per-instance containers (default_factory avoids sharing between tasks).
    ticos_required: List[str] = field(default_factory=list)
    metadata: Dict = field(default_factory=dict)
def load_tasks():
    """Load the benchmark tasks from the HuggingFace dataset.

    Reads the ``train`` split of ``FINAL-Bench/Metacognitive`` and converts
    every row into a ``FinalTask``.  ``ticos_required`` may arrive as a list
    or as a string; a string is parsed as JSON first and, failing that, as a
    comma-separated list.

    Returns:
        list of ``FinalTask``.

    Raises:
        FileNotFoundError: when loading or converting the dataset fails; the
            underlying exception is attached as ``__cause__``.
    """
    print("π₯ Loading FINAL-Bench/Metacognitive from HuggingFace...")
    try:
        ds = load_dataset("FINAL-Bench/Metacognitive", split="train")
        tasks = []
        for row in ds:
            tr = row.get("ticos_required", [])
            if isinstance(tr, str):
                try:
                    tr = json.loads(tr)
                except ValueError:
                    # Not JSON (JSONDecodeError is a ValueError): treat the
                    # value as a comma-separated list.
                    tr = [x.strip() for x in tr.split(",") if x.strip()]
            tasks.append(FinalTask(
                task_id=row["task_id"],
                domain=row["domain"],
                grade=row["grade"],
                ticos_type=row["ticos_type"],
                difficulty=row["difficulty"],
                lens=row.get("lens", ""),
                title=row.get("title", row["task_id"]),
                prompt=row["prompt"],
                expected_behavior=row.get("expected_behavior", ""),
                hidden_trap=row.get("hidden_trap", ""),
                # Anything that did not end up as a list is dropped.
                ticos_required=tr if isinstance(tr, list) else [],
                metadata={},
            ))
        print(f" β Loaded {len(tasks)} tasks from HuggingFace")
        return tasks
    except Exception as e:
        print(f" β οΈ HF load failed: {e}")
        # Keep the exception type callers see, but preserve the root cause.
        raise FileNotFoundError("Dataset not found!") from e


# Tasks are loaded once, at import time.
ALL_TASKS = load_tasks()
print(f"β FINAL Bench v4.2: {len(ALL_TASKS)} tasks loaded")
| # βββ Β§3. Model Registry βββ | |
# provider -> {API model id -> human-readable label}
PROVIDER_MODELS = {
    "OpenAI": {
        "gpt-5.2": "GPT-5.2 (flagship)",
        "gpt-5-mini": "GPT-5 Mini",
        "gpt-4.1": "GPT-4.1",
        "o4-mini": "o4-mini (reasoning)",
        "gpt-4o": "GPT-4o",
    },
    "Anthropic": {
        "claude-opus-4-6": "Claude Opus 4.6",
        "claude-sonnet-4-5-20250929": "Claude Sonnet 4.5",
        "claude-haiku-4-5-20251001": "Claude Haiku 4.5",
    },
    "Google": {
        "gemini-3-pro-preview": "Gemini 3 Pro Preview",
    },
}

# Flat lookup keyed by the UI choice string "<label> [<provider>]".
ALL_MODELS = {}
for prov, models in PROVIDER_MODELS.items():
    for mid, label in models.items():
        ALL_MODELS[f"{label} [{prov}]"] = dict(id=mid, provider=prov)

MODEL_CHOICES = [*ALL_MODELS]
DEFAULT_EVAL = "GPT-5.2 (flagship) [OpenAI]"
DEFAULT_JUDGE = "GPT-5.2 (flagship) [OpenAI]"
def _resolve_model(choice):
    """Translate a UI choice string into ``(model_id, provider)``.

    An unknown choice resolves to ``("gpt-5.2", "OpenAI")``.
    """
    try:
        entry = ALL_MODELS[choice]
    except KeyError:
        entry = {}
    model_id = entry.get("id", "gpt-5.2")
    provider = entry.get("provider", "OpenAI")
    return model_id, provider
| # βββ Β§4. API Clients βββ | |
| def _strip_think(text): | |
| if not text:return text | |
| for tag in['think','thinking','reasoning','reflection']: | |
| text=re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL) | |
| return text.strip() | |
def call_openai(prompt, system="", api_key="", model="gpt-5.2",
                max_tokens=8192, temperature=0.6, reasoning_effort=None,
                json_mode=False, json_schema=None):
    """Send one chat-completions request to OpenAI and return the reply text.

    Args:
        prompt: user message.
        system: optional system message (omitted when empty).
        api_key: bearer token.
        model: model id.
        max_tokens: sent as ``max_completion_tokens``.
        temperature: sampling temperature.
        reasoning_effort: forwarded as-is when truthy.
        json_mode: request a ``json_object`` response (ignored when
            ``json_schema`` is given).
        json_schema: request a strict ``json_schema`` response; this also
            forces ``reasoning_effort`` to ``"none"``.

    Returns:
        The reply with think-tags stripped, ``"[EMPTY]"`` for an empty reply,
        or a string starting with ``"[API_ERROR]"`` after up to three failed
        attempts.  Always a string: previously a 429 on the final attempt
        fell out of the loop and returned ``None``.
    """
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    payload = {"model": model, "max_completion_tokens": max_tokens,
               "temperature": temperature, "messages": messages}
    if reasoning_effort:
        payload["reasoning_effort"] = reasoning_effort
    if json_schema:
        payload["reasoning_effort"] = "none"
        payload["response_format"] = {"type": "json_schema", "json_schema": {
            "name": "FINALJudge", "strict": True, "schema": json_schema}}
    elif json_mode:
        payload["response_format"] = {"type": "json_object"}

    max_attempts = 3
    failure = "[API_ERROR] OpenAI: retries exhausted"
    for attempt in range(max_attempts):
        last_attempt = attempt == max_attempts - 1
        try:
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=headers, data=json.dumps(payload), timeout=300)
            r.raise_for_status()
            c = r.json()["choices"][0]["message"]["content"]
            return _strip_think(c) if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            try:
                err = r.json().get("error", {}).get("message", "")
            except (ValueError, AttributeError):
                # Body is not JSON or not the expected shape.
                err = str(r.status_code)
            failure = f"[API_ERROR] OpenAI {r.status_code}: {err}"
            if last_attempt:
                break
            # Rate limits back off longer than other HTTP errors.
            time.sleep((5 if r.status_code == 429 else 3) * (attempt + 1))
        except Exception as e:
            failure = f"[API_ERROR] {e}"
            if last_attempt:
                break
            time.sleep(3 * (attempt + 1))
    return failure
def call_anthropic(prompt, system="", api_key="", model="claude-opus-4-6",
                   max_tokens=8192, temperature=0.6):
    """Send one Messages API request to Anthropic and return the reply text.

    Text blocks of the response are joined with newlines.  HTTP 429 and 529
    are retried (up to three attempts in total); any other HTTP error is
    returned immediately.

    Returns:
        The reply with think-tags stripped, ``"[EMPTY]"`` for an empty reply,
        or a string starting with ``"[API_ERROR]"``.  Always a string:
        previously a 429/529 on the final attempt returned ``None``.
    """
    headers = {"Content-Type": "application/json", "x-api-key": api_key,
               "anthropic-version": "2023-06-01"}
    messages = [{"role": "user", "content": prompt}]
    payload = {"model": model, "max_tokens": max_tokens,
               "temperature": temperature, "messages": messages}
    if system:
        payload["system"] = system

    # status code -> base back-off in seconds (multiplied by the attempt no.)
    retry_base = {429: 5, 529: 8}
    max_attempts = 3
    failure = "[API_ERROR] Claude: retries exhausted"
    for attempt in range(max_attempts):
        last_attempt = attempt == max_attempts - 1
        try:
            r = requests.post("https://api.anthropic.com/v1/messages",
                              headers=headers, data=json.dumps(payload), timeout=300)
            r.raise_for_status()
            resp = r.json()
            text_parts = [block["text"] for block in resp.get("content", [])
                          if block.get("type") == "text"]
            c = "\n".join(text_parts)
            return _strip_think(c) if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            try:
                err = r.json().get("error", {}).get("message", "")
            except (ValueError, AttributeError):
                err = str(r.status_code)
            failure = f"[API_ERROR] Claude {r.status_code}: {err}"
            delay = retry_base.get(r.status_code)
            if delay is None or last_attempt:
                break
            time.sleep(delay * (attempt + 1))
        except Exception as e:
            failure = f"[API_ERROR] {e}"
            if last_attempt:
                break
            time.sleep(3 * (attempt + 1))
    return failure
# Gemini client: x-goog-api-key header, data=json.dumps payload, parts
# flagged as "thought" are skipped.
GEMINI_API_BASE = "https://generativelanguage.googleapis.com/v1beta"


def call_gemini(prompt, system="", api_key="", model="gemini-3-pro-preview",
                max_tokens=8192, temperature=1.0, json_mode=False):
    """Send one ``generateContent`` request to Gemini and return the reply text.

    Text parts of the first candidate are joined with newlines; parts flagged
    ``thought`` are skipped.  HTTP 429 and 503 are retried with jittered
    back-off (up to three attempts in total); any other HTTP error is
    returned immediately.

    Returns:
        The reply with think-tags stripped, ``"[EMPTY]"`` for an empty reply,
        ``"[API_ERROR] Gemini BLOCKED: ..."`` when no candidates come back,
        or another ``"[API_ERROR]"`` string.  Always a string: previously a
        429/503 on the final attempt returned ``None``.
    """
    url = f"{GEMINI_API_BASE}/models/{model}:generateContent"
    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}
    contents = [{"role": "user", "parts": [{"text": prompt}]}]
    gen_config = {"maxOutputTokens": max_tokens, "temperature": temperature}
    payload = {"contents": contents, "generationConfig": gen_config}
    if system:
        payload["systemInstruction"] = {"parts": [{"text": system}]}
    if json_mode:
        gen_config["responseMimeType"] = "application/json"

    # status code -> (base back-off seconds, max jitter seconds)
    retry_plan = {429: (5, 2), 503: (8, 3)}
    max_attempts = 3
    failure = "[API_ERROR] Gemini: retries exhausted"
    for attempt in range(max_attempts):
        last_attempt = attempt == max_attempts - 1
        try:
            r = requests.post(url, headers=headers, data=json.dumps(payload), timeout=300)
            r.raise_for_status()
            data = r.json()
            candidates = data.get("candidates", [])
            if not candidates:
                br = data.get("promptFeedback", {}).get("blockReason", "UNKNOWN")
                return f"[API_ERROR] Gemini BLOCKED: {br}"
            parts = candidates[0].get("content", {}).get("parts", [])
            result = [p["text"] for p in parts
                      if "text" in p and not p.get("thought", False)]
            c = "\n".join(result) if result else ""
            return _strip_think(c) if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            try:
                err = r.json().get("error", {}).get("message", "")
            except (ValueError, AttributeError):
                err = str(r.status_code)
            failure = f"[API_ERROR] Gemini {r.status_code}: {err}"
            plan = retry_plan.get(r.status_code)
            if plan is None or last_attempt:
                print(f" [Gemini] ERROR {r.status_code}: {err[:200]}")
                break
            base, jitter = plan
            time.sleep(base * (attempt + 1) + random.uniform(0, jitter))
        except Exception as e:
            print(f" [Gemini] Exception: {e}")
            failure = f"[API_ERROR] Gemini: {e}"
            if last_attempt:
                break
            time.sleep(3 * (attempt + 1))
    return failure
def call_model(prompt, system="", api_key="", model_id="gpt-5.2",
               provider="OpenAI", max_tokens=8192, temperature=0.6):
    """Route a request to the client for ``provider``.

    Google requests are always sent with ``temperature=1.0`` regardless of
    the ``temperature`` argument.  An unrecognised provider yields an
    ``"[API_ERROR] Unknown provider: ..."`` string.
    """
    if provider == "OpenAI":
        return call_openai(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Anthropic":
        return call_anthropic(prompt, system, api_key, model_id, max_tokens, temperature)
    if provider == "Google":
        return call_gemini(prompt, system, api_key, model_id, max_tokens, temperature=1.0)
    return f"[API_ERROR] Unknown provider: {provider}"
| # βββ Β§5. Judge βββ | |
# System prompt handed to the judge model.  It lists the five rubric names
# with their percentage weights, restricts scores to the five discrete
# values 0.0/0.25/0.5/0.75/1.0, and demands a JSON-only reply with the shape
# {"scores": {...}, "comment": "..."}.
JUDGE_SYSTEM="""You are a FINAL Bench judge for AGI-Level Verification.
Score each rubric using ONLY: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
RUBRIC:
process_quality (25%): Systematic step-by-step reasoning. Complete answers score higher.
metacognitive_accuracy (25%): Confidence calibration. Overconfidence=0.25 max.
error_recovery (20%): EXPLICIT self-correction. Score 0.5+ if ANY self-corrections exist.
integration_depth (15%): Multi-perspective synthesis + emergent insights
final_correctness (15%): Answer accuracy and completeness. INCOMPLETE=0.25 max.
STRICT: 1.0=AGI-worthy 0.75=expert 0.5=competent 0.25=gaps 0.0=failure
Output ONLY valid JSON: {"scores":{"process_quality":X,"metacognitive_accuracy":X,"error_recovery":X,"integration_depth":X,"final_correctness":X},"comment":"<50 words>"}"""
def _build_judge_schema():
    """Build the strict JSON schema describing a judge reply.

    The schema requires a ``scores`` object with one entry per ``RUBRIC`` key
    (each limited to 0.0/0.25/0.5/0.75/1.0) and a string ``comment``; no
    additional properties are allowed at either level.
    """
    score_props = {}
    for rubric_name in RUBRIC:
        score_props[rubric_name] = {"type": "number", "enum": [0.0, 0.25, 0.5, 0.75, 1.0]}
    scores_schema = {
        "type": "object",
        "properties": score_props,
        "required": list(RUBRIC.keys()),
        "additionalProperties": False,
    }
    return {
        "type": "object",
        "properties": {"scores": scores_schema, "comment": {"type": "string"}},
        "required": ["scores", "comment"],
        "additionalProperties": False,
    }


JUDGE_SCHEMA = _build_judge_schema()
def build_judge_prompt(task, response):
    """Assemble the user prompt shown to the judge for one task/response pair.

    The task prompt is cut to 2000 characters, the expected behaviour to 600
    and the response to 17000.  An empty ``hidden_trap`` is rendered as
    ``None``.
    """
    traps = task.hidden_trap or 'None'
    lines = [
        "FINAL Bench Task Evaluation",
        f"Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.difficulty}",
        f"TICOS: {task.ticos_type} | Title: {task.title}",
        "PROMPT:",
        f"{task.prompt[:2000]}",
        "EXPECTED:",
        f"{task.expected_behavior[:600]}",
        f"HIDDEN TRAPS: {traps}",
        "RESPONSE TO JUDGE:",
        f"{response[:17000]}",
        f"Score all 5 rubrics. Apply {task.ticos_type} bonus criteria.",
        'Output ONLY JSON: {"scores":{...},"comment":"..."}',
    ]
    return "\n".join(lines)
def _parse_judge_json(text):
    """Extract rubric scores from a judge reply, tolerating sloppy output.

    Three strategies are tried in order:
      1. parse the whole reply (after removing a Markdown code fence) as JSON;
      2. parse the first ``{... "scores": {...} ...}`` fragment found by regex;
      3. pick up ``rubric_name: number`` pairs by regex; accepted only when at
         least three rubrics are found, the rest defaulting to 0.5.

    Every score is snapped to the nearest of 0.0/0.25/0.5/0.75/1.0 (the lower
    value wins an exact tie).  In strategies 1 and 2 a rubric missing from
    ``scores`` defaults to 0.5.

    Returns:
        ``{"scores": {...}, "comment": str}`` or ``None`` when the reply is
        empty, an error sentinel, or unparseable.
    """
    if not text or text.startswith("[API_ERROR") or text == "[EMPTY]":
        return None
    cleaned = _strip_think(text)
    keys = list(RUBRIC.keys())
    # Ordered tuple (not a set) so tie-breaking is deterministic.
    allowed = (0.0, 0.25, 0.5, 0.75, 1.0)
    # Malformed judge output is expected; anything else should surface.
    parse_errors = (ValueError, TypeError, AttributeError, KeyError)

    def snap(value):
        number = float(value)
        return min(allowed, key=lambda step: abs(step - number))

    def from_payload(data, default_comment):
        # Returns None when the payload lacks a usable "scores" object.
        if not isinstance(data, dict) or not isinstance(data.get("scores"), dict):
            return None
        raw = data["scores"]
        return {"scores": {k: snap(raw.get(k, 0.5)) for k in keys},
                "comment": data.get("comment", default_comment)}

    # Strategy 1: the whole reply is JSON, possibly inside a ``` fence.
    try:
        t = re.sub(r'^```(?:json)?\s*', '', cleaned.strip())
        t = re.sub(r'\s*```$', '', t.strip())
        parsed = from_payload(json.loads(t), "ok")
        if parsed:
            return parsed
    except parse_errors:
        pass

    # Strategy 2: a JSON object embedded in surrounding prose.
    try:
        m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', cleaned, re.DOTALL)
        if m:
            parsed = from_payload(json.loads(m.group()), "parsed")
            if parsed:
                return parsed
    except parse_errors:
        pass

    # Strategy 3: loose "name: value" pairs.
    sc = {}
    for k in keys:
        m2 = re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)', cleaned, re.IGNORECASE)
        if not m2:
            continue
        try:
            v = float(m2.group(1))
        except ValueError:
            # e.g. "..." — skip this rubric instead of discarding the rest.
            continue
        if 0 <= v <= 1:
            sc[k] = snap(v)
    if len(sc) >= 3:
        for k in keys:
            sc.setdefault(k, 0.5)
        return {"scores": sc, "comment": "regex_parsed"}
    return None
def call_judge(prompt, system, api_key, model_id, provider, temperature=0.1, max_tokens=2048):
    """Ask the judge model for scores and return the parsed result.

    OpenAI: strict JSON-schema request first, then a ``json_object`` retry if
    parsing fails.  Anthropic: a single plain request.  Google: JSON mode
    first, then a plain retry; both use ``temperature=1.0``.

    Returns:
        The dict produced by ``_parse_judge_json`` or ``None`` (also for an
        unknown provider).
    """
    if provider == "Anthropic":
        reply = call_anthropic(prompt, system=system, api_key=api_key, model=model_id,
                               max_tokens=max_tokens, temperature=temperature)
        return _parse_judge_json(reply)

    if provider == "OpenAI":
        first = call_openai(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=temperature,
                            json_schema=JUDGE_SCHEMA)
        parsed = _parse_judge_json(first)
        if parsed:
            return parsed
        second = call_openai(prompt, system=system, api_key=api_key, model=model_id,
                             max_tokens=max_tokens, temperature=temperature,
                             json_mode=True)
        return _parse_judge_json(second)

    if provider == "Google":
        first = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
                            max_tokens=max_tokens, temperature=1.0, json_mode=True)
        parsed = _parse_judge_json(first)
        if parsed:
            return parsed
        second = call_gemini(prompt, system=system, api_key=api_key, model=model_id,
                             max_tokens=max_tokens, temperature=1.0, json_mode=False)
        return _parse_judge_json(second)

    return None
| # βββ Β§6. Scoring βββ | |
def compute_task_score(scores):
    """Return the rubric-weighted task score on a 0-100 scale (2 decimals).

    A rubric missing from ``scores`` counts as 0.5.
    """
    weighted = (scores.get(name, 0.5) * spec["weight"] for name, spec in RUBRIC.items())
    total = sum(weighted)
    return round(total * 100, 2)
def compute_axis_scores(results, tasks):
    """Compute the per-axis scores (0-100) defined by ``AXIS_MAP``.

    For every result with a non-negative score whose task is known, the
    rubric values belonging to the axis are averaged; the average is
    multiplied by 1.5 when the task's TICOS type is listed for that axis.
    The axis score is the mean over tasks, times 100, capped at 100.  An
    axis with no usable values scores 0.0.

    Args:
        results: task_id -> {"score": ..., "judge": JSON string or dict, ...}.
        tasks: iterable of task objects with ``task_id`` and ``ticos_type``.

    Returns:
        dict axis name -> score rounded to 2 decimals.
    """
    tm = {t.task_id: t for t in tasks}
    ax = {}
    for an, ai in AXIS_MAP.items():
        vals = []
        for tid, d in results.items():
            if d["score"] < 0:
                continue  # judge failed for this task
            t = tm.get(tid)
            if not t:
                continue
            raw = d.get("judge")
            try:
                jd = json.loads(raw) if isinstance(raw, str) else raw
            except (ValueError, TypeError):
                # Unparseable judge payload: the task contributes nothing.
                jd = None
            sc = jd.get("scores", {}) if isinstance(jd, dict) else {}
            rv = [float(sc.get(r, 0.5)) for r in ai["rubrics"] if r in sc]
            w = 1.5 if (ai["ticos"] and t.ticos_type in ai["ticos"]) else 1.0
            if rv:
                vals.append(np.mean(rv) * w)
        ax[an] = round(min(np.mean(vals) * 100, 100), 2) if vals else 0.0
    return ax
def compute_final_score(results, tasks):
    """Combine domain averages and axis scores into the final benchmark score.

    Returns:
        ``(final, base, har_p, axis, domain_averages)`` where ``base`` is the
        grade-weighted mean of domain averages, ``har_p`` is the harmonic
        mean of the axis scores divided by 100 (axis values floored at 0.01),
        and ``final = base * har_p``.
    """
    task_by_id = {t.task_id: t for t in tasks}

    # Collect valid (non-negative) scores per domain.
    domain_scores = {}
    for task_id, entry in results.items():
        if entry["score"] < 0:
            continue
        task = task_by_id.get(task_id)
        if task:
            domain_scores.setdefault(task.domain, []).append(entry["score"])
    domain_avg = {dom: np.mean(vals) for dom, vals in domain_scores.items() if vals}

    # Domains present under each grade (by task definition, not by result).
    grade_domains = {}
    for task in tasks:
        grade_domains.setdefault(task.grade, set()).add(task.domain)

    weighted_sum, weight_total = 0, 0
    for grade, domains in grade_domains.items():
        weight = GRADE_WEIGHT.get(grade, 1.0)
        for dom in domains:
            if dom in domain_avg:
                weighted_sum += domain_avg[dom] * weight
                weight_total += weight
    base = weighted_sum / weight_total if weight_total > 0 else 0

    axis = compute_axis_scores(results, tasks)
    floored = [max(value, 0.01) for value in axis.values()]
    if floored:
        harmonic = len(floored) / sum(1.0 / value for value in floored)
    else:
        harmonic = 50
    har_p = harmonic / 100.0
    return round(base * har_p, 2), round(base, 2), round(har_p, 3), axis, domain_avg
def determine_agi_stage(score, axis):
    """Pick the AGI stage entry for ``score``.

    The highest stage whose ``min`` is not above ``score`` is chosen, but
    stages 4 and 5 are only granted when every axis is at least 60;
    otherwise stage 3 is returned.  A score below every ``min`` yields
    stage 1.
    """
    axes_ok = bool(axis) and all(value >= 60 for value in axis.values())
    matched = next((stage for stage in reversed(AGI_STAGES) if score >= stage["min"]), None)
    if matched is None:
        return AGI_STAGES[0]
    if matched["stage"] >= 4 and not axes_ok:
        return AGI_STAGES[2]
    return matched
| # βββ Β§7. Checkpoint DB βββ | |
| DB_PATH="final_bench_eval.db" | |
| def _init_db(): | |
| c=sqlite3.connect(DB_PATH);c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))");c.commit();c.close() | |
| def _make_run_id(m):return hashlib.md5(f"FINALv42_BL_{m}".encode()).hexdigest()[:12] | |
| def _save_result(rid,tid,resp,jresp,sc): | |
| c=sqlite3.connect(DB_PATH);c.execute("INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",(rid,tid,resp,jresp,sc,time.time()));c.commit();c.close() | |
| def _load_all(rid): | |
| c=sqlite3.connect(DB_PATH);cur=c.execute("SELECT task_id,model_response,judge_response,weighted_score FROM eval_results WHERE run_id=?",(rid,));rows=cur.fetchall();c.close() | |
| result={} | |
| for r in rows: | |
| resp=r[1] or "";score=r[3] | |
| if score<=0 and(resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED") or resp=="[EMPTY]" or resp.startswith("[ERROR")):continue | |
| result[r[0]]={"response":resp,"judge":r[2],"score":score} | |
| return result | |
| def _clear_run(rid): | |
| c=sqlite3.connect(DB_PATH);c.execute("DELETE FROM eval_results WHERE run_id=?",(rid,));c.commit();c.close() | |
| _init_db() | |
| # βββ Β§8. CSV Export βββ | |
def generate_csv(results, tasks, model_name, judge_name, mode="BASELINE"):
    """Render the evaluation results as CSV text.

    One row is written per result whose task id is known, ordered by task id.
    A negative score is exported as ``-1`` with the comment prefixed by
    ``JUDGE_FAILED:``.  The judge comment is cut to 200 characters and the
    response preview to 300 characters with newlines replaced by spaces.

    Args:
        results: task_id -> {"score", "judge" (JSON string or dict), "response"}.
        tasks: iterable of task objects.
        model_name / judge_name / mode: copied verbatim into every row.

    Returns:
        The CSV document as a string (header row included).
    """
    out = io.StringIO()
    w = csv.writer(out)
    w.writerow(["task_id", "domain", "grade", "ticos_type", "difficulty", "title",
                "eval_model", "judge_model", "mode", "weighted_score",
                "process_quality", "metacognitive_accuracy", "error_recovery",
                "integration_depth", "final_correctness", "judge_comment",
                "response_preview", "timestamp"])
    tm = {t.task_id: t for t in tasks}
    rubric_cols = ("process_quality", "metacognitive_accuracy", "error_recovery",
                   "integration_depth", "final_correctness")
    for tid, d in sorted(results.items()):
        t = tm.get(tid)
        if not t:
            continue
        raw = d.get("judge")
        try:
            jd = json.loads(raw) if isinstance(raw, str) else (raw or {})
        except ValueError:
            # Unparseable judge payload: export the row without rubric values.
            jd = {}
        if not isinstance(jd, dict):
            jd = {}
        sc = jd.get("scores", {})
        if not isinstance(sc, dict):
            sc = {}
        # str() guards against a non-string comment breaking the slice.
        cm = str(jd.get("comment", "") or "")[:200]
        s = d["score"]
        if s < 0:
            s = -1
            cm = f"JUDGE_FAILED:{cm}"
        preview = (d.get("response", "") or "")[:300].replace("\n", " ")
        w.writerow([tid, t.domain, t.grade, t.ticos_type, t.difficulty, t.title,
                    model_name, judge_name, mode, s,
                    *[sc.get(col, "") for col in rubric_cols],
                    cm, preview, datetime.now().isoformat()])
    return out.getvalue()
| # βββ Β§9. HTML Builders βββ | |
# Inline stylesheet prepended to the HTML fragments produced by the builders
# below (results table, score bars, summary card, axis bars, stage badge and
# progress bar).
CSS="""<style>
.eval-table{width:100%;border-collapse:collapse;font-size:0.82em}
.eval-table th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc;font-size:0.9em}
.eval-table td{padding:5px 8px;border-bottom:1px solid #eee}
.score-bar{background:#e0e0e0;border-radius:8px;height:16px;overflow:hidden;min-width:70px}
.score-fill{height:100%;border-radius:8px;transition:width .4s}
.summary-card{background:linear-gradient(135deg,#0a0a1a,#1a1a3e);border-radius:16px;padding:24px;color:#fff;margin:8px 0}
.axis-row{display:flex;align-items:center;gap:10px;margin:5px 0}
.axis-bar{flex:1;background:#333;border-radius:6px;height:14px;overflow:hidden}
.axis-fill{height:100%;border-radius:6px}
.stage-badge{display:inline-block;padding:6px 16px;border-radius:20px;font-weight:700;font-size:1.1em;margin:8px 0}
.progress-bar{background:#e0e0e0;border-radius:8px;height:22px;margin:12px 0;overflow:hidden}
.progress-fill{height:100%;border-radius:8px;transition:width .4s;background:linear-gradient(90deg,#1565c0,#00c853)}
</style>"""
| def _sc(s): | |
| if s>=80:return "#4caf50" | |
| if s>=60:return "#ff9800" | |
| if s>=40:return "#ff5722" | |
| return "#f44336" | |
def _build_progress_table(results,tasks):
    """Build the HTML table listing every task and its current status.

    Row variants, chosen per task:
      * score < 0                         -> judge-failed row;
      * score == 0 and an error sentinel  -> error row with the first 60
        characters of the (HTML-escaped) response;
      * any other result                  -> score bar plus numeric value;
      * task not in ``results``           -> dimmed pending row.

    Returns the CSS block followed by the table markup.
    """
    rows=""
    for t in tasks:
        info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
        # Grade badge: one background colour for A, one for B, a third otherwise.
        gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
        if t.task_id in results:
            d=results[t.task_id];s=d["score"];resp=d.get("response","")
            if s<0:rows+=f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">β JF</td><td>β</td></tr>'
            elif s==0 and resp and(resp.startswith("[API_ERROR") or resp.startswith("[BLOCKED") or resp=="[EMPTY]"):
                # NOTE(review): "[ERROR" responses are not matched here, so they
                # render through the normal score branch below.
                err_short=html.escape(resp[:60])
                rows+=f'<tr style="background:#ffebee"><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td colspan="2" style="color:#c62828;font-size:0.75em">π« {err_short}</td></tr>'
            else:
                c=_sc(s);rows+=f'<tr><td>{t.task_id}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
        else:rows+=f'<tr style="opacity:0.35"><td>{t.task_id}</td><td>{info["icon"]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td>β³</td><td>β</td></tr>'
    return f'{CSS}<table class="eval-table"><thead><tr><th>ID</th><th>Domain</th><th>G</th><th>TICOS</th><th>Diff</th><th>Score</th><th>Val</th></tr></thead><tbody>{rows}</tbody></table>'
def _build_summary_card(results,tasks,eval_label,judge_label,hf_status):
    """Build the HTML summary card for a run.

    The card shows the stage badge, the final score with its base and HAR
    factors, the five axis bars, per-grade averages, the gap between the mean
    ``metacognitive_accuracy`` and mean ``error_recovery`` scores, three
    pass/fail checks, and ``hf_status`` as a footer line.

    Returns the CSS block followed by the card markup.
    """
    final,base,har_p,axis,dom_avgs=compute_final_score(results,tasks)
    stage=determine_agi_stage(final,axis)
    labels={"generalization":"π Generalization","reasoning":"π§ Reasoning","planning":"π Planning","reliability":"π― Reliability","safety":"π‘οΈ Safety"}
    # One labelled bar per axis.
    ax_html=""
    for an,av in axis.items():
        c=_sc(av);ax_html+=f'<div class="axis-row"><span style="width:120px;font-size:0.85em">{labels.get(an,an)}</span><div class="axis-bar"><div class="axis-fill" style="width:{min(av,100)}%;background:{c}"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c}">{av:.1f}</span></div>'
    # Per-grade mean of the domain averages for domains that have tasks of that grade.
    gh=""
    for g in["A","B","C"]:
        gd=[t.domain for t in tasks if t.grade==g];gs=[dom_avgs[d] for d in set(gd) if d in dom_avgs]
        if gs:a=np.mean(gs);gh+=f'<span style="margin-right:14px">{g}Γ{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
    done=sum(1 for t in tasks if t.task_id in results)
    # jf: results with a negative score (judge failed).
    jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
    # api_errs: score exactly 0 and a response starting with "[".
    api_errs=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]==0 and(results[t.task_id].get("response","") or "").startswith("["))
    ma_vals,er_vals=[],[]
    for tid,d in results.items():
        if d["score"]<0:continue
        try:
            jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else d["judge"];sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
            if "metacognitive_accuracy" in sc:ma_vals.append(float(sc["metacognitive_accuracy"]))
            if "error_recovery" in sc:er_vals.append(float(sc["error_recovery"]))
        except:pass
    avg_ma=np.mean(ma_vals) if ma_vals else 0;avg_er=np.mean(er_vals) if er_vals else 0
    # Gap colour/label thresholds: > 0.2, > 0.1, otherwise balanced.
    gap=avg_ma-avg_er;gc="#f44336" if gap>0.2 else "#ff9800" if gap>0.1 else "#4caf50"
    gl="Declaration-Action Gap" if gap>0.2 else "Moderate Gap" if gap>0.1 else "Balanced"
    # aa: mean domain average over the domains that have grade-A tasks.
    ad=[t.domain for t in tasks if t.grade=="A"];asc_vals=[dom_avgs[d] for d in set(ad) if d in dom_avgs];aa=np.mean(asc_vals) if asc_vals else 0
    checks=[("Scoreβ₯80",final>=80),("Axesβ₯60",all(v>=60 for v in axis.values())),(f"A-avgβ₯75({aa:.0f})",aa>=75)]
    ch="".join([f'<span style="margin-right:8px">{"β " if ok else "β"}{lb}</span>' for lb,ok in checks])
    err_html=f'<div style="color:#ff5722;font-size:0.82em;margin-top:4px">β οΈ API Errors: {api_errs} tasks</div>' if api_errs else ""
    return f"""{CSS}<div class="summary-card"><div style="text-align:center"><div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div><h2 style="margin:6px 0;font-size:1.6em">π€ Baseline FINAL: {final:.1f}</h2><p style="color:#aaa;font-size:0.85em">{stage['label']} Β· Base {base:.1f} Γ HAR {har_p:.3f} Β· {done}/{len(tasks)}{f" Β· JF={jf}" if jf else ""}</p><p style="color:#8af;font-size:0.82em;margin:4px 0">Eval: {eval_label} Β· Judge: {judge_label}</p>{err_html}</div><hr style="border-color:#333;margin:12px 0"><h4 style="color:#aaa;margin:6px 0">π― 5-Axis Scores</h4>{ax_html}<hr style="border-color:#333;margin:10px 0"><div style="font-size:0.88em">{gh}</div><div style="display:flex;align-items:center;gap:12px;margin:8px 0;padding:8px;background:rgba(255,255,255,0.05);border-radius:8px"><span style="font-size:0.85em">MA-ER Gap:</span><span style="font-weight:700;color:{gc}">{gap:.3f}</span><span style="font-size:0.8em;color:{gc}">({gl})</span><span style="font-size:0.78em;color:#888">MA={avg_ma:.3f} ER={avg_er:.3f}</span></div><div style="font-size:0.82em;margin-top:6px">{ch}</div><p style="font-size:0.78em;color:#666;margin-top:8px">{hf_status}</p><div style="background:rgba(233,69,96,0.15);border:1px solid #e94560;border-radius:8px;padding:10px;margin-top:12px"><p style="font-size:0.82em;color:#e94560;margin:0">π <b>MetaCog (Self-Correction) evaluation: COMING SOON</b></p></div></div>"""
def _build_detail_view(results,tasks):
    """Build one collapsible ``<details>`` element per evaluated task.

    Each element shows the task title, TICOS type, the judge's rubric scores
    and comment (cut to 200 characters) and the first 500 characters of the
    response; comment and response are HTML-escaped.  Tasks without a result
    are skipped.

    Returns the CSS block followed by the concatenated elements.
    """
    items=""
    for t in tasks:
        if t.task_id not in results:continue
        d=results[t.task_id];info=DOMAIN_INFO.get(t.domain,{"icon":"?"});s=d["score"];resp=html.escape((d.get("response","") or "")[:500])
        jc="";ss=""
        try:
            # ss lists each rubric as "<first word of its name>=<value>".
            jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else(d["judge"] or {});jc=html.escape((jd.get("comment","") if isinstance(jd,dict) else "")[:200]);sc=jd.get("scores",{}) if isinstance(jd,dict) else {};ss=" Β· ".join([f"{k.split('_')[0]}={v}" for k,v in sc.items()])
        except:pass
        # Negative score = judge failed: shown as "JF" instead of a number.
        c=_sc(s) if s>=0 else "#ff9800";badge=f'{s:.1f}' if s>=0 else "JF"
        items+=f'<details style="margin:3px 0;border:1px solid #ddd;border-radius:8px;padding:8px"><summary style="cursor:pointer;font-weight:600">{info["icon"]} {t.task_id} [{t.grade}] β <span style="color:{c}">{badge}</span></summary><div style="font-size:0.8em;margin-top:6px"><b>{t.title}</b><br>TICOS: {t.ticos_type} | Scores: {ss}<br>Judge: {jc}<br>Response: {resp}...</div></details>'
    return CSS+items
| # βββ Β§10. Evaluation Engine βββ | |
def _eval_single(task, run_id, eval_api_key, eval_model_id, eval_provider,
                 judge_api_key, judge_model_id, judge_provider, state):
    """Evaluate one task: query the model, judge the reply, checkpoint it.

    Args:
        task: task object (``task_id``, ``prompt``, ``ticos_type``, ``domain``...).
        run_id: checkpoint key passed to ``_save_result``.
        eval_* / judge_*: credentials, model id and provider for each role.
        state: shared progress dict; ``done``, ``errors``, ``active``,
            ``parse_ok`` and ``parse_fail`` are updated under ``state["lock"]``.

    Returns:
        ``(task_id, {"response", "judge", "score"})``.  The score is 0 when
        the model call failed or an exception occurred, -1.0 when the judge
        reply could not be parsed, otherwise the weighted task score.
    """
    error_prefixes = ("[API_ERROR", "[BLOCKED")
    try:
        sys_p=(f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
               f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. If unsure, say so honestly.")
        print(f" βΆ {task.task_id} β {eval_provider}/{eval_model_id}")
        model_response = call_model(task.prompt, system=sys_p, api_key=eval_api_key,
                                    model_id=eval_model_id, provider=eval_provider,
                                    max_tokens=12288)
        if model_response is None:
            # Treat a missing reply like an empty one instead of crashing below.
            model_response = "[EMPTY]"
        if model_response.startswith(error_prefixes) or model_response == "[EMPTY]":
            print(f" β {task.task_id}: {model_response[:100]}")
            _save_result(run_id, task.task_id, model_response, "{}", 0)
            with state["lock"]:
                state["done"] += 1
                state["errors"].append(f"{task.task_id}: {model_response[:80]}")
            return task.task_id, {"response": model_response, "judge": "{}", "score": 0}
        print(f" β {task.task_id} len={len(model_response)}")

        jp = build_judge_prompt(task, model_response)
        jd = call_judge(jp, system=JUDGE_SYSTEM, api_key=judge_api_key,
                        model_id=judge_model_id, provider=judge_provider)
        if jd is None:
            jd = {"scores": {k: 0.0 for k in RUBRIC}, "comment": "JUDGE_PARSE_FAILED", "failed": True}
        failed = bool(jd.get("failed"))
        if failed:
            ws = -1.0
            jd["comment"] = f"JF:{jd.get('comment','')}"
        else:
            ws = compute_task_score(jd["scores"])
        # Count successes and failures separately; previously parse_ok was
        # incremented in both cases and parse_fail never changed.
        counter = "parse_fail" if failed else "parse_ok"
        with state["lock"]:
            state[counter] = state.get(counter, 0) + 1

        jj = json.dumps(jd, ensure_ascii=False)
        _save_result(run_id, task.task_id, model_response, jj, ws)
        with state["lock"]:
            state["done"] += 1
            info = DOMAIN_INFO.get(task.domain, {"icon": "?"})
            state["active"].append(f'{info["icon"]} {task.task_id}')
            # Keep only the ten most recent entries.
            if len(state["active"]) > 10:
                state["active"] = state["active"][-10:]
        return task.task_id, {"response": model_response, "judge": jj, "score": ws}
    except Exception as e:
        # Worker boundary: record the failure so one task cannot abort the run.
        print(f" β {task.task_id} EXCEPTION: {e}")
        with state["lock"]:
            state["done"] += 1
            state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
        _save_result(run_id, task.task_id, f"[ERROR] {e}", "{}", 0)
        return task.task_id, {"response": f"[ERROR] {e}", "judge": "{}", "score": 0}
| # βββ Β§11. State Machine βββ | |
| _EVAL_STATE={"running":False,"stop_requested":False,"finished":False,"run_id":"","eval_label":"","judge_label":"","done":0,"total":0,"cached":0,"errors":[],"active":[],"parse_ok":0,"parse_fail":0,"start_time":0,"results":{},"tasks":[],"grade_done":{},"grade_total":{},"lock":threading.Lock(),"message":"","csv_path":None,"hf_status":"","n_workers":5} | |
| def _reset(): | |
| with _EVAL_STATE["lock"]:_EVAL_STATE.update({"running":False,"stop_requested":False,"finished":False,"done":0,"cached":0,"errors":[],"active":[],"parse_ok":0,"parse_fail":0,"start_time":0,"results":{},"tasks":[],"grade_done":{},"grade_total":{},"message":"","csv_path":None,"hf_status":""}) | |
def _prog_html(state,pending):
    """Render the live progress panel as an HTML fragment.

    Args:
        state: progress dict; reads ``done``, ``grade_total``, ``grade_done``
            and, when present, ``active`` and ``errors``.
        pending: number of tasks the percentage is measured against (a value
            of 0 is treated as 1 to avoid division by zero).

    The panel contains the overall bar, one bar per grade that has pending
    tasks, the last 8 ``active`` entries and the last 6 errors (each
    HTML-escaped and cut to 100 characters).
    """
    done=state["done"];pct=min(int(done/max(pending,1)*100),100);gb=""
    for g in["A","B","C"]:
        gt=state["grade_total"].get(g,0);gd=state["grade_done"].get(g,0)
        if gt==0:continue
        # Bar colour: one for 100%, one for partial progress, one for 0%.
        gp=min(int(gd/gt*100),100);c="#4caf50" if gp==100 else("#1976d2" if gp>0 else "#e0e0e0")
        emoji="π °οΈ" if g=="A" else "π ±οΈ" if g=="B" else "π ΎοΈ"
        gb+=f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:100px;font-size:0.85em">{emoji} {g}Γ{GRADE_WEIGHT[g]}</span><div style="flex:1;background:#e0e0e0;border-radius:6px;height:14px;overflow:hidden"><div style="width:{gp}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:55px;font-size:0.82em;text-align:right;color:{c}">{gd}/{gt}</span></div>'
    o=f'<div style="margin:8px 0"><div style="display:flex;justify-content:space-between;font-size:0.95em;margin-bottom:6px"><span>β‘ <b>π€ Baseline</b> β {done}/{pending}</span><span style="font-weight:700">{pct}%</span></div><div class="progress-bar"><div class="progress-fill" style="width:{pct}%"></div></div>{gb}'
    ac=state.get("active",[])
    if ac:o+='<div style="margin-top:8px">π '+" ".join([f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em">{a}</span>' for a in ac[-8:]])+'</div>'
    er=state.get("errors",[])
    if er:
        o+='<div style="color:#c62828;margin-top:6px;font-size:0.8em;max-height:120px;overflow-y:auto">'
        for e in er[-6:]:o+=f'<div>β οΈ {html.escape(e[:100])}</div>'
        o+='</div>'
    return o+'</div>'
def _bg_eval(eval_api_key,eval_model_id,eval_provider,eval_label,judge_api_key,judge_model_id,judge_provider,judge_label,tasks,run_id,n_workers):
    """Run one evaluation in a background thread and publish progress to ``_EVAL_STATE``.

    Results already stored for ``run_id`` (via ``_load_all``) are treated as
    cached; only the remaining tasks are submitted to a thread pool of
    ``n_workers`` workers running ``_eval_single``. When the pool is done (or
    a stop was requested) the final score, AGI stage and CSV are computed
    from whatever results are available and the run is marked finished.

    Any exception escaping the body is printed with a traceback and reported
    through ``_EVAL_STATE["message"]``; the function itself never raises.
    """
    lock = _EVAL_STATE["lock"]

    def _collect(fut, task):
        """Fold one finished future into the shared state without raising."""
        if fut.cancelled():
            # cancelled by a stop request before it started: nothing to record
            return
        try:
            tid, data = fut.result()
        except Exception as exc:
            # Previously swallowed silently; surface it in the error list
            # that the progress panel renders.
            with lock:
                _EVAL_STATE["errors"].append(f"{task.task_id}: {exc}")
            return
        with lock:
            _EVAL_STATE["results"][tid] = data
            _EVAL_STATE["grade_done"][task.grade] = _EVAL_STATE["grade_done"].get(task.grade, 0) + 1

    try:
        with lock:
            _EVAL_STATE["start_time"] = time.time()
            _EVAL_STATE["message"] = f"β‘ Eval: {eval_label} Β· Judge: {judge_label} Β· {len(tasks)} tasks"

        results = dict(_load_all(run_id))
        cached = sum(1 for t in tasks if t.task_id in results)
        pending = [t for t in tasks if t.task_id not in results]
        print(f" π Cached: {cached} / Pending: {len(pending)} / Total: {len(tasks)}")

        by_grade = {}
        for t in pending:
            by_grade.setdefault(t.grade, []).append(t)

        with lock:
            _EVAL_STATE["results"] = results
            _EVAL_STATE["cached"] = cached
            # NOTE: "total" holds the number of *pending* tasks, not all tasks
            _EVAL_STATE["total"] = len(pending)
            _EVAL_STATE["grade_total"] = {g: len(ts) for g, ts in by_grade.items()}
            _EVAL_STATE["grade_done"] = {g: 0 for g in by_grade}
            _EVAL_STATE["done"] = 0
            _EVAL_STATE["errors"] = []
            _EVAL_STATE["active"] = []

        if pending:
            futs = {}
            done_set = set()
            with ThreadPoolExecutor(max_workers=n_workers) as ex:
                for t in pending:
                    if _EVAL_STATE["stop_requested"]:
                        break
                    fut = ex.submit(_eval_single, t, run_id, eval_api_key, eval_model_id, eval_provider,
                                    judge_api_key, judge_model_id, judge_provider, _EVAL_STATE)
                    futs[fut] = t
                # Poll instead of blocking so a stop request is noticed promptly.
                while len(done_set) < len(futs):
                    if _EVAL_STATE["stop_requested"]:
                        ex.shutdown(wait=False, cancel_futures=True)
                        break
                    for fut, t in futs.items():
                        if fut in done_set or not fut.done():
                            continue
                        done_set.add(fut)
                        _collect(fut, t)
                    time.sleep(0.5)
            # Leaving the ``with`` block waits for in-flight tasks. After a
            # stop those would otherwise be missing from the score and CSV.
            for fut, t in futs.items():
                if fut not in done_set and fut.done():
                    done_set.add(fut)
                    _collect(fut, t)

        with lock:
            results = dict(_EVAL_STATE["results"])

        final, _base, _har, axis, _ = compute_final_score(results, tasks)
        stage = determine_agi_stage(final, axis)
        csv_str = generate_csv(results, tasks, eval_label, judge_label, "BASELINE")
        cp = f"/tmp/final_{run_id}.csv"
        with open(cp, "w", encoding="utf-8") as f:
            f.write(csv_str)

        elapsed = int(time.time() - _EVAL_STATE["start_time"])
        with lock:
            _EVAL_STATE["csv_path"] = cp
            _EVAL_STATE["hf_status"] = ""
            _EVAL_STATE["message"] = f"π {stage['name']} β FINAL={final:.1f} Β· {elapsed}s"
            _EVAL_STATE["running"] = False
            _EVAL_STATE["finished"] = True
    except Exception as e:
        # Top-level boundary of the worker thread: report and mark finished
        # so the UI does not stay stuck in the "running" state.
        print(f" β Fatal: {e}")
        import traceback
        traceback.print_exc()
        with lock:
            _EVAL_STATE["message"] = f"β Fatal: {str(e)[:100]}"
            _EVAL_STATE["running"] = False
            _EVAL_STATE["finished"] = True
def _start_eval(eval_api_key,judge_api_key,eval_model_choice,judge_model_choice,grade_f,diff_f,max_t,n_w,fresh):
    """Validate the UI inputs and launch ``_bg_eval`` on a daemon thread.

    Args:
        eval_api_key, judge_api_key: API keys; surrounding whitespace is
            stripped and ``None`` is treated as empty.
        eval_model_choice, judge_model_choice: dropdown labels, resolved to
            ``(model_id, provider)`` by ``_resolve_model``.
        grade_f, diff_f: filter values; ``"All"`` disables the filter.
        max_t: upper bound on the number of tasks (converted with ``int``).
        n_w: worker count (converted with ``int``).
        fresh: when true, stored results for the run id are cleared first.

    Returns:
        A status string for the UI. Nothing is started when a run is already
        active, a key is missing, or the filters leave no tasks.
    """
    if _EVAL_STATE["running"]:
        return "β οΈ Already running"

    eval_api_key = (eval_api_key or "").strip()
    judge_api_key = (judge_api_key or "").strip()
    eval_model_id, eval_provider = _resolve_model(eval_model_choice)
    judge_model_id, judge_provider = _resolve_model(judge_model_choice)
    if not eval_api_key:
        return f"β {eval_provider} API Key required for Eval model"
    if not judge_api_key:
        return f"β {judge_provider} API Key required for Judge model"

    tasks = ALL_TASKS[:]
    if grade_f != "All":
        tasks = [t for t in tasks if t.grade == grade_f]
    if diff_f != "All":
        tasks = [t for t in tasks if t.difficulty == diff_f]
    tasks = tasks[:int(max_t)]
    if not tasks:
        # Do not launch (or clear stored results for) a run that has nothing
        # to evaluate; tell the user why instead.
        return "No tasks match the selected Grade/Difficulty filters"

    n_workers = int(n_w)
    rid = _make_run_id(eval_model_id)
    if fresh:
        _clear_run(rid)
    _reset()
    with _EVAL_STATE["lock"]:
        _EVAL_STATE.update({
            "running": True,
            "run_id": rid,
            "eval_label": eval_model_choice,
            "judge_label": judge_model_choice,
            "tasks": tasks,
            "total": len(tasks),
            "n_workers": n_workers,
        })
    threading.Thread(
        target=_bg_eval,
        daemon=True,
        args=(eval_api_key, eval_model_id, eval_provider, eval_model_choice,
              judge_api_key, judge_model_id, judge_provider, judge_model_choice,
              tasks, rid, n_workers),
    ).start()
    return f"β‘ Started β Eval: {eval_model_choice} Β· Judge: {judge_model_choice} ({len(tasks)} tasks)"
def _stop():
    """Ask the running evaluation to stop; returns a status string for the UI.

    Only sets the "stop_requested" flag - the background worker is the one
    that reads it. When nothing is running the state is left untouched.
    """
    if not _EVAL_STATE["running"]:
        return "βΉοΈ Not running"
    _EVAL_STATE["stop_requested"] = True
    return "βΉοΈ Stopping..."
def _poll():
    """Timer callback: snapshot ``_EVAL_STATE`` and build the five UI outputs.

    Returns:
        ``(progress_html, results_table_html, summary_html, detail_html,
        csv_path)``. The summary, detail view and CSV path are only filled in
        once the run has finished and there are tasks; otherwise they are
        ``""``, ``""`` and ``None``.
    """
    with _EVAL_STATE["lock"]:
        running = _EVAL_STATE["running"]
        finished = _EVAL_STATE["finished"]
        tasks = _EVAL_STATE.get("tasks", [])
        results = dict(_EVAL_STATE.get("results", {}))
        msg = _EVAL_STATE.get("message", "")
        cp = _EVAL_STATE.get("csv_path")
        total = _EVAL_STATE.get("total", 0)
        el = _EVAL_STATE.get("eval_label", "?")
        jl = _EVAL_STATE.get("judge_label", "?")
        hf_st = _EVAL_STATE.get("hf_status", "")

    if not running and not finished and not results:
        return ("βΉοΈ Configure API keys, select models, then press βΆοΈ Start", "", "", "", None)

    if running:
        # "total" is set by the worker to the number of pending (non-cached)
        # tasks, so it is already the right denominator. Subtracting "cached"
        # again under-counted it and could even make it negative.
        ph = CSS + _prog_html(_EVAL_STATE, total)
    elif finished:
        # The message may carry exception text; escape it before embedding.
        ph = f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;font-weight:600">{html.escape(msg)}</div>'
    else:
        ph = html.escape(msg)

    th = _build_progress_table(results, tasks) if tasks else ""
    sh, dh, co = "", "", None
    if finished and tasks:
        sh = _build_summary_card(results, tasks, el, jl, hf_st)
        dh = _build_detail_view(results, tasks)
        co = cp
    return (ph, th, sh, dh, co)
# ─── §12. Gradio App ───
# Static HTML banner placed at the top of the page by create_app() through
# gr.HTML(HEADER): title, subtitle, one pill per supported provider, a
# "coming soon" notice, and links to the dataset and leaderboard.
HEADER="""<div style="text-align:center;padding:16px 0">
<h1 style="margin:0;font-size:1.8em">π FINAL Bench v4.2 β Baseline Evaluation</h1>
<h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
<p style="color:#888;font-size:0.88em;max-width:720px;margin:8px auto"><b>100 Tasks Β· 15 Domains Β· 8 TICOS Β· 5-Axis Β· 5-Stage AGI Grade</b><br>
π€ Baseline (Non-AGI) β Single LLM Evaluation Β· Multi-Provider<br>Both <b>Eval</b> and <b>Judge</b> support OpenAI / Anthropic / Google</p>
<div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
<span style="background:#e3f2fd;padding:2px 10px;border-radius:12px">OpenAI Β· GPT-5.2 / 5-Mini / 4.1 / o4-mini / 4o</span>
<span style="background:#fce4ec;padding:2px 10px;border-radius:12px">Anthropic Β· Opus 4.6 / Sonnet 4.5 / Haiku 4.5</span>
<span style="background:#e8f5e9;padding:2px 10px;border-radius:12px">Google Β· Gemini 3 Pro Preview</span></div>
<div style="background:rgba(233,69,96,0.1);border:1px solid #e94560;border-radius:10px;padding:10px;margin:12px auto;max-width:600px">
<p style="color:#e94560;font-size:0.85em;margin:0">π <b>MetaCog (Self-Correction Protocol): COMING SOON</b></p></div>
<div style="display:flex;justify-content:center;gap:8px;margin-top:8px;font-size:0.78em">
<a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">π Dataset</a>
<a href="https://huggingface.co/spaces/FINAL-Bench/Leaderboard" target="_blank" style="background:#333;color:#fff;padding:3px 10px;border-radius:10px;text-decoration:none">π Leaderboard</a></div></div>"""
def create_app():
    """Assemble the Gradio Blocks UI and wire its callbacks; returns the app.

    Layout, top to bottom: banner, API-key row, model-selection row, settings
    row, start/fresh/stop buttons, status box, result tabs and a footer. A
    2-second timer refreshes the tabs through ``_poll``.
    """
    page_css = ".gradio-container{max-width:1100px !important} header{display:none!important}"
    key_placeholder = "sk-... / sk-ant-... / AIza..."
    key_info = "OpenAI / Anthropic / Google key"

    with gr.Blocks(title="FINAL Bench v4.2", css=page_css) as app:
        gr.HTML(HEADER)

        gr.Markdown("### π API Keys")
        with gr.Row():
            eval_key_box = gr.Textbox(
                label="π€ Eval Model API Key",
                type="password",
                placeholder=key_placeholder,
                info=key_info,
                scale=3,
            )
            judge_key_box = gr.Textbox(
                label="βοΈ Judge Model API Key",
                type="password",
                placeholder=key_placeholder,
                info=key_info,
                scale=3,
            )

        gr.Markdown("### π€ Model Selection")
        with gr.Row():
            eval_model_dd = gr.Dropdown(
                label="π€ Evaluation Target",
                choices=MODEL_CHOICES,
                value=DEFAULT_EVAL,
                scale=3,
            )
            judge_model_dd = gr.Dropdown(
                label="βοΈ Judge Model",
                choices=MODEL_CHOICES,
                value=DEFAULT_JUDGE,
                scale=3,
            )

        gr.Markdown("### βοΈ Settings")
        with gr.Row():
            grade_dd = gr.Dropdown(["All", "A", "B", "C"], value="All", label="Grade Filter", scale=1)
            difficulty_dd = gr.Dropdown(["All", "expert", "frontier"], value="All", label="Difficulty", scale=1)
            max_tasks_slider = gr.Slider(1, 100, value=100, step=1, label="Max Tasks", scale=1)
            workers_slider = gr.Slider(1, 10, value=5, step=1, label="Workers", scale=1)

        with gr.Row():
            resume_btn = gr.Button("βΆοΈ Start (Resume)", variant="primary", size="lg", scale=2)
            fresh_btn = gr.Button("π Fresh Start", variant="secondary", size="lg", scale=2)
            stop_btn = gr.Button("βΉοΈ Stop", variant="stop", size="lg", scale=1)

        status_box = gr.Textbox(label="Status", interactive=False, max_lines=2)

        with gr.Tabs():
            with gr.Tab("π Progress"):
                progress_view = gr.HTML()
            with gr.Tab("π Results"):
                table_view = gr.HTML()
            with gr.Tab("π FINAL Score"):
                summary_view = gr.HTML()
            with gr.Tab("π Details"):
                detail_view = gr.HTML()
            with gr.Tab("πΎ CSV"):
                csv_download = gr.File(label="CSV")

        # Periodic refresh of every tab from the shared evaluation state.
        refresh_timer = gr.Timer(value=2, active=True)
        refresh_timer.tick(
            fn=_poll,
            outputs=[progress_view, table_view, summary_view, detail_view, csv_download],
        )

        # Order must match the positional parameters of _start_eval.
        start_inputs = [
            eval_key_box,
            judge_key_box,
            eval_model_dd,
            judge_model_dd,
            grade_dd,
            difficulty_dd,
            max_tasks_slider,
            workers_slider,
        ]
        resume_btn.click(fn=lambda *a: _start_eval(*a, fresh=False), inputs=start_inputs, outputs=[status_box])
        fresh_btn.click(fn=lambda *a: _start_eval(*a, fresh=True), inputs=start_inputs, outputs=[status_box])
        stop_btn.click(fn=_stop, outputs=[status_box])

        gr.Markdown("---\n<center><b>FINAL Bench v4.2</b> Β· Baseline Β· OpenAI / Anthropic / Google Β· Apache 2.0 Β· <b>Ginigen AI</b></center>")
    return app
if __name__ == "__main__":
    # Tally tasks per grade and per domain for the startup banner.
    grade_counts = {}
    domain_counts = {}
    for task in ALL_TASKS:
        grade_counts[task.grade] = grade_counts.get(task.grade, 0) + 1
        domain_counts[task.domain] = domain_counts.get(task.domain, 0) + 1

    print(f"\n{'='*60}\n FINAL Bench v4.2 β Baseline (Non-AGI)\n Eval & Judge: OpenAI / Anthropic / Google\n{'='*60}")
    print(f" {len(ALL_TASKS)} tasks | {len(domain_counts)} domains")
    for grade in ("A", "B", "C"):
        print(f" Grade {grade} (Γ{GRADE_WEIGHT[grade]}): {grade_counts.get(grade,0)}")
    print(f" π MetaCog: COMING SOON\n{'='*60}\n")

    # Build the UI, enable the request queue, then serve on all interfaces.
    app = create_app()
    app.queue(default_concurrency_limit=2)
    app.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)