Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,445 +1,723 @@
|
|
| 1 |
"""
|
| 2 |
-
FINAL Bench Auto-Evaluator v1.
|
| 3 |
-
================================
|
| 4 |
-
FINAL-Bench/Metacognitive 100
|
| 5 |
-
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
Author: Ginigen AI Β· FINAL-Bench Β· License: Apache 2.0
|
| 10 |
"""
|
| 11 |
-
import json,os,time,
|
| 12 |
from datetime import datetime
|
| 13 |
from dataclasses import dataclass
|
| 14 |
from typing import Optional
|
| 15 |
import requests, numpy as np, gradio as gr
|
| 16 |
|
|
|
|
|
|
|
| 17 |
@dataclass
|
| 18 |
-
class
|
| 19 |
-
task_id:str; domain:str; grade:str; ticos_type:
|
| 20 |
-
|
| 21 |
-
hidden_trap:Optional[str]
|
|
|
|
| 22 |
|
| 23 |
def load_tasks():
|
| 24 |
try:
|
| 25 |
from datasets import load_dataset
|
| 26 |
-
ds=load_dataset("FINAL-Bench/Metacognitive",split="train")
|
| 27 |
-
tasks=[
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
return tasks
|
| 34 |
except Exception as e:
|
| 35 |
-
print(f"
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
"
|
| 43 |
-
"
|
| 44 |
-
"
|
| 45 |
-
"
|
| 46 |
-
"
|
| 47 |
-
"
|
|
|
|
|
|
|
| 48 |
}
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
def
|
| 57 |
-
return round(sum(scores.get(k,0.5)*w for k,w in
|
|
|
|
|
|
|
| 58 |
|
| 59 |
def _strip(t):
|
| 60 |
-
if not t:return t
|
| 61 |
-
|
| 62 |
-
|
| 63 |
return t.strip()
|
| 64 |
|
| 65 |
-
def call_hf(prompt,
|
| 66 |
-
msgs=[]
|
| 67 |
-
if
|
| 68 |
-
msgs.append({"role":"user","content":prompt})
|
| 69 |
-
h={"Content-Type":"application/json","Authorization":f"Bearer {key}"}
|
| 70 |
-
|
| 71 |
-
|
|
|
|
| 72 |
try:
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
r.raise_for_status()
|
| 76 |
-
|
|
|
|
|
|
|
| 77 |
except Exception as e:
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
msgs.append({"role":"
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
| 88 |
try:
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
| 91 |
r.raise_for_status()
|
| 92 |
return _strip(r.json()["choices"][0]["message"]["content"])
|
| 93 |
except Exception as e:
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
"
|
| 107 |
-
"
|
| 108 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
}
|
| 110 |
-
OAI_MODELS={"GPT-5.2":"gpt-5.2","GPT-5.4":"gpt-5.4","GPT-5.1":"gpt-5.1"}
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
3. confidence_calibration
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
def build_jprompt(task,resp):
|
| 129 |
-
rl="\n".join([f' "{k}": {d}' for k,d in RUBRIC_D.items()])
|
| 130 |
-
sk=", ".join([f'"{k}": ___' for k in RUBRIC_KEYS])
|
| 131 |
-
ht=f"\nHidden trap: {task.hidden_trap}" if task.hidden_trap else ""
|
| 132 |
-
return f"""[FINAL Bench Metacognition Evaluation]
|
| 133 |
Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.ticos_type} | {task.difficulty}
|
| 134 |
Title: {task.title}
|
| 135 |
Prompt: {task.prompt[:1200]}
|
| 136 |
Expected: {task.expected_behavior[:500]}{ht}
|
| 137 |
=== RESPONSE ===
|
| 138 |
-
{
|
| 139 |
=== END ===
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
for a in range(3):
|
| 154 |
try:
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
r.raise_for_status()
|
| 158 |
-
c=r.json()["choices"][0]["message"]["content"]
|
| 159 |
if not c:
|
| 160 |
-
if a<2:time.sleep(2);continue
|
| 161 |
return None
|
| 162 |
-
d=json.loads(_strip(c))
|
| 163 |
if "scores" in d:
|
| 164 |
-
for k in
|
| 165 |
-
if k not in d["scores"]:d["scores"][k]=0.5
|
|
|
|
| 166 |
return d
|
| 167 |
-
except:
|
| 168 |
-
|
|
|
|
| 169 |
return None
|
| 170 |
|
| 171 |
-
DB
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
c
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
try:
|
| 188 |
-
with open(SF) as f:return json.load(f)
|
| 189 |
-
except:return{"version":"1.
|
| 190 |
-
|
| 191 |
-
def
|
| 192 |
-
d=
|
| 193 |
-
d["
|
| 194 |
-
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
| 196 |
return d
|
| 197 |
|
| 198 |
-
def
|
| 199 |
-
tk=os.getenv("HF_TOKEN","")
|
| 200 |
-
if not tk:return "HF_TOKEN
|
| 201 |
try:
|
| 202 |
from huggingface_hub import HfApi
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
path_in_repo="final_scores.json",
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
from concurrent.futures import ThreadPoolExecutor
|
| 211 |
|
| 212 |
-
def
|
|
|
|
| 213 |
try:
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
if jd is None:
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
except Exception as e:
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
try:
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
return
|
| 266 |
for f in list(futs):
|
| 267 |
-
if f in
|
| 268 |
if f.done():
|
| 269 |
-
|
| 270 |
try:
|
| 271 |
-
tid,
|
| 272 |
-
with
|
| 273 |
-
except:
|
|
|
|
| 274 |
time.sleep(0.5)
|
| 275 |
-
|
| 276 |
-
|
|
|
|
|
|
|
| 277 |
except Exception as e:
|
| 278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
-
|
| 281 |
-
global _S
|
| 282 |
-
ds={};ts={}
|
| 283 |
for dom in set(t.domain for t in tasks):
|
| 284 |
-
v=[
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
| 286 |
for tt in set(t.ticos_type for t in tasks):
|
| 287 |
-
v=[
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
if tasks:
|
| 337 |
-
rows=""
|
| 338 |
for t in tasks:
|
| 339 |
-
|
| 340 |
-
if t.task_id in
|
| 341 |
-
s=
|
| 342 |
-
if s
|
|
|
|
| 343 |
else:
|
| 344 |
-
c
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
for dom in sorted(set(t.domain for t in tasks)):
|
| 354 |
-
v=[
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
return
|
| 382 |
-
|
| 383 |
-
def
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
return
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
<
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
π‘ HF Inference (μ€νμμ€) + π OpenAI (ν΄λ‘μ¦λ) β βοΈ GPT-5.2 Judge<br>
|
| 399 |
-
π β <code>final_scores.json</code> β ALL Bench Metacog μλ λ°μ</p></div>"""
|
| 400 |
|
| 401 |
def create_app():
|
| 402 |
-
with gr.Blocks(title="FINAL Bench Evaluator"
|
| 403 |
-
|
| 404 |
-
|
| 405 |
with gr.Row():
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
with gr.Row():
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
with gr.Row():
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
|
|
|
|
|
|
|
|
|
| 417 |
with gr.Row():
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
|
|
|
|
|
|
|
|
|
| 424 |
with gr.Tabs():
|
| 425 |
-
with gr.Tab("π Progress"):
|
| 426 |
-
|
| 427 |
-
with gr.Tab("
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
return app
|
| 437 |
|
| 438 |
-
if __name__=="__main__":
|
| 439 |
-
|
| 440 |
-
for t in
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
app.queue(default_concurrency_limit=2)
|
| 445 |
-
app.launch(server_name="0.0.0.0",server_port=7860,ssr_mode=False
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
FINAL Bench Auto-Evaluator v1.1
|
| 3 |
+
================================
|
| 4 |
+
FINAL-Bench/Metacognitive 100 tasks
|
| 5 |
+
HF Inference API + OpenAI Judge -> final_scores.json
|
| 6 |
+
Gradio 6.x compatible
|
| 7 |
|
| 8 |
+
Author: Ginigen AI Β· License: Apache 2.0
|
|
|
|
|
|
|
| 9 |
"""
|
| 10 |
+
import json, os, time, re, hashlib, sqlite3, threading, csv, io
|
| 11 |
from datetime import datetime
|
| 12 |
from dataclasses import dataclass
|
| 13 |
from typing import Optional
|
| 14 |
import requests, numpy as np, gradio as gr
|
| 15 |
|
| 16 |
+
# ββββββββββββββ DATA ββββββββββββββ
|
| 17 |
+
|
| 18 |
@dataclass
|
| 19 |
+
class Task:
|
| 20 |
+
task_id: str; domain: str; grade: str; ticos_type: str
|
| 21 |
+
difficulty: str; lens: str; title: str; prompt: str
|
| 22 |
+
expected_behavior: str; hidden_trap: Optional[str] = None
|
| 23 |
+
ticos_required: str = ""; ticos_optional: str = ""
|
| 24 |
|
| 25 |
def load_tasks():
|
| 26 |
try:
|
| 27 |
from datasets import load_dataset
|
| 28 |
+
ds = load_dataset("FINAL-Bench/Metacognitive", split="train")
|
| 29 |
+
tasks = []
|
| 30 |
+
for r in ds:
|
| 31 |
+
tasks.append(Task(
|
| 32 |
+
task_id=r["task_id"], domain=r["domain"], grade=r["grade"],
|
| 33 |
+
ticos_type=r["ticos_type"], difficulty=r["difficulty"],
|
| 34 |
+
lens=r.get("lens",""), title=r["title"], prompt=r["prompt"],
|
| 35 |
+
expected_behavior=r["expected_behavior"],
|
| 36 |
+
hidden_trap=r.get("hidden_trap"),
|
| 37 |
+
ticos_required=r.get("ticos_required",""),
|
| 38 |
+
ticos_optional=r.get("ticos_optional","")))
|
| 39 |
+
print(f"β
{len(tasks)} tasks loaded")
|
| 40 |
return tasks
|
| 41 |
except Exception as e:
|
| 42 |
+
print(f"β Load failed: {e}")
|
| 43 |
+
return []
|
| 44 |
+
|
| 45 |
+
TASKS = load_tasks()
|
| 46 |
+
|
| 47 |
+
# TICOS types from actual dataset
|
| 48 |
+
TICOS = {
|
| 49 |
+
"A_TrapEscape": {"n": "ν¨μ νμΆ", "i": "πͺ€"},
|
| 50 |
+
"B_ContradictionResolution": {"n": "λͺ¨μν΄κ²°", "i": "β‘"},
|
| 51 |
+
"C_ProgressiveDiscovery": {"n": "μ μ§λ°κ²¬", "i": "π¬"},
|
| 52 |
+
"D_MultiConstraint": {"n": "λ€μ€μ μ½", "i": "π―"},
|
| 53 |
+
"E_SelfCorrecting": {"n": "μκΈ°μμ ", "i": "π"},
|
| 54 |
+
"F_ExpertPanel": {"n": "μ λ¬Έκ°ν λ‘ ", "i": "π₯"},
|
| 55 |
+
"G_PivotDetection": {"n": "μ νκ°μ§", "i": "π"},
|
| 56 |
+
"H_DecisionUnderUncertainty":{"n": "λΆνμ€μ±νλ¨", "i": "π"},
|
| 57 |
}
|
| 58 |
|
| 59 |
+
# ββββββββββββββ RUBRIC ββββββββββββββ
|
| 60 |
+
|
| 61 |
+
RK = ["trap_detection", "insight_depth", "confidence_calibration", "self_correction", "synthesis_quality"]
|
| 62 |
+
RW = {"trap_detection": 0.20, "insight_depth": 0.20, "confidence_calibration": 0.25,
|
| 63 |
+
"self_correction": 0.20, "synthesis_quality": 0.15}
|
| 64 |
+
RD = {"trap_detection": "Hidden trap/error detection",
|
| 65 |
+
"insight_depth": "Depth of genuine insight",
|
| 66 |
+
"confidence_calibration": "Confidence-accuracy alignment (overconfidence penalized)",
|
| 67 |
+
"self_correction": "Error detection and actual correction",
|
| 68 |
+
"synthesis_quality": "Coherent final synthesis"}
|
| 69 |
|
| 70 |
+
def calc_score(scores):
|
| 71 |
+
return round(sum(scores.get(k, 0.5) * w for k, w in RW.items()) * 100, 2)
|
| 72 |
+
|
| 73 |
+
# ββββββββββββββ LLM CALLS ββββββββββββββ
|
| 74 |
|
| 75 |
def _strip(t):
|
| 76 |
+
if not t: return t
|
| 77 |
+
for tag in ['think', 'thinking', 'reasoning', 'reflection']:
|
| 78 |
+
t = re.sub(rf'<{tag}>.*?</{tag}>', '', t, flags=re.DOTALL)
|
| 79 |
return t.strip()
|
| 80 |
|
| 81 |
+
def call_hf(prompt, sys_msg="", key="", model="Qwen/Qwen3.5-397B-A17B", max_tok=4096, temp=0.6):
|
| 82 |
+
msgs = []
|
| 83 |
+
if sys_msg: msgs.append({"role": "system", "content": sys_msg})
|
| 84 |
+
msgs.append({"role": "user", "content": prompt})
|
| 85 |
+
h = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
|
| 86 |
+
body = {"model": model, "messages": msgs, "max_tokens": max_tok, "temperature": temp, "stream": False}
|
| 87 |
+
|
| 88 |
+
for attempt in range(3):
|
| 89 |
try:
|
| 90 |
+
print(f" π‘ HF call: {model} (attempt {attempt+1})")
|
| 91 |
+
r = requests.post(
|
| 92 |
+
f"https://router.huggingface.co/hf-inference/models/{model}/v1/chat/completions",
|
| 93 |
+
headers=h, json=body, timeout=120)
|
| 94 |
+
print(f" π‘ Status: {r.status_code}")
|
| 95 |
+
if r.status_code in (429, 503):
|
| 96 |
+
wait = 10 * (attempt + 1)
|
| 97 |
+
print(f" β³ Rate limited, waiting {wait}s")
|
| 98 |
+
time.sleep(wait); continue
|
| 99 |
r.raise_for_status()
|
| 100 |
+
content = r.json()["choices"][0]["message"]["content"]
|
| 101 |
+
print(f" β
Got {len(content)} chars")
|
| 102 |
+
return _strip(content)
|
| 103 |
except Exception as e:
|
| 104 |
+
print(f" β HF error: {e}")
|
| 105 |
+
if attempt < 2: time.sleep(3 * (attempt + 1))
|
| 106 |
+
else: return f"[API_ERROR] {e}"
|
| 107 |
+
|
| 108 |
+
def call_oai(prompt, sys_msg="", key="", model="gpt-5.2", max_tok=4096, temp=0.6):
|
| 109 |
+
msgs = []
|
| 110 |
+
if sys_msg: msgs.append({"role": "system", "content": sys_msg})
|
| 111 |
+
msgs.append({"role": "user", "content": prompt})
|
| 112 |
+
h = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
|
| 113 |
+
body = {"model": model, "messages": msgs, "max_tokens": max_tok, "temperature": temp}
|
| 114 |
+
|
| 115 |
+
for attempt in range(2):
|
| 116 |
try:
|
| 117 |
+
print(f" π OpenAI call: {model}")
|
| 118 |
+
r = requests.post("https://api.openai.com/v1/chat/completions",
|
| 119 |
+
headers=h, json=body, timeout=120)
|
| 120 |
+
if r.status_code == 429:
|
| 121 |
+
time.sleep(5 * (attempt + 1)); continue
|
| 122 |
r.raise_for_status()
|
| 123 |
return _strip(r.json()["choices"][0]["message"]["content"])
|
| 124 |
except Exception as e:
|
| 125 |
+
print(f" β OpenAI error: {e}")
|
| 126 |
+
if attempt < 1: time.sleep(3)
|
| 127 |
+
else: return f"[API_ERROR] {e}"
|
| 128 |
+
|
| 129 |
+
def call_model(prompt, sys_msg="", key="", model="", api_type="hf", max_tok=4096, temp=0.6):
|
| 130 |
+
if api_type == "openai":
|
| 131 |
+
return call_oai(prompt, sys_msg, key, model, max_tok, temp)
|
| 132 |
+
return call_hf(prompt, sys_msg, key, model, max_tok, temp)
|
| 133 |
+
|
| 134 |
+
# ββββββββββββββ MODELS ββββββββββββββ
|
| 135 |
+
|
| 136 |
+
HF_MODELS = {
|
| 137 |
+
"Qwen3.5-397B": "Qwen/Qwen3.5-397B-A17B",
|
| 138 |
+
"Qwen3.5-122B": "Qwen/Qwen3.5-122B-A10B",
|
| 139 |
+
"Qwen3.5-27B": "Qwen/Qwen3.5-27B",
|
| 140 |
+
"Qwen3.5-35B": "Qwen/Qwen3.5-35B-A3B",
|
| 141 |
+
"Qwen3.5-9B": "Qwen/Qwen3.5-9B",
|
| 142 |
+
"Qwen3.5-4B": "Qwen/Qwen3.5-4B",
|
| 143 |
+
"DeepSeek V3.2": "deepseek-ai/DeepSeek-V3-0324",
|
| 144 |
+
"DeepSeek R1": "deepseek-ai/DeepSeek-R1",
|
| 145 |
+
"Llama 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
| 146 |
+
"Llama 4 Maverick": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
|
| 147 |
+
"Phi-4": "microsoft/phi-4",
|
| 148 |
+
"Mistral Large 3": "mistralai/Mistral-Large-Instruct-2501",
|
| 149 |
}
|
| 150 |
+
OAI_MODELS = {"GPT-5.2": "gpt-5.2", "GPT-5.4": "gpt-5.4", "GPT-5.1": "gpt-5.1"}
|
| 151 |
+
|
| 152 |
+
# ββββββββββββββ JUDGE ββββββββββββββ
|
| 153 |
+
|
| 154 |
+
JUDGE_SYS = """You are a FINAL Bench Metacognition Judge. Score 5 TICOS dimensions using ONLY 0.0/0.25/0.5/0.75/1.0:
|
| 155 |
+
|
| 156 |
+
1. trap_detection: Did model detect hidden traps? 1.0=all found, 0.0=fell in
|
| 157 |
+
2. insight_depth: Genuine deep understanding? 1.0=novel, 0.0=wrong
|
| 158 |
+
3. confidence_calibration: Confidence matches accuracy? 1.0=calibrated, 0.0=overconfident. Overconfidence is WORSE than underconfidence.
|
| 159 |
+
4. self_correction: Caught and fixed own errors? 1.0=backtracked+fixed, 0.0=none
|
| 160 |
+
5. synthesis_quality: Final synthesis coherent? 1.0=unified, 0.0=fragmented
|
| 161 |
+
|
| 162 |
+
Output ONLY JSON: {"scores":{"trap_detection":X,"insight_depth":X,"confidence_calibration":X,"self_correction":X,"synthesis_quality":X},"comment":"one line"}"""
|
| 163 |
+
|
| 164 |
+
def make_judge_prompt(task, response):
|
| 165 |
+
sk = ', '.join([f'"{k}": ___' for k in RK])
|
| 166 |
+
ht = f"\nHidden trap: {task.hidden_trap}" if task.hidden_trap else ""
|
| 167 |
+
return f"""[FINAL Bench Evaluation]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.ticos_type} | {task.difficulty}
|
| 169 |
Title: {task.title}
|
| 170 |
Prompt: {task.prompt[:1200]}
|
| 171 |
Expected: {task.expected_behavior[:500]}{ht}
|
| 172 |
=== RESPONSE ===
|
| 173 |
+
{response[:8000]}
|
| 174 |
=== END ===
|
| 175 |
+
Output ONLY: {{"scores": {{{sk}}}, "comment": "..."}}"""
|
| 176 |
+
|
| 177 |
+
def judge(prompt, key, model="gpt-5.2"):
|
| 178 |
+
schema = {
|
| 179 |
+
"type": "object",
|
| 180 |
+
"properties": {
|
| 181 |
+
"scores": {
|
| 182 |
+
"type": "object",
|
| 183 |
+
"properties": {k: {"type": "number", "enum": [0.0, 0.25, 0.5, 0.75, 1.0]} for k in RK},
|
| 184 |
+
"required": RK, "additionalProperties": False},
|
| 185 |
+
"comment": {"type": "string"}},
|
| 186 |
+
"required": ["scores", "comment"], "additionalProperties": False}
|
| 187 |
+
|
| 188 |
+
msgs = [{"role": "system", "content": JUDGE_SYS}, {"role": "user", "content": prompt}]
|
| 189 |
+
payload = {"model": model, "max_completion_tokens": 4096, "temperature": 0.1,
|
| 190 |
+
"messages": msgs,
|
| 191 |
+
"response_format": {"type": "json_schema",
|
| 192 |
+
"json_schema": {"name": "FBResult", "strict": True, "schema": schema}}}
|
| 193 |
+
h = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
|
| 194 |
+
|
| 195 |
for a in range(3):
|
| 196 |
try:
|
| 197 |
+
print(f" βοΈ Judge call (attempt {a+1})")
|
| 198 |
+
r = requests.post("https://api.openai.com/v1/chat/completions",
|
| 199 |
+
headers=h, json=payload, timeout=180)
|
| 200 |
+
print(f" βοΈ Judge status: {r.status_code}")
|
| 201 |
+
if r.status_code == 429:
|
| 202 |
+
time.sleep(5 * (a + 1)); continue
|
| 203 |
r.raise_for_status()
|
| 204 |
+
c = r.json()["choices"][0]["message"]["content"]
|
| 205 |
if not c:
|
| 206 |
+
if a < 2: time.sleep(2); continue
|
| 207 |
return None
|
| 208 |
+
d = json.loads(_strip(c))
|
| 209 |
if "scores" in d:
|
| 210 |
+
for k in RK:
|
| 211 |
+
if k not in d["scores"]: d["scores"][k] = 0.5
|
| 212 |
+
print(f" β
Judge OK: {d.get('comment','')[:50]}")
|
| 213 |
return d
|
| 214 |
+
except Exception as e:
|
| 215 |
+
print(f" β Judge error: {e}")
|
| 216 |
+
if a < 2: time.sleep(3 * (a + 1))
|
| 217 |
return None
|
| 218 |
|
| 219 |
+
# ββββββββββββββ DB ββββββββββββββ
|
| 220 |
+
|
| 221 |
+
DB = "final_bench.db"
|
| 222 |
+
|
| 223 |
+
def db_init():
|
| 224 |
+
c = sqlite3.connect(DB)
|
| 225 |
+
c.execute("CREATE TABLE IF NOT EXISTS results(rid TEXT, tid TEXT, resp TEXT, jdg TEXT, score REAL, ts REAL, PRIMARY KEY(rid,tid))")
|
| 226 |
+
c.commit(); c.close()
|
| 227 |
+
|
| 228 |
+
def db_save(rid, tid, resp, jdg, score):
|
| 229 |
+
c = sqlite3.connect(DB)
|
| 230 |
+
c.execute("INSERT OR REPLACE INTO results VALUES(?,?,?,?,?,?)", (rid, tid, resp, jdg, score, time.time()))
|
| 231 |
+
c.commit(); c.close()
|
| 232 |
+
|
| 233 |
+
def db_load(rid):
|
| 234 |
+
c = sqlite3.connect(DB)
|
| 235 |
+
rows = c.execute("SELECT tid, resp, jdg, score FROM results WHERE rid=?", (rid,)).fetchall()
|
| 236 |
+
c.close()
|
| 237 |
+
return {r[0]: {"response": r[1], "judge": r[2], "score": r[3]} for r in rows}
|
| 238 |
+
|
| 239 |
+
def db_clear(rid):
|
| 240 |
+
c = sqlite3.connect(DB)
|
| 241 |
+
c.execute("DELETE FROM results WHERE rid=?", (rid,))
|
| 242 |
+
c.commit(); c.close()
|
| 243 |
+
|
| 244 |
+
db_init()
|
| 245 |
+
|
| 246 |
+
# ββββββββββββββ SCORES FILE ββββββββββββββ
|
| 247 |
+
|
| 248 |
+
SF = "final_scores.json"
|
| 249 |
+
|
| 250 |
+
def sf_load():
|
| 251 |
try:
|
| 252 |
+
with open(SF) as f: return json.load(f)
|
| 253 |
+
except: return {"version": "1.1", "bench": "FINAL-Bench/Metacognitive", "updated": "", "models": {}}
|
| 254 |
+
|
| 255 |
+
def sf_save(name, score, dom_scores, ticos_scores, n_total, n_done):
|
| 256 |
+
d = sf_load()
|
| 257 |
+
d["updated"] = datetime.now().isoformat()
|
| 258 |
+
d["models"][name] = {
|
| 259 |
+
"final_score": score, "domain_scores": dom_scores,
|
| 260 |
+
"ticos_scores": ticos_scores, "tasks_total": n_total,
|
| 261 |
+
"tasks_completed": n_done, "evaluated_at": datetime.now().isoformat()}
|
| 262 |
+
with open(SF, "w") as f: json.dump(d, f, indent=2, ensure_ascii=False)
|
| 263 |
return d
|
| 264 |
|
| 265 |
+
def sf_upload(d):
|
| 266 |
+
tk = os.getenv("HF_TOKEN", "")
|
| 267 |
+
if not tk: return "β οΈ HF_TOKEN not set"
|
| 268 |
try:
|
| 269 |
from huggingface_hub import HfApi
|
| 270 |
+
HfApi(token=tk).upload_file(
|
| 271 |
+
path_or_fileobj=json.dumps(d, indent=2, ensure_ascii=False).encode("utf-8"),
|
| 272 |
+
path_in_repo="final_scores.json",
|
| 273 |
+
repo_id="FINAL-Bench/ALL-Bench-Leaderboard", repo_type="dataset",
|
| 274 |
+
commit_message=f"FINAL Score {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
| 275 |
+
return "β
Uploaded to HF"
|
| 276 |
+
except Exception as e: return f"β Upload: {e}"
|
| 277 |
+
|
| 278 |
+
# ββββββββββββββ EVAL ENGINE ββββββββββββββ
|
| 279 |
|
| 280 |
from concurrent.futures import ThreadPoolExecutor
|
| 281 |
|
| 282 |
+
def eval_one(task, rid, key, jkey, mid, jmodel, atype, state):
|
| 283 |
+
print(f"\n{'='*40}\nπ Evaluating: {task.task_id} ({task.ticos_type})")
|
| 284 |
try:
|
| 285 |
+
# 1. Model response
|
| 286 |
+
resp = call_model(task.prompt, key=key, model=mid, api_type=atype)
|
| 287 |
+
if not resp or resp.startswith("[API_ERROR"):
|
| 288 |
+
print(f" β Model failed: {resp[:100]}")
|
| 289 |
+
db_save(rid, task.task_id, resp or "empty", "{}", 0)
|
| 290 |
+
with state["lock"]:
|
| 291 |
+
state["done"] += 1
|
| 292 |
+
state["errors"].append(task.task_id)
|
| 293 |
+
return task.task_id, {"response": resp, "judge": "{}", "score": 0}
|
| 294 |
+
|
| 295 |
+
# 2. Judge
|
| 296 |
+
jp = make_judge_prompt(task, resp)
|
| 297 |
+
jd = judge(jp, jkey, jmodel)
|
| 298 |
if jd is None:
|
| 299 |
+
print(f" β Judge failed for {task.task_id}")
|
| 300 |
+
jd = {"scores": {k: 0.0 for k in RK}, "comment": "judge_failed", "failed": True}
|
| 301 |
+
|
| 302 |
+
if jd.get("failed"):
|
| 303 |
+
sc = -1.0
|
| 304 |
+
else:
|
| 305 |
+
sc = calc_score(jd["scores"])
|
| 306 |
+
with state["lock"]: state["jok"] += 1
|
| 307 |
+
|
| 308 |
+
jj = json.dumps(jd, ensure_ascii=False)
|
| 309 |
+
db_save(rid, task.task_id, resp, jj, sc)
|
| 310 |
+
print(f" π Score: {sc}")
|
| 311 |
+
|
| 312 |
+
with state["lock"]:
|
| 313 |
+
state["done"] += 1
|
| 314 |
+
ti = TICOS.get(task.ticos_type, {})
|
| 315 |
+
state["active"].append(f'{ti.get("i","π")} {task.task_id} β {sc}')
|
| 316 |
+
if len(state["active"]) > 10:
|
| 317 |
+
state["active"] = state["active"][-10:]
|
| 318 |
+
|
| 319 |
+
return task.task_id, {"response": resp, "judge": jj, "score": sc}
|
| 320 |
+
|
| 321 |
except Exception as e:
|
| 322 |
+
print(f" π₯ Exception: {e}")
|
| 323 |
+
db_save(rid, task.task_id, f"[ERR] {e}", "{}", 0)
|
| 324 |
+
with state["lock"]:
|
| 325 |
+
state["done"] += 1
|
| 326 |
+
state["errors"].append(f"{task.task_id}: {str(e)[:40]}")
|
| 327 |
+
return task.task_id, {"response": f"[ERR] {e}", "judge": "{}", "score": 0}
|
| 328 |
+
|
| 329 |
+
# ββ State ββ
|
| 330 |
+
ST = {
|
| 331 |
+
"running": False, "stop": False, "finished": False,
|
| 332 |
+
"rid": "", "model": "", "done": 0, "total": 0, "cached": 0,
|
| 333 |
+
"errors": [], "active": [], "jok": 0, "t0": 0,
|
| 334 |
+
"results": {}, "tasks": [],
|
| 335 |
+
"lock": threading.Lock(), "msg": "", "csv": None, "hf": "",
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
def st_reset():
|
| 339 |
+
with ST["lock"]:
|
| 340 |
+
ST.update({"running": False, "stop": False, "finished": False,
|
| 341 |
+
"done": 0, "cached": 0, "errors": [], "active": [], "jok": 0,
|
| 342 |
+
"t0": 0, "results": {}, "tasks": [],
|
| 343 |
+
"msg": "", "csv": None, "hf": ""})
|
| 344 |
+
|
| 345 |
+
def bg_eval(key, jkey, mid, mname, jmodel, atype, tasks, rid, nw):
|
| 346 |
+
print(f"\n{'#'*50}")
|
| 347 |
+
print(f"# BG EVAL START: {mname} ({len(tasks)} tasks, {nw} workers)")
|
| 348 |
+
print(f"# API type: {atype}, Model ID: {mid}")
|
| 349 |
+
print(f"{'#'*50}\n")
|
| 350 |
+
|
| 351 |
try:
|
| 352 |
+
cached = db_load(rid)
|
| 353 |
+
nc = sum(1 for t in tasks if t.task_id in cached)
|
| 354 |
+
pending = [t for t in tasks if t.task_id not in cached]
|
| 355 |
+
|
| 356 |
+
with ST["lock"]:
|
| 357 |
+
ST["results"] = cached
|
| 358 |
+
ST["cached"] = nc
|
| 359 |
+
ST["total"] = len(tasks)
|
| 360 |
+
ST["t0"] = time.time()
|
| 361 |
+
|
| 362 |
+
if not pending:
|
| 363 |
+
with ST["lock"]: ST["msg"] = f"πΎ All cached ({nc})"
|
| 364 |
+
finalize(tasks, cached, mname)
|
| 365 |
+
return
|
| 366 |
+
|
| 367 |
+
with ST["lock"]: ST["msg"] = f"β‘ {len(pending)} tasks, {nw} workers"
|
| 368 |
+
print(f"π Pending: {len(pending)}, Cached: {nc}")
|
| 369 |
+
|
| 370 |
+
with ThreadPoolExecutor(max_workers=nw) as exe:
|
| 371 |
+
futs = {}
|
| 372 |
+
for task in pending:
|
| 373 |
+
if ST["stop"]: break
|
| 374 |
+
f = exe.submit(eval_one, task, rid, key, jkey, mid, jmodel, atype, ST)
|
| 375 |
+
futs[f] = task
|
| 376 |
+
|
| 377 |
+
done_set = set()
|
| 378 |
+
while len(done_set) < len(futs):
|
| 379 |
+
if ST["stop"]:
|
| 380 |
+
print("βΉοΈ Stop requested")
|
| 381 |
+
with ST["lock"]:
|
| 382 |
+
ST["msg"] = "βΉοΈ Stopped"
|
| 383 |
+
ST["running"] = False
|
| 384 |
+
ST["finished"] = True
|
| 385 |
return
|
| 386 |
for f in list(futs):
|
| 387 |
+
if f in done_set: continue
|
| 388 |
if f.done():
|
| 389 |
+
done_set.add(f)
|
| 390 |
try:
|
| 391 |
+
tid, data = f.result()
|
| 392 |
+
with ST["lock"]: ST["results"][tid] = data
|
| 393 |
+
except Exception as e:
|
| 394 |
+
print(f"Future error: {e}")
|
| 395 |
time.sleep(0.5)
|
| 396 |
+
|
| 397 |
+
with ST["lock"]: results = dict(ST["results"])
|
| 398 |
+
finalize(tasks, results, mname)
|
| 399 |
+
|
| 400 |
except Exception as e:
|
| 401 |
+
print(f"π₯ BG EVAL CRASH: {e}")
|
| 402 |
+
import traceback; traceback.print_exc()
|
| 403 |
+
with ST["lock"]:
|
| 404 |
+
ST["msg"] = f"β {str(e)[:100]}"
|
| 405 |
+
ST["running"] = False
|
| 406 |
+
ST["finished"] = True
|
| 407 |
+
|
| 408 |
+
def finalize(tasks, results, mname):
|
| 409 |
+
print(f"\nπ Finalizing: {len(results)} results")
|
| 410 |
|
| 411 |
+
ds = {}
|
|
|
|
|
|
|
| 412 |
for dom in set(t.domain for t in tasks):
|
| 413 |
+
v = [results[t.task_id]["score"] for t in tasks
|
| 414 |
+
if t.domain == dom and t.task_id in results and results[t.task_id]["score"] >= 0]
|
| 415 |
+
if v: ds[dom] = round(np.mean(v), 2)
|
| 416 |
+
|
| 417 |
+
ts = {}
|
| 418 |
for tt in set(t.ticos_type for t in tasks):
|
| 419 |
+
v = [results[t.task_id]["score"] for t in tasks
|
| 420 |
+
if t.ticos_type == tt and t.task_id in results and results[t.task_id]["score"] >= 0]
|
| 421 |
+
if v: ts[tt] = round(np.mean(v), 2)
|
| 422 |
+
|
| 423 |
+
av = [results[t.task_id]["score"] for t in tasks
|
| 424 |
+
if t.task_id in results and results[t.task_id]["score"] >= 0]
|
| 425 |
+
fs = round(np.mean(av), 2) if av else 0
|
| 426 |
+
|
| 427 |
+
print(f"π FINAL Score: {fs} ({len(av)}/{len(tasks)} tasks)")
|
| 428 |
+
|
| 429 |
+
sd = sf_save(mname, fs, ds, ts, len(tasks), len(av))
|
| 430 |
+
hf = sf_upload(sd)
|
| 431 |
+
el = int(time.time() - ST["t0"]) if ST["t0"] else 0
|
| 432 |
+
|
| 433 |
+
# CSV
|
| 434 |
+
cp = f"/tmp/fb_{ST['rid']}.csv"
|
| 435 |
+
with open(cp, "w", encoding="utf-8") as f:
|
| 436 |
+
w = csv.writer(f)
|
| 437 |
+
w.writerow(["task_id","domain","grade","ticos","difficulty","title","score","comment"])
|
| 438 |
+
tm = {t.task_id: t for t in tasks}
|
| 439 |
+
for tid, d in sorted(results.items()):
|
| 440 |
+
t = tm.get(tid)
|
| 441 |
+
if not t: continue
|
| 442 |
+
try: jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else {}
|
| 443 |
+
except: jd = {}
|
| 444 |
+
w.writerow([tid, t.domain, t.grade, t.ticos_type, t.difficulty, t.title,
|
| 445 |
+
d["score"], (jd.get("comment","") if isinstance(jd,dict) else "")[:200]])
|
| 446 |
+
|
| 447 |
+
with ST["lock"]:
|
| 448 |
+
ST["csv"] = cp
|
| 449 |
+
ST["hf"] = hf
|
| 450 |
+
ST["msg"] = f"π FINAL Score = {fs} ({el}s, {len(av)}/{len(tasks)})"
|
| 451 |
+
ST["running"] = False
|
| 452 |
+
ST["finished"] = True
|
| 453 |
+
|
| 454 |
+
print(f"β
Done: FINAL Score = {fs}")
|
| 455 |
+
|
| 456 |
+
# ββββββββββββββ UI CALLBACKS ββββββββββββββ
|
| 457 |
+
|
| 458 |
+
def do_start(model, api_type, eval_key, judge_key, judge_model, diff, max_t, workers, fresh):
|
| 459 |
+
print(f"\nπ START clicked: model={model}, api={api_type}, fresh={fresh}")
|
| 460 |
+
|
| 461 |
+
if ST["running"]:
|
| 462 |
+
return "β οΈ Already running"
|
| 463 |
+
|
| 464 |
+
eval_key = (eval_key or "").strip() or os.getenv("HF_TOKEN", "")
|
| 465 |
+
judge_key = (judge_key or "").strip() or os.getenv("OPENAI_API_KEY", "")
|
| 466 |
+
|
| 467 |
+
if not eval_key:
|
| 468 |
+
print("β No eval key")
|
| 469 |
+
return "β API Key needed"
|
| 470 |
+
if not judge_key:
|
| 471 |
+
print("β No judge key")
|
| 472 |
+
return "β Judge Key needed"
|
| 473 |
+
|
| 474 |
+
print(f" Keys: eval={eval_key[:8]}... judge={judge_key[:8]}...")
|
| 475 |
+
|
| 476 |
+
if api_type == "HuggingFace Inference":
|
| 477 |
+
mid = HF_MODELS.get(model, model)
|
| 478 |
+
at = "hf"
|
| 479 |
+
else:
|
| 480 |
+
mid = OAI_MODELS.get(model, model)
|
| 481 |
+
at = "openai"
|
| 482 |
+
|
| 483 |
+
tasks = TASKS[:]
|
| 484 |
+
if diff != "μ 체":
|
| 485 |
+
tasks = [t for t in tasks if t.difficulty == diff]
|
| 486 |
+
tasks = tasks[:int(max_t)]
|
| 487 |
+
|
| 488 |
+
print(f" Model ID: {mid}, Tasks: {len(tasks)}")
|
| 489 |
+
|
| 490 |
+
rid = hashlib.md5(f"FB_{mid}".encode()).hexdigest()[:12]
|
| 491 |
+
if fresh:
|
| 492 |
+
db_clear(rid)
|
| 493 |
+
print(" ποΈ Cache cleared")
|
| 494 |
+
|
| 495 |
+
st_reset()
|
| 496 |
+
with ST["lock"]:
|
| 497 |
+
ST["running"] = True
|
| 498 |
+
ST["rid"] = rid
|
| 499 |
+
ST["model"] = model
|
| 500 |
+
ST["tasks"] = tasks
|
| 501 |
+
ST["total"] = len(tasks)
|
| 502 |
+
|
| 503 |
+
thread = threading.Thread(
|
| 504 |
+
target=bg_eval,
|
| 505 |
+
args=(eval_key, judge_key, mid, model, judge_model, at, tasks, rid, int(workers)),
|
| 506 |
+
daemon=True)
|
| 507 |
+
thread.start()
|
| 508 |
+
print(f" π§΅ Thread started")
|
| 509 |
+
|
| 510 |
+
return f"𧬠{model} started ({len(tasks)} tasks, {int(workers)} workers)"
|
| 511 |
+
|
| 512 |
+
def do_stop():
    """Request cancellation of the evaluation currently in flight.

    Sets the shared stop flag; the background worker observes it between
    tasks.  Returns a short status string for the UI.
    """
    if not ST["running"]:
        return "Not running"
    ST["stop"] = True
    return "βΉοΈ Stopping..."
def do_poll():
    """Render the current run state for the polling Timer.

    Returns a 4-tuple consumed by the Timer's outputs:
    ``(progress_html, results_table_html, summary_html, csv_path_or_None)``.
    """
    # Snapshot the shared state under the lock so the rendering below works
    # on a consistent view while the background thread keeps writing.
    with ST["lock"]:
        running = ST["running"]
        finished = ST["finished"]
        tasks = ST.get("tasks", [])
        results = dict(ST.get("results", {}))
        msg = ST.get("msg", "")
        csvp = ST.get("csv")

    # Nothing has ever run in this session: show the idle hint only.
    if not running and not finished and not results:
        return ("βΉοΈ Select model β press βΆοΈ Start", "", "", None)

    # Progress bar
    if running:
        # NOTE(review): these ST reads happen outside the lock — benign for
        # display purposes, but values may be mid-update; confirm acceptable.
        dn = ST["done"]
        tot = ST.get("total", 1)
        pct = min(int(dn / max(tot, 1) * 100), 100)
        el = int(time.time() - ST.get("t0", time.time()))
        # Naive linear ETA: elapsed-per-done-task times remaining tasks.
        eta = int((el / max(dn, 1)) * (tot - dn)) if dn > 0 else 0
        active = ST.get("active", [])
        jok = ST.get("jok", 0)
        errs = ST.get("errors", [])

        # Chips for the (up to six) most recently active task ids.
        tags = " ".join([f'<span style="background:#ede9fe;padding:2px 6px;border-radius:4px;'
                         f'font-size:12px">{a}</span>' for a in active[-6:]])
        err_html = ""
        if errs:
            # Only surface the three most recent errors to keep the bar compact.
            err_html = f'<div style="color:#dc2626;margin-top:6px;font-size:12px">β οΈ Errors: {", ".join(errs[-3:])}</div>'

        prog = f"""<div style="padding:12px;background:#fafafa;border-radius:8px;border:1px solid #e5e7eb">
<div style="display:flex;justify-content:space-between;margin-bottom:6px">
<span style="font-size:14px">𧬠{dn}/{tot} Β· {el}s Β· ETA ~{eta}s Β· Judge β {jok}</span>
<span style="font-weight:700;color:#7c3aed;font-size:16px">{pct}%</span>
</div>
<div style="background:#e5e7eb;border-radius:8px;height:24px;overflow:hidden">
<div style="width:{pct}%;height:100%;border-radius:8px;background:linear-gradient(90deg,#7c3aed,#6366f1);transition:width 0.3s"></div>
</div>
<div style="margin-top:8px">{tags}</div>{err_html}
</div>"""

    elif finished:
        prog = f'<div style="background:#f0fdf4;padding:16px;border-radius:8px;font-weight:700;border-left:4px solid #16a34a;font-size:16px">π {msg}</div>'
    else:
        prog = f'<div style="padding:12px">{msg}</div>'

    # Results table
    tbl = ""
    if tasks:
        rows = ""
        for t in tasks:
            ti = TICOS.get(t.ticos_type, {"i": "π", "n": t.ticos_type})
            if t.task_id in results:
                s = results[t.task_id]["score"]
                if s < 0:
                    # Negative score is the sentinel for "judge call failed".
                    rows += f'<tr style="background:#fef3c7"><td>{t.task_id}</td><td>{ti["i"]}</td><td>{t.domain}</td><td>{ti["n"]}</td><td>{t.difficulty}</td><td style="color:#f59e0b;font-weight:700">β Judge failed</td></tr>'
                else:
                    # Colour thresholds: green >= 80, amber >= 60, red below.
                    c = "#22c55e" if s >= 80 else ("#f59e0b" if s >= 60 else "#ef4444")
                    rows += f'<tr><td>{t.task_id}</td><td>{ti["i"]}</td><td>{t.domain}</td><td>{ti["n"]}</td><td>{t.difficulty}</td><td><div style="display:flex;align-items:center;gap:6px"><div style="background:#e5e7eb;border-radius:6px;height:16px;width:80px;overflow:hidden"><div style="width:{min(s,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="color:{c};font-weight:700;font-size:12px">{s:.1f}</span></div></td></tr>'
            else:
                # Task queued but not yet scored: dimmed placeholder row.
                rows += f'<tr style="opacity:0.4"><td>{t.task_id}</td><td>{ti["i"]}</td><td>{t.domain}</td><td>{ti["n"]}</td><td>{t.difficulty}</td><td>β³</td></tr>'
        tbl = f'<table style="width:100%;border-collapse:collapse;font-size:13px"><thead><tr style="background:#f1f5f9"><th style="padding:8px;text-align:left">ID</th><th></th><th>Domain</th><th>TICOS</th><th>Diff</th><th>Score</th></tr></thead><tbody>{rows}</tbody></table>'

    # Summary
    sm = ""
    if finished and tasks:
        # Valid (judge-succeeded) scores only; failed tasks (< 0) excluded.
        av = [results[t.task_id]["score"] for t in tasks
              if t.task_id in results and results[t.task_id]["score"] >= 0]
        fs = round(np.mean(av), 2) if av else 0

        # Domain bars
        dh = ""
        for dom in sorted(set(t.domain for t in tasks)):
            v = [results[t.task_id]["score"] for t in tasks
                 if t.domain == dom and t.task_id in results and results[t.task_id]["score"] >= 0]
            if v:
                a = round(np.mean(v), 1)
                c = "#22c55e" if a >= 80 else ("#f59e0b" if a >= 60 else "#ef4444")
                dh += f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:180px;font-size:13px">{dom}</span><div style="flex:1;background:#334155;border-radius:6px;height:14px;overflow:hidden"><div style="width:{min(a,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c};font-size:13px">{a}</span></div>'

        # TICOS bars
        th = ""
        for tt, info in TICOS.items():
            v = [results[t.task_id]["score"] for t in tasks
                 if t.ticos_type == tt and t.task_id in results and results[t.task_id]["score"] >= 0]
            if v:
                a = round(np.mean(v), 1)
                c = "#22c55e" if a >= 80 else ("#f59e0b" if a >= 60 else "#ef4444")
                th += f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:160px;font-size:13px">{info["i"]} {info["n"]}</span><div style="flex:1;background:#334155;border-radius:6px;height:14px;overflow:hidden"><div style="width:{min(a,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c};font-size:13px">{a}</span></div>'

        sm = f"""<div style="background:linear-gradient(135deg,#1e1b4b,#312e81);border-radius:14px;padding:24px;color:#fff;margin:8px 0">
<h2 style="margin:0;font-size:28px;text-align:center">𧬠FINAL Score: {fs} / 100</h2>
<p style="text-align:center;color:#a5b4fc;margin:8px 0">{ST.get("model","")} Β· {len(av)} tasks</p>
<hr style="border-color:#4338ca;margin:16px 0">
<h4 style="color:#a5b4fc;margin:8px 0">π Domains</h4>{dh}
<hr style="border-color:#4338ca;margin:16px 0">
<h4 style="color:#a5b4fc;margin:8px 0">𧬠TICOS Types</h4>{th}
<hr style="border-color:#4338ca;margin:16px 0">
<p style="font-size:12px;color:#818cf8">{ST.get("hf","")}</p></div>"""

    return (prog, tbl, sm, csvp)
def update_models(api_type):
    """Swap the model dropdown's choices to match the selected API backend."""
    catalog = HF_MODELS if api_type == "HuggingFace Inference" else OAI_MODELS
    names = list(catalog.keys())
    # First catalog entry becomes the new default selection.
    return gr.update(choices=names, value=names[0])
# ββββββββββββββ GRADIO APP ββββββββββββββ

# Static banner injected at the top of the Blocks UI via gr.HTML.
# NOTE(review): several characters below appear mojibake'd (multi-byte UTF-8
# decoded as a single-byte codepage) — confirm the file's encoding before
# editing these literals; they are reproduced verbatim here.
HEADER_HTML = """<div style="text-align:center;padding:16px 0">
<h1 style="margin:0;font-size:28px">𧬠FINAL Bench Auto-Evaluator v1.1</h1>
<h2 style="margin:4px 0;color:#6b7280;font-size:16px">Metacognitive Intelligence Β· 100 Tasks Β· TICOS Scoring</h2>
<p style="color:#9ca3af;font-size:13px;max-width:700px;margin:8px auto;line-height:1.6">
π <b>FINAL-Bench/Metacognitive</b> 100 tasks Β· 15 domains Β· 8 TICOS types<br>
𧬠<b>TICOS</b>: Trap · Insight · Confidence · Self-Correction · Synthesis<br>
π‘ HF Inference API (open-source) + π OpenAI (closed) β βοΈ GPT-5.2 Judge<br>
π β <code>final_scores.json</code> β ALL Bench Metacog column</p></div>"""
def create_app():
    """Build and return the Gradio Blocks application.

    Lays out the configuration controls, wires the start/stop handlers, and
    installs a 2-second Timer that polls ``do_poll`` for progress/results.
    """
    # theme/css are gr.Blocks() constructor arguments — they are NOT accepted
    # by launch() on Gradio 4/5, so they must be applied here to take effect.
    with gr.Blocks(title="FINAL Bench Evaluator",
                   theme=gr.themes.Soft(),
                   css=".gradio-container{max-width:1100px !important}") as app:
        gr.HTML(HEADER_HTML)

        with gr.Row():
            api_type = gr.Radio(
                ["HuggingFace Inference", "OpenAI Compatible"],
                value="HuggingFace Inference", label="π‘ API Type", scale=2)
            model_dd = gr.Dropdown(
                list(HF_MODELS.keys()), value=list(HF_MODELS.keys())[0],
                label="π€ Target Model", scale=3, allow_custom_value=True)

        # Switching the backend repopulates the model dropdown.
        api_type.change(update_models, [api_type], [model_dd])

        with gr.Row():
            eval_key = gr.Textbox(
                label="π Eval API Key (HF Token or OpenAI)",
                type="password", placeholder="hf_... or sk-...",
                value=os.getenv("HF_TOKEN", ""), scale=3)
            judge_key = gr.Textbox(
                label="βοΈ Judge Key (OpenAI)",
                type="password", placeholder="sk-...",
                value=os.getenv("OPENAI_API_KEY", ""), scale=3)

        with gr.Row():
            judge_model = gr.Textbox(label="βοΈ Judge Model", value="gpt-5.2", scale=2)
            diff_dd = gr.Dropdown(
                ["μ 체", "expert", "frontier"],
                value="μ 체", label="Difficulty", scale=1)
            max_tasks = gr.Slider(1, 100, value=100, step=1, label="Max Tasks", scale=2)
            workers = gr.Slider(1, 20, value=10, step=1, label="β‘ Workers", scale=1)

        with gr.Row():
            start_btn = gr.Button("βΆοΈ Start (Resume)", variant="primary", size="lg", scale=2)
            fresh_btn = gr.Button("π Fresh Start", variant="secondary", size="lg", scale=2)
            stop_btn = gr.Button("βΉοΈ Stop", variant="stop", size="lg", scale=1)

        status = gr.Textbox(label="Status", interactive=False, max_lines=1)

        with gr.Accordion("π Existing FINAL Scores", open=False):
            gr.JSON(value=sf_load(), label="final_scores.json")

        with gr.Tabs():
            with gr.Tab("π Progress"):
                prog_html = gr.HTML()
            with gr.Tab("π Results"):
                table_html = gr.HTML()
            with gr.Tab("π Summary"):
                summary_html = gr.HTML()
            with gr.Tab("πΎ CSV"):
                csv_file = gr.File(label="CSV Download")

        # Timer for polling: refresh every 2 s regardless of user activity.
        timer = gr.Timer(value=2, active=True)
        timer.tick(fn=do_poll, outputs=[prog_html, table_html, summary_html, csv_file])

        # Button handlers share one input list; `fresh` is bound per button.
        inputs = [model_dd, api_type, eval_key, judge_key, judge_model,
                  diff_dd, max_tasks, workers]

        start_btn.click(
            fn=lambda *a: do_start(*a, fresh=False),
            inputs=inputs, outputs=[status])
        fresh_btn.click(
            fn=lambda *a: do_start(*a, fresh=True),
            inputs=inputs, outputs=[status])
        stop_btn.click(fn=do_stop, outputs=[status])

        gr.Markdown(f"""---
<center>𧬠FINAL Bench Auto-Evaluator v1.1 · Apache 2.0 · Ginigen AI<br>
Data: <a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive">FINAL-Bench/Metacognitive</a> ({len(TASKS)} tasks)<br>
β ALL Bench Leaderboard Metacog auto-sync</center>""")

    return app
if __name__ == "__main__":
|
| 711 |
+
stats = {}
|
| 712 |
+
for t in TASKS:
|
| 713 |
+
stats[t.ticos_type] = stats.get(t.ticos_type, 0) + 1
|
| 714 |
+
print(f"FINAL Bench Evaluator: {len(TASKS)} tasks")
|
| 715 |
+
for tt, n in sorted(stats.items()):
|
| 716 |
+
info = TICOS.get(tt, {"i": "?", "n": tt})
|
| 717 |
+
print(f" {info['i']} {tt}: {n}")
|
| 718 |
+
|
| 719 |
+
app = create_app()
|
| 720 |
app.queue(default_concurrency_limit=2)
|
| 721 |
+
app.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False,
|
| 722 |
+
theme=gr.themes.Soft(),
|
| 723 |
+
css=".gradio-container{max-width:1100px !important}")
|