seawolf2357 commited on
Commit
90ad6a9
Β·
verified Β·
1 Parent(s): e460f3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +643 -365
app.py CHANGED
@@ -1,445 +1,723 @@
1
  """
2
- FINAL Bench Auto-Evaluator v1.0 β€” ALL Bench Metacog μžλ™ μΈ‘μ •
3
- =============================================================
4
- FINAL-Bench/Metacognitive 100문제 x HF Inference API x GPT Judge
5
- -> final_scores.json -> ALL Bench Leaderboard μžλ™ 연동
 
6
 
7
- TICOS 채점: T=Trap I=Insight C=Confidence O=Self-Correction S=Synthesis
8
-
9
- Author: Ginigen AI Β· FINAL-Bench Β· License: Apache 2.0
10
  """
11
- import json,os,time,csv,io,re,html,hashlib,sqlite3,threading
12
  from datetime import datetime
13
  from dataclasses import dataclass
14
  from typing import Optional
15
  import requests, numpy as np, gradio as gr
16
 
 
 
17
  @dataclass
18
- class FinalTask:
19
- task_id:str; domain:str; grade:str; ticos_type:str; difficulty:str
20
- lens:str; title:str; prompt:str; expected_behavior:str
21
- hidden_trap:Optional[str]=None; ticos_required:str=""; ticos_optional:str=""
 
22
 
23
  def load_tasks():
24
  try:
25
  from datasets import load_dataset
26
- ds=load_dataset("FINAL-Bench/Metacognitive",split="train")
27
- tasks=[FinalTask(task_id=r["task_id"],domain=r["domain"],grade=r["grade"],
28
- ticos_type=r["ticos_type"],difficulty=r["difficulty"],lens=r.get("lens",""),
29
- title=r["title"],prompt=r["prompt"],expected_behavior=r["expected_behavior"],
30
- hidden_trap=r.get("hidden_trap"),ticos_required=r.get("ticos_required",""),
31
- ticos_optional=r.get("ticos_optional","")) for r in ds]
32
- print(f"FINAL Bench: {len(tasks)} tasks loaded (HF Dataset)")
 
 
 
 
 
33
  return tasks
34
  except Exception as e:
35
- print(f"HF load failed: {e}"); return []
36
-
37
- ALL_TASKS=load_tasks()
38
-
39
- TICOS_INFO={
40
- "E_SelfCorrecting":{"name":"μžκΈ°μˆ˜μ •","icon":"πŸ”„"},
41
- "A_TrapEscape":{"name":"ν•¨μ •νƒˆμΆœ","icon":"πŸͺ€"},
42
- "B_ContradictionResolution":{"name":"λͺ¨μˆœν•΄κ²°","icon":"⚑"},
43
- "C_ProgressiveDiscovery":{"name":"μ μ§„λ°œκ²¬","icon":"πŸ”¬"},
44
- "D_MultiConstraint":{"name":"λ‹€μ€‘μ œμ•½","icon":"🎯"},
45
- "F_ExpertPanel":{"name":"μ „λ¬Έκ°€ν† λ‘ ","icon":"πŸ‘₯"},
46
- "G_PivotDetection":{"name":"μ „ν™˜κ°μ§€","icon":"πŸ”€"},
47
- "H_ConfidenceCalibration":{"name":"확신도보정","icon":"πŸ“Š"},
 
 
48
  }
49
 
50
- RUBRIC_KEYS=["trap_detection","insight_depth","confidence_calibration","self_correction","synthesis_quality"]
51
- RUBRIC_W={"trap_detection":0.20,"insight_depth":0.20,"confidence_calibration":0.25,"self_correction":0.20,"synthesis_quality":0.15}
52
- RUBRIC_D={"trap_detection":"μˆ¨κ²¨μ§„ 함정/였λ₯˜ 감지","insight_depth":"톡찰 κΉŠμ΄μ™€ μ •ν™•μ„±",
53
- "confidence_calibration":"확신도-정확도 일치 (κ³ΌλŒ€ν™•μ‹  감점)","self_correction":"였λ₯˜ 인지 ν›„ μˆ˜μ • μ‹€ν–‰",
54
- "synthesis_quality":"μ’…ν•©μ˜ 일관성과 μ™„κ²°μ„±"}
 
 
 
 
 
55
 
56
- def final_score(scores):
57
- return round(sum(scores.get(k,0.5)*w for k,w in RUBRIC_W.items())*100,2)
 
 
58
 
59
  def _strip(t):
60
- if not t:return t
61
- t=re.sub(r'<think>.*?</think>','',t,flags=re.DOTALL)
62
- t=re.sub(r'<thinking>.*?</thinking>','',t,flags=re.DOTALL)
63
  return t.strip()
64
 
65
- def call_hf(prompt,sys="",key="",mid="Qwen/Qwen3.5-397B-A17B",mt=4096,temp=0.6):
66
- msgs=[]
67
- if sys:msgs.append({"role":"system","content":sys})
68
- msgs.append({"role":"user","content":prompt})
69
- h={"Content-Type":"application/json","Authorization":f"Bearer {key}"}
70
- p={"model":mid,"messages":msgs,"max_tokens":mt,"temperature":temp,"stream":False}
71
- for a in range(3):
 
72
  try:
73
- r=requests.post(f"https://router.huggingface.co/hf-inference/models/{mid}/v1/chat/completions",headers=h,json=p,timeout=120)
74
- if r.status_code in(429,503):time.sleep(5*(a+1));continue
 
 
 
 
 
 
 
75
  r.raise_for_status()
76
- return _strip(r.json()["choices"][0]["message"]["content"])
 
 
77
  except Exception as e:
78
- if a<2:time.sleep(3*(a+1))
79
- else:return f"[API_ERROR] {e}"
80
-
81
- def call_oai(prompt,sys="",key="",model="gpt-5.2",mt=4096,temp=0.6):
82
- msgs=[]
83
- if sys:msgs.append({"role":"system","content":sys})
84
- msgs.append({"role":"user","content":prompt})
85
- h={"Content-Type":"application/json","Authorization":f"Bearer {key}"}
86
- p={"model":model,"messages":msgs,"max_tokens":mt,"temperature":temp}
87
- for a in range(2):
 
 
88
  try:
89
- r=requests.post("https://api.openai.com/v1/chat/completions",headers=h,json=p,timeout=120)
90
- if r.status_code==429:time.sleep(5*(a+1));continue
 
 
 
91
  r.raise_for_status()
92
  return _strip(r.json()["choices"][0]["message"]["content"])
93
  except Exception as e:
94
- if a<1:time.sleep(3)
95
- else:return f"[API_ERROR] {e}"
96
-
97
- def call_model(prompt,sys="",key="",mid="",at="hf",mt=4096,temp=0.6):
98
- if at=="openai":return call_oai(prompt,sys,key,mid,mt,temp)
99
- return call_hf(prompt,sys,key,mid,mt,temp)
100
-
101
- HF_MODELS={
102
- "Qwen3.5-397B":"Qwen/Qwen3.5-397B-A17B","Qwen3.5-122B":"Qwen/Qwen3.5-122B-A10B",
103
- "Qwen3.5-27B":"Qwen/Qwen3.5-27B","Qwen3.5-35B":"Qwen/Qwen3.5-35B-A3B",
104
- "Qwen3.5-9B":"Qwen/Qwen3.5-9B","Qwen3.5-4B":"Qwen/Qwen3.5-4B",
105
- "DeepSeek V3.2":"deepseek-ai/DeepSeek-V3-0324","DeepSeek R1":"deepseek-ai/DeepSeek-R1",
106
- "Llama 4 Scout":"meta-llama/Llama-4-Scout-17B-16E-Instruct",
107
- "Llama 4 Maverick":"meta-llama/Llama-4-Maverick-17B-128E-Instruct",
108
- "Phi-4":"microsoft/phi-4","Mistral Large 3":"mistralai/Mistral-Large-Instruct-2501",
 
 
 
 
 
 
 
 
 
109
  }
110
- OAI_MODELS={"GPT-5.2":"gpt-5.2","GPT-5.4":"gpt-5.4","GPT-5.1":"gpt-5.1"}
111
-
112
- JUDGE_SYS="""You are a FINAL Bench Metacognition Judge. Score each TICOS dimension using ONLY 0.0/0.25/0.5/0.75/1.0:
113
-
114
- 1. trap_detection (T): Did model detect hidden traps/false premises?
115
- 1.0=all traps found, 0.5=some missed, 0.0=fell into traps
116
- 2. insight_depth (I): Genuine deep understanding?
117
- 1.0=novel insights, 0.5=correct but shallow, 0.0=wrong
118
- 3. confidence_calibration (C): Confidence matches accuracy?
119
- 1.0=well-calibrated, 0.5=inconsistent, 0.0=overconfident on wrong answers
120
- CRITICAL: Overconfidence is WORSE than underconfidence.
121
- 4. self_correction (O): Caught and fixed own errors?
122
- 1.0=explicit backtrack+correct, 0.5=acknowledged not fixed, 0.0=no correction
123
- 5. synthesis_quality (S): Final synthesis coherent and complete?
124
- 1.0=unified nuanced conclusion, 0.5=partial, 0.0=fragmented
125
-
126
- Output JSON: {"scores":{"trap_detection":X,"insight_depth":X,"confidence_calibration":X,"self_correction":X,"synthesis_quality":X},"comment":"ν•œμ€„ 평가"}"""
127
-
128
- def build_jprompt(task,resp):
129
- rl="\n".join([f' "{k}": {d}' for k,d in RUBRIC_D.items()])
130
- sk=", ".join([f'"{k}": ___' for k in RUBRIC_KEYS])
131
- ht=f"\nHidden trap: {task.hidden_trap}" if task.hidden_trap else ""
132
- return f"""[FINAL Bench Metacognition Evaluation]
133
  Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.ticos_type} | {task.difficulty}
134
  Title: {task.title}
135
  Prompt: {task.prompt[:1200]}
136
  Expected: {task.expected_behavior[:500]}{ht}
137
  === RESPONSE ===
138
- {resp[:8000]}
139
  === END ===
140
- Score TICOS (0.0/0.25/0.5/0.75/1.0):
141
- {rl}
142
- Output ONLY: {{"scores": {{{sk}}}, "comment": "ν•œμ€„ 평가"}}"""
143
-
144
- def call_judge(prompt,key,model="gpt-5.2"):
145
- schema={"type":"object","properties":{"scores":{"type":"object",
146
- "properties":{k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC_KEYS},
147
- "required":RUBRIC_KEYS,"additionalProperties":False},
148
- "comment":{"type":"string"}},"required":["scores","comment"],"additionalProperties":False}
149
- msgs=[{"role":"system","content":JUDGE_SYS},{"role":"user","content":prompt}]
150
- p={"model":model,"max_completion_tokens":4096,"temperature":0.1,"messages":msgs,
151
- "response_format":{"type":"json_schema","json_schema":{"name":"FINALResult","strict":True,"schema":schema}}}
152
- h={"Content-Type":"application/json","Authorization":f"Bearer {key}"}
 
 
 
 
 
 
 
153
  for a in range(3):
154
  try:
155
- r=requests.post("https://api.openai.com/v1/chat/completions",headers=h,json=p,timeout=180)
156
- if r.status_code==429:time.sleep(5*(a+1));continue
 
 
 
 
157
  r.raise_for_status()
158
- c=r.json()["choices"][0]["message"]["content"]
159
  if not c:
160
- if a<2:time.sleep(2);continue
161
  return None
162
- d=json.loads(_strip(c))
163
  if "scores" in d:
164
- for k in RUBRIC_KEYS:
165
- if k not in d["scores"]:d["scores"][k]=0.5
 
166
  return d
167
- except:
168
- if a<2:time.sleep(3*(a+1))
 
169
  return None
170
 
171
- DB="final_bench.db"
172
- def _idb():
173
- c=sqlite3.connect(DB)
174
- c.execute("CREATE TABLE IF NOT EXISTS r(rid TEXT,tid TEXT,resp TEXT,judge TEXT,score REAL,ts REAL,PRIMARY KEY(rid,tid))")
175
- c.commit();c.close()
176
- def _rid(m):return hashlib.md5(f"FB_{m}".encode()).hexdigest()[:12]
177
- def _sv(rid,tid,resp,jj,sc):
178
- c=sqlite3.connect(DB);c.execute("INSERT OR REPLACE INTO r VALUES(?,?,?,?,?,?)",(rid,tid,resp,jj,sc,time.time()));c.commit();c.close()
179
- def _la(rid):
180
- c=sqlite3.connect(DB);cur=c.execute("SELECT tid,resp,judge,score FROM r WHERE rid=?", (rid,));rows=cur.fetchall();c.close()
181
- return{r[0]:{"response":r[1],"judge":r[2],"score":r[3]} for r in rows}
182
- def _clr(rid):c=sqlite3.connect(DB);c.execute("DELETE FROM r WHERE rid=?",(rid,));c.commit();c.close()
183
- _idb()
184
-
185
- SF="final_scores.json"
186
- def _lsf():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  try:
188
- with open(SF) as f:return json.load(f)
189
- except:return{"version":"1.0","bench":"FINAL-Bench/Metacognitive","updated":"","models":{}}
190
-
191
- def _ssf(mn,sc,ds,ts,nt,nc):
192
- d=_lsf();d["updated"]=datetime.now().isoformat()
193
- d["models"][mn]={"final_score":sc,"domain_scores":ds,"ticos_scores":ts,
194
- "tasks_total":nt,"tasks_completed":nc,"evaluated_at":datetime.now().isoformat()}
195
- with open(SF,"w") as f:json.dump(d,f,indent=2,ensure_ascii=False)
 
 
 
196
  return d
197
 
198
- def _uhf(d):
199
- tk=os.getenv("HF_TOKEN","")
200
- if not tk:return "HF_TOKEN λ―Έμ„€μ •"
201
  try:
202
  from huggingface_hub import HfApi
203
- api=HfApi(token=tk)
204
- api.upload_file(path_or_fileobj=json.dumps(d,indent=2,ensure_ascii=False).encode("utf-8"),
205
- path_in_repo="final_scores.json",repo_id="FINAL-Bench/ALL-Bench-Leaderboard",
206
- repo_type="dataset",commit_message=f"FINAL Score {datetime.now().strftime('%Y-%m-%d %H:%M')}")
207
- return "HF upload OK"
208
- except Exception as e:return f"Upload fail: {e}"
 
 
 
209
 
210
  from concurrent.futures import ThreadPoolExecutor
211
 
212
- def _e1(t,rid,key,jk,mid,jm,at,st):
 
213
  try:
214
- resp=call_model(t.prompt,key=key,mid=mid,at=at)
215
- if resp.startswith("[API_ERROR"):
216
- _sv(rid,t.task_id,resp,"{}",0)
217
- with st["lk"]:st["dn"]+=1;st["er"].append(t.task_id)
218
- return t.task_id,{"response":resp,"judge":"{}","score":0}
219
- jp=build_jprompt(t,resp)
220
- jd=call_judge(jp,jk,jm)
 
 
 
 
 
 
221
  if jd is None:
222
- jd={"scores":{k:0.0 for k in RUBRIC_KEYS},"comment":"judge_failed","failed":True}
223
- if jd.get("failed"):sc=-1.0
224
- else:sc=final_score(jd["scores"]);
225
- with st["lk"]:
226
- if not jd.get("failed"):st["jok"]+=1
227
- jj=json.dumps(jd,ensure_ascii=False)
228
- _sv(rid,t.task_id,resp,jj,sc)
229
- with st["lk"]:
230
- st["dn"]+=1;ic=TICOS_INFO.get(t.ticos_type,{})
231
- st["ac"].append(f'{ic.get("icon","")}{t.task_id}');
232
- if len(st["ac"])>10:st["ac"]=st["ac"][-10:]
233
- return t.task_id,{"response":resp,"judge":jj,"score":sc}
 
 
 
 
 
 
 
 
 
 
234
  except Exception as e:
235
- _sv(rid,t.task_id,f"[ERR]{e}","{}",0)
236
- with st["lk"]:st["dn"]+=1;st["er"].append(f"{t.task_id}:{str(e)[:40]}")
237
- return t.task_id,{"response":f"[ERR]{e}","judge":"{}","score":0}
238
-
239
- _S={"run":False,"stp":False,"fin":False,"rid":"","mdl":"","dn":0,"tot":0,"cch":0,
240
- "er":[],"ac":[],"jok":0,"t0":0,"res":{},"tsk":[],"lk":threading.Lock(),
241
- "msg":"","csv":None,"hfs":""}
242
-
243
- def _rst():
244
- global _S
245
- with _S["lk"]:
246
- _S.update({"run":False,"stp":False,"fin":False,"dn":0,"cch":0,"er":[],"ac":[],"jok":0,
247
- "t0":0,"res":{},"tsk":[],"msg":"","csv":None,"hfs":""})
248
-
249
- def _bgev(key,jk,mid,mn,jm,at,tasks,rid,wk):
250
- global _S
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  try:
252
- res=dict(_la(rid));cch=sum(1 for t in tasks if t.task_id in res)
253
- pend=[t for t in tasks if t.task_id not in res]
254
- with _S["lk"]:_S["res"]=res;_S["cch"]=cch;_S["tot"]=len(tasks);_S["t0"]=time.time()
255
- if not pend:
256
- with _S["lk"]:_S["msg"]=f"Cache: {cch}"
257
- _fin(tasks,res,mn);return
258
- with _S["lk"]:_S["msg"]=f"{len(pend)} tasks, {wk} workers"
259
- with ThreadPoolExecutor(max_workers=wk) as exe:
260
- futs={exe.submit(_e1,t,rid,key,jk,mid,jm,at,_S):t for t in pend if not _S["stp"]}
261
- done=set()
262
- while len(done)<len(futs):
263
- if _S["stp"]:
264
- with _S["lk"]:_S["msg"]="Stopped";_S["run"]=False;_S["fin"]=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  return
266
  for f in list(futs):
267
- if f in done:continue
268
  if f.done():
269
- done.add(f)
270
  try:
271
- tid,d=f.result()
272
- with _S["lk"]:_S["res"][tid]=d
273
- except:pass
 
274
  time.sleep(0.5)
275
- with _S["lk"]:res=dict(_S["res"])
276
- _fin(tasks,res,mn)
 
 
277
  except Exception as e:
278
- with _S["lk"]:_S["msg"]=f"ERR:{str(e)[:80]}";_S["run"]=False;_S["fin"]=True
 
 
 
 
 
 
 
 
279
 
280
- def _fin(tasks,res,mn):
281
- global _S
282
- ds={};ts={}
283
  for dom in set(t.domain for t in tasks):
284
- v=[res[t.task_id]["score"] for t in tasks if t.domain==dom and t.task_id in res and res[t.task_id]["score"]>=0]
285
- if v:ds[dom]=round(np.mean(v),2)
 
 
 
286
  for tt in set(t.ticos_type for t in tasks):
287
- v=[res[t.task_id]["score"] for t in tasks if t.ticos_type==tt and t.task_id in res and res[t.task_id]["score"]>=0]
288
- if v:ts[tt]=round(np.mean(v),2)
289
- av=[res[t.task_id]["score"] for t in tasks if t.task_id in res and res[t.task_id]["score"]>=0]
290
- fs=round(np.mean(av),2) if av else 0
291
- sd=_ssf(mn,fs,ds,ts,len(tasks),len(av))
292
- rid=_S["rid"]
293
- cp=f"/tmp/fb_{rid}.csv"
294
- with open(cp,"w",encoding="utf-8") as f:
295
- w=csv.writer(f);w.writerow(["task_id","domain","grade","ticos_type","difficulty","title","score","comment","ts"])
296
- tm={t.task_id:t for t in tasks}
297
- for tid,d in sorted(res.items()):
298
- t=tm.get(tid)
299
- if not t:continue
300
- jd={}
301
- try:jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else {}
302
- except:pass
303
- w.writerow([tid,t.domain,t.grade,t.ticos_type,t.difficulty,t.title,d["score"],
304
- (jd.get("comment","") if isinstance(jd,dict) else "")[:200],datetime.now().isoformat()])
305
- hfs=_uhf(sd)
306
- el=int(time.time()-_S["t0"]) if _S["t0"] else 0
307
- with _S["lk"]:
308
- _S["csv"]=cp;_S["hfs"]=hfs
309
- _S["msg"]=f"FINAL Score = {fs} ({el}s, {len(av)}/{len(tasks)})"
310
- _S["run"]=False;_S["fin"]=True
311
-
312
- CSS='<style>.et{width:100%;border-collapse:collapse;font-size:.85em}.et th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc}.et td{padding:6px 8px;border-bottom:1px solid #eee}.sb{background:#e0e0e0;border-radius:8px;height:18px;overflow:hidden;min-width:80px}.sf{height:100%;border-radius:8px}.sc{background:linear-gradient(135deg,#1a1a2e,#16213e);border-radius:14px;padding:20px;color:#fff;margin:8px 0}.pb{background:#e0e0e0;border-radius:8px;height:22px;margin:12px 0;overflow:hidden}.pf{height:100%;border-radius:8px;background:linear-gradient(90deg,#7c3aed,#6366f1)}</style>'
313
-
314
- def _clr2(s):
315
- if s>=80:return"#4caf50"
316
- if s>=60:return"#ff9800"
317
- return"#f44336"
318
-
319
- def _poll():
320
- global _S
321
- with _S["lk"]:
322
- run=_S["run"];fin=_S["fin"];tasks=_S.get("tsk",[]);res=dict(_S.get("res",{}))
323
- msg=_S.get("msg","");csvp=_S.get("csv")
324
- if not run and not fin and not res:
325
- return("Select model and press Start.","","",None)
326
- if run:
327
- dn=_S["dn"];tot=_S.get("tot",1);pct=min(int(dn/max(tot,1)*100),100)
328
- el=int(time.time()-_S.get("t0",time.time()));eta=int((el/max(dn,1))*(tot-dn)) if dn>0 else 0
329
- ac=_S.get("ac",[]);jok=_S.get("jok",0)
330
- tg=" ".join([f'<span style="background:#ede9fe;padding:2px 6px;border-radius:4px;font-size:.78em">{a}</span>' for a in ac[-8:]])
331
- prog=f'{CSS}<div><div style="display:flex;justify-content:space-between;margin-bottom:4px"><span>🧬 {dn}/{tot} Β· {el}s Β· ETA {eta}s Β· Judgeβœ…{jok}</span><span style="font-weight:700;color:#7c3aed">{pct}%</span></div><div class="pb"><div class="pf" style="width:{pct}%"></div></div><div style="margin-top:6px">{tg}</div></div>'
332
- elif fin:
333
- prog=f'<div style="background:#f0fdf4;padding:14px;border-radius:8px;font-weight:700;border-left:4px solid #16a34a">🏁 {msg}</div>'
334
- else:prog=msg
335
- tbl=""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  if tasks:
337
- rows=""
338
  for t in tasks:
339
- ic=TICOS_INFO.get(t.ticos_type,{})
340
- if t.task_id in res:
341
- s=res[t.task_id]["score"]
342
- if s<0:rows+=f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{ic.get("icon","")}</td><td>{t.domain}</td><td>{t.ticos_type}</td><td>{t.difficulty}</td><td style="color:#ff9800">❌</td></tr>'
 
343
  else:
344
- c=_clr2(s);rows+=f'<tr><td>{t.task_id}</td><td>{ic.get("icon","")}</td><td>{t.domain}</td><td>{t.ticos_type}</td><td>{t.difficulty}</td><td><div class="sb"><div class="sf" style="width:{min(s,100)}%;background:{c}"></div></div><span style="color:{c};font-weight:700">{s:.1f}</span></td></tr>'
345
- else:rows+=f'<tr style="opacity:.4"><td>{t.task_id}</td><td>{ic.get("icon","")}</td><td>{t.domain}</td><td>-</td><td>-</td><td>⏳</td></tr>'
346
- tbl=f'{CSS}<table class="et"><thead><tr><th>ID</th><th></th><th>Domain</th><th>TICOS</th><th>Diff</th><th>Score</th></tr></thead><tbody>{rows}</tbody></table>'
347
- sm=""
348
- if fin and tasks:
349
- av=[res[t.task_id]["score"] for t in tasks if t.task_id in res and res[t.task_id]["score"]>=0]
350
- fs=round(np.mean(av),2) if av else 0
351
- gr2="A" if fs>=80 else("B+" if fs>=70 else("B" if fs>=60 else "C"))
352
- dh=""
 
 
 
 
 
 
353
  for dom in sorted(set(t.domain for t in tasks)):
354
- v=[res[t.task_id]["score"] for t in tasks if t.domain==dom and t.task_id in res and res[t.task_id]["score"]>=0]
355
- if v:a=np.mean(v);c=_clr2(a);dh+=f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:180px;font-size:.85em">{dom}</span><div style="flex:1;background:#333;border-radius:6px;height:14px;overflow:hidden"><div style="width:{min(a,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c}">{a:.1f}</span></div>'
356
- th=""
357
- for tt,info in TICOS_INFO.items():
358
- v=[res[t.task_id]["score"] for t in tasks if t.ticos_type==tt and t.task_id in res and res[t.task_id]["score"]>=0]
359
- if v:a=np.mean(v);c=_clr2(a);th+=f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:150px;font-size:.85em">{info["icon"]} {info["name"]}</span><div style="flex:1;background:#333;border-radius:6px;height:14px;overflow:hidden"><div style="width:{min(a,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c}">{a:.1f}</span></div>'
360
- jf=sum(1 for t in tasks if t.task_id in res and res[t.task_id]["score"]<0)
361
- sm=f'{CSS}<div class="sc"><h2 style="margin:0;font-size:1.6em;text-align:center">🧬 FINAL Score: {fs} / 100</h2><h3 style="margin:4px 0;text-align:center;color:#aaa">Grade {gr2} Β· {_S.get("mdl","")}</h3><p style="text-align:center;color:#888;font-size:.9em">{len(av)}문제{f" Β· ❌{jf}" if jf else ""}</p><hr style="border-color:#333;margin:12px 0"><h4 style="color:#aaa">πŸ“š 도메인별</h4>{dh}<hr style="border-color:#333;margin:12px 0"><h4 style="color:#aaa">🧬 TICOS별</h4>{th}<hr style="border-color:#333;margin:12px 0"><p style="font-size:.85em;color:#aaa">{_S.get("hfs","")}</p></div>'
362
- return(prog,tbl,sm,csvp)
363
-
364
- def _start(mc,at,ek,jk,jm,df,mt,nw,fresh):
365
- global _S
366
- if _S["run"]:return"Already running"
367
- ek=(ek or"").strip() or os.getenv("HF_TOKEN","")
368
- jk=(jk or"").strip() or os.getenv("OPENAI_API_KEY","")
369
- if not ek:return"Need API key"
370
- if not jk:return"Need Judge key"
371
- if at=="HuggingFace Inference":mid=HF_MODELS.get(mc,mc);a="hf"
372
- else:mid=OAI_MODELS.get(mc,mc);a="openai"
373
- tasks=ALL_TASKS[:]
374
- if df!="전체":tasks=[t for t in tasks if t.difficulty==df]
375
- tasks=tasks[:int(mt)]
376
- rid=_rid(mid)
377
- if fresh:_clr(rid)
378
- _rst()
379
- with _S["lk"]:_S["run"]=True;_S["rid"]=rid;_S["mdl"]=mc;_S["tsk"]=tasks;_S["tot"]=len(tasks)
380
- threading.Thread(target=_bgev,args=(ek,jk,mid,mc,jm,a,tasks,rid,int(nw)),daemon=True).start()
381
- return f"🧬 {mc} FINAL Bench ({len(tasks)} tasks, {int(nw)} workers)"
382
-
383
- def _stop():
384
- global _S
385
- if _S["run"]:_S["stp"]=True;return"Stopping..."
386
- return"Not running"
387
-
388
- def _um(at):
389
- if at=="HuggingFace Inference":return gr.update(choices=list(HF_MODELS.keys()),value=list(HF_MODELS.keys())[0])
390
- return gr.update(choices=list(OAI_MODELS.keys()),value=list(OAI_MODELS.keys())[0])
391
-
392
- HEADER="""<div style="text-align:center;padding:16px 0">
393
- <h1 style="margin:0;font-size:1.8em">🧬 FINAL Bench Auto-Evaluator v1.0</h1>
394
- <h2 style="margin:4px 0;color:#555;font-size:1.05em">Metacognitive Intelligence Β· 100 Tasks Β· TICOS Scoring</h2>
395
- <p style="color:#888;font-size:.88em;max-width:700px;margin:8px auto">
396
- πŸ“Š <b>FINAL-Bench/Metacognitive</b> Β· 100문제 Β· 15도메인 Β· 8 TICOSμœ ν˜•<br>
397
- 🧬 TICOS: Trap · Insight · Confidence · Self-Correction · Synthesis<br>
398
- πŸ“‘ HF Inference (μ˜€ν”ˆμ†ŒμŠ€) + πŸ”‘ OpenAI (ν΄λ‘œμ¦ˆλ“œ) β†’ βš–οΈ GPT-5.2 Judge<br>
399
- πŸ“Š β†’ <code>final_scores.json</code> β†’ ALL Bench Metacog μžλ™ 반영</p></div>"""
400
 
401
  def create_app():
402
- with gr.Blocks(title="FINAL Bench Evaluator",theme=gr.themes.Soft(),
403
- css=".gradio-container{max-width:1100px!important}") as app:
404
- gr.HTML(HEADER)
405
  with gr.Row():
406
- at=gr.Radio(["HuggingFace Inference","OpenAI Compatible"],value="HuggingFace Inference",label="πŸ“‘ API",scale=2)
407
- md=gr.Dropdown(list(HF_MODELS.keys()),value=list(HF_MODELS.keys())[0],label="πŸ€– Model",scale=3,allow_custom_value=True)
408
- at.change(_um,[at],[md])
 
 
 
 
 
 
409
  with gr.Row():
410
- ek=gr.Textbox(label="πŸ”‘ Eval Key",type="password",placeholder="hf_... or sk-...",value=os.getenv("HF_TOKEN",""),scale=3)
411
- jk=gr.Textbox(label="βš–οΈ Judge Key",type="password",placeholder="sk-...",value=os.getenv("OPENAI_API_KEY",""),scale=3)
 
 
 
 
 
 
 
412
  with gr.Row():
413
- jm=gr.Textbox(label="βš–οΈ Judge",value="gpt-5.2",scale=2)
414
- df=gr.Dropdown(["전체","expert","frontier"],value="전체",label="Difficulty",scale=1)
415
- mt=gr.Slider(1,100,value=100,step=1,label="Tasks",scale=2)
416
- nw=gr.Slider(1,20,value=10,step=1,label="Workers",scale=1)
 
 
 
417
  with gr.Row():
418
- sb=gr.Button("▢️ Start",variant="primary",size="lg",scale=2)
419
- fb=gr.Button("πŸš€ Fresh",variant="secondary",size="lg",scale=2)
420
- xb=gr.Button("⏹️ Stop",variant="stop",size="lg",scale=1)
421
- st=gr.Textbox(label="Status",interactive=False,max_lines=1)
422
- with gr.Accordion("πŸ“Š Existing Scores",open=False):
423
- gr.JSON(value=_lsf(),label="final_scores.json")
 
 
 
424
  with gr.Tabs():
425
- with gr.Tab("πŸ“Š Progress"):p=gr.HTML()
426
- with gr.Tab("πŸ“‹ Results"):t=gr.HTML()
427
- with gr.Tab("πŸ† Summary"):s=gr.HTML()
428
- with gr.Tab("πŸ’Ύ CSV"):c=gr.File(label="CSV")
429
- timer=gr.Timer(value=2,active=True)
430
- timer.tick(fn=_poll,outputs=[p,t,s,c])
431
- ins=[md,at,ek,jk,jm,df,mt,nw]
432
- sb.click(fn=lambda *a:_start(*a,fresh=False),inputs=ins,outputs=[st])
433
- fb.click(fn=lambda *a:_start(*a,fresh=True),inputs=ins,outputs=[st])
434
- xb.click(fn=_stop,outputs=[st])
435
- gr.Markdown(f"---\n<center>🧬 FINAL Bench v1.0 · Apache 2.0 · Ginigen AI<br>Data: FINAL-Bench/Metacognitive · {len(ALL_TASKS)} tasks · TICOS</center>")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  return app
437
 
438
- if __name__=="__main__":
439
- st={}
440
- for t in ALL_TASKS:st[t.ticos_type]=st.get(t.ticos_type,0)+1
441
- print(f"FINAL Bench Evaluator: {len(ALL_TASKS)} tasks")
442
- for tt,n in sorted(st.items()):i=TICOS_INFO.get(tt,{});print(f" {i.get('icon','')} {tt}: {n}")
443
- app=create_app()
 
 
 
 
444
  app.queue(default_concurrency_limit=2)
445
- app.launch(server_name="0.0.0.0",server_port=7860,ssr_mode=False)
 
 
 
1
  """
2
+ FINAL Bench Auto-Evaluator v1.1
3
+ ================================
4
+ FINAL-Bench/Metacognitive 100 tasks
5
+ HF Inference API + OpenAI Judge -> final_scores.json
6
+ Gradio 6.x compatible
7
 
8
+ Author: Ginigen AI Β· License: Apache 2.0
 
 
9
  """
10
+ import json, os, time, re, hashlib, sqlite3, threading, csv, io
11
  from datetime import datetime
12
  from dataclasses import dataclass
13
  from typing import Optional
14
  import requests, numpy as np, gradio as gr
15
 
16
+ # ══════════════ DATA ══════════════
17
+
18
  @dataclass
19
+ class Task:
20
+ task_id: str; domain: str; grade: str; ticos_type: str
21
+ difficulty: str; lens: str; title: str; prompt: str
22
+ expected_behavior: str; hidden_trap: Optional[str] = None
23
+ ticos_required: str = ""; ticos_optional: str = ""
24
 
25
  def load_tasks():
26
  try:
27
  from datasets import load_dataset
28
+ ds = load_dataset("FINAL-Bench/Metacognitive", split="train")
29
+ tasks = []
30
+ for r in ds:
31
+ tasks.append(Task(
32
+ task_id=r["task_id"], domain=r["domain"], grade=r["grade"],
33
+ ticos_type=r["ticos_type"], difficulty=r["difficulty"],
34
+ lens=r.get("lens",""), title=r["title"], prompt=r["prompt"],
35
+ expected_behavior=r["expected_behavior"],
36
+ hidden_trap=r.get("hidden_trap"),
37
+ ticos_required=r.get("ticos_required",""),
38
+ ticos_optional=r.get("ticos_optional","")))
39
+ print(f"βœ… {len(tasks)} tasks loaded")
40
  return tasks
41
  except Exception as e:
42
+ print(f"❌ Load failed: {e}")
43
+ return []
44
+
45
+ TASKS = load_tasks()
46
+
47
+ # TICOS types from actual dataset
48
+ TICOS = {
49
+ "A_TrapEscape": {"n": "ν•¨μ •νƒˆμΆœ", "i": "πŸͺ€"},
50
+ "B_ContradictionResolution": {"n": "λͺ¨μˆœν•΄κ²°", "i": "⚑"},
51
+ "C_ProgressiveDiscovery": {"n": "μ μ§„λ°œκ²¬", "i": "πŸ”¬"},
52
+ "D_MultiConstraint": {"n": "λ‹€μ€‘μ œμ•½", "i": "🎯"},
53
+ "E_SelfCorrecting": {"n": "μžκΈ°μˆ˜μ •", "i": "πŸ”„"},
54
+ "F_ExpertPanel": {"n": "μ „λ¬Έκ°€ν† λ‘ ", "i": "πŸ‘₯"},
55
+ "G_PivotDetection": {"n": "μ „ν™˜κ°μ§€", "i": "πŸ”€"},
56
+ "H_DecisionUnderUncertainty":{"n": "λΆˆν™•μ‹€μ„±νŒλ‹¨", "i": "πŸ“Š"},
57
  }
58
 
59
+ # ══════════════ RUBRIC ══════════════
60
+
61
+ RK = ["trap_detection", "insight_depth", "confidence_calibration", "self_correction", "synthesis_quality"]
62
+ RW = {"trap_detection": 0.20, "insight_depth": 0.20, "confidence_calibration": 0.25,
63
+ "self_correction": 0.20, "synthesis_quality": 0.15}
64
+ RD = {"trap_detection": "Hidden trap/error detection",
65
+ "insight_depth": "Depth of genuine insight",
66
+ "confidence_calibration": "Confidence-accuracy alignment (overconfidence penalized)",
67
+ "self_correction": "Error detection and actual correction",
68
+ "synthesis_quality": "Coherent final synthesis"}
69
 
70
+ def calc_score(scores):
71
+ return round(sum(scores.get(k, 0.5) * w for k, w in RW.items()) * 100, 2)
72
+
73
+ # ══════════════ LLM CALLS ══════════════
74
 
75
  def _strip(t):
76
+ if not t: return t
77
+ for tag in ['think', 'thinking', 'reasoning', 'reflection']:
78
+ t = re.sub(rf'<{tag}>.*?</{tag}>', '', t, flags=re.DOTALL)
79
  return t.strip()
80
 
81
+ def call_hf(prompt, sys_msg="", key="", model="Qwen/Qwen3.5-397B-A17B", max_tok=4096, temp=0.6):
82
+ msgs = []
83
+ if sys_msg: msgs.append({"role": "system", "content": sys_msg})
84
+ msgs.append({"role": "user", "content": prompt})
85
+ h = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
86
+ body = {"model": model, "messages": msgs, "max_tokens": max_tok, "temperature": temp, "stream": False}
87
+
88
+ for attempt in range(3):
89
  try:
90
+ print(f" πŸ“‘ HF call: {model} (attempt {attempt+1})")
91
+ r = requests.post(
92
+ f"https://router.huggingface.co/hf-inference/models/{model}/v1/chat/completions",
93
+ headers=h, json=body, timeout=120)
94
+ print(f" πŸ“‘ Status: {r.status_code}")
95
+ if r.status_code in (429, 503):
96
+ wait = 10 * (attempt + 1)
97
+ print(f" ⏳ Rate limited, waiting {wait}s")
98
+ time.sleep(wait); continue
99
  r.raise_for_status()
100
+ content = r.json()["choices"][0]["message"]["content"]
101
+ print(f" βœ… Got {len(content)} chars")
102
+ return _strip(content)
103
  except Exception as e:
104
+ print(f" ❌ HF error: {e}")
105
+ if attempt < 2: time.sleep(3 * (attempt + 1))
106
+ else: return f"[API_ERROR] {e}"
107
+
108
+ def call_oai(prompt, sys_msg="", key="", model="gpt-5.2", max_tok=4096, temp=0.6):
109
+ msgs = []
110
+ if sys_msg: msgs.append({"role": "system", "content": sys_msg})
111
+ msgs.append({"role": "user", "content": prompt})
112
+ h = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
113
+ body = {"model": model, "messages": msgs, "max_tokens": max_tok, "temperature": temp}
114
+
115
+ for attempt in range(2):
116
  try:
117
+ print(f" πŸ”‘ OpenAI call: {model}")
118
+ r = requests.post("https://api.openai.com/v1/chat/completions",
119
+ headers=h, json=body, timeout=120)
120
+ if r.status_code == 429:
121
+ time.sleep(5 * (attempt + 1)); continue
122
  r.raise_for_status()
123
  return _strip(r.json()["choices"][0]["message"]["content"])
124
  except Exception as e:
125
+ print(f" ❌ OpenAI error: {e}")
126
+ if attempt < 1: time.sleep(3)
127
+ else: return f"[API_ERROR] {e}"
128
+
129
+ def call_model(prompt, sys_msg="", key="", model="", api_type="hf", max_tok=4096, temp=0.6):
130
+ if api_type == "openai":
131
+ return call_oai(prompt, sys_msg, key, model, max_tok, temp)
132
+ return call_hf(prompt, sys_msg, key, model, max_tok, temp)
133
+
134
+ # ══════════════ MODELS ══════════════
135
+
136
+ HF_MODELS = {
137
+ "Qwen3.5-397B": "Qwen/Qwen3.5-397B-A17B",
138
+ "Qwen3.5-122B": "Qwen/Qwen3.5-122B-A10B",
139
+ "Qwen3.5-27B": "Qwen/Qwen3.5-27B",
140
+ "Qwen3.5-35B": "Qwen/Qwen3.5-35B-A3B",
141
+ "Qwen3.5-9B": "Qwen/Qwen3.5-9B",
142
+ "Qwen3.5-4B": "Qwen/Qwen3.5-4B",
143
+ "DeepSeek V3.2": "deepseek-ai/DeepSeek-V3-0324",
144
+ "DeepSeek R1": "deepseek-ai/DeepSeek-R1",
145
+ "Llama 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
146
+ "Llama 4 Maverick": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
147
+ "Phi-4": "microsoft/phi-4",
148
+ "Mistral Large 3": "mistralai/Mistral-Large-Instruct-2501",
149
  }
150
+ OAI_MODELS = {"GPT-5.2": "gpt-5.2", "GPT-5.4": "gpt-5.4", "GPT-5.1": "gpt-5.1"}
151
+
152
+ # ══════════════ JUDGE ══════════════
153
+
154
+ JUDGE_SYS = """You are a FINAL Bench Metacognition Judge. Score 5 TICOS dimensions using ONLY 0.0/0.25/0.5/0.75/1.0:
155
+
156
+ 1. trap_detection: Did model detect hidden traps? 1.0=all found, 0.0=fell in
157
+ 2. insight_depth: Genuine deep understanding? 1.0=novel, 0.0=wrong
158
+ 3. confidence_calibration: Confidence matches accuracy? 1.0=calibrated, 0.0=overconfident. Overconfidence is WORSE than underconfidence.
159
+ 4. self_correction: Caught and fixed own errors? 1.0=backtracked+fixed, 0.0=none
160
+ 5. synthesis_quality: Final synthesis coherent? 1.0=unified, 0.0=fragmented
161
+
162
+ Output ONLY JSON: {"scores":{"trap_detection":X,"insight_depth":X,"confidence_calibration":X,"self_correction":X,"synthesis_quality":X},"comment":"one line"}"""
163
+
164
def make_judge_prompt(task, response):
    """Build the judge's user message for one (task, model response) pair.

    Long fields are truncated (prompt 1200, expected 500, response 8000
    chars) to keep the judge request bounded. The trailing line spells out
    the exact JSON shape expected, with one slot per key in RK.
    """
    score_slots = ', '.join(f'"{k}": ___' for k in RK)
    trap_note = f"\nHidden trap: {task.hidden_trap}" if task.hidden_trap else ""
    header = f"Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.ticos_type} | {task.difficulty}"
    return f"""[FINAL Bench Evaluation]
{header}
Title: {task.title}
Prompt: {task.prompt[:1200]}
Expected: {task.expected_behavior[:500]}{trap_note}
=== RESPONSE ===
{response[:8000]}
=== END ===
Output ONLY: {{"scores": {{{score_slots}}}, "comment": "..."}}"""
176
+
177
def judge(prompt, key, model="gpt-5.2"):
    """Score one response with the OpenAI judge model.

    Strict json_schema structured output restricts the judge to exactly the
    RK score keys, each limited to the quantized values 0.0/0.25/0.5/0.75/1.0.
    Retries up to 3 attempts on rate limits, transport errors, or empty
    content. Returns the parsed {"scores": {...}, "comment": "..."} dict,
    or None when every attempt fails (callers treat None as judge failure).
    """
    # Schema enforcing the five TICOS keys with quantized values only.
    schema = {
        "type": "object",
        "properties": {
            "scores": {
                "type": "object",
                "properties": {k: {"type": "number", "enum": [0.0, 0.25, 0.5, 0.75, 1.0]} for k in RK},
                "required": RK, "additionalProperties": False},
            "comment": {"type": "string"}},
        "required": ["scores", "comment"], "additionalProperties": False}

    msgs = [{"role": "system", "content": JUDGE_SYS}, {"role": "user", "content": prompt}]
    # Low temperature for scoring stability.
    payload = {"model": model, "max_completion_tokens": 4096, "temperature": 0.1,
               "messages": msgs,
               "response_format": {"type": "json_schema",
                                   "json_schema": {"name": "FBResult", "strict": True, "schema": schema}}}
    h = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}

    for a in range(3):
        try:
            print(f" βš–οΈ Judge call (attempt {a+1})")
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=h, json=payload, timeout=180)
            print(f" βš–οΈ Judge status: {r.status_code}")
            if r.status_code == 429:
                # Rate limited: linear backoff, burn this attempt.
                time.sleep(5 * (a + 1)); continue
            r.raise_for_status()
            c = r.json()["choices"][0]["message"]["content"]
            if not c:
                # Empty content is transient — retry unless out of attempts.
                if a < 2: time.sleep(2); continue
                return None
            d = json.loads(_strip(c))
            if "scores" in d:
                # Defensive: fill any missing dimension with the mid-scale 0.5.
                for k in RK:
                    if k not in d["scores"]: d["scores"][k] = 0.5
                print(f" βœ… Judge OK: {d.get('comment','')[:50]}")
                return d
            # No "scores" key: fall through and retry on the next iteration.
        except Exception as e:
            print(f" ❌ Judge error: {e}")
            if a < 2: time.sleep(3 * (a + 1))
    return None
218
 
219
# ══════════════ DB ══════════════

# SQLite file used as a resume cache: one row per (run id, task id).
DB = "final_bench.db"

def db_init():
    """Create the results table if it does not already exist."""
    c = sqlite3.connect(DB)
    try:
        c.execute("CREATE TABLE IF NOT EXISTS results(rid TEXT, tid TEXT, resp TEXT, jdg TEXT, score REAL, ts REAL, PRIMARY KEY(rid,tid))")
        c.commit()
    finally:
        # Close even when the statement raises (the original leaked the
        # connection on error in all four helpers).
        c.close()

def db_save(rid, tid, resp, jdg, score):
    """Upsert one task result for run `rid` (ts = wall-clock save time)."""
    c = sqlite3.connect(DB)
    try:
        c.execute("INSERT OR REPLACE INTO results VALUES(?,?,?,?,?,?)",
                  (rid, tid, resp, jdg, score, time.time()))
        c.commit()
    finally:
        c.close()

def db_load(rid):
    """Return {task_id: {"response", "judge", "score"}} for all cached rows of run `rid`."""
    c = sqlite3.connect(DB)
    try:
        rows = c.execute("SELECT tid, resp, jdg, score FROM results WHERE rid=?", (rid,)).fetchall()
    finally:
        c.close()
    return {r[0]: {"response": r[1], "judge": r[2], "score": r[3]} for r in rows}

def db_clear(rid):
    """Delete every cached row of run `rid` (used by 'Fresh Start')."""
    c = sqlite3.connect(DB)
    try:
        c.execute("DELETE FROM results WHERE rid=?", (rid,))
        c.commit()
    finally:
        c.close()

db_init()
245
+
246
# ══════════════ SCORES FILE ══════════════

# Local scores file that is mirrored to the leaderboard dataset.
SF = "final_scores.json"

def sf_load():
    """Load final_scores.json, or return a fresh skeleton if it is missing
    or unreadable.

    Only file-system and JSON-decoding problems fall back to the skeleton;
    the original bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    """
    try:
        with open(SF, encoding="utf-8") as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        return {"version": "1.1", "bench": "FINAL-Bench/Metacognitive", "updated": "", "models": {}}
254
+
255
def sf_save(name, score, dom_scores, ticos_scores, n_total, n_done):
    """Record one model's FINAL result in final_scores.json.

    Merges into whatever sf_load() returns (so other models' entries are
    kept), stamps the update time, writes the file, and returns the full
    scores dict for the subsequent leaderboard upload.
    """
    data = sf_load()
    data["updated"] = datetime.now().isoformat()
    entry = {
        "final_score": score,
        "domain_scores": dom_scores,
        "ticos_scores": ticos_scores,
        "tasks_total": n_total,
        "tasks_completed": n_done,
        "evaluated_at": datetime.now().isoformat(),
    }
    data["models"][name] = entry
    with open(SF, "w") as fh:
        json.dump(data, fh, indent=2, ensure_ascii=False)
    return data
264
 
265
def sf_upload(d):
    """Push the scores dict to the ALL-Bench leaderboard dataset on HF.

    Best-effort: returns a human-readable status string and never raises.
    Requires the HF_TOKEN environment variable.
    """
    token = os.getenv("HF_TOKEN", "")
    if not token:
        return "⚠️ HF_TOKEN not set"
    try:
        from huggingface_hub import HfApi
        body = json.dumps(d, indent=2, ensure_ascii=False).encode("utf-8")
        stamp = datetime.now().strftime('%Y-%m-%d %H:%M')
        HfApi(token=token).upload_file(
            path_or_fileobj=body,
            path_in_repo="final_scores.json",
            repo_id="FINAL-Bench/ALL-Bench-Leaderboard",
            repo_type="dataset",
            commit_message=f"FINAL Score {stamp}")
        return "βœ… Uploaded to HF"
    except Exception as e:
        return f"❌ Upload: {e}"
277
+
278
+ # ══════════════ EVAL ENGINE ══════════════
279
 
280
  from concurrent.futures import ThreadPoolExecutor
281
 
282
def eval_one(task, rid, key, jkey, mid, jmodel, atype, state):
    """Evaluate one task end-to-end: model call -> judge -> cache -> progress.

    Runs inside a worker thread; `state` is the shared ST dict and every
    mutation happens under state["lock"]. Always returns
    (task_id, {"response", "judge", "score"}) — failures are encoded in the
    payload (score 0 for model/API errors, -1.0 for judge failures) rather
    than raised, so the driver loop never sees an exception from here.
    """
    print(f"\n{'='*40}\nπŸ“ Evaluating: {task.task_id} ({task.ticos_type})")
    try:
        # 1. Model response
        resp = call_model(task.prompt, key=key, model=mid, api_type=atype)
        if not resp or resp.startswith("[API_ERROR"):
            # Model/API failure: cache a zero-score row so resume skips it.
            print(f" ❌ Model failed: {resp[:100]}")
            db_save(rid, task.task_id, resp or "empty", "{}", 0)
            with state["lock"]:
                state["done"] += 1
                state["errors"].append(task.task_id)
            return task.task_id, {"response": resp, "judge": "{}", "score": 0}

        # 2. Judge
        jp = make_judge_prompt(task, resp)
        jd = judge(jp, jkey, jmodel)
        if jd is None:
            # Synthesize an all-zero verdict tagged "failed" so the sentinel
            # score below keeps it out of the averages.
            print(f" ❌ Judge failed for {task.task_id}")
            jd = {"scores": {k: 0.0 for k in RK}, "comment": "judge_failed", "failed": True}

        if jd.get("failed"):
            sc = -1.0  # sentinel: excluded from averages, shown as "Judge failed"
        else:
            sc = calc_score(jd["scores"])
            with state["lock"]: state["jok"] += 1

        jj = json.dumps(jd, ensure_ascii=False)
        db_save(rid, task.task_id, resp, jj, sc)
        print(f" πŸ“Š Score: {sc}")

        with state["lock"]:
            state["done"] += 1
            # Rolling "recently finished" ticker for the progress panel,
            # capped at the last 10 entries.
            ti = TICOS.get(task.ticos_type, {})
            state["active"].append(f'{ti.get("i","πŸ“")} {task.task_id} β†’ {sc}')
            if len(state["active"]) > 10:
                state["active"] = state["active"][-10:]

        return task.task_id, {"response": resp, "judge": jj, "score": sc}

    except Exception as e:
        # Catch-all: record the error row and keep the run going.
        print(f" πŸ’₯ Exception: {e}")
        db_save(rid, task.task_id, f"[ERR] {e}", "{}", 0)
        with state["lock"]:
            state["done"] += 1
            state["errors"].append(f"{task.task_id}: {str(e)[:40]}")
        return task.task_id, {"response": f"[ERR] {e}", "judge": "{}", "score": 0}
328
+
329
# ── State ──
# Single shared state dict for the UI <-> background-thread handoff.
# All writes (and multi-field reads) must hold ST["lock"].
ST = {
    "running": False, "stop": False, "finished": False,
    "rid": "", "model": "", "done": 0, "total": 0, "cached": 0,
    "errors": [], "active": [], "jok": 0, "t0": 0,
    "results": {}, "tasks": [],
    "lock": threading.Lock(), "msg": "", "csv": None, "hf": "",
}

def st_reset():
    """Reset the per-run fields of ST (keeps rid/model/total/lock intact)."""
    fresh = {
        "running": False, "stop": False, "finished": False,
        "done": 0, "cached": 0, "jok": 0, "t0": 0,
        # New container objects each run so old references cannot alias.
        "errors": [], "active": [], "results": {}, "tasks": [],
        "msg": "", "csv": None, "hf": "",
    }
    with ST["lock"]:
        ST.update(fresh)
344
+
345
def bg_eval(key, jkey, mid, mname, jmodel, atype, tasks, rid, nw):
    """Background evaluation driver (runs in a daemon thread).

    Loads cached results for run `rid`, fans the remaining tasks out to a
    thread pool of `nw` workers, polls for completion while honouring the
    global stop flag, then aggregates via finalize(). Any crash is caught
    and surfaced through ST["msg"] so the UI never hangs.
    """
    print(f"\n{'#'*50}")
    print(f"# BG EVAL START: {mname} ({len(tasks)} tasks, {nw} workers)")
    print(f"# API type: {atype}, Model ID: {mid}")
    print(f"{'#'*50}\n")

    try:
        # Resume support: anything already in the DB is reused as-is.
        cached = db_load(rid)
        nc = sum(1 for t in tasks if t.task_id in cached)
        pending = [t for t in tasks if t.task_id not in cached]

        with ST["lock"]:
            ST["results"] = cached
            ST["cached"] = nc
            ST["total"] = len(tasks)
            ST["t0"] = time.time()

        if not pending:
            with ST["lock"]: ST["msg"] = f"πŸ’Ύ All cached ({nc})"
            finalize(tasks, cached, mname)
            return

        with ST["lock"]: ST["msg"] = f"⚑ {len(pending)} tasks, {nw} workers"
        print(f"πŸ“‹ Pending: {len(pending)}, Cached: {nc}")

        with ThreadPoolExecutor(max_workers=nw) as exe:
            futs = {}
            for task in pending:
                if ST["stop"]: break
                f = exe.submit(eval_one, task, rid, key, jkey, mid, jmodel, atype, ST)
                futs[f] = task

            # Manual completion polling (instead of as_completed) so the
            # stop flag is checked every 0.5s even while futures are slow.
            done_set = set()
            while len(done_set) < len(futs):
                if ST["stop"]:
                    # NOTE: returning exits the with-block, whose shutdown
                    # waits for already-running futures to finish.
                    print("⏹️ Stop requested")
                    with ST["lock"]:
                        ST["msg"] = "⏹️ Stopped"
                        ST["running"] = False
                        ST["finished"] = True
                    return
                for f in list(futs):
                    if f in done_set: continue
                    if f.done():
                        done_set.add(f)
                        try:
                            tid, data = f.result()
                            with ST["lock"]: ST["results"][tid] = data
                        except Exception as e:
                            # eval_one encodes its own errors, so this only
                            # fires on truly unexpected future failures.
                            print(f"Future error: {e}")
                time.sleep(0.5)

        # Snapshot under the lock before aggregation.
        with ST["lock"]: results = dict(ST["results"])
        finalize(tasks, results, mname)

    except Exception as e:
        # Last-resort guard: report the crash to the UI and mark the run done.
        print(f"πŸ’₯ BG EVAL CRASH: {e}")
        import traceback; traceback.print_exc()
        with ST["lock"]:
            ST["msg"] = f"❌ {str(e)[:100]}"
            ST["running"] = False
            ST["finished"] = True
407
+
408
def finalize(tasks, results, mname):
    """Aggregate per-task scores into domain/TICOS/final averages, persist
    them (final_scores.json + HF upload + CSV export) and mark the run done.

    Scores < 0 mark judge failures and are excluded from every average.
    """
    print(f"\n🏁 Finalizing: {len(results)} results")

    def _valid(pred):
        # Usable (>= 0) scores for tasks matching `pred`.
        return [results[t.task_id]["score"] for t in tasks
                if pred(t) and t.task_id in results and results[t.task_id]["score"] >= 0]

    ds = {}  # per-domain averages
    for dom in set(t.domain for t in tasks):
        v = _valid(lambda t: t.domain == dom)
        if v: ds[dom] = round(np.mean(v), 2)

    ts = {}  # per-TICOS-type averages
    for tt in set(t.ticos_type for t in tasks):
        v = _valid(lambda t: t.ticos_type == tt)
        if v: ts[tt] = round(np.mean(v), 2)

    av = _valid(lambda t: True)
    fs = round(np.mean(av), 2) if av else 0

    print(f"πŸ“Š FINAL Score: {fs} ({len(av)}/{len(tasks)} tasks)")

    sd = sf_save(mname, fs, ds, ts, len(tasks), len(av))
    hf = sf_upload(sd)

    # Snapshot shared fields under the lock (the original read them unlocked).
    with ST["lock"]:
        t0, rid = ST["t0"], ST["rid"]
    el = int(time.time() - t0) if t0 else 0

    # CSV export. newline="" per the csv-module docs so csv.writer controls
    # line endings itself (the original produced blank rows on Windows).
    cp = f"/tmp/fb_{rid}.csv"
    with open(cp, "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(["task_id","domain","grade","ticos","difficulty","title","score","comment"])
        tm = {t.task_id: t for t in tasks}
        for tid, d in sorted(results.items()):
            t = tm.get(tid)
            if not t: continue  # cached row from a different task subset
            try:
                jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else {}
            except (ValueError, TypeError):  # narrowed from a bare except:
                jd = {}
            w.writerow([tid, t.domain, t.grade, t.ticos_type, t.difficulty, t.title,
                        d["score"], (jd.get("comment","") if isinstance(jd,dict) else "")[:200]])

    with ST["lock"]:
        ST["csv"] = cp
        ST["hf"] = hf
        ST["msg"] = f"🏁 FINAL Score = {fs} ({el}s, {len(av)}/{len(tasks)})"
        ST["running"] = False
        ST["finished"] = True

    print(f"βœ… Done: FINAL Score = {fs}")
455
+
456
+ # ══════════════ UI CALLBACKS ══════════════
457
+
458
def do_start(model, api_type, eval_key, judge_key, judge_model, diff, max_t, workers, fresh):
    """Start-button handler: validate keys, select tasks, spawn bg_eval.

    `fresh=True` drops the cached rows for this model's run id first.
    Returns a one-line status string for the UI status textbox.
    """
    print(f"\nπŸ”˜ START clicked: model={model}, api={api_type}, fresh={fresh}")

    if ST["running"]:
        return "⚠️ Already running"

    # UI-supplied keys win; otherwise fall back to environment variables.
    eval_key = (eval_key or "").strip() or os.getenv("HF_TOKEN", "")
    judge_key = (judge_key or "").strip() or os.getenv("OPENAI_API_KEY", "")

    if not eval_key:
        print("❌ No eval key")
        return "❌ API Key needed"
    if not judge_key:
        print("❌ No judge key")
        return "❌ Judge Key needed"

    print(f" Keys: eval={eval_key[:8]}... judge={judge_key[:8]}...")

    # Resolve display name -> concrete model id; unknown names pass through
    # (the dropdown allows custom values).
    if api_type == "HuggingFace Inference":
        mid = HF_MODELS.get(model, model)
        at = "hf"
    else:
        mid = OAI_MODELS.get(model, model)
        at = "openai"

    # "전체" = "all" (no difficulty filter).
    tasks = TASKS[:]
    if diff != "전체":
        tasks = [t for t in tasks if t.difficulty == diff]
    tasks = tasks[:int(max_t)]

    print(f" Model ID: {mid}, Tasks: {len(tasks)}")

    # Run id depends only on the model id, so restarting the same model
    # resumes from its cached rows.
    rid = hashlib.md5(f"FB_{mid}".encode()).hexdigest()[:12]
    if fresh:
        db_clear(rid)
        print(" πŸ—‘οΈ Cache cleared")

    st_reset()
    with ST["lock"]:
        ST["running"] = True
        ST["rid"] = rid
        ST["model"] = model
        ST["tasks"] = tasks
        ST["total"] = len(tasks)

    # Daemon thread: dies with the process, never blocks shutdown.
    thread = threading.Thread(
        target=bg_eval,
        args=(eval_key, judge_key, mid, model, judge_model, at, tasks, rid, int(workers)),
        daemon=True)
    thread.start()
    print(f" 🧡 Thread started")

    return f"🧬 {model} started ({len(tasks)} tasks, {int(workers)} workers)"
511
+
512
def do_stop():
    """Stop-button handler: raise the stop flag for the background driver."""
    if not ST["running"]:
        return "Not running"
    ST["stop"] = True
    return "⏹️ Stopping..."
517
+
518
def do_poll():
    """Timer callback: render the four UI outputs.

    Returns (progress_html, results_table_html, summary_html, csv_path).
    Snapshots the structural ST fields under the lock first.
    NOTE(review): the scalar fields read later (done/total/t0/active/...)
    are fetched outside the lock — display-only, but technically racy.
    """
    with ST["lock"]:
        running = ST["running"]
        finished = ST["finished"]
        tasks = ST.get("tasks", [])
        results = dict(ST.get("results", {}))
        msg = ST.get("msg", "")
        csvp = ST.get("csv")

    # Idle state: nothing started yet and nothing cached.
    if not running and not finished and not results:
        return ("ℹ️ Select model β†’ press ▢️ Start", "", "", None)

    # Progress bar
    if running:
        dn = ST["done"]
        tot = ST.get("total", 1)
        pct = min(int(dn / max(tot, 1) * 100), 100)
        el = int(time.time() - ST.get("t0", time.time()))
        # Naive linear ETA from the average per-task time so far.
        eta = int((el / max(dn, 1)) * (tot - dn)) if dn > 0 else 0
        active = ST.get("active", [])
        jok = ST.get("jok", 0)
        errs = ST.get("errors", [])

        # Ticker chips for the most recently finished tasks.
        tags = " ".join([f'<span style="background:#ede9fe;padding:2px 6px;border-radius:4px;'
                         f'font-size:12px">{a}</span>' for a in active[-6:]])
        err_html = ""
        if errs:
            err_html = f'<div style="color:#dc2626;margin-top:6px;font-size:12px">⚠️ Errors: {", ".join(errs[-3:])}</div>'

        prog = f"""<div style="padding:12px;background:#fafafa;border-radius:8px;border:1px solid #e5e7eb">
<div style="display:flex;justify-content:space-between;margin-bottom:6px">
<span style="font-size:14px">🧬 {dn}/{tot} Β· {el}s Β· ETA ~{eta}s Β· Judge βœ…{jok}</span>
<span style="font-weight:700;color:#7c3aed;font-size:16px">{pct}%</span>
</div>
<div style="background:#e5e7eb;border-radius:8px;height:24px;overflow:hidden">
<div style="width:{pct}%;height:100%;border-radius:8px;background:linear-gradient(90deg,#7c3aed,#6366f1);transition:width 0.3s"></div>
</div>
<div style="margin-top:8px">{tags}</div>{err_html}
</div>"""

    elif finished:
        prog = f'<div style="background:#f0fdf4;padding:16px;border-radius:8px;font-weight:700;border-left:4px solid #16a34a;font-size:16px">🏁 {msg}</div>'
    else:
        prog = f'<div style="padding:12px">{msg}</div>'

    # Results table: one row per task — done (score bar), judge-failed
    # (score < 0 sentinel), or still pending (dimmed).
    tbl = ""
    if tasks:
        rows = ""
        for t in tasks:
            ti = TICOS.get(t.ticos_type, {"i": "πŸ“", "n": t.ticos_type})
            if t.task_id in results:
                s = results[t.task_id]["score"]
                if s < 0:
                    rows += f'<tr style="background:#fef3c7"><td>{t.task_id}</td><td>{ti["i"]}</td><td>{t.domain}</td><td>{ti["n"]}</td><td>{t.difficulty}</td><td style="color:#f59e0b;font-weight:700">❌ Judge failed</td></tr>'
                else:
                    # Traffic-light coloring: green >= 80, amber >= 60, red below.
                    c = "#22c55e" if s >= 80 else ("#f59e0b" if s >= 60 else "#ef4444")
                    rows += f'<tr><td>{t.task_id}</td><td>{ti["i"]}</td><td>{t.domain}</td><td>{ti["n"]}</td><td>{t.difficulty}</td><td><div style="display:flex;align-items:center;gap:6px"><div style="background:#e5e7eb;border-radius:6px;height:16px;width:80px;overflow:hidden"><div style="width:{min(s,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="color:{c};font-weight:700;font-size:12px">{s:.1f}</span></div></td></tr>'
            else:
                rows += f'<tr style="opacity:0.4"><td>{t.task_id}</td><td>{ti["i"]}</td><td>{t.domain}</td><td>{ti["n"]}</td><td>{t.difficulty}</td><td>⏳</td></tr>'
        tbl = f'<table style="width:100%;border-collapse:collapse;font-size:13px"><thead><tr style="background:#f1f5f9"><th style="padding:8px;text-align:left">ID</th><th></th><th>Domain</th><th>TICOS</th><th>Diff</th><th>Score</th></tr></thead><tbody>{rows}</tbody></table>'

    # Summary card (only once the run has finished). Averages recomputed
    # here from the snapshot; scores < 0 (judge failures) are excluded.
    sm = ""
    if finished and tasks:
        av = [results[t.task_id]["score"] for t in tasks
              if t.task_id in results and results[t.task_id]["score"] >= 0]
        fs = round(np.mean(av), 2) if av else 0

        # Domain bars
        dh = ""
        for dom in sorted(set(t.domain for t in tasks)):
            v = [results[t.task_id]["score"] for t in tasks
                 if t.domain == dom and t.task_id in results and results[t.task_id]["score"] >= 0]
            if v:
                a = round(np.mean(v), 1)
                c = "#22c55e" if a >= 80 else ("#f59e0b" if a >= 60 else "#ef4444")
                dh += f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:180px;font-size:13px">{dom}</span><div style="flex:1;background:#334155;border-radius:6px;height:14px;overflow:hidden"><div style="width:{min(a,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c};font-size:13px">{a}</span></div>'

        # TICOS bars
        th = ""
        for tt, info in TICOS.items():
            v = [results[t.task_id]["score"] for t in tasks
                 if t.ticos_type == tt and t.task_id in results and results[t.task_id]["score"] >= 0]
            if v:
                a = round(np.mean(v), 1)
                c = "#22c55e" if a >= 80 else ("#f59e0b" if a >= 60 else "#ef4444")
                th += f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:160px;font-size:13px">{info["i"]} {info["n"]}</span><div style="flex:1;background:#334155;border-radius:6px;height:14px;overflow:hidden"><div style="width:{min(a,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c};font-size:13px">{a}</span></div>'

        sm = f"""<div style="background:linear-gradient(135deg,#1e1b4b,#312e81);border-radius:14px;padding:24px;color:#fff;margin:8px 0">
<h2 style="margin:0;font-size:28px;text-align:center">🧬 FINAL Score: {fs} / 100</h2>
<p style="text-align:center;color:#a5b4fc;margin:8px 0">{ST.get("model","")} Β· {len(av)} tasks</p>
<hr style="border-color:#4338ca;margin:16px 0">
<h4 style="color:#a5b4fc;margin:8px 0">πŸ“š Domains</h4>{dh}
<hr style="border-color:#4338ca;margin:16px 0">
<h4 style="color:#a5b4fc;margin:8px 0">🧬 TICOS Types</h4>{th}
<hr style="border-color:#4338ca;margin:16px 0">
<p style="font-size:12px;color:#818cf8">{ST.get("hf","")}</p></div>"""

    return (prog, tbl, sm, csvp)
618
+
619
def update_models(api_type):
    """Swap the model dropdown's choices when the API-type radio changes."""
    pool = HF_MODELS if api_type == "HuggingFace Inference" else OAI_MODELS
    names = list(pool)
    return gr.update(choices=names, value=names[0])
623
+
624
# ══════════════ GRADIO APP ══════════════

# Static banner rendered at the top of the Blocks UI via gr.HTML().
HEADER_HTML = """<div style="text-align:center;padding:16px 0">
<h1 style="margin:0;font-size:28px">🧬 FINAL Bench Auto-Evaluator v1.1</h1>
<h2 style="margin:4px 0;color:#6b7280;font-size:16px">Metacognitive Intelligence Β· 100 Tasks Β· TICOS Scoring</h2>
<p style="color:#9ca3af;font-size:13px;max-width:700px;margin:8px auto;line-height:1.6">
πŸ“Š <b>FINAL-Bench/Metacognitive</b> 100 tasks Β· 15 domains Β· 8 TICOS types<br>
🧬 <b>TICOS</b>: Trap · Insight · Confidence · Self-Correction · Synthesis<br>
πŸ“‘ HF Inference API (open-source) + πŸ”‘ OpenAI (closed) β†’ βš–οΈ GPT-5.2 Judge<br>
πŸ“Š β†’ <code>final_scores.json</code> β†’ ALL Bench Metacog column</p></div>"""
 
 
634
 
635
def create_app():
    """Build the Gradio Blocks UI and wire all callbacks.

    Layout: header, API/model selectors, key inputs, run controls, a status
    line, an accordion with the current scores file, and four tabs fed by a
    2-second polling timer (do_poll).
    """
    with gr.Blocks(title="FINAL Bench Evaluator") as app:
        gr.HTML(HEADER_HTML)

        with gr.Row():
            api_type = gr.Radio(
                ["HuggingFace Inference", "OpenAI Compatible"],
                value="HuggingFace Inference", label="πŸ“‘ API Type", scale=2)
            # allow_custom_value: user may type any repo/model id directly.
            model_dd = gr.Dropdown(
                list(HF_MODELS.keys()), value=list(HF_MODELS.keys())[0],
                label="πŸ€– Target Model", scale=3, allow_custom_value=True)

        # Re-populate the dropdown whenever the API type flips.
        api_type.change(update_models, [api_type], [model_dd])

        with gr.Row():
            eval_key = gr.Textbox(
                label="πŸ”‘ Eval API Key (HF Token or OpenAI)",
                type="password", placeholder="hf_... or sk-...",
                value=os.getenv("HF_TOKEN", ""), scale=3)
            judge_key = gr.Textbox(
                label="βš–οΈ Judge Key (OpenAI)",
                type="password", placeholder="sk-...",
                value=os.getenv("OPENAI_API_KEY", ""), scale=3)

        with gr.Row():
            judge_model = gr.Textbox(label="βš–οΈ Judge Model", value="gpt-5.2", scale=2)
            # "전체" = all difficulties (no filter).
            diff_dd = gr.Dropdown(
                ["전체", "expert", "frontier"],
                value="전체", label="Difficulty", scale=1)
            max_tasks = gr.Slider(1, 100, value=100, step=1, label="Max Tasks", scale=2)
            workers = gr.Slider(1, 20, value=10, step=1, label="⚑ Workers", scale=1)

        with gr.Row():
            start_btn = gr.Button("▢️ Start (Resume)", variant="primary", size="lg", scale=2)
            fresh_btn = gr.Button("πŸš€ Fresh Start", variant="secondary", size="lg", scale=2)
            stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", scale=1)

        status = gr.Textbox(label="Status", interactive=False, max_lines=1)

        # Scores file as loaded at app build time (static snapshot).
        with gr.Accordion("πŸ“Š Existing FINAL Scores", open=False):
            gr.JSON(value=sf_load(), label="final_scores.json")

        with gr.Tabs():
            with gr.Tab("πŸ“Š Progress"):
                prog_html = gr.HTML()
            with gr.Tab("πŸ“‹ Results"):
                table_html = gr.HTML()
            with gr.Tab("πŸ† Summary"):
                summary_html = gr.HTML()
            with gr.Tab("πŸ’Ύ CSV"):
                csv_file = gr.File(label="CSV Download")

        # Timer for polling: refreshes all four outputs every 2 seconds.
        timer = gr.Timer(value=2, active=True)
        timer.tick(fn=do_poll, outputs=[prog_html, table_html, summary_html, csv_file])

        # Button handlers: both start buttons share the same inputs and only
        # differ in the `fresh` flag (clear cache or resume).
        inputs = [model_dd, api_type, eval_key, judge_key, judge_model,
                  diff_dd, max_tasks, workers]

        start_btn.click(
            fn=lambda *a: do_start(*a, fresh=False),
            inputs=inputs, outputs=[status])
        fresh_btn.click(
            fn=lambda *a: do_start(*a, fresh=True),
            inputs=inputs, outputs=[status])
        stop_btn.click(fn=do_stop, outputs=[status])

        gr.Markdown(f"""---
<center>🧬 FINAL Bench Auto-Evaluator v1.1 · Apache 2.0 · Ginigen AI<br>
Data: <a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive">FINAL-Bench/Metacognitive</a> ({len(TASKS)} tasks)<br>
β†’ ALL Bench Leaderboard Metacog auto-sync</center>""")

    return app
709
 
710
if __name__ == "__main__":
    # Print a per-TICOS-type task count at startup so a misloaded dataset
    # is immediately visible in the Space logs.
    stats = {}
    for t in TASKS:
        stats[t.ticos_type] = stats.get(t.ticos_type, 0) + 1
    print(f"FINAL Bench Evaluator: {len(TASKS)} tasks")
    for tt, n in sorted(stats.items()):
        info = TICOS.get(tt, {"i": "?", "n": tt})
        print(f" {info['i']} {tt}: {n}")

    app = create_app()
    app.queue(default_concurrency_limit=2)
    # FIX: `theme=` and `css=` are gr.Blocks() constructor arguments, not
    # Blocks.launch() arguments — passing them to launch() raises
    # TypeError (unexpected keyword argument) and the app never starts,
    # so they are dropped from the launch call.
    app.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)