seawolf2357 committed on
Commit
e40a043
·
verified ·
1 Parent(s): f7e6776

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +377 -396
app.py CHANGED
@@ -1,464 +1,445 @@
1
  """
2
- FINAL Bench Auto-Evaluator v1.0
3
- ================================
4
- FINAL Bench 100문제 x HF Inference API -> GPT-5.2 Judge -> final_scores.json
 
5
 
6
- - 시험 문제: FINAL-Bench/Metacognitive (HuggingFace Dataset)
7
- - 시험 응시자: ALL Bench 등재 HF Inference API 모델
8
- - 심판: GPT-5.2 (os.getenv("OPENAI_API_KEY"))
9
- - 출력: final_scores.json -> ALL Bench Metacog 컬럼 자동 반영
10
 
11
- Author: Ginigen AI · FINAL-Bench · Apache 2.0
12
  """
13
-
14
- import json, os, time, csv, io, re, hashlib, sqlite3, threading
15
  from datetime import datetime
16
- from dataclasses import dataclass, field, asdict
17
- from typing import Dict, Optional
18
- from concurrent.futures import ThreadPoolExecutor
19
- import requests
20
- import numpy as np
21
- import gradio as gr
22
-
23
- PILLAR_INFO = {
24
- "P1_Emergence": {"name": "창발성", "icon": "✦", "color": "#FF6B35", "weight": 0.20},
25
- "P2_Metacognition": {"name": "메타인지", "icon": "◉", "color": "#7B2FF7", "weight": 0.25},
26
- "P3_SelfEvolution": {"name": "자가진화", "icon": "◈", "color": "#00B4D8", "weight": 0.15},
27
- "P4_Orchestration": {"name": "다중지능", "icon": "◬", "color": "#2EC4B6", "weight": 0.15},
28
- "P5_SynergyAntagonism": {"name": "상생상극", "icon": "☯", "color": "#E63946", "weight": 0.25},
29
- }
30
 
31
  @dataclass
32
- class EvalTask:
33
- task_id: str; pillar: str; sub_dimension: str; difficulty: str
34
- prompt: str; context: Optional[str] = None; expected_behavior: Optional[str] = None
35
- scoring_rubric: Dict = field(default_factory=dict); metadata: Dict = field(default_factory=dict)
36
-
37
- # ══ FINAL Bench 100문제 로드 ══
38
 
39
  def load_tasks():
40
- """FINAL-Bench/Metacognitive HF Dataset에서 100문제 로드"""
41
  try:
42
  from datasets import load_dataset
43
- ds = load_dataset("FINAL-Bench/Metacognitive", split="train")
44
- tasks = []
45
- for row in ds:
46
- rubric = row.get("scoring_rubric", {})
47
- if isinstance(rubric, str):
48
- try: rubric = json.loads(rubric)
49
- except: rubric = {}
50
- meta = row.get("metadata") or {}
51
- if isinstance(meta, str):
52
- try: meta = json.loads(meta)
53
- except: meta = {}
54
- tasks.append(EvalTask(
55
- task_id=row["task_id"], pillar=row["pillar"],
56
- sub_dimension=row["sub_dimension"], difficulty=row["difficulty"],
57
- prompt=row["prompt"], context=row.get("context"),
58
- expected_behavior=row.get("expected_behavior"),
59
- scoring_rubric=rubric, metadata=meta))
60
- print(f"✅ FINAL Bench: {len(tasks)}문제 로드 (HF Dataset)")
61
  return tasks
62
  except Exception as e:
63
- print(f"⚠️ HF Dataset 실패: {e}, parquet 폴백...")
64
- try:
65
- import pandas as pd
66
- df = pd.read_parquet("full_v2.parquet")
67
- tasks = []
68
- for _, row in df.iterrows():
69
- rubric = row["scoring_rubric"]
70
- if isinstance(rubric, str): rubric = json.loads(rubric)
71
- tasks.append(EvalTask(
72
- task_id=row["task_id"], pillar=row["pillar"],
73
- sub_dimension=row["sub_dimension"], difficulty=row["difficulty"],
74
- prompt=row["prompt"], context=row.get("context"),
75
- expected_behavior=row.get("expected_behavior"),
76
- scoring_rubric=rubric, metadata={}))
77
- print(f"✅ Parquet 폴백: {len(tasks)}문제")
78
- return tasks
79
- except Exception as e2:
80
- print(f"❌ 로드 실패: {e2}")
81
- return []
82
-
83
- ALL_TASKS = load_tasks()
84
-
85
- # ══ ALL Bench 등재 HF Inference API 모델 ══
86
-
87
- HF_MODELS = {
88
- "Qwen3.5-397B": "Qwen/Qwen3.5-397B-A17B",
89
- "Qwen3.5-122B": "Qwen/Qwen3.5-122B-A10B",
90
- "Qwen3.5-27B": "Qwen/Qwen3.5-27B",
91
- "Qwen3.5-35B": "Qwen/Qwen3.5-35B-A3B",
92
- "Qwen3.5-9B": "Qwen/Qwen3.5-9B",
93
- "Qwen3.5-4B": "Qwen/Qwen3.5-4B",
94
- "Qwen3-Next-80B": "Qwen/Qwen3-Next-80B-A3B-Thinking",
95
- "DeepSeek V3.2": "deepseek-ai/DeepSeek-V3-0324",
96
- "DeepSeek R1": "deepseek-ai/DeepSeek-R1",
97
- "Llama 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
98
- "Llama 4 Maverick": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
99
- "Phi-4": "microsoft/phi-4",
100
- "Mistral Large 3": "mistralai/Mistral-Large-Instruct-2501",
101
  }
102
 
103
- # ══ LLM 호출: HF Inference API ══
 
 
 
 
 
 
 
104
 
105
- def _strip(text):
106
- if not text: return text
107
- for t in ['think','thinking','reasoning','reflection']:
108
- text = re.sub(rf'<{t}>.*?</{t}>', '', text, flags=re.DOTALL)
109
- return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- def call_model(prompt, system="", model_id="Qwen/Qwen3.5-397B-A17B", max_tokens=4096, temperature=0.6):
112
- hf_token = os.getenv("HF_TOKEN", "")
113
- if not hf_token: return "[API_ERROR] HF_TOKEN 미설정"
114
- msgs = []
115
- if system: msgs.append({"role":"system","content":system})
116
  msgs.append({"role":"user","content":prompt})
117
- headers = {"Content-Type":"application/json","Authorization":f"Bearer {hf_token}"}
118
- payload = {"model":model_id,"messages":msgs,"max_tokens":max_tokens,"temperature":temperature,"stream":False}
119
- for attempt in range(3):
120
  try:
121
- r = requests.post(f"https://router.huggingface.co/hf-inference/models/{model_id}/v1/chat/completions",
122
- headers=headers, json=payload, timeout=180)
123
- if r.status_code in (429, 503):
124
- time.sleep(10*(attempt+1)); continue
125
  r.raise_for_status()
126
  return _strip(r.json()["choices"][0]["message"]["content"])
127
  except Exception as e:
128
- if attempt < 2: time.sleep(5*(attempt+1))
129
- else: return f"[API_ERROR] {e}"
130
-
131
- # ══ Judge: GPT-5.2 (OPENAI_API_KEY) ══
132
-
133
- JUDGE_SYS = """You are a FINAL Bench scoring judge. Score each rubric item using ONLY: 0.0, 0.25, 0.5, 0.75, 1.0.
134
- 1.0=Excellent 0.75=Good 0.5=Average 0.25=Below 0.0=Fails
135
- Output JSON: {"scores":{...}, "comment":"한줄평가"}. Every rubric key MUST appear."""
136
-
137
- def call_judge(prompt, rubric_keys):
138
- api_key = os.getenv("OPENAI_API_KEY", "")
139
- if not api_key: return None
140
- props = {k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in rubric_keys}
141
- schema = {"type":"object","properties":{"scores":{"type":"object","properties":props,
142
- "required":list(rubric_keys),"additionalProperties":False},
143
- "comment":{"type":"string"}},"required":["scores","comment"],"additionalProperties":False}
144
- payload = {"model":"gpt-5.2","max_completion_tokens":4096,"temperature":0.1,
145
- "messages":[{"role":"system","content":JUDGE_SYS},{"role":"user","content":prompt}],
146
- "response_format":{"type":"json_schema","json_schema":{"name":"JudgeResult","strict":True,"schema":schema}}}
147
- headers = {"Content-Type":"application/json","Authorization":f"Bearer {api_key}"}
148
- for attempt in range(3):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  try:
150
- r = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload, timeout=180)
151
- if r.status_code == 429: time.sleep(8*(attempt+1)); continue
152
  r.raise_for_status()
153
- content = _strip(r.json()["choices"][0]["message"]["content"])
154
- if not content:
155
- if attempt < 2: time.sleep(3); continue
156
  return None
157
- data = json.loads(content)
158
- if "scores" in data:
159
- for k in rubric_keys:
160
- if k not in data["scores"]: data["scores"][k] = 0.5
161
- return data
162
- except:
163
- if attempt < 2: time.sleep(5*(attempt+1)); continue
164
- return None
165
  return None
166
 
167
- def judge_prompt(task, response):
168
- keys = list(task.scoring_rubric.keys())
169
- skel = ", ".join([f'\"{k}\": ___' for k in keys])
170
- rubric = "\n".join([f' \"{k}\": {v["desc"]}' for k,v in task.scoring_rubric.items()])
171
- return f"Task: {task.task_id} | {task.pillar} | {task.difficulty}\nPrompt: {task.prompt[:800]}\nExpected: {(task.expected_behavior or 'N/A')[:300]}\nResponse: {response[:8000]}\n\nRubric:\n{rubric}\n\nOutput JSON: {{\"scores\": {{{skel}}}, \"comment\": \"한줄평가\"}}"
172
-
173
- def score(scores, rubric):
174
- return round(sum(scores.get(k,0.5)*v["weight"] for k,v in rubric.items())*100, 2)
175
-
176
- # ══ 다중 라운드 과제 ══
177
-
178
- def _mutual(topic, mid):
179
- r1 = call_model(f"[R1] \'{topic}\' 500단어 분석.", model_id=mid)
180
- r2 = call_model(f"[R2] 비판하라.\n---\n{r1[:2000]}", model_id=mid)
181
- r3 = call_model(f"[R3] 수정하라.\n--- 원문 ---\n{r1[:1500]}\n--- 비판 ---\n{r2[:1500]}", model_id=mid)
182
- r4 = call_model(f"[R4] 메타분석.\n--- R1 ---\n{r1[:800]}\n--- R2 ---\n{r2[:800]}\n--- R3 ---\n{r3[:800]}", model_id=mid)
183
- return f"[R1]\n{r1}\n\n[R2]\n{r2}\n\n[R3]\n{r3}\n\n[R4]\n{r4}"
184
-
185
- def _feedback(pj, mid):
186
- try: data = json.loads(pj)
187
- except: return call_model(pj, model_id=mid)
188
- topic, specs = data.get("topic",""), data.get("rounds",[])
189
- outs, prev = [], ""
190
- for i, rd in enumerate(specs):
191
- inst, fb = rd.get("instruction",""), rd.get("feedback")
192
- if i==0: p = f"\'{topic}\' - {inst}."
193
- elif fb: p = f"피드백 반영: {inst}.\n--- 이전 ---\n{prev[:2000]}\n--- 피드백 ---\n{fb}"
194
- else: p = f"{inst}.\n--- 최종 ---\n{prev[:2500]}"
195
- resp = call_model(p, model_id=mid); outs.append(f"[R{i+1}]\n{resp}"); prev = resp
196
- return "\n\n".join(outs)
197
-
198
- def run_task(task, mid):
199
- if task.sub_dimension == "mutual_verification":
200
- return _mutual(task.prompt.replace("[상생-상극 사이클] ","").split("\n")[0], mid)
201
- elif task.sub_dimension == "feedback_incorporation":
202
- return _feedback(task.prompt, mid)
203
- return call_model(task.prompt, model_id=mid)
204
-
205
- # ══ DB 체크포인트 ══
206
-
207
- DB = "final_bench.db"
208
- def _initdb():
209
- c = sqlite3.connect(DB)
210
- c.execute("CREATE TABLE IF NOT EXISTS r (rid TEXT,tid TEXT,resp TEXT,judge TEXT,score REAL,ts REAL,PRIMARY KEY(rid,tid))")
211
- c.commit(); c.close()
212
- def _rid(m): return hashlib.md5(m.encode()).hexdigest()[:12]
213
- def _sv(rid,tid,resp,jdg,sc):
214
- c=sqlite3.connect(DB); c.execute("INSERT OR REPLACE INTO r VALUES(?,?,?,?,?,?)",(rid,tid,resp,jdg,sc,time.time())); c.commit(); c.close()
215
- def _loadall(rid):
216
- c=sqlite3.connect(DB); rows=c.execute("SELECT tid,resp,judge,score FROM r WHERE rid=?", (rid,)).fetchall(); c.close()
217
- return {r[0]:{"response":r[1],"judge":r[2],"score":r[3]} for r in rows}
218
- def _clr(rid):
219
- c=sqlite3.connect(DB); c.execute("DELETE FROM r WHERE rid=?",(rid,)); c.commit(); c.close()
220
- _initdb()
221
-
222
- # ══ Scores 저장 + HF 업로드 ══
223
-
224
- SF = "final_scores.json"
225
- def load_sf():
226
  try:
227
- with open(SF) as f: return json.load(f)
228
- except: return {"version":"1.0","updated":"","models":{}}
229
- def save_sf(mn, ps, fs, total, done):
230
- d = load_sf(); d["updated"]=datetime.now().isoformat()
231
- d["models"][mn]={"final_score":fs,"pillar_scores":{p:round(s,2) for p,s in ps.items()},
232
- "total_tasks":total,"completed":done,"evaluated_at":datetime.now().isoformat()}
233
- with open(SF,"w") as f: json.dump(d,f,indent=2,ensure_ascii=False)
 
234
  return d
235
- def upload_sf(d):
236
- tk = os.getenv("HF_TOKEN","")
237
- if not tk: return "⚠️ HF_TOKEN 미설정"
 
238
  try:
239
  from huggingface_hub import HfApi
240
- api = HfApi(token=tk)
241
  api.upload_file(path_or_fileobj=json.dumps(d,indent=2,ensure_ascii=False).encode("utf-8"),
242
- path_in_repo="final_scores.json", repo_id="FINAL-Bench/ALL-Bench-Leaderboard",
243
- repo_type="dataset", commit_message=f"FINAL Score {datetime.now().strftime('%m-%d %H:%M')}")
244
- return "HF Dataset 업로드 완료"
245
- except Exception as e: return f" {e}"
246
 
247
- # ══ 평가 워커 ══
248
 
249
- def _eval1(task, rid, mid, st):
250
  try:
251
- resp = run_task(task, mid)
252
  if resp.startswith("[API_ERROR"):
253
- _sv(rid,task.task_id,resp,"{}",0)
254
- with st["lock"]: st["done"]+=1; st["err"].append(task.task_id)
255
- return task.task_id, {"response":resp,"judge":"{}","score":0}
256
- rk = list(task.scoring_rubric.keys())
257
- jp = judge_prompt(task, resp)
258
- jd = call_judge(jp, rk)
259
  if jd is None:
260
- _sv(rid,task.task_id,resp,'{"failed":true}',-1)
261
- with st["lock"]: st["done"]+=1; st["jf"]+=1
262
- return task.task_id, {"response":resp,"judge":'{"failed":true}',"score":-1}
263
- sc = score(jd["scores"], task.scoring_rubric)
264
- jj = json.dumps(jd, ensure_ascii=False)
265
- _sv(rid,task.task_id,resp,jj,sc)
266
- with st["lock"]:
267
- st["done"]+=1; st["jok"]+=1
268
- info = PILLAR_INFO.get(task.pillar,{})
269
- st["rec"].append(f'{info.get("icon","")} {task.task_id} → {sc:.0f}')
270
- if len(st["rec"])>8: st["rec"]=st["rec"][-8:]
271
- return task.task_id, {"response":resp,"judge":jj,"score":sc}
272
  except Exception as e:
273
- _sv(rid,task.task_id,f"[ERR]{e}","{}",0)
274
- with st["lock"]: st["done"]+=1; st["err"].append(f"{task.task_id}:{str(e)[:40]}")
275
- return task.task_id, {"response":f"[ERR]{e}","judge":"{}","score":0}
276
 
277
- # ══ 글로벌 상태 + 백그라운드 ══
278
-
279
- _S = {"running":False,"stop":False,"finished":False,"model":"","rid":"",
280
- "done":0,"total":0,"cached":0,"err":[],"rec":[],"jok":0,"jf":0,
281
- "t0":0,"results":{},"tasks":[],"lock":threading.Lock(),"msg":"","csv":None,"hf":""}
282
 
283
  def _rst():
284
- with _S["lock"]:
285
- _S.update({"running":False,"stop":False,"finished":False,"done":0,"cached":0,
286
- "err":[],"rec":[],"jok":0,"jf":0,"t0":0,"results":{},"tasks":[],"msg":"","csv":None,"hf":""})
 
287
 
288
- def _bg(mn, mid, tasks, rid, wk):
 
289
  try:
290
- cached = _loadall(rid)
291
- pending = [t for t in tasks if t.task_id not in cached]
292
- with _S["lock"]: _S["results"]=cached; _S["cached"]=len(cached); _S["total"]=len(tasks); _S["t0"]=time.time()
293
- if not pending:
294
- with _S["lock"]: _S["msg"]=f"💾 캐시 완료 ({len(cached)}개)"
295
- _fin(tasks,cached,mn); return
296
- with _S["lock"]: _S["msg"]=f"{len(pending)}문제 · {wk}워커"
297
- with ThreadPoolExecutor(max_workers=wk) as ex:
298
- futs = {ex.submit(_eval1,t,rid,mid,_S):t for t in pending if not _S["stop"]}
299
- done_set = set()
300
- while len(done_set)<len(futs):
301
- if _S["stop"]:
302
- with _S["lock"]: _S["msg"]="⏹️ 중단"; _S["running"]=False; _S["finished"]=True
303
  return
304
  for f in list(futs):
305
- if f in done_set: continue
306
  if f.done():
307
- done_set.add(f)
308
  try:
309
- tid,data = f.result()
310
- with _S["lock"]: _S["results"][tid]=data
311
- except: pass
312
  time.sleep(0.5)
313
- with _S["lock"]: results=dict(_S["results"])
314
- _fin(tasks,results,mn)
315
  except Exception as e:
316
- with _S["lock"]: _S["msg"]=f"{str(e)[:100]}"; _S["running"]=False; _S["finished"]=True
317
-
318
- def _fin(tasks, results, mn):
319
- ps = {}
320
- for p in PILLAR_INFO:
321
- valid = [results[t.task_id]["score"] for t in tasks if t.pillar==p and t.task_id in results and results[t.task_id]["score"]>=0]
322
- if valid: ps[p] = np.mean(valid)
323
- wts = {p:info["weight"] for p,info in PILLAR_INFO.items()}
324
- fs = round(sum(ps.get(p,0)*w for p,w in wts.items()), 2)
325
- done = sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]>=0)
326
- sd = save_sf(mn, ps, fs, len(tasks), done)
327
- hf = upload_sf(sd)
328
- el = int(time.time()-_S["t0"]) if _S["t0"] else 0
329
- with _S["lock"]:
330
- _S["hf"]=hf; _S["msg"]=f"🏁 FINAL Score = {fs:.1f} ({el}초)"
331
- _S["running"]=False; _S["finished"]=True
332
-
333
- def _start(mc, mt, wk, fresh):
334
- if _S["running"]: return "⚠️ 진행 중"
335
- if not os.getenv("HF_TOKEN"): return "❌ HF_TOKEN (Secrets)"
336
- if not os.getenv("OPENAI_API_KEY"): return "❌ OPENAI_API_KEY (Secrets)"
337
- if not ALL_TASKS: return "❌ 과제 로드 실패"
338
- mid = HF_MODELS.get(mc, mc)
339
- tasks = ALL_TASKS[:int(mt)]
340
- rid = _rid(mid)
341
- if fresh: _clr(rid)
342
- _rst()
343
- with _S["lock"]:
344
- _S.update({"running":True,"rid":rid,"model":mc,"tasks":tasks,"total":len(tasks),"msg":"🔄 준비..."})
345
- threading.Thread(target=_bg, args=(mc,mid,tasks,rid,int(wk)), daemon=True).start()
346
- return f"⚡ {mc} 평가 시작 ({len(tasks)}문제, {int(wk)}워커)"
347
-
348
- def _stop():
349
- if _S["running"]: _S["stop"]=True; return "⏹️ 중단 요청"
350
- return "ℹ️ 실행 아님"
351
-
352
- # ══ UI 빌더 ══
353
-
354
- CSS = """<style>
355
- .score-bar{background:#e0e0e0;border-radius:8px;height:18px;overflow:hidden;min-width:80px}
356
- .score-fill{height:100%;border-radius:8px}
357
- .summary-card{background:linear-gradient(135deg,#1a1a2e,#16213e);border-radius:14px;padding:24px;color:#fff}
358
- .pillar-row{display:flex;align-items:center;gap:10px;margin:6px 0}
359
- .pillar-bar{flex:1;background:#333;border-radius:6px;height:16px;overflow:hidden}
360
- .pillar-fill{height:100%;border-radius:6px}
361
- .pbar{background:#e0e0e0;border-radius:8px;height:22px;overflow:hidden}
362
- .pfill{height:100%;border-radius:8px;background:linear-gradient(90deg,#6366f1,#10b981)}
363
- </style>"""
364
- def _c(s): return "#4caf50" if s>=80 else ("#ff9800" if s>=60 else "#f44336")
365
 
366
  def _poll():
367
- with _S["lock"]:
368
- run,fin = _S["running"],_S["finished"]
369
- tasks,res = _S.get("tasks",[]),dict(_S.get("results",{}))
370
- msg = _S.get("msg","")
371
  if not run and not fin and not res:
372
- return ("ℹ️ 모델 선택 ▶️ 시작", "", "", None)
373
- # Progress
374
  if run:
375
- d,tot = _S["done"],max(_S.get("total",1),1)
376
- pct = min(int(d/tot*100),100)
377
- el = int(time.time()-_S.get("t0",time.time()))
378
- eta = int((el/max(d,1))*(tot-d)) if d>0 else 0
379
- tags = " ".join([f'<span style="background:#e8eaf6;padding:2px 8px;border-radius:4px;font-size:.8em">{r}</span>' for r in _S.get("rec",[])[-6:]])
380
- prog = f'{CSS}<div><div style="display:flex;justify-content:space-between;margin-bottom:4px"><span>⚡ {d}/{tot} | {el}초 | ~{eta}초</span><span style="font-weight:700">{pct}%</span></div><div class="pbar"><div class="pfill" style="width:{pct}%"></div></div><div style="margin-top:6px">{tags}</div><div style="margin-top:4px;font-size:.85em">⚖️ ✅{_S.get("jok",0)} ❌{_S.get("jf",0)}</div></div>'
381
  elif fin:
382
- prog = f'<div style="background:#e8f5e9;padding:14px;border-radius:8px;font-weight:700">{msg}</div>'
383
- else: prog = msg
384
- # Table
385
- tbl = ""
386
  if tasks:
387
- rows = ""
388
  for t in tasks:
389
- info = PILLAR_INFO.get(t.pillar,{})
390
  if t.task_id in res:
391
- s = res[t.task_id]["score"]
392
- if s<0: rows += f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info.get("icon","")} {info.get("name","")}</td><td>{t.difficulty}</td><td style="color:#ff9800">❌</td><td>—</td></tr>'
393
  else:
394
- c = _c(s)
395
- rows += f'<tr><td>{t.task_id}</td><td>{info.get("icon","")} {info.get("name","")}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="color:{c};font-weight:700">{s:.1f}</td></tr>'
396
- else: rows += f'<tr style="opacity:.35"><td>{t.task_id}</td><td>{info.get("icon","")}</td><td>{t.difficulty}</td><td></td><td></td></tr>'
397
- tbl = f'{CSS}<table style="width:100%;border-collapse:collapse;font-size:.85em"><thead><tr><th style="background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc">ID</th><th style="background:#f0f4f8;padding:8px">기둥</th><th style="background:#f0f4f8;padding:8px">난이도</th><th style="background:#f0f4f8;padding:8px">점수</th><th style="background:#f0f4f8;padding:8px">값</th></tr></thead><tbody>{rows}</tbody></table>'
398
- # Summary
399
- summ = ""
400
  if fin and tasks:
401
- ps = {}
402
- for p in PILLAR_INFO:
403
- valid = [res[t.task_id]["score"] for t in tasks if t.pillar==p and t.task_id in res and res[t.task_id]["score"]>=0]
404
- if valid: ps[p]=np.mean(valid)
405
- wts = {p:info["weight"] for p,info in PILLAR_INFO.items()}
406
- fs = round(sum(ps.get(p,0)*w for p,w in wts.items()),2)
407
- g = "A" if fs>=80 else ("B+" if fs>=70 else ("B" if fs>=60 else "C"))
408
- ph = ""
409
- for p,info in PILLAR_INFO.items():
410
- s=ps.get(p,0); c=_c(s); w=int(info["weight"]*100)
411
- ph += f'<div class="pillar-row"><span style="width:140px">{info["icon"]} {info["name"]} ({w}%)</span><div class="pillar-bar"><div class="pillar-fill" style="width:{min(s,100)}%;background:{c}"></div></div><span style="width:55px;text-align:right;font-weight:700;color:{c}">{s:.1f}</span></div>'
412
- summ = f'{CSS}<div class="summary-card"><h2 style="margin:0;font-size:1.8em;text-align:center">🧬 FINAL Score: {fs:.1f}/100</h2><h3 style="text-align:center;color:#aaa">{g} | {_S.get("model","")}</h3><hr style="border-color:#333;margin:16px 0">{ph}<hr style="border-color:#333;margin:16px 0"><p style="font-size:.85em;color:#888">{_S.get("hf","")}</p></div>'
413
- return (prog, tbl, summ, None)
414
-
415
- # ══ Gradio App ══
416
-
417
- HEADER = """<div style="text-align:center;padding:20px 0">
418
- <h1 style="margin:0;font-size:2em">🧬 FINAL Bench Auto-Evaluator</h1>
419
- <p style="color:#666;max-width:700px;margin:10px auto;line-height:1.7">
420
- <b>FINAL Bench 100문제</b> × ALL Bench 등재 모델 자동 평가<br>
421
- 📡 HF Inference API · ⚖️ GPT-5.2 Judge · 📊 → ALL Bench Metacog 자동 반영
422
- </p></div>"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
 
424
  def create_app():
425
- with gr.Blocks(title="FINAL Bench Auto-Evaluator", theme=gr.themes.Soft(),
426
- css=".gradio-container{max-width:1100px !important}") as app:
427
  gr.HTML(HEADER)
428
  with gr.Row():
429
- mdd = gr.Dropdown(list(HF_MODELS.keys()), value=list(HF_MODELS.keys())[0],
430
- label="🤖 평가 대상 모델", scale=4)
431
- mt = gr.Slider(1, len(ALL_TASKS) if ALL_TASKS else 100,
432
- value=len(ALL_TASKS) if ALL_TASKS else 100, step=1, label="과제 수", scale=2)
433
- wk = gr.Slider(1, 15, value=8, step=1, label=" 워커", scale=1)
 
 
 
 
 
 
434
  with gr.Row():
435
- sb = gr.Button("▶️ 이어하기", variant="primary", size="lg", scale=2)
436
- fb = gr.Button("🚀 새로 시작", variant="secondary", size="lg", scale=2)
437
- xb = gr.Button("⏹️ 중단", variant="stop", size="lg", scale=1)
438
- st = gr.Textbox(label="상태", interactive=False, max_lines=1)
439
- with gr.Accordion("📊 기존 결과", open=False):
440
- gr.JSON(label="final_scores.json", value=load_sf())
441
  with gr.Tabs():
442
- with gr.Tab("📊 진행"): p1=gr.HTML()
443
- with gr.Tab("📋 결과표"): p2=gr.HTML()
444
- with gr.Tab("🏆 최종"): p3=gr.HTML()
445
- with gr.Tab("💾 CSV"): p4=gr.File(label="CSV")
446
- timer = gr.Timer(value=2, active=True)
447
- timer.tick(fn=_poll, outputs=[p1,p2,p3,p4])
448
- sb.click(fn=lambda m,t,w: _start(m,t,w,False), inputs=[mdd,mt,wk], outputs=[st])
449
- fb.click(fn=lambda m,t,w: _start(m,t,w,True), inputs=[mdd,mt,wk], outputs=[st])
450
- xb.click(fn=_stop, outputs=[st])
451
- gr.Markdown(f"---\n<center>FINAL Bench v1.0 · {len(ALL_TASKS)}문제 · Ginigen AI · Apache 2.0</center>")
 
452
  return app
453
 
454
- if __name__ == "__main__":
455
- stats = {}
456
- for t in ALL_TASKS: stats[t.pillar]=stats.get(t.pillar,0)+1
457
- print(f"🧬 FINAL Bench Auto-Evaluator: {len(ALL_TASKS)} tasks")
458
- for p,n in stats.items():
459
- info=PILLAR_INFO[p]; print(f" {info['icon']} {info['name']}: {n}")
460
- print(f" 📡 HF Models: {len(HF_MODELS)} | ⚖️ Judge: GPT-5.2")
461
- print(f" 🔑 HF_TOKEN: {'✅' if os.getenv('HF_TOKEN') else '❌'} | OPENAI_API_KEY: {'✅' if os.getenv('OPENAI_API_KEY') else '❌'}")
462
- app = create_app()
463
  app.queue(default_concurrency_limit=2)
464
- app.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
 
1
  """
2
+ FINAL Bench Auto-Evaluator v1.0 — ALL Bench Metacog 자동 측정
3
+ =============================================================
4
+ FINAL-Bench/Metacognitive 100문제 x HF Inference API x GPT Judge
5
+ -> final_scores.json -> ALL Bench Leaderboard 자동 연동
6
 
7
+ TICOS 채점: T=Trap I=Insight C=Confidence O=Self-Correction S=Synthesis
 
 
 
8
 
9
+ Author: Ginigen AI · FINAL-Bench · License: Apache 2.0
10
  """
11
+ import json,os,time,csv,io,re,html,hashlib,sqlite3,threading
 
12
  from datetime import datetime
13
+ from dataclasses import dataclass
14
+ from typing import Optional
15
+ import requests, numpy as np, gradio as gr
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  @dataclass
18
+ class FinalTask:
19
+ task_id:str; domain:str; grade:str; ticos_type:str; difficulty:str
20
+ lens:str; title:str; prompt:str; expected_behavior:str
21
+ hidden_trap:Optional[str]=None; ticos_required:str=""; ticos_optional:str=""
 
 
22
 
23
  def load_tasks():
 
24
  try:
25
  from datasets import load_dataset
26
+ ds=load_dataset("FINAL-Bench/Metacognitive",split="train")
27
+ tasks=[FinalTask(task_id=r["task_id"],domain=r["domain"],grade=r["grade"],
28
+ ticos_type=r["ticos_type"],difficulty=r["difficulty"],lens=r.get("lens",""),
29
+ title=r["title"],prompt=r["prompt"],expected_behavior=r["expected_behavior"],
30
+ hidden_trap=r.get("hidden_trap"),ticos_required=r.get("ticos_required",""),
31
+ ticos_optional=r.get("ticos_optional","")) for r in ds]
32
+ print(f"FINAL Bench: {len(tasks)} tasks loaded (HF Dataset)")
 
 
 
 
 
 
 
 
 
 
 
33
  return tasks
34
  except Exception as e:
35
+ print(f"HF load failed: {e}"); return []
36
+
37
+ ALL_TASKS=load_tasks()
38
+
39
# Display metadata (Korean label and icon) per task type.
# NOTE(review): the keys look like values of ``FinalTask.ticos_type``; the code
# that reads this table is outside this view, so confirm before relying on it.
TICOS_INFO = {
    "E_SelfCorrecting": {"name": "자기수정", "icon": "🔄"},
    "A_TrapEscape": {"name": "함정탈출", "icon": "🪤"},
    "B_ContradictionResolution": {"name": "모순해결", "icon": "⚡"},
    "C_ProgressiveDiscovery": {"name": "점진발견", "icon": "🔬"},
    "D_MultiConstraint": {"name": "다중제약", "icon": "🎯"},
    "F_ExpertPanel": {"name": "전문가토론", "icon": "👥"},
    "G_PivotDetection": {"name": "전환감지", "icon": "🔀"},
    "H_ConfidenceCalibration": {"name": "확신도보정", "icon": "📊"},
}
49
 
50
# Weight of each TICOS rubric dimension in the final score.
RUBRIC_W = {
    "trap_detection": 0.20,
    "insight_depth": 0.20,
    "confidence_calibration": 0.25,
    "self_correction": 0.20,
    "synthesis_quality": 0.15,
}

# Derived from RUBRIC_W (insertion order preserved) so the key list and the
# weight table cannot drift apart.
RUBRIC_KEYS = list(RUBRIC_W)

# One-line description of each dimension, embedded in the judge prompt.
RUBRIC_D = {
    "trap_detection": "숨겨진 함정/오류 감지",
    "insight_depth": "통찰 깊이와 정확성",
    "confidence_calibration": "확신도-정확도 일치 (과대확신 감점)",
    "self_correction": "오류 인지 후 수정 실행",
    "synthesis_quality": "종합의 일관성과 완결성",
}


def final_score(scores):
    """Combine per-dimension judge scores into a single weighted score.

    Args:
        scores: Mapping from rubric key to a numeric score.  Keys not in
            ``RUBRIC_W`` are ignored; a missing rubric key counts as 0.5.
            ``None`` is treated as an empty mapping.

    Returns:
        The weighted sum multiplied by 100 and rounded to two decimals.
    """
    scores = scores or {}
    weighted = sum(scores.get(key, 0.5) * weight for key, weight in RUBRIC_W.items())
    return round(weighted * 100, 2)
58
 
59
+ def _strip(t):
60
+ if not t:return t
61
+ t=re.sub(r'<think>.*?</think>','',t,flags=re.DOTALL)
62
+ t=re.sub(r'<thinking>.*?</thinking>','',t,flags=re.DOTALL)
63
+ return t.strip()
64
+
65
def call_hf(prompt, sys="", key="", mid="Qwen/Qwen3.5-397B-A17B", mt=4096, temp=0.6):
    """Send one chat-completion request through the Hugging Face router.

    Args:
        prompt: User message text.
        sys: Optional system message; omitted from the request when empty.
            (The name shadows the ``sys`` module inside this function only.)
        key: Bearer token placed in the ``Authorization`` header.
        mid: Model id, used both in the URL path and in the payload.
        mt: Value sent as ``max_tokens``.
        temp: Sampling temperature.

    Returns:
        The reply text with reasoning spans removed by ``_strip``, or a string
        starting with ``"[API_ERROR]"`` once all three attempts have failed.
        The result is always a ``str``, so callers can use ``startswith``.
    """
    msgs = []
    if sys:
        msgs.append({"role": "system", "content": sys})
    msgs.append({"role": "user", "content": prompt})
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
    payload = {"model": mid, "messages": msgs, "max_tokens": mt, "temperature": temp, "stream": False}
    url = f"https://router.huggingface.co/hf-inference/models/{mid}/v1/chat/completions"

    attempts = 3
    last_err = "no attempt made"
    for a in range(attempts):
        is_last = a == attempts - 1
        try:
            r = requests.post(url, headers=headers, json=payload, timeout=120)
            if r.status_code in (429, 503):
                # Rate-limited or unavailable: back off, but do not sleep
                # after the final attempt.
                last_err = f"HTTP {r.status_code}"
                if not is_last:
                    time.sleep(5 * (a + 1))
                continue
            r.raise_for_status()
            # ``_strip`` passes a null/empty content through unchanged;
            # normalise it to "" so the return type stays ``str``.
            return _strip(r.json()["choices"][0]["message"]["content"]) or ""
        except Exception as e:
            last_err = e
            if not is_last:
                time.sleep(3 * (a + 1))
    # Reached after three failures of any kind.  Previously three 429/503
    # replies in a row fell off the loop and returned None.
    return f"[API_ERROR] {last_err}"
80
 
81
def call_oai(prompt, sys="", key="", model="gpt-5.2", mt=4096, temp=0.6):
    """Send one chat-completion request to the OpenAI API.

    Args:
        prompt: User message text.
        sys: Optional system message; omitted from the request when empty.
            (The name shadows the ``sys`` module inside this function only.)
        key: Bearer token placed in the ``Authorization`` header.
        model: OpenAI model name.
        mt: Output-token cap, sent as ``max_completion_tokens``.
        temp: Sampling temperature.

    Returns:
        The reply text with reasoning spans removed by ``_strip``, or a string
        starting with ``"[API_ERROR]"`` once both attempts have failed.
        The result is always a ``str``.
    """
    msgs = []
    if sys:
        msgs.append({"role": "system", "content": sys})
    msgs.append({"role": "user", "content": prompt})
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
    # ``max_completion_tokens`` matches what ``call_judge`` in this file sends
    # to the same endpoint; the older ``max_tokens`` field is deprecated there.
    payload = {"model": model, "messages": msgs, "max_completion_tokens": mt, "temperature": temp}

    attempts = 2
    last_err = "no attempt made"
    for a in range(attempts):
        is_last = a == attempts - 1
        try:
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=headers, json=payload, timeout=120)
            if r.status_code == 429:
                # Rate-limited: back off, but do not sleep after the final try.
                last_err = "HTTP 429"
                if not is_last:
                    time.sleep(5 * (a + 1))
                continue
            r.raise_for_status()
            # Normalise a null/empty content to "" so the return type stays ``str``.
            return _strip(r.json()["choices"][0]["message"]["content"]) or ""
        except Exception as e:
            last_err = e
            if not is_last:
                time.sleep(3)
    # Previously two 429 replies in a row fell off the loop and returned None.
    return f"[API_ERROR] {last_err}"
96
+
97
def call_model(prompt, sys="", key="", mid="", at="hf", mt=4096, temp=0.6):
    """Route a chat request to the OpenAI or the Hugging Face backend.

    Args:
        prompt: User message text.
        sys: Optional system message.
        key: API token for the selected backend.
        mid: Model id.  When empty, the backend's own default model is used.
        at: ``"openai"`` selects ``call_oai``; any other value selects ``call_hf``.
        mt: Output-token cap forwarded to the backend.
        temp: Sampling temperature forwarded to the backend.

    Returns:
        Whatever the selected backend returns.
    """
    backend = call_oai if at == "openai" else call_hf
    if mid:
        return backend(prompt, sys, key, mid, mt, temp)
    # Forwarding the empty default would replace the backend's default model
    # with "", so leave the model argument out in that case.
    return backend(prompt, sys, key, mt=mt, temp=temp)
100
+
101
# Display label -> Hugging Face model id.
# NOTE(review): the label "DeepSeek V3.2" maps to the id "DeepSeek-V3-0324" and
# "Mistral Large 3" maps to a "-2501" id; confirm these ids are the intended
# models and exist on the Hub.
HF_MODELS = {
    "Qwen3.5-397B": "Qwen/Qwen3.5-397B-A17B",
    "Qwen3.5-122B": "Qwen/Qwen3.5-122B-A10B",
    "Qwen3.5-27B": "Qwen/Qwen3.5-27B",
    "Qwen3.5-35B": "Qwen/Qwen3.5-35B-A3B",
    "Qwen3.5-9B": "Qwen/Qwen3.5-9B",
    "Qwen3.5-4B": "Qwen/Qwen3.5-4B",
    "DeepSeek V3.2": "deepseek-ai/DeepSeek-V3-0324",
    "DeepSeek R1": "deepseek-ai/DeepSeek-R1",
    "Llama 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "Llama 4 Maverick": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
    "Phi-4": "microsoft/phi-4",
    "Mistral Large 3": "mistralai/Mistral-Large-Instruct-2501",
}

# Display label -> OpenAI model name.
OAI_MODELS = {
    "GPT-5.2": "gpt-5.2",
    "GPT-5.4": "gpt-5.4",
    "GPT-5.1": "gpt-5.1",
}
111
+
112
# System prompt sent by call_judge as the "system" message. It defines the five
# TICOS dimensions, restricts scores to 0.0/0.25/0.5/0.75/1.0 and fixes the JSON
# output shape. The text is runtime data: edit wording only deliberately.
+ JUDGE_SYS="""You are a FINAL Bench Metacognition Judge. Score each TICOS dimension using ONLY 0.0/0.25/0.5/0.75/1.0:
113
+
114
+ 1. trap_detection (T): Did model detect hidden traps/false premises?
115
+ 1.0=all traps found, 0.5=some missed, 0.0=fell into traps
116
+ 2. insight_depth (I): Genuine deep understanding?
117
+ 1.0=novel insights, 0.5=correct but shallow, 0.0=wrong
118
+ 3. confidence_calibration (C): Confidence matches accuracy?
119
+ 1.0=well-calibrated, 0.5=inconsistent, 0.0=overconfident on wrong answers
120
+ CRITICAL: Overconfidence is WORSE than underconfidence.
121
+ 4. self_correction (O): Caught and fixed own errors?
122
+ 1.0=explicit backtrack+correct, 0.5=acknowledged not fixed, 0.0=no correction
123
+ 5. synthesis_quality (S): Final synthesis coherent and complete?
124
+ 1.0=unified nuanced conclusion, 0.5=partial, 0.0=fragmented
125
+
126
+ Output JSON: {"scores":{"trap_detection":X,"insight_depth":X,"confidence_calibration":X,"self_correction":X,"synthesis_quality":X},"comment":"한줄 평가"}"""
127
+
128
def build_jprompt(task, resp):
    """Build the user message that asks the judge to score one response.

    Args:
        task: Object exposing ``task_id``, ``domain``, ``grade``,
            ``ticos_type``, ``difficulty``, ``title``, ``prompt``,
            ``expected_behavior`` and ``hidden_trap``.
        resp: The evaluated model's answer.

    Returns:
        The prompt text.  The task prompt is cut to 1200 characters, the
        expected behaviour to 500 and the response to 8000.  ``None`` in any
        of those three is treated as an empty string instead of raising.
    """
    rubric_lines = "\n".join([f' "{k}": {d}' for k, d in RUBRIC_D.items()])
    skeleton = ", ".join([f'"{k}": ___' for k in RUBRIC_KEYS])
    # The hidden-trap line is only present when the task defines one.
    trap_line = f"\nHidden trap: {task.hidden_trap}" if task.hidden_trap else ""
    task_prompt = (task.prompt or "")[:1200]
    expected = (task.expected_behavior or "")[:500]
    answer = (resp or "")[:8000]
    return f"""[FINAL Bench Metacognition Evaluation]
Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.ticos_type} | {task.difficulty}
Title: {task.title}
Prompt: {task_prompt}
Expected: {expected}{trap_line}
=== RESPONSE ===
{answer}
=== END ===
Score TICOS (0.0/0.25/0.5/0.75/1.0):
{rubric_lines}
Output ONLY: {{"scores": {{{skeleton}}}, "comment": "한줄 평가"}}"""
143
+
144
def call_judge(prompt, key, model="gpt-5.2"):
    """Ask the OpenAI judge model to score a response against the rubric.

    The request uses a strict JSON schema that requires every key in
    ``RUBRIC_KEYS`` with a value from 0.0/0.25/0.5/0.75/1.0, plus a comment.

    Args:
        prompt: User message, as produced by ``build_jprompt``.
        key: OpenAI bearer token.
        model: Judge model name.

    Returns:
        The parsed ``{"scores": {...}, "comment": ...}`` dict, with any
        missing rubric key filled in as 0.5, or ``None`` when no usable reply
        was obtained in three attempts.
    """
    score_props = {k: {"type": "number", "enum": [0.0, 0.25, 0.5, 0.75, 1.0]} for k in RUBRIC_KEYS}
    schema = {
        "type": "object",
        "properties": {
            "scores": {
                "type": "object",
                "properties": score_props,
                "required": list(RUBRIC_KEYS),
                "additionalProperties": False,
            },
            "comment": {"type": "string"},
        },
        "required": ["scores", "comment"],
        "additionalProperties": False,
    }
    msgs = [{"role": "system", "content": JUDGE_SYS}, {"role": "user", "content": prompt}]
    payload = {
        "model": model,
        "max_completion_tokens": 4096,
        "temperature": 0.1,
        "messages": msgs,
        "response_format": {
            "type": "json_schema",
            "json_schema": {"name": "FINALResult", "strict": True, "schema": schema},
        },
    }
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}

    attempts = 3
    for a in range(attempts):
        is_last = a == attempts - 1
        try:
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=headers, json=payload, timeout=180)
            if r.status_code == 429:
                # Rate-limited: back off, but do not sleep after the final try.
                if not is_last:
                    time.sleep(5 * (a + 1))
                continue
            r.raise_for_status()
            content = r.json()["choices"][0]["message"]["content"]
            if not content:
                if not is_last:
                    time.sleep(2)
                    continue
                return None
            data = json.loads(_strip(content))
            # Accept only the expected shape; anything else goes to the next attempt.
            if isinstance(data, dict) and isinstance(data.get("scores"), dict):
                for k in RUBRIC_KEYS:
                    data["scores"].setdefault(k, 0.5)
                return data
        except Exception:
            # Network, HTTP and JSON errors are retried.  ``Exception`` rather
            # than a bare ``except`` lets KeyboardInterrupt/SystemExit through.
            if not is_last:
                time.sleep(3 * (a + 1))
    return None
170
 
171
+ DB="final_bench.db"
172
+ def _idb():
173
+ c=sqlite3.connect(DB)
174
+ c.execute("CREATE TABLE IF NOT EXISTS r(rid TEXT,tid TEXT,resp TEXT,judge TEXT,score REAL,ts REAL,PRIMARY KEY(rid,tid))")
175
+ c.commit();c.close()
176
+ def _rid(m):return hashlib.md5(f"FB_{m}".encode()).hexdigest()[:12]
177
def _sv(rid, tid, resp, jj, sc):
    """Insert or replace the stored row for (rid, tid), timestamped with now.

    Args:
        rid: Run id.
        tid: Task id.
        resp: Response text to store.
        jj: Judge verdict, stored in the `judge` column.
        sc: Score value.

    The connection is closed even when the write fails.
    """
    c = sqlite3.connect(DB)
    try:
        c.execute("INSERT OR REPLACE INTO r VALUES(?,?,?,?,?,?)",
                  (rid, tid, resp, jj, sc, time.time()))
        c.commit()
    finally:
        c.close()
179
def _la(rid):
    """Load every stored result belonging to run `rid`.

    Returns:
        Dict mapping task id to {"response", "judge", "score"}, built from the
        rows of table `r` whose rid matches; empty when there are none.

    The connection is closed even when the query fails.
    """
    c = sqlite3.connect(DB)
    try:
        rows = c.execute("SELECT tid,resp,judge,score FROM r WHERE rid=?", (rid,)).fetchall()
    finally:
        c.close()
    return {r[0]: {"response": r[1], "judge": r[2], "score": r[3]} for r in rows}
182
def _clr(rid):
    """Delete every stored row of run `rid`; the connection is always closed."""
    c = sqlite3.connect(DB)
    try:
        c.execute("DELETE FROM r WHERE rid=?", (rid,))
        c.commit()
    finally:
        c.close()
183
+ _idb()
184
+
185
SF = "final_scores.json"


def _lsf():
    """Load the scores document from SF, or return an empty skeleton.

    Returns:
        The parsed JSON object. When the file is missing, unreadable, not
        valid JSON, or not a JSON object, a fresh skeleton with an empty
        "models" mapping is returned instead. A loaded object whose "models"
        entry is absent or not a dict gets an empty one, because callers index
        into it.
    """
    empty = {"version": "1.0", "bench": "FINAL-Bench/Metacognitive", "updated": "", "models": {}}
    try:
        with open(SF) as f:
            data = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return empty
    if not isinstance(data, dict):
        return empty
    if not isinstance(data.get("models"), dict):
        data["models"] = {}
    return data
190
+
191
def _ssf(mn, sc, ds, ts, nt, nc):
    """Record one model's result in the scores document and rewrite SF.

    Args:
        mn: Model name, used as the key under "models".
        sc: Final score.
        ds: Per-domain scores.
        ts: Per-TICOS-type scores.
        nt: Total number of tasks.
        nc: Number of completed tasks.

    Returns:
        The updated scores document (also written to SF as indented JSON).
    """
    doc = _lsf()
    doc["updated"] = datetime.now().isoformat()
    entry = {
        "final_score": sc,
        "domain_scores": ds,
        "ticos_scores": ts,
        "tasks_total": nt,
        "tasks_completed": nc,
        "evaluated_at": datetime.now().isoformat(),
    }
    doc["models"][mn] = entry
    with open(SF, "w") as out:
        json.dump(doc, out, indent=2, ensure_ascii=False)
    return doc
197
+
198
def _uhf(d):
    """Upload the scores document to the HF dataset repo as final_scores.json.

    Args:
        d: JSON-serialisable scores document.

    Returns:
        A status string: a notice when HF_TOKEN is not set, "HF upload OK" on
        success, or "Upload fail: ..." carrying the exception text.
    """
    token = os.getenv("HF_TOKEN", "")
    if not token:
        return "HF_TOKEN 미설정"
    try:
        from huggingface_hub import HfApi
        client = HfApi(token=token)
        payload = json.dumps(d, indent=2, ensure_ascii=False).encode("utf-8")
        stamp = datetime.now().strftime('%Y-%m-%d %H:%M')
        client.upload_file(
            path_or_fileobj=payload,
            path_in_repo="final_scores.json",
            repo_id="FINAL-Bench/ALL-Bench-Leaderboard",
            repo_type="dataset",
            commit_message=f"FINAL Score {stamp}",
        )
    except Exception as e:
        return f"Upload fail: {e}"
    return "HF upload OK"
209
 
210
+ from concurrent.futures import ThreadPoolExecutor
211
 
212
def _e1(t, rid, key, jk, mid, jm, at, st):
    """Evaluate one task: query the model, judge the answer, persist and report.

    Args:
        t: Task object (reads task_id, prompt, ticos_type).
        rid: Run id under which the result row is stored.
        key: API key for the evaluated model.
        jk: API key for the judge.
        mid: Model id passed to call_model.
        jm: Judge model name.
        at: API type passed to call_model.
        st: Shared state dict; "dn", "er", "ac" and "jok" are updated under
            st["lk"].

    Returns:
        (task_id, {"response", "judge", "score"}). The score is 0 for API
        errors and unexpected exceptions, -1.0 when the judge produced no
        verdict, otherwise the value of final_score().
    """
    try:
        resp = call_model(t.prompt, key=key, mid=mid, at=at)
        if resp.startswith("[API_ERROR"):
            _sv(rid, t.task_id, resp, "{}", 0)
            with st["lk"]:
                st["dn"] += 1
                st["er"].append(t.task_id)
            return t.task_id, {"response": resp, "judge": "{}", "score": 0}
        jp = build_jprompt(t, resp)
        jd = call_judge(jp, jk, jm)
        if jd is None:
            jd = {"scores": {k: 0.0 for k in RUBRIC_KEYS}, "comment": "judge_failed", "failed": True}
        failed = bool(jd.get("failed"))
        # -1.0 marks "not judged"; aggregations only keep scores >= 0.
        sc = -1.0 if failed else final_score(jd["scores"])
        if not failed:
            with st["lk"]:
                st["jok"] += 1
        jj = json.dumps(jd, ensure_ascii=False)
        _sv(rid, t.task_id, resp, jj, sc)
        ic = TICOS_INFO.get(t.ticos_type, {})
        with st["lk"]:
            st["dn"] += 1
            st["ac"].append(f'{ic.get("icon", "")}{t.task_id}')
            if len(st["ac"]) > 10:
                st["ac"] = st["ac"][-10:]
        return t.task_id, {"response": resp, "judge": jj, "score": sc}
    except Exception as e:
        err = f"[ERR]{e}"
        try:
            _sv(rid, t.task_id, err, "{}", 0)
        except Exception:
            # Persistence may itself be what failed. The error is still
            # recorded in st["er"] below, so the worker must not raise here:
            # that would drop the result and leave the progress counter stuck.
            pass
        with st["lk"]:
            st["dn"] += 1
            st["er"].append(f"{t.task_id}:{str(e)[:40]}")
        return t.task_id, {"response": err, "judge": "{}", "score": 0}
238
 
239
# Shared run state, guarded by the lock stored under "lk".
_S = {
    "run": False,
    "stp": False,
    "fin": False,
    "rid": "",
    "mdl": "",
    "dn": 0,
    "tot": 0,
    "cch": 0,
    "er": [],
    "ac": [],
    "jok": 0,
    "t0": 0,
    "res": {},
    "tsk": [],
    "lk": threading.Lock(),
    "msg": "",
    "csv": None,
    "hfs": "",
}
 
 
242
 
243
def _rst():
    """Reset the per-run fields of _S while holding its lock.

    "rid", "mdl", "tot" and the lock itself are not touched here.
    """
    with _S["lk"]:
        _S.update(
            run=False, stp=False, fin=False,
            dn=0, cch=0, er=[], ac=[], jok=0, t0=0,
            res={}, tsk=[], msg="", csv=None, hfs="",
        )
248
 
249
def _bgev(key, jk, mid, mn, jm, at, tasks, rid, wk):
    """Background worker: evaluate all uncached tasks, then finalise the run.

    Results already stored for `rid` are reused. The remaining tasks are
    submitted to a thread pool of `wk` workers and collected by polling.

    Args:
        key: API key for the evaluated model.
        jk: API key for the judge.
        mid: Model id.
        mn: Model name handed to _fin.
        jm: Judge model name.
        at: API type.
        tasks: Tasks selected for this run.
        rid: Run id.
        wk: Number of worker threads.

    When the stop flag is raised, futures that have not started are cancelled
    and the run is reported as stopped only after the in-flight workers have
    returned, so "run" never turns False while workers are still active.
    """
    try:
        res = dict(_la(rid))
        cch = sum(1 for t in tasks if t.task_id in res)
        pend = [t for t in tasks if t.task_id not in res]
        with _S["lk"]:
            _S["res"] = res
            _S["cch"] = cch
            _S["tot"] = len(tasks)
            _S["t0"] = time.time()
        if not pend:
            with _S["lk"]:
                _S["msg"] = f"Cache: {cch}"
            _fin(tasks, res, mn)
            return
        with _S["lk"]:
            _S["msg"] = f"{len(pend)} tasks, {wk} workers"
        stopped = False
        with ThreadPoolExecutor(max_workers=wk) as exe:
            todo = {exe.submit(_e1, t, rid, key, jk, mid, jm, at, _S)
                    for t in pend if not _S["stp"]}
            while todo:
                if _S["stp"]:
                    stopped = True
                    # Drop queued work. Futures already running cannot be
                    # cancelled; leaving the `with` block waits for them.
                    for f in todo:
                        f.cancel()
                    break
                for f in [f for f in todo if f.done()]:
                    todo.discard(f)
                    try:
                        tid, d = f.result()
                    except Exception:
                        # _e1 reports its own failures; a raising future
                        # simply contributes no result.
                        continue
                    with _S["lk"]:
                        _S["res"][tid] = d
                if todo:
                    time.sleep(0.5)
        if stopped:
            with _S["lk"]:
                _S["msg"] = "Stopped"
                _S["run"] = False
                _S["fin"] = True
            return
        with _S["lk"]:
            res = dict(_S["res"])
        _fin(tasks, res, mn)
    except Exception as e:
        with _S["lk"]:
            _S["msg"] = f"ERR:{str(e)[:80]}"
            _S["run"] = False
            _S["fin"] = True
279
+
280
def _fin(tasks, res, mn):
    """Aggregate scores, persist them, export a CSV and mark the run finished.

    Only results with a score >= 0 enter the averages (negative scores mark
    tasks without a judge verdict). The aggregate is saved through _ssf,
    uploaded through _uhf, and the per-task rows are written to a CSV whose
    path is stored in _S["csv"].

    Args:
        tasks: Tasks of this run.
        res: Mapping task id -> {"response", "judge", "score"}.
        mn: Model name under which the scores are stored.
    """
    by_dom = {}
    by_tt = {}
    av = []
    for t in tasks:
        d = res.get(t.task_id)
        if d is None or not d["score"] >= 0:
            continue
        av.append(d["score"])
        by_dom.setdefault(t.domain, []).append(d["score"])
        by_tt.setdefault(t.ticos_type, []).append(d["score"])
    ds = {dom: round(float(np.mean(v)), 2) for dom, v in by_dom.items()}
    ts = {tt: round(float(np.mean(v)), 2) for tt, v in by_tt.items()}
    fs = round(float(np.mean(av)), 2) if av else 0
    sd = _ssf(mn, fs, ds, ts, len(tasks), len(av))
    with _S["lk"]:
        rid = _S["rid"]
        t0 = _S["t0"]
    cp = f"/tmp/fb_{rid}.csv"
    tm = {t.task_id: t for t in tasks}
    # newline="" is what the csv module expects of the file object; without it
    # rows can gain extra carriage returns on some platforms.
    with open(cp, "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(["task_id", "domain", "grade", "ticos_type", "difficulty", "title", "score", "comment", "ts"])
        for tid, d in sorted(res.items()):
            t = tm.get(tid)
            if not t:
                continue
            jd = {}
            try:
                jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else {}
            except ValueError:
                pass  # unparsable verdict: export the row with an empty comment
            comment = jd.get("comment", "") if isinstance(jd, dict) else ""
            w.writerow([tid, t.domain, t.grade, t.ticos_type, t.difficulty, t.title, d["score"],
                        str(comment)[:200], datetime.now().isoformat()])
    hfs = _uhf(sd)
    el = int(time.time() - t0) if t0 else 0
    with _S["lk"]:
        _S["csv"] = cp
        _S["hfs"] = hfs
        _S["msg"] = f"FINAL Score = {fs} ({el}s, {len(av)}/{len(tasks)})"
        _S["run"] = False
        _S["fin"] = True
311
+
312
# Inline stylesheet prepended to the HTML fragments; one rule per literal.
CSS = (
    '<style>'
    '.et{width:100%;border-collapse:collapse;font-size:.85em}'
    '.et th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc}'
    '.et td{padding:6px 8px;border-bottom:1px solid #eee}'
    '.sb{background:#e0e0e0;border-radius:8px;height:18px;overflow:hidden;min-width:80px}'
    '.sf{height:100%;border-radius:8px}'
    '.sc{background:linear-gradient(135deg,#1a1a2e,#16213e);border-radius:14px;padding:20px;color:#fff;margin:8px 0}'
    '.pb{background:#e0e0e0;border-radius:8px;height:22px;margin:12px 0;overflow:hidden}'
    '.pf{height:100%;border-radius:8px;background:linear-gradient(90deg,#7c3aed,#6366f1)}'
    '</style>'
)
313
+
314
def _clr2(s):
    """Map a score to a hex colour: green from 80, orange from 60, else red."""
    for floor, colour in ((80, "#4caf50"), (60, "#ff9800")):
        if s >= floor:
            return colour
    return "#f44336"
 
 
 
 
 
 
 
 
 
318
 
319
def _poll():
    """Render the current run state as HTML fragments for the UI timer.

    All shared state is copied in one locked snapshot. Free text (model name,
    status and upload messages, task fields) is HTML-escaped before it is
    interpolated, so text containing "<" or "&" is shown rather than parsed
    as markup.

    Returns:
        (progress_html, results_table_html, summary_html, csv_path).
    """
    from html import escape

    with _S["lk"]:
        run = _S["run"]
        fin = _S["fin"]
        tasks = _S.get("tsk", [])
        res = dict(_S.get("res", {}))
        msg = _S.get("msg", "")
        csvp = _S.get("csv")
        dn = _S.get("dn", 0)
        tot = _S.get("tot", 1)
        t0 = _S.get("t0", 0)
        ac = list(_S.get("ac", []))
        jok = _S.get("jok", 0)
        mdl = _S.get("mdl", "")
        hfs = _S.get("hfs", "")
    if not run and not fin and not res:
        return ("Select model and press Start.", "", "", None)

    def _scores(pred):
        # Scores of judged tasks (score >= 0) selected by `pred`.
        return [res[t.task_id]["score"] for t in tasks
                if pred(t) and t.task_id in res and res[t.task_id]["score"] >= 0]

    def _bar(label, label_px, vals):
        # One labelled horizontal bar for the summary card.
        a = np.mean(vals)
        c = _clr2(a)
        return (
            f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0">'
            f'<span style="width:{label_px}px;font-size:.85em">{label}</span>'
            f'<div style="flex:1;background:#333;border-radius:6px;height:14px;overflow:hidden">'
            f'<div style="width:{min(a,100)}%;height:100%;background:{c};border-radius:6px"></div></div>'
            f'<span style="width:50px;text-align:right;font-weight:700;color:{c}">{a:.1f}</span></div>'
        )

    if run:
        pct = min(int(dn / max(tot, 1) * 100), 100)
        # t0 stays 0 until the worker thread sets it; report 0s until then.
        el = int(time.time() - t0) if t0 else 0
        eta = int((el / max(dn, 1)) * (tot - dn)) if dn > 0 else 0
        tg = " ".join(
            f'<span style="background:#ede9fe;padding:2px 6px;border-radius:4px;font-size:.78em">{escape(str(a))}</span>'
            for a in ac[-8:]
        )
        prog = (
            f'{CSS}<div><div style="display:flex;justify-content:space-between;margin-bottom:4px">'
            f'<span>🧬 {dn}/{tot} · {el}s · ETA {eta}s · Judge✅{jok}</span>'
            f'<span style="font-weight:700;color:#7c3aed">{pct}%</span></div>'
            f'<div class="pb"><div class="pf" style="width:{pct}%"></div></div>'
            f'<div style="margin-top:6px">{tg}</div></div>'
        )
    elif fin:
        prog = (
            f'<div style="background:#f0fdf4;padding:14px;border-radius:8px;font-weight:700;'
            f'border-left:4px solid #16a34a">🏁 {escape(str(msg))}</div>'
        )
    else:
        prog = escape(str(msg))

    tbl = ""
    if tasks:
        rows = []
        for t in tasks:
            icon = TICOS_INFO.get(t.ticos_type, {}).get("icon", "")
            tid = escape(str(t.task_id))
            dom = escape(str(t.domain))
            if t.task_id not in res:
                rows.append(
                    f'<tr style="opacity:.4"><td>{tid}</td><td>{icon}</td><td>{dom}</td>'
                    f'<td>-</td><td>-</td><td></td></tr>'
                )
                continue
            tt = escape(str(t.ticos_type))
            diff = escape(str(t.difficulty))
            s = res[t.task_id]["score"]
            if s < 0:
                rows.append(
                    f'<tr style="background:#fff3e0"><td>{tid}</td><td>{icon}</td><td>{dom}</td>'
                    f'<td>{tt}</td><td>{diff}</td><td style="color:#ff9800">❌</td></tr>'
                )
            else:
                c = _clr2(s)
                rows.append(
                    f'<tr><td>{tid}</td><td>{icon}</td><td>{dom}</td><td>{tt}</td><td>{diff}</td>'
                    f'<td><div class="sb"><div class="sf" style="width:{min(s,100)}%;background:{c}"></div></div>'
                    f'<span style="color:{c};font-weight:700">{s:.1f}</span></td></tr>'
                )
        tbl = (
            f'{CSS}<table class="et"><thead><tr><th>ID</th><th></th><th>Domain</th><th>TICOS</th>'
            f'<th>Diff</th><th>Score</th></tr></thead><tbody>{"".join(rows)}</tbody></table>'
        )

    sm = ""
    if fin and tasks:
        av = _scores(lambda t: True)
        fs = round(np.mean(av), 2) if av else 0
        gr2 = "A" if fs >= 80 else ("B+" if fs >= 70 else ("B" if fs >= 60 else "C"))
        dh = ""
        for dom in sorted(set(t.domain for t in tasks)):
            v = _scores(lambda t, dom=dom: t.domain == dom)
            if v:
                dh += _bar(escape(str(dom)), 180, v)
        th = ""
        for tt, info in TICOS_INFO.items():
            v = _scores(lambda t, tt=tt: t.ticos_type == tt)
            if v:
                th += _bar(f'{info["icon"]} {info["name"]}', 150, v)
        jf = sum(1 for t in tasks if t.task_id in res and res[t.task_id]["score"] < 0)
        failed_note = f" · ❌{jf}" if jf else ""
        sm = (
            f'{CSS}<div class="sc"><h2 style="margin:0;font-size:1.6em;text-align:center">🧬 FINAL Score: {fs} / 100</h2>'
            f'<h3 style="margin:4px 0;text-align:center;color:#aaa">Grade {gr2} · {escape(str(mdl))}</h3>'
            f'<p style="text-align:center;color:#888;font-size:.9em">{len(av)}문제{failed_note}</p>'
            f'<hr style="border-color:#333;margin:12px 0"><h4 style="color:#aaa">📚 도메인별</h4>{dh}'
            f'<hr style="border-color:#333;margin:12px 0"><h4 style="color:#aaa">🧬 TICOS별</h4>{th}'
            f'<hr style="border-color:#333;margin:12px 0"><p style="font-size:.85em;color:#aaa">{escape(str(hfs))}</p></div>'
        )
    return (prog, tbl, sm, csvp)
363
+
364
def _start(mc, at, ek, jk, jm, df, mt, nw, fresh):
    """Validate the UI inputs and launch a background evaluation run.

    Args:
        mc: Model choice from the dropdown (display name or custom id).
        at: API type selected in the UI.
        ek: Evaluation API key; falls back to the HF_TOKEN environment variable.
        jk: Judge API key; falls back to OPENAI_API_KEY.
        jm: Judge model name.
        df: Difficulty filter ("전체" keeps all tasks).
        mt: Maximum number of tasks.
        nw: Number of worker threads.
        fresh: When true, stored results for this model are deleted first.

    Returns:
        A status string for the UI.
    """
    if _S["run"]:
        return "Already running"
    ek = (ek or "").strip() or os.getenv("HF_TOKEN", "")
    jk = (jk or "").strip() or os.getenv("OPENAI_API_KEY", "")
    if not ek:
        return "Need API key"
    if not jk:
        return "Need Judge key"
    if at == "HuggingFace Inference":
        mid = HF_MODELS.get(mc, mc)
        a = "hf"
    else:
        mid = OAI_MODELS.get(mc, mc)
        a = "openai"
    tasks = ALL_TASKS[:]
    if df != "전체":
        tasks = [t for t in tasks if t.difficulty == df]
    tasks = tasks[:int(mt)]
    if not tasks:
        # An empty run would still be finalised and would store a score of 0
        # for this model, so refuse it up front.
        return "No tasks match the selected filters"
    rid = _rid(mid)
    wk = int(nw)
    with _S["lk"]:
        # Re-check and claim the run flag in one locked step so that two
        # concurrent clicks cannot both start a run.
        if _S["run"]:
            return "Already running"
        _S.update({"run": True, "stp": False, "fin": False, "dn": 0, "cch": 0, "er": [],
                   "ac": [], "jok": 0, "t0": 0, "res": {}, "msg": "", "csv": None,
                   "hfs": "", "rid": rid, "mdl": mc, "tsk": tasks, "tot": len(tasks)})
    try:
        if fresh:
            _clr(rid)
        threading.Thread(target=_bgev, args=(ek, jk, mid, mc, jm, a, tasks, rid, wk),
                         daemon=True).start()
    except Exception:
        # Release the claim if the run could not actually be launched.
        with _S["lk"]:
            _S["run"] = False
        raise
    return f"🧬 {mc} FINAL Bench ({len(tasks)} tasks, {wk} workers)"
382
+
383
def _stop():
    """Raise the stop flag of the active run and return a status string."""
    if not _S["run"]:
        return "Not running"
    _S["stp"] = True
    return "Stopping..."
387
+
388
def _um(at):
    """Return a gr.update listing the models of the selected API type.

    The first model of the chosen catalogue becomes the selected value.
    """
    if at == "HuggingFace Inference":
        catalog = HF_MODELS
    else:
        catalog = OAI_MODELS
    return gr.update(choices=list(catalog.keys()), value=list(catalog.keys())[0])
391
+
392
# Banner HTML shown at the top of the app; one source line per list item.
HEADER = "\n".join([
    '<div style="text-align:center;padding:16px 0">',
    '<h1 style="margin:0;font-size:1.8em">🧬 FINAL Bench Auto-Evaluator v1.0</h1>',
    '<h2 style="margin:4px 0;color:#555;font-size:1.05em">Metacognitive Intelligence · 100 Tasks · TICOS Scoring</h2>',
    '<p style="color:#888;font-size:.88em;max-width:700px;margin:8px auto">',
    '📊 <b>FINAL-Bench/Metacognitive</b> · 100문제 · 15도메인 · 8 TICOS유형<br>',
    '🧬 TICOS: Trap · Insight · Confidence · Self-Correction · Synthesis<br>',
    '📡 HF Inference (오픈소스) + 🔑 OpenAI (클로즈드) → ⚖️ GPT-5.2 Judge<br>',
    '📊 → <code>final_scores.json</code> → ALL Bench Metacog 자동 반영</p></div>',
])
400
 
401
def create_app():
    """Build the Gradio Blocks UI and wire its events.

    Layout: header, API/model selectors, key inputs, run options,
    Start/Fresh/Stop buttons, a status line, the stored scores, and four
    tabs (progress, results, summary, CSV) refreshed by a 2-second timer
    that calls _poll.

    Returns:
        The gr.Blocks application (not yet launched).
    """
    with gr.Blocks(title="FINAL Bench Evaluator", theme=gr.themes.Soft(),
                   css=".gradio-container{max-width:1100px!important}") as app:
        gr.HTML(HEADER)
        with gr.Row():
            at = gr.Radio(["HuggingFace Inference", "OpenAI Compatible"],
                          value="HuggingFace Inference", label="📡 API", scale=2)
            md = gr.Dropdown(list(HF_MODELS.keys()), value=list(HF_MODELS.keys())[0],
                             label="🤖 Model", scale=3, allow_custom_value=True)
        at.change(_um, [at], [md])
        with gr.Row():
            # The key boxes are deliberately NOT pre-filled from the
            # environment: an initial component value is delivered to every
            # client that opens the page, which would expose the server's
            # secrets. _start falls back to HF_TOKEN / OPENAI_API_KEY when a
            # box is left empty, so server-side keys keep working.
            ek = gr.Textbox(label="🔑 Eval Key", type="password",
                            placeholder="hf_... or sk-...", scale=3)
            jk = gr.Textbox(label="⚖️ Judge Key", type="password",
                            placeholder="sk-...", scale=3)
        with gr.Row():
            jm = gr.Textbox(label="⚖️ Judge", value="gpt-5.2", scale=2)
            df = gr.Dropdown(["전체", "expert", "frontier"], value="전체", label="Difficulty", scale=1)
            mt = gr.Slider(1, 100, value=100, step=1, label="Tasks", scale=2)
            nw = gr.Slider(1, 20, value=10, step=1, label="Workers", scale=1)
        with gr.Row():
            sb = gr.Button("▶️ Start", variant="primary", size="lg", scale=2)
            fb = gr.Button("🚀 Fresh", variant="secondary", size="lg", scale=2)
            xb = gr.Button("⏹️ Stop", variant="stop", size="lg", scale=1)
        st = gr.Textbox(label="Status", interactive=False, max_lines=1)
        with gr.Accordion("📊 Existing Scores", open=False):
            gr.JSON(value=_lsf(), label="final_scores.json")
        with gr.Tabs():
            with gr.Tab("📊 Progress"):
                p = gr.HTML()
            with gr.Tab("📋 Results"):
                t = gr.HTML()
            with gr.Tab("🏆 Summary"):
                s = gr.HTML()
            with gr.Tab("💾 CSV"):
                c = gr.File(label="CSV")
        timer = gr.Timer(value=2, active=True)
        timer.tick(fn=_poll, outputs=[p, t, s, c])
        ins = [md, at, ek, jk, jm, df, mt, nw]
        # Start resumes from stored results; Fresh clears them first.
        sb.click(fn=lambda *a: _start(*a, fresh=False), inputs=ins, outputs=[st])
        fb.click(fn=lambda *a: _start(*a, fresh=True), inputs=ins, outputs=[st])
        xb.click(fn=_stop, outputs=[st])
        gr.Markdown(f"---\n<center>🧬 FINAL Bench v1.0 · Apache 2.0 · Ginigen AI<br>Data: FINAL-Bench/Metacognitive · {len(ALL_TASKS)} tasks · TICOS</center>")
    return app
437
 
438
if __name__ == "__main__":
    # Print a per-TICOS-type task count, then build and serve the app.
    st = {}
    for task in ALL_TASKS:
        seen = st.get(task.ticos_type, 0)
        st[task.ticos_type] = seen + 1
    print(f"FINAL Bench Evaluator: {len(ALL_TASKS)} tasks")
    for tt, n in sorted(st.items()):
        i = TICOS_INFO.get(tt, {})
        print(f" {i.get('icon','')} {tt}: {n}")
    app = create_app()
    app.queue(default_concurrency_limit=2)
    app.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)