Spaces:
Running
Running
| """ | |
| FINAL Bench Auto-Evaluator v1.1 | |
| ================================ | |
| FINAL-Bench/Metacognitive 100 tasks | |
| HF Inference API + OpenAI Judge -> final_scores.json | |
| Gradio 6.x compatible | |
| Author: Ginigen AI · License: Apache 2.0 | |
| """ | |
| import json, os, time, re, hashlib, sqlite3, threading, csv, io | |
| from datetime import datetime | |
| from dataclasses import dataclass | |
| from typing import Optional | |
| import requests, numpy as np, gradio as gr | |
| # ══════════════ DATA ══════════════ | |
@dataclass
class Task:
    """One benchmark task record built from a dataset row.

    The class must be a dataclass: ``load_tasks`` constructs it with keyword
    arguments, which a plain class with only annotations would reject.
    """

    # Required fields, filled from the dataset row by load_tasks().
    task_id: str
    domain: str
    grade: str
    ticos_type: str
    difficulty: str
    lens: str
    title: str
    prompt: str
    expected_behavior: str
    # Optional fields; load_tasks() reads these with .get(), so they may be absent.
    hidden_trap: Optional[str] = None
    ticos_required: str = ""
    ticos_optional: str = ""
def load_tasks():
    """Load the "train" split of FINAL-Bench/Metacognitive as a list of Task.

    Any failure (missing ``datasets`` package, download error, missing column)
    is printed and turned into an empty list instead of being raised.
    """
    try:
        # Imported here so an ImportError is handled by the except below.
        from datasets import load_dataset

        rows = load_dataset("FINAL-Bench/Metacognitive", split="train")
        loaded = [
            Task(
                task_id=row["task_id"],
                domain=row["domain"],
                grade=row["grade"],
                ticos_type=row["ticos_type"],
                difficulty=row["difficulty"],
                lens=row.get("lens", ""),
                title=row["title"],
                prompt=row["prompt"],
                expected_behavior=row["expected_behavior"],
                hidden_trap=row.get("hidden_trap"),
                ticos_required=row.get("ticos_required", ""),
                ticos_optional=row.get("ticos_optional", ""),
            )
            for row in rows
        ]
        print(f"✅ {len(loaded)} tasks loaded")
        return loaded
    except Exception as e:
        print(f"❌ Load failed: {e}")
        return []


# Loaded once at import time; empty when the dataset could not be loaded.
TASKS = load_tasks()
# TICOS types from actual dataset.
# Each row is (ticos_type key, display name, icon); the mapping below exposes
# them as {"n": display name, "i": icon}.
_TICOS_ROWS = (
    ("A_TrapEscape", "함정탈출", "🪤"),
    ("B_ContradictionResolution", "모순해결", "⚡"),
    ("C_ProgressiveDiscovery", "점진발견", "🔬"),
    ("D_MultiConstraint", "다중제약", "🎯"),
    ("E_SelfCorrecting", "자기수정", "🔄"),
    ("F_ExpertPanel", "전문가토론", "👥"),
    ("G_PivotDetection", "전환감지", "🔀"),
    ("H_DecisionUnderUncertainty", "불확실성판단", "📊"),
)
TICOS = {key: {"n": label, "i": icon} for key, label, icon in _TICOS_ROWS}
| # ══════════════ RUBRIC ══════════════ | |
# Weight of each rubric dimension in the 0-100 task score.
RW = {
    "trap_detection": 0.20,
    "insight_depth": 0.20,
    "confidence_calibration": 0.25,
    "self_correction": 0.20,
    "synthesis_quality": 0.15,
}
# Rubric dimension keys, in the same order as the weights above.
RK = list(RW)
# Short human-readable description of each dimension.
RD = {
    "trap_detection": "Hidden trap/error detection",
    "insight_depth": "Depth of genuine insight",
    "confidence_calibration": "Confidence-accuracy alignment (overconfidence penalized)",
    "self_correction": "Error detection and actual correction",
    "synthesis_quality": "Coherent final synthesis",
}


def calc_score(scores):
    """Return the weighted rubric score scaled to 0-100, rounded to 2 decimals.

    A dimension missing from ``scores`` counts as 0.5.
    """
    weighted = 0
    for dimension, weight in RW.items():
        weighted += scores.get(dimension, 0.5) * weight
    return round(weighted * 100, 2)
| # ══════════════ LLM CALLS ══════════════ | |
| def _strip(t): | |
| if not t: return t | |
| for tag in ['think', 'thinking', 'reasoning', 'reflection']: | |
| t = re.sub(rf'<{tag}>.*?</{tag}>', '', t, flags=re.DOTALL) | |
| return t.strip() | |
def call_hf(prompt, sys_msg="", key="", model="Qwen/Qwen3.5-397B-A17B", max_tok=4096, temp=0.6):
    """Send one chat request through huggingface_hub.InferenceClient.

    Args:
        prompt: User message text.
        sys_msg: Optional system message; omitted from the request when empty.
        key: Token passed to InferenceClient.
        model: Model id passed to chat_completion.
        max_tok: Value sent as max_tokens.
        temp: Sampling temperature.

    Returns:
        The reply text with reasoning tags removed by _strip(), or a string
        starting with "[API_ERROR]" once all three attempts have failed. The
        error string is returned for every failure mode, including rate-limit
        and model-loading errors on the final attempt.
    """
    from huggingface_hub import InferenceClient

    msgs = []
    if sys_msg:
        msgs.append({"role": "system", "content": sys_msg})
    msgs.append({"role": "user", "content": prompt})

    max_attempts = 3
    err_str = ""
    for attempt in range(max_attempts):
        try:
            print(f" 📡 HF call: {model} (attempt {attempt+1})")
            client = InferenceClient(token=key)
            response = client.chat_completion(
                model=model,
                messages=msgs,
                max_tokens=max_tok,
                temperature=temp,
            )
            content = response.choices[0].message.content
            print(f" ✅ Got {len(content)} chars")
            return _strip(content)
        except Exception as e:
            err_str = str(e)
            print(f" ❌ HF error (attempt {attempt+1}): {err_str[:150]}")
            if attempt == max_attempts - 1:
                # No retry left: do not sleep, fall through to the error return.
                break
            lowered = err_str.lower()
            if "429" in err_str or "rate" in lowered:
                wait = 10 * (attempt + 1)
                print(f" ⏳ Rate limited, waiting {wait}s")
                time.sleep(wait)
            elif "503" in err_str or "loading" in lowered:
                wait = 15 * (attempt + 1)
                print(f" ⏳ Model loading, waiting {wait}s")
                time.sleep(wait)
            else:
                time.sleep(3 * (attempt + 1))
    return f"[API_ERROR] {err_str[:200]}"
def call_oai(prompt, sys_msg="", key="", model="gpt-5.2", max_tok=4096, temp=0.6):
    """Send one chat request to the OpenAI chat completions endpoint.

    Args:
        prompt: User message text.
        sys_msg: Optional system message; omitted from the request when empty.
        key: API key sent as a Bearer token.
        model: Model name placed in the request body.
        max_tok: Completion token limit.
        temp: Sampling temperature.

    Returns:
        The reply text with reasoning tags removed by _strip(), or a string
        starting with "[API_ERROR]" when both attempts fail. Repeated HTTP 429
        responses also produce the error string rather than None.
    """
    msgs = []
    if sys_msg:
        msgs.append({"role": "system", "content": sys_msg})
    msgs.append({"role": "user", "content": prompt})
    h = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
    # Uses max_completion_tokens, the same limit parameter judge() sends to
    # this endpoint; max_tokens is the deprecated spelling.
    body = {"model": model, "messages": msgs,
            "max_completion_tokens": max_tok, "temperature": temp}

    max_attempts = 2
    last_err = "no response"
    for attempt in range(max_attempts):
        has_retry = attempt < max_attempts - 1
        try:
            print(f" 🔑 OpenAI call: {model}")
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=h, json=body, timeout=120)
            if r.status_code == 429:
                last_err = "HTTP 429 (rate limited)"
                if has_retry:
                    time.sleep(5 * (attempt + 1))
                continue
            r.raise_for_status()
            return _strip(r.json()["choices"][0]["message"]["content"])
        except Exception as e:
            print(f" ❌ OpenAI error: {e}")
            last_err = e
            if has_retry:
                time.sleep(3)
    return f"[API_ERROR] {last_err}"
def call_model(prompt, sys_msg="", key="", model="", api_type="hf", max_tok=4096, temp=0.6):
    """Route a chat request to call_oai when api_type is "openai", else to call_hf."""
    backend = call_oai if api_type == "openai" else call_hf
    return backend(prompt, sys_msg, key, model, max_tok, temp)
| # ══════════════ MODELS ══════════════ | |
# Dropdown label -> model id used for the Hugging Face path; do_start() looks
# the selected label up here and falls back to the raw label when absent.
# NOTE(review): the "DeepSeek V3.2" label maps to a "V3-0324" id — confirm intended.
HF_MODELS = {
    "Qwen3.5-397B": "Qwen/Qwen3.5-397B-A17B",
    "Qwen3.5-122B": "Qwen/Qwen3.5-122B-A10B",
    "Qwen3.5-27B": "Qwen/Qwen3.5-27B",
    "Qwen3.5-35B": "Qwen/Qwen3.5-35B-A3B",
    "Qwen3.5-9B": "Qwen/Qwen3.5-9B",
    "Qwen3.5-4B": "Qwen/Qwen3.5-4B",
    "DeepSeek V3.2": "deepseek-ai/DeepSeek-V3-0324",
    "DeepSeek R1": "deepseek-ai/DeepSeek-R1",
    "Llama 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "Llama 4 Maverick": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
    "Phi-4": "microsoft/phi-4",
    "Mistral Large 3": "mistralai/Mistral-Large-Instruct-2501",
}
# Dropdown label -> model name used for the OpenAI path (same lookup in do_start()).
OAI_MODELS = {"GPT-5.2": "gpt-5.2", "GPT-5.4": "gpt-5.4", "GPT-5.1": "gpt-5.1"}
| # ══════════════ JUDGE ══════════════ | |
# System message sent by judge(): lists the five rubric dimensions, restricts
# each score to 0.0/0.25/0.5/0.75/1.0 and asks for a JSON-only reply.
JUDGE_SYS = """You are a FINAL Bench Metacognition Judge. Score 5 TICOS dimensions using ONLY 0.0/0.25/0.5/0.75/1.0:
1. trap_detection: Did model detect hidden traps? 1.0=all found, 0.0=fell in
2. insight_depth: Genuine deep understanding? 1.0=novel, 0.0=wrong
3. confidence_calibration: Confidence matches accuracy? 1.0=calibrated, 0.0=overconfident. Overconfidence is WORSE than underconfidence.
4. self_correction: Caught and fixed own errors? 1.0=backtracked+fixed, 0.0=none
5. synthesis_quality: Final synthesis coherent? 1.0=unified, 0.0=fragmented
Output ONLY JSON: {"scores":{"trap_detection":X,"insight_depth":X,"confidence_calibration":X,"self_correction":X,"synthesis_quality":X},"comment":"one line"}"""
def make_judge_prompt(task, response):
    """Build the user message for the judge from a task and the model's answer.

    The task prompt is cut to 1200 characters, the expected behavior to 500 and
    the response to 8000. The hidden-trap line is added only when the task has one.
    """
    score_slots = ', '.join(f'"{k}": ___' for k in RK)
    trap_line = ""
    if task.hidden_trap:
        trap_line = f"\nHidden trap: {task.hidden_trap}"
    lines = [
        "[FINAL Bench Evaluation]",
        f"Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.ticos_type} | {task.difficulty}",
        f"Title: {task.title}",
        f"Prompt: {task.prompt[:1200]}",
        f"Expected: {task.expected_behavior[:500]}{trap_line}",
        "=== RESPONSE ===",
        f"{response[:8000]}",
        "=== END ===",
        f'Output ONLY: {{"scores": {{{score_slots}}}, "comment": "..."}}',
    ]
    return "\n".join(lines)
def judge(prompt, key, model="gpt-5.2"):
    """Ask the judge model for rubric scores and return the parsed JSON dict.

    The request uses a strict JSON schema with one number per key in RK and a
    "comment" string. Up to three attempts are made.

    Returns:
        The parsed dict (any key in RK missing from "scores" is set to 0.5),
        or None when no attempt produced a dict containing "scores".
    """
    score_props = {}
    for dim in RK:
        score_props[dim] = {"type": "number", "enum": [0.0, 0.25, 0.5, 0.75, 1.0]}
    scores_schema = {
        "type": "object",
        "properties": score_props,
        "required": RK, "additionalProperties": False}
    result_schema = {
        "type": "object",
        "properties": {"scores": scores_schema, "comment": {"type": "string"}},
        "required": ["scores", "comment"], "additionalProperties": False}
    request_body = {
        "model": model, "max_completion_tokens": 4096, "temperature": 0.1,
        "messages": [{"role": "system", "content": JUDGE_SYS},
                     {"role": "user", "content": prompt}],
        "response_format": {
            "type": "json_schema",
            "json_schema": {"name": "FBResult", "strict": True, "schema": result_schema}}}
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}

    for attempt in range(3):
        can_retry = attempt < 2
        try:
            print(f" ⚖️ Judge call (attempt {attempt+1})")
            reply = requests.post("https://api.openai.com/v1/chat/completions",
                                  headers=headers, json=request_body, timeout=180)
            print(f" ⚖️ Judge status: {reply.status_code}")
            if reply.status_code == 429:
                # Rate limited: back off, then move on to the next attempt.
                time.sleep(5 * (attempt + 1))
                continue
            reply.raise_for_status()
            content = reply.json()["choices"][0]["message"]["content"]
            if not content:
                if not can_retry:
                    return None
                time.sleep(2)
                continue
            verdict = json.loads(_strip(content))
            if "scores" not in verdict:
                # Unusable reply: go straight to the next attempt.
                continue
            for dim in RK:
                if dim not in verdict["scores"]:
                    verdict["scores"][dim] = 0.5
            print(f" ✅ Judge OK: {verdict.get('comment','')[:50]}")
            return verdict
        except Exception as err:
            print(f" ❌ Judge error: {err}")
            if can_retry:
                time.sleep(3 * (attempt + 1))
    return None
| # ══════════════ DB ══════════════ | |
# SQLite file holding one row per (run id, task id).
DB = "final_bench.db"


def _db_run(sql, params=(), fetch=False):
    """Run one statement on a fresh connection and always close it.

    The statement is committed on success and rolled back if it raises; the
    connection is closed in both cases so a failed statement cannot leak it.
    Returns the fetched rows when ``fetch`` is true, otherwise None.
    """
    conn = sqlite3.connect(DB)
    try:
        with conn:  # commit on success, roll back on error
            cursor = conn.execute(sql, params)
            return cursor.fetchall() if fetch else None
    finally:
        conn.close()


def db_init():
    """Create the results table if it does not exist yet."""
    _db_run("CREATE TABLE IF NOT EXISTS results(rid TEXT, tid TEXT, resp TEXT, jdg TEXT, score REAL, ts REAL, PRIMARY KEY(rid,tid))")


def db_save(rid, tid, resp, jdg, score):
    """Insert or overwrite the row for (rid, tid), stamped with the current time."""
    _db_run("INSERT OR REPLACE INTO results VALUES(?,?,?,?,?,?)",
            (rid, tid, resp, jdg, score, time.time()))


def db_load(rid):
    """Return {task_id: {"response", "judge", "score"}} for every row of run ``rid``."""
    rows = _db_run("SELECT tid, resp, jdg, score FROM results WHERE rid=?", (rid,), fetch=True)
    return {tid: {"response": resp, "judge": jdg, "score": score}
            for tid, resp, jdg, score in rows}


def db_clear(rid):
    """Delete every stored row of run ``rid``."""
    _db_run("DELETE FROM results WHERE rid=?", (rid,))


# Make sure the table exists before any run starts.
db_init()
| # ══════════════ SCORES FILE ══════════════ | |
# Local JSON file accumulating the final score of every evaluated model.
SF = "final_scores.json"


def sf_load():
    """Read the scores file, or return an empty skeleton when it is unusable.

    A missing, unreadable or malformed file yields the skeleton. A file whose
    top level is not an object, or whose "models" entry is not an object, is
    treated the same way so sf_save() can always write into "models".
    """
    skeleton = {"version": "1.1", "bench": "FINAL-Bench/Metacognitive", "updated": "", "models": {}}
    try:
        # utf-8 matches what sf_save() writes (ensure_ascii=False output).
        with open(SF, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return skeleton
    if not isinstance(data, dict):
        return skeleton
    if not isinstance(data.get("models"), dict):
        data["models"] = {}
    return data


def sf_save(name, score, dom_scores, ticos_scores, n_total, n_done):
    """Store one model's result in the scores file and return the full document.

    Args:
        name: Key under "models" for this entry (an existing entry is replaced).
        score: Overall score stored as "final_score".
        dom_scores: Per-domain scores stored as "domain_scores".
        ticos_scores: Per-TICOS-type scores stored as "ticos_scores".
        n_total: Stored as "tasks_total".
        n_done: Stored as "tasks_completed".
    """
    d = sf_load()
    stamp = datetime.now().isoformat()
    d["updated"] = stamp
    d["models"][name] = {
        "final_score": score, "domain_scores": dom_scores,
        "ticos_scores": ticos_scores, "tasks_total": n_total,
        "tasks_completed": n_done, "evaluated_at": stamp}
    # Explicit utf-8: non-ASCII text is written verbatim and must not depend
    # on the platform's default encoding.
    with open(SF, "w", encoding="utf-8") as f:
        json.dump(d, f, indent=2, ensure_ascii=False)
    return d
def sf_upload(d):
    """Upload ``d`` as final_scores.json to the leaderboard dataset repo.

    Uses the HF_TOKEN environment variable. Returns a status string in every
    case: a warning when the token is unset, a success message, or the error
    text of a failed upload.
    """
    token = os.getenv("HF_TOKEN", "")
    if not token:
        return "⚠️ HF_TOKEN not set"
    try:
        from huggingface_hub import HfApi

        api = HfApi(token=token)
        payload = json.dumps(d, indent=2, ensure_ascii=False).encode("utf-8")
        stamp = datetime.now().strftime('%Y-%m-%d %H:%M')
        api.upload_file(
            path_or_fileobj=payload,
            path_in_repo="final_scores.json",
            repo_id="FINAL-Bench/ALL-Bench-Leaderboard",
            repo_type="dataset",
            commit_message=f"FINAL Score {stamp}",
        )
    except Exception as e:
        return f"❌ Upload: {e}"
    return "✅ Uploaded to HF"
| # ══════════════ EVAL ENGINE ══════════════ | |
| from concurrent.futures import ThreadPoolExecutor | |
def eval_one(task, rid, key, jkey, mid, jmodel, atype, state):
    """Evaluate one task: query the model, judge the answer, store the result.

    Args:
        task: Task to evaluate.
        rid: Run id used as the database key.
        key: API key for the evaluated model.
        jkey: API key for the judge.
        mid: Model id passed to call_model.
        jmodel: Judge model name.
        atype: API type passed to call_model.
        state: Shared state dict; "done", "jok", "errors" and "active" are
            updated under state["lock"].

    Returns:
        (task_id, {"response", "judge", "score"}). The score is 0 when the
        model call failed or an exception occurred, -1.0 when the judge
        failed, and the calc_score() value otherwise.
    """
    print(f"\n{'='*40}\n📝 Evaluating: {task.task_id} ({task.ticos_type})")
    try:
        # 1. Model response
        resp = call_model(task.prompt, key=key, model=mid, api_type=atype)
        if not resp or resp.startswith("[API_ERROR"):
            # resp can be None or "" here, so it must not be sliced directly.
            print(f" ❌ Model failed: {(resp or '')[:100]}")
            # NOTE(review): this 0 is stored and later averaged like a real
            # score, unlike the -1.0 used for judge failures — confirm intended.
            db_save(rid, task.task_id, resp or "empty", "{}", 0)
            with state["lock"]:
                state["done"] += 1
                state["errors"].append(task.task_id)
            return task.task_id, {"response": resp, "judge": "{}", "score": 0}

        # 2. Judge
        jp = make_judge_prompt(task, resp)
        jd = judge(jp, jkey, jmodel)
        if jd is None:
            print(f" ❌ Judge failed for {task.task_id}")
            jd = {"scores": {k: 0.0 for k in RK}, "comment": "judge_failed", "failed": True}
        if jd.get("failed"):
            # Negative sentinel: aggregations only use scores >= 0.
            sc = -1.0
        else:
            sc = calc_score(jd["scores"])
            with state["lock"]:
                state["jok"] += 1
        jj = json.dumps(jd, ensure_ascii=False)
        db_save(rid, task.task_id, resp, jj, sc)
        print(f" 📊 Score: {sc}")
        with state["lock"]:
            state["done"] += 1
            ti = TICOS.get(task.ticos_type, {})
            state["active"].append(f'{ti.get("i","📝")} {task.task_id} → {sc}')
            # Keep only the ten most recent entries for the progress view.
            if len(state["active"]) > 10:
                state["active"] = state["active"][-10:]
        return task.task_id, {"response": resp, "judge": jj, "score": sc}
    except Exception as e:
        print(f" 💥 Exception: {e}")
        db_save(rid, task.task_id, f"[ERR] {e}", "{}", 0)
        with state["lock"]:
            state["done"] += 1
            state["errors"].append(f"{task.task_id}: {str(e)[:40]}")
        return task.task_id, {"response": f"[ERR] {e}", "judge": "{}", "score": 0}
| # ── State ── | |
# Process-wide run state shared by the UI callbacks and the background thread.
ST = dict(
    running=False, stop=False, finished=False,
    rid="", model="", done=0, total=0, cached=0,
    errors=[], active=[], jok=0, t0=0,
    results={}, tasks=[],
    lock=threading.Lock(), msg="", csv=None, hf="",
)


def st_reset():
    """Put the per-run fields of ST back to their initial values.

    "rid", "model", "total" and "lock" are left untouched.
    """
    with ST["lock"]:
        initial = {
            "running": False, "stop": False, "finished": False,
            "done": 0, "cached": 0, "errors": [], "active": [], "jok": 0,
            "t0": 0, "results": {}, "tasks": [],
            "msg": "", "csv": None, "hf": "",
        }
        for field, value in initial.items():
            ST[field] = value
def bg_eval(key, jkey, mid, mname, jmodel, atype, tasks, rid, nw):
    """Background worker: evaluate every task that is not cached, then finalize.

    Args:
        key: API key for the evaluated model.
        jkey: API key for the judge.
        mid: Model id passed to eval_one.
        mname: Display name passed to finalize.
        jmodel: Judge model name.
        atype: API type passed to eval_one.
        tasks: Tasks selected for this run.
        rid: Run id; rows already stored under it are reused.
        nw: Number of worker threads.

    When ST["stop"] is set, tasks still waiting in the pool are cancelled and
    the run is marked stopped once the tasks already in flight have finished.
    Any exception is printed and reported through ST["msg"].
    """
    print(f"\n{'#'*50}")
    print(f"# BG EVAL START: {mname} ({len(tasks)} tasks, {nw} workers)")
    print(f"# API type: {atype}, Model ID: {mid}")
    print(f"{'#'*50}\n")
    try:
        cached = db_load(rid)
        nc = sum(1 for t in tasks if t.task_id in cached)
        pending = [t for t in tasks if t.task_id not in cached]
        with ST["lock"]:
            ST["results"] = cached
            ST["cached"] = nc
            ST["total"] = len(tasks)
            ST["t0"] = time.time()
        if not pending:
            with ST["lock"]:
                ST["msg"] = f"💾 All cached ({nc})"
            finalize(tasks, cached, mname)
            return
        with ST["lock"]:
            ST["msg"] = f"⚡ {len(pending)} tasks, {nw} workers"
        print(f"📋 Pending: {len(pending)}, Cached: {nc}")

        stopped = False
        with ThreadPoolExecutor(max_workers=nw) as exe:
            futs = {}
            for task in pending:
                if ST["stop"]:
                    break
                f = exe.submit(eval_one, task, rid, key, jkey, mid, jmodel, atype, ST)
                futs[f] = task
            done_set = set()
            while len(done_set) < len(futs):
                if ST["stop"]:
                    print("⏹️ Stop requested")
                    # Drop queued tasks; otherwise leaving the `with` block
                    # would still run every submitted task to completion.
                    exe.shutdown(wait=False, cancel_futures=True)
                    stopped = True
                    break
                for f in futs:
                    if f in done_set:
                        continue
                    if f.done():
                        done_set.add(f)
                        try:
                            tid, data = f.result()
                            with ST["lock"]:
                                ST["results"][tid] = data
                        except Exception as e:
                            print(f"Future error: {e}")
                time.sleep(0.5)

        if stopped:
            # Reached only after in-flight workers are done, so no worker is
            # still updating ST when the run is reported as not running.
            with ST["lock"]:
                ST["msg"] = "⏹️ Stopped"
                ST["running"] = False
                ST["finished"] = True
            return

        with ST["lock"]:
            results = dict(ST["results"])
        finalize(tasks, results, mname)
    except Exception as e:
        print(f"💥 BG EVAL CRASH: {e}")
        import traceback
        traceback.print_exc()
        with ST["lock"]:
            ST["msg"] = f"❌ {str(e)[:100]}"
            ST["running"] = False
            ST["finished"] = True
def _group_means(tasks, results):
    """Return (domain means, TICOS-type means, all valid scores) for ``tasks``.

    A score is valid when the task has an entry in ``results`` and the score
    is >= 0; negative scores are skipped. Means are rounded to 2 decimals.
    """
    by_domain, by_ticos, valid = {}, {}, []
    for t in tasks:
        entry = results.get(t.task_id)
        if entry is None or entry["score"] < 0:
            continue
        score = entry["score"]
        valid.append(score)
        by_domain.setdefault(t.domain, []).append(score)
        by_ticos.setdefault(t.ticos_type, []).append(score)

    def _means(groups):
        return {name: round(float(np.mean(vals)), 2) for name, vals in groups.items()}

    return _means(by_domain), _means(by_ticos), valid


def finalize(tasks, results, mname):
    """Aggregate scores, persist and upload them, write the CSV, mark the run done.

    Args:
        tasks: Tasks that were part of the run.
        results: {task_id: {"response", "judge", "score"}}.
        mname: Model name used as the key in the scores file.

    Side effects: writes the scores file via sf_save(), uploads it via
    sf_upload(), writes /tmp/fb_<rid>.csv and updates ST under its lock.
    """
    print(f"\n🏁 Finalizing: {len(results)} results")
    ds, ts, av = _group_means(tasks, results)
    fs = round(float(np.mean(av)), 2) if av else 0
    print(f"📊 FINAL Score: {fs} ({len(av)}/{len(tasks)} tasks)")
    sd = sf_save(mname, fs, ds, ts, len(tasks), len(av))
    hf = sf_upload(sd)
    el = int(time.time() - ST["t0"]) if ST["t0"] else 0

    # CSV
    cp = f"/tmp/fb_{ST['rid']}.csv"
    # newline="" is what the csv module expects from the file object, so it
    # controls line endings itself and quoted newlines survive.
    with open(cp, "w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(["task_id", "domain", "grade", "ticos", "difficulty", "title", "score", "comment"])
        tm = {t.task_id: t for t in tasks}
        for tid, d in sorted(results.items()):
            t = tm.get(tid)
            if not t:
                continue
            try:
                jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else {}
            except ValueError:
                # Stored judge text is not valid JSON: leave the comment empty.
                jd = {}
            comment = jd.get("comment", "") if isinstance(jd, dict) else ""
            w.writerow([tid, t.domain, t.grade, t.ticos_type, t.difficulty, t.title,
                        d["score"], comment[:200]])

    with ST["lock"]:
        ST["csv"] = cp
        ST["hf"] = hf
        ST["msg"] = f"🏁 FINAL Score = {fs} ({el}s, {len(av)}/{len(tasks)})"
        ST["running"] = False
        ST["finished"] = True
    print(f"✅ Done: FINAL Score = {fs}")
| # ══════════════ UI CALLBACKS ══════════════ | |
def do_start(model, api_type, eval_key, judge_key, judge_model, diff, max_t, workers, fresh):
    """Start an evaluation run in a background thread.

    Args:
        model: Dropdown label or custom model id.
        api_type: "HuggingFace Inference" selects the HF path; any other value
            selects the OpenAI path.
        eval_key: Key for the evaluated model. When blank, the environment
            variable matching the API type is used (HF_TOKEN or OPENAI_API_KEY).
        judge_key: Key for the judge. When blank, OPENAI_API_KEY is used.
        judge_model: Judge model name.
        diff: Difficulty filter; "전체" keeps every task.
        max_t: Maximum number of tasks.
        workers: Number of worker threads.
        fresh: When true, stored results for this model are deleted first.

    Returns:
        A status string for the UI.
    """
    print(f"\n🔘 START clicked: model={model}, api={api_type}, fresh={fresh}")
    if ST["running"]:
        return "⚠️ Already running"

    if api_type == "HuggingFace Inference":
        mid = HF_MODELS.get(model, model)
        at = "hf"
        eval_env = "HF_TOKEN"
    else:
        mid = OAI_MODELS.get(model, model)
        at = "openai"
        # The OpenAI path needs an OpenAI key; an HF token must not be used
        # as the fallback for a request sent to api.openai.com.
        eval_env = "OPENAI_API_KEY"

    eval_key = (eval_key or "").strip() or os.getenv(eval_env, "")
    judge_key = (judge_key or "").strip() or os.getenv("OPENAI_API_KEY", "")
    if not eval_key:
        print("❌ No eval key")
        return "❌ API Key needed"
    if not judge_key:
        print("❌ No judge key")
        return "❌ Judge Key needed"
    # Key material is deliberately kept out of the logs.
    print(" Keys: eval=*** judge=***")

    tasks = TASKS[:]
    if diff != "전체":
        tasks = [t for t in tasks if t.difficulty == diff]
    tasks = tasks[:int(max_t)]
    print(f" Model ID: {mid}, Tasks: {len(tasks)}")

    # Run id depends on the model id only, so repeated runs share stored results.
    rid = hashlib.md5(f"FB_{mid}".encode()).hexdigest()[:12]
    if fresh:
        db_clear(rid)
        print(" 🗑️ Cache cleared")

    st_reset()
    with ST["lock"]:
        ST["running"] = True
        ST["rid"] = rid
        ST["model"] = model
        ST["tasks"] = tasks
        ST["total"] = len(tasks)

    thread = threading.Thread(
        target=bg_eval,
        args=(eval_key, judge_key, mid, model, judge_model, at, tasks, rid, int(workers)),
        daemon=True)
    thread.start()
    print(f" 🧵 Thread started")
    return f"🧬 {model} started ({len(tasks)} tasks, {int(workers)} workers)"
def do_stop():
    """Request a stop of the current run; returns a status string for the UI."""
    if not ST["running"]:
        return "Not running"
    # bg_eval reads this flag while submitting and while waiting for results.
    ST["stop"] = True
    return "⏹️ Stopping..."
def _poll_color(value):
    """Bar color for a 0-100 score: green from 80, amber from 60, red below."""
    return "#22c55e" if value >= 80 else ("#f59e0b" if value >= 60 else "#ef4444")


def _poll_bar(label, label_px, scores):
    """One labelled summary bar showing the mean of ``scores`` (1 decimal)."""
    a = round(float(np.mean(scores)), 1)
    c = _poll_color(a)
    return (f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:{label_px}px;font-size:13px">{label}</span>'
            f'<div style="flex:1;background:#334155;border-radius:6px;height:14px;overflow:hidden"><div style="width:{min(a,100)}%;height:100%;background:{c};border-radius:6px"></div></div>'
            f'<span style="width:50px;text-align:right;font-weight:700;color:{c};font-size:13px">{a}</span></div>')


def do_poll():
    """Timer callback: render progress, results table, summary and CSV path.

    Returns:
        (progress_html, table_html, summary_html, csv_path). Before any run has
        started, a hint is returned with empty table/summary and no CSV.

    All shared state is copied under ST["lock"] first. Text that originates
    from user input, the dataset or exception messages is HTML-escaped.
    """
    from html import escape

    with ST["lock"]:
        running = ST["running"]
        finished = ST["finished"]
        tasks = ST.get("tasks", [])
        results = dict(ST.get("results", {}))
        msg = ST.get("msg", "")
        csvp = ST.get("csv")
        dn = ST["done"]
        tot = ST.get("total", 1)
        t0 = ST.get("t0", 0)
        active = list(ST.get("active", []))
        jok = ST.get("jok", 0)
        errs = list(ST.get("errors", []))
        model = ST.get("model", "")
        hf_status = ST.get("hf", "")

    if not running and not finished and not results:
        return ("ℹ️ Select model → press ▶️ Start", "", "", None)

    def valid_scores(keep):
        # Scores of tasks accepted by `keep` that have a non-negative result.
        return [results[t.task_id]["score"] for t in tasks
                if keep(t) and t.task_id in results and results[t.task_id]["score"] >= 0]

    # Progress bar
    if running:
        pct = min(int(dn / max(tot, 1) * 100), 100)
        # t0 stays 0 until the background thread sets it; show 0s until then
        # instead of the seconds elapsed since the epoch.
        el = int(time.time() - t0) if t0 else 0
        eta = int((el / max(dn, 1)) * (tot - dn)) if dn > 0 else 0
        tags = " ".join([f'<span style="background:#ede9fe;padding:2px 6px;border-radius:4px;'
                         f'font-size:12px">{escape(str(a))}</span>' for a in active[-6:]])
        err_html = ""
        if errs:
            shown = ", ".join(escape(str(e)) for e in errs[-3:])
            err_html = f'<div style="color:#dc2626;margin-top:6px;font-size:12px">⚠️ Errors: {shown}</div>'
        prog = f"""<div style="padding:12px;background:#fafafa;border-radius:8px;border:1px solid #e5e7eb">
<div style="display:flex;justify-content:space-between;margin-bottom:6px">
<span style="font-size:14px">🧬 {dn}/{tot} · {el}s · ETA ~{eta}s · Judge ✅{jok}</span>
<span style="font-weight:700;color:#7c3aed;font-size:16px">{pct}%</span>
</div>
<div style="background:#e5e7eb;border-radius:8px;height:24px;overflow:hidden">
<div style="width:{pct}%;height:100%;border-radius:8px;background:linear-gradient(90deg,#7c3aed,#6366f1);transition:width 0.3s"></div>
</div>
<div style="margin-top:8px">{tags}</div>{err_html}
</div>"""
    elif finished:
        prog = f'<div style="background:#f0fdf4;padding:16px;border-radius:8px;font-weight:700;border-left:4px solid #16a34a;font-size:16px">🏁 {escape(str(msg))}</div>'
    else:
        prog = f'<div style="padding:12px">{escape(str(msg))}</div>'

    # Results table
    tbl = ""
    if tasks:
        row_html = []
        for t in tasks:
            ti = TICOS.get(t.ticos_type, {"i": "📝", "n": t.ticos_type})
            cells = (f'<td>{escape(str(t.task_id))}</td><td>{ti["i"]}</td><td>{escape(str(t.domain))}</td>'
                     f'<td>{escape(str(ti["n"]))}</td><td>{escape(str(t.difficulty))}</td>')
            if t.task_id not in results:
                row_html.append(f'<tr style="opacity:0.4">{cells}<td>⏳</td></tr>')
                continue
            s = results[t.task_id]["score"]
            if s < 0:
                # Negative score is the judge-failure sentinel.
                row_html.append(f'<tr style="background:#fef3c7">{cells}<td style="color:#f59e0b;font-weight:700">❌ Judge failed</td></tr>')
            else:
                c = _poll_color(s)
                row_html.append(f'<tr>{cells}<td><div style="display:flex;align-items:center;gap:6px"><div style="background:#e5e7eb;border-radius:6px;height:16px;width:80px;overflow:hidden"><div style="width:{min(s,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="color:{c};font-weight:700;font-size:12px">{s:.1f}</span></div></td></tr>')
        rows = "".join(row_html)
        tbl = f'<table style="width:100%;border-collapse:collapse;font-size:13px"><thead><tr style="background:#f1f5f9"><th style="padding:8px;text-align:left">ID</th><th></th><th>Domain</th><th>TICOS</th><th>Diff</th><th>Score</th></tr></thead><tbody>{rows}</tbody></table>'

    # Summary
    sm = ""
    if finished and tasks:
        av = valid_scores(lambda t: True)
        fs = round(float(np.mean(av)), 2) if av else 0
        # Domain bars
        dh_parts = []
        for dom in sorted(set(t.domain for t in tasks)):
            v = valid_scores(lambda t, dom=dom: t.domain == dom)
            if v:
                dh_parts.append(_poll_bar(escape(str(dom)), 180, v))
        dh = "".join(dh_parts)
        # TICOS bars
        th_parts = []
        for tt, info in TICOS.items():
            v = valid_scores(lambda t, tt=tt: t.ticos_type == tt)
            if v:
                th_parts.append(_poll_bar(f'{info["i"]} {info["n"]}', 160, v))
        th = "".join(th_parts)
        sm = f"""<div style="background:linear-gradient(135deg,#1e1b4b,#312e81);border-radius:14px;padding:24px;color:#fff;margin:8px 0">
<h2 style="margin:0;font-size:28px;text-align:center">🧬 FINAL Score: {fs} / 100</h2>
<p style="text-align:center;color:#a5b4fc;margin:8px 0">{escape(str(model))} · {len(av)} tasks</p>
<hr style="border-color:#4338ca;margin:16px 0">
<h4 style="color:#a5b4fc;margin:8px 0">📚 Domains</h4>{dh}
<hr style="border-color:#4338ca;margin:16px 0">
<h4 style="color:#a5b4fc;margin:8px 0">🧬 TICOS Types</h4>{th}
<hr style="border-color:#4338ca;margin:16px 0">
<p style="font-size:12px;color:#818cf8">{escape(str(hf_status))}</p></div>"""
    return (prog, tbl, sm, csvp)
def update_models(api_type):
    """Return a dropdown update listing the models of the chosen API type.

    The first label of the selected table becomes the new value.
    """
    table = HF_MODELS if api_type == "HuggingFace Inference" else OAI_MODELS
    labels = list(table.keys())
    return gr.update(choices=labels, value=labels[0])
| # ══════════════ GRADIO APP ══════════════ | |
# Static banner rendered at the top of the page by create_app().
HEADER_HTML = """<div style="text-align:center;padding:16px 0">
<h1 style="margin:0;font-size:28px">🧬 FINAL Bench Auto-Evaluator v1.1</h1>
<h2 style="margin:4px 0;color:#6b7280;font-size:16px">Metacognitive Intelligence · 100 Tasks · TICOS Scoring</h2>
<p style="color:#9ca3af;font-size:13px;max-width:700px;margin:8px auto;line-height:1.6">
📊 <b>FINAL-Bench/Metacognitive</b> 100 tasks · 15 domains · 8 TICOS types<br>
🧬 <b>TICOS</b>: Trap · Insight · Confidence · Self-Correction · Synthesis<br>
📡 HF Inference API (open-source) + 🔑 OpenAI (closed) → ⚖️ GPT-5.2 Judge<br>
📊 → <code>final_scores.json</code> → ALL Bench Metacog column</p></div>"""
def create_app():
    """Build the Gradio Blocks UI and wire its callbacks.

    Layout: API type and model selectors, key inputs, run options, start/stop
    buttons, a status line, the existing scores, and four result tabs that a
    2-second timer refreshes through do_poll().

    The key textboxes start empty. Server-side HF_TOKEN / OPENAI_API_KEY are
    not used as initial values because component values are sent to the
    browser; do_start() falls back to the environment when a box is blank.

    Returns:
        The gr.Blocks application.
    """
    with gr.Blocks(title="FINAL Bench Evaluator") as app:
        gr.HTML(HEADER_HTML)
        with gr.Row():
            api_type = gr.Radio(
                ["HuggingFace Inference", "OpenAI Compatible"],
                value="HuggingFace Inference", label="📡 API Type", scale=2)
            model_dd = gr.Dropdown(
                list(HF_MODELS.keys()), value=list(HF_MODELS.keys())[0],
                label="🤖 Target Model", scale=3, allow_custom_value=True)
        api_type.change(update_models, [api_type], [model_dd])
        with gr.Row():
            # No `value=`: secrets from the environment must stay on the server.
            eval_key = gr.Textbox(
                label="🔑 Eval API Key (HF Token or OpenAI)",
                type="password", placeholder="hf_... or sk-...",
                scale=3)
            judge_key = gr.Textbox(
                label="⚖️ Judge Key (OpenAI)",
                type="password", placeholder="sk-...",
                scale=3)
        with gr.Row():
            judge_model = gr.Textbox(label="⚖️ Judge Model", value="gpt-5.2", scale=2)
            diff_dd = gr.Dropdown(
                ["전체", "expert", "frontier"],
                value="전체", label="Difficulty", scale=1)
            max_tasks = gr.Slider(1, 100, value=100, step=1, label="Max Tasks", scale=2)
            workers = gr.Slider(1, 20, value=10, step=1, label="⚡ Workers", scale=1)
        with gr.Row():
            start_btn = gr.Button("▶️ Start (Resume)", variant="primary", size="lg", scale=2)
            fresh_btn = gr.Button("🚀 Fresh Start", variant="secondary", size="lg", scale=2)
            stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", scale=1)
        status = gr.Textbox(label="Status", interactive=False, max_lines=1)
        with gr.Accordion("📊 Existing FINAL Scores", open=False):
            # Read once while the UI is built; not refreshed by the timer.
            gr.JSON(value=sf_load(), label="final_scores.json")
        with gr.Tabs():
            with gr.Tab("📊 Progress"):
                prog_html = gr.HTML()
            with gr.Tab("📋 Results"):
                table_html = gr.HTML()
            with gr.Tab("🏆 Summary"):
                summary_html = gr.HTML()
            with gr.Tab("💾 CSV"):
                csv_file = gr.File(label="CSV Download")
        # Timer for polling
        timer = gr.Timer(value=2, active=True)
        timer.tick(fn=do_poll, outputs=[prog_html, table_html, summary_html, csv_file])
        # Button handlers
        inputs = [model_dd, api_type, eval_key, judge_key, judge_model,
                  diff_dd, max_tasks, workers]
        start_btn.click(
            fn=lambda *a: do_start(*a, fresh=False),
            inputs=inputs, outputs=[status])
        fresh_btn.click(
            fn=lambda *a: do_start(*a, fresh=True),
            inputs=inputs, outputs=[status])
        stop_btn.click(fn=do_stop, outputs=[status])
        gr.Markdown(f"""---
<center>🧬 FINAL Bench Auto-Evaluator v1.1 · Apache 2.0 · Ginigen AI<br>
Data: <a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive">FINAL-Bench/Metacognitive</a> ({len(TASKS)} tasks)<br>
→ ALL Bench Leaderboard Metacog auto-sync</center>""")
    return app
if __name__ == "__main__":
    # Startup summary: number of loaded tasks per TICOS type, sorted by type key.
    type_counts = {}
    for task in TASKS:
        type_counts.setdefault(task.ticos_type, 0)
        type_counts[task.ticos_type] += 1
    print(f"FINAL Bench Evaluator: {len(TASKS)} tasks")
    for ticos_key in sorted(type_counts):
        meta = TICOS.get(ticos_key, {"i": "?", "n": ticos_key})
        print(f" {meta['i']} {ticos_key}: {type_counts[ticos_key]}")

    # Build the UI, enable the queue and serve on all interfaces, port 7860.
    app = create_app()
    app.queue(default_concurrency_limit=2)
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        ssr_mode=False,
        theme=gr.themes.Soft(),
        css=".gradio-container{max-width:1100px !important}",
    )