Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,445 +1,723 @@
|
|
| 1 |
"""
|
| 2 |
-
FINAL Bench Auto-Evaluator v1.
|
| 3 |
-
================================
|
| 4 |
-
FINAL-Bench/Metacognitive 100
|
| 5 |
-
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
Author: Ginigen AI Β· FINAL-Bench Β· License: Apache 2.0
|
| 10 |
"""
|
| 11 |
-
import json,os,time,
|
| 12 |
from datetime import datetime
|
| 13 |
from dataclasses import dataclass
|
| 14 |
from typing import Optional
|
| 15 |
import requests, numpy as np, gradio as gr
|
| 16 |
|
|
|
|
|
|
|
| 17 |
@dataclass
|
| 18 |
-
class
|
| 19 |
-
task_id:str; domain:str; grade:str; ticos_type:
|
| 20 |
-
|
| 21 |
-
hidden_trap:Optional[str]
|
|
|
|
| 22 |
|
| 23 |
def load_tasks():
|
| 24 |
try:
|
| 25 |
from datasets import load_dataset
|
| 26 |
-
ds=load_dataset("FINAL-Bench/Metacognitive",split="train")
|
| 27 |
-
tasks=[
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
return tasks
|
| 34 |
except Exception as e:
|
| 35 |
-
print(f"
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
"
|
| 43 |
-
"
|
| 44 |
-
"
|
| 45 |
-
"
|
| 46 |
-
"
|
| 47 |
-
"
|
|
|
|
|
|
|
| 48 |
}
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
def
|
| 57 |
-
return round(sum(scores.get(k,0.5)*w for k,w in
|
|
|
|
|
|
|
| 58 |
|
| 59 |
def _strip(t):
|
| 60 |
-
if not t:return t
|
| 61 |
-
|
| 62 |
-
|
| 63 |
return t.strip()
|
| 64 |
|
| 65 |
-
def call_hf(prompt,
|
| 66 |
-
msgs=[]
|
| 67 |
-
if
|
| 68 |
-
msgs.append({"role":"user","content":prompt})
|
| 69 |
-
h={"Content-Type":"application/json","Authorization":f"Bearer {key}"}
|
| 70 |
-
|
| 71 |
-
|
|
|
|
| 72 |
try:
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
r.raise_for_status()
|
| 76 |
-
|
|
|
|
|
|
|
| 77 |
except Exception as e:
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
msgs.append({"role":"
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
| 88 |
try:
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
| 91 |
r.raise_for_status()
|
| 92 |
return _strip(r.json()["choices"][0]["message"]["content"])
|
| 93 |
except Exception as e:
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
"
|
| 107 |
-
"
|
| 108 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
}
|
| 110 |
-
OAI_MODELS={"GPT-5.2":"gpt-5.2","GPT-5.4":"gpt-5.4","GPT-5.1":"gpt-5.1"}
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
3. confidence_calibration
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
def build_jprompt(task,resp):
|
| 129 |
-
rl="\n".join([f' "{k}": {d}' for k,d in RUBRIC_D.items()])
|
| 130 |
-
sk=", ".join([f'"{k}": ___' for k in RUBRIC_KEYS])
|
| 131 |
-
ht=f"\nHidden trap: {task.hidden_trap}" if task.hidden_trap else ""
|
| 132 |
-
return f"""[FINAL Bench Metacognition Evaluation]
|
| 133 |
Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.ticos_type} | {task.difficulty}
|
| 134 |
Title: {task.title}
|
| 135 |
Prompt: {task.prompt[:1200]}
|
| 136 |
Expected: {task.expected_behavior[:500]}{ht}
|
| 137 |
=== RESPONSE ===
|
| 138 |
-
{
|
| 139 |
=== END ===
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
for a in range(3):
|
| 154 |
try:
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
r.raise_for_status()
|
| 158 |
-
c=r.json()["choices"][0]["message"]["content"]
|
| 159 |
if not c:
|
| 160 |
-
if a<2:time.sleep(2);continue
|
| 161 |
return None
|
| 162 |
-
d=json.loads(_strip(c))
|
| 163 |
if "scores" in d:
|
| 164 |
-
for k in
|
| 165 |
-
if k not in d["scores"]:d["scores"][k]=0.5
|
|
|
|
| 166 |
return d
|
| 167 |
-
except:
|
| 168 |
-
|
|
|
|
| 169 |
return None
|
| 170 |
|
| 171 |
-
DB
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
c
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
try:
|
| 188 |
-
with open(SF) as f:return json.load(f)
|
| 189 |
-
except:return{"version":"1.
|
| 190 |
-
|
| 191 |
-
def
|
| 192 |
-
d=
|
| 193 |
-
d["
|
| 194 |
-
|
| 195 |
-
|
|
|
|
|
|
|
|
|
|
| 196 |
return d
|
| 197 |
|
| 198 |
-
def
|
| 199 |
-
tk=os.getenv("HF_TOKEN","")
|
| 200 |
-
if not tk:return "HF_TOKEN
|
| 201 |
try:
|
| 202 |
from huggingface_hub import HfApi
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
path_in_repo="final_scores.json",
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
from concurrent.futures import ThreadPoolExecutor
|
| 211 |
|
| 212 |
-
def
|
|
|
|
| 213 |
try:
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
if jd is None:
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
except Exception as e:
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
try:
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
return
|
| 266 |
for f in list(futs):
|
| 267 |
-
if f in
|
| 268 |
if f.done():
|
| 269 |
-
|
| 270 |
try:
|
| 271 |
-
tid,
|
| 272 |
-
with
|
| 273 |
-
except:
|
|
|
|
| 274 |
time.sleep(0.5)
|
| 275 |
-
|
| 276 |
-
|
|
|
|
|
|
|
| 277 |
except Exception as e:
|
| 278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
-
|
| 281 |
-
global _S
|
| 282 |
-
ds={};ts={}
|
| 283 |
for dom in set(t.domain for t in tasks):
|
| 284 |
-
v=[
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
| 286 |
for tt in set(t.ticos_type for t in tasks):
|
| 287 |
-
v=[
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
if tasks:
|
| 337 |
-
rows=""
|
| 338 |
for t in tasks:
|
| 339 |
-
|
| 340 |
-
if t.task_id in
|
| 341 |
-
s=
|
| 342 |
-
if s
|
|
|
|
| 343 |
else:
|
| 344 |
-
c
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
for dom in sorted(set(t.domain for t in tasks)):
|
| 354 |
-
v=[
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
return
|
| 382 |
-
|
| 383 |
-
def
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
return
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
<
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
π‘ HF Inference (μ€νμμ€) + π OpenAI (ν΄λ‘μ¦λ) β βοΈ GPT-5.2 Judge<br>
|
| 399 |
-
π β <code>final_scores.json</code> β ALL Bench Metacog μλ λ°μ</p></div>"""
|
| 400 |
|
| 401 |
def create_app():
|
| 402 |
-
with gr.Blocks(title="FINAL Bench Evaluator"
|
| 403 |
-
|
| 404 |
-
|
| 405 |
with gr.Row():
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
with gr.Row():
|
| 410 |
-
|
| 411 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
with gr.Row():
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
|
|
|
|
|
|
|
|
|
| 417 |
with gr.Row():
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
|
|
|
|
|
|
|
|
|
| 424 |
with gr.Tabs():
|
| 425 |
-
with gr.Tab("π Progress"):
|
| 426 |
-
|
| 427 |
-
with gr.Tab("
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
return app
|
| 437 |
|
| 438 |
-
if __name__=="__main__":
|
| 439 |
-
|
| 440 |
-
for t in
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
app.queue(default_concurrency_limit=2)
|
| 445 |
-
app.launch(server_name="0.0.0.0",server_port=7860,ssr_mode=False
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
FINAL Bench Auto-Evaluator v1.1
|
| 3 |
+
================================
|
| 4 |
+
FINAL-Bench/Metacognitive 100 tasks
|
| 5 |
+
HF Inference API + OpenAI Judge -> final_scores.json
|
| 6 |
+
Gradio 6.x compatible
|
| 7 |
|
| 8 |
+
Author: Ginigen AI Β· License: Apache 2.0
|
|
|
|
|
|
|
| 9 |
"""
|
| 10 |
+
import json, os, time, re, hashlib, sqlite3, threading, csv, io
|
| 11 |
from datetime import datetime
|
| 12 |
from dataclasses import dataclass
|
| 13 |
from typing import Optional
|
| 14 |
import requests, numpy as np, gradio as gr
|
| 15 |
|
| 16 |
+
# ββββββββββββββ DATA ββββββββββββββ
|
| 17 |
+
|
| 18 |
@dataclass
|
| 19 |
+
class Task:
|
| 20 |
+
task_id: str; domain: str; grade: str; ticos_type: str
|
| 21 |
+
difficulty: str; lens: str; title: str; prompt: str
|
| 22 |
+
expected_behavior: str; hidden_trap: Optional[str] = None
|
| 23 |
+
ticos_required: str = ""; ticos_optional: str = ""
|
| 24 |
|
| 25 |
def load_tasks():
|
| 26 |
try:
|
| 27 |
from datasets import load_dataset
|
| 28 |
+
ds = load_dataset("FINAL-Bench/Metacognitive", split="train")
|
| 29 |
+
tasks = []
|
| 30 |
+
for r in ds:
|
| 31 |
+
tasks.append(Task(
|
| 32 |
+
task_id=r["task_id"], domain=r["domain"], grade=r["grade"],
|
| 33 |
+
ticos_type=r["ticos_type"], difficulty=r["difficulty"],
|
| 34 |
+
lens=r.get("lens",""), title=r["title"], prompt=r["prompt"],
|
| 35 |
+
expected_behavior=r["expected_behavior"],
|
| 36 |
+
hidden_trap=r.get("hidden_trap"),
|
| 37 |
+
ticos_required=r.get("ticos_required",""),
|
| 38 |
+
ticos_optional=r.get("ticos_optional","")))
|
| 39 |
+
print(f"β
{len(tasks)} tasks loaded")
|
| 40 |
return tasks
|
| 41 |
except Exception as e:
|
| 42 |
+
print(f"β Load failed: {e}")
|
| 43 |
+
return []
|
| 44 |
+
|
| 45 |
+
TASKS = load_tasks()
|
| 46 |
+
|
| 47 |
+
# TICOS types from actual dataset
|
| 48 |
+
TICOS = {
|
| 49 |
+
"A_TrapEscape": {"n": "ν¨μ νμΆ", "i": "πͺ€"},
|
| 50 |
+
"B_ContradictionResolution": {"n": "λͺ¨μν΄κ²°", "i": "β‘"},
|
| 51 |
+
"C_ProgressiveDiscovery": {"n": "μ μ§λ°κ²¬", "i": "π¬"},
|
| 52 |
+
"D_MultiConstraint": {"n": "λ€μ€μ μ½", "i": "π―"},
|
| 53 |
+
"E_SelfCorrecting": {"n": "μκΈ°μμ ", "i": "π"},
|
| 54 |
+
"F_ExpertPanel": {"n": "μ λ¬Έκ°ν λ‘ ", "i": "π₯"},
|
| 55 |
+
"G_PivotDetection": {"n": "μ νκ°μ§", "i": "π"},
|
| 56 |
+
"H_DecisionUnderUncertainty":{"n": "λΆνμ€μ±νλ¨", "i": "π"},
|
| 57 |
}
|
| 58 |
|
| 59 |
+
# ββββββββββββββ RUBRIC ββββββββββββββ
|
| 60 |
+
|
| 61 |
+
RK = ["trap_detection", "insight_depth", "confidence_calibration", "self_correction", "synthesis_quality"]
|
| 62 |
+
RW = {"trap_detection": 0.20, "insight_depth": 0.20, "confidence_calibration": 0.25,
|
| 63 |
+
"self_correction": 0.20, "synthesis_quality": 0.15}
|
| 64 |
+
RD = {"trap_detection": "Hidden trap/error detection",
|
| 65 |
+
"insight_depth": "Depth of genuine insight",
|
| 66 |
+
"confidence_calibration": "Confidence-accuracy alignment (overconfidence penalized)",
|
| 67 |
+
"self_correction": "Error detection and actual correction",
|
| 68 |
+
"synthesis_quality": "Coherent final synthesis"}
|
| 69 |
|
| 70 |
+
def calc_score(scores):
|
| 71 |
+
return round(sum(scores.get(k, 0.5) * w for k, w in RW.items()) * 100, 2)
|
| 72 |
+
|
| 73 |
+
# ββββββββββββββ LLM CALLS ββββββββββββββ
|
| 74 |
|
| 75 |
def _strip(t):
|
| 76 |
+
if not t: return t
|
| 77 |
+
for tag in ['think', 'thinking', 'reasoning', 'reflection']:
|
| 78 |
+
t = re.sub(rf'<{tag}>.*?</{tag}>', '', t, flags=re.DOTALL)
|
| 79 |
return t.strip()
|
| 80 |
|
| 81 |
+
def call_hf(prompt, sys_msg="", key="", model="Qwen/Qwen3.5-397B-A17B", max_tok=4096, temp=0.6):
|
| 82 |
+
msgs = []
|
| 83 |
+
if sys_msg: msgs.append({"role": "system", "content": sys_msg})
|
| 84 |
+
msgs.append({"role": "user", "content": prompt})
|
| 85 |
+
h = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
|
| 86 |
+
body = {"model": model, "messages": msgs, "max_tokens": max_tok, "temperature": temp, "stream": False}
|
| 87 |
+
|
| 88 |
+
for attempt in range(3):
|
| 89 |
try:
|
| 90 |
+
print(f" π‘ HF call: {model} (attempt {attempt+1})")
|
| 91 |
+
r = requests.post(
|
| 92 |
+
f"https://router.huggingface.co/hf-inference/models/{model}/v1/chat/completions",
|
| 93 |
+
headers=h, json=body, timeout=120)
|
| 94 |
+
print(f" π‘ Status: {r.status_code}")
|
| 95 |
+
if r.status_code in (429, 503):
|
| 96 |
+
wait = 10 * (attempt + 1)
|
| 97 |
+
print(f" β³ Rate limited, waiting {wait}s")
|
| 98 |
+
time.sleep(wait); continue
|
| 99 |
r.raise_for_status()
|
| 100 |
+
content = r.json()["choices"][0]["message"]["content"]
|
| 101 |
+
print(f" β
Got {len(content)} chars")
|
| 102 |
+
return _strip(content)
|
| 103 |
except Exception as e:
|
| 104 |
+
print(f" β HF error: {e}")
|
| 105 |
+
if attempt < 2: time.sleep(3 * (attempt + 1))
|
| 106 |
+
else: return f"[API_ERROR] {e}"
|
| 107 |
+
|
| 108 |
+
def call_oai(prompt, sys_msg="", key="", model="gpt-5.2", max_tok=4096, temp=0.6):
|
| 109 |
+
msgs = []
|
| 110 |
+
if sys_msg: msgs.append({"role": "system", "content": sys_msg})
|
| 111 |
+
msgs.append({"role": "user", "content": prompt})
|
| 112 |
+
h = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
|
| 113 |
+
body = {"model": model, "messages": msgs, "max_tokens": max_tok, "temperature": temp}
|
| 114 |
+
|
| 115 |
+
for attempt in range(2):
|
| 116 |
try:
|
| 117 |
+
print(f" π OpenAI call: {model}")
|
| 118 |
+
r = requests.post("https://api.openai.com/v1/chat/completions",
|
| 119 |
+
headers=h, json=body, timeout=120)
|
| 120 |
+
if r.status_code == 429:
|
| 121 |
+
time.sleep(5 * (attempt + 1)); continue
|
| 122 |
r.raise_for_status()
|
| 123 |
return _strip(r.json()["choices"][0]["message"]["content"])
|
| 124 |
except Exception as e:
|
| 125 |
+
print(f" β OpenAI error: {e}")
|
| 126 |
+
if attempt < 1: time.sleep(3)
|
| 127 |
+
else: return f"[API_ERROR] {e}"
|
| 128 |
+
|
| 129 |
+
def call_model(prompt, sys_msg="", key="", model="", api_type="hf", max_tok=4096, temp=0.6):
|
| 130 |
+
if api_type == "openai":
|
| 131 |
+
return call_oai(prompt, sys_msg, key, model, max_tok, temp)
|
| 132 |
+
return call_hf(prompt, sys_msg, key, model, max_tok, temp)
|
| 133 |
+
|
| 134 |
+
# ββββββββββββββ MODELS ββββββββββββββ
|
| 135 |
+
|
| 136 |
+
HF_MODELS = {
|
| 137 |
+
"Qwen3.5-397B": "Qwen/Qwen3.5-397B-A17B",
|
| 138 |
+
"Qwen3.5-122B": "Qwen/Qwen3.5-122B-A10B",
|
| 139 |
+
"Qwen3.5-27B": "Qwen/Qwen3.5-27B",
|
| 140 |
+
"Qwen3.5-35B": "Qwen/Qwen3.5-35B-A3B",
|
| 141 |
+
"Qwen3.5-9B": "Qwen/Qwen3.5-9B",
|
| 142 |
+
"Qwen3.5-4B": "Qwen/Qwen3.5-4B",
|
| 143 |
+
"DeepSeek V3.2": "deepseek-ai/DeepSeek-V3-0324",
|
| 144 |
+
"DeepSeek R1": "deepseek-ai/DeepSeek-R1",
|
| 145 |
+
"Llama 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
|
| 146 |
+
"Llama 4 Maverick": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
|
| 147 |
+
"Phi-4": "microsoft/phi-4",
|
| 148 |
+
"Mistral Large 3": "mistralai/Mistral-Large-Instruct-2501",
|
| 149 |
}
|
| 150 |
+
OAI_MODELS = {"GPT-5.2": "gpt-5.2", "GPT-5.4": "gpt-5.4", "GPT-5.1": "gpt-5.1"}
|
| 151 |
+
|
| 152 |
+
# ββββββββββββββ JUDGE ββββββββββββββ
|
| 153 |
+
|
| 154 |
+
JUDGE_SYS = """You are a FINAL Bench Metacognition Judge. Score 5 TICOS dimensions using ONLY 0.0/0.25/0.5/0.75/1.0:
|
| 155 |
+
|
| 156 |
+
1. trap_detection: Did model detect hidden traps? 1.0=all found, 0.0=fell in
|
| 157 |
+
2. insight_depth: Genuine deep understanding? 1.0=novel, 0.0=wrong
|
| 158 |
+
3. confidence_calibration: Confidence matches accuracy? 1.0=calibrated, 0.0=overconfident. Overconfidence is WORSE than underconfidence.
|
| 159 |
+
4. self_correction: Caught and fixed own errors? 1.0=backtracked+fixed, 0.0=none
|
| 160 |
+
5. synthesis_quality: Final synthesis coherent? 1.0=unified, 0.0=fragmented
|
| 161 |
+
|
| 162 |
+
Output ONLY JSON: {"scores":{"trap_detection":X,"insight_depth":X,"confidence_calibration":X,"self_correction":X,"synthesis_quality":X},"comment":"one line"}"""
|
| 163 |
+
|
| 164 |
+
def make_judge_prompt(task, response):
|
| 165 |
+
sk = ', '.join([f'"{k}": ___' for k in RK])
|
| 166 |
+
ht = f"\nHidden trap: {task.hidden_trap}" if task.hidden_trap else ""
|
| 167 |
+
return f"""[FINAL Bench Evaluation]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.ticos_type} | {task.difficulty}
|
| 169 |
Title: {task.title}
|
| 170 |
Prompt: {task.prompt[:1200]}
|
| 171 |
Expected: {task.expected_behavior[:500]}{ht}
|
| 172 |
=== RESPONSE ===
|
| 173 |
+
{response[:8000]}
|
| 174 |
=== END ===
|
| 175 |
+
Output ONLY: {{"scores": {{{sk}}}, "comment": "..."}}"""
|
| 176 |
+
|
| 177 |
+
def judge(prompt, key, model="gpt-5.2"):
|
| 178 |
+
schema = {
|
| 179 |
+
"type": "object",
|
| 180 |
+
"properties": {
|
| 181 |
+
"scores": {
|
| 182 |
+
"type": "object",
|
| 183 |
+
"properties": {k: {"type": "number", "enum": [0.0, 0.25, 0.5, 0.75, 1.0]} for k in RK},
|
| 184 |
+
"required": RK, "additionalProperties": False},
|
| 185 |
+
"comment": {"type": "string"}},
|
| 186 |
+
"required": ["scores", "comment"], "additionalProperties": False}
|
| 187 |
+
|
| 188 |
+
msgs = [{"role": "system", "content": JUDGE_SYS}, {"role": "user", "content": prompt}]
|
| 189 |
+
payload = {"model": model, "max_completion_tokens": 4096, "temperature": 0.1,
|
| 190 |
+
"messages": msgs,
|
| 191 |
+
"response_format": {"type": "json_schema",
|
| 192 |
+
"json_schema": {"name": "FBResult", "strict": True, "schema": schema}}}
|
| 193 |
+
h = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
|
| 194 |
+
|
| 195 |
for a in range(3):
|
| 196 |
try:
|
| 197 |
+
print(f" βοΈ Judge call (attempt {a+1})")
|
| 198 |
+
r = requests.post("https://api.openai.com/v1/chat/completions",
|
| 199 |
+
headers=h, json=payload, timeout=180)
|
| 200 |
+
print(f" βοΈ Judge status: {r.status_code}")
|
| 201 |
+
if r.status_code == 429:
|
| 202 |
+
time.sleep(5 * (a + 1)); continue
|
| 203 |
r.raise_for_status()
|
| 204 |
+
c = r.json()["choices"][0]["message"]["content"]
|
| 205 |
if not c:
|
| 206 |
+
if a < 2: time.sleep(2); continue
|
| 207 |
return None
|
| 208 |
+
d = json.loads(_strip(c))
|
| 209 |
if "scores" in d:
|
| 210 |
+
for k in RK:
|
| 211 |
+
if k not in d["scores"]: d["scores"][k] = 0.5
|
| 212 |
+
print(f" β
Judge OK: {d.get('comment','')[:50]}")
|
| 213 |
return d
|
| 214 |
+
except Exception as e:
|
| 215 |
+
print(f" β Judge error: {e}")
|
| 216 |
+
if a < 2: time.sleep(3 * (a + 1))
|
| 217 |
return None
|
| 218 |
|
| 219 |
+
# ββββββββββββββ DB ββββββββββββββ
|
| 220 |
+
|
| 221 |
+
DB = "final_bench.db"
|
| 222 |
+
|
| 223 |
+
def db_init():
|
| 224 |
+
c = sqlite3.connect(DB)
|
| 225 |
+
c.execute("CREATE TABLE IF NOT EXISTS results(rid TEXT, tid TEXT, resp TEXT, jdg TEXT, score REAL, ts REAL, PRIMARY KEY(rid,tid))")
|
| 226 |
+
c.commit(); c.close()
|
| 227 |
+
|
| 228 |
+
def db_save(rid, tid, resp, jdg, score):
|
| 229 |
+
c = sqlite3.connect(DB)
|
| 230 |
+
c.execute("INSERT OR REPLACE INTO results VALUES(?,?,?,?,?,?)", (rid, tid, resp, jdg, score, time.time()))
|
| 231 |
+
c.commit(); c.close()
|
| 232 |
+
|
| 233 |
+
def db_load(rid):
|
| 234 |
+
c = sqlite3.connect(DB)
|
| 235 |
+
rows = c.execute("SELECT tid, resp, jdg, score FROM results WHERE rid=?", (rid,)).fetchall()
|
| 236 |
+
c.close()
|
| 237 |
+
return {r[0]: {"response": r[1], "judge": r[2], "score": r[3]} for r in rows}
|
| 238 |
+
|
| 239 |
+
def db_clear(rid):
|
| 240 |
+
c = sqlite3.connect(DB)
|
| 241 |
+
c.execute("DELETE FROM results WHERE rid=?", (rid,))
|
| 242 |
+
c.commit(); c.close()
|
| 243 |
+
|
| 244 |
+
db_init()
|
| 245 |
+
|
| 246 |
+
# ββββββββββββββ SCORES FILE ββββββββββββββ
|
| 247 |
+
|
| 248 |
+
SF = "final_scores.json"
|
| 249 |
+
|
| 250 |
+
def sf_load():
|
| 251 |
try:
|
| 252 |
+
with open(SF) as f: return json.load(f)
|
| 253 |
+
except: return {"version": "1.1", "bench": "FINAL-Bench/Metacognitive", "updated": "", "models": {}}
|
| 254 |
+
|
| 255 |
+
def sf_save(name, score, dom_scores, ticos_scores, n_total, n_done):
|
| 256 |
+
d = sf_load()
|
| 257 |
+
d["updated"] = datetime.now().isoformat()
|
| 258 |
+
d["models"][name] = {
|
| 259 |
+
"final_score": score, "domain_scores": dom_scores,
|
| 260 |
+
"ticos_scores": ticos_scores, "tasks_total": n_total,
|
| 261 |
+
"tasks_completed": n_done, "evaluated_at": datetime.now().isoformat()}
|
| 262 |
+
with open(SF, "w") as f: json.dump(d, f, indent=2, ensure_ascii=False)
|
| 263 |
return d
|
| 264 |
|
| 265 |
+
def sf_upload(d):
|
| 266 |
+
tk = os.getenv("HF_TOKEN", "")
|
| 267 |
+
if not tk: return "β οΈ HF_TOKEN not set"
|
| 268 |
try:
|
| 269 |
from huggingface_hub import HfApi
|
| 270 |
+
HfApi(token=tk).upload_file(
|
| 271 |
+
path_or_fileobj=json.dumps(d, indent=2, ensure_ascii=False).encode("utf-8"),
|
| 272 |
+
path_in_repo="final_scores.json",
|
| 273 |
+
repo_id="FINAL-Bench/ALL-Bench-Leaderboard", repo_type="dataset",
|
| 274 |
+
commit_message=f"FINAL Score {datetime.now().strftime('%Y-%m-%d %H:%M')}")
|
| 275 |
+
return "β
Uploaded to HF"
|
| 276 |
+
except Exception as e: return f"β Upload: {e}"
|
| 277 |
+
|
| 278 |
+
# ββββββββββββββ EVAL ENGINE ββββββββββββββ
|
| 279 |
|
| 280 |
from concurrent.futures import ThreadPoolExecutor
|
| 281 |
|
| 282 |
+
def eval_one(task, rid, key, jkey, mid, jmodel, atype, state):
|
| 283 |
+
print(f"\n{'='*40}\nπ Evaluating: {task.task_id} ({task.ticos_type})")
|
| 284 |
try:
|
| 285 |
+
# 1. Model response
|
| 286 |
+
resp = call_model(task.prompt, key=key, model=mid, api_type=atype)
|
| 287 |
+
if not resp or resp.startswith("[API_ERROR"):
|
| 288 |
+
print(f" β Model failed: {resp[:100]}")
|
| 289 |
+
db_save(rid, task.task_id, resp or "empty", "{}", 0)
|
| 290 |
+
with state["lock"]:
|
| 291 |
+
state["done"] += 1
|
| 292 |
+
state["errors"].append(task.task_id)
|
| 293 |
+
return task.task_id, {"response": resp, "judge": "{}", "score": 0}
|
| 294 |
+
|
| 295 |
+
# 2. Judge
|
| 296 |
+
jp = make_judge_prompt(task, resp)
|
| 297 |
+
jd = judge(jp, jkey, jmodel)
|
| 298 |
if jd is None:
|
| 299 |
+
print(f" β Judge failed for {task.task_id}")
|
| 300 |
+
jd = {"scores": {k: 0.0 for k in RK}, "comment": "judge_failed", "failed": True}
|
| 301 |
+
|
| 302 |
+
if jd.get("failed"):
|
| 303 |
+
sc = -1.0
|
| 304 |
+
else:
|
| 305 |
+
sc = calc_score(jd["scores"])
|
| 306 |
+
with state["lock"]: state["jok"] += 1
|
| 307 |
+
|
| 308 |
+
jj = json.dumps(jd, ensure_ascii=False)
|
| 309 |
+
db_save(rid, task.task_id, resp, jj, sc)
|
| 310 |
+
print(f" π Score: {sc}")
|
| 311 |
+
|
| 312 |
+
with state["lock"]:
|
| 313 |
+
state["done"] += 1
|
| 314 |
+
ti = TICOS.get(task.ticos_type, {})
|
| 315 |
+
state["active"].append(f'{ti.get("i","π")} {task.task_id} β {sc}')
|
| 316 |
+
if len(state["active"]) > 10:
|
| 317 |
+
state["active"] = state["active"][-10:]
|
| 318 |
+
|
| 319 |
+
return task.task_id, {"response": resp, "judge": jj, "score": sc}
|
| 320 |
+
|
| 321 |
except Exception as e:
|
| 322 |
+
print(f" π₯ Exception: {e}")
|
| 323 |
+
db_save(rid, task.task_id, f"[ERR] {e}", "{}", 0)
|
| 324 |
+
with state["lock"]:
|
| 325 |
+
state["done"] += 1
|
| 326 |
+
state["errors"].append(f"{task.task_id}: {str(e)[:40]}")
|
| 327 |
+
return task.task_id, {"response": f"[ERR] {e}", "judge": "{}", "score": 0}
|
| 328 |
+
|
| 329 |
+
# ββ State ββ
|
| 330 |
+
ST = {
|
| 331 |
+
"running": False, "stop": False, "finished": False,
|
| 332 |
+
"rid": "", "model": "", "done": 0, "total": 0, "cached": 0,
|
| 333 |
+
"errors": [], "active": [], "jok": 0, "t0": 0,
|
| 334 |
+
"results": {}, "tasks": [],
|
| 335 |
+
"lock": threading.Lock(), "msg": "", "csv": None, "hf": "",
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
def st_reset():
|
| 339 |
+
with ST["lock"]:
|
| 340 |
+
ST.update({"running": False, "stop": False, "finished": False,
|
| 341 |
+
"done": 0, "cached": 0, "errors": [], "active": [], "jok": 0,
|
| 342 |
+
"t0": 0, "results": {}, "tasks": [],
|
| 343 |
+
"msg": "", "csv": None, "hf": ""})
|
| 344 |
+
|
| 345 |
+
def bg_eval(key, jkey, mid, mname, jmodel, atype, tasks, rid, nw):
|
| 346 |
+
print(f"\n{'#'*50}")
|
| 347 |
+
print(f"# BG EVAL START: {mname} ({len(tasks)} tasks, {nw} workers)")
|
| 348 |
+
print(f"# API type: {atype}, Model ID: {mid}")
|
| 349 |
+
print(f"{'#'*50}\n")
|
| 350 |
+
|
| 351 |
try:
|
| 352 |
+
cached = db_load(rid)
|
| 353 |
+
nc = sum(1 for t in tasks if t.task_id in cached)
|
| 354 |
+
pending = [t for t in tasks if t.task_id not in cached]
|
| 355 |
+
|
| 356 |
+
with ST["lock"]:
|
| 357 |
+
ST["results"] = cached
|
| 358 |
+
ST["cached"] = nc
|
| 359 |
+
ST["total"] = len(tasks)
|
| 360 |
+
ST["t0"] = time.time()
|
| 361 |
+
|
| 362 |
+
if not pending:
|
| 363 |
+
with ST["lock"]: ST["msg"] = f"πΎ All cached ({nc})"
|
| 364 |
+
finalize(tasks, cached, mname)
|
| 365 |
+
return
|
| 366 |
+
|
| 367 |
+
with ST["lock"]: ST["msg"] = f"β‘ {len(pending)} tasks, {nw} workers"
|
| 368 |
+
print(f"π Pending: {len(pending)}, Cached: {nc}")
|
| 369 |
+
|
| 370 |
+
with ThreadPoolExecutor(max_workers=nw) as exe:
|
| 371 |
+
futs = {}
|
| 372 |
+
for task in pending:
|
| 373 |
+
if ST["stop"]: break
|
| 374 |
+
f = exe.submit(eval_one, task, rid, key, jkey, mid, jmodel, atype, ST)
|
| 375 |
+
futs[f] = task
|
| 376 |
+
|
| 377 |
+
done_set = set()
|
| 378 |
+
while len(done_set) < len(futs):
|
| 379 |
+
if ST["stop"]:
|
| 380 |
+
print("βΉοΈ Stop requested")
|
| 381 |
+
with ST["lock"]:
|
| 382 |
+
ST["msg"] = "βΉοΈ Stopped"
|
| 383 |
+
ST["running"] = False
|
| 384 |
+
ST["finished"] = True
|
| 385 |
return
|
| 386 |
for f in list(futs):
|
| 387 |
+
if f in done_set: continue
|
| 388 |
if f.done():
|
| 389 |
+
done_set.add(f)
|
| 390 |
try:
|
| 391 |
+
tid, data = f.result()
|
| 392 |
+
with ST["lock"]: ST["results"][tid] = data
|
| 393 |
+
except Exception as e:
|
| 394 |
+
print(f"Future error: {e}")
|
| 395 |
time.sleep(0.5)
|
| 396 |
+
|
| 397 |
+
with ST["lock"]: results = dict(ST["results"])
|
| 398 |
+
finalize(tasks, results, mname)
|
| 399 |
+
|
| 400 |
except Exception as e:
|
| 401 |
+
print(f"π₯ BG EVAL CRASH: {e}")
|
| 402 |
+
import traceback; traceback.print_exc()
|
| 403 |
+
with ST["lock"]:
|
| 404 |
+
ST["msg"] = f"β {str(e)[:100]}"
|
| 405 |
+
ST["running"] = False
|
| 406 |
+
ST["finished"] = True
|
| 407 |
+
|
| 408 |
+
def finalize(tasks, results, mname):
|
| 409 |
+
print(f"\nπ Finalizing: {len(results)} results")
|
| 410 |
|
| 411 |
+
ds = {}
|
|
|
|
|
|
|
| 412 |
for dom in set(t.domain for t in tasks):
|
| 413 |
+
v = [results[t.task_id]["score"] for t in tasks
|
| 414 |
+
if t.domain == dom and t.task_id in results and results[t.task_id]["score"] >= 0]
|
| 415 |
+
if v: ds[dom] = round(np.mean(v), 2)
|
| 416 |
+
|
| 417 |
+
ts = {}
|
| 418 |
for tt in set(t.ticos_type for t in tasks):
|
| 419 |
+
v = [results[t.task_id]["score"] for t in tasks
|
| 420 |
+
if t.ticos_type == tt and t.task_id in results and results[t.task_id]["score"] >= 0]
|
| 421 |
+
if v: ts[tt] = round(np.mean(v), 2)
|
| 422 |
+
|
| 423 |
+
av = [results[t.task_id]["score"] for t in tasks
|
| 424 |
+
if t.task_id in results and results[t.task_id]["score"] >= 0]
|
| 425 |
+
fs = round(np.mean(av), 2) if av else 0
|
| 426 |
+
|
| 427 |
+
print(f"π FINAL Score: {fs} ({len(av)}/{len(tasks)} tasks)")
|
| 428 |
+
|
| 429 |
+
sd = sf_save(mname, fs, ds, ts, len(tasks), len(av))
|
| 430 |
+
hf = sf_upload(sd)
|
| 431 |
+
el = int(time.time() - ST["t0"]) if ST["t0"] else 0
|
| 432 |
+
|
| 433 |
+
# CSV
|
| 434 |
+
cp = f"/tmp/fb_{ST['rid']}.csv"
|
| 435 |
+
with open(cp, "w", encoding="utf-8") as f:
|
| 436 |
+
w = csv.writer(f)
|
| 437 |
+
w.writerow(["task_id","domain","grade","ticos","difficulty","title","score","comment"])
|
| 438 |
+
tm = {t.task_id: t for t in tasks}
|
| 439 |
+
for tid, d in sorted(results.items()):
|
| 440 |
+
t = tm.get(tid)
|
| 441 |
+
if not t: continue
|
| 442 |
+
try: jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else {}
|
| 443 |
+
except: jd = {}
|
| 444 |
+
w.writerow([tid, t.domain, t.grade, t.ticos_type, t.difficulty, t.title,
|
| 445 |
+
d["score"], (jd.get("comment","") if isinstance(jd,dict) else "")[:200]])
|
| 446 |
+
|
| 447 |
+
with ST["lock"]:
|
| 448 |
+
ST["csv"] = cp
|
| 449 |
+
ST["hf"] = hf
|
| 450 |
+
ST["msg"] = f"π FINAL Score = {fs} ({el}s, {len(av)}/{len(tasks)})"
|
| 451 |
+
ST["running"] = False
|
| 452 |
+
ST["finished"] = True
|
| 453 |
+
|
| 454 |
+
print(f"β
Done: FINAL Score = {fs}")
|
| 455 |
+
|
| 456 |
+
# ββββββββββββββ UI CALLBACKS ββββββββββββββ
|
| 457 |
+
|
| 458 |
+
def do_start(model, api_type, eval_key, judge_key, judge_model, diff, max_t, workers, fresh):
|
| 459 |
+
print(f"\nπ START clicked: model={model}, api={api_type}, fresh={fresh}")
|
| 460 |
+
|
| 461 |
+
if ST["running"]:
|
| 462 |
+
return "β οΈ Already running"
|
| 463 |
+
|
| 464 |
+
eval_key = (eval_key or "").strip() or os.getenv("HF_TOKEN", "")
|
| 465 |
+
judge_key = (judge_key or "").strip() or os.getenv("OPENAI_API_KEY", "")
|
| 466 |
+
|
| 467 |
+
if not eval_key:
|
| 468 |
+
print("β No eval key")
|
| 469 |
+
return "β API Key needed"
|
| 470 |
+
if not judge_key:
|
| 471 |
+
print("β No judge key")
|
| 472 |
+
return "β Judge Key needed"
|
| 473 |
+
|
| 474 |
+
print(f" Keys: eval={eval_key[:8]}... judge={judge_key[:8]}...")
|
| 475 |
+
|
| 476 |
+
if api_type == "HuggingFace Inference":
|
| 477 |
+
mid = HF_MODELS.get(model, model)
|
| 478 |
+
at = "hf"
|
| 479 |
+
else:
|
| 480 |
+
mid = OAI_MODELS.get(model, model)
|
| 481 |
+
at = "openai"
|
| 482 |
+
|
| 483 |
+
tasks = TASKS[:]
|
| 484 |
+
if diff != "μ 체":
|
| 485 |
+
tasks = [t for t in tasks if t.difficulty == diff]
|
| 486 |
+
tasks = tasks[:int(max_t)]
|
| 487 |
+
|
| 488 |
+
print(f" Model ID: {mid}, Tasks: {len(tasks)}")
|
| 489 |
+
|
| 490 |
+
rid = hashlib.md5(f"FB_{mid}".encode()).hexdigest()[:12]
|
| 491 |
+
if fresh:
|
| 492 |
+
db_clear(rid)
|
| 493 |
+
print(" ποΈ Cache cleared")
|
| 494 |
+
|
| 495 |
+
st_reset()
|
| 496 |
+
with ST["lock"]:
|
| 497 |
+
ST["running"] = True
|
| 498 |
+
ST["rid"] = rid
|
| 499 |
+
ST["model"] = model
|
| 500 |
+
ST["tasks"] = tasks
|
| 501 |
+
ST["total"] = len(tasks)
|
| 502 |
+
|
| 503 |
+
thread = threading.Thread(
|
| 504 |
+
target=bg_eval,
|
| 505 |
+
args=(eval_key, judge_key, mid, model, judge_model, at, tasks, rid, int(workers)),
|
| 506 |
+
daemon=True)
|
| 507 |
+
thread.start()
|
| 508 |
+
print(f" π§΅ Thread started")
|
| 509 |
+
|
| 510 |
+
return f"𧬠{model} started ({len(tasks)} tasks, {int(workers)} workers)"
|
| 511 |
+
|
| 512 |
+
def do_stop():
    """Request cancellation of the evaluation currently in flight.

    Sets the shared stop flag; the background worker observes it between
    tasks.  Returns a short status string for the UI.
    """
    if not ST["running"]:
        return "Not running"
    ST["stop"] = True
    return "βΉοΈ Stopping..."
def do_poll():
    """Render the current run state for the polling Timer.

    Returns a 4-tuple consumed by the Timer's outputs:
    ``(progress_html, results_table_html, summary_html, csv_path_or_None)``.
    """
    # Snapshot the shared state under the lock so the rendering below works
    # on a consistent view while the background thread keeps writing.
    with ST["lock"]:
        running = ST["running"]
        finished = ST["finished"]
        tasks = ST.get("tasks", [])
        results = dict(ST.get("results", {}))
        msg = ST.get("msg", "")
        csvp = ST.get("csv")

    # Nothing has ever run in this session: show the idle hint only.
    if not running and not finished and not results:
        return ("βΉοΈ Select model β press βΆοΈ Start", "", "", None)

    # Progress bar
    if running:
        # NOTE(review): these ST reads happen outside the lock — benign for
        # display purposes, but values may be mid-update; confirm acceptable.
        dn = ST["done"]
        tot = ST.get("total", 1)
        pct = min(int(dn / max(tot, 1) * 100), 100)
        el = int(time.time() - ST.get("t0", time.time()))
        # Naive linear ETA: elapsed-per-done-task times remaining tasks.
        eta = int((el / max(dn, 1)) * (tot - dn)) if dn > 0 else 0
        active = ST.get("active", [])
        jok = ST.get("jok", 0)
        errs = ST.get("errors", [])

        # Chips for the (up to six) most recently active task ids.
        tags = " ".join([f'<span style="background:#ede9fe;padding:2px 6px;border-radius:4px;'
                         f'font-size:12px">{a}</span>' for a in active[-6:]])
        err_html = ""
        if errs:
            # Only surface the three most recent errors to keep the bar compact.
            err_html = f'<div style="color:#dc2626;margin-top:6px;font-size:12px">β οΈ Errors: {", ".join(errs[-3:])}</div>'

        prog = f"""<div style="padding:12px;background:#fafafa;border-radius:8px;border:1px solid #e5e7eb">
<div style="display:flex;justify-content:space-between;margin-bottom:6px">
<span style="font-size:14px">𧬠{dn}/{tot} Β· {el}s Β· ETA ~{eta}s Β· Judge β {jok}</span>
<span style="font-weight:700;color:#7c3aed;font-size:16px">{pct}%</span>
</div>
<div style="background:#e5e7eb;border-radius:8px;height:24px;overflow:hidden">
<div style="width:{pct}%;height:100%;border-radius:8px;background:linear-gradient(90deg,#7c3aed,#6366f1);transition:width 0.3s"></div>
</div>
<div style="margin-top:8px">{tags}</div>{err_html}
</div>"""

    elif finished:
        prog = f'<div style="background:#f0fdf4;padding:16px;border-radius:8px;font-weight:700;border-left:4px solid #16a34a;font-size:16px">π {msg}</div>'
    else:
        prog = f'<div style="padding:12px">{msg}</div>'

    # Results table
    tbl = ""
    if tasks:
        rows = ""
        for t in tasks:
            ti = TICOS.get(t.ticos_type, {"i": "π", "n": t.ticos_type})
            if t.task_id in results:
                s = results[t.task_id]["score"]
                if s < 0:
                    # Negative score is the sentinel for "judge call failed".
                    rows += f'<tr style="background:#fef3c7"><td>{t.task_id}</td><td>{ti["i"]}</td><td>{t.domain}</td><td>{ti["n"]}</td><td>{t.difficulty}</td><td style="color:#f59e0b;font-weight:700">β Judge failed</td></tr>'
                else:
                    # Colour thresholds: green >= 80, amber >= 60, red below.
                    c = "#22c55e" if s >= 80 else ("#f59e0b" if s >= 60 else "#ef4444")
                    rows += f'<tr><td>{t.task_id}</td><td>{ti["i"]}</td><td>{t.domain}</td><td>{ti["n"]}</td><td>{t.difficulty}</td><td><div style="display:flex;align-items:center;gap:6px"><div style="background:#e5e7eb;border-radius:6px;height:16px;width:80px;overflow:hidden"><div style="width:{min(s,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="color:{c};font-weight:700;font-size:12px">{s:.1f}</span></div></td></tr>'
            else:
                # Task queued but not yet scored: dimmed placeholder row.
                rows += f'<tr style="opacity:0.4"><td>{t.task_id}</td><td>{ti["i"]}</td><td>{t.domain}</td><td>{ti["n"]}</td><td>{t.difficulty}</td><td>β³</td></tr>'
        tbl = f'<table style="width:100%;border-collapse:collapse;font-size:13px"><thead><tr style="background:#f1f5f9"><th style="padding:8px;text-align:left">ID</th><th></th><th>Domain</th><th>TICOS</th><th>Diff</th><th>Score</th></tr></thead><tbody>{rows}</tbody></table>'

    # Summary
    sm = ""
    if finished and tasks:
        # Valid (judge-succeeded) scores only; failed tasks (< 0) excluded.
        av = [results[t.task_id]["score"] for t in tasks
              if t.task_id in results and results[t.task_id]["score"] >= 0]
        fs = round(np.mean(av), 2) if av else 0

        # Domain bars
        dh = ""
        for dom in sorted(set(t.domain for t in tasks)):
            v = [results[t.task_id]["score"] for t in tasks
                 if t.domain == dom and t.task_id in results and results[t.task_id]["score"] >= 0]
            if v:
                a = round(np.mean(v), 1)
                c = "#22c55e" if a >= 80 else ("#f59e0b" if a >= 60 else "#ef4444")
                dh += f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:180px;font-size:13px">{dom}</span><div style="flex:1;background:#334155;border-radius:6px;height:14px;overflow:hidden"><div style="width:{min(a,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c};font-size:13px">{a}</span></div>'

        # TICOS bars
        th = ""
        for tt, info in TICOS.items():
            v = [results[t.task_id]["score"] for t in tasks
                 if t.ticos_type == tt and t.task_id in results and results[t.task_id]["score"] >= 0]
            if v:
                a = round(np.mean(v), 1)
                c = "#22c55e" if a >= 80 else ("#f59e0b" if a >= 60 else "#ef4444")
                th += f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0"><span style="width:160px;font-size:13px">{info["i"]} {info["n"]}</span><div style="flex:1;background:#334155;border-radius:6px;height:14px;overflow:hidden"><div style="width:{min(a,100)}%;height:100%;background:{c};border-radius:6px"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c};font-size:13px">{a}</span></div>'

        sm = f"""<div style="background:linear-gradient(135deg,#1e1b4b,#312e81);border-radius:14px;padding:24px;color:#fff;margin:8px 0">
<h2 style="margin:0;font-size:28px;text-align:center">𧬠FINAL Score: {fs} / 100</h2>
<p style="text-align:center;color:#a5b4fc;margin:8px 0">{ST.get("model","")} Β· {len(av)} tasks</p>
<hr style="border-color:#4338ca;margin:16px 0">
<h4 style="color:#a5b4fc;margin:8px 0">π Domains</h4>{dh}
<hr style="border-color:#4338ca;margin:16px 0">
<h4 style="color:#a5b4fc;margin:8px 0">𧬠TICOS Types</h4>{th}
<hr style="border-color:#4338ca;margin:16px 0">
<p style="font-size:12px;color:#818cf8">{ST.get("hf","")}</p></div>"""

    return (prog, tbl, sm, csvp)
def update_models(api_type):
    """Swap the model dropdown's choices to match the selected API backend."""
    catalog = HF_MODELS if api_type == "HuggingFace Inference" else OAI_MODELS
    names = list(catalog.keys())
    # First catalog entry becomes the new default selection.
    return gr.update(choices=names, value=names[0])
# ββββββββββββββ GRADIO APP ββββββββββββββ

# Static banner injected at the top of the Blocks UI via gr.HTML.
# NOTE(review): several characters below appear mojibake'd (multi-byte UTF-8
# decoded as a single-byte codepage) — confirm the file's encoding before
# editing these literals; they are reproduced verbatim here.
HEADER_HTML = """<div style="text-align:center;padding:16px 0">
<h1 style="margin:0;font-size:28px">𧬠FINAL Bench Auto-Evaluator v1.1</h1>
<h2 style="margin:4px 0;color:#6b7280;font-size:16px">Metacognitive Intelligence Β· 100 Tasks Β· TICOS Scoring</h2>
<p style="color:#9ca3af;font-size:13px;max-width:700px;margin:8px auto;line-height:1.6">
π <b>FINAL-Bench/Metacognitive</b> 100 tasks Β· 15 domains Β· 8 TICOS types<br>
𧬠<b>TICOS</b>: Trap · Insight · Confidence · Self-Correction · Synthesis<br>
π‘ HF Inference API (open-source) + π OpenAI (closed) β βοΈ GPT-5.2 Judge<br>
π β <code>final_scores.json</code> β ALL Bench Metacog column</p></div>"""
def create_app():
    """Build and return the Gradio Blocks application.

    Lays out the configuration controls, wires the start/stop handlers, and
    installs a 2-second Timer that polls ``do_poll`` for progress/results.
    """
    # theme/css are gr.Blocks() constructor arguments — they are NOT accepted
    # by launch() on Gradio 4/5, so they must be applied here to take effect.
    with gr.Blocks(title="FINAL Bench Evaluator",
                   theme=gr.themes.Soft(),
                   css=".gradio-container{max-width:1100px !important}") as app:
        gr.HTML(HEADER_HTML)

        with gr.Row():
            api_type = gr.Radio(
                ["HuggingFace Inference", "OpenAI Compatible"],
                value="HuggingFace Inference", label="π‘ API Type", scale=2)
            model_dd = gr.Dropdown(
                list(HF_MODELS.keys()), value=list(HF_MODELS.keys())[0],
                label="π€ Target Model", scale=3, allow_custom_value=True)

        # Switching the backend repopulates the model dropdown.
        api_type.change(update_models, [api_type], [model_dd])

        with gr.Row():
            eval_key = gr.Textbox(
                label="π Eval API Key (HF Token or OpenAI)",
                type="password", placeholder="hf_... or sk-...",
                value=os.getenv("HF_TOKEN", ""), scale=3)
            judge_key = gr.Textbox(
                label="βοΈ Judge Key (OpenAI)",
                type="password", placeholder="sk-...",
                value=os.getenv("OPENAI_API_KEY", ""), scale=3)

        with gr.Row():
            judge_model = gr.Textbox(label="βοΈ Judge Model", value="gpt-5.2", scale=2)
            diff_dd = gr.Dropdown(
                ["μ 체", "expert", "frontier"],
                value="μ 체", label="Difficulty", scale=1)
            max_tasks = gr.Slider(1, 100, value=100, step=1, label="Max Tasks", scale=2)
            workers = gr.Slider(1, 20, value=10, step=1, label="β‘ Workers", scale=1)

        with gr.Row():
            start_btn = gr.Button("βΆοΈ Start (Resume)", variant="primary", size="lg", scale=2)
            fresh_btn = gr.Button("π Fresh Start", variant="secondary", size="lg", scale=2)
            stop_btn = gr.Button("βΉοΈ Stop", variant="stop", size="lg", scale=1)

        status = gr.Textbox(label="Status", interactive=False, max_lines=1)

        with gr.Accordion("π Existing FINAL Scores", open=False):
            gr.JSON(value=sf_load(), label="final_scores.json")

        with gr.Tabs():
            with gr.Tab("π Progress"):
                prog_html = gr.HTML()
            with gr.Tab("π Results"):
                table_html = gr.HTML()
            with gr.Tab("π Summary"):
                summary_html = gr.HTML()
            with gr.Tab("πΎ CSV"):
                csv_file = gr.File(label="CSV Download")

        # Timer for polling: refresh every 2 s regardless of user activity.
        timer = gr.Timer(value=2, active=True)
        timer.tick(fn=do_poll, outputs=[prog_html, table_html, summary_html, csv_file])

        # Button handlers share one input list; `fresh` is bound per button.
        inputs = [model_dd, api_type, eval_key, judge_key, judge_model,
                  diff_dd, max_tasks, workers]

        start_btn.click(
            fn=lambda *a: do_start(*a, fresh=False),
            inputs=inputs, outputs=[status])
        fresh_btn.click(
            fn=lambda *a: do_start(*a, fresh=True),
            inputs=inputs, outputs=[status])
        stop_btn.click(fn=do_stop, outputs=[status])

        gr.Markdown(f"""---
<center>𧬠FINAL Bench Auto-Evaluator v1.1 · Apache 2.0 · Ginigen AI<br>
Data: <a href="https://huggingface.co/datasets/FINAL-Bench/Metacognitive">FINAL-Bench/Metacognitive</a> ({len(TASKS)} tasks)<br>
β ALL Bench Leaderboard Metacog auto-sync</center>""")

    return app
if __name__ == "__main__":
|
| 711 |
+
stats = {}
|
| 712 |
+
for t in TASKS:
|
| 713 |
+
stats[t.ticos_type] = stats.get(t.ticos_type, 0) + 1
|
| 714 |
+
print(f"FINAL Bench Evaluator: {len(TASKS)} tasks")
|
| 715 |
+
for tt, n in sorted(stats.items()):
|
| 716 |
+
info = TICOS.get(tt, {"i": "?", "n": tt})
|
| 717 |
+
print(f" {info['i']} {tt}: {n}")
|
| 718 |
+
|
| 719 |
+
app = create_app()
|
| 720 |
app.queue(default_concurrency_limit=2)
|
| 721 |
+
app.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False,
|
| 722 |
+
theme=gr.themes.Soft(),
|
| 723 |
+
css=".gradio-container{max-width:1100px !important}")
|