seawolf2357 commited on
Commit
917990a
Β·
verified Β·
1 Parent(s): 1c0eff0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +377 -919
app.py CHANGED
@@ -1,35 +1,31 @@
1
  """
2
- FINAL Bench Auto-Evaluator v1.0 β€” ALL Bench μ—°λ™μš©
3
- ===================================================
4
- HF Inference API둜 λͺ¨λΈ 평가 β†’ OpenAI GPT Judge 채점 β†’ final_scores.json 좜λ ₯
5
 
6
- - 피평가: HuggingFace Inference API (μ˜€ν”ˆμ†ŒμŠ€ λͺ¨λΈ) + OpenAI/기타 API (ν΄λ‘œμ¦ˆλ“œ λͺ¨λΈ)
7
- - μ‹¬νŒ: OpenAI GPT-5.2 (Structured Output)
8
- - 좜λ ₯: final_scores.json (ALL Bench index.htmlμ—μ„œ fetch)
 
9
 
10
- Author: Ginigen AI (μ§€λ‹ˆμ  AI) Β· FINAL-Bench
11
- License: Apache 2.0
12
  """
13
 
14
- import json, os, time, csv, io, re, html, hashlib, sqlite3, threading
15
  from datetime import datetime
16
  from dataclasses import dataclass, field, asdict
17
- from typing import List, Dict, Optional
 
18
  import requests
19
  import numpy as np
20
- import pandas as pd
21
  import gradio as gr
22
 
23
- # ════════════════════════════════════════════════════════════════
24
- # PART 1: 벀치마크 데이터 ꡬ쑰 + 루브릭
25
- # ════════════════════════════════════════════════════════════════
26
-
27
  PILLAR_INFO = {
28
- "P1_Emergence": {"name": "μ°½λ°œμ„±", "icon": "✦", "color": "#FF6B35", "weight": 0.20},
29
- "P2_Metacognition": {"name": "메타인지", "icon": "β—‰", "color": "#7B2FF7", "weight": 0.25},
30
- "P3_SelfEvolution": {"name": "μžκ°€μ§„ν™”", "icon": "β—ˆ", "color": "#00B4D8", "weight": 0.15},
31
- "P4_Orchestration": {"name": "닀쀑지λŠ₯", "icon": "β—¬", "color": "#2EC4B6", "weight": 0.15},
32
- "P5_SynergyAntagonism":{"name": "상생상극", "icon": "☯", "color": "#E63946", "weight": 0.25},
33
  }
34
 
35
  @dataclass
@@ -37,970 +33,432 @@ class EvalTask:
37
  task_id: str; pillar: str; sub_dimension: str; difficulty: str
38
  prompt: str; context: Optional[str] = None; expected_behavior: Optional[str] = None
39
  scoring_rubric: Dict = field(default_factory=dict); metadata: Dict = field(default_factory=dict)
40
- def to_dict(self): return asdict(self)
41
-
42
- def load_tasks_from_parquet(path="full_v2.parquet"):
43
- df = pd.read_parquet(path)
44
- tasks = []
45
- for _, row in df.iterrows():
46
- rubric = row["scoring_rubric"]
47
- if isinstance(rubric, str):
48
- rubric = json.loads(rubric)
49
- meta = row.get("metadata") or {}
50
- if isinstance(meta, str):
51
- try: meta = json.loads(meta)
52
- except: meta = {}
53
- tasks.append(EvalTask(
54
- task_id=row["task_id"], pillar=row["pillar"],
55
- sub_dimension=row["sub_dimension"], difficulty=row["difficulty"],
56
- prompt=row["prompt"], context=row.get("context"),
57
- expected_behavior=row.get("expected_behavior"),
58
- scoring_rubric=rubric, metadata=meta,
59
- ))
60
- return tasks
61
-
62
- ALL_TASKS = load_tasks_from_parquet()
63
-
64
- # ════════════════════════════════════════════════════════════════
65
- # PART 2: ALL Bench λͺ¨λΈ λͺ©λ‘ (HF Inference API 지원)
66
- # ════════════════════════════════════════════════════════════════
67
-
68
- # HF Inference API둜 평가 κ°€λŠ₯ν•œ ALL Bench λ“±μž¬ λͺ¨λΈ
69
- HF_MODELS = {
70
- # ── Open-Source (HF Inference API) ──
71
- "Qwen3.5-397B": "Qwen/Qwen3.5-397B-A17B",
72
- "Qwen3.5-122B": "Qwen/Qwen3.5-122B-A10B",
73
- "Qwen3.5-27B": "Qwen/Qwen3.5-27B",
74
- "Qwen3.5-35B": "Qwen/Qwen3.5-35B-A3B",
75
- "Qwen3.5-9B": "Qwen/Qwen3.5-9B",
76
- "Qwen3.5-4B": "Qwen/Qwen3.5-4B",
77
- "DeepSeek V3.2": "deepseek-ai/DeepSeek-V3-0324",
78
- "DeepSeek R1": "deepseek-ai/DeepSeek-R1",
79
- "Llama 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
80
- "Llama 4 Maverick": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
81
- "Phi-4": "microsoft/phi-4",
82
- "Mistral Large 3": "mistralai/Mistral-Large-Instruct-2501",
83
- "Qwen3-Next-80B": "Qwen/Qwen3-Next-80B-A3B-Thinking",
84
- }
85
 
86
- # OpenAI-compatible API λͺ¨λΈ (별도 API ν‚€ ν•„μš”)
87
- OPENAI_MODELS = {
88
- "GPT-5.2": "gpt-5.2",
89
- "GPT-5.4": "gpt-5.4",
90
- "GPT-5.1": "gpt-5.1",
91
- "GPT-5.3 Codex": "gpt-5.3-codex",
92
- }
93
 
94
- # ALL Bench ν‘œμ‹œλͺ… β†’ FINAL Score ν‚€ λ§€ν•‘
95
- MODEL_DISPLAY_NAMES = {
96
- **{k: k for k in HF_MODELS},
97
- **{k: k for k in OPENAI_MODELS},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  }
99
 
100
- # ════════════════════════════════════════════════════════════════
101
- # PART 3: LLM 호좜 β€” HF Inference API + OpenAI
102
- # ════════════════════════════════════════════════════════════════
103
 
104
- def _strip_think_tags(text):
105
  if not text: return text
106
- text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
107
- text = re.sub(r'<thinking>.*?</thinking>', '', text, flags=re.DOTALL)
108
- text = re.sub(r'<reasoning>.*?</reasoning>', '', text, flags=re.DOTALL)
109
- text = re.sub(r'<reflection>.*?</reflection>', '', text, flags=re.DOTALL)
110
  return text.strip()
111
 
112
-
113
- def call_llm_hf(prompt, system="", api_key="", model_id="Qwen/Qwen3.5-397B-A17B",
114
- max_tokens=4096, temperature=0.6):
115
- """HuggingFace Inference API (OpenAI-compatible) 호좜"""
116
- messages = []
117
- if system:
118
- messages.append({"role": "system", "content": system})
119
- messages.append({"role": "user", "content": prompt})
120
-
121
- payload = {
122
- "model": model_id,
123
- "messages": messages,
124
- "max_tokens": max_tokens,
125
- "temperature": temperature,
126
- "stream": False,
127
- }
128
- headers = {
129
- "Content-Type": "application/json",
130
- "Authorization": f"Bearer {api_key}",
131
- }
132
-
133
  for attempt in range(3):
134
  try:
135
- r = requests.post(
136
- f"https://router.huggingface.co/hf-inference/models/{model_id}/v1/chat/completions",
137
- headers=headers, json=payload, timeout=120,
138
- )
139
- if r.status_code == 429:
140
- time.sleep(5 * (attempt + 1)); continue
141
- if r.status_code == 503:
142
- # Model loading
143
- time.sleep(10 * (attempt + 1)); continue
144
- r.raise_for_status()
145
- content = r.json()["choices"][0]["message"]["content"]
146
- content = _strip_think_tags(content)
147
- return content
148
- except Exception as e:
149
- if attempt < 2:
150
- time.sleep(3 * (attempt + 1))
151
- else:
152
- return f"[API_ERROR] HF Inference: {e}"
153
-
154
-
155
- def call_llm_openai(prompt, system="", api_key="", model="gpt-5.2",
156
- max_tokens=4096, temperature=0.6, base_url="https://api.openai.com/v1"):
157
- """OpenAI-compatible API 호좜 (GPT, Claude λ“±)"""
158
- messages = []
159
- if system:
160
- messages.append({"role": "system", "content": system})
161
- messages.append({"role": "user", "content": prompt})
162
-
163
- payload = {
164
- "model": model,
165
- "messages": messages,
166
- "max_tokens": max_tokens,
167
- "temperature": temperature,
168
- }
169
- headers = {
170
- "Content-Type": "application/json",
171
- "Authorization": f"Bearer {api_key}",
172
- }
173
-
174
- for attempt in range(2):
175
- try:
176
- r = requests.post(
177
- f"{base_url}/chat/completions",
178
- headers=headers, json=payload, timeout=120,
179
- )
180
- if r.status_code == 429:
181
- time.sleep(5 * (attempt + 1)); continue
182
  r.raise_for_status()
183
- content = r.json()["choices"][0]["message"]["content"]
184
- content = _strip_think_tags(content)
185
- return content
186
  except Exception as e:
187
- if attempt < 1:
188
- time.sleep(3)
189
- else:
190
- return f"[API_ERROR] OpenAI: {e}"
191
-
192
-
193
- def call_llm(prompt, system="", api_key="", model_id="", api_type="hf",
194
- max_tokens=4096, temperature=0.6):
195
- """톡합 LLM 호좜 래퍼"""
196
- if api_type == "openai":
197
- return call_llm_openai(prompt, system, api_key, model_id, max_tokens, temperature)
198
- else:
199
- return call_llm_hf(prompt, system, api_key, model_id, max_tokens, temperature)
200
-
201
-
202
- # ════════════════════════════════════════════════════════════════
203
- # PART 4: Judge (OpenAI Structured Output) β€” 원본 μ½”λ“œ μœ μ§€
204
- # ════════════════════════════════════════════════════════════════
205
-
206
- def _build_judge_schema(rubric_keys):
207
- score_props = {}
208
- for k in rubric_keys:
209
- score_props[k] = {"type": "number", "enum": [0.0, 0.25, 0.5, 0.75, 1.0]}
210
- return {
211
- "type": "object",
212
- "properties": {
213
- "scores": {
214
- "type": "object", "properties": score_props,
215
- "required": list(rubric_keys), "additionalProperties": False,
216
- },
217
- "comment": {"type": "string"}
218
- },
219
- "required": ["scores", "comment"], "additionalProperties": False,
220
- }
221
-
222
-
223
- def call_judge_structured(prompt, system="", api_key="", model="gpt-5.2",
224
- rubric_keys=None, temperature=0.1, max_tokens=4096):
225
- if not rubric_keys:
226
- return {"scores": {}, "comment": "λ£¨λΈŒλ¦­ν‚€ μ—†μŒ"}
227
- messages = []
228
- if system:
229
- messages.append({"role": "system", "content": system})
230
- messages.append({"role": "user", "content": prompt})
231
- schema = _build_judge_schema(rubric_keys)
232
- payload = {
233
- "model": model, "max_completion_tokens": max_tokens,
234
- "temperature": temperature, "messages": messages,
235
- "response_format": {
236
- "type": "json_schema",
237
- "json_schema": {"name": "JudgeResult", "strict": True, "schema": schema}
238
- }
239
- }
240
- headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
241
-
242
  for attempt in range(3):
243
  try:
244
- r = requests.post("https://api.openai.com/v1/chat/completions",
245
- headers=headers, json=payload, timeout=180)
246
- if r.status_code == 429:
247
- time.sleep(5 * (attempt + 1)); continue
248
  r.raise_for_status()
249
- content = r.json()["choices"][0]["message"]["content"]
250
  if not content:
251
- if attempt < 2: time.sleep(2); continue
252
  return None
253
- if "<think>" in content:
254
- content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
255
  data = json.loads(content)
256
- if "scores" in data and isinstance(data["scores"], dict):
257
  for k in rubric_keys:
258
  if k not in data["scores"]: data["scores"][k] = 0.5
259
- return {"scores": data["scores"], "comment": data.get("comment", "structured_ok")}
260
- except json.JSONDecodeError:
261
- if attempt < 2: time.sleep(2); continue
262
- return None
263
- except Exception:
264
- if attempt < 2: time.sleep(3 * (attempt + 1)); continue
265
  return None
266
  return None
267
 
268
-
269
- JUDGE_SYSTEM = """You are a FINAL Bench scoring judge. Score each rubric item using ONLY these values: 0.0, 0.25, 0.5, 0.75, 1.0.
270
-
271
- Scoring criteria:
272
- - 1.0: Excellent, fully meets the rubric
273
- - 0.75: Good, mostly meets with minor gaps
274
- - 0.5: Average, partially meets
275
- - 0.25: Below average, significant gaps
276
- - 0.0: Fails to meet the rubric
277
-
278
- Evaluate the response as-is. Judge the substance and final answer quality.
279
- Output a JSON object with "scores" and "comment" (1-sentence Korean summary).
280
- Every rubric key MUST appear in scores."""
281
-
282
-
283
- def build_judge_prompt(task, response):
284
- rubric = task.scoring_rubric
285
- expected = task.expected_behavior or "N/A"
286
- keys = list(rubric.keys())
287
- skeleton = ", ".join([f'"{k}": ___' for k in keys])
288
- rubric_lines = "\n".join([f' "{k}": {v["desc"]}' for k, v in rubric.items()])
289
- return f"""Task: {task.task_id} | {task.pillar} | {task.difficulty}
290
- Prompt: {task.prompt[:800]}
291
- Expected: {expected[:300]}
292
- Response to judge: {response[:8000]}
293
-
294
- Rubric items to score (each 0.0~1.0):
295
- {rubric_lines}
296
-
297
- Fill in the scores and output ONLY this JSON (replace ___ with 0.0/0.25/0.5/0.75/1.0):
298
- {{"scores": {{{skeleton}}}, "comment": "ν•œμ€„ 평가"}}"""
299
-
300
-
301
- def parse_judge_response(text, rubric_keys):
302
- """6단계 λ°©μ–΄ νŒŒμ„œ"""
303
- if not text or text.startswith("[API_ERROR"):
304
- return {"scores": {k: 0.0 for k in rubric_keys}, "comment": "API였λ₯˜", "failed": True}
305
-
306
- cleaned = _strip_think_tags(text)
307
- cleaned = re.sub(r'```(?:json)?\s*', '', cleaned)
308
- cleaned = re.sub(r'```\s*$', '', cleaned)
309
- cleaned = cleaned.strip()
310
-
311
- def _validate(scores):
312
- result = {}
313
- for k in rubric_keys:
314
- v = scores.get(k)
315
- if v is not None:
316
- try: result[k] = min(max(float(v), 0.0), 1.0)
317
- except: result[k] = 0.5
318
- else: result[k] = 0.5
319
- return result
320
-
321
- # Pattern 1: standard JSON
322
- try:
323
- brace_depth = 0; start = -1
324
- for i, c in enumerate(cleaned):
325
- if c == '{':
326
- if brace_depth == 0: start = i
327
- brace_depth += 1
328
- elif c == '}':
329
- brace_depth -= 1
330
- if brace_depth == 0 and start >= 0:
331
- data = json.loads(cleaned[start:i+1])
332
- if "scores" in data:
333
- return {"scores": _validate(data["scores"]), "comment": data.get("comment", "")}
334
- except: pass
335
-
336
- # Pattern 2: regex
337
- try:
338
- m = re.search(r'"scores"\s*:\s*\{([^}]+)\}', cleaned, re.DOTALL)
339
- if m:
340
- pairs = re.findall(r'"([^"]+)"\s*:\s*([\d.]+)', '{' + m.group(1) + '}')
341
- if pairs:
342
- raw = {k: float(v) for k, v in pairs}
343
- validated = _validate(raw)
344
- if any(v != 0.5 for v in validated.values()):
345
- return {"scores": validated, "comment": "νŒ¨ν„΄2"}
346
- except: pass
347
-
348
- return {"scores": {k: 0.0 for k in rubric_keys}, "comment": "νŒŒμ‹±μ‹€νŒ¨", "failed": True}
349
-
350
-
351
- def compute_weighted_score(scores, rubric):
352
- return round(sum(scores.get(k, 0.5) * v["weight"] for k, v in rubric.items()) * 100, 2)
353
-
354
-
355
- # ════════════════════════════════════════════════════════════════
356
- # PART 5: 닀쀑 λΌμš΄λ“œ 과제 μ‹€ν–‰κΈ°
357
- # ════════════════════════════════════════════════════════════════
358
-
359
- def _run_mutual_verification(topic, api_key, model_id, api_type):
360
- rounds = []
361
- r1 = call_llm(f"[R1-상생] '{topic}'에 λŒ€ν•΄ 500단어 뢄석 λ³΄κ³ μ„œλ₯Ό μž‘μ„±ν•˜μ„Έμš”.",
362
- api_key=api_key, model_id=model_id, api_type=api_type)
363
- rounds.append(f"[R1-상생]\n{r1}")
364
- r2 = call_llm(f"[R2-상극] μ•„λž˜ λ³΄κ³ μ„œλ₯Ό λƒ‰μ² ν•˜κ²Œ λΉ„νŒν•˜μ„Έμš”.\n--- 원문 ---\n{r1[:2000]}",
365
- api_key=api_key, model_id=model_id, api_type=api_type)
366
- rounds.append(f"[R2-상극]\n{r2}")
367
- r3 = call_llm(f"[R3-μˆ˜μ •] λΉ„νŒμ„ λ°˜μ˜ν•˜μ—¬ μˆ˜μ •ν•˜μ„Έμš”.\n--- 원문 ---\n{r1[:1500]}\n--- λΉ„νŒ ---\n{r2[:1500]}",
368
- api_key=api_key, model_id=model_id, api_type=api_type)
369
- rounds.append(f"[R3-μˆ˜μ •]\n{r3}")
370
- r4 = call_llm(f"[R4-메타] 3λΌμš΄λ“œ 메타 뢄석:\n--- R1 ---\n{r1[:800]}\n--- R2 ---\n{r2[:800]}\n--- R3 ---\n{r3[:800]}",
371
- api_key=api_key, model_id=model_id, api_type=api_type)
372
- rounds.append(f"[R4-메타]\n{r4}")
373
- return "\n\n".join(rounds)
374
-
375
-
376
- def _run_feedback(prompt_json, api_key, model_id, api_type):
377
- try: data = json.loads(prompt_json)
378
- except: return call_llm(prompt_json, api_key=api_key, model_id=model_id, api_type=api_type)
379
- topic = data.get("topic", "")
380
- rounds_spec = data.get("rounds", [])
381
- outputs, prev = [], ""
382
- for i, rd in enumerate(rounds_spec):
383
- instruction = rd.get("instruction", "")
384
- feedback = rd.get("feedback")
385
- if i == 0: p = f"'{topic}' β€” {instruction}."
386
- elif feedback: p = f"ν”Όλ“œλ°± 반영: {instruction}.\n--- 이전 ---\n{prev[:2000]}\n--- ν”Όλ“œλ°± ---\n{feedback}"
387
- else: p = f"{instruction}.\n--- μ΅œμ’… ---\n{prev[:2500]}"
388
- resp = call_llm(p, api_key=api_key, model_id=model_id, api_type=api_type)
389
- outputs.append(f"[R{i+1}]\n{resp}")
390
- prev = resp
391
- return "\n\n".join(outputs)
392
-
393
-
394
- def execute_task(task, api_key, model_id, api_type):
395
- """단일 LLM 순수 평가 (Proto-AGI OFF)"""
396
  if task.sub_dimension == "mutual_verification":
397
- topic = task.prompt.replace("[상생-상극 사이클] ", "").split("\n")[0]
398
- return _run_mutual_verification(topic, api_key, model_id, api_type)
399
  elif task.sub_dimension == "feedback_incorporation":
400
- return _run_feedback(task.prompt, api_key, model_id, api_type)
401
- else:
402
- return call_llm(task.prompt, api_key=api_key, model_id=model_id, api_type=api_type)
403
-
404
-
405
- # ════════════════════════════════════════════════════════════════
406
- # PART 6: AETHER Score 계산 + final_scores.json 좜λ ₯
407
- # ════════════════════════════════════════════════════════════════
408
-
409
- def calculate_aether_score(pillar_avgs):
410
- weights = {p: info["weight"] for p, info in PILLAR_INFO.items()}
411
- return round(sum(pillar_avgs.get(p, 0) * w for p, w in weights.items()), 2)
412
-
413
-
414
- SCORES_FILE = "final_scores.json"
415
-
416
- def load_final_scores():
417
- try:
418
- with open(SCORES_FILE) as f:
419
- return json.load(f)
420
- except:
421
- return {"version": "1.0", "updated": "", "models": {}}
422
-
423
-
424
- def save_final_scores(model_name, pillar_scores, aether_score, total_tasks, completed):
425
- """ALL Bench μ—°λ™μš© final_scores.json μ—…λ°μ΄νŠΈ"""
426
- data = load_final_scores()
427
- data["updated"] = datetime.now().isoformat()
428
- data["models"][model_name] = {
429
- "final_score": aether_score,
430
- "pillar_scores": {p: round(s, 2) for p, s in pillar_scores.items()},
431
- "total_tasks": total_tasks,
432
- "completed_tasks": completed,
433
- "evaluated_at": datetime.now().isoformat(),
434
- "mode": "pure_llm",
435
- }
436
- with open(SCORES_FILE, "w") as f:
437
- json.dump(data, f, indent=2, ensure_ascii=False)
438
- return data
439
-
440
-
441
- def upload_scores_to_hf(data):
442
- """final_scores.json을 HF Dataset에 μ—…λ‘œλ“œ"""
443
- hf_token = os.getenv("HF_TOKEN", "")
444
- if not hf_token:
445
- return "⚠️ HF_TOKEN λ―Έμ„€μ •"
446
  try:
447
- from huggingface_hub import HfApi
448
- api = HfApi(token=hf_token)
449
- repo_id = "FINAL-Bench/ALL-Bench-Leaderboard"
450
- api.upload_file(
451
- path_or_fileobj=json.dumps(data, indent=2, ensure_ascii=False).encode("utf-8"),
452
- path_in_repo="final_scores.json",
453
- repo_id=repo_id, repo_type="dataset",
454
- commit_message=f"FINAL Score update: {datetime.now().strftime('%Y-%m-%d %H:%M')}",
455
- )
456
- return f"βœ… HF μ—…λ‘œλ“œ μ™„λ£Œ: datasets/{repo_id}/final_scores.json"
457
- except Exception as e:
458
- return f"❌ μ—…λ‘œλ“œ μ‹€νŒ¨: {e}"
459
-
460
-
461
- # ════════════════════════════════════════════════════════════════
462
- # PART 7: 체크포인트 DB
463
- # ════════════════════════════════════════════════════════════════
464
-
465
- DB_PATH = "final_bench_eval.db"
466
-
467
- def _init_db():
468
- conn = sqlite3.connect(DB_PATH)
469
- conn.execute("""CREATE TABLE IF NOT EXISTS eval_results (
470
- run_id TEXT, task_id TEXT, model_response TEXT, judge_response TEXT,
471
- weighted_score REAL, timestamp REAL,
472
- PRIMARY KEY (run_id, task_id))""")
473
- conn.commit(); conn.close()
474
-
475
- def _make_run_id(model): return hashlib.md5(model.encode()).hexdigest()[:12]
476
-
477
- def _get_cached(run_id, task_id):
478
- conn = sqlite3.connect(DB_PATH)
479
- cur = conn.execute("SELECT model_response, judge_response, weighted_score FROM eval_results WHERE run_id=? AND task_id=?", (run_id, task_id))
480
- row = cur.fetchone(); conn.close()
481
- return row
482
-
483
- def _save_result(run_id, task_id, response, judge_resp, score):
484
- conn = sqlite3.connect(DB_PATH)
485
- conn.execute("INSERT OR REPLACE INTO eval_results VALUES (?,?,?,?,?,?)",
486
- (run_id, task_id, response, judge_resp, score, time.time()))
487
- conn.commit(); conn.close()
488
-
489
- def _load_all(run_id):
490
- conn = sqlite3.connect(DB_PATH)
491
- cur = conn.execute("SELECT task_id, model_response, judge_response, weighted_score FROM eval_results WHERE run_id=?", (run_id,))
492
- rows = cur.fetchall(); conn.close()
493
- return {r[0]: {"response": r[1], "judge": r[2], "score": r[3]} for r in rows}
494
-
495
- def _clear_run(run_id):
496
- conn = sqlite3.connect(DB_PATH)
497
- conn.execute("DELETE FROM eval_results WHERE run_id=?", (run_id,))
498
- conn.commit(); conn.close()
499
-
500
- _init_db()
501
-
502
- # ════════════════════════════════════════════════════════════════
503
- # PART 8: CSV 생성 + HF μ—…λ‘œλ“œ
504
- # ════════════════════════════════════════════════════════════════
505
-
506
- def generate_csv(results, model_name):
507
- output = io.StringIO()
508
- writer = csv.writer(output)
509
- writer.writerow(["task_id","pillar","sub_dimension","difficulty","model",
510
- "weighted_score","judge_comment","rubric_scores_json","timestamp"])
511
- task_map = {t.task_id: t for t in ALL_TASKS}
512
- for tid, data in sorted(results.items()):
513
- task = task_map.get(tid)
514
- if not task: continue
515
- jd = {}
516
- try: jd = json.loads(data["judge"]) if isinstance(data["judge"], str) else (data["judge"] or {})
517
- except: pass
518
- score = data["score"]
519
- comment = (jd.get("comment","") if isinstance(jd,dict) else "")[:200]
520
- if score < 0:
521
- score = -1
522
- if not comment.startswith("JUDGE_FAILED"): comment = f"JUDGE_FAILED:{comment}"
523
- writer.writerow([
524
- tid, task.pillar, task.sub_dimension, task.difficulty, model_name,
525
- score, comment,
526
- json.dumps(jd.get("scores",{}) if isinstance(jd,dict) else {}, ensure_ascii=False),
527
- datetime.now().isoformat(),
528
- ])
529
- return output.getvalue()
530
-
531
-
532
- def upload_csv_to_hf(csv_content, model_name):
533
- hf_token = os.getenv("HF_TOKEN", "")
534
- if not hf_token:
535
- return "⚠️ HF_TOKEN λ―Έμ„€μ •"
536
  try:
537
  from huggingface_hub import HfApi
538
- api = HfApi(token=hf_token)
539
- safe_model = re.sub(r'[^a-zA-Z0-9_-]', '_', model_name.split('/')[-1])
540
- repo_id = "FINAL-Bench/ALL-Bench-Leaderboard"
541
- ts = datetime.now().strftime("%Y%m%d_%H%M%S")
542
- filename = f"eval_results/{safe_model}_{ts}.csv"
543
- api.upload_file(
544
- path_or_fileobj=csv_content.encode("utf-8"),
545
- path_in_repo=filename, repo_id=repo_id, repo_type="dataset",
546
- commit_message=f"FINAL Bench eval: {safe_model}",
547
- )
548
- return f"βœ… CSV μ—…λ‘œλ“œ: {filename}"
549
- except Exception as e:
550
- return f"❌ CSV μ—…λ‘œλ“œ μ‹€νŒ¨: {e}"
551
-
552
-
553
- # ════════════════════════════════════════════════════════════════
554
- # PART 9: HTML λΉŒλ”
555
- # ════════════════════════════════════════════════════════════════
556
-
557
- CSS = """<style>
558
- .eval-table{width:100%;border-collapse:collapse;font-size:0.85em}
559
- .eval-table th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc}
560
- .eval-table td{padding:6px 8px;border-bottom:1px solid #eee}
561
- .score-bar{background:#e0e0e0;border-radius:8px;height:18px;overflow:hidden;min-width:80px}
562
- .score-fill{height:100%;border-radius:8px;transition:width .4s}
563
- .summary-card{background:linear-gradient(135deg,#1a1a2e,#16213e);border-radius:14px;padding:20px;color:#fff;margin:8px 0}
564
- .pillar-row{display:flex;align-items:center;gap:10px;margin:6px 0}
565
- .pillar-bar{flex:1;background:#333;border-radius:6px;height:16px;overflow:hidden}
566
- .pillar-fill{height:100%;border-radius:6px}
567
- .progress-bar{background:#e0e0e0;border-radius:8px;height:22px;margin:12px 0;overflow:hidden}
568
- .progress-fill{height:100%;border-radius:8px;background:linear-gradient(90deg,#6366f1,#4caf50)}
569
- </style>"""
570
-
571
- def _sc(s):
572
- if s >= 80: return "#4caf50"
573
- if s >= 60: return "#ff9800"
574
- return "#f44336"
575
-
576
- def _build_progress_table(results, tasks):
577
- rows = ""
578
- for t in tasks:
579
- info = PILLAR_INFO.get(t.pillar, {})
580
- if t.task_id in results:
581
- s = results[t.task_id]["score"]
582
- if s < 0:
583
- rows += f'<tr style="background:#fff3e0"><td>{t.task_id}</td><td>{info.get("icon","")} {info.get("name","")}</td><td>{t.sub_dimension}</td><td>{t.difficulty}</td><td style="color:#ff9800">❌ Judgeμ‹€νŒ¨</td><td>β€”</td></tr>'
584
- continue
585
- c = _sc(s)
586
- cls = "color:#2e7d32;font-weight:700" if s>=70 else "color:#c62828;font-weight:700"
587
- rows += f'<tr><td>{t.task_id}</td><td>{info.get("icon","")} {info.get("name","")}</td><td>{t.sub_dimension}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="{cls}">{s:.1f}</td></tr>'
588
- else:
589
- rows += f'<tr style="opacity:0.4"><td>{t.task_id}</td><td>{info.get("icon","")}</td><td>{t.sub_dimension}</td><td>{t.difficulty}</td><td>⏳</td><td>β€”</td></tr>'
590
- return f'{CSS}<table class="eval-table"><thead><tr><th>ID</th><th>κΈ°λ‘₯</th><th>차원</th><th>λ‚œμ΄λ„</th><th>점수</th><th>κ°’</th></tr></thead><tbody>{rows}</tbody></table>'
591
-
592
- def _build_summary(results, tasks, pillar_scores, aether, model_name, hf_status):
593
- if aether >= 80: grade = "A (AGI-Level)"
594
- elif aether >= 70: grade = "B+ (Near-AGI)"
595
- elif aether >= 60: grade = "B (Advanced)"
596
- elif aether >= 50: grade = "C+ (Competent)"
597
- else: grade = "C-F"
598
- ph = ""
599
- for p, info in PILLAR_INFO.items():
600
- s = pillar_scores.get(p, 0)
601
- c = _sc(s); w = int(info["weight"] * 100)
602
- ph += f'<div class="pillar-row"><span style="width:130px">{info["icon"]} {info["name"]} ({w}%)</span><div class="pillar-bar"><div class="pillar-fill" style="width:{min(s,100)}%;background:{c}"></div></div><span style="width:55px;text-align:right;font-weight:700;color:{c}">{s:.1f}</span></div>'
603
- done = sum(1 for t in tasks if t.task_id in results)
604
- jf = sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"] < 0)
605
- return f"""{CSS}<div class="summary-card">
606
- <h2 style="margin:0;font-size:1.6em;text-align:center">🧬 FINAL Score: {aether:.1f} / 100</h2>
607
- <h3 style="margin:4px 0;text-align:center;color:#aaa">Grade: {grade}</h3>
608
- <p style="text-align:center;color:#888">Model: {model_name} | {done}개 μ™„λ£Œ{f' Β· ❌Judgeμ‹€νŒ¨ {jf}건' if jf else ''}</p>
609
- <hr style="border-color:#333;margin:12px 0"><h4 style="color:#aaa;margin:8px 0">κΈ°λ‘₯별 점수</h4>{ph}
610
- <hr style="border-color:#333;margin:12px 0"><p style="font-size:0.85em;color:#aaa">{hf_status}</p></div>"""
611
-
612
-
613
- # ════════════════════════════════════════════════════════════════
614
- # PART 10: 병렬 평가 μ—”μ§„ + λ°±κ·ΈλΌμš΄λ“œ μŠ€λ ˆλ“œ
615
- # ════════════════════════════════════════════════════════════════
616
 
617
- from concurrent.futures import ThreadPoolExecutor
618
 
619
- def _eval_single(task, run_id, api_key, judge_key, model_id, judge_model, api_type, state):
620
  try:
621
- response = execute_task(task, api_key, model_id, api_type)
622
- if response.startswith("[API_ERROR"):
623
- _save_result(run_id, task.task_id, response, "{}", 0)
624
- with state["lock"]:
625
- state["done"] += 1; state["errors"].append(task.task_id)
626
- return task.task_id, {"response": response, "judge": "{}", "score": 0}
627
-
628
- rubric_keys = list(task.scoring_rubric.keys())
629
- judge_prompt = build_judge_prompt(task, response)
630
- judge_data = call_judge_structured(
631
- judge_prompt, system=JUDGE_SYSTEM, api_key=judge_key,
632
- model=judge_model, rubric_keys=rubric_keys, temperature=0.1)
633
-
634
- if judge_data is None:
635
- # Fallback: text parsing
636
- judge_raw = call_llm_openai(
637
- judge_prompt, system=JUDGE_SYSTEM, api_key=judge_key,
638
- model=judge_model, temperature=0.05, max_tokens=512)
639
- judge_data = parse_judge_response(judge_raw, rubric_keys)
640
-
641
- if judge_data.get("failed"):
642
- weighted = -1.0
643
- else:
644
- weighted = compute_weighted_score(judge_data["scores"], task.scoring_rubric)
645
- with state["lock"]: state["parse_ok"] += 1
646
-
647
- judge_json = json.dumps(judge_data, ensure_ascii=False)
648
- _save_result(run_id, task.task_id, response, judge_json, weighted)
649
-
650
- with state["lock"]:
651
- state["done"] += 1
652
- info = PILLAR_INFO.get(task.pillar, {})
653
- state["active"].append(f'{info.get("icon","")} {task.task_id}')
654
- if len(state["active"]) > 10: state["active"] = state["active"][-10:]
655
-
656
- return task.task_id, {"response": response, "judge": judge_json, "score": weighted}
657
-
658
  except Exception as e:
659
- with state["lock"]:
660
- state["done"] += 1; state["errors"].append(f"{task.task_id}: {str(e)[:60]}")
661
- _save_result(run_id, task.task_id, f"[ERROR] {e}", "{}", 0)
662
- return task.task_id, {"response": f"[ERROR] {e}", "judge": "{}", "score": 0}
663
-
664
-
665
- # ── κΈ€λ‘œλ²Œ μƒνƒœ ──
666
- _STATE = {
667
- "running": False, "stop_requested": False, "finished": False,
668
- "run_id": "", "model": "", "done": 0, "total": 0, "cached": 0,
669
- "errors": [], "active": [], "parse_ok": 0, "parse_fail": 0,
670
- "start_time": 0, "results": {}, "tasks": [],
671
- "pillar_done": {}, "pillar_total": {},
672
- "n_workers": 5, "lock": threading.Lock(),
673
- "message": "", "csv_path": None, "hf_status": "",
674
- }
675
 
 
676
 
677
- def _reset_state():
678
- global _STATE
679
- with _STATE["lock"]:
680
- _STATE.update({
681
- "running": False, "stop_requested": False, "finished": False,
682
- "done": 0, "cached": 0, "errors": [], "active": [],
683
- "parse_ok": 0, "parse_fail": 0, "start_time": 0,
684
- "results": {}, "tasks": [], "pillar_done": {}, "pillar_total": {},
685
- "message": "", "csv_path": None, "hf_status": "",
686
- })
687
 
 
 
 
 
688
 
689
- def _bg_evaluate(api_key, judge_key, model_id, model_display, judge_model,
690
- api_type, tasks, run_id, n_workers):
691
- global _STATE
692
  try:
693
- results = dict(_load_all(run_id))
694
- cached = sum(1 for t in tasks if t.task_id in results)
695
- pending = [t for t in tasks if t.task_id not in results]
696
-
697
- pillar_tasks = {}
698
- for t in pending:
699
- pillar_tasks.setdefault(t.pillar, []).append(t)
700
-
701
- with _STATE["lock"]:
702
- _STATE["results"] = results; _STATE["cached"] = cached
703
- _STATE["total"] = len(tasks)
704
- _STATE["pillar_total"] = {p: len(ts) for p, ts in pillar_tasks.items()}
705
- _STATE["pillar_done"] = {p: 0 for p in pillar_tasks}
706
- _STATE["start_time"] = time.time()
707
-
708
  if not pending:
709
- with _STATE["lock"]:
710
- _STATE["message"] = f"πŸ’Ύ μ „λΆ€ μΊμ‹œ! ({cached}개)"
711
- _finalize(tasks, results, model_display)
712
- return
713
-
714
- with _STATE["lock"]:
715
- _STATE["message"] = f"⚑ {len(pending)}개 과제 Β· {n_workers}μ›Œμ»€"
716
-
717
- with ThreadPoolExecutor(max_workers=n_workers) as executor:
718
- futures = {}
719
- for task in pending:
720
- if _STATE["stop_requested"]: break
721
- fut = executor.submit(_eval_single, task, run_id, api_key, judge_key,
722
- model_id, judge_model, api_type, _STATE)
723
- futures[fut] = task
724
-
725
- completed = set()
726
- while len(completed) < len(futures):
727
- if _STATE["stop_requested"]:
728
- with _STATE["lock"]:
729
- _STATE["message"] = "⏹️ 쀑단됨"; _STATE["running"] = False; _STATE["finished"] = True
730
  return
731
- for fut in list(futures):
732
- if fut in completed: continue
733
- if fut.done():
734
- completed.add(fut)
735
  try:
736
- tid, data = fut.result()
737
- with _STATE["lock"]:
738
- _STATE["results"][tid] = data
739
- _STATE["pillar_done"][futures[fut].pillar] = \
740
- _STATE["pillar_done"].get(futures[fut].pillar, 0) + 1
741
  except: pass
742
  time.sleep(0.5)
743
-
744
- with _STATE["lock"]:
745
- results = dict(_STATE["results"])
746
- _finalize(tasks, results, model_display)
747
-
748
  except Exception as e:
749
- with _STATE["lock"]:
750
- _STATE["message"] = f"❌ 였λ₯˜: {str(e)[:100]}"
751
- _STATE["running"] = False; _STATE["finished"] = True
752
-
753
 
754
def _finalize(tasks, results, model_display):
    """Aggregate scores, write the JSON/CSV outputs, upload them and close the run.

    Args:
        tasks: Tasks that were part of this run.
        results: Mapping of task_id to a dict holding at least "score".
        model_display: Model label used in the saved scores and the CSV.
    """
    global _STATE

    def _scored(task):
        # Negative scores are excluded from the averages and the completed count.
        return task.task_id in results and results[task.task_id]["score"] >= 0

    pillar_scores = {}
    for pillar in PILLAR_INFO:
        scores = [results[t.task_id]["score"] for t in tasks
                  if t.pillar == pillar and _scored(t)]
        if scores:
            pillar_scores[pillar] = np.mean(scores)

    aether = calculate_aether_score(pillar_scores)
    completed = len([t for t in tasks if _scored(t)])

    # Persist final_scores.json
    scores_data = save_final_scores(model_display, pillar_scores, aether, len(tasks), completed)

    # Write the per-task CSV next to the run id
    csv_str = generate_csv(results, model_display)
    csv_path = f"/tmp/final_bench_{_STATE['run_id']}.csv"
    with open(csv_path, "w", encoding="utf-8") as out:
        out.write(csv_str)

    # Upload both artefacts
    hf_status = upload_scores_to_hf(scores_data)
    csv_hf = upload_csv_to_hf(csv_str, model_display)

    started = _STATE["start_time"]
    elapsed = int(time.time() - started) if started else 0
    with _STATE["lock"]:
        _STATE["csv_path"] = csv_path
        _STATE["hf_status"] = f"{hf_status}\n{csv_hf}"
        _STATE["message"] = f"🏁 μ™„λ£Œ! FINAL Score={aether:.1f} ({elapsed}초)"
        _STATE["running"] = False
        _STATE["finished"] = True
784
-
785
-
786
def _start_eval(model_choice, api_type, eval_api_key, judge_api_key, judge_model,
                pillar_filter, diff_filter, max_tasks, n_workers, fresh_start):
    """Validate the form inputs and launch the evaluation in a daemon thread.

    Returns a one-line status string for the UI; the work itself runs in
    `_bg_evaluate`.
    """
    global _STATE
    if _STATE["running"]:
        return "⚠️ 이미 μ§„ν–‰ 쀑"

    # Blank form fields fall back to the environment.
    eval_api_key = (eval_api_key or "").strip() or os.getenv("HF_TOKEN", "")
    judge_api_key = (judge_api_key or "").strip() or os.getenv("OPENAI_API_KEY", "")
    if not eval_api_key:
        return "❌ API Keyλ₯Ό μž…λ ₯ν•˜μ„Έμš”."
    if not judge_api_key:
        return "❌ Judge API Key (OpenAI)λ₯Ό μž…λ ₯ν•˜μ„Έμš”."

    # Resolve the display name to a model id; unknown names pass through.
    use_hf = api_type == "HuggingFace Inference"
    catalog = HF_MODELS if use_hf else OPENAI_MODELS
    model_id = catalog.get(model_choice, model_choice)
    at = "hf" if use_hf else "openai"

    selected = ALL_TASKS[:]
    if pillar_filter != "전체":
        selected = [t for t in selected if t.pillar == pillar_filter]
    if diff_filter != "전체":
        selected = [t for t in selected if t.difficulty == diff_filter]
    tasks = selected[:int(max_tasks)]

    workers = int(n_workers)
    run_id = _make_run_id(model_id + "_pure")
    if fresh_start:
        _clear_run(run_id)

    _reset_state()
    with _STATE["lock"]:
        _STATE["running"] = True
        _STATE["run_id"] = run_id
        _STATE["model"] = model_choice
        _STATE["tasks"] = tasks
        _STATE["total"] = len(tasks)
        _STATE["n_workers"] = workers
        _STATE["message"] = "πŸ”„ μ€€λΉ„ 쀑..."

    worker_args = (eval_api_key, judge_api_key, model_id, model_choice, judge_model,
                   at, tasks, run_id, workers)
    thread = threading.Thread(target=_bg_evaluate, args=worker_args, daemon=True)
    thread.start()
    return f"⚑ {model_choice} 평가 μ‹œμž‘ ({len(tasks)}과제, {int(n_workers)}μ›Œμ»€)"
830
-
831
 
832
def _stop():
    """Request that the running evaluation stop; return a status string for the UI."""
    global _STATE
    if not _STATE["running"]:
        return "ℹ️ μ‹€ν–‰ 쀑 μ•„λ‹˜"
    # The background loop checks this flag between polls.
    _STATE["stop_requested"] = True
    return "⏹️ 쀑단 μš”μ²­"
838
 
 
839
 
840
def _poll():
    """Timer callback returning (progress_html, table_html, summary_html, csv_path)."""
    global _STATE
    with _STATE["lock"]:
        running = _STATE["running"]
        finished = _STATE["finished"]
        tasks = _STATE.get("tasks", [])
        results = dict(_STATE.get("results", {}))
        message = _STATE.get("message", "")
        csv_path = _STATE.get("csv_path")

    # Nothing started yet and nothing cached: show the hint only.
    if not (running or finished or results):
        return ("ℹ️ λͺ¨λΈμ„ μ„ νƒν•˜κ³  ▢️ μ‹œμž‘μ„ λˆ„λ₯΄μ„Έμš”.", "", "", None)

    # Progress
    if running:
        done = _STATE["done"]
        total = _STATE.get("total", 1)
        pct = min(int(done / max(total, 1) * 100), 100)
        elapsed = int(time.time() - _STATE.get("start_time", time.time()))
        eta = 0
        if done > 0:
            eta = int((elapsed / max(done,1)) * (total - done))
        recent = _STATE.get("active", [])[-8:]
        tags = " ".join(
            f'<span style="background:#e3f2fd;padding:2px 6px;border-radius:4px;font-size:0.78em;">{a}</span>'
            for a in recent)
        prog = f"""{CSS}<div>
<div style="display:flex;justify-content:space-between;margin-bottom:4px;">
<span>⚑ {done}/{total} μ™„λ£Œ | {elapsed}초 | μ˜ˆμƒμž”μ—¬ {eta}초</span>
<span style="font-weight:700">{pct}%</span>
</div>
<div class="progress-bar"><div class="progress-fill" style="width:{pct}%"></div></div>
<div style="margin-top:6px;">{tags}</div></div>"""
    elif finished:
        prog = f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;font-weight:700;">{message}</div>'
    else:
        prog = message

    table = ""
    if tasks:
        table = _build_progress_table(results, tasks)

    summary = ""
    if finished and tasks:
        pillar_scores = {}
        for pillar in PILLAR_INFO:
            # Negative scores are left out of the pillar average.
            scores = [results[t.task_id]["score"] for t in tasks
                      if t.pillar == pillar and t.task_id in results
                      and results[t.task_id]["score"] >= 0]
            if scores:
                pillar_scores[pillar] = np.mean(scores)
        aether = calculate_aether_score(pillar_scores)
        summary = _build_summary(results, tasks, pillar_scores, aether,
                                 _STATE.get("model", ""), _STATE.get("hf_status", ""))

    return (prog, table, summary, csv_path)
884
-
885
-
886
- # ════════════════════════════════════════════════════════════════
887
- # PART 11: Gradio App
888
- # ════════════════════════════════════════════════════════════════
889
-
890
def _update_model_choices(api_type):
    """Return a dropdown update listing the models of the selected API type."""
    catalog = HF_MODELS if api_type == "HuggingFace Inference" else OPENAI_MODELS
    names = list(catalog.keys())
    # Default selection is the first entry of the chosen catalogue.
    return gr.update(choices=names, value=names[0])
895
-
896
-
897
- HEADER_HTML = """
898
- <div style="text-align:center;padding:16px 0;">
899
- <h1 style="margin:0;font-size:1.8em;">🧬 FINAL Bench Auto-Evaluator v1.0</h1>
900
- <h2 style="margin:4px 0;color:#555;font-size:1.05em;">ALL Bench 연동 Β· FINAL Score μžλ™ μΈ‘μ •</h2>
901
- <p style="color:#888;font-size:0.88em;max-width:700px;margin:8px auto;">
902
- 220 Tasks Β· 5 Pillars Β· 21 Sub-dimensions Β· HAR Metric<br>
903
- πŸ“‘ <b>HF Inference API</b>: Qwen, DeepSeek, Llama, Phi, Mistral λ“± μ˜€ν”ˆμ†ŒμŠ€<br>
904
- πŸ”‘ <b>OpenAI API</b>: GPT-5.x μ‹œλ¦¬μ¦ˆ<br>
905
- βš–οΈ <b>Judge</b>: OpenAI GPT-5.2 Structured Output Β· πŸ“Š κ²°κ³Ό β†’ <code>final_scores.json</code> β†’ ALL Bench μžλ™ 반영
906
- </p>
907
- </div>"""
908
-
909
- PILLAR_CHOICES = ["전체"] + list(PILLAR_INFO.keys())
910
- DIFF_CHOICES = ["전체", "expert", "frontier"]
911
-
912
 
913
def create_app():
    """Build the Gradio Blocks UI and wire its buttons and timer to the handlers.

    Returns:
        The configured `gr.Blocks` application (not yet launched).

    The two API-key textboxes start empty. Pre-filling them from
    HF_TOKEN / OPENAI_API_KEY would send the server's secrets to every
    browser that opens the page. `_start_eval` already falls back to those
    environment variables when a field is left blank, so behaviour for
    deployments that rely on the environment is unchanged.
    """
    with gr.Blocks(title="FINAL Bench Auto-Evaluator", theme=gr.themes.Soft(),
                   css=".gradio-container{max-width:1100px !important}") as app:
        gr.HTML(HEADER_HTML)

        with gr.Row():
            api_type = gr.Radio(
                ["HuggingFace Inference", "OpenAI Compatible"],
                value="HuggingFace Inference", label="πŸ“‘ API μœ ν˜•", scale=2)
            model_choice = gr.Dropdown(
                choices=list(HF_MODELS.keys()),
                value=list(HF_MODELS.keys())[0],
                label="πŸ€– 평가 λŒ€μƒ λͺ¨λΈ", scale=3, allow_custom_value=True)

        # Switching the API type swaps the model catalogue in the dropdown.
        api_type.change(_update_model_choices, [api_type], [model_choice])

        with gr.Row():
            # Never seed these with environment secrets (see docstring).
            eval_api_key = gr.Textbox(
                label="πŸ”‘ 피평가 API Key (HF Token λ˜λŠ” OpenAI Key)",
                type="password", placeholder="hf_... λ˜λŠ” sk-...",
                value="", scale=3)
            judge_api_key = gr.Textbox(
                label="βš–οΈ Judge API Key (OpenAI GPT-5.2)",
                type="password", placeholder="sk-...",
                value="", scale=3)

        with gr.Row():
            judge_model = gr.Textbox(label="βš–οΈ μ‹¬νŒ λͺ¨λΈ", value="gpt-5.2", scale=2)
            pillar_dd = gr.Dropdown(PILLAR_CHOICES, value="전체", label="κΈ°λ‘₯ ν•„ν„°", scale=2)
            diff_dd = gr.Dropdown(DIFF_CHOICES, value="전체", label="λ‚œμ΄λ„", scale=1)
            max_tasks = gr.Slider(1, 220, value=220, step=1, label="μ΅œλŒ€ 과제 수", scale=2)
            n_workers = gr.Slider(1, 20, value=10, step=1, label="⚑ 병렬 μ›Œμ»€", scale=1)

        with gr.Row():
            start_btn = gr.Button("▢️ 평가 μ‹œμž‘ (μ΄μ–΄ν•˜κΈ°)", variant="primary", size="lg", scale=2)
            fresh_btn = gr.Button("πŸš€ μƒˆλ‘œ μ‹œμž‘", variant="secondary", size="lg", scale=2)
            stop_btn = gr.Button("⏹️ 쀑단", variant="stop", size="lg", scale=1)

        status_msg = gr.Textbox(label="μƒνƒœ", interactive=False, max_lines=1)

        # Previously saved scores, loaded once when the UI is built.
        with gr.Accordion("πŸ“Š κΈ°μ‘΄ FINAL Score κ²°κ³Ό", open=False):
            scores_display = gr.JSON(label="final_scores.json", value=load_final_scores())

        with gr.Tabs():
            with gr.Tab("πŸ“Š μ§„ν–‰"):
                progress_html = gr.HTML()
            with gr.Tab("πŸ“‹ κ²°κ³Όν‘œ"):
                table_html = gr.HTML()
            with gr.Tab("πŸ† μ΅œμ’…"):
                summary_html = gr.HTML()
            with gr.Tab("πŸ’Ύ CSV"):
                csv_file = gr.File(label="평가 κ²°κ³Ό CSV")

        # Poll the shared state every 2 seconds to refresh all four outputs.
        timer = gr.Timer(value=2, active=True)
        timer.tick(fn=_poll, outputs=[progress_html, table_html, summary_html, csv_file])

        all_inputs = [model_choice, api_type, eval_api_key, judge_api_key, judge_model,
                      pillar_dd, diff_dd, max_tasks, n_workers]

        # Same handler for both buttons; only `fresh_start` differs.
        start_btn.click(
            fn=lambda *args: _start_eval(*args, fresh_start=False),
            inputs=all_inputs, outputs=[status_msg])
        fresh_btn.click(
            fn=lambda *args: _start_eval(*args, fresh_start=True),
            inputs=all_inputs, outputs=[status_msg])
        stop_btn.click(fn=_stop, outputs=[status_msg])

        gr.Markdown("""---
<center>FINAL Bench Auto-Evaluator v1.0 Β· Apache 2.0 Β· Ginigen AI (μ§€λ‹ˆμ  AI)<br>
πŸ“‘ HF Inference API + βš–οΈ OpenAI Structured Judge<br>
πŸ“Š κ²°κ³Ό β†’ <code>final_scores.json</code> β†’ ALL Bench Leaderboard μžλ™ 연동</center>""")

    return app
988
 
989
-
990
- # ════════════════════════════════════════════════════════════════
991
- # MAIN
992
- # ════════════════════════════════════════════════════════════════
993
-
994
if __name__ == "__main__":
    # Count tasks per pillar for the startup banner.
    stats = {}
    for task in ALL_TASKS:
        stats.setdefault(task.pillar, 0)
        stats[task.pillar] += 1
    print(f"FINAL Bench Auto-Evaluator v1.0: {len(ALL_TASKS)} tasks loaded")
    for pillar, count in stats.items():
        info = PILLAR_INFO[pillar]
        share = int(info['weight']*100)
        print(f" {info['icon']} {info['name']}: {count} ({share}%)")
    print(f" πŸ“‘ HF Models: {len(HF_MODELS)} | πŸ”‘ OpenAI Models: {len(OPENAI_MODELS)}")

    # Build the UI, enable the request queue, then serve on all interfaces.
    app = create_app()
    app.queue(default_concurrency_limit=2)
    app.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
 
1
  """
2
+ FINAL Bench Auto-Evaluator v1.0
3
+ ================================
4
+ FINAL Bench 100문제 x HF Inference API -> GPT-5.2 Judge -> final_scores.json
5
 
6
+ - μ‹œν—˜ 문제: FINAL-Bench/Metacognitive (HuggingFace Dataset)
7
+ - μ‹œν—˜ μ‘μ‹œμž: ALL Bench λ“±μž¬ HF Inference API λͺ¨λΈ
8
+ - μ‹¬νŒ: GPT-5.2 (os.getenv("OPENAI_API_KEY"))
9
+ - 좜λ ₯: final_scores.json -> ALL Bench Metacog 컬럼 μžλ™ 반영
10
 
11
+ Author: Ginigen AI Β· FINAL-Bench Β· Apache 2.0
 
12
  """
13
 
14
+ import json, os, time, csv, io, re, hashlib, sqlite3, threading
15
  from datetime import datetime
16
  from dataclasses import dataclass, field, asdict
17
+ from typing import Dict, Optional
18
+ from concurrent.futures import ThreadPoolExecutor
19
  import requests
20
  import numpy as np
 
21
  import gradio as gr
22
 
 
 
 
 
23
  PILLAR_INFO = {
24
+ "P1_Emergence": {"name": "μ°½λ°œμ„±", "icon": "✦", "color": "#FF6B35", "weight": 0.20},
25
+ "P2_Metacognition": {"name": "메타인지", "icon": "β—‰", "color": "#7B2FF7", "weight": 0.25},
26
+ "P3_SelfEvolution": {"name": "μžκ°€μ§„ν™”", "icon": "β—ˆ", "color": "#00B4D8", "weight": 0.15},
27
+ "P4_Orchestration": {"name": "닀쀑지λŠ₯", "icon": "β—¬", "color": "#2EC4B6", "weight": 0.15},
28
+ "P5_SynergyAntagonism": {"name": "상생상극", "icon": "☯", "color": "#E63946", "weight": 0.25},
29
  }
30
 
31
  @dataclass
 
33
  task_id: str; pillar: str; sub_dimension: str; difficulty: str
34
  prompt: str; context: Optional[str] = None; expected_behavior: Optional[str] = None
35
  scoring_rubric: Dict = field(default_factory=dict); metadata: Dict = field(default_factory=dict)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
+ # ══ FINAL Bench 100문제 λ‘œλ“œ ══
 
 
 
 
 
 
38
 
39
def _json_dict(value):
    """Return `value` as a dict, decoding JSON strings; anything else becomes {}."""
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return {}
        return parsed if isinstance(parsed, dict) else {}
    return {}


def _text_or_none(value):
    """Return `value` if it is a string, else None (pandas yields NaN for gaps)."""
    return value if isinstance(value, str) else None


def _row_to_task(row, with_metadata):
    """Build one EvalTask from a dataset row or a DataFrame row."""
    return EvalTask(
        task_id=row["task_id"], pillar=row["pillar"],
        sub_dimension=row["sub_dimension"], difficulty=row["difficulty"],
        prompt=row["prompt"], context=_text_or_none(row.get("context")),
        expected_behavior=_text_or_none(row.get("expected_behavior")),
        scoring_rubric=_json_dict(row.get("scoring_rubric")),
        metadata=_json_dict(row.get("metadata")) if with_metadata else {})


def load_tasks():
    """Load the benchmark tasks from the HF dataset, with a parquet fallback.

    First tries `datasets.load_dataset("FINAL-Bench/Metacognitive",
    split="train")`. If that fails for any reason, it falls back to the
    local file `full_v2.parquet`. A malformed rubric or metadata field is
    replaced by an empty dict instead of aborting the whole load, and
    non-string optional text fields are normalised to None.

    Returns:
        A list of EvalTask objects, or [] when both sources fail.
    """
    try:
        from datasets import load_dataset
        ds = load_dataset("FINAL-Bench/Metacognitive", split="train")
        tasks = [_row_to_task(row, with_metadata=True) for row in ds]
        print(f"βœ… FINAL Bench: {len(tasks)}문제 λ‘œλ“œ (HF Dataset)")
        return tasks
    except Exception as e:
        # Deliberately broad: any failure here should trigger the fallback.
        print(f"⚠️ HF Dataset μ‹€νŒ¨: {e}, parquet 폴백...")
    try:
        import pandas as pd
        df = pd.read_parquet("full_v2.parquet")
        tasks = [_row_to_task(row, with_metadata=False) for _, row in df.iterrows()]
        print(f"βœ… Parquet 폴백: {len(tasks)}문제")
        return tasks
    except Exception as e2:
        print(f"❌ λ‘œλ“œ μ‹€νŒ¨: {e2}")
        return []
82
+
83
+ ALL_TASKS = load_tasks()
84
+
85
+ # ══ ALL Bench λ“±μž¬ HF Inference API λͺ¨λΈ ══
86
+
87
+ HF_MODELS = {
88
+ "Qwen3.5-397B": "Qwen/Qwen3.5-397B-A17B",
89
+ "Qwen3.5-122B": "Qwen/Qwen3.5-122B-A10B",
90
+ "Qwen3.5-27B": "Qwen/Qwen3.5-27B",
91
+ "Qwen3.5-35B": "Qwen/Qwen3.5-35B-A3B",
92
+ "Qwen3.5-9B": "Qwen/Qwen3.5-9B",
93
+ "Qwen3.5-4B": "Qwen/Qwen3.5-4B",
94
+ "Qwen3-Next-80B": "Qwen/Qwen3-Next-80B-A3B-Thinking",
95
+ "DeepSeek V3.2": "deepseek-ai/DeepSeek-V3-0324",
96
+ "DeepSeek R1": "deepseek-ai/DeepSeek-R1",
97
+ "Llama 4 Scout": "meta-llama/Llama-4-Scout-17B-16E-Instruct",
98
+ "Llama 4 Maverick": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
99
+ "Phi-4": "microsoft/phi-4",
100
+ "Mistral Large 3": "mistralai/Mistral-Large-Instruct-2501",
101
  }
102
 
103
+ # ══ LLM 호좜: HF Inference API ══
 
 
104
 
105
def _strip(text):
    """Remove <think>/<thinking>/<reasoning>/<reflection> blocks and trim whitespace.

    Falsy input (None or "") is returned unchanged.
    """
    if not text:
        return text
    cleaned = text
    for tag in ('think', 'thinking', 'reasoning', 'reflection'):
        # DOTALL lets a block span several lines; the match is non-greedy.
        block = re.compile(rf'<{tag}>.*?</{tag}>', flags=re.DOTALL)
        cleaned = block.sub('', cleaned)
    return cleaned.strip()
110
 
111
def call_model(prompt, system="", model_id="Qwen/Qwen3.5-397B-A17B", max_tokens=4096, temperature=0.6):
    """Send one chat-completion request to the HF Inference router.

    Args:
        prompt: User message text.
        system: Optional system message; omitted when empty.
        model_id: Model repository id used in both the URL and the payload.
        max_tokens: Completion token limit sent in the payload.
        temperature: Sampling temperature sent in the payload.

    Returns:
        The response text with reasoning tags removed, or a string starting
        with "[API_ERROR]" on failure. The function always returns a string:
        callers slice the result and call `.startswith` on it, so the
        implicit `None` that used to follow a 429/503 on the last attempt
        (or a null content field) is no longer possible.
    """
    hf_token = os.getenv("HF_TOKEN", "")
    if not hf_token:
        return "[API_ERROR] HF_TOKEN λ―Έμ„€μ •"
    msgs = []
    if system:
        msgs.append({"role": "system", "content": system})
    msgs.append({"role": "user", "content": prompt})
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {hf_token}"}
    payload = {"model": model_id, "messages": msgs, "max_tokens": max_tokens,
               "temperature": temperature, "stream": False}
    url = f"https://router.huggingface.co/hf-inference/models/{model_id}/v1/chat/completions"

    attempts = 3
    last_error = "no response"
    for attempt in range(attempts):
        is_last = attempt == attempts - 1
        try:
            r = requests.post(url, headers=headers, json=payload, timeout=180)
            if r.status_code in (429, 503):
                last_error = f"HTTP {r.status_code}"
                # Back off before retrying; no point sleeping after the final try.
                if not is_last:
                    time.sleep(10 * (attempt + 1))
                continue
            r.raise_for_status()
            content = r.json()["choices"][0]["message"]["content"]
            return _strip(content) or ""
        except Exception as e:
            # Broad on purpose: network, HTTP and malformed-body errors all retry.
            last_error = str(e)
            if not is_last:
                time.sleep(5 * (attempt + 1))
    return f"[API_ERROR] {last_error}"
130
+
131
+ # ══ Judge: GPT-5.2 (OPENAI_API_KEY) ══
132
+
133
+ JUDGE_SYS = """You are a FINAL Bench scoring judge. Score each rubric item using ONLY: 0.0, 0.25, 0.5, 0.75, 1.0.
134
+ 1.0=Excellent 0.75=Good 0.5=Average 0.25=Below 0.0=Fails
135
+ Output JSON: {"scores":{...}, "comment":"ν•œμ€„ν‰κ°€"}. Every rubric key MUST appear."""
136
+
137
def call_judge(prompt, rubric_keys):
    """Ask the GPT-5.2 judge to score a response with a strict JSON schema.

    Args:
        prompt: Full judge prompt (task, response and rubric).
        rubric_keys: Rubric item names; each becomes a required score field.

    Returns:
        A dict with "scores" (every rubric key present, missing ones filled
        with 0.5) and "comment", or None when OPENAI_API_KEY is unset or all
        three attempts fail.
    """
    api_key = os.getenv("OPENAI_API_KEY", "")
    if not api_key:
        return None
    props = {k: {"type": "number", "enum": [0.0, 0.25, 0.5, 0.75, 1.0]} for k in rubric_keys}
    schema = {"type": "object",
              "properties": {"scores": {"type": "object", "properties": props,
                                        "required": list(rubric_keys),
                                        "additionalProperties": False},
                             "comment": {"type": "string"}},
              "required": ["scores", "comment"], "additionalProperties": False}
    payload = {"model": "gpt-5.2", "max_completion_tokens": 4096, "temperature": 0.1,
               "messages": [{"role": "system", "content": JUDGE_SYS},
                            {"role": "user", "content": prompt}],
               "response_format": {"type": "json_schema",
                                   "json_schema": {"name": "JudgeResult", "strict": True,
                                                   "schema": schema}}}
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}

    attempts = 3
    for attempt in range(attempts):
        is_last = attempt == attempts - 1
        try:
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=headers, json=payload, timeout=180)
            if r.status_code == 429:
                if not is_last:
                    time.sleep(8 * (attempt + 1))
                continue
            r.raise_for_status()
            content = _strip(r.json()["choices"][0]["message"]["content"])
            if not content:
                if not is_last:
                    time.sleep(3)
                continue
            data = json.loads(content)
            # Only accept a well-formed payload; anything else is retried.
            if isinstance(data, dict) and isinstance(data.get("scores"), dict):
                for k in rubric_keys:
                    if k not in data["scores"]:
                        data["scores"][k] = 0.5
                return data
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C and
            # SystemExit are not swallowed by the retry loop.
            if not is_last:
                time.sleep(5 * (attempt + 1))
    return None
166
 
167
def judge_prompt(task, response):
    """Assemble the judge prompt: task header, truncated texts, rubric, JSON skeleton."""
    skeleton_items = []
    rubric_lines = []
    for key, spec in task.scoring_rubric.items():
        skeleton_items.append(f'"{key}": ___')
        rubric_lines.append(f' "{key}": {spec["desc"]}')
    skel = ", ".join(skeleton_items)
    rubric = "\n".join(rubric_lines)
    expected = (task.expected_behavior or 'N/A')[:300]
    sections = [
        f"Task: {task.task_id} | {task.pillar} | {task.difficulty}",
        f"Prompt: {task.prompt[:800]}",
        f"Expected: {expected}",
        f"Response: {response[:8000]}",
        "",
        "Rubric:",
        rubric,
        "",
        'Output JSON: {"scores": {' + skel + '}, "comment": "ν•œμ€„ν‰κ°€"}',
    ]
    return "\n".join(sections)
172
+
173
def score(scores, rubric):
    """Return the weighted rubric score on a 0-100 scale, rounded to 2 decimals.

    A rubric key missing from `scores` counts as 0.5.
    """
    total = 0
    for key, spec in rubric.items():
        total += scores.get(key, 0.5) * spec["weight"]
    return round(total * 100, 2)
175
+
176
+ # ══ 닀쀑 λΌμš΄λ“œ 과제 ══
177
+
178
def _mutual(topic, mid):
    """Run four chained rounds (analysis, critique, revision, meta-analysis).

    Each round feeds truncated output of earlier rounds back to the same
    model; the four outputs are returned as one labelled transcript.
    """
    analysis = call_model(f"[R1] '{topic}' 500단어 뢄석.", model_id=mid)
    critique = call_model(f"[R2] λΉ„νŒν•˜λΌ.\n---\n{analysis[:2000]}", model_id=mid)
    revision_prompt = (f"[R3] μˆ˜μ •ν•˜λΌ.\n--- 원문 ---\n{analysis[:1500]}"
                       f"\n--- λΉ„νŒ ---\n{critique[:1500]}")
    revision = call_model(revision_prompt, model_id=mid)
    meta_prompt = (f"[R4] 메타뢄석.\n--- R1 ---\n{analysis[:800]}"
                   f"\n--- R2 ---\n{critique[:800]}\n--- R3 ---\n{revision[:800]}")
    meta = call_model(meta_prompt, model_id=mid)
    rounds = (analysis, critique, revision, meta)
    return "\n\n".join(f"[R{n}]\n{text}" for n, text in enumerate(rounds, start=1))
184
+
185
def _feedback(pj, mid):
    """Run a multi-round feedback task described by a JSON prompt.

    Args:
        pj: JSON text with "topic" and "rounds" (a list of dicts that may
            carry "instruction" and "feedback").
        mid: Model id passed to `call_model`.

    Returns:
        The round outputs joined as "[R1]\\n...\\n\\n[R2]\\n...". If `pj`
        is not valid JSON, or decodes to something other than an object,
        it is sent to the model as a plain prompt instead.
    """
    try:
        data = json.loads(pj)
    except (ValueError, TypeError):
        data = None
    # A prompt such as "42" or "[...]" is valid JSON but not a task spec.
    if not isinstance(data, dict):
        return call_model(pj, model_id=mid)
    topic = data.get("topic", "")
    specs = data.get("rounds", [])
    if not isinstance(specs, list):
        specs = []
    outs, prev = [], ""
    for i, rd in enumerate(specs):
        if not isinstance(rd, dict):
            rd = {}
        inst, fb = rd.get("instruction", ""), rd.get("feedback")
        if i == 0:
            p = f"'{topic}' - {inst}."
        elif fb:
            p = f"ν”Όλ“œλ°± 반영: {inst}.\n--- 이전 ---\n{prev[:2000]}\n--- ν”Όλ“œλ°± ---\n{fb}"
        else:
            p = f"{inst}.\n--- μ΅œμ’… ---\n{prev[:2500]}"
        resp = call_model(p, model_id=mid)
        outs.append(f"[R{i+1}]\n{resp}")
        prev = resp
    return "\n\n".join(outs)
197
+
198
def run_task(task, mid):
    """Dispatch a task to the runner matching its sub-dimension.

    Mutual-verification and feedback-incorporation tasks are multi-round;
    everything else is a single model call with the raw prompt.
    """
    kind = task.sub_dimension
    if kind == "feedback_incorporation":
        return _feedback(task.prompt, mid)
    if kind != "mutual_verification":
        return call_model(task.prompt, model_id=mid)
    # Topic is the first prompt line after the cycle prefix is removed.
    without_prefix = task.prompt.replace("[상생-상극 사이클] ","")
    topic = without_prefix.split("\n")[0]
    return _mutual(topic, mid)
204
+
205
+ # ══ DB 체크포인트 ══
206
+
207
from contextlib import closing

# SQLite file holding one row per (run id, task id) so runs can be resumed.
DB = "final_bench.db"


def _initdb():
    """Create the checkpoint table if it does not exist."""
    # `closing` guarantees close(); the inner `conn` context commits on
    # success and rolls back on error. Previously a failing statement
    # leaked the connection.
    with closing(sqlite3.connect(DB)) as conn, conn:
        conn.execute("CREATE TABLE IF NOT EXISTS r (rid TEXT,tid TEXT,resp TEXT,judge TEXT,score REAL,ts REAL,PRIMARY KEY(rid,tid))")


def _rid(m):
    """Return a 12-hex-character run id derived from the MD5 of `m`."""
    return hashlib.md5(m.encode()).hexdigest()[:12]


def _sv(rid, tid, resp, jdg, sc):
    """Insert or replace the stored result for task `tid` of run `rid`."""
    with closing(sqlite3.connect(DB)) as conn, conn:
        conn.execute("INSERT OR REPLACE INTO r VALUES(?,?,?,?,?,?)",
                     (rid, tid, resp, jdg, sc, time.time()))


def _loadall(rid):
    """Return {task_id: {"response", "judge", "score"}} for run `rid`."""
    with closing(sqlite3.connect(DB)) as conn:
        rows = conn.execute("SELECT tid,resp,judge,score FROM r WHERE rid=?", (rid,)).fetchall()
    return {r[0]: {"response": r[1], "judge": r[2], "score": r[3]} for r in rows}


def _clr(rid):
    """Delete every stored result of run `rid`."""
    with closing(sqlite3.connect(DB)) as conn, conn:
        conn.execute("DELETE FROM r WHERE rid=?", (rid,))


_initdb()
221
+
222
+ # ══ Scores μ €μž₯ + HF μ—…λ‘œλ“œ ══
223
+
224
# Local scores file; the same name is used as the path in the HF dataset repo.
SF = "final_scores.json"


def load_sf():
    """Read the scores file, returning an empty skeleton when it is unusable.

    Returns:
        A dict that always has a dict under "models". A missing, unreadable
        or malformed file yields
        {"version": "1.0", "updated": "", "models": {}}.
    """
    empty = {"version": "1.0", "updated": "", "models": {}}
    try:
        # Explicit UTF-8 instead of the platform default encoding.
        with open(SF, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # ValueError covers JSON and Unicode decoding errors.
        return empty
    if not isinstance(data, dict):
        return empty
    # A file without "models" used to make save_sf raise KeyError.
    if not isinstance(data.get("models"), dict):
        data["models"] = {}
    return data
229
def save_sf(mn, ps, fs, total, done):
    """Merge one model's result into the scores file and write it back.

    Args:
        mn: Model name used as the key under "models".
        ps: Mapping of pillar id to mean pillar score.
        fs: Final weighted score.
        total: Number of tasks in the run.
        done: Number of tasks with a valid (non-negative) score.

    Returns:
        The full scores dict that was written.
    """
    d = load_sf()
    # One timestamp so "updated" and "evaluated_at" cannot differ.
    stamp = datetime.now().isoformat()
    d["updated"] = stamp
    d.setdefault("models", {})[mn] = {
        "final_score": fs,
        # float() keeps NumPy scalars JSON-serialisable whatever their dtype.
        "pillar_scores": {p: round(float(s), 2) for p, s in ps.items()},
        "total_tasks": total,
        "completed": done,
        "evaluated_at": stamp,
    }
    # ensure_ascii=False emits raw non-ASCII, so the encoding must be explicit.
    with open(SF, "w", encoding="utf-8") as f:
        json.dump(d, f, indent=2, ensure_ascii=False)
    return d
235
def upload_sf(d):
    """Upload the scores dict as final_scores.json to the leaderboard dataset repo.

    Returns a status string; it never raises, so a failed upload cannot
    abort the run.
    """
    tk = os.getenv("HF_TOKEN","")
    if not tk:
        return "⚠️ HF_TOKEN λ―Έμ„€μ •"
    try:
        from huggingface_hub import HfApi
        client = HfApi(token=tk)
        body = json.dumps(d,indent=2,ensure_ascii=False).encode("utf-8")
        note = f"FINAL Score {datetime.now().strftime('%m-%d %H:%M')}"
        client.upload_file(
            path_or_fileobj=body,
            path_in_repo="final_scores.json",
            repo_id="FINAL-Bench/ALL-Bench-Leaderboard",
            repo_type="dataset",
            commit_message=note)
    except Exception as e:
        return f"❌ {e}"
    return "βœ… HF Dataset μ—…λ‘œλ“œ μ™„λ£Œ"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
+ # ══ 평가 μ›Œμ»€ ══
248
 
249
def _eval1(task, rid, mid, st):
    """Evaluate one task: query the model, judge the answer, checkpoint the result.

    Returns (task_id, {"response", "judge", "score"}). Score is 0 for model
    API errors and unexpected exceptions, and -1 when the judge call failed.
    """
    tid = task.task_id

    def _persist(response, judge_json, value):
        # Write the checkpoint first, then build the in-memory result tuple.
        _sv(rid, tid, response, judge_json, value)
        return tid, {"response": response, "judge": judge_json, "score": value}

    try:
        resp = run_task(task, mid)
        if resp.startswith("[API_ERROR"):
            outcome = _persist(resp, "{}", 0)
            with st["lock"]:
                st["done"] += 1
                st["err"].append(tid)
            return outcome

        rubric_keys = list(task.scoring_rubric.keys())
        verdict = call_judge(judge_prompt(task, resp), rubric_keys)
        if verdict is None:
            outcome = _persist(resp, '{"failed":true}', -1)
            with st["lock"]:
                st["done"] += 1
                st["jf"] += 1
            return outcome

        sc = score(verdict["scores"], task.scoring_rubric)
        outcome = _persist(resp, json.dumps(verdict, ensure_ascii=False), sc)
        with st["lock"]:
            st["done"] += 1
            st["jok"] += 1
            info = PILLAR_INFO.get(task.pillar,{})
            st["rec"].append(f'{info.get("icon","")} {task.task_id} β†’ {sc:.0f}')
            # Keep only the eight most recent entries for the progress view.
            if len(st["rec"]) > 8:
                st["rec"] = st["rec"][-8:]
        return outcome
    except Exception as e:
        outcome = _persist(f"[ERR]{e}", "{}", 0)
        with st["lock"]:
            st["done"] += 1
            st["err"].append(f"{task.task_id}:{str(e)[:40]}")
        return outcome
 
 
 
 
 
 
 
 
 
 
 
 
 
276
 
277
+ # ══ κΈ€λ‘œλ²Œ μƒνƒœ + λ°±κ·ΈλΌμš΄λ“œ ══
278
 
279
# Shared run state, read by the UI poller and written by the background thread.
_S = {
    # lifecycle flags
    "running": False, "stop": False, "finished": False,
    # identity of the current run
    "model": "", "rid": "",
    # counters and recent-activity buffers
    "done": 0, "total": 0, "cached": 0,
    "err": [], "rec": [], "jok": 0, "jf": 0,
    # start timestamp (0 until the background thread sets it)
    "t0": 0,
    # per-task results and the task list of the run
    "results": {}, "tasks": [],
    # guards every multi-field update
    "lock": threading.Lock(),
    # UI status message, CSV path and upload status
    "msg": "", "csv": None, "hf": "",
}
 
 
 
 
 
 
 
282
 
283
def _rst():
    """Reset the per-run fields of the shared state (the lock itself is kept)."""
    defaults = {
        "running": False, "stop": False, "finished": False,
        "done": 0, "cached": 0,
        "err": [], "rec": [], "jok": 0, "jf": 0,
        "t0": 0, "results": {}, "tasks": [],
        "msg": "", "csv": None, "hf": "",
    }
    with _S["lock"]:
        for key, value in defaults.items():
            _S[key] = value
287
 
288
def _bg(mn, mid, tasks, rid, wk):
    """Background worker: evaluate pending tasks in a thread pool, then finalise.

    Args:
        mn: Model display name (used when saving scores).
        mid: Model id passed to the evaluation workers.
        tasks: Tasks of this run.
        rid: Run id used as the checkpoint key.
        wk: Number of worker threads.

    Tasks already present in the checkpoint DB are not re-run. On a stop
    request the queued tasks are cancelled. The run is marked finished
    only after the in-flight workers have returned, so a new run cannot
    start while old workers are still updating the shared counters.
    """
    from concurrent.futures import FIRST_COMPLETED, wait
    try:
        cached = _loadall(rid)
        pending = [t for t in tasks if t.task_id not in cached]
        with _S["lock"]:
            _S["results"] = cached
            _S["cached"] = len(cached)
            _S["total"] = len(tasks)
            _S["t0"] = time.time()
        if not pending:
            with _S["lock"]:
                _S["msg"] = f"πŸ’Ύ μΊμ‹œ μ™„λ£Œ ({len(cached)}개)"
            _fin(tasks, cached, mn)
            return
        with _S["lock"]:
            _S["msg"] = f"⚑ {len(pending)}문제 Β· {wk}μ›Œμ»€"

        stopped = False
        with ThreadPoolExecutor(max_workers=wk) as ex:
            futs = {ex.submit(_eval1, t, rid, mid, _S): t for t in pending if not _S["stop"]}
            outstanding = set(futs)
            while outstanding:
                if _S["stop"]:
                    # Drop everything still queued; leaving the `with` block
                    # then waits only for tasks already executing.
                    ex.shutdown(wait=False, cancel_futures=True)
                    stopped = True
                    break
                # Wake at least every 0.5 s so a stop request is noticed.
                finished, outstanding = wait(outstanding, timeout=0.5,
                                             return_when=FIRST_COMPLETED)
                for f in finished:
                    try:
                        tid, data = f.result()
                    except Exception as e:
                        # Record the failure instead of silently dropping it.
                        with _S["lock"]:
                            _S["err"].append(f"{futs[f].task_id}:{str(e)[:40]}")
                    else:
                        with _S["lock"]:
                            _S["results"][tid] = data
        if stopped:
            with _S["lock"]:
                _S["msg"] = "⏹️ 쀑단"
                _S["running"] = False
                _S["finished"] = True
            return
        with _S["lock"]:
            results = dict(_S["results"])
        _fin(tasks, results, mn)
    except Exception as e:
        with _S["lock"]:
            _S["msg"] = f"❌ {str(e)[:100]}"
            _S["running"] = False
            _S["finished"] = True
 
 
 
317
 
318
def _fin(tasks, results, mn):
    """Compute pillar and final scores, save and upload them, mark the run finished."""
    def _scored(task):
        # Negative scores are excluded from averages and the done count.
        return task.task_id in results and results[task.task_id]["score"] >= 0

    ps = {}
    for pillar in PILLAR_INFO:
        scores = [results[t.task_id]["score"] for t in tasks
                  if t.pillar == pillar and _scored(t)]
        if scores:
            ps[pillar] = np.mean(scores)

    # Weighted sum over all pillars; a pillar without scores contributes 0.
    weighted = sum(ps.get(pillar, 0) * info["weight"] for pillar, info in PILLAR_INFO.items())
    fs = round(weighted, 2)
    done = sum(1 for t in tasks if _scored(t))

    saved = save_sf(mn, ps, fs, len(tasks), done)
    hf = upload_sf(saved)

    started = _S["t0"]
    el = int(time.time() - started) if started else 0
    with _S["lock"]:
        _S["hf"] = hf
        _S["msg"] = f"🏁 FINAL Score = {fs:.1f} ({el}초)"
        _S["running"] = False
        _S["finished"] = True
332
+
333
def _start(mc, mt, wk, fresh):
    """Validate prerequisites and start a background evaluation of model `mc`.

    Args:
        mc: Model display name; unknown names are used as the model id as-is.
        mt: Number of tasks to take from the front of ALL_TASKS.
        wk: Worker thread count.
        fresh: When true, delete the run's checkpoint rows before starting.

    Returns a one-line status string for the UI.
    """
    if _S["running"]:
        return "⚠️ μ§„ν–‰ 쀑"
    if not os.getenv("HF_TOKEN"):
        return "❌ HF_TOKEN (Secrets)"
    if not os.getenv("OPENAI_API_KEY"):
        return "❌ OPENAI_API_KEY (Secrets)"
    if not ALL_TASKS:
        return "❌ 과제 λ‘œλ“œ μ‹€νŒ¨"

    mid = HF_MODELS.get(mc, mc)
    tasks = ALL_TASKS[:int(mt)]
    rid = _rid(mid)
    if fresh:
        _clr(rid)

    _rst()
    with _S["lock"]:
        _S["running"] = True
        _S["rid"] = rid
        _S["model"] = mc
        _S["tasks"] = tasks
        _S["total"] = len(tasks)
        _S["msg"] = "πŸ”„ μ€€λΉ„..."

    worker = threading.Thread(target=_bg, args=(mc, mid, tasks, rid, int(wk)), daemon=True)
    worker.start()
    return f"⚑ {mc} 평가 μ‹œμž‘ ({len(tasks)}문제, {int(wk)}μ›Œμ»€)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
def _stop():
    """Flag the running evaluation to stop; return a status string for the UI."""
    if not _S["running"]:
        return "ℹ️ μ‹€ν–‰ 쀑 μ•„λ‹˜"
    # The background loop checks this flag between polls.
    _S["stop"] = True
    return "⏹️ 쀑단 μš”μ²­"
351
 
352
+ # ══ UI λΉŒλ” ══
353
 
354
+ CSS = """<style>
355
+ .score-bar{background:#e0e0e0;border-radius:8px;height:18px;overflow:hidden;min-width:80px}
356
+ .score-fill{height:100%;border-radius:8px}
357
+ .summary-card{background:linear-gradient(135deg,#1a1a2e,#16213e);border-radius:14px;padding:24px;color:#fff}
358
+ .pillar-row{display:flex;align-items:center;gap:10px;margin:6px 0}
359
+ .pillar-bar{flex:1;background:#333;border-radius:6px;height:16px;overflow:hidden}
360
+ .pillar-fill{height:100%;border-radius:6px}
361
+ .pbar{background:#e0e0e0;border-radius:8px;height:22px;overflow:hidden}
362
+ .pfill{height:100%;border-radius:8px;background:linear-gradient(90deg,#6366f1,#10b981)}
363
+ </style>"""
364
def _c(s):
    """Map a score to a colour: green from 80, orange from 60, red below."""
    if s >= 80:
        return "#4caf50"
    if s >= 60:
        return "#ff9800"
    return "#f44336"
365
 
366
def _poll():
    """Timer callback returning (progress_html, table_html, summary_html, csv_file).

    All shared state is copied in one locked snapshot, so the counters shown
    belong to the same moment. Text from task data, status messages and
    exception strings is HTML-escaped before being placed in markup.
    Elapsed time is 0 until the background thread has recorded a start time
    (previously a start time of 0 produced an epoch-sized value).
    """
    from html import escape

    with _S["lock"]:
        run, fin = _S["running"], _S["finished"]
        tasks, res = _S.get("tasks", []), dict(_S.get("results", {}))
        msg = _S.get("msg", "")
        d, tot = _S["done"], max(_S.get("total", 1), 1)
        t0 = _S.get("t0", 0)
        rec = list(_S.get("rec", []))
        jok, jf = _S.get("jok", 0), _S.get("jf", 0)
        model, hf = _S.get("model", ""), _S.get("hf", "")
    if not run and not fin and not res:
        return ("ℹ️ λͺ¨λΈ 선택 β†’ ▢️ μ‹œμž‘", "", "", None)

    # Progress
    if run:
        pct = min(int(d/tot*100), 100)
        el = int(time.time() - t0) if t0 else 0
        eta = int((el/max(d, 1))*(tot-d)) if d > 0 else 0
        tags = " ".join([f'<span style="background:#e8eaf6;padding:2px 8px;border-radius:4px;font-size:.8em">{escape(str(r))}</span>' for r in rec[-6:]])
        prog = f'{CSS}<div><div style="display:flex;justify-content:space-between;margin-bottom:4px"><span>⚑ {d}/{tot} | {el}초 | ~{eta}초</span><span style="font-weight:700">{pct}%</span></div><div class="pbar"><div class="pfill" style="width:{pct}%"></div></div><div style="margin-top:6px">{tags}</div><div style="margin-top:4px;font-size:.85em">βš–οΈ βœ…{jok} ❌{jf}</div></div>'
    elif fin:
        prog = f'<div style="background:#e8f5e9;padding:14px;border-radius:8px;font-weight:700">{escape(msg)}</div>'
    else:
        prog = escape(msg)

    # Table
    tbl = ""
    if tasks:
        rows = ""
        for t in tasks:
            info = PILLAR_INFO.get(t.pillar, {})
            tid, diff = escape(str(t.task_id)), escape(str(t.difficulty))
            if t.task_id in res:
                s = res[t.task_id]["score"]
                if s < 0:
                    # Negative score: shown as a failed row without a value.
                    rows += f'<tr style="background:#fff3e0"><td>{tid}</td><td>{info.get("icon","")} {info.get("name","")}</td><td>{diff}</td><td style="color:#ff9800">❌</td><td>β€”</td></tr>'
                else:
                    c = _c(s)
                    rows += f'<tr><td>{tid}</td><td>{info.get("icon","")} {info.get("name","")}</td><td>{diff}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="color:{c};font-weight:700">{s:.1f}</td></tr>'
            else:
                rows += f'<tr style="opacity:.35"><td>{tid}</td><td>{info.get("icon","")}</td><td>{diff}</td><td>⏳</td><td>β€”</td></tr>'
        tbl = f'{CSS}<table style="width:100%;border-collapse:collapse;font-size:.85em"><thead><tr><th style="background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc">ID</th><th style="background:#f0f4f8;padding:8px">κΈ°λ‘₯</th><th style="background:#f0f4f8;padding:8px">λ‚œμ΄λ„</th><th style="background:#f0f4f8;padding:8px">점수</th><th style="background:#f0f4f8;padding:8px">κ°’</th></tr></thead><tbody>{rows}</tbody></table>'

    # Summary
    summ = ""
    if fin and tasks:
        ps = {}
        for p in PILLAR_INFO:
            valid = [res[t.task_id]["score"] for t in tasks
                     if t.pillar == p and t.task_id in res and res[t.task_id]["score"] >= 0]
            if valid:
                ps[p] = np.mean(valid)
        wts = {p: info["weight"] for p, info in PILLAR_INFO.items()}
        fs = round(sum(ps.get(p, 0)*w for p, w in wts.items()), 2)
        g = "A" if fs >= 80 else ("B+" if fs >= 70 else ("B" if fs >= 60 else "C"))
        ph = ""
        for p, info in PILLAR_INFO.items():
            s = ps.get(p, 0); c = _c(s); w = int(info["weight"]*100)
            ph += f'<div class="pillar-row"><span style="width:140px">{info["icon"]} {info["name"]} ({w}%)</span><div class="pillar-bar"><div class="pillar-fill" style="width:{min(s,100)}%;background:{c}"></div></div><span style="width:55px;text-align:right;font-weight:700;color:{c}">{s:.1f}</span></div>'
        summ = f'{CSS}<div class="summary-card"><h2 style="margin:0;font-size:1.8em;text-align:center">🧬 FINAL Score: {fs:.1f}/100</h2><h3 style="text-align:center;color:#aaa">{g} | {escape(str(model))}</h3><hr style="border-color:#333;margin:16px 0">{ph}<hr style="border-color:#333;margin:16px 0"><p style="font-size:.85em;color:#888">{escape(str(hf))}</p></div>'
    return (prog, tbl, summ, None)
414
+
415
+ # ══ Gradio App ══
416
+
417
+ HEADER = """<div style="text-align:center;padding:20px 0">
418
+ <h1 style="margin:0;font-size:2em">🧬 FINAL Bench Auto-Evaluator</h1>
419
+ <p style="color:#666;max-width:700px;margin:10px auto;line-height:1.7">
420
+ <b>FINAL Bench 100문제</b> Γ— ALL Bench λ“±μž¬ λͺ¨λΈ μžλ™ 평가<br>
421
+ πŸ“‘ HF Inference API Β· βš–οΈ GPT-5.2 Judge Β· πŸ“Š β†’ ALL Bench Metacog μžλ™ 반영
422
+ </p></div>"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
 
424
  def create_app():
425
  with gr.Blocks(title="FINAL Bench Auto-Evaluator", theme=gr.themes.Soft(),
426
  css=".gradio-container{max-width:1100px !important}") as app:
427
+ gr.HTML(HEADER)
 
 
 
 
 
 
 
 
 
 
 
 
428
  with gr.Row():
429
+ mdd = gr.Dropdown(list(HF_MODELS.keys()), value=list(HF_MODELS.keys())[0],
430
+ label="πŸ€– 평가 λŒ€μƒ λͺ¨λΈ", scale=4)
431
+ mt = gr.Slider(1, len(ALL_TASKS) if ALL_TASKS else 100,
432
+ value=len(ALL_TASKS) if ALL_TASKS else 100, step=1, label="과제 수", scale=2)
433
+ wk = gr.Slider(1, 15, value=8, step=1, label="⚑ μ›Œμ»€", scale=1)
 
 
 
 
434
  with gr.Row():
435
+ sb = gr.Button("▢️ μ΄μ–΄ν•˜κΈ°", variant="primary", size="lg", scale=2)
436
+ fb = gr.Button("πŸš€ μƒˆλ‘œ μ‹œμž‘", variant="secondary", size="lg", scale=2)
437
+ xb = gr.Button("⏹️ 쀑단", variant="stop", size="lg", scale=1)
438
+ st = gr.Textbox(label="μƒνƒœ", interactive=False, max_lines=1)
439
+ with gr.Accordion("πŸ“Š κΈ°μ‘΄ κ²°κ³Ό", open=False):
440
+ gr.JSON(label="final_scores.json", value=load_sf())
 
 
 
 
 
 
 
 
 
 
 
441
  with gr.Tabs():
442
+ with gr.Tab("πŸ“Š μ§„ν–‰"): p1=gr.HTML()
443
+ with gr.Tab("πŸ“‹ κ²°κ³Όν‘œ"): p2=gr.HTML()
444
+ with gr.Tab("πŸ† μ΅œμ’…"): p3=gr.HTML()
445
+ with gr.Tab("πŸ’Ύ CSV"): p4=gr.File(label="CSV")
 
 
 
 
 
 
446
  timer = gr.Timer(value=2, active=True)
447
+ timer.tick(fn=_poll, outputs=[p1,p2,p3,p4])
448
+ sb.click(fn=lambda m,t,w: _start(m,t,w,False), inputs=[mdd,mt,wk], outputs=[st])
449
+ fb.click(fn=lambda m,t,w: _start(m,t,w,True), inputs=[mdd,mt,wk], outputs=[st])
450
+ xb.click(fn=_stop, outputs=[st])
451
+ gr.Markdown(f"---\n<center>FINAL Bench v1.0 · {len(ALL_TASKS)}문제 · Ginigen AI · Apache 2.0</center>")
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  return app
453
 
 
 
 
 
 
454
  if __name__ == "__main__":
455
  stats = {}
456
+ for t in ALL_TASKS: stats[t.pillar]=stats.get(t.pillar,0)+1
457
+ print(f"🧬 FINAL Bench Auto-Evaluator: {len(ALL_TASKS)} tasks")
458
+ for p,n in stats.items():
459
+ info=PILLAR_INFO[p]; print(f" {info['icon']} {info['name']}: {n}")
460
+ print(f" πŸ“‘ HF Models: {len(HF_MODELS)} | βš–οΈ Judge: GPT-5.2")
461
+ print(f" πŸ”‘ HF_TOKEN: {'βœ…' if os.getenv('HF_TOKEN') else '❌'} | OPENAI_API_KEY: {'βœ…' if os.getenv('OPENAI_API_KEY') else '❌'}")
 
 
462
  app = create_app()
463
  app.queue(default_concurrency_limit=2)
464
  app.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)