"""
FINAL Bench v3.1 — AGI-Level Verification System + Local Model Eval
Frontier Intelligence Nexus for AGI-Level Verification
★ Non-AGI vs Proto-AGI comparative evaluation
★ 100 FINAL Tasks + 500 SWE-bench Verified Tasks
★ GPT-5.2 Eval + GPT-5.2 Structured Output Judge
★ Proto-AGI Five Elements (Wuxing) pipeline: 木→火→土→金→水 (Wood→Fire→Earth→Metal→Water)
★ Local Model Support: Darwin-gpt-ernie-20b (vLLM)
★ SWE-bench_Verified Dataset Integration
Author: Ginigen AI (지니젠AI) — Choi Sunyoung
License: Apache 2.0
"""
import json, os, time, csv, io, re, html, hashlib, sqlite3, threading, subprocess, signal
from datetime import datetime
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Optional
import requests
import numpy as np
import gradio as gr
from concurrent.futures import ThreadPoolExecutor
# ════════════════════════════════════════════════════════════════
# §1. Data Structures
# ════════════════════════════════════════════════════════════════
DOMAIN_INFO = {
"Mathematics & Logic": {"icon":"🔢","color":"#FF6B35"},
"Science": {"icon":"🔬","color":"#7B2FF7"},
"Philosophy": {"icon":"🤔","color":"#00B4D8"},
"Medicine": {"icon":"🏥","color":"#2EC4B6"},
"Economics": {"icon":"📈","color":"#E63946"},
"History": {"icon":"📜","color":"#F4A261"},
"War & Security": {"icon":"🛡️","color":"#264653"},
"Space & Physics": {"icon":"🚀","color":"#6C63FF"},
"Chemistry & Biology": {"icon":"🧬","color":"#06D6A0"},
"Language & Writing": {"icon":"✍️","color":"#EF476F"},
"Literature": {"icon":"📖","color":"#8338EC"},
"Art": {"icon":"🎨","color":"#FF006E"},
"Religion & Mythology": {"icon":"🕊️","color":"#FFD166"},
"Ethics": {"icon":"⚖️","color":"#118AB2"},
"AI & Technology": {"icon":"🤖","color":"#073B4C"},
# SWE-bench domains
"Software Engineering": {"icon":"💻","color":"#00897B"},
"Bug Fix": {"icon":"🐛","color":"#D84315"},
"Code Patch": {"icon":"🔧","color":"#5E35B1"},
}
GRADE_WEIGHT = {"A": 1.5, "B": 1.0, "C": 0.7}
RUBRIC = {
"process_quality": {"weight":0.25, "desc":"Systematic reasoning transparency"},
"metacognitive_accuracy": {"weight":0.25, "desc":"Confidence calibration + uncertainty honesty"},
"error_recovery": {"weight":0.20, "desc":"Mid-analysis self-correction"},
"integration_depth": {"weight":0.15, "desc":"Multi-perspective synthesis + emergent insights"},
"final_correctness": {"weight":0.15, "desc":"Answer accuracy and completeness"},
}
AXIS_MAP = {
"generalization": {"rubrics":["process_quality","final_correctness"], "ticos":[]},
"reasoning": {"rubrics":["process_quality","error_recovery"], "ticos":["E_SelfCorrecting","C_ProgressiveDiscovery"]},
"planning": {"rubrics":["integration_depth","process_quality"],"ticos":["D_MultiConstraint","H_DecisionUnderUncertainty"]},
"reliability": {"rubrics":["metacognitive_accuracy"], "ticos":["E_SelfCorrecting","G_PivotDetection"]},
"safety": {"rubrics":["error_recovery","metacognitive_accuracy"],"ticos":["A_TrapEscape","G_PivotDetection"]},
}
AGI_STAGES = [
{"stage":1,"name":"FINAL-Partial","label":"Partial Intelligence", "min":0, "max":39, "color":"#f44336"},
{"stage":2,"name":"FINAL-Proto", "label":"Proto Intelligence", "min":40,"max":59, "color":"#ff9800"},
{"stage":3,"name":"FINAL-Pre", "label":"Pre-AGI", "min":60,"max":79, "color":"#2196f3"},
{"stage":4,"name":"FINAL-Pass", "label":"AGI Achieved", "min":80,"max":94, "color":"#4caf50"},
{"stage":5,"name":"FINAL-Post", "label":"Operationally Mature AGI","min":95,"max":100,"color":"#9c27b0"},
]
@dataclass
class FinalTask:
task_id:str; domain:str; grade:str; ticos_type:str
difficulty:str; lens:str; title:str; prompt:str
expected_behavior:str; hidden_trap:str
ticos_required:List[str]=field(default_factory=list)
metadata:Dict=field(default_factory=dict)
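# Minimal construction sketch (all field values below are illustrative, not taken
# from the shipped dataset):
#   t = FinalTask(task_id="MATH_001", domain="Mathematics & Logic", grade="A",
#                 ticos_type="A_TrapEscape", difficulty="hard", lens="proof",
#                 title="Example", prompt="...", expected_behavior="...",
#                 hidden_trap="...")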
# ════════════════════════════════════════════════════════════════
# §1.5 SWE-bench Verified Dataset Loader
# ════════════════════════════════════════════════════════════════
SWE_BENCH_TASKS: List[FinalTask] = []
def _load_swe_bench_verified():
"""SWE-bench_Verified 데이터셋을 HuggingFace에서 로드하여 FinalTask 형식으로 변환"""
global SWE_BENCH_TASKS
try:
from datasets import load_dataset
print("📦 Loading SWE-bench_Verified from HuggingFace...")
ds = load_dataset("SWE-bench/SWE-bench_Verified", split="test")
print(f" ✅ Loaded {len(ds)} SWE-bench instances")
        # Difficulty mapping
diff_map = {"15 min fix": "easy", "15 min - 1 hour": "medium",
"1-4 hours": "hard", "4+ hours": "expert"}
        # TICOS type mapping (tailored to SWE-bench characteristics)
ticos_types = [
"E_SelfCorrecting", # 버그 수정 = 자기교정
"D_MultiConstraint", # 다중 제약조건 해결
"C_ProgressiveDiscovery", # 점진적 발견
"A_TrapEscape", # 함정 탈출 (edge case)
]
        # Grade assignment: based on difficulty
grade_map = {"15 min fix": "C", "15 min - 1 hour": "B",
"1-4 hours": "A", "4+ hours": "A"}
tasks = []
for i, item in enumerate(ds):
instance_id = item.get("instance_id", f"swe_{i:04d}")
repo = item.get("repo", "unknown")
problem = item.get("problem_statement", "")
patch = item.get("patch", "")
test_patch = item.get("test_patch", "")
hints = item.get("hints_text", "")
difficulty_raw = item.get("difficulty", "15 min - 1 hour")
version = item.get("version", "")
fail_to_pass = item.get("FAIL_TO_PASS", "")
base_commit = item.get("base_commit", "")
            # Convert to FinalTask
difficulty = diff_map.get(difficulty_raw, "medium")
grade = grade_map.get(difficulty_raw, "B")
ticos = ticos_types[i % len(ticos_types)]
            # Determine domain (based on repo)
if "django" in repo.lower():
domain = "Software Engineering"
elif "astropy" in repo.lower() or "scipy" in repo.lower() or "sympy" in repo.lower():
domain = "Science"
elif "matplotlib" in repo.lower():
domain = "Art"
else:
domain = "Software Engineering"
            # Build the prompt: problem description + code context
prompt_text = (
f"## Software Bug Fix Task\n"
f"**Repository**: {repo} (version {version})\n"
f"**Base Commit**: {base_commit[:12]}...\n\n"
f"### Problem Description:\n{problem[:3000]}\n\n"
)
if hints:
prompt_text += f"### Hints:\n{hints[:1000]}\n\n"
prompt_text += (
f"### Requirements:\n"
f"1. Analyze the bug described above\n"
f"2. Identify the root cause in the codebase\n"
f"3. Propose a minimal, correct patch\n"
f"4. Explain why the fix is correct\n"
f"5. Identify potential edge cases or regressions\n"
f"6. State your confidence level for each claim\n"
)
            # Expected behavior = the reference patch
expected = f"Correct patch:\n{patch[:2000]}"
            # Hidden trap = tests that must flip from fail to pass
hidden = f"Tests that must pass after fix: {fail_to_pass[:500]}"
task = FinalTask(
task_id=f"SWE_{instance_id}",
domain=domain,
grade=grade,
ticos_type=ticos,
difficulty=difficulty,
lens="code_analysis",
title=f"[{repo}] {instance_id}",
prompt=prompt_text,
expected_behavior=expected,
hidden_trap=hidden,
ticos_required=[ticos],
metadata={
"source": "SWE-bench_Verified",
"repo": repo,
"instance_id": instance_id,
"base_commit": base_commit,
"version": version,
"difficulty_raw": difficulty_raw,
"patch": patch, # 정답 패치 보관
"test_patch": test_patch, # 테스트 패치 보관
}
)
tasks.append(task)
SWE_BENCH_TASKS = tasks
print(f" ✅ Converted {len(tasks)} SWE-bench tasks to FinalTask format")
        # Statistics
repos = {}
for t in tasks:
r = t.metadata.get("repo", "?")
repos[r] = repos.get(r, 0) + 1
print(f" 📊 Repos: {dict(sorted(repos.items(), key=lambda x:-x[1])[:10])}")
grades = {}
for t in tasks:
grades[t.grade] = grades.get(t.grade, 0) + 1
print(f" 📊 Grades: {grades}")
return tasks
except ImportError:
print("⚠️ 'datasets' library not installed. Run: pip install datasets")
return []
except Exception as e:
print(f"❌ SWE-bench loading failed: {e}")
return []
# ════════════════════════════════════════════════════════════════
# §1.6 Local Model Server (vLLM for Darwin-gpt-ernie-20b)
# ════════════════════════════════════════════════════════════════
LOCAL_MODEL_CONFIG = {
"model_id": "seawolf2357/Darwin-gpt-ernie-20b",
"base_models": ["openai/gpt-oss-20b", "baidu/ERNIE-4.5-21B-A3B-Thinking"],
"merge_ratio": 0.50,
"params": "21B",
"active_params": "3.6B (MoE)",
"min_vram": "16GB",
"server_port": 8000,
"server_process": None,
"server_ready": False,
"server_starting": False,
"gpu_detected": False,
"gpu_count": 0,
"gpu_info": "",
"gpu_all": [],
"total_vram_mb": 0,
"active_config": {},
"auto_start_attempted": False,
}
def _detect_gpu():
"""GPU 감지 — 개수 + VRAM 정보"""
try:
result = subprocess.run(["nvidia-smi", "--query-gpu=name,memory.total,memory.free",
"--format=csv,noheader,nounits"],
capture_output=True, text=True, timeout=10)
if result.returncode == 0 and result.stdout.strip():
lines = [l.strip() for l in result.stdout.strip().split('\n') if l.strip()]
LOCAL_MODEL_CONFIG["gpu_detected"] = True
LOCAL_MODEL_CONFIG["gpu_count"] = len(lines)
LOCAL_MODEL_CONFIG["gpu_info"] = lines[0] # 첫 GPU 정보만 저장
LOCAL_MODEL_CONFIG["gpu_all"] = lines # 전체 GPU 목록
# 총 VRAM 계산
total_vram = 0
for line in lines:
parts = [p.strip() for p in line.split(',')]
if len(parts) >= 2:
try: total_vram += int(parts[1])
except: pass
LOCAL_MODEL_CONFIG["total_vram_mb"] = total_vram
print(f" 🖥️ GPU detected: {len(lines)}x {lines[0]}")
print(f" 💾 Total VRAM: {total_vram/1024:.1f} GB")
return True
except:
pass
LOCAL_MODEL_CONFIG["gpu_detected"] = False
LOCAL_MODEL_CONFIG["gpu_count"] = 0
print(" ⚠️ No GPU detected (nvidia-smi failed)")
return False
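# Expected nvidia-smi CSV rows look like this (values illustrative):
#   NVIDIA A100-SXM4-80GB, 81920, 81000
# i.e. name, total MiB, free MiB; rows that fail to parse are simply skipped.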
def _probe_vllm_server(port=None):
"""vLLM 서버가 이미 실행 중인지 확인"""
port = port or LOCAL_MODEL_CONFIG["server_port"]
try:
r = requests.get(f"http://localhost:{port}/health", timeout=5)
if r.status_code == 200:
return True
except:
pass
    # Also check the v1/models endpoint
try:
r = requests.get(f"http://localhost:{port}/v1/models", timeout=5)
if r.status_code == 200:
return True
except:
pass
return False
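# Equivalent manual probe (assuming the default port 8000):
#   curl -s http://localhost:8000/health      -> HTTP 200 when the server is up
#   curl -s http://localhost:8000/v1/models   -> JSON list of loaded models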
def _auto_detect_server():
"""앱 시작 시 vLLM 서버 자동 감지"""
port = LOCAL_MODEL_CONFIG["server_port"]
if _probe_vllm_server(port):
LOCAL_MODEL_CONFIG["server_ready"] = True
print(f" ✅ vLLM server auto-detected on port {port}")
        # Check which models are loaded
try:
r = requests.get(f"http://localhost:{port}/v1/models", timeout=5)
if r.status_code == 200:
models = r.json().get("data", [])
if models:
model_ids = [m.get("id", "?") for m in models]
print(f" 📦 Loaded models: {model_ids}")
except:
pass
return True
return False
def _start_local_model_server(model_id=None, gpu_memory_utilization=0.95, max_model_len=4096):
"""vLLM 서버로 Darwin-gpt-ernie-20b 로컬 서빙 시작 — 자동 TP + OOM 폴백"""
global LOCAL_MODEL_CONFIG
if model_id:
LOCAL_MODEL_CONFIG["model_id"] = model_id
mid = LOCAL_MODEL_CONFIG["model_id"]
port = LOCAL_MODEL_CONFIG["server_port"]
    # Check whether it is already running
if _probe_vllm_server(port):
LOCAL_MODEL_CONFIG["server_ready"] = True
LOCAL_MODEL_CONFIG["server_starting"] = False
return f"✅ Local model server already running on port {port}"
    # If it is already starting, just wait
if LOCAL_MODEL_CONFIG["server_starting"]:
return "⏳ Server is starting... please wait"
    # Check GPU
if not _detect_gpu():
return "❌ No GPU detected. vLLM requires GPU (nvidia-smi failed)"
LOCAL_MODEL_CONFIG["server_starting"] = True
gpu_count = LOCAL_MODEL_CONFIG.get("gpu_count", 1)
total_vram = LOCAL_MODEL_CONFIG.get("total_vram_mb", 48000)
print(f"🚀 Starting vLLM server for {mid}...")
print(f" GPUs: {gpu_count}x | Total VRAM: {total_vram/1024:.1f} GB")
try:
        # ★ First check that vLLM can actually be imported
check = subprocess.run(["python", "-c", "import vllm; print(vllm.__version__)"],
capture_output=True, text=True, timeout=30)
if check.returncode != 0:
LOCAL_MODEL_CONFIG["server_starting"] = False
return f"❌ vLLM not available: {check.stderr[:300]}"
print(f" vLLM version: {check.stdout.strip()}")
        # ★ Configs to try (progressively more memory-conservative)
        # A 21B model in bf16 is ~42GB, so a single 48GB GPU leaves little room for the KV cache
configs = []
if gpu_count >= 4:
            # 4 GPUs: TP=4, ~10.5GB of weights per GPU, ample KV cache
configs.append({"tp": 4, "mem": 0.90, "maxlen": 8192, "dtype": "auto", "label": "TP=4, 8K ctx"})
configs.append({"tp": 4, "mem": 0.95, "maxlen": 4096, "dtype": "auto", "label": "TP=4, 4K ctx"})
configs.append({"tp": 2, "mem": 0.95, "maxlen": 4096, "dtype": "auto", "label": "TP=2, 4K ctx"})
elif gpu_count >= 2:
            # 2 GPUs: TP=2, ~21GB of weights per GPU
configs.append({"tp": 2, "mem": 0.95, "maxlen": 4096, "dtype": "auto", "label": "TP=2, 4K ctx"})
configs.append({"tp": 2, "mem": 0.95, "maxlen": 2048, "dtype": "auto", "label": "TP=2, 2K ctx"})
else:
            # 1x 48GB GPU: 21B bf16 is tight, so use quantization or a short context
configs.append({"tp": 1, "mem": 0.95, "maxlen": 2048, "dtype": "auto", "label": "1GPU, 2K ctx"})
configs.append({"tp": 1, "mem": 0.95, "maxlen": 1024, "dtype": "auto", "label": "1GPU, 1K ctx"})
configs.append({"tp": 1, "mem": 0.95, "maxlen": 2048, "dtype": "half", "label": "1GPU, fp16, 2K"})
for ci, cfg in enumerate(configs):
print(f"\n 🔄 Attempt {ci+1}/{len(configs)}: {cfg['label']}")
cmd = [
"python", "-m", "vllm.entrypoints.openai.api_server",
"--model", mid,
"--port", str(port),
"--gpu-memory-utilization", str(cfg["mem"]),
"--max-model-len", str(cfg["maxlen"]),
"--trust-remote-code",
"--dtype", cfg["dtype"],
"--enforce-eager",
]
if cfg["tp"] > 1:
cmd.extend(["--tensor-parallel-size", str(cfg["tp"])])
print(f" CMD: {' '.join(cmd)}")
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
preexec_fn=os.setsid
)
LOCAL_MODEL_CONFIG["server_process"] = proc
            # Wait for the server to become ready (up to 10 minutes)
max_wait = 120
started = False
crashed = False
crash_msg = ""
for i in range(max_wait):
time.sleep(5)
if proc.poll() is not None:
stderr = proc.stderr.read().decode()[-2000:]
stdout = proc.stdout.read().decode()[-1000:]
crashed = True
crash_msg = stderr + stdout
                    # Check for OOM
if "No available memory" in crash_msg or "CUDA out of memory" in crash_msg:
print(f" ⚠️ OOM with {cfg['label']} — trying next config...")
else:
print(f" ❌ Crashed (non-OOM): {crash_msg[-300:]}")
break
if _probe_vllm_server(port):
started = True
LOCAL_MODEL_CONFIG["server_ready"] = True
LOCAL_MODEL_CONFIG["server_starting"] = False
LOCAL_MODEL_CONFIG["active_config"] = cfg
elapsed = (i+1)*5
print(f" ✅ vLLM server ready! Config: {cfg['label']} ({elapsed}s)")
return (f"✅ Server started: {mid}\n"
f"Config: {cfg['label']} | TP={cfg['tp']} | "
f"MaxLen={cfg['maxlen']} | {elapsed}s")
if (i+1) % 12 == 0:
print(f" ⏳ Still waiting... ({(i+1)*5}s)")
if started:
break
if not crashed:
                # Timed out
try:
os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
proc.wait(timeout=10)
except: pass
print(f" ⚠️ Timeout with {cfg['label']}")
            # Clean up before the next attempt
try:
if proc.poll() is None:
os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
proc.wait(timeout=5)
except: pass
time.sleep(3)
        # All attempts failed
LOCAL_MODEL_CONFIG["server_starting"] = False
return (f"❌ All vLLM configurations failed on {gpu_count}x GPU.\n"
f"Last error: {crash_msg[-300:]}\n"
f"💡 Try: (1) Restart Space to free GPU memory, "
f"(2) Use smaller max-model-len, "
f"(3) Check model compatibility with vLLM {check.stdout.strip()}")
except FileNotFoundError:
LOCAL_MODEL_CONFIG["server_starting"] = False
return "❌ vLLM not installed. Run: pip install vllm"
except Exception as e:
LOCAL_MODEL_CONFIG["server_starting"] = False
return f"❌ Server start failed: {e}"
def _stop_local_model_server():
"""vLLM 서버 종료"""
global LOCAL_MODEL_CONFIG
proc = LOCAL_MODEL_CONFIG.get("server_process")
if proc and proc.poll() is None:
try:
os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
proc.wait(timeout=10)
except:
try: proc.kill()
except: pass
LOCAL_MODEL_CONFIG["server_process"] = None
LOCAL_MODEL_CONFIG["server_ready"] = False
LOCAL_MODEL_CONFIG["server_starting"] = False
return "⏹️ Local model server stopped"
LOCAL_MODEL_CONFIG["server_ready"] = False
LOCAL_MODEL_CONFIG["server_starting"] = False
return "ℹ️ No server running"
def call_local_model(prompt, system="", max_tokens=8192, temperature=0.6):
"""로컬 vLLM 서버에 요청 (OpenAI-compatible API)"""
port = LOCAL_MODEL_CONFIG["server_port"]
    # ★ Auto-detect the server (it may be running even if server_ready is False)
if not LOCAL_MODEL_CONFIG["server_ready"]:
if _probe_vllm_server(port):
LOCAL_MODEL_CONFIG["server_ready"] = True
print(f" ✅ vLLM server re-detected on port {port}")
else:
return ("[LOCAL_ERROR] vLLM server not running on port {port}. "
"Click '🚀 Start vLLM Server' or check that vLLM process is active.")
headers = {"Content-Type": "application/json"}
    # ★ Resolve the actual model name loaded in vLLM (on first call)
model_name = LOCAL_MODEL_CONFIG.get("_actual_vllm_model")
if not model_name:
try:
r = requests.get(f"http://localhost:{port}/v1/models", timeout=5)
if r.status_code == 200:
models = r.json().get("data", [])
if models:
model_name = models[0].get("id", LOCAL_MODEL_CONFIG["model_id"])
LOCAL_MODEL_CONFIG["_actual_vllm_model"] = model_name
print(f" 📦 Using vLLM model: {model_name}")
except:
pass
if not model_name:
model_name = LOCAL_MODEL_CONFIG["model_id"]
    # ★ Input length limit: fit input+output within max_model_len
active_cfg = LOCAL_MODEL_CONFIG.get("active_config", {})
model_max_len = active_cfg.get("maxlen", 4096)
    # Roughly 4 chars ≈ 1 token; allow input up to 60% of max_model_len
    max_input_chars = int(model_max_len * 0.6 * 4)  # e.g., 4096 * 0.6 * 4 = 9830
    max_tokens = min(max_tokens, int(model_max_len * 0.4))  # output gets the remaining 40%
total_input = (system or "") + prompt
if len(total_input) > max_input_chars:
if system and len(system) > max_input_chars // 3:
system = system[:max_input_chars // 3] + "\n[...truncated...]"
remaining = max_input_chars - len(system or "")
if len(prompt) > remaining:
prompt = prompt[:remaining] + "\n[...truncated for context length...]"
messages = []
if system:
messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": prompt})
payload = {
"model": model_name,
"messages": messages,
"max_tokens": max_tokens,
"temperature": temperature,
}
for attempt in range(3):
try:
r = requests.post(
f"http://localhost:{port}/v1/chat/completions",
headers=headers,
data=json.dumps(payload),
                timeout=600  # 10 minutes, for long responses
)
if r.status_code == 400:
err = ""
try: err = r.json().get("error", {}).get("message", r.text[:500])
except: err = r.text[:500]
print(f" ⚠️ vLLM 400 (attempt {attempt+1}): {err[:300]}")
                # ★ Token-limit error: halve max_tokens and retry
if "max_tokens" in str(err).lower() or "too many tokens" in str(err).lower():
payload["max_tokens"] = min(max_tokens // 2, 4096)
if attempt < 2: continue
                # ★ Model-name mismatch: re-check via v1/models
if "model" in str(err).lower():
try:
mr = requests.get(f"http://localhost:{port}/v1/models", timeout=5)
if mr.status_code == 200:
models = mr.json().get("data", [])
if models:
payload["model"] = models[0]["id"]
LOCAL_MODEL_CONFIG["_actual_vllm_model"] = models[0]["id"]
if attempt < 2: continue
except: pass
return f"[LOCAL_ERROR] 400: {err[:300]}"
if r.status_code == 503:
                # Server overloaded
print(f" ⏳ vLLM 503 overloaded (attempt {attempt+1})")
if attempt < 2: time.sleep(10 * (attempt+1)); continue
return "[LOCAL_ERROR] Server overloaded (503)"
r.raise_for_status()
c = r.json()["choices"][0]["message"]["content"]
return c if c else "[EMPTY]"
except requests.exceptions.ConnectionError:
print(f" ⚠️ vLLM connection error (attempt {attempt+1})")
            # Re-probe the server
if _probe_vllm_server(port):
if attempt < 2: time.sleep(3); continue
else:
LOCAL_MODEL_CONFIG["server_ready"] = False
return "[LOCAL_ERROR] Connection refused. vLLM server may have crashed."
except requests.exceptions.ReadTimeout:
print(f" ⚠️ vLLM timeout (attempt {attempt+1})")
if attempt < 2: time.sleep(5); continue
return "[LOCAL_ERROR] Request timeout (600s). Task may be too complex."
        except Exception as e:
            print(f" ⚠️ vLLM exception (attempt {attempt+1}): {str(e)[:200]}")
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[LOCAL_ERROR] {e}"
    return "[LOCAL_ERROR] Exhausted retries"
# ═══════════════════════════════════════════════════════════
# ★ Auto-detect + auto-start vLLM at app startup
# ═══════════════════════════════════════════════════════════
def _auto_boot_vllm():
"""앱 부팅 시 vLLM 서버 자동 시작 (백그라운드)"""
# 1) 이미 실행 중인지 확인
if _auto_detect_server():
return
    # 2) Detect GPU
if not _detect_gpu():
print(" ℹ️ No GPU → vLLM auto-start skipped")
return
    # 3) Check that vLLM is installed
try:
check = subprocess.run(["python", "-c", "import vllm"],
capture_output=True, timeout=10)
if check.returncode != 0:
print(" ℹ️ vLLM not installed → auto-start skipped")
return
except:
return
    # 4) Auto-start
print(" 🚀 Auto-starting vLLM server for Darwin model...")
LOCAL_MODEL_CONFIG["auto_start_attempted"] = True
result = _start_local_model_server()
print(f" 📋 Auto-start result: {result}")
# ★ Runs at app startup (background thread, non-blocking)
print("🔍 Checking for local vLLM server...")
if _auto_detect_server():
print(" ✅ vLLM server found and ready!")
elif _detect_gpu():
print(" 🖥️ GPU available — will auto-start vLLM in background")
threading.Thread(target=_auto_boot_vllm, daemon=True, name="vLLM-AutoBoot").start()
else:
print(" ℹ️ No GPU detected — using OpenAI/HF Inference models")
def _test_hf_model(model_id, hf_token):
"""HF 모델 사전 검증 — 어떤 API 엔드포인트가 작동하는지 확인"""
if not hf_token:
return False, "no_token", "HF_TOKEN required"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {hf_token}"}
test_messages = [{"role": "user", "content": "Say OK"}]
test_payload = {"model": model_id, "messages": test_messages, "max_tokens": 20, "stream": False}
# 1) router.huggingface.co (Inference Providers — OpenAI-compatible)
try:
r = requests.post("https://router.huggingface.co/v1/chat/completions",
headers=headers, data=json.dumps(test_payload), timeout=30)
if r.status_code == 200:
return True, "router", "OK"
err = ""
try:
rj = r.json()
e = rj.get("error", "")
err = e.get("message", str(e)) if isinstance(e, dict) else str(e)
except:
try: err = r.text[:300]
except: err = str(r.status_code)
print(f" ℹ️ HF router test ({model_id}): {r.status_code}{str(err)[:200]}")
except Exception as e:
print(f" ℹ️ HF router exception: {str(e)[:200]}")
# 2) api-inference.huggingface.co (Serverless Inference API)
try:
inf_payload = {"inputs": "Say OK", "parameters": {"max_new_tokens": 20}}
r2 = requests.post(f"https://api-inference.huggingface.co/models/{model_id}",
headers=headers, data=json.dumps(inf_payload), timeout=30)
if r2.status_code == 200:
return True, "serverless", "OK"
err2 = ""
try:
rj2 = r2.json()
e2 = rj2.get("error", "")
err2 = e2.get("message", str(e2)) if isinstance(e2, dict) else str(e2)
except:
try: err2 = r2.text[:300]
except: err2 = str(r2.status_code)
print(f" ℹ️ HF serverless test ({model_id}): {r2.status_code}{str(err2)[:200]}")
except Exception as e:
print(f" ℹ️ HF serverless exception: {str(e)[:200]}")
    # 3) HF Inference Endpoints (dedicated): user-specific endpoints
    # Skipped here since they use a different URL
return False, "none", f"Model {model_id} not available on HF Inference"
_HF_ENDPOINT_CACHE = {} # {model_id: "router" | "serverless" | "none"}
def call_hf_inference(prompt, system="", model_id=None, hf_token=None, max_tokens=4096):
"""HuggingFace Inference API — 자동 엔드포인트 탐색 + 에러 바디 캡처"""
mid = model_id or LOCAL_MODEL_CONFIG["model_id"]
token = hf_token or os.getenv("HF_TOKEN", "")
if not token:
return "[HF_ERROR] HF_TOKEN required for Inference API"
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}
    # ★ Auto-discover the endpoint (cached)
if mid not in _HF_ENDPOINT_CACHE:
ok, endpoint_type, msg = _test_hf_model(mid, token)
_HF_ENDPOINT_CACHE[mid] = endpoint_type
if not ok:
print(f" ❌ HF model {mid} not available: {msg}")
endpoint_type = _HF_ENDPOINT_CACHE.get(mid, "none")
    # ★ Prompt length limit (HF Inference enforces strict input-token limits)
    MAX_PROMPT_CHARS = 12000  # ~3000 tokens
if len(prompt) > MAX_PROMPT_CHARS:
prompt = prompt[:MAX_PROMPT_CHARS] + "\n\n[... truncated for API limits ...]"
if system and len(system) > 2000:
system = system[:2000]
# ── Router (OpenAI-compatible) ──
if endpoint_type == "router":
messages = []
if system: messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": prompt})
payload = {"model": mid, "messages": messages, "max_tokens": max_tokens, "stream": False}
for attempt in range(3):
try:
r = requests.post("https://router.huggingface.co/v1/chat/completions",
headers=headers, data=json.dumps(payload), timeout=300)
if r.status_code == 429:
time.sleep(5 * (attempt + 1)); continue
if r.status_code >= 400:
err_body = ""
try: err_body = r.json().get("error", r.text[:500])
except: err_body = r.text[:500]
print(f" ⚠️ HF router {r.status_code} (attempt {attempt+1}): {err_body[:200]}")
if attempt < 2:
                        # Shrink the prompt further and retry
if len(prompt) > 4000:
prompt = prompt[:4000] + "\n[... further truncated ...]"
payload["messages"][-1]["content"] = prompt
time.sleep(3 * (attempt + 1)); continue
return f"[HF_ERROR] {r.status_code}: {err_body[:200]}"
r.raise_for_status()
c = r.json()["choices"][0]["message"]["content"]
return c if c else "[EMPTY]"
except Exception as e:
if attempt < 2: time.sleep(3 * (attempt + 1))
else: return f"[HF_ERROR] router: {e}"
# ── Serverless Inference API ──
elif endpoint_type == "serverless":
full_prompt = f"{system}\n\n{prompt}" if system else prompt
payload = {
"inputs": full_prompt,
"parameters": {"max_new_tokens": min(max_tokens, 2048), "temperature": 0.6, "return_full_text": False}
}
for attempt in range(3):
try:
r = requests.post(f"https://api-inference.huggingface.co/models/{mid}",
headers=headers, data=json.dumps(payload), timeout=300)
if r.status_code == 429:
time.sleep(5 * (attempt + 1)); continue
if r.status_code == 503:
                    # Model is still loading
est = 60
try: est = r.json().get("estimated_time", 60)
except: pass
print(f" ⏳ Model loading... ETA {est}s")
if attempt < 2: time.sleep(min(est, 30)); continue
return f"[HF_ERROR] Model still loading (ETA {est}s)"
if r.status_code >= 400:
err_body = ""
try: err_body = r.json().get("error", r.text[:500])
except: err_body = r.text[:500]
print(f" ⚠️ HF serverless {r.status_code}: {err_body[:200]}")
if attempt < 2: time.sleep(3 * (attempt + 1)); continue
return f"[HF_ERROR] {r.status_code}: {err_body[:200]}"
r.raise_for_status()
result = r.json()
if isinstance(result, list) and result:
text = result[0].get("generated_text", "")
elif isinstance(result, dict):
text = result.get("generated_text", "")
else:
text = str(result)[:5000]
return text if text else "[EMPTY]"
except Exception as e:
if attempt < 2: time.sleep(3 * (attempt + 1))
else: return f"[HF_ERROR] serverless: {e}"
    # ── No endpoint worked ──
else:
return (f"[HF_ERROR] Model '{mid}' is not available on HF Inference API. "
f"Possible reasons: (1) Model too large for serverless, (2) Not deployed as Inference Provider, "
f"(3) Needs dedicated Inference Endpoint. Try using OpenAI model (gpt-4o) for evaluation instead.")
# ════════════════════════════════════════════════════════════════
# §2. Load Dataset
# ════════════════════════════════════════════════════════════════
def load_tasks():
for p in ["FINAL_Bench_v3.json","/mnt/user-data/uploads/FINAL_Bench_v3.json",
os.path.join(os.path.dirname(os.path.abspath(__file__)),"FINAL_Bench_v3.json")]:
if os.path.exists(p):
with open(p,"r",encoding="utf-8") as f: data=json.load(f)
print(f" Loaded from {p}"); break
else:
print("⚠️ FINAL_Bench_v3.json not found — FINAL tasks empty, SWE-bench only mode")
return []
return [FinalTask(task_id=t["task_id"],domain=t["domain"],grade=t["grade"],
ticos_type=t["ticos_type"],difficulty=t["difficulty"],lens=t.get("lens",""),
title=t["title"],prompt=t["prompt"],expected_behavior=t.get("expected_behavior",""),
hidden_trap=t.get("hidden_trap",""),ticos_required=t.get("ticos_required",[]),
metadata=t.get("metadata",{})) for t in data["tasks"]]
try:
    ALL_TASKS = load_tasks()
    print(f"✅ FINAL Bench v3.1: {len(ALL_TASKS)} tasks")
except Exception:
    ALL_TASKS = []
    print("⚠️ FINAL tasks: 0 (SWE-bench only mode)")
# Load SWE-bench (runs synchronously at startup)
SWE_BENCH_TASKS = _load_swe_bench_verified()
# ════════════════════════════════════════════════════════════════
# §3. Model API (OpenAI + Local + HF)
# ════════════════════════════════════════════════════════════════
OPENAI_MODELS = {
"gpt-5.2": "GPT-5.2 (flagship)",
"gpt-5.2-chat-latest": "GPT-5.2 Instant",
"gpt-5-mini": "GPT-5 Mini",
"o4-mini": "o4-mini",
"gpt-4.1": "GPT-4.1",
}
LOCAL_MODELS = {
"Darwin-gpt-ernie-20b (Friendli)": {
"id": "deppfs281rgffnk",
"type": "friendli",
"desc": "21B MoE (Friendli Dedicated Endpoint)",
"api_url": "https://api.friendli.ai/dedicated/v1/chat/completions",
},
"Darwin-gpt-ernie-20b (Local vLLM)": {
"id": "seawolf2357/Darwin-gpt-ernie-20b",
"type": "local_vllm",
"desc": "21B MoE (Local vLLM, GPU required)"
},
"Darwin-gpt-ernie-20b (HF Inference)": {
"id": "seawolf2357/Darwin-gpt-ernie-20b",
"type": "hf_inference",
"desc": "HuggingFace Inference API (HF_TOKEN required)"
},
}
ALL_EVAL_MODELS = {**OPENAI_MODELS, **{k: v["desc"] for k, v in LOCAL_MODELS.items()}}
# ════════════════════════════════════════════════════════════════
# §3.6 Friendli AI Dedicated Endpoint
# ════════════════════════════════════════════════════════════════
def _test_friendli(model_id=None):
"""Friendli API 연결 테스트"""
token = os.getenv("FRIENDLI_TOKEN", "")
if not token:
return False, "FRIENDLI_TOKEN not set"
mid = model_id or "deppfs281rgffnk"
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
payload = {
"model": mid,
"messages": [{"role": "user", "content": "Say OK"}],
"max_tokens": 10,
"temperature": 0,
"stream": False,
}
try:
r = requests.post("https://api.friendli.ai/dedicated/v1/chat/completions",
headers=headers, json=payload, timeout=30)
if r.status_code == 200:
text = r.json()["choices"][0]["message"]["content"]
return True, f"OK ({text[:20]})"
err = ""
try: err = r.json().get("error", {}).get("message", r.text[:300])
except: err = r.text[:300]
return False, f"{r.status_code}: {err[:200]}"
except Exception as e:
return False, f"Connection error: {str(e)[:200]}"
def call_friendli(prompt, system="", model_id=None, max_tokens=8192, temperature=0.6):
"""Friendli AI Dedicated Endpoint 호출 (OpenAI-compatible)"""
token = os.getenv("FRIENDLI_TOKEN", "")
if not token:
return "[FRIENDLI_ERROR] FRIENDLI_TOKEN not set in environment"
mid = model_id or "deppfs281rgffnk"
headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
messages = []
if system:
messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": prompt})
payload = {
"model": mid,
"messages": messages,
"max_tokens": min(max_tokens, 16384),
"temperature": temperature,
"top_p": 0.95,
"stream": False,
}
for attempt in range(3):
try:
r = requests.post("https://api.friendli.ai/dedicated/v1/chat/completions",
headers=headers, json=payload, timeout=600)
if r.status_code == 429:
wait = 5 * (attempt + 1)
print(f" ⏳ Friendli rate limit, waiting {wait}s...")
time.sleep(wait)
continue
if r.status_code == 400:
err = ""
try: err = r.json().get("error", {}).get("message", r.text[:500])
except: err = r.text[:500]
print(f" ⚠️ Friendli 400 (attempt {attempt+1}): {err[:200]}")
                # Token-limit error: halve max_tokens and retry
if "max_tokens" in str(err).lower() or "too many" in str(err).lower():
payload["max_tokens"] = min(payload["max_tokens"] // 2, 4096)
if attempt < 2: continue
                # Input-length error: shrink the prompt
if "input" in str(err).lower() and ("length" in str(err).lower() or "token" in str(err).lower()):
cur_len = len(messages[-1]["content"])
messages[-1]["content"] = messages[-1]["content"][:cur_len // 2] + "\n[...truncated...]"
payload["messages"] = messages
if attempt < 2: continue
return f"[FRIENDLI_ERROR] 400: {err[:200]}"
if r.status_code >= 500:
print(f" ⚠️ Friendli {r.status_code} (attempt {attempt+1})")
if attempt < 2: time.sleep(3 * (attempt + 1)); continue
return f"[FRIENDLI_ERROR] Server error {r.status_code}"
r.raise_for_status()
c = r.json()["choices"][0]["message"]["content"]
return c if c else "[EMPTY]"
except requests.exceptions.Timeout:
print(f" ⚠️ Friendli timeout (attempt {attempt+1})")
if attempt < 2: time.sleep(5); continue
return "[FRIENDLI_ERROR] Request timeout (600s)"
except requests.exceptions.ConnectionError:
print(f" ⚠️ Friendli connection error (attempt {attempt+1})")
if attempt < 2: time.sleep(3 * (attempt + 1)); continue
return "[FRIENDLI_ERROR] Connection failed"
        except Exception as e:
            print(f" ⚠️ Friendli exception (attempt {attempt+1}): {str(e)[:200]}")
            if attempt < 2: time.sleep(3 * (attempt + 1))
            else: return f"[FRIENDLI_ERROR] {e}"
    return "[FRIENDLI_ERROR] Exhausted retries"
def _strip_think(text):
if not text: return text
for tag in ['think','thinking','reasoning','reflection']:
text = re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL)
return text.strip()
def call_model(prompt, system="", api_key="", model="gpt-5.2",
max_tokens=8192, temperature=0.6, reasoning_effort=None):
"""통합 모델 호출 — Friendli / OpenAI / Local vLLM / HF Inference 자동 분기"""
# 로컬/전용 모델 분기
if model in LOCAL_MODELS:
minfo = LOCAL_MODELS[model]
if minfo["type"] == "friendli":
return call_friendli(prompt, system=system,
model_id=minfo["id"],
max_tokens=max_tokens, temperature=temperature)
elif minfo["type"] == "local_vllm":
return call_local_model(prompt, system=system,
max_tokens=max_tokens, temperature=temperature)
elif minfo["type"] == "hf_inference":
return call_hf_inference(prompt, system=system,
model_id=minfo["id"],
max_tokens=min(max_tokens, 4096))
# OpenAI API
return call_openai(prompt, system=system, api_key=api_key, model=model,
max_tokens=max_tokens, temperature=temperature,
reasoning_effort=reasoning_effort)
def _test_api_connection(api_key, model="gpt-4o-mini"):
"""API 연결 + 모델 유효성 빠른 테스트"""
if not api_key:
return False, "❌ API key is empty"
headers = {"Content-Type":"application/json","Authorization":f"Bearer {api_key}"}
payload = {"model":model,"max_completion_tokens":50,"temperature":0,
"messages":[{"role":"user","content":"Say OK"}]}
try:
r = requests.post("https://api.openai.com/v1/chat/completions",
headers=headers,data=json.dumps(payload),timeout=30)
if r.status_code == 200:
return True, f"✅ {model} OK"
err = ""
try: err = r.json().get("error",{}).get("message", r.text[:200])
except: err = str(r.status_code)
return False, f"❌ {r.status_code}: {err}"
except Exception as e:
return False, f"❌ Connection failed: {e}"
# ── Model-name fallback map (nonexistent model → actual model) ──
MODEL_FALLBACK = {
"gpt-5.2": ["gpt-5.2", "gpt-4.1", "gpt-4o", "gpt-4o-mini"],
"gpt-5.2-chat-latest": ["gpt-5.2-chat-latest", "gpt-4.1", "gpt-4o"],
"gpt-5-mini": ["gpt-5-mini", "gpt-4o-mini", "gpt-4o"],
"o4-mini": ["o4-mini", "o3-mini", "gpt-4o-mini"],
"gpt-4.1": ["gpt-4.1", "gpt-4o", "gpt-4o-mini"],
}
_VERIFIED_MODELS = {}  # cache: {requested_model: actual_working_model}
def _resolve_model(model, api_key):
"""모델명이 유효한지 확인하고, 안 되면 폴백 모델 탐색"""
if model in _VERIFIED_MODELS:
return _VERIFIED_MODELS[model]
ok, msg = _test_api_connection(api_key, model)
if ok:
_VERIFIED_MODELS[model] = model
print(f" ✅ Model verified: {model}")
return model
    # Search fallbacks
fallbacks = MODEL_FALLBACK.get(model, [])
for fb in fallbacks:
if fb == model: continue
ok2, msg2 = _test_api_connection(api_key, fb)
if ok2:
_VERIFIED_MODELS[model] = fb
print(f" ⚠️ Model {model} unavailable → fallback to {fb}")
return fb
    # Last resort
ok3, _ = _test_api_connection(api_key, "gpt-4o-mini")
if ok3:
_VERIFIED_MODELS[model] = "gpt-4o-mini"
print(f" ⚠️ All fallbacks failed → using gpt-4o-mini")
return "gpt-4o-mini"
_VERIFIED_MODELS[model] = model
print(f" ❌ No working model found for {model}: {msg}")
return model
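# Resolution is cached per requested name, so the verification round-trip runs once
# per process. Sketch (assumes a valid OpenAI key in `key`):
#   m = _resolve_model("gpt-5.2", key)  # returns "gpt-5.2" or a fallback such as "gpt-4.1"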
def call_openai(prompt, system="", api_key="", model="gpt-5.2",
max_tokens=8192, temperature=0.6, reasoning_effort=None):
"""OpenAI API — 자동 모델 검증/폴백 + 파라미터 호환성 자동 수정"""
# ★ 모델명 자동 검증/폴백
actual_model = _resolve_model(model, api_key)
headers = {"Content-Type":"application/json","Authorization":f"Bearer {api_key}"}
messages = []
if system: messages.append({"role":"system","content":system})
messages.append({"role":"user","content":prompt})
payload = {"model":actual_model,"max_completion_tokens":max_tokens,"temperature":temperature,"messages":messages}
    # reasoning_effort: only some models support it
if reasoning_effort:
payload["reasoning_effort"] = reasoning_effort
for attempt in range(3):
try:
r = requests.post("https://api.openai.com/v1/chat/completions",
headers=headers,data=json.dumps(payload),timeout=300)
if r.status_code == 429:
time.sleep(5*(attempt+1)); continue
if r.status_code == 400:
err_msg = ""
try: err_msg = r.json().get("error",{}).get("message","")
except: err_msg = str(r.status_code)
print(f" ⚠️ 400 Error (attempt {attempt+1}): {err_msg[:200]}")
                # ★ Auto-fix parameter compatibility
if "max_completion_tokens" in err_msg:
payload.pop("max_completion_tokens", None)
payload["max_tokens"] = max_tokens
if "reasoning_effort" in err_msg or "not supported" in err_msg.lower():
payload.pop("reasoning_effort", None)
if "temperature" in err_msg:
payload["temperature"] = 1 # reasoning 모델은 temperature 지원 안 할 수 있음
if attempt < 2:
time.sleep(2); continue
return f"[API_ERROR] 400: {err_msg[:200]}"
r.raise_for_status()
c = r.json()["choices"][0]["message"]["content"]
return c if c else "[EMPTY]"
except requests.exceptions.HTTPError:
try: err=r.json().get("error",{}).get("message","")
except: err=str(r.status_code)
print(f" ⚠️ HTTP Error (attempt {attempt+1}): {err[:200]}")
if attempt<2: time.sleep(3*(attempt+1)); continue
return f"[API_ERROR] {err}"
        except Exception as e:
            print(f" ⚠️ Exception (attempt {attempt+1}): {str(e)[:200]}")
            if attempt<2: time.sleep(3*(attempt+1))
            else: return f"[API_ERROR] {e}"
    return "[API_ERROR] Exhausted retries"
# ════════════════════════════════════════════════════════════════
# §3.5 Proto-AGI Five-Elements (Wuxing) Multi-Agent Pipeline
# ════════════════════════════════════════════════════════════════
MAGIC_SQUARE_5x5 = np.array([
[17,24,1,8,15],[23,5,7,14,16],[4,6,13,20,22],[10,12,19,21,3],[11,18,25,2,9]
], dtype=np.float64)
COMM_MATRIX = MAGIC_SQUARE_5x5 / MAGIC_SQUARE_5x5.sum(axis=1, keepdims=True)
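# Every row and column of the 5x5 magic square sums to the magic constant 65, so after
# row-normalization each row of COMM_MATRIX sums to 1.0 and can be read as the attention
# weights one agent gives the others. Quick sanity check (illustrative):
#   assert np.allclose(COMM_MATRIX.sum(axis=1), 1.0)
#   assert np.all(MAGIC_SQUARE_5x5.sum(axis=0) == 65)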
AGENT_BUDGETS = {
"木_발상": 512,
"火_실행": 65536,
"火_이어쓰기": 32768,
"土_판단": 512,
"金_비평": 2048,
"水_정제": 4096,
}
AGENT_REASONING = {
"木_발상": "low",
"火_실행": None,
"火_이어쓰기": None,
"土_판단": "low",
"金_비평": "medium",
"水_정제": "medium",
}
PROTO_AGENTS = {
"木_발상": {
"role": "Ideation (木/仁). 3 bullets MAX, 100 words total. "
"What makes this AGI-hard? Key traps? Core angles?",
"element":"木","index":2,"sheng_from":"水","ke_target":"土",
},
"火_실행": {
"role": "★ MAIN SOLVER (火/禮). You write THE COMPLETE FINAL ANSWER. "
"Complete ALL numbered requirements — check each off. "
"State confidence (0-100%) per major claim. "
"★ MANDATORY SELF-CHECK at the END of your answer: "
"Write 2-3 [BACKTRACK] corrections reviewing your own claims: "
"'[BACKTRACK-1] I adjust X because Y. Corrected: Z.' "
"'[BACKTRACK-2] I refine A because B. Corrected: C.' "
"Find genuine improvements — qualify overconfident claims, fix edge cases, "
"or add missing nuance. This is REQUIRED for scoring. "
"NEVER stop mid-sentence. Be concise but COMPLETE all requirements.",
"element":"火","index":3,"sheng_from":"木","ke_target":"金",
},
"土_판단": {
"role": "Auditor (土/信). ONE paragraph only. "
"List: (1) missing requirements (2) overconfident claims (3) domain drift. Max 80 words.",
"element":"土","index":4,"sheng_from":"火","ke_target":"水",
},
"金_비평": {
"role": "Verifier (金/義). Use STRUCTURED fix tags. For each error found: "
"[FIX-1] error description → correction. [FIX-2] ... Max 5 fixes. "
"Also: [TRAP-CHECK] verify hidden traps. [HALLUCINATION] tag unverifiable claims.",
"element":"金","index":0,"sheng_from":"土","ke_target":"木",
},
"水_정제": {
"role": "Correction Agent (水/智). You apply 金's fixes to 火's answer. "
"For EACH [FIX-n] from 金, write [APPLIED-n] with the specific correction. "
"Also cool any overconfident claims (水克火). "
"End with confidence summary table + 2 most uncertain points. "
"Do NOT rewrite 火's entire answer — only write corrections and additions.",
"element":"水","index":1,"sheng_from":"金","ke_target":"火",
},
}
AGENT_ORDER = ["木_발상","火_실행","土_판단","金_비평","水_정제"]
AGENT_EMOJIS = {"木":"🌳","火":"🔥","土":"🏔️","金":"⚔️","水":"💧"}
FINAL_AGENT_INSTRUCTIONS = {
"木": "\n[BUDGET: 100 words MAX] 3 bullets: (1) core trap (2) key contradiction (3) best angle.",
"火": "\n★★★ YOU ARE THE FINAL ANSWER. THE JUDGE SCORES YOUR TEXT. ★★★\n"
"Complete ALL numbered requirements from the task. Check each off.\n"
"Confidence per major claim.\n"
"CRITICAL: Finish EVERY section. NEVER stop mid-sentence.\n"
"If running long, be CONCISE but COMPLETE all requirements.\n"
"PRIORITY: completeness > depth.\n"
"\n★ MANDATORY SELF-CHECK (REQUIRED — at the very end of your answer):\n"
"Review your answer and write 2-3 [BACKTRACK] corrections:\n"
"[BACKTRACK-1] I adjust [claim] because [reason]. Corrected: [new version].\n"
"[BACKTRACK-2] I refine [claim] because [reason]. Corrected: [new version].\n"
"[BACKTRACK-3] (optional) ...\n"
"These must be GENUINE improvements, not cosmetic. Examples:\n"
"- Qualify an overconfident claim (90%→70%)\n"
"- Fix an edge case you missed\n"
"- Add a missing perspective or caveat\n"
"The Judge gives ZERO for error_recovery without [BACKTRACK] tags.",
"土": "\n[BUDGET: 80 words MAX] Checklist only: □missing items □overconfidence □drift.",
"金": "\n[BUDGET: MAX 5 FIXES] Format strictly as:\n"
"[FIX-1] Problem → Correction\n[FIX-2] Problem → Correction\n...\n"
"[TRAP-CHECK] Hidden trap found? Y/N + detail\n[HALLUCINATION] Any? Y/N + which claims",
"水": "\nYou MUST produce [APPLIED-n] for EVERY [FIX-n] from 金.\n"
"Format:\n"
"[APPLIED-1] Fixed: (what was wrong → what it should be, 1-2 sentences)\n"
"[APPLIED-2] Fixed: ...\n"
"[NO-FIXES-NEEDED] if 金 found no errors.\n"
"\n## Confidence Adjustments\n"
"(List 火's overclaimed items with corrected confidence, max 3)\n"
"\n## Top-2 Uncertainties\n"
"(Most uncertain claims and why)\n"
"\nMAX 600 words. Focus on corrections, not repetition.",
}
def _count_requirements(prompt_text):
count = 0
for line in prompt_text.split('\n'):
stripped = line.strip()
if stripped and len(stripped) > 3:
if (stripped[0].isdigit() and len(stripped) > 2 and stripped[1] in '.):') or \
(stripped.startswith('(') and len(stripped) > 3 and stripped[1].isdigit()):
count += 1
return count
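# Example (illustrative): a prompt containing the lines "1. Analyze", "2) Fix" and
# "(3) Test" yields 3; plain prose lines and bare numbers are ignored.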
def _detect_truncation(text):
if not text or len(text) < 100:
return True
if '[BACKTRACK' not in text:
return True
return False
def _build_agent_prompt(agent_name, task_prompt, prev_outputs, task=None):
info = PROTO_AGENTS[agent_name]
elem = info['element']
idx = info['index']
sheng_agent = [a for a in AGENT_ORDER if PROTO_AGENTS[a]['element']==info['sheng_from']]
sheng_ref = ""
if sheng_agent and sheng_agent[0] in prev_outputs:
sheng_ref = f"\n[상생 {sheng_agent[0]}] {prev_outputs[sheng_agent[0]][:200]}\n"
ke_agent = [a for a in AGENT_ORDER if PROTO_AGENTS[a]['element']==info['ke_target']]
ke_ref = ""
if ke_agent and ke_agent[0] in prev_outputs:
ke_ref = f"\n[상극 {ke_agent[0]}] {prev_outputs[ke_agent[0]][:150]}\n"
weights = COMM_MATRIX[idx]
comm_lines = []
for aname in AGENT_ORDER:
if aname in prev_outputs and aname != agent_name:
w = weights[PROTO_AGENTS[aname]['index']]
if w >= 0.15:
trunc = min(len(prev_outputs[aname]), 150)
comm_lines.append(f"[{aname}] {prev_outputs[aname][:trunc]}")
adaptive_instruction = ""
if elem == "火" and task:
req_count = _count_requirements(task.prompt)
prompt_lower = task.prompt.lower()
is_multi_perspective = any(kw in prompt_lower for kw in [
'traditions debate', 'positions debate', 'expert panel',
'each position at maximum depth', 'each response at maximum depth',
'develop each', 'each tradition'
])
if req_count >= 7 or is_multi_perspective:
adaptive_instruction = (
f"\n\n★ COMPLEX TASK detected ({req_count} requirements"
f"{', multi-perspective' if is_multi_perspective else ''}).\n"
"Use CONCISE mode: max 100 words per perspective/requirement.\n"
"Do NOT elaborate beyond what's strictly needed.\n"
"Cover ALL requirements and perspectives BRIEFLY rather than some deeply.\n"
"Structure: address each requirement with 2-3 focused sentences, then move on."
)
sys_prompt = (
f"You are {agent_name}{info['role']}\n"
f"{sheng_ref}{ke_ref}"
f"{FINAL_AGENT_INSTRUCTIONS.get(elem, '')}"
f"{adaptive_instruction}"
)
if task:
sys_prompt += f"\nTICOS: {task.ticos_type} | {task.domain} | {task.difficulty}"
usr_prompt = task_prompt
if comm_lines:
usr_prompt += "\n\n[Previous Agents]\n" + "\n".join(comm_lines)
return sys_prompt, usr_prompt
def _strip_framework_noise(text):
if not text: return text
for pat in [r'(?:마방진|상생|상극|오행)[\s\w]{0,30}(?:구조|원리|체계)',
r'Proto-AGI[\s\w]{0,60}[\.。]',
r'(?:저는|나는)\s*(?:Proto-AGI|오행|木_|火_|土_|金_|水_)[\s\w]{0,60}[\.。]']:
text = re.sub(pat, '', text, flags=re.IGNORECASE)
return re.sub(r'\n{3,}', '\n\n', text).strip()
def run_proto_agi_pipeline(task_prompt, api_key, eval_model, task=None):
"""★ Proto-AGI v2.8 — 통합 모델 호출 사용"""
prev_outputs = {}
budgets = dict(AGENT_BUDGETS)
    # Adjust budgets for local/dedicated models
is_local = eval_model in LOCAL_MODELS
if is_local:
minfo = LOCAL_MODELS.get(eval_model, {})
if minfo.get("type") == "friendli":
            # Friendli: max_tokens up to 16384, so budgets can be generous
budgets["火_실행"] = min(budgets["火_실행"], 12288)
budgets["火_이어쓰기"] = min(budgets["火_이어쓰기"], 8192)
budgets["金_비평"] = min(budgets["金_비평"], 2048)
budgets["水_정제"] = min(budgets["水_정제"], 4096)
else:
            # Local vLLM: constrained by context length
active_cfg = LOCAL_MODEL_CONFIG.get("active_config", {})
model_maxlen = active_cfg.get("maxlen", 4096)
max_output = int(model_maxlen * 0.4)
budgets["木_발상"] = min(budgets["木_발상"], 256)
budgets["火_실행"] = min(budgets["火_실행"], max_output)
budgets["火_이어쓰기"] = min(budgets["火_이어쓰기"], max_output)
budgets["土_판단"] = min(budgets["土_판단"], 256)
budgets["金_비평"] = min(budgets["金_비평"], min(512, max_output))
budgets["水_정제"] = min(budgets["水_정제"], min(1024, max_output))
for agent_name in AGENT_ORDER:
sys_p, usr_p = _build_agent_prompt(agent_name, task_prompt, prev_outputs, task)
re_effort = AGENT_REASONING.get(agent_name) if not is_local else None
resp = call_model(usr_p, system=sys_p, api_key=api_key, model=eval_model,
max_tokens=budgets[agent_name],
reasoning_effort=re_effort)
if agent_name == "火_실행" and _detect_truncation(resp):
last_chunk = resp[-500:] if len(resp) > 500 else resp
cont_prompt = (
f"You were writing an answer but it was CUT OFF. "
f"Here is the end of what you wrote:\n\n"
f"---\n{last_chunk}\n---\n\n"
f"CONTINUE from exactly where you stopped. "
f"Complete ALL remaining requirements. "
f"Include your [BACKTRACK] self-check section at the end.\n"
f"Do NOT repeat what was already written."
)
cont_sys = (
f"You are 火_실행 — MAIN SOLVER continuing an interrupted answer.\n"
f"TICOS: {task.ticos_type if task else ''} | {task.domain if task else ''}\n"
f"Original task:\n{task_prompt[:1500]}"
)
cont_resp = call_model(cont_prompt, system=cont_sys, api_key=api_key,
model=eval_model,
max_tokens=budgets.get("火_이어쓰기", 4096),
reasoning_effort=AGENT_REASONING.get("火_이어쓰기") if not is_local else None)
if cont_resp and not cont_resp.startswith("[API_ERROR") and not cont_resp.startswith("[LOCAL_ERROR"):
resp = resp + "\n\n" + cont_resp
if agent_name != "水_정제":
prev_outputs[agent_name] = _strip_framework_noise(resp)
else:
prev_outputs[agent_name] = resp
combined = []
for aname in AGENT_ORDER:
if aname in prev_outputs:
emoji = AGENT_EMOJIS.get(PROTO_AGENTS[aname]['element'], "")
combined.append(f"{'='*40}\n{emoji} [{aname}]\n{'='*40}\n{prev_outputs[aname]}")
return "\n\n".join(combined), prev_outputs
def compress_for_judge(prev_outputs):
parts = []
fire = prev_outputs.get("火_실행", "")
parts.append(f"[ANSWER]\n{fire[:14000]}")
water = prev_outputs.get("水_정제", "")
if water and '[APPLIED' in water:
applied_lines = [l for l in water.split('\n') if '[APPLIED' in l or 'Fixed:' in l or 'Corrected:' in l]
if applied_lines:
parts.append(f"\n[ADDITIONAL CORRECTIONS]\n" + "\n".join(applied_lines[:8]))
elif water:
parts.append(f"\n[CORRECTIONS]\n{water[:1500]}")
metal = prev_outputs.get("金_비평", "")
fix_lines = [l for l in metal.split('\n') if any(tag in l for tag in ['[FIX-','[TRAP-','[HALLUCINATION'])]
if fix_lines:
parts.append(f"\n[VERIFICATION]\n" + "\n".join(fix_lines[:8]))
return "\n".join(parts)
# ════════════════════════════════════════════════════════════════
# §4. Structured Judge (GPT-5.2, always OpenAI)
# ════════════════════════════════════════════════════════════════
JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
Score each rubric using ONLY: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
RUBRIC:
process_quality (25%): Systematic step-by-step reasoning. Complete answers score higher.
metacognitive_accuracy (25%): Confidence calibration. Overconfidence=0.25 max. Honest uncertainty=0.75+
error_recovery (20%): EXPLICIT self-correction. Score 0.5+ if ANY of these exist: [BACKTRACK-n] numbered corrections in the answer, [APPLIED-n] correction tags, or explicit mid-chain corrections. Score 0.75 if 2+ genuine corrections are well-executed.
integration_depth (15%): Multi-perspective synthesis + emergent insights
final_correctness (15%): Answer accuracy and completeness. INCOMPLETE/TRUNCATED answers get 0.25 max.
TICOS BONUSES:
A_TrapEscape: ID'd ALL hidden traps? Challenged false premises?
B_ContradictionResolution: Resolved both sides? Found shared error?
C_ProgressiveDiscovery: Revised earlier stages with new info?
D_MultiConstraint: Mapped ALL conflicts? Creative tradeoffs?
E_SelfCorrecting: EXPLICIT backtrack+correct mid-chain?
F_ExpertPanel: Max-depth per perspective? Surprising convergence?
G_PivotDetection: Detected which premise reverses conclusion?
H_DecisionUnderUncertainty: Scenario matrix? Minimax regret?
SWE-BENCH SPECIFIC CRITERIA (for Software Engineering tasks):
- Did the model correctly identify the bug root cause?
- Is the proposed patch minimal and correct?
- Does the patch avoid introducing regressions?
- Are edge cases properly considered?
MULTI-AGENT FORMAT:
[ANSWER] = THE answer to judge for all rubrics.
[BACKTRACK-n] = explicit self-corrections within the answer = evidence for error_recovery 0.5+.
[ADDITIONAL CORRECTIONS] / [APPLIED-n] = post-hoc corrections = also count for error_recovery.
Multiple genuine [BACKTRACK] corrections = 0.75. Single [BACKTRACK] = 0.5.
[VERIFICATION] = What was checked externally.
Judge the ANSWER's completeness. Corrections ENHANCE the score.
STRICT: 1.0=AGI-worthy. 0.75=expert. 0.5=competent. 0.25=gaps. 0.0=failure.
Output JSON: {"scores":{...},"comment":"<50 words>"}"""
def _build_judge_schema():
sp = {k:{"type":"number","enum":[0.0,0.25,0.5,0.75,1.0]} for k in RUBRIC}
return {"type":"object","properties":{
"scores":{"type":"object","properties":sp,"required":list(RUBRIC.keys()),"additionalProperties":False},
"comment":{"type":"string"}},"required":["scores","comment"],"additionalProperties":False}
JUDGE_SCHEMA = _build_judge_schema()
def call_judge_structured(prompt, system="", api_key="", model="gpt-5.2",
temperature=0.1, max_tokens=2048):
"""★ Judge는 항상 GPT (OpenAI) — 구조적 출력 + 자동 폴백"""
# ★ 모델 자동 검증
actual_model = _resolve_model(model, api_key)
messages = []
if system: messages.append({"role":"system","content":system})
messages.append({"role":"user","content":prompt})
    # ★ Try structured output first
payload = {"model":actual_model,"max_completion_tokens":max_tokens,"temperature":temperature,
"messages":messages,
"response_format":{"type":"json_schema","json_schema":{"name":"FINALJudge","strict":True,"schema":JUDGE_SCHEMA}}}
    # reasoning_effort: only some models
if actual_model in ("gpt-5.2","gpt-5.2-chat-latest","o4-mini","o3-mini"):
payload["reasoning_effort"] = "none"
headers = {"Content-Type":"application/json","Authorization":f"Bearer {api_key}"}
for attempt in range(3):
try:
r = requests.post("https://api.openai.com/v1/chat/completions",
headers=headers,data=json.dumps(payload),timeout=180)
if r.status_code==429: time.sleep(5*(attempt+1)); continue
if r.status_code==400:
err_msg = ""
try: err_msg = r.json().get("error",{}).get("message","")
except: err_msg = str(r.status_code)
print(f" ⚠️ Judge 400 (attempt {attempt+1}): {err_msg[:200]}")
                # ★ Auto-fix parameter compatibility
if "json_schema" in err_msg or "response_format" in err_msg:
                    # Structured output unsupported: fall back to json_object
payload["response_format"] = {"type":"json_object"}
payload.pop("reasoning_effort", None)
if "max_completion_tokens" in err_msg:
payload.pop("max_completion_tokens", None)
payload["max_tokens"] = max_tokens
if "reasoning_effort" in err_msg:
payload.pop("reasoning_effort", None)
if "temperature" in err_msg:
payload.pop("temperature", None)
if attempt < 2:
time.sleep(2); continue
                # ★ Structured output failed entirely: fall back to plain-text JSON
print(f" ⚠️ Judge structured output failed, falling back to plain JSON")
return None
r.raise_for_status()
content = _strip_think(r.json()["choices"][0]["message"]["content"] or "")
if not content:
if attempt<2: time.sleep(2); continue
return None
data = json.loads(content)
if "scores" in data and isinstance(data["scores"],dict):
for k in RUBRIC:
if k not in data["scores"]: data["scores"][k]=0.5
return {"scores":data["scores"],"comment":data.get("comment","ok")}
except json.JSONDecodeError:
print(f" ⚠️ Judge JSON parse failed (attempt {attempt+1})")
if attempt<2: time.sleep(2); continue
return None
except Exception as e:
print(f" ⚠️ Judge exception (attempt {attempt+1}): {str(e)[:100]}")
if attempt<2: time.sleep(3*(attempt+1)); continue
return None
return None
def build_judge_prompt(task, response):
    # Extra context for SWE-bench tasks
swe_context = ""
if task.metadata.get("source") == "SWE-bench_Verified":
correct_patch = task.metadata.get("patch", "")[:2000]
swe_context = f"\n\nREFERENCE CORRECT PATCH (for scoring final_correctness):\n{correct_patch}\n"
return f"""FINAL Bench Task Evaluation
Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.difficulty}
TICOS: {task.ticos_type} | Title: {task.title}
PROMPT:
{task.prompt[:2000]}
EXPECTED:
{task.expected_behavior[:600]}
HIDDEN TRAPS: {task.hidden_trap or 'None'}
{swe_context}
RESPONSE TO JUDGE:
{response[:17000]}
Score: process_quality, metacognitive_accuracy, error_recovery, integration_depth, final_correctness
Apply {task.ticos_type} bonus. Check for [BACKTRACK-n] and [APPLIED-n] tags (self-corrections). Output ONLY JSON."""
def parse_judge_fallback(text, keys):
if not text or text.startswith("[API_ERROR"): return {"scores":{k:0.0 for k in keys},"comment":"API_ERROR","failed":True}
cleaned = _strip_think(text); VALID={0.0,0.25,0.5,0.75,1.0}
try:
m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}',cleaned,re.DOTALL)
if m:
d=json.loads(m.group())
if "scores" in d: return {"scores":{k:min(VALID,key=lambda x:abs(x-float(d["scores"].get(k,0.5)))) for k in keys},"comment":d.get("comment","parsed")}
except: pass
try:
sc={}
for k in keys:
m2=re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)',cleaned,re.IGNORECASE)
if m2:
v=float(m2.group(1))
if 0<=v<=1: sc[k]=min(VALID,key=lambda x:abs(x-v))
if len(sc)>=3:
for k in keys:
if k not in sc: sc[k]=0.5
return {"scores":sc,"comment":"regex"}
except: pass
return {"scores":{k:0.0 for k in keys},"comment":"parse_failed","failed":True}
# ════════════════════════════════════════════════════════════════
# §5. Scoring Engine
# ════════════════════════════════════════════════════════════════
def compute_task_score(scores):
return round(sum(scores.get(k,0.5)*v["weight"] for k,v in RUBRIC.items())*100,2)
def compute_axis_scores(results, tasks):
tm={t.task_id:t for t in tasks}; ax={}
for an,ai in AXIS_MAP.items():
vals=[]
for tid,d in results.items():
if d["score"]<0: continue
t=tm.get(tid)
if not t: continue
try: jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else d["judge"]; sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
except: sc={}
rv=[float(sc.get(r,0.5)) for r in ai["rubrics"] if r in sc]
w=1.5 if(ai["ticos"] and t.ticos_type in ai["ticos"]) else 1.0
if rv: vals.append(np.mean(rv)*w)
ax[an]=round(min(np.mean(vals)*100,100),2) if vals else 0.0
return ax
def compute_final_score(results, tasks):
tm={t.task_id:t for t in tasks}
ds={}
for tid,d in results.items():
if d["score"]<0: continue
t=tm.get(tid)
if t: ds.setdefault(t.domain,[]).append(d["score"])
da={d:np.mean(v) for d,v in ds.items() if v}
gd={}
for t in tasks: gd.setdefault(t.grade,set()).add(t.domain)
ws,wt=0,0
for g,doms in gd.items():
w=GRADE_WEIGHT.get(g,1.0)
for d in doms:
if d in da: ws+=da[d]*w; wt+=w
base=ws/wt if wt>0 else 0
axis=compute_axis_scores(results,tasks)
av=[max(v,0.01) for v in axis.values()]
har=(len(av)/sum(1.0/v for v in av)) if av else 50
har_p=har/100.0
return round(base*har_p,2),round(base,2),round(har_p,3),axis,da
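# Illustrative numbers: axis scores {80, 80, 80, 40, 80} give a harmonic mean of
# 5 / (4/80 + 1/40) ≈ 66.7, so har_p ≈ 0.667 and a base of 70 yields a final score
# of ~46.7; one weak axis drags the total down more than an arithmetic mean would.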
def determine_agi_stage(score, axis):
all60=all(v>=60 for v in axis.values()) if axis else False
for s in reversed(AGI_STAGES):
if score>=s["min"]:
if s["stage"]>=4 and not all60: return AGI_STAGES[2]
return s
return AGI_STAGES[0]
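# Example: a score of 85 normally maps to stage 4 (FINAL-Pass), but if any axis is
# below 60 the result is demoted to AGI_STAGES[2] (FINAL-Pre).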
# ════════════════════════════════════════════════════════════════
# §6. Checkpoint DB
# ════════════════════════════════════════════════════════════════
DB_PATH = "final_bench_eval.db"
def _init_db():
c=sqlite3.connect(DB_PATH)
c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))")
c.commit(); c.close()
def _make_run_id(m,mode="NON"): return hashlib.md5(f"FINALv31_{mode}_{m}".encode()).hexdigest()[:12]
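# Run ids are deterministic per (mode, model), so re-running the same model+mode
# resumes from checkpointed rows, e.g. (hash value illustrative):
#   _make_run_id("gpt-4o", mode="PROTO")  ->  "3f2a9c1b7d4e"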
def _save_result(rid,tid,resp,jresp,sc):
c=sqlite3.connect(DB_PATH); c.execute("INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",(rid,tid,resp,jresp,sc,time.time())); c.commit(); c.close()
def _load_all(rid):
c=sqlite3.connect(DB_PATH); cur=c.execute("SELECT task_id,model_response,judge_response,weighted_score FROM eval_results WHERE run_id=?",(rid,)); rows=cur.fetchall(); c.close()
return {r[0]:{"response":r[1],"judge":r[2],"score":r[3]} for r in rows}
def _clear_run(rid):
c=sqlite3.connect(DB_PATH); c.execute("DELETE FROM eval_results WHERE run_id=?",(rid,)); c.commit(); c.close()
_init_db()
# ════════════════════════════════════════════════════════════════
# §7. CSV + HuggingFace
# ════════════════════════════════════════════════════════════════
def generate_csv(results, tasks, model_name, mode="NON-AGI"):
out=io.StringIO(); w=csv.writer(out)
w.writerow(["task_id","domain","grade","ticos_type","difficulty","title","model","mode","weighted_score",
"process_quality","metacognitive_accuracy","error_recovery","integration_depth","final_correctness",
"judge_comment","response_preview","timestamp","dataset_source"])
tm={t.task_id:t for t in tasks}
for tid,d in sorted(results.items()):
t=tm.get(tid)
if not t: continue
jd={}
try: jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else(d["judge"] or {})
except: pass
sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
cm=(jd.get("comment","") if isinstance(jd,dict) else "")[:200]
s=d["score"]
if s<0: s=-1; cm=f"JUDGE_FAILED:{cm}"
source = t.metadata.get("source", "FINAL_Bench")
w.writerow([tid,t.domain,t.grade,t.ticos_type,t.difficulty,t.title,model_name,mode,s,
sc.get("process_quality",""),sc.get("metacognitive_accuracy",""),
sc.get("error_recovery",""),sc.get("integration_depth",""),sc.get("final_correctness",""),
cm,(d.get("response","") or "")[:300].replace("\n"," "),datetime.now().isoformat(),source])
return out.getvalue()
def upload_to_hf(csv_content, model_name, mode=""):
hf_token=os.getenv("HF_TOKEN","")
if not hf_token: return "⚠️ HF_TOKEN not set"
try:
from huggingface_hub import HfApi
api=HfApi(token=hf_token); safe=re.sub(r'[^a-zA-Z0-9_-]','_',model_name.split('/')[-1])
repo="seawolf2357/FINAL-Bench-Results"; ts=datetime.now().strftime("%Y%m%d_%H%M%S")
fn=f"eval_{safe}_{mode}_{ts}.csv"
try: api.create_repo(repo_id=repo,repo_type="dataset",private=True,exist_ok=True)
except: pass
api.upload_file(path_or_fileobj=csv_content.encode("utf-8"),path_in_repo=fn,repo_id=repo,repo_type="dataset",commit_message=f"FINAL Bench: {safe} {mode} ({ts})")
return f"✅ HF: {fn}"
except Exception as e: return f"❌ HF: {e}"
# ════════════════════════════════════════════════════════════════
# §8. HTML Builders
# ════════════════════════════════════════════════════════════════
CSS = """<style>
.eval-table{width:100%;border-collapse:collapse;font-size:0.82em}
.eval-table th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc;font-size:0.9em}
.eval-table td{padding:5px 8px;border-bottom:1px solid #eee}
.score-bar{background:#e0e0e0;border-radius:8px;height:16px;overflow:hidden;min-width:70px}
.score-fill{height:100%;border-radius:8px;transition:width .4s}
.summary-card{background:linear-gradient(135deg,#0a0a1a,#1a1a3e);border-radius:16px;padding:24px;color:#fff;margin:8px 0}
.axis-row{display:flex;align-items:center;gap:10px;margin:5px 0}
.axis-bar{flex:1;background:#333;border-radius:6px;height:14px;overflow:hidden}
.axis-fill{height:100%;border-radius:6px}
.stage-badge{display:inline-block;padding:6px 16px;border-radius:20px;font-weight:700;font-size:1.1em;margin:8px 0}
.progress-bar{background:#e0e0e0;border-radius:8px;height:22px;margin:12px 0;overflow:hidden}
.progress-fill{height:100%;border-radius:8px;transition:width .4s;background:linear-gradient(90deg,#1565c0,#00c853)}
.cmp-table{width:100%;border-collapse:collapse;font-size:0.85em;margin:8px 0}
.cmp-table th{background:#1a1a2e;color:#fff;padding:8px;text-align:center}
.cmp-table td{padding:6px 8px;border-bottom:1px solid #eee;text-align:center}
.cmp-up{color:#4caf50;font-weight:700} .cmp-down{color:#f44336;font-weight:700} .cmp-same{color:#888}
.model-card{background:linear-gradient(135deg,#1a237e,#0d47a1);border-radius:12px;padding:16px;color:#fff;margin:8px 0}
</style>"""
def _sc(s):
if s>=80: return "#4caf50"
if s>=60: return "#ff9800"
if s>=40: return "#ff5722"
return "#f44336"
def _build_progress_table(results, tasks):
rows=""
for t in tasks:
info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
src_badge = "🐛" if t.metadata.get("source") == "SWE-bench_Verified" else "📝"
if t.task_id in results:
s=results[t.task_id]["score"]
if s<0:
rows+=f'<tr style="background:#fff3e0"><td>{src_badge} {t.task_id[:25]}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">❌</td><td>—</td></tr>'
else:
c=_sc(s)
rows+=f'<tr><td>{src_badge} {t.task_id[:25]}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
else:
rows+=f'<tr style="opacity:0.35"><td>{src_badge} {t.task_id[:25]}</td><td>{info["icon"]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td>⏳</td><td>—</td></tr>'
return f'{CSS}<table class="eval-table"><thead><tr><th>ID</th><th>Domain</th><th>G</th><th>TICOS</th><th>Diff</th><th>Score</th><th>Val</th></tr></thead><tbody>{rows}</tbody></table>'
def _build_summary_card(results, tasks, model_name, hf_status, mode_label=""):
final,base,har_p,axis,dom_avgs = compute_final_score(results,tasks)
stage = determine_agi_stage(final,axis)
labels={"generalization":"🌐 General","reasoning":"🧠 Reason","planning":"📋 Plan","reliability":"🎯 Reliable","safety":"🛡️ Safety"}
ax_html=""
for an,av in axis.items():
c=_sc(av)
ax_html+=f'<div class="axis-row"><span style="width:110px;font-size:0.85em">{labels.get(an,an)}</span><div class="axis-bar"><div class="axis-fill" style="width:{min(av,100)}%;background:{c}"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c}">{av:.1f}</span></div>'
gh=""
for g in ["A","B","C"]:
gd=[t.domain for t in tasks if t.grade==g]
gs=[dom_avgs[d] for d in set(gd) if d in dom_avgs]
if gs: a=np.mean(gs); gh+=f'<span style="margin-right:14px">{g}×{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
done=sum(1 for t in tasks if t.task_id in results)
jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
# SWE-bench statistics
swe_count = sum(1 for t in tasks if t.metadata.get("source") == "SWE-bench_Verified")
swe_label = f" · SWE-bench: {swe_count}" if swe_count > 0 else ""
ad=[t.domain for t in tasks if t.grade=="A"]
asc=[dom_avgs[d] for d in set(ad) if d in dom_avgs]
aa=np.mean(asc) if asc else 0
checks=[("Score≥80",final>=80),("Axes≥60",all(v>=60 for v in axis.values())),(f"A-avg≥75({aa:.0f})",aa>=75)]
ch="".join([f'<span style="margin-right:8px">{"✅" if ok else "❌"}{lb}</span>' for lb,ok in checks])
return f"""<div class="summary-card">
<div style="text-align:center">
<div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div>
<h2 style="margin:6px 0;font-size:1.6em">{mode_label} FINAL: {final:.1f}</h2>
<p style="color:#aaa;font-size:0.85em">{stage['label']} · Base {base:.1f} × HAR {har_p:.3f} · {done}/{len(tasks)}{f" · JF={jf}" if jf else ""}{swe_label}</p>
</div><hr style="border-color:#333;margin:12px 0">
<h4 style="color:#aaa;margin:6px 0">🎯 5-Axis</h4>{ax_html}
<hr style="border-color:#333;margin:10px 0">
<div style="font-size:0.88em">{gh}</div>
<div style="font-size:0.82em;margin-top:6px">{ch}</div>
<p style="font-size:0.78em;color:#666;margin-top:8px">{hf_status}</p></div>"""
def _build_comparison_html(non_results, pagi_results, tasks, model_name):
if not non_results and not pagi_results:
return "<p>아직 비교할 결과가 없습니다.</p>"
tm = {t.task_id: t for t in tasks}
non_final = compute_final_score(non_results, tasks) if non_results else (0,0,0,{},{})
pagi_final = compute_final_score(pagi_results, tasks) if pagi_results else (0,0,0,{},{})
nf, nb, nh, nax, nda = non_final
pf, pb, ph2, pax, pda = pagi_final
def _delta(a, b):
d = a - b
if abs(d) < 0.5: return f'<span class="cmp-same">±0</span>'
cls = "cmp-up" if d > 0 else "cmp-down"
return f'<span class="{cls}">{"+" if d>0 else ""}{d:.1f}</span>'
ns = determine_agi_stage(nf, nax) if non_results else AGI_STAGES[0]
ps = determine_agi_stage(pf, pax) if pagi_results else AGI_STAGES[0]
header = f"""{CSS}
<div style="background:linear-gradient(135deg,#0d0d2b,#1a1a4e);border-radius:16px;padding:20px;color:#fff;margin:8px 0;text-align:center">
<h2 style="margin:0">🔄 Before vs After Comparison</h2>
<p style="color:#aaa;font-size:0.9em">{model_name}</p>
<div style="display:flex;justify-content:center;gap:40px;margin:16px 0">
<div>
<div style="font-size:0.8em;color:#aaa">🤖 Non-AGI (Before)</div>
<div class="stage-badge" style="background:{ns['color']};font-size:0.9em">{ns['name']}</div>
<div style="font-size:1.5em;font-weight:700">{nf:.1f}</div>
</div>
<div style="font-size:2em;color:#ffd700;align-self:center">→</div>
<div>
<div style="font-size:0.8em;color:#aaa">🌟 Proto-AGI (After)</div>
<div class="stage-badge" style="background:{ps['color']};font-size:0.9em">{ps['name']}</div>
<div style="font-size:1.5em;font-weight:700">{pf:.1f}</div>
</div>
<div style="align-self:center;padding:12px;background:{'#1b5e20' if pf>nf else '#b71c1c' if pf<nf else '#333'};border-radius:12px">
<div style="font-size:0.7em;color:#ccc">Delta</div>
<div style="font-size:1.4em;font-weight:700">{_delta(pf,nf)}</div>
</div>
</div></div>"""
ax_labels = {"generalization":"🌐 Generalization","reasoning":"🧠 Reasoning","planning":"📋 Planning","reliability":"🎯 Reliability","safety":"🛡️ Safety"}
ax_rows = ""
for an in AXIS_MAP:
nv = nax.get(an, 0)
pv = pax.get(an, 0)
ax_rows += f'<tr><td style="text-align:left">{ax_labels.get(an,an)}</td><td style="color:{_sc(nv)}">{nv:.1f}</td><td style="color:{_sc(pv)}">{pv:.1f}</td><td>{_delta(pv,nv)}</td></tr>'
grade_rows = ""
for g in ["A","B","C"]:
gd = list(set(t.domain for t in tasks if t.grade==g))
n_gs = [nda.get(d,0) for d in gd if d in nda]
p_gs = [pda.get(d,0) for d in gd if d in pda]
na = np.mean(n_gs) if n_gs else 0
pa = np.mean(p_gs) if p_gs else 0
grade_rows += f'<tr><td>Grade {g}{GRADE_WEIGHT[g]}</td><td style="color:{_sc(na)}">{na:.1f}</td><td style="color:{_sc(pa)}">{pa:.1f}</td><td>{_delta(pa,na)}</td></tr>'
task_rows = ""
for t in tasks[:50]: # show at most 50 tasks
ns_v = non_results.get(t.task_id,{}).get("score",0) if non_results else 0
ps_v = pagi_results.get(t.task_id,{}).get("score",0) if pagi_results else 0
if ns_v < 0: ns_v = 0
if ps_v < 0: ps_v = 0
info = DOMAIN_INFO.get(t.domain,{"icon":"?"})
src = "🐛" if t.metadata.get("source") == "SWE-bench_Verified" else ""
task_rows += f'<tr><td style="text-align:left">{info["icon"]}{src} {t.task_id[:25]}</td><td>{t.grade}</td><td style="color:{_sc(ns_v)}">{ns_v:.1f}</td><td style="color:{_sc(ps_v)}">{ps_v:.1f}</td><td>{_delta(ps_v,ns_v)}</td></tr>'
return f"""{header}
<table class="cmp-table">
<thead><tr><th style="text-align:left">5-Axis</th><th>🤖 Non-AGI</th><th>🌟 Proto-AGI</th><th>Δ</th></tr></thead>
<tbody>{ax_rows}</tbody></table>
<table class="cmp-table">
<thead><tr><th>Grade</th><th>🤖 Non-AGI</th><th>🌟 Proto-AGI</th><th>Δ</th></tr></thead>
<tbody>{grade_rows}</tbody></table>
<details style="margin-top:12px"><summary style="cursor:pointer;font-weight:600;font-size:0.95em">📋 Task-Level Comparison (click)</summary>
<table class="cmp-table" style="margin-top:8px">
<thead><tr><th style="text-align:left">Task</th><th>G</th><th>🤖</th><th>🌟</th><th>Δ</th></tr></thead>
<tbody>{task_rows}</tbody></table></details>"""
def _build_detail_view(results, tasks):
items=""
for t in tasks:
if t.task_id not in results: continue
d=results[t.task_id]; info=DOMAIN_INFO.get(t.domain,{"icon":"?"})
s=d["score"]; resp=html.escape((d.get("response","") or "")[:500])
jc=""; ss=""
try:
jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else(d["judge"] or {})
jc=html.escape((jd.get("comment","") if isinstance(jd,dict) else "")[:200])
sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
ss=" · ".join([f"{k.split('_')[0]}={v}" for k,v in sc.items()])
except: pass
c=_sc(s) if s>=0 else "#ff9800"
badge=f'{s:.1f}' if s>=0 else "JF"
src = "🐛 SWE-bench" if t.metadata.get("source") == "SWE-bench_Verified" else "📝 FINAL"
items+=f'<details style="margin:3px 0;border:1px solid #ddd;border-radius:8px;padding:8px"><summary style="cursor:pointer;font-weight:600">{info["icon"]} {t.task_id[:30]} [{t.grade}] — <span style="color:{c}">{badge}</span> <span style="font-size:0.75em;color:#888">{src}</span></summary><div style="font-size:0.8em;margin-top:6px"><b>{t.title}</b><br>TICOS: {t.ticos_type} | Scores: {ss}<br>Judge: {jc}<br>Response: {resp}...</div></details>'
return CSS+items
def _build_model_info_html():
"""Darwin-gpt-ernie-20b 모델 카드 HTML"""
cfg = LOCAL_MODEL_CONFIG
# Friendli 상태 체크
friendli_token = os.getenv("FRIENDLI_TOKEN", "")
friendli_status = "🟢 Connected" if friendli_token else "🔴 FRIENDLI_TOKEN not set"
# vLLM 상태
if cfg["server_ready"]:
vllm_status = "🟢 Running"
elif cfg.get("server_starting"):
vllm_status = "🟡 Starting..."
else:
vllm_status = "🔴 Stopped"
gpu_count = cfg.get("gpu_count", 0)
gpu = f"{gpu_count}x {cfg.get('gpu_info', 'Not detected')}" if gpu_count else "Not detected"
active = cfg.get("active_config", {})
active_str = f"TP={active.get('tp','?')} MaxLen={active.get('maxlen','?')}" if active else "—"
return f"""{CSS}
<div class="model-card">
<h3 style="margin:0">🧬 Darwin-gpt-ernie-20b</h3>
<p style="color:#aaa;font-size:0.85em;margin:4px 0">Evolutionary Merge (진화적 병합 v3.2)</p>
<table style="color:#ccc;font-size:0.82em;margin:8px 0">
<tr><td style="padding:2px 12px 2px 0">Base Models</td><td>openai/gpt-oss-20b + baidu/ERNIE-4.5-21B-A3B-Thinking</td></tr>
<tr><td style="padding:2px 12px 2px 0">Merge Ratio</td><td>{cfg['merge_ratio']*100:.0f}% / {(1-cfg['merge_ratio'])*100:.0f}%</td></tr>
<tr><td style="padding:2px 12px 2px 0">Parameters</td><td>{cfg['params']} total ({cfg['active_params']} active)</td></tr>
<tr><td style="padding:2px 12px 2px 0">Architecture</td><td>MoE (Mixture of Experts)</td></tr>
<tr><td style="padding:2px 12px 2px 0">🚀 Friendli</td><td><b>{friendli_status}</b> · model: deppfs281rgffnk</td></tr>
<tr><td style="padding:2px 12px 2px 0">🖥️ vLLM</td><td>{vllm_status} (port {cfg['server_port']}) | {active_str}</td></tr>
<tr><td style="padding:2px 12px 2px 0">🖥️ GPU</td><td>{gpu}</td></tr>
</table>
<p style="font-size:0.75em;color:#888;margin:4px 0">
📊 SWE-bench_Verified: {len(SWE_BENCH_TASKS)} tasks | FINAL Bench: {len(ALL_TASKS)} tasks
</p></div>"""
# ════════════════════════════════════════════════════════════════
# §9. Evaluation Engine — ★ Producer-Consumer Pipeline
# Stage 1 (Solver Pool): call the evaluated model → Queue
# Stage 2 (Judge Pool): Queue → GPT-5.2 grading → save results
# Both pools run concurrently: the Judge grades while the Solver keeps solving
# ════════════════════════════════════════════════════════════════
import queue # thread-safe queue
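# Minimal producer-consumer sketch (generic shape, not the benchmark workers below):
#   q = queue.Queue(maxsize=N)   # bounded buffer applies backpressure to producers
#   producer: q.put(item)        # blocks while the buffer is full
#   consumer: item = q.get(); process(item); q.task_done()
#   shutdown: a threading.Event plus an empty queue tells consumers to exit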
# ── Stage 1: Solver (problem solving only) ──
def _solve_single(task, api_key, eval_model, state, proto_agi=False):
"""Have the evaluated model solve the task (collect the response only; no grading)"""
try:
judge_input = None
if proto_agi:
full_output, prev_outputs = run_proto_agi_pipeline(task.prompt, api_key, eval_model, task)
model_response = full_output
judge_input = compress_for_judge(prev_outputs)
else:
sys_p = (f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. "
f"If unsure, say so honestly.")
model_response = call_model(task.prompt, system=sys_p, api_key=api_key,
model=eval_model, max_tokens=12288)
is_error = any(model_response.startswith(pfx)
for pfx in ["[API_ERROR","[LOCAL_ERROR","[HF_ERROR","[FRIENDLI_ERROR","[EMPTY]","[ERROR]"])
if is_error:
print(f" ❌ Solver error for {task.task_id[:25]}: {model_response[:150]}")
with state["lock"]:
state["solved"] += 1
info = DOMAIN_INFO.get(task.domain, {"icon":"?"})
state["solve_active"].append(f'{info["icon"]}🧠 {task.task_id[:18]}')
if len(state["solve_active"]) > 8:
state["solve_active"] = state["solve_active"][-8:]
if is_error:
state["errors"].append(f"Solver:{task.task_id[:15]}:{model_response[:60]}")
return {
"task": task,
"model_response": model_response,
"judge_input": judge_input,
"is_error": is_error,
}
except Exception as e:
with state["lock"]:
state["solved"] += 1
state["errors"].append(f"S:{task.task_id[:15]}:{str(e)[:40]}")
return {
"task": task,
"model_response": f"[ERROR] {e}",
"judge_input": None,
"is_error": True,
}
# ── Stage 2: Judge (grading only) ──
def _judge_single(solve_result, run_id, api_key, judge_model, state):
"""Grade a solve result with GPT-5.2"""
task = solve_result["task"]
model_response = solve_result["model_response"]
judge_input = solve_result["judge_input"]
try:
# If the solver errored, record a zero score immediately
if solve_result["is_error"]:
err_preview = model_response[:100]
print(f" ❌ Judge skip (solver error): {task.task_id[:25]}{err_preview}")
_save_result(run_id, task.task_id, model_response, "{}", 0)
with state["lock"]:
state["judged"] += 1
state["errors"].append(f"{task.task_id[:15]}: {model_response[:40]}")
return task.task_id, {"response": model_response, "judge": "{}", "score": 0}
# ★ Judge call — always GPT-5.2 (OpenAI API)
resp_for_judge = judge_input if judge_input else model_response
jp = build_judge_prompt(task, resp_for_judge)
jd = call_judge_structured(jp, system=JUDGE_SYSTEM, api_key=api_key, model=judge_model)
if jd is None:
jr = call_openai(jp, system=JUDGE_SYSTEM, api_key=api_key, model=judge_model,
max_tokens=2048, temperature=0.05, reasoning_effort="none")
jd = parse_judge_fallback(jr, list(RUBRIC.keys()))
if jd is None:
jd = {"scores":{k:0.0 for k in RUBRIC}, "comment":"FAILURE", "failed":True}
if jd.get("failed"):
ws = -1.0
jd["comment"] = f"JF:{jd['comment']}"
print(f" ❌ Judge FAILED for {task.task_id[:25]}: {jd['comment'][:100]}")
else:
ws = compute_task_score(jd["scores"])
print(f" ✅ Judged {task.task_id[:25]}: score={ws:.1f}, scores={jd['scores']}")
with state["lock"]:
state["parse_ok"] += 1
jj = json.dumps(jd, ensure_ascii=False)
_save_result(run_id, task.task_id, model_response, jj, ws)
with state["lock"]:
state["judged"] += 1
info = DOMAIN_INFO.get(task.domain, {"icon":"?"})
state["judge_active"].append(f'{info["icon"]}⚖️ {task.task_id[:18]}')
if len(state["judge_active"]) > 8:
state["judge_active"] = state["judge_active"][-8:]
return task.task_id, {"response": model_response, "judge": jj, "score": ws}
except Exception as e:
with state["lock"]:
state["judged"] += 1
state["errors"].append(f"J:{task.task_id[:15]}:{str(e)[:40]}")
_save_result(run_id, task.task_id, model_response, "{}", 0)
return task.task_id, {"response": model_response, "judge": "{}", "score": 0}
# ── Stage 1→2 Bridge: when a Solver finishes, push the result into the Judge queue ──
def _solver_to_queue(task, api_key, eval_model, state, proto_agi, judge_q):
"""Solver worker — after solving, put the result on judge_q"""
result = _solve_single(task, api_key, eval_model, state, proto_agi)
judge_q.put(result) # hand off to the Judge pool; blocks only if the buffer is full
return result
def _judge_from_queue(judge_q, run_id, api_key, judge_model, state, result_dict, stop_flag):
"""Judge 워커 (데몬) — Queue에서 풀이 결과를 꺼내 채점"""
while True:
try:
solve_result = judge_q.get(timeout=2)
except queue.Empty:
# Queue is empty: check for the global shutdown signal
if stop_flag.is_set() and judge_q.empty():
break
continue
if solve_result is None: # shutdown sentinel
break
tid, data = _judge_single(solve_result, run_id, api_key, judge_model, state)
with state["lock"]:
result_dict[tid] = data
task = solve_result["task"]
state["grade_done"][task.grade] = state["grade_done"].get(task.grade, 0) + 1
judge_q.task_done()
# ════════════════════════════════════════════════════════════════
# §10. State Machine + Pipeline Runner
# ════════════════════════════════════════════════════════════════
_EVAL_STATE = {
"running":False,"stop_requested":False,"finished":False,
"run_id":"","model":"","mode":"NON",
# ★ two-stage pipeline counters
"solved":0, "judged":0, "total":0, "cached":0,
"errors":[],"solve_active":[],"judge_active":[],
"parse_ok":0,"parse_fail":0,
"start_time":0,"results":{},"tasks":[],
"grade_done":{},"grade_total":{},
"lock":threading.Lock(),"message":"","csv_path":None,"hf_status":"",
"n_workers":5,"proto_agi":False,
# compare mode
"compare_mode":False,"compare_phase":"","non_results":{},"pagi_results":{},
"non_run_id":"","pagi_run_id":"",
# pipeline settings
"solver_workers":3,"judge_workers":5,
}
def _reset():
with _EVAL_STATE["lock"]:
_EVAL_STATE.update({
"running":False,"stop_requested":False,"finished":False,
"solved":0,"judged":0,"cached":0,
"errors":[],"solve_active":[],"judge_active":[],
"parse_ok":0,"parse_fail":0,
"start_time":0,"results":{},"tasks":[],"grade_done":{},"grade_total":{},
"message":"","csv_path":None,"hf_status":"","mode":"NON","proto_agi":False,
"compare_mode":False,"compare_phase":"",
})
def _prog_html(state, pending):
"""★ 2단계 파이프라인 진행률 표시"""
solved = state.get("solved", 0)
judged = state.get("judged", 0)
total = max(pending, 1)
pct_solve = min(int(solved / total * 100), 100)
pct_judge = min(int(judged / total * 100), 100)
mode_icon = "🌟 Proto-AGI" if state.get("proto_agi") else "🤖 Non-AGI"
phase = state.get("compare_phase", "")
if phase:
mode_icon = f"🔄 Compare: {phase}"
# ★ Two-stage progress bars: Solver + Judge
pipeline_html = f"""
<div style="margin:8px 0">
<div style="display:flex;justify-content:space-between;font-size:0.95em;margin-bottom:4px">
<span>⚡ <b>{mode_icon}</b></span>
<span style="font-weight:700">🧠 Solve {solved}/{total} · ⚖️ Judge {judged}/{total}</span>
</div>
<div style="display:flex;gap:4px;margin-bottom:2px;font-size:0.78em;color:#666">
<span style="flex:1">🧠 Solver ({state.get('solver_workers',3)}w)</span>
<span style="flex:1;text-align:right">⚖️ Judge ({state.get('judge_workers',5)}w)</span>
</div>
<div style="display:flex;gap:4px">
<div style="flex:1;background:#e0e0e0;border-radius:8px;height:18px;overflow:hidden">
<div style="width:{pct_solve}%;height:100%;border-radius:8px;background:linear-gradient(90deg,#1565c0,#42a5f5);transition:width .4s"></div>
</div>
<div style="flex:1;background:#e0e0e0;border-radius:8px;height:18px;overflow:hidden">
<div style="width:{pct_judge}%;height:100%;border-radius:8px;background:linear-gradient(90deg,#e65100,#ff9800);transition:width .4s"></div>
</div>
</div>
<div style="display:flex;gap:4px;font-size:0.78em;color:#888;margin-top:2px">
<span style="flex:1">🧠 {pct_solve}%</span>
<span style="flex:1;text-align:right">⚖️ {pct_judge}%</span>
</div>"""
# Buffer display: solved but not yet judged
in_buffer = solved - judged
if in_buffer > 0:
pipeline_html += f'<div style="text-align:center;font-size:0.8em;color:#1565c0;margin:4px 0">📦 Awaiting judge: {in_buffer}</div>'
# Per-grade progress
gb = ""
for g in ["A","B","C"]:
gt = state["grade_total"].get(g, 0)
gd = state["grade_done"].get(g, 0)
if gt == 0: continue
gp = min(int(gd / gt * 100), 100)
c = "#4caf50" if gp == 100 else ("#1976d2" if gp > 0 else "#e0e0e0")
gb += (f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0">'
f'<span style="width:100px;font-size:0.85em">'
f'{"🅰️" if g=="A" else "🅱️" if g=="B" else "🅾️"} {g}×{GRADE_WEIGHT[g]}</span>'
f'<div style="flex:1;background:#e0e0e0;border-radius:6px;height:14px;overflow:hidden">'
f'<div style="width:{gp}%;height:100%;background:{c};border-radius:6px"></div></div>'
f'<span style="width:55px;font-size:0.82em;text-align:right;color:{c}">{gd}/{gt}</span></div>')
pipeline_html += gb
# Active task display (Solver and Judge shown separately)
sa = state.get("solve_active", [])
ja = state.get("judge_active", [])
if sa:
pipeline_html += '<div style="margin-top:6px;font-size:0.78em">🧠 ' + " ".join(
[f'<span style="background:#e3f2fd;padding:1px 5px;border-radius:4px">{a}</span>' for a in sa[-6:]]
) + '</div>'
if ja:
pipeline_html += '<div style="margin-top:3px;font-size:0.78em">⚖️ ' + " ".join(
[f'<span style="background:#fff3e0;padding:1px 5px;border-radius:4px">{a}</span>' for a in ja[-6:]]
) + '</div>'
# Errors
er = state.get("errors", [])
if er:
pipeline_html += f'<div style="color:#c62828;margin-top:6px;font-size:0.82em;background:#ffebee;padding:6px;border-radius:6px;max-height:120px;overflow-y:auto">'
pipeline_html += '<b>⚠️ Errors:</b><br>'
for e in er[-8:]:
pipeline_html += f'· {html.escape(e[:80])}<br>'
pipeline_html += '</div>'
pipeline_html += '</div>'
return pipeline_html
def _run_phase(api_key, eval_model, judge_model, tasks, run_id, solver_w, judge_w, proto_agi):
"""★ 파이프라인 실행: SolverPool → Queue → JudgePool 병렬 처리"""
global _EVAL_STATE
results = dict(_load_all(run_id))
cached = sum(1 for t in tasks if t.task_id in results)
pending = [t for t in tasks if t.task_id not in results]
gt = {}
for t in pending:
gt.setdefault(t.grade, []).append(t)
# ★ Cap Solver concurrency for local/dedicated models
is_local = eval_model in LOCAL_MODELS
if is_local:
minfo = LOCAL_MODELS.get(eval_model, {})
if minfo.get("type") == "friendli":
# Friendli API: cloud endpoint, so some concurrency is fine
solver_w = min(solver_w, 3)
if proto_agi:
solver_w = min(solver_w, 2)
else:
# Local vLLM: workers share one GPU
solver_w = min(solver_w, 2)
if proto_agi:
solver_w = 1
elif proto_agi:
solver_w = min(solver_w, 3)
with _EVAL_STATE["lock"]:
_EVAL_STATE["results"] = results
_EVAL_STATE["cached"] = cached
_EVAL_STATE["total"] = len(pending)
_EVAL_STATE["grade_total"] = {g: len(ts) for g, ts in gt.items()}
_EVAL_STATE["grade_done"] = {g: 0 for g in gt}
_EVAL_STATE["solved"] = 0
_EVAL_STATE["judged"] = 0
_EVAL_STATE["errors"] = []
_EVAL_STATE["solve_active"] = []
_EVAL_STATE["judge_active"] = []
_EVAL_STATE["proto_agi"] = proto_agi
_EVAL_STATE["solver_workers"] = solver_w
_EVAL_STATE["judge_workers"] = judge_w
if not pending:
return results
# ★ Pipeline wiring
judge_q = queue.Queue(maxsize=solver_w * 3) # buffer: 3x the solver count
solver_done = threading.Event() # signals that all solvers have finished
result_dict = dict(results) # shared across threads (writes guarded by state lock)
# ── Start the Judge daemon threads first (they block on the queue) ──
judge_threads = []
for i in range(judge_w):
jt = threading.Thread(
target=_judge_from_queue,
args=(judge_q, run_id, api_key, judge_model, _EVAL_STATE, result_dict, solver_done),
daemon=True,
name=f"Judge-{i}"
)
jt.start()
judge_threads.append(jt)
# ── Run the Solver pool → on completion, signal the Judges to drain and exit ──
try:
with ThreadPoolExecutor(max_workers=solver_w, thread_name_prefix="Solver") as solver_pool:
futs = {}
for t in pending:
if _EVAL_STATE["stop_requested"]:
break
fut = solver_pool.submit(
_solver_to_queue, t, api_key, eval_model, _EVAL_STATE, proto_agi, judge_q
)
futs[fut] = t
# Wait for the Solvers to finish
done_set = set()
while len(done_set) < len(futs):
if _EVAL_STATE["stop_requested"]:
solver_pool.shutdown(wait=False, cancel_futures=True)
break
for f in list(futs):
if f in done_set:
continue
if f.done():
done_set.add(f)
try:
f.result() # surface any exception
except Exception as e:
with _EVAL_STATE["lock"]:
_EVAL_STATE["errors"].append(f"SolverEx:{str(e)[:40]}")
time.sleep(0.3)
finally:
# ★ All Solvers done → shutdown signal for the Judges
solver_done.set()
# ── Wait for the Judges to finish ──
for jt in judge_threads:
jt.join(timeout=300) # 5-minute timeout
# Sync results back into shared state
with _EVAL_STATE["lock"]:
_EVAL_STATE["results"] = result_dict
return dict(result_dict)
def _bg_single(api_key, eval_model, judge_model, tasks, run_id, solver_w, judge_w, proto_agi):
global _EVAL_STATE
try:
with _EVAL_STATE["lock"]:
_EVAL_STATE["start_time"] = time.time()
mode = "🌟 Proto-AGI" if proto_agi else "🤖 Non-AGI"
_EVAL_STATE["message"] = f"⚡ {mode}{len(tasks)} tasks (🧠{solver_w}w ⚖️{judge_w}w)"
results = _run_phase(api_key, eval_model, judge_model, tasks, run_id, solver_w, judge_w, proto_agi)
_finalize_single(tasks, results, eval_model, "PAGI" if proto_agi else "NON")
except Exception as e:
with _EVAL_STATE["lock"]:
_EVAL_STATE["message"] = f"❌ Fatal: {str(e)[:100]}"
_EVAL_STATE["running"] = False; _EVAL_STATE["finished"] = True
def _bg_compare(api_key, eval_model, judge_model, tasks, non_rid, pagi_rid, solver_w, judge_w):
global _EVAL_STATE
try:
with _EVAL_STATE["lock"]:
_EVAL_STATE["start_time"] = time.time()
_EVAL_STATE["compare_phase"] = "Phase 1/2 — 🤖 Non-AGI"
_EVAL_STATE["message"] = f"🔄 Phase 1/2: Non-AGI (🧠{solver_w}w ⚖️{judge_w}w)"
non_results = _run_phase(api_key, eval_model, judge_model, tasks, non_rid, solver_w, judge_w, proto_agi=False)
if _EVAL_STATE["stop_requested"]:
with _EVAL_STATE["lock"]:
_EVAL_STATE["message"] = "⏹️ Stopped during Phase 1"
_EVAL_STATE["running"] = False; _EVAL_STATE["finished"] = True
return
with _EVAL_STATE["lock"]:
_EVAL_STATE["non_results"] = non_results
_EVAL_STATE["compare_phase"] = "Phase 2/2 — 🌟 Proto-AGI"
_EVAL_STATE["message"] = f"🔄 Phase 2/2: Proto-AGI (🧠{solver_w}w ⚖️{judge_w}w)"
pagi_results = _run_phase(api_key, eval_model, judge_model, tasks, pagi_rid, solver_w, judge_w, proto_agi=True)
with _EVAL_STATE["lock"]:
_EVAL_STATE["pagi_results"] = pagi_results
csv1 = generate_csv(non_results, tasks, eval_model, "NON-AGI")
csv2 = generate_csv(pagi_results, tasks, eval_model, "PROTO-AGI")
combined_csv = csv1 + (csv2.split("\n", 1)[1] if "\n" in csv2 else csv2) # append csv2 minus its header row
cp = f"/tmp/final_compare_{non_rid}.csv"
with open(cp, "w", encoding="utf-8") as f:
f.write(combined_csv)
hf = upload_to_hf(combined_csv, eval_model, "COMPARE")
nf = compute_final_score(non_results, tasks)[0]
pf = compute_final_score(pagi_results, tasks)[0]
delta = pf - nf
elapsed = int(time.time() - _EVAL_STATE["start_time"])
with _EVAL_STATE["lock"]:
_EVAL_STATE["csv_path"] = cp; _EVAL_STATE["hf_status"] = hf
_EVAL_STATE["message"] = (
f"🏁 Compare Complete! Non-AGI={nf:.1f} → Proto-AGI={pf:.1f} "
f"(Δ={'+' if delta>0 else ''}{delta:.1f}) · {elapsed}s"
)
_EVAL_STATE["running"] = False; _EVAL_STATE["finished"] = True
except Exception as e:
with _EVAL_STATE["lock"]:
_EVAL_STATE["message"] = f"❌ Fatal: {str(e)[:100]}"
_EVAL_STATE["running"] = False; _EVAL_STATE["finished"] = True
def _finalize_single(tasks, results, eval_model, mode_tag):
global _EVAL_STATE
final,base,har,axis,_=compute_final_score(results,tasks)
stage=determine_agi_stage(final,axis)
csv_str=generate_csv(results,tasks,eval_model,mode_tag)
cp=f"/tmp/final_{_EVAL_STATE['run_id']}.csv"
with open(cp,"w",encoding="utf-8") as f: f.write(csv_str)
hf=upload_to_hf(csv_str,eval_model,mode_tag)
elapsed=int(time.time()-_EVAL_STATE["start_time"])
with _EVAL_STATE["lock"]:
_EVAL_STATE["csv_path"]=cp; _EVAL_STATE["hf_status"]=hf
_EVAL_STATE["message"]=f"🏁 {stage['name']} — FINAL={final:.1f} · {elapsed}s"
_EVAL_STATE["running"]=False; _EVAL_STATE["finished"]=True
def _get_selected_tasks(dataset_choice, grade_f, diff_f, max_t):
"""데이터셋 선택에 따라 태스크 목록 반환"""
if dataset_choice == "FINAL Bench Only":
tasks = ALL_TASKS[:]
elif dataset_choice == "SWE-bench Verified Only":
tasks = SWE_BENCH_TASKS[:]
else: # Both
tasks = ALL_TASKS[:] + SWE_BENCH_TASKS[:]
if grade_f != "All":
tasks = [t for t in tasks if t.grade == grade_f]
if diff_f != "All":
tasks = [t for t in tasks if t.difficulty == diff_f]
tasks = tasks[:int(max_t)]
return tasks
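# Example (hypothetical filters): Grade-A FINAL tasks only, capped at 20:
#   tasks = _get_selected_tasks("FINAL Bench Only", "A", "All", 20)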
def _start_eval(api_key, eval_model, judge_model, proto_agi, dataset_choice, grade_f, diff_f, max_t, s_w, j_w, fresh):
"""단일 모드 시작 (Non-AGI 또는 Proto-AGI)"""
global _EVAL_STATE
if _EVAL_STATE["running"]: return "⚠️ Already running"
# API key check: the Judge always needs OpenAI
api_key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "")
if not api_key: return "❌ OpenAI API Key required (for Judge)"
# ★ Pre-flight API connectivity check
if eval_model not in LOCAL_MODELS:
ok, msg = _test_api_connection(api_key, eval_model)
if not ok:
# Try a fallback model
actual = _resolve_model(eval_model, api_key)
if actual == eval_model:
return f"❌ Eval model API check failed: {msg}"
else:
print(f" ℹ️ Eval model {eval_model}{actual}")
# Validate the Judge model
ok_j, msg_j = _test_api_connection(api_key, judge_model)
if not ok_j:
actual_j = _resolve_model(judge_model, api_key)
if actual_j == judge_model:
return f"❌ Judge model API check failed: {msg_j}"
else:
print(f" ℹ️ Judge model {judge_model}{actual_j}")
# Local/dedicated model checks
if eval_model in LOCAL_MODELS:
minfo = LOCAL_MODELS[eval_model]
if minfo["type"] == "friendli":
ok_f, msg_f = _test_friendli(minfo["id"])
if not ok_f:
return (f"❌ Friendli API check failed: {msg_f}\n"
f"💡 Set FRIENDLI_TOKEN in Space secrets")
print(f" ✅ Friendli API OK: {minfo['id']}")
elif minfo["type"] == "local_vllm":
port = LOCAL_MODEL_CONFIG["server_port"]
# ★ Try to auto-detect a running server
if not LOCAL_MODEL_CONFIG["server_ready"]:
if _probe_vllm_server(port):
LOCAL_MODEL_CONFIG["server_ready"] = True
print(f" ✅ vLLM server detected on port {port}")
# ★ If it is still starting up, wait (up to 2 minutes)
if LOCAL_MODEL_CONFIG.get("server_starting") and not LOCAL_MODEL_CONFIG["server_ready"]:
print(" ⏳ Waiting for vLLM server to finish starting...")
for _ in range(24):
time.sleep(5)
if _probe_vllm_server(port):
LOCAL_MODEL_CONFIG["server_ready"] = True
break
if not LOCAL_MODEL_CONFIG.get("server_starting"):
break
# ★ Still not ready: tell the user how to start it
if not LOCAL_MODEL_CONFIG["server_ready"]:
if _detect_gpu():
return ("⏳ vLLM server not ready. Click '🚀 Start vLLM Server' to start, "
"or wait for auto-start to complete. "
f"GPU: {LOCAL_MODEL_CONFIG.get('gpu_info','unknown')}")
else:
return "❌ Local vLLM requires GPU but none detected. Use OpenAI model instead."
if minfo["type"] == "hf_inference":
hf_token = os.getenv("HF_TOKEN", "")
if not hf_token:
return "❌ HF_TOKEN required for HuggingFace Inference API. Set it in Space secrets."
ok, ep_type, msg = _test_hf_model(minfo["id"], hf_token)
if not ok:
return (f"❌ HF Inference not available for {minfo['id']}: {msg}\n"
f"💡 Try: (1) Use 'Darwin-gpt-ernie-20b (Local vLLM)' with GPU, "
f"(2) Use OpenAI model like gpt-4o instead")
print(f" ✅ HF model OK: {minfo['id']} via {ep_type}")
tasks = _get_selected_tasks(dataset_choice, grade_f, diff_f, int(max_t))
if not tasks:
return "❌ No tasks found for selected filters"
mode = "PAGI" if proto_agi else "NON"
rid = _make_run_id(eval_model, mode)
if fresh: _clear_run(rid)
_reset()
# ★ Record the model names that will actually be used
actual_eval = _resolve_model(eval_model, api_key) if eval_model not in LOCAL_MODELS else eval_model
actual_judge = _resolve_model(judge_model, api_key)
with _EVAL_STATE["lock"]:
_EVAL_STATE.update({"running":True,"run_id":rid,"model":eval_model,"tasks":tasks,
"total":len(tasks),"n_workers":int(s_w),"mode":mode,"proto_agi":proto_agi,
"actual_eval_model": actual_eval, "actual_judge_model": actual_judge})
threading.Thread(target=_bg_single, daemon=True,
args=(api_key, eval_model, judge_model, tasks, rid, int(s_w), int(j_w), proto_agi)).start()
icon = "🌟 Proto-AGI" if proto_agi else "🤖 Non-AGI"
swe_cnt = sum(1 for t in tasks if t.metadata.get("source") == "SWE-bench_Verified")
final_cnt = len(tasks) - swe_cnt
model_info = f"{eval_model}" if actual_eval == eval_model else f"{eval_model} → {actual_eval}"
judge_info = f"{judge_model}" if actual_judge == judge_model else f"{judge_model} → {actual_judge}"
return f"⚡ {icon} Started ({len(tasks)} tasks = FINAL:{final_cnt} + SWE:{swe_cnt} · 🧠{int(s_w)}w ⚖️{int(j_w)}w) · Eval:{model_info} Judge:{judge_info}"
def _start_compare(api_key, eval_model, judge_model, dataset_choice, grade_f, diff_f, max_t, s_w, j_w, fresh):
global _EVAL_STATE
if _EVAL_STATE["running"]: return "⚠️ Already running"
api_key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "")
if not api_key: return "❌ OpenAI API Key required (for Judge)"
# ★ Pre-flight API checks
if eval_model in LOCAL_MODELS:
minfo = LOCAL_MODELS[eval_model]
if minfo.get("type") == "friendli":
ok_f, msg_f = _test_friendli(minfo["id"])
if not ok_f:
return f"❌ Friendli API check failed: {msg_f}"
else:
ok, msg = _test_api_connection(api_key, eval_model)
if not ok:
actual = _resolve_model(eval_model, api_key)
if actual == eval_model:
return f"❌ Eval model failed: {msg}"
ok_j, msg_j = _test_api_connection(api_key, judge_model)
if not ok_j:
actual_j = _resolve_model(judge_model, api_key)
if actual_j == judge_model:
return f"❌ Judge model failed: {msg_j}"
tasks = _get_selected_tasks(dataset_choice, grade_f, diff_f, int(max_t))
if not tasks: return "❌ No tasks found"
non_rid = _make_run_id(eval_model, "NON")
pagi_rid = _make_run_id(eval_model, "PAGI")
if fresh: _clear_run(non_rid); _clear_run(pagi_rid)
_reset()
with _EVAL_STATE["lock"]:
_EVAL_STATE.update({"running":True,"model":eval_model,"tasks":tasks,
"total":len(tasks),"n_workers":int(s_w),
"compare_mode":True,"non_run_id":non_rid,"pagi_run_id":pagi_rid,
"run_id":non_rid})
threading.Thread(target=_bg_compare, daemon=True,
args=(api_key, eval_model, judge_model, tasks, non_rid, pagi_rid, int(s_w), int(j_w))).start()
return f"🔄 Compare Started! ({len(tasks)} tasks · 🧠{int(s_w)}w ⚖️{int(j_w)}w)"
def _stop():
if _EVAL_STATE["running"]: _EVAL_STATE["stop_requested"]=True; return "⏹️ Stopping..."
return "ℹ️ Not running"
def _poll():
with _EVAL_STATE["lock"]:
running=_EVAL_STATE["running"]; finished=_EVAL_STATE["finished"]
tasks=_EVAL_STATE.get("tasks",[]); results=dict(_EVAL_STATE.get("results",{}))
msg=_EVAL_STATE.get("message",""); cp=_EVAL_STATE.get("csv_path")
compare_mode=_EVAL_STATE.get("compare_mode",False)
non_results=dict(_EVAL_STATE.get("non_results",{}))
pagi_results=dict(_EVAL_STATE.get("pagi_results",{}))
if not running and not finished and not results:
return("ℹ️ Press ▶️ Start, 🌟 Proto-AGI, or 🔄 Compare","","","","",None)
if running:
pend = _EVAL_STATE.get("total",0) - _EVAL_STATE.get("cached",0)
ph=CSS+_prog_html(_EVAL_STATE, pend)
elif finished:
ph=f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;font-weight:600">{msg}</div>'
else: ph=msg
th=_build_progress_table(results,tasks) if tasks else ""
sh,dh,cmp_html,co="","","",None
if finished and tasks:
model = _EVAL_STATE.get("model","?")
hf_st = _EVAL_STATE.get("hf_status","")
if compare_mode:
sh = (_build_summary_card(non_results,tasks,model,hf_st,"🤖 Non-AGI") +
_build_summary_card(pagi_results,tasks,model,hf_st,"🌟 Proto-AGI"))
cmp_html = _build_comparison_html(non_results, pagi_results, tasks, model)
else:
sh = _build_summary_card(results,tasks,model,hf_st,
"🌟 Proto-AGI" if _EVAL_STATE.get("proto_agi") else "🤖 Non-AGI")
non_rid = _make_run_id(model,"NON")
pagi_rid = _make_run_id(model,"PAGI")
old_non = _load_all(non_rid)
old_pagi = _load_all(pagi_rid)
if old_non and old_pagi:
cmp_html = _build_comparison_html(old_non, old_pagi, tasks, model)
else:
cmp_html = "<p style='color:#888'>Run both Non-AGI and Proto-AGI (or use 🔄 Compare) to see comparison.</p>"
dh = _build_detail_view(results,tasks)
co = cp
return(ph,th,sh,cmp_html,dh,co)
# ════════════════════════════════════════════════════════════════
# §11. Gradio App
# ════════════════════════════════════════════════════════════════
HEADER = """
<div style="text-align:center;padding:16px 0">
<h1 style="margin:0;font-size:1.8em">🏆 FINAL Bench v3.1 + 🧬 Darwin Local Eval</h1>
<h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
<p style="color:#888;font-size:0.88em;max-width:780px;margin:8px auto">
<b>FINAL 100 Tasks + SWE-bench 500 Tasks · 15+ Domains · 8 TICOS · 5-Axis · 5-Stage AGI Grade</b><br>
🤖 Non-AGI (single LLM) vs 🌟 Proto-AGI (五行 木→火→土→金→水)<br>
🧬 <b>Darwin-gpt-ernie-20b</b> (gpt-oss-20b + ERNIE-4.5-21B Merge) · Friendli Dedicated Endpoint<br>
🐛 <b>SWE-bench_Verified</b> (500 Real-world Bug Fix Tasks) · ⚖️ Judge: GPT-5.2
</p>
<div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
<span style="background:#ffcdd2;padding:2px 10px;border-radius:12px">🅰️ A×1.5</span>
<span style="background:#bbdefb;padding:2px 10px;border-radius:12px">🅱️ B×1.0</span>
<span style="background:#e1bee7;padding:2px 10px;border-radius:12px">🅾️ C×0.7</span>
<span style="background:#c8e6c9;padding:2px 10px;border-radius:12px">🐛 SWE-bench</span>
<span style="background:#ffe0b2;padding:2px 10px;border-radius:12px">🧬 Darwin Merge</span>
</div></div>"""
def create_app():
with gr.Blocks(title="FINAL Bench v3.1 + Darwin", theme=gr.themes.Soft(), css=".gradio-container{max-width:1200px !important} header{display:none!important}") as app:
gr.HTML(HEADER)
# ── Model server controls ──
with gr.Accordion("🧬 Darwin-gpt-ernie-20b Local Server", open=False):
model_info = gr.HTML(_build_model_info_html())
with gr.Row():
srv_start = gr.Button("🚀 Start vLLM Server", variant="primary", scale=2)
srv_stop = gr.Button("⏹️ Stop Server", variant="stop", scale=1)
srv_refresh = gr.Button("🔄 Refresh Status", scale=1)
srv_status = gr.Textbox(label="Server Status", interactive=False, max_lines=2)
srv_start.click(fn=_start_local_model_server, outputs=[srv_status])
srv_stop.click(fn=_stop_local_model_server, outputs=[srv_status])
def _refresh_model_info():
_auto_detect_server()
return _build_model_info_html()
srv_refresh.click(fn=_refresh_model_info, outputs=[model_info])
# ── Evaluation settings ──
with gr.Row():
api_key = gr.Textbox(label="🔑 OpenAI API Key (Judge용)", type="password",
placeholder="sk-...", value=os.getenv("OPENAI_API_KEY",""), scale=4)
diag_btn = gr.Button("🔍 API Test", variant="secondary", scale=1)
diag_status = gr.Textbox(label="API Diagnostic", interactive=False, max_lines=3, visible=True)
def _run_diagnostic(key):
key = (key or "").strip() or os.getenv("OPENAI_API_KEY", "")
results = []
# Friendli check
friendli_token = os.getenv("FRIENDLI_TOKEN", "")
if friendli_token:
ok_fr, msg_fr = _test_friendli()
results.append(f"{'✅' if ok_fr else '❌'} Friendli (Darwin): {msg_fr[:60]}")
else:
results.append("⚠️ FRIENDLI_TOKEN not set")
# Check the local vLLM server
port = LOCAL_MODEL_CONFIG["server_port"]
if _probe_vllm_server(port):
results.append(f"✅ vLLM server: Running on port {port}")
try:
r = requests.get(f"http://localhost:{port}/v1/models", timeout=5)
if r.status_code == 200:
models = [m.get("id","?") for m in r.json().get("data",[])]
results.append(f" 📦 Models: {', '.join(models)}")
except: pass
elif LOCAL_MODEL_CONFIG.get("server_starting"):
results.append(f"🟡 vLLM server: Starting...")
else:
results.append(f"🔴 vLLM server: Not running (port {port})")
# GPU
gpu_count = LOCAL_MODEL_CONFIG.get("gpu_count", 0)
gpu = LOCAL_MODEL_CONFIG.get("gpu_info", "")
if not gpu:
_detect_gpu()
gpu_count = LOCAL_MODEL_CONFIG.get("gpu_count", 0)
gpu = LOCAL_MODEL_CONFIG.get("gpu_info", "Not detected")
total_vram = LOCAL_MODEL_CONFIG.get("total_vram_mb", 0)
results.append(f"🖥️ GPU: {gpu_count}x {gpu} (Total: {total_vram/1024:.1f}GB)")
# OpenAI
if not key:
results.append("⚠️ OpenAI API Key is empty")
else:
for m in ["gpt-5.2", "gpt-4.1", "gpt-4o", "gpt-4o-mini"]:
ok, msg = _test_api_connection(key, m)
results.append(f"{'✅' if ok else '❌'} {m}: {'OK' if ok else msg.split(':',1)[-1].strip()[:50]}")
# HF Token
hf_token = os.getenv("HF_TOKEN", "")
if hf_token:
mid = LOCAL_MODEL_CONFIG["model_id"]
ok_hf, ep, msg_hf = _test_hf_model(mid, hf_token)
results.append(f"{'✅' if ok_hf else '❌'} HF:{mid}{ep if ok_hf else msg_hf[:60]}")
else:
results.append("ℹ️ HF_TOKEN not set")
return "\n".join(results)
diag_btn.click(fn=_run_diagnostic, inputs=[api_key], outputs=[diag_status])
with gr.Row():
eval_m = gr.Dropdown(label="🤖 Eval Model (피평가 모델)",
choices=list(ALL_EVAL_MODELS.keys()),
value="Darwin-gpt-ernie-20b (Friendli)",
scale=3)
judge_m = gr.Dropdown(label="⚖️ Judge Model (채점자)",
choices=list(OPENAI_MODELS.keys()),
value="gpt-5.2", scale=3)
with gr.Row():
dataset_choice = gr.Dropdown(
label="📊 Dataset",
choices=["FINAL Bench Only", "SWE-bench Verified Only", "Both (FINAL + SWE-bench)"],
value="SWE-bench Verified Only",
scale=2
)
proto_toggle = gr.Checkbox(label="🌟 Proto-AGI (五行)", value=False, scale=1)
gf = gr.Dropdown(["All","A","B","C"], value="All", label="Grade", scale=1)
df = gr.Dropdown(["All","easy","medium","hard","expert","frontier"], value="All", label="Difficulty", scale=1)
with gr.Row():
mt = gr.Slider(1, 600, value=30, step=1, label="Max Tasks", scale=2)
sw = gr.Slider(1, 6, value=2, step=1, label="🧠 Solver Workers", scale=1)
jw = gr.Slider(1, 10, value=5, step=1, label="⚖️ Judge Workers", scale=1)
with gr.Row():
s_btn = gr.Button("▶️ Start (Resume)", variant="primary", size="lg", scale=2)
f_btn = gr.Button("🚀 Fresh Start", variant="secondary", size="lg", scale=2)
cmp_btn = gr.Button("🔄 Compare: Non-AGI vs Proto-AGI", variant="primary", size="lg", scale=3)
x_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", scale=1)
with gr.Row():
gr.HTML(f'<p style="color:#888;font-size:0.78em;margin:0">'
f'📊 Available: FINAL={len(ALL_TASKS)} · SWE-bench={len(SWE_BENCH_TASKS)} · '
f'🧠Solver=solving (Darwin/GPT) · ⚖️Judge=grading (GPT-5.2) · ★ solve↔judge pipeline runs in parallel</p>')
status = gr.Textbox(label="Status", interactive=False, max_lines=2)
with gr.Tabs():
with gr.Tab("📊 Progress"): p_html = gr.HTML()
with gr.Tab("📋 Results"): t_html = gr.HTML()
with gr.Tab("🏆 FINAL Score"): s_html = gr.HTML()
with gr.Tab("🔄 Compare"): cmp_html = gr.HTML()
with gr.Tab("🔍 Details"): d_html = gr.HTML()
with gr.Tab("💾 CSV"): c_file = gr.File(label="CSV")
timer = gr.Timer(value=2, active=True)
timer.tick(fn=_poll, outputs=[p_html, t_html, s_html, cmp_html, d_html, c_file])
single_ins = [api_key, eval_m, judge_m, proto_toggle, dataset_choice, gf, df, mt, sw, jw]
s_btn.click(fn=lambda *a: _start_eval(*a, fresh=False), inputs=single_ins, outputs=[status])
f_btn.click(fn=lambda *a: _start_eval(*a, fresh=True), inputs=single_ins, outputs=[status])
cmp_ins = [api_key, eval_m, judge_m, dataset_choice, gf, df, mt, sw, jw]
cmp_btn.click(fn=lambda *a: _start_compare(*a, fresh=True), inputs=cmp_ins, outputs=[status])
x_btn.click(fn=_stop, outputs=[status])
gr.Markdown("---\n<center><b>FINAL Bench v3.1</b> · 🧬 Darwin-gpt-ernie-20b (Friendli) + SWE-bench_Verified<br>"
"AGI Verification · Non-AGI vs Proto-AGI · 木火土金水<br>"
"Apache 2.0 · <b>Ginigen AI</b> — Choi Sunyoung</center>")
return app
if __name__ == "__main__":
# Print startup statistics
print(f"\n{'='*60}")
print(f" FINAL Bench v3.1 + Darwin Local Eval")
print(f"{'='*60}")
print(f" FINAL tasks: {len(ALL_TASKS)}")
print(f" SWE-bench tasks: {len(SWE_BENCH_TASKS)}")
print(f" Total available: {len(ALL_TASKS) + len(SWE_BENCH_TASKS)}")
print(f" Darwin model: {LOCAL_MODEL_CONFIG['model_id']}")
print(f" Friendli endpoint: deppfs281rgffnk")
# ★ Friendli connectivity test
_friendli_ok, _friendli_msg = _test_friendli()
print(f" Friendli status: {'✅' if _friendli_ok else '❌'} {_friendli_msg}")
print(f" Proto-AGI: 木발상→火실행→土판단→金비평→水정제")
print(f" Judge: GPT-5.2 (OpenAI)")
print(f"{'='*60}\n")
app = create_app()
app.queue(default_concurrency_limit=2)
app.launch(server_name="0.0.0.0", server_port=7860)