""" FINAL Bench v3.1 — AGI-Level Verification System + Local Model Eval Frontier Intelligence Nexus for AGI-Level Verification ★ Non-AGI vs Proto-AGI 비교 평가 ★ 100 FINAL Tasks + 500 SWE-bench Verified Tasks ★ GPT-5.2 Eval + GPT-5.2 Structured Output Judge ★ Proto-AGI 오행 완전체: 木→火→土→金→水 ★ Local Model Support: Darwin-gpt-ernie-20b (vLLM) ★ SWE-bench_Verified Dataset Integration Author: Ginigen AI (지니젠AI) — Choi Sunyoung License: Apache 2.0 """ import json, os, time, csv, io, re, html, hashlib, sqlite3, threading, subprocess, signal from datetime import datetime from dataclasses import dataclass, field, asdict from typing import List, Dict, Optional import requests import numpy as np import gradio as gr from concurrent.futures import ThreadPoolExecutor # ════════════════════════════════════════════════════════════════ # §1. Data Structures # ════════════════════════════════════════════════════════════════ DOMAIN_INFO = { "Mathematics & Logic": {"icon":"🔢","color":"#FF6B35"}, "Science": {"icon":"🔬","color":"#7B2FF7"}, "Philosophy": {"icon":"🤔","color":"#00B4D8"}, "Medicine": {"icon":"🏥","color":"#2EC4B6"}, "Economics": {"icon":"📈","color":"#E63946"}, "History": {"icon":"📜","color":"#F4A261"}, "War & Security": {"icon":"🛡️","color":"#264653"}, "Space & Physics": {"icon":"🚀","color":"#6C63FF"}, "Chemistry & Biology": {"icon":"🧬","color":"#06D6A0"}, "Language & Writing": {"icon":"✍️","color":"#EF476F"}, "Literature": {"icon":"📖","color":"#8338EC"}, "Art": {"icon":"🎨","color":"#FF006E"}, "Religion & Mythology": {"icon":"🕊️","color":"#FFD166"}, "Ethics": {"icon":"⚖️","color":"#118AB2"}, "AI & Technology": {"icon":"🤖","color":"#073B4C"}, # SWE-bench domains "Software Engineering": {"icon":"💻","color":"#00897B"}, "Bug Fix": {"icon":"🐛","color":"#D84315"}, "Code Patch": {"icon":"🔧","color":"#5E35B1"}, } GRADE_WEIGHT = {"A": 1.5, "B": 1.0, "C": 0.7} RUBRIC = { "process_quality": {"weight":0.25, "desc":"Systematic reasoning transparency"}, "metacognitive_accuracy": {"weight":0.25, 
"desc":"Confidence calibration + uncertainty honesty"}, "error_recovery": {"weight":0.20, "desc":"Mid-analysis self-correction"}, "integration_depth": {"weight":0.15, "desc":"Multi-perspective synthesis + emergent insights"}, "final_correctness": {"weight":0.15, "desc":"Answer accuracy and completeness"}, } AXIS_MAP = { "generalization": {"rubrics":["process_quality","final_correctness"], "ticos":[]}, "reasoning": {"rubrics":["process_quality","error_recovery"], "ticos":["E_SelfCorrecting","C_ProgressiveDiscovery"]}, "planning": {"rubrics":["integration_depth","process_quality"],"ticos":["D_MultiConstraint","H_DecisionUnderUncertainty"]}, "reliability": {"rubrics":["metacognitive_accuracy"], "ticos":["E_SelfCorrecting","G_PivotDetection"]}, "safety": {"rubrics":["error_recovery","metacognitive_accuracy"],"ticos":["A_TrapEscape","G_PivotDetection"]}, } AGI_STAGES = [ {"stage":1,"name":"FINAL-Partial","label":"Partial Intelligence", "min":0, "max":39, "color":"#f44336"}, {"stage":2,"name":"FINAL-Proto", "label":"Proto Intelligence", "min":40,"max":59, "color":"#ff9800"}, {"stage":3,"name":"FINAL-Pre", "label":"Pre-AGI", "min":60,"max":79, "color":"#2196f3"}, {"stage":4,"name":"FINAL-Pass", "label":"AGI Achieved", "min":80,"max":94, "color":"#4caf50"}, {"stage":5,"name":"FINAL-Post", "label":"Operationally Mature AGI","min":95,"max":100,"color":"#9c27b0"}, ] @dataclass class FinalTask: task_id:str; domain:str; grade:str; ticos_type:str difficulty:str; lens:str; title:str; prompt:str expected_behavior:str; hidden_trap:str ticos_required:List[str]=field(default_factory=list) metadata:Dict=field(default_factory=dict) # ════════════════════════════════════════════════════════════════ # §1.5 SWE-bench Verified Dataset Loader # ════════════════════════════════════════════════════════════════ SWE_BENCH_TASKS: List[FinalTask] = [] def _load_swe_bench_verified(): """SWE-bench_Verified 데이터셋을 HuggingFace에서 로드하여 FinalTask 형식으로 변환""" global SWE_BENCH_TASKS try: from datasets 
import load_dataset print("📦 Loading SWE-bench_Verified from HuggingFace...") ds = load_dataset("SWE-bench/SWE-bench_Verified", split="test") print(f" ✅ Loaded {len(ds)} SWE-bench instances") # 난이도 매핑 diff_map = {"15 min fix": "easy", "15 min - 1 hour": "medium", "1-4 hours": "hard", "4+ hours": "expert"} # TICOS 타입 매핑 (SWE-bench 특성에 맞게) ticos_types = [ "E_SelfCorrecting", # 버그 수정 = 자기교정 "D_MultiConstraint", # 다중 제약조건 해결 "C_ProgressiveDiscovery", # 점진적 발견 "A_TrapEscape", # 함정 탈출 (edge case) ] # 등급 분배: 난이도 기반 grade_map = {"15 min fix": "C", "15 min - 1 hour": "B", "1-4 hours": "A", "4+ hours": "A"} tasks = [] for i, item in enumerate(ds): instance_id = item.get("instance_id", f"swe_{i:04d}") repo = item.get("repo", "unknown") problem = item.get("problem_statement", "") patch = item.get("patch", "") test_patch = item.get("test_patch", "") hints = item.get("hints_text", "") difficulty_raw = item.get("difficulty", "15 min - 1 hour") version = item.get("version", "") fail_to_pass = item.get("FAIL_TO_PASS", "") base_commit = item.get("base_commit", "") # FinalTask 변환 difficulty = diff_map.get(difficulty_raw, "medium") grade = grade_map.get(difficulty_raw, "B") ticos = ticos_types[i % len(ticos_types)] # 도메인 결정 (repo 기반) if "django" in repo.lower(): domain = "Software Engineering" elif "astropy" in repo.lower() or "scipy" in repo.lower() or "sympy" in repo.lower(): domain = "Science" elif "matplotlib" in repo.lower(): domain = "Art" else: domain = "Software Engineering" # 프롬프트 구성: 문제 설명 + 코드 컨텍스트 prompt_text = ( f"## Software Bug Fix Task\n" f"**Repository**: {repo} (version {version})\n" f"**Base Commit**: {base_commit[:12]}...\n\n" f"### Problem Description:\n{problem[:3000]}\n\n" ) if hints: prompt_text += f"### Hints:\n{hints[:1000]}\n\n" prompt_text += ( f"### Requirements:\n" f"1. Analyze the bug described above\n" f"2. Identify the root cause in the codebase\n" f"3. Propose a minimal, correct patch\n" f"4. Explain why the fix is correct\n" f"5. 
Identify potential edge cases or regressions\n" f"6. State your confidence level for each claim\n" ) # Expected behavior = 실제 패치 expected = f"Correct patch:\n{patch[:2000]}" # Hidden trap = 테스트가 실패→성공으로 바뀌어야 하는 항목 hidden = f"Tests that must pass after fix: {fail_to_pass[:500]}" task = FinalTask( task_id=f"SWE_{instance_id}", domain=domain, grade=grade, ticos_type=ticos, difficulty=difficulty, lens="code_analysis", title=f"[{repo}] {instance_id}", prompt=prompt_text, expected_behavior=expected, hidden_trap=hidden, ticos_required=[ticos], metadata={ "source": "SWE-bench_Verified", "repo": repo, "instance_id": instance_id, "base_commit": base_commit, "version": version, "difficulty_raw": difficulty_raw, "patch": patch, # 정답 패치 보관 "test_patch": test_patch, # 테스트 패치 보관 } ) tasks.append(task) SWE_BENCH_TASKS = tasks print(f" ✅ Converted {len(tasks)} SWE-bench tasks to FinalTask format") # 통계 repos = {} for t in tasks: r = t.metadata.get("repo", "?") repos[r] = repos.get(r, 0) + 1 print(f" 📊 Repos: {dict(sorted(repos.items(), key=lambda x:-x[1])[:10])}") grades = {} for t in tasks: grades[t.grade] = grades.get(t.grade, 0) + 1 print(f" 📊 Grades: {grades}") return tasks except ImportError: print("⚠️ 'datasets' library not installed. 
Run: pip install datasets") return [] except Exception as e: print(f"❌ SWE-bench loading failed: {e}") return [] # ════════════════════════════════════════════════════════════════ # §1.6 Local Model Server (vLLM for Darwin-gpt-ernie-20b) # ════════════════════════════════════════════════════════════════ LOCAL_MODEL_CONFIG = { "model_id": "seawolf2357/Darwin-gpt-ernie-20b", "base_models": ["openai/gpt-oss-20b", "baidu/ERNIE-4.5-21B-A3B-Thinking"], "merge_ratio": 0.50, "params": "21B", "active_params": "3.6B (MoE)", "min_vram": "16GB", "server_port": 8000, "server_process": None, "server_ready": False, "server_starting": False, "gpu_detected": False, "gpu_count": 0, "gpu_info": "", "gpu_all": [], "total_vram_mb": 0, "active_config": {}, "auto_start_attempted": False, } def _detect_gpu(): """GPU 감지 — 개수 + VRAM 정보""" try: result = subprocess.run(["nvidia-smi", "--query-gpu=name,memory.total,memory.free", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=10) if result.returncode == 0 and result.stdout.strip(): lines = [l.strip() for l in result.stdout.strip().split('\n') if l.strip()] LOCAL_MODEL_CONFIG["gpu_detected"] = True LOCAL_MODEL_CONFIG["gpu_count"] = len(lines) LOCAL_MODEL_CONFIG["gpu_info"] = lines[0] # 첫 GPU 정보만 저장 LOCAL_MODEL_CONFIG["gpu_all"] = lines # 전체 GPU 목록 # 총 VRAM 계산 total_vram = 0 for line in lines: parts = [p.strip() for p in line.split(',')] if len(parts) >= 2: try: total_vram += int(parts[1]) except: pass LOCAL_MODEL_CONFIG["total_vram_mb"] = total_vram print(f" 🖥️ GPU detected: {len(lines)}x {lines[0]}") print(f" 💾 Total VRAM: {total_vram/1024:.1f} GB") return True except: pass LOCAL_MODEL_CONFIG["gpu_detected"] = False LOCAL_MODEL_CONFIG["gpu_count"] = 0 print(" ⚠️ No GPU detected (nvidia-smi failed)") return False def _probe_vllm_server(port=None): """vLLM 서버가 이미 실행 중인지 확인""" port = port or LOCAL_MODEL_CONFIG["server_port"] try: r = requests.get(f"http://localhost:{port}/health", timeout=5) if r.status_code == 200: 
return True except: pass # v1/models 엔드포인트도 확인 try: r = requests.get(f"http://localhost:{port}/v1/models", timeout=5) if r.status_code == 200: return True except: pass return False def _auto_detect_server(): """앱 시작 시 vLLM 서버 자동 감지""" port = LOCAL_MODEL_CONFIG["server_port"] if _probe_vllm_server(port): LOCAL_MODEL_CONFIG["server_ready"] = True print(f" ✅ vLLM server auto-detected on port {port}") # 모델 확인 try: r = requests.get(f"http://localhost:{port}/v1/models", timeout=5) if r.status_code == 200: models = r.json().get("data", []) if models: model_ids = [m.get("id", "?") for m in models] print(f" 📦 Loaded models: {model_ids}") except: pass return True return False def _start_local_model_server(model_id=None, gpu_memory_utilization=0.95, max_model_len=4096): """vLLM 서버로 Darwin-gpt-ernie-20b 로컬 서빙 시작 — 자동 TP + OOM 폴백""" global LOCAL_MODEL_CONFIG if model_id: LOCAL_MODEL_CONFIG["model_id"] = model_id mid = LOCAL_MODEL_CONFIG["model_id"] port = LOCAL_MODEL_CONFIG["server_port"] # 이미 실행 중인지 확인 if _probe_vllm_server(port): LOCAL_MODEL_CONFIG["server_ready"] = True LOCAL_MODEL_CONFIG["server_starting"] = False return f"✅ Local model server already running on port {port}" # 이미 시작 중이면 대기 if LOCAL_MODEL_CONFIG["server_starting"]: return "⏳ Server is starting... please wait" # GPU 확인 if not _detect_gpu(): return "❌ No GPU detected. 
vLLM requires GPU (nvidia-smi failed)" LOCAL_MODEL_CONFIG["server_starting"] = True gpu_count = LOCAL_MODEL_CONFIG.get("gpu_count", 1) total_vram = LOCAL_MODEL_CONFIG.get("total_vram_mb", 48000) print(f"🚀 Starting vLLM server for {mid}...") print(f" GPUs: {gpu_count}x | Total VRAM: {total_vram/1024:.1f} GB") try: # ★ vLLM 실행 가능한지 먼저 체크 check = subprocess.run(["python", "-c", "import vllm; print(vllm.__version__)"], capture_output=True, text=True, timeout=30) if check.returncode != 0: LOCAL_MODEL_CONFIG["server_starting"] = False return f"❌ vLLM not available: {check.stderr[:300]}" print(f" vLLM version: {check.stdout.strip()}") # ★ 시도할 설정 목록 (점점 메모리 절약 방향으로) # 21B 모델 bf16 ≈ 42GB → 단일 48GB GPU에서는 KV캐시 공간 부족 configs = [] if gpu_count >= 4: # 4 GPU: TP=4, 각 GPU에 ~10.5GB weights → KV캐시 충분 configs.append({"tp": 4, "mem": 0.90, "maxlen": 8192, "dtype": "auto", "label": "TP=4, 8K ctx"}) configs.append({"tp": 4, "mem": 0.95, "maxlen": 4096, "dtype": "auto", "label": "TP=4, 4K ctx"}) configs.append({"tp": 2, "mem": 0.95, "maxlen": 4096, "dtype": "auto", "label": "TP=2, 4K ctx"}) elif gpu_count >= 2: # 2 GPU: TP=2, 각 GPU에 ~21GB weights configs.append({"tp": 2, "mem": 0.95, "maxlen": 4096, "dtype": "auto", "label": "TP=2, 4K ctx"}) configs.append({"tp": 2, "mem": 0.95, "maxlen": 2048, "dtype": "auto", "label": "TP=2, 2K ctx"}) else: # 1 GPU 48GB: 21B bf16은 빡빡 → quantization 또는 짧은 ctx configs.append({"tp": 1, "mem": 0.95, "maxlen": 2048, "dtype": "auto", "label": "1GPU, 2K ctx"}) configs.append({"tp": 1, "mem": 0.95, "maxlen": 1024, "dtype": "auto", "label": "1GPU, 1K ctx"}) configs.append({"tp": 1, "mem": 0.95, "maxlen": 2048, "dtype": "half", "label": "1GPU, fp16, 2K"}) for ci, cfg in enumerate(configs): print(f"\n 🔄 Attempt {ci+1}/{len(configs)}: {cfg['label']}") cmd = [ "python", "-m", "vllm.entrypoints.openai.api_server", "--model", mid, "--port", str(port), "--gpu-memory-utilization", str(cfg["mem"]), "--max-model-len", str(cfg["maxlen"]), "--trust-remote-code", 
"--dtype", cfg["dtype"], "--enforce-eager", ] if cfg["tp"] > 1: cmd.extend(["--tensor-parallel-size", str(cfg["tp"])]) print(f" CMD: {' '.join(cmd)}") proc = subprocess.Popen( cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, preexec_fn=os.setsid ) LOCAL_MODEL_CONFIG["server_process"] = proc # 서버 준비 대기 (최대 10분) max_wait = 120 started = False crashed = False crash_msg = "" for i in range(max_wait): time.sleep(5) if proc.poll() is not None: stderr = proc.stderr.read().decode()[-2000:] stdout = proc.stdout.read().decode()[-1000:] crashed = True crash_msg = stderr + stdout # OOM 확인 if "No available memory" in crash_msg or "CUDA out of memory" in crash_msg: print(f" ⚠️ OOM with {cfg['label']} — trying next config...") else: print(f" ❌ Crashed (non-OOM): {crash_msg[-300:]}") break if _probe_vllm_server(port): started = True LOCAL_MODEL_CONFIG["server_ready"] = True LOCAL_MODEL_CONFIG["server_starting"] = False LOCAL_MODEL_CONFIG["active_config"] = cfg elapsed = (i+1)*5 print(f" ✅ vLLM server ready! Config: {cfg['label']} ({elapsed}s)") return (f"✅ Server started: {mid}\n" f"Config: {cfg['label']} | TP={cfg['tp']} | " f"MaxLen={cfg['maxlen']} | {elapsed}s") if (i+1) % 12 == 0: print(f" ⏳ Still waiting... ({(i+1)*5}s)") if started: break if not crashed: # 타임아웃 try: os.killpg(os.getpgid(proc.pid), signal.SIGTERM) proc.wait(timeout=10) except: pass print(f" ⚠️ Timeout with {cfg['label']}") # 다음 시도 전 정리 try: if proc.poll() is None: os.killpg(os.getpgid(proc.pid), signal.SIGKILL) proc.wait(timeout=5) except: pass time.sleep(3) # 모든 시도 실패 LOCAL_MODEL_CONFIG["server_starting"] = False return (f"❌ All vLLM configurations failed on {gpu_count}x GPU.\n" f"Last error: {crash_msg[-300:]}\n" f"💡 Try: (1) Restart Space to free GPU memory, " f"(2) Use smaller max-model-len, " f"(3) Check model compatibility with vLLM {check.stdout.strip()}") except FileNotFoundError: LOCAL_MODEL_CONFIG["server_starting"] = False return "❌ vLLM not installed. 
Run: pip install vllm" except Exception as e: LOCAL_MODEL_CONFIG["server_starting"] = False return f"❌ Server start failed: {e}" def _stop_local_model_server(): """vLLM 서버 종료""" global LOCAL_MODEL_CONFIG proc = LOCAL_MODEL_CONFIG.get("server_process") if proc and proc.poll() is None: try: os.killpg(os.getpgid(proc.pid), signal.SIGTERM) proc.wait(timeout=10) except: try: proc.kill() except: pass LOCAL_MODEL_CONFIG["server_process"] = None LOCAL_MODEL_CONFIG["server_ready"] = False LOCAL_MODEL_CONFIG["server_starting"] = False return "⏹️ Local model server stopped" LOCAL_MODEL_CONFIG["server_ready"] = False LOCAL_MODEL_CONFIG["server_starting"] = False return "ℹ️ No server running" def call_local_model(prompt, system="", max_tokens=8192, temperature=0.6): """로컬 vLLM 서버에 요청 (OpenAI-compatible API)""" port = LOCAL_MODEL_CONFIG["server_port"] # ★ 서버 자동 감지 (server_ready가 False여도 실제로 돌고 있을 수 있음) if not LOCAL_MODEL_CONFIG["server_ready"]: if _probe_vllm_server(port): LOCAL_MODEL_CONFIG["server_ready"] = True print(f" ✅ vLLM server re-detected on port {port}") else: return ("[LOCAL_ERROR] vLLM server not running on port {port}. " "Click '🚀 Start vLLM Server' or check that vLLM process is active.") headers = {"Content-Type": "application/json"} # ★ vLLM에 로드된 실제 모델명 확인 (첫 호출 시) model_name = LOCAL_MODEL_CONFIG.get("_actual_vllm_model") if not model_name: try: r = requests.get(f"http://localhost:{port}/v1/models", timeout=5) if r.status_code == 200: models = r.json().get("data", []) if models: model_name = models[0].get("id", LOCAL_MODEL_CONFIG["model_id"]) LOCAL_MODEL_CONFIG["_actual_vllm_model"] = model_name print(f" 📦 Using vLLM model: {model_name}") except: pass if not model_name: model_name = LOCAL_MODEL_CONFIG["model_id"] # ★ 입력 길이 제한 — max_model_len 기준으로 입력+출력 합계 맞춤 active_cfg = LOCAL_MODEL_CONFIG.get("active_config", {}) model_max_len = active_cfg.get("maxlen", 4096) # 대략 4 chars ≈ 1 token. 
입력은 max_model_len의 60%까지만 허용 max_input_chars = int(model_max_len * 0.6 * 4) # e.g., 4096 * 0.6 * 4 = 9830 max_tokens = min(max_tokens, int(model_max_len * 0.4)) # 출력은 40% total_input = (system or "") + prompt if len(total_input) > max_input_chars: if system and len(system) > max_input_chars // 3: system = system[:max_input_chars // 3] + "\n[...truncated...]" remaining = max_input_chars - len(system or "") if len(prompt) > remaining: prompt = prompt[:remaining] + "\n[...truncated for context length...]" messages = [] if system: messages.append({"role": "system", "content": system}) messages.append({"role": "user", "content": prompt}) payload = { "model": model_name, "messages": messages, "max_tokens": max_tokens, "temperature": temperature, } for attempt in range(3): try: r = requests.post( f"http://localhost:{port}/v1/chat/completions", headers=headers, data=json.dumps(payload), timeout=600 # 긴 응답 대비 10분 ) if r.status_code == 400: err = "" try: err = r.json().get("error", {}).get("message", r.text[:500]) except: err = r.text[:500] print(f" ⚠️ vLLM 400 (attempt {attempt+1}): {err[:300]}") # ★ 토큰 제한 에러 → max_tokens 줄여서 재시도 if "max_tokens" in str(err).lower() or "too many tokens" in str(err).lower(): payload["max_tokens"] = min(max_tokens // 2, 4096) if attempt < 2: continue # ★ 모델명 불일치 → v1/models로 재확인 if "model" in str(err).lower(): try: mr = requests.get(f"http://localhost:{port}/v1/models", timeout=5) if mr.status_code == 200: models = mr.json().get("data", []) if models: payload["model"] = models[0]["id"] LOCAL_MODEL_CONFIG["_actual_vllm_model"] = models[0]["id"] if attempt < 2: continue except: pass return f"[LOCAL_ERROR] 400: {err[:300]}" if r.status_code == 503: # 서버 과부하 print(f" ⏳ vLLM 503 overloaded (attempt {attempt+1})") if attempt < 2: time.sleep(10 * (attempt+1)); continue return "[LOCAL_ERROR] Server overloaded (503)" r.raise_for_status() c = r.json()["choices"][0]["message"]["content"] return c if c else "[EMPTY]" except 
requests.exceptions.ConnectionError: print(f" ⚠️ vLLM connection error (attempt {attempt+1})") # 서버 재감지 if _probe_vllm_server(port): if attempt < 2: time.sleep(3); continue else: LOCAL_MODEL_CONFIG["server_ready"] = False return "[LOCAL_ERROR] Connection refused. vLLM server may have crashed." except requests.exceptions.ReadTimeout: print(f" ⚠️ vLLM timeout (attempt {attempt+1})") if attempt < 2: time.sleep(5); continue return "[LOCAL_ERROR] Request timeout (600s). Task may be too complex." except Exception as e: print(f" ⚠️ vLLM exception (attempt {attempt+1}): {str(e)[:200]}") if attempt < 2: time.sleep(3 * (attempt + 1)) else: return f"[LOCAL_ERROR] {e}" # ═══════════════════════════════════════════════════════════ # ★ 앱 시작 시 vLLM 자동 감지 + 자동 시작 # ═══════════════════════════════════════════════════════════ def _auto_boot_vllm(): """앱 부팅 시 vLLM 서버 자동 시작 (백그라운드)""" # 1) 이미 실행 중인지 확인 if _auto_detect_server(): return # 2) GPU 감지 if not _detect_gpu(): print(" ℹ️ No GPU → vLLM auto-start skipped") return # 3) vLLM 설치 확인 try: check = subprocess.run(["python", "-c", "import vllm"], capture_output=True, timeout=10) if check.returncode != 0: print(" ℹ️ vLLM not installed → auto-start skipped") return except: return # 4) 자동 시작 print(" 🚀 Auto-starting vLLM server for Darwin model...") LOCAL_MODEL_CONFIG["auto_start_attempted"] = True result = _start_local_model_server() print(f" 📋 Auto-start result: {result}") # ★ 앱 시작 시 실행 (블로킹 아닌 백그라운드 스레드) print("🔍 Checking for local vLLM server...") if _auto_detect_server(): print(" ✅ vLLM server found and ready!") elif _detect_gpu(): print(" 🖥️ GPU available — will auto-start vLLM in background") threading.Thread(target=_auto_boot_vllm, daemon=True, name="vLLM-AutoBoot").start() else: print(" ℹ️ No GPU detected — using OpenAI/HF Inference models") def _test_hf_model(model_id, hf_token): """HF 모델 사전 검증 — 어떤 API 엔드포인트가 작동하는지 확인""" if not hf_token: return False, "no_token", "HF_TOKEN required" headers = {"Content-Type": "application/json", 
"Authorization": f"Bearer {hf_token}"} test_messages = [{"role": "user", "content": "Say OK"}] test_payload = {"model": model_id, "messages": test_messages, "max_tokens": 20, "stream": False} # 1) router.huggingface.co (Inference Providers — OpenAI-compatible) try: r = requests.post("https://router.huggingface.co/v1/chat/completions", headers=headers, data=json.dumps(test_payload), timeout=30) if r.status_code == 200: return True, "router", "OK" err = "" try: rj = r.json() e = rj.get("error", "") err = e.get("message", str(e)) if isinstance(e, dict) else str(e) except: try: err = r.text[:300] except: err = str(r.status_code) print(f" ℹ️ HF router test ({model_id}): {r.status_code} — {str(err)[:200]}") except Exception as e: print(f" ℹ️ HF router exception: {str(e)[:200]}") # 2) api-inference.huggingface.co (Serverless Inference API) try: inf_payload = {"inputs": "Say OK", "parameters": {"max_new_tokens": 20}} r2 = requests.post(f"https://api-inference.huggingface.co/models/{model_id}", headers=headers, data=json.dumps(inf_payload), timeout=30) if r2.status_code == 200: return True, "serverless", "OK" err2 = "" try: rj2 = r2.json() e2 = rj2.get("error", "") err2 = e2.get("message", str(e2)) if isinstance(e2, dict) else str(e2) except: try: err2 = r2.text[:300] except: err2 = str(r2.status_code) print(f" ℹ️ HF serverless test ({model_id}): {r2.status_code} — {str(err2)[:200]}") except Exception as e: print(f" ℹ️ HF serverless exception: {str(e)[:200]}") # 3) HF Inference Endpoints (dedicated) — 사용자 전용 엔드포인트 # 이건 URL이 다르므로 여기서는 스킵 return False, "none", f"Model {model_id} not available on HF Inference" _HF_ENDPOINT_CACHE = {} # {model_id: "router" | "serverless" | "none"} def call_hf_inference(prompt, system="", model_id=None, hf_token=None, max_tokens=4096): """HuggingFace Inference API — 자동 엔드포인트 탐색 + 에러 바디 캡처""" mid = model_id or LOCAL_MODEL_CONFIG["model_id"] token = hf_token or os.getenv("HF_TOKEN", "") if not token: return "[HF_ERROR] HF_TOKEN required for 
Inference API" headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"} # ★ 엔드포인트 자동 탐색 (캐시) if mid not in _HF_ENDPOINT_CACHE: ok, endpoint_type, msg = _test_hf_model(mid, token) _HF_ENDPOINT_CACHE[mid] = endpoint_type if not ok: print(f" ❌ HF model {mid} not available: {msg}") endpoint_type = _HF_ENDPOINT_CACHE.get(mid, "none") # ★ 프롬프트 길이 제한 (HF Inference는 입력 토큰 제한이 엄격) MAX_PROMPT_CHARS = 12000 # ~3000 토큰 if len(prompt) > MAX_PROMPT_CHARS: prompt = prompt[:MAX_PROMPT_CHARS] + "\n\n[... truncated for API limits ...]" if system and len(system) > 2000: system = system[:2000] # ── Router (OpenAI-compatible) ── if endpoint_type == "router": messages = [] if system: messages.append({"role": "system", "content": system}) messages.append({"role": "user", "content": prompt}) payload = {"model": mid, "messages": messages, "max_tokens": max_tokens, "stream": False} for attempt in range(3): try: r = requests.post("https://router.huggingface.co/v1/chat/completions", headers=headers, data=json.dumps(payload), timeout=300) if r.status_code == 429: time.sleep(5 * (attempt + 1)); continue if r.status_code >= 400: err_body = "" try: err_body = r.json().get("error", r.text[:500]) except: err_body = r.text[:500] print(f" ⚠️ HF router {r.status_code} (attempt {attempt+1}): {err_body[:200]}") if attempt < 2: # 프롬프트 더 줄여서 재시도 if len(prompt) > 4000: prompt = prompt[:4000] + "\n[... 
further truncated ...]" payload["messages"][-1]["content"] = prompt time.sleep(3 * (attempt + 1)); continue return f"[HF_ERROR] {r.status_code}: {err_body[:200]}" r.raise_for_status() c = r.json()["choices"][0]["message"]["content"] return c if c else "[EMPTY]" except Exception as e: if attempt < 2: time.sleep(3 * (attempt + 1)) else: return f"[HF_ERROR] router: {e}" # ── Serverless Inference API ── elif endpoint_type == "serverless": full_prompt = f"{system}\n\n{prompt}" if system else prompt payload = { "inputs": full_prompt, "parameters": {"max_new_tokens": min(max_tokens, 2048), "temperature": 0.6, "return_full_text": False} } for attempt in range(3): try: r = requests.post(f"https://api-inference.huggingface.co/models/{mid}", headers=headers, data=json.dumps(payload), timeout=300) if r.status_code == 429: time.sleep(5 * (attempt + 1)); continue if r.status_code == 503: # 모델 로딩 중 est = 60 try: est = r.json().get("estimated_time", 60) except: pass print(f" ⏳ Model loading... ETA {est}s") if attempt < 2: time.sleep(min(est, 30)); continue return f"[HF_ERROR] Model still loading (ETA {est}s)" if r.status_code >= 400: err_body = "" try: err_body = r.json().get("error", r.text[:500]) except: err_body = r.text[:500] print(f" ⚠️ HF serverless {r.status_code}: {err_body[:200]}") if attempt < 2: time.sleep(3 * (attempt + 1)); continue return f"[HF_ERROR] {r.status_code}: {err_body[:200]}" r.raise_for_status() result = r.json() if isinstance(result, list) and result: text = result[0].get("generated_text", "") elif isinstance(result, dict): text = result.get("generated_text", "") else: text = str(result)[:5000] return text if text else "[EMPTY]" except Exception as e: if attempt < 2: time.sleep(3 * (attempt + 1)) else: return f"[HF_ERROR] serverless: {e}" # ── 어떤 엔드포인트도 안 됨 ── else: return (f"[HF_ERROR] Model '{mid}' is not available on HF Inference API. 
" f"Possible reasons: (1) Model too large for serverless, (2) Not deployed as Inference Provider, " f"(3) Needs dedicated Inference Endpoint. Try using OpenAI model (gpt-4o) for evaluation instead.") # ════════════════════════════════════════════════════════════════ # §2. Load Dataset # ════════════════════════════════════════════════════════════════ def load_tasks(): for p in ["FINAL_Bench_v3.json","/mnt/user-data/uploads/FINAL_Bench_v3.json", os.path.join(os.path.dirname(os.path.abspath(__file__)),"FINAL_Bench_v3.json")]: if os.path.exists(p): with open(p,"r",encoding="utf-8") as f: data=json.load(f) print(f" Loaded from {p}"); break else: print("⚠️ FINAL_Bench_v3.json not found — FINAL tasks empty, SWE-bench only mode") return [] return [FinalTask(task_id=t["task_id"],domain=t["domain"],grade=t["grade"], ticos_type=t["ticos_type"],difficulty=t["difficulty"],lens=t.get("lens",""), title=t["title"],prompt=t["prompt"],expected_behavior=t.get("expected_behavior",""), hidden_trap=t.get("hidden_trap",""),ticos_required=t.get("ticos_required",[]), metadata=t.get("metadata",{})) for t in data["tasks"]] try: ALL_TASKS = load_tasks() print(f"✅ FINAL Bench v3.0: {len(ALL_TASKS)} tasks") except: ALL_TASKS = [] print("⚠️ FINAL tasks: 0 (SWE-bench only mode)") # SWE-bench 로드 (비동기) SWE_BENCH_TASKS = _load_swe_bench_verified() # ════════════════════════════════════════════════════════════════ # §3. 
Model API (OpenAI + Local + HF) # ════════════════════════════════════════════════════════════════ OPENAI_MODELS = { "gpt-5.2": "GPT-5.2 (flagship)", "gpt-5.2-chat-latest": "GPT-5.2 Instant", "gpt-5-mini": "GPT-5 Mini", "o4-mini": "o4-mini", "gpt-4.1": "GPT-4.1", } LOCAL_MODELS = { "Darwin-gpt-ernie-20b (Friendli)": { "id": "deppfs281rgffnk", "type": "friendli", "desc": "21B MoE (Friendli Dedicated Endpoint)", "api_url": "https://api.friendli.ai/dedicated/v1/chat/completions", }, "Darwin-gpt-ernie-20b (Local vLLM)": { "id": "seawolf2357/Darwin-gpt-ernie-20b", "type": "local_vllm", "desc": "21B MoE (Local vLLM, GPU required)" }, "Darwin-gpt-ernie-20b (HF Inference)": { "id": "seawolf2357/Darwin-gpt-ernie-20b", "type": "hf_inference", "desc": "HuggingFace Inference API (HF_TOKEN required)" }, } ALL_EVAL_MODELS = {**OPENAI_MODELS, **{k: v["desc"] for k, v in LOCAL_MODELS.items()}} # ════════════════════════════════════════════════════════════════ # §3.6 Friendli AI Dedicated Endpoint # ════════════════════════════════════════════════════════════════ def _test_friendli(model_id=None): """Friendli API 연결 테스트""" token = os.getenv("FRIENDLI_TOKEN", "") if not token: return False, "FRIENDLI_TOKEN not set" mid = model_id or "deppfs281rgffnk" headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"} payload = { "model": mid, "messages": [{"role": "user", "content": "Say OK"}], "max_tokens": 10, "temperature": 0, "stream": False, } try: r = requests.post("https://api.friendli.ai/dedicated/v1/chat/completions", headers=headers, json=payload, timeout=30) if r.status_code == 200: text = r.json()["choices"][0]["message"]["content"] return True, f"OK ({text[:20]})" err = "" try: err = r.json().get("error", {}).get("message", r.text[:300]) except: err = r.text[:300] return False, f"{r.status_code}: {err[:200]}" except Exception as e: return False, f"Connection error: {str(e)[:200]}" def call_friendli(prompt, system="", model_id=None, max_tokens=8192, 
temperature=0.6): """Friendli AI Dedicated Endpoint 호출 (OpenAI-compatible)""" token = os.getenv("FRIENDLI_TOKEN", "") if not token: return "[FRIENDLI_ERROR] FRIENDLI_TOKEN not set in environment" mid = model_id or "deppfs281rgffnk" headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"} messages = [] if system: messages.append({"role": "system", "content": system}) messages.append({"role": "user", "content": prompt}) payload = { "model": mid, "messages": messages, "max_tokens": min(max_tokens, 16384), "temperature": temperature, "top_p": 0.95, "stream": False, } for attempt in range(3): try: r = requests.post("https://api.friendli.ai/dedicated/v1/chat/completions", headers=headers, json=payload, timeout=600) if r.status_code == 429: wait = 5 * (attempt + 1) print(f" ⏳ Friendli rate limit, waiting {wait}s...") time.sleep(wait) continue if r.status_code == 400: err = "" try: err = r.json().get("error", {}).get("message", r.text[:500]) except: err = r.text[:500] print(f" ⚠️ Friendli 400 (attempt {attempt+1}): {err[:200]}") # 토큰 제한 에러 → max_tokens 줄여서 재시도 if "max_tokens" in str(err).lower() or "too many" in str(err).lower(): payload["max_tokens"] = min(payload["max_tokens"] // 2, 4096) if attempt < 2: continue # 입력 길이 에러 → 프롬프트 축소 if "input" in str(err).lower() and ("length" in str(err).lower() or "token" in str(err).lower()): cur_len = len(messages[-1]["content"]) messages[-1]["content"] = messages[-1]["content"][:cur_len // 2] + "\n[...truncated...]" payload["messages"] = messages if attempt < 2: continue return f"[FRIENDLI_ERROR] 400: {err[:200]}" if r.status_code >= 500: print(f" ⚠️ Friendli {r.status_code} (attempt {attempt+1})") if attempt < 2: time.sleep(3 * (attempt + 1)); continue return f"[FRIENDLI_ERROR] Server error {r.status_code}" r.raise_for_status() c = r.json()["choices"][0]["message"]["content"] return c if c else "[EMPTY]" except requests.exceptions.Timeout: print(f" ⚠️ Friendli timeout (attempt {attempt+1})") if attempt < 
2: time.sleep(5); continue return "[FRIENDLI_ERROR] Request timeout (600s)" except requests.exceptions.ConnectionError: print(f" ⚠️ Friendli connection error (attempt {attempt+1})") if attempt < 2: time.sleep(3 * (attempt + 1)); continue return "[FRIENDLI_ERROR] Connection failed" except Exception as e: print(f" ⚠️ Friendli exception (attempt {attempt+1}): {str(e)[:200]}") if attempt < 2: time.sleep(3 * (attempt + 1)) else: return f"[FRIENDLI_ERROR] {e}" def _strip_think(text): if not text: return text for tag in ['think','thinking','reasoning','reflection']: text = re.sub(rf'<{tag}>.*?{tag}>','',text,flags=re.DOTALL) return text.strip() def call_model(prompt, system="", api_key="", model="gpt-5.2", max_tokens=8192, temperature=0.6, reasoning_effort=None): """통합 모델 호출 — Friendli / OpenAI / Local vLLM / HF Inference 자동 분기""" # 로컬/전용 모델 분기 if model in LOCAL_MODELS: minfo = LOCAL_MODELS[model] if minfo["type"] == "friendli": return call_friendli(prompt, system=system, model_id=minfo["id"], max_tokens=max_tokens, temperature=temperature) elif minfo["type"] == "local_vllm": return call_local_model(prompt, system=system, max_tokens=max_tokens, temperature=temperature) elif minfo["type"] == "hf_inference": return call_hf_inference(prompt, system=system, model_id=minfo["id"], max_tokens=min(max_tokens, 4096)) # OpenAI API return call_openai(prompt, system=system, api_key=api_key, model=model, max_tokens=max_tokens, temperature=temperature, reasoning_effort=reasoning_effort) def _test_api_connection(api_key, model="gpt-4o-mini"): """API 연결 + 모델 유효성 빠른 테스트""" if not api_key: return False, "❌ API key is empty" headers = {"Content-Type":"application/json","Authorization":f"Bearer {api_key}"} payload = {"model":model,"max_completion_tokens":50,"temperature":0, "messages":[{"role":"user","content":"Say OK"}]} try: r = requests.post("https://api.openai.com/v1/chat/completions", headers=headers,data=json.dumps(payload),timeout=30) if r.status_code == 200: return True, f"✅ {model} 
OK" err = "" try: err = r.json().get("error",{}).get("message", r.text[:200]) except: err = str(r.status_code) return False, f"❌ {r.status_code}: {err}" except Exception as e: return False, f"❌ Connection failed: {e}" # ── 모델명 폴백 맵 (존재하지 않는 모델 → 실제 모델) ── MODEL_FALLBACK = { "gpt-5.2": ["gpt-5.2", "gpt-4.1", "gpt-4o", "gpt-4o-mini"], "gpt-5.2-chat-latest": ["gpt-5.2-chat-latest", "gpt-4.1", "gpt-4o"], "gpt-5-mini": ["gpt-5-mini", "gpt-4o-mini", "gpt-4o"], "o4-mini": ["o4-mini", "o3-mini", "gpt-4o-mini"], "gpt-4.1": ["gpt-4.1", "gpt-4o", "gpt-4o-mini"], } _VERIFIED_MODELS = {} # 캐시: {requested_model: actual_working_model} def _resolve_model(model, api_key): """모델명이 유효한지 확인하고, 안 되면 폴백 모델 탐색""" if model in _VERIFIED_MODELS: return _VERIFIED_MODELS[model] ok, msg = _test_api_connection(api_key, model) if ok: _VERIFIED_MODELS[model] = model print(f" ✅ Model verified: {model}") return model # 폴백 탐색 fallbacks = MODEL_FALLBACK.get(model, []) for fb in fallbacks: if fb == model: continue ok2, msg2 = _test_api_connection(api_key, fb) if ok2: _VERIFIED_MODELS[model] = fb print(f" ⚠️ Model {model} unavailable → fallback to {fb}") return fb # 최후 수단 ok3, _ = _test_api_connection(api_key, "gpt-4o-mini") if ok3: _VERIFIED_MODELS[model] = "gpt-4o-mini" print(f" ⚠️ All fallbacks failed → using gpt-4o-mini") return "gpt-4o-mini" _VERIFIED_MODELS[model] = model print(f" ❌ No working model found for {model}: {msg}") return model def call_openai(prompt, system="", api_key="", model="gpt-5.2", max_tokens=8192, temperature=0.6, reasoning_effort=None): """OpenAI API — 자동 모델 검증/폴백 + 파라미터 호환성 자동 수정""" # ★ 모델명 자동 검증/폴백 actual_model = _resolve_model(model, api_key) headers = {"Content-Type":"application/json","Authorization":f"Bearer {api_key}"} messages = [] if system: messages.append({"role":"system","content":system}) messages.append({"role":"user","content":prompt}) payload = {"model":actual_model,"max_completion_tokens":max_tokens,"temperature":temperature,"messages":messages} # 
reasoning_effort: 일부 모델만 지원 if reasoning_effort: payload["reasoning_effort"] = reasoning_effort for attempt in range(3): try: r = requests.post("https://api.openai.com/v1/chat/completions", headers=headers,data=json.dumps(payload),timeout=300) if r.status_code == 429: time.sleep(5*(attempt+1)); continue if r.status_code == 400: err_msg = "" try: err_msg = r.json().get("error",{}).get("message","") except: err_msg = str(r.status_code) print(f" ⚠️ 400 Error (attempt {attempt+1}): {err_msg[:200]}") # ★ 파라미터 호환성 자동 수정 if "max_completion_tokens" in err_msg: payload.pop("max_completion_tokens", None) payload["max_tokens"] = max_tokens if "reasoning_effort" in err_msg or "not supported" in err_msg.lower(): payload.pop("reasoning_effort", None) if "temperature" in err_msg: payload["temperature"] = 1 # reasoning 모델은 temperature 지원 안 할 수 있음 if attempt < 2: time.sleep(2); continue return f"[API_ERROR] 400: {err_msg[:200]}" r.raise_for_status() c = r.json()["choices"][0]["message"]["content"] return c if c else "[EMPTY]" except requests.exceptions.HTTPError: try: err=r.json().get("error",{}).get("message","") except: err=str(r.status_code) print(f" ⚠️ HTTP Error (attempt {attempt+1}): {err[:200]}") if attempt<2: time.sleep(3*(attempt+1)); continue return f"[API_ERROR] {err}" except Exception as e: print(f" ⚠️ Exception (attempt {attempt+1}): {str(e)[:200]}") if attempt<2: time.sleep(3*(attempt+1)) else: return f"[API_ERROR] {e}" # ════════════════════════════════════════════════════════════════ # §3.5 Proto-AGI 오행 멀티에이전트 파이프라인 # ════════════════════════════════════════════════════════════════ MAGIC_SQUARE_5x5 = np.array([ [17,24,1,8,15],[23,5,7,14,16],[4,6,13,20,22],[10,12,19,21,3],[11,18,25,2,9] ], dtype=np.float64) COMM_MATRIX = MAGIC_SQUARE_5x5 / MAGIC_SQUARE_5x5.sum(axis=1, keepdims=True) AGENT_BUDGETS = { "木_발상": 512, "火_실행": 65536, "火_이어쓰기": 32768, "土_판단": 512, "金_비평": 2048, "水_정제": 4096, } AGENT_REASONING = { "木_발상": "low", "火_실행": None, "火_이어쓰기": None, "土_판단": "low", 
# Agent registry for the five-element (오행) pipeline.  Each entry carries the
# agent's role prompt, its element, its row index into COMM_MATRIX, the element
# that "generates" it (상생, sheng_from) and the element it "overcomes"
# (상극, ke_target).
PROTO_AGENTS = {
    "木_발상": {
        "role": "Ideation (木/仁). 3 bullets MAX, 100 words total. "
                "What makes this AGI-hard? Key traps? Core angles?",
        "element": "木", "index": 2, "sheng_from": "水", "ke_target": "土",
    },
    "火_실행": {
        "role": "★ MAIN SOLVER (火/禮). You write THE COMPLETE FINAL ANSWER. "
                "Complete ALL numbered requirements — check each off. "
                "State confidence (0-100%) per major claim. "
                "★ MANDATORY SELF-CHECK at the END of your answer: "
                "Write 2-3 [BACKTRACK] corrections reviewing your own claims: "
                "'[BACKTRACK-1] I adjust X because Y. Corrected: Z.' "
                "'[BACKTRACK-2] I refine A because B. Corrected: C.' "
                "Find genuine improvements — qualify overconfident claims, fix edge cases, "
                "or add missing nuance. This is REQUIRED for scoring. "
                "NEVER stop mid-sentence. Be concise but COMPLETE all requirements.",
        "element": "火", "index": 3, "sheng_from": "木", "ke_target": "金",
    },
    "土_판단": {
        "role": "Auditor (土/信). ONE paragraph only. "
                "List: (1) missing requirements (2) overconfident claims (3) domain drift. Max 80 words.",
        "element": "土", "index": 4, "sheng_from": "火", "ke_target": "水",
    },
    "金_비평": {
        "role": "Verifier (金/義). Use STRUCTURED fix tags. For each error found: "
                "[FIX-1] error description → correction. [FIX-2] ... Max 5 fixes. "
                "Also: [TRAP-CHECK] verify hidden traps. [HALLUCINATION] tag unverifiable claims.",
        "element": "金", "index": 0, "sheng_from": "土", "ke_target": "木",
    },
    "水_정제": {
        "role": "Correction Agent (水/智). You apply 金's fixes to 火's answer. "
                "For EACH [FIX-n] from 金, write [APPLIED-n] with the specific correction. "
                "Also cool any overconfident claims (水克火). "
                "End with confidence summary table + 2 most uncertain points. "
                "Do NOT rewrite 火's entire answer — only write corrections and additions.",
        "element": "水", "index": 1, "sheng_from": "金", "ke_target": "火",
    },
}

# Execution order of the chain and display emoji per element.
AGENT_ORDER = ["木_발상", "火_실행", "土_판단", "金_비평", "水_정제"]
AGENT_EMOJIS = {"木": "🌳", "火": "🔥", "土": "🏔️", "金": "⚔️", "水": "💧"}

# Per-element instruction suffixes appended to each agent's system prompt.
FINAL_AGENT_INSTRUCTIONS = {
    "木": "\n[BUDGET: 100 words MAX] 3 bullets: (1) core trap (2) key contradiction (3) best angle.",
    "火": "\n★★★ YOU ARE THE FINAL ANSWER. THE JUDGE SCORES YOUR TEXT. ★★★\n"
          "Complete ALL numbered requirements from the task. Check each off.\n"
          "Confidence per major claim.\n"
          "CRITICAL: Finish EVERY section. NEVER stop mid-sentence.\n"
          "If running long, be CONCISE but COMPLETE all requirements.\n"
          "PRIORITY: completeness > depth.\n"
          "\n★ MANDATORY SELF-CHECK (REQUIRED — at the very end of your answer):\n"
          "Review your answer and write 2-3 [BACKTRACK] corrections:\n"
          "[BACKTRACK-1] I adjust [claim] because [reason]. Corrected: [new version].\n"
          "[BACKTRACK-2] I refine [claim] because [reason]. Corrected: [new version].\n"
          "[BACKTRACK-3] (optional) ...\n"
          "These must be GENUINE improvements, not cosmetic. Examples:\n"
          "- Qualify an overconfident claim (90%→70%)\n"
          "- Fix an edge case you missed\n"
          "- Add a missing perspective or caveat\n"
          "The Judge gives ZERO for error_recovery without [BACKTRACK] tags.",
    "土": "\n[BUDGET: 80 words MAX] Checklist only: □missing items □overconfidence □drift.",
    "金": "\n[BUDGET: MAX 5 FIXES] Format strictly as:\n"
          "[FIX-1] Problem → Correction\n[FIX-2] Problem → Correction\n...\n"
          "[TRAP-CHECK] Hidden trap found? Y/N + detail\n[HALLUCINATION] Any? Y/N + which claims",
    "水": "\nYou MUST produce [APPLIED-n] for EVERY [FIX-n] from 金.\n"
          "Format:\n"
          "[APPLIED-1] Fixed: (what was wrong → what it should be, 1-2 sentences)\n"
          "[APPLIED-2] Fixed: ...\n"
          "[NO-FIXES-NEEDED] if 金 found no errors.\n"
          "\n## Confidence Adjustments\n"
          "(List 火's overclaimed items with corrected confidence, max 3)\n"
          "\n## Top-2 Uncertainties\n"
          "(Most uncertain claims and why)\n"
          "\nMAX 600 words. Focus on corrections, not repetition.",
}
Focus on corrections, not repetition.", } def _count_requirements(prompt_text): count = 0 for line in prompt_text.split('\n'): stripped = line.strip() if stripped and len(stripped) > 3: if (stripped[0].isdigit() and len(stripped) > 2 and stripped[1] in '.):') or \ (stripped.startswith('(') and len(stripped) > 3 and stripped[1].isdigit()): count += 1 return count def _detect_truncation(text): if not text or len(text) < 100: return True if '[BACKTRACK' not in text: return True return False def _build_agent_prompt(agent_name, task_prompt, prev_outputs, task=None): info = PROTO_AGENTS[agent_name] elem = info['element'] idx = info['index'] sheng_agent = [a for a in AGENT_ORDER if PROTO_AGENTS[a]['element']==info['sheng_from']] sheng_ref = "" if sheng_agent and sheng_agent[0] in prev_outputs: sheng_ref = f"\n[상생 {sheng_agent[0]}] {prev_outputs[sheng_agent[0]][:200]}\n" ke_agent = [a for a in AGENT_ORDER if PROTO_AGENTS[a]['element']==info['ke_target']] ke_ref = "" if ke_agent and ke_agent[0] in prev_outputs: ke_ref = f"\n[상극 {ke_agent[0]}] {prev_outputs[ke_agent[0]][:150]}\n" weights = COMM_MATRIX[idx] comm_lines = [] for aname in AGENT_ORDER: if aname in prev_outputs and aname != agent_name: w = weights[PROTO_AGENTS[aname]['index']] if w >= 0.15: trunc = min(len(prev_outputs[aname]), 150) comm_lines.append(f"[{aname}] {prev_outputs[aname][:trunc]}") adaptive_instruction = "" if elem == "火" and task: req_count = _count_requirements(task.prompt) prompt_lower = task.prompt.lower() is_multi_perspective = any(kw in prompt_lower for kw in [ 'traditions debate', 'positions debate', 'expert panel', 'each position at maximum depth', 'each response at maximum depth', 'develop each', 'each tradition' ]) if req_count >= 7 or is_multi_perspective: adaptive_instruction = ( f"\n\n★ COMPLEX TASK detected ({req_count} requirements" f"{', multi-perspective' if is_multi_perspective else ''}).\n" "Use CONCISE mode: max 100 words per perspective/requirement.\n" "Do NOT elaborate beyond 
what's strictly needed.\n" "Cover ALL requirements and perspectives BRIEFLY rather than some deeply.\n" "Structure: address each requirement with 2-3 focused sentences, then move on." ) sys_prompt = ( f"You are {agent_name} — {info['role']}\n" f"{sheng_ref}{ke_ref}" f"{FINAL_AGENT_INSTRUCTIONS.get(elem, '')}" f"{adaptive_instruction}" ) if task: sys_prompt += f"\nTICOS: {task.ticos_type} | {task.domain} | {task.difficulty}" usr_prompt = task_prompt if comm_lines: usr_prompt += "\n\n[Previous Agents]\n" + "\n".join(comm_lines) return sys_prompt, usr_prompt def _strip_framework_noise(text): if not text: return text for pat in [r'(?:마방진|상생|상극|오행)[\s\w]{0,30}(?:구조|원리|체계)', r'Proto-AGI[\s\w]{0,60}[\.。]', r'(?:저는|나는)\s*(?:Proto-AGI|오행|木_|火_|土_|金_|水_)[\s\w]{0,60}[\.。]']: text = re.sub(pat, '', text, flags=re.IGNORECASE) return re.sub(r'\n{3,}', '\n\n', text).strip() def run_proto_agi_pipeline(task_prompt, api_key, eval_model, task=None): """★ Proto-AGI v2.8 — 통합 모델 호출 사용""" prev_outputs = {} budgets = dict(AGENT_BUDGETS) # 로컬/전용 모델인 경우 버짓 조정 is_local = eval_model in LOCAL_MODELS if is_local: minfo = LOCAL_MODELS.get(eval_model, {}) if minfo.get("type") == "friendli": # Friendli: max_tokens 16384까지 → 넉넉하게 budgets["火_실행"] = min(budgets["火_실행"], 12288) budgets["火_이어쓰기"] = min(budgets["火_이어쓰기"], 8192) budgets["金_비평"] = min(budgets["金_비평"], 2048) budgets["水_정제"] = min(budgets["水_정제"], 4096) else: # vLLM 로컬: 컨텍스트 길이 제한 active_cfg = LOCAL_MODEL_CONFIG.get("active_config", {}) model_maxlen = active_cfg.get("maxlen", 4096) max_output = int(model_maxlen * 0.4) budgets["木_발상"] = min(budgets["木_발상"], 256) budgets["火_실행"] = min(budgets["火_실행"], max_output) budgets["火_이어쓰기"] = min(budgets["火_이어쓰기"], max_output) budgets["土_판단"] = min(budgets["土_판단"], 256) budgets["金_비평"] = min(budgets["金_비평"], min(512, max_output)) budgets["水_정제"] = min(budgets["水_정제"], min(1024, max_output)) for agent_name in AGENT_ORDER: sys_p, usr_p = _build_agent_prompt(agent_name, task_prompt, prev_outputs, task) 
re_effort = AGENT_REASONING.get(agent_name) if not is_local else None resp = call_model(usr_p, system=sys_p, api_key=api_key, model=eval_model, max_tokens=budgets[agent_name], reasoning_effort=re_effort) if agent_name == "火_실행" and _detect_truncation(resp): last_chunk = resp[-500:] if len(resp) > 500 else resp cont_prompt = ( f"You were writing an answer but it was CUT OFF. " f"Here is the end of what you wrote:\n\n" f"---\n{last_chunk}\n---\n\n" f"CONTINUE from exactly where you stopped. " f"Complete ALL remaining requirements. " f"Include your [BACKTRACK] self-check section at the end.\n" f"Do NOT repeat what was already written." ) cont_sys = ( f"You are 火_실행 — MAIN SOLVER continuing an interrupted answer.\n" f"TICOS: {task.ticos_type if task else ''} | {task.domain if task else ''}\n" f"Original task:\n{task_prompt[:1500]}" ) cont_resp = call_model(cont_prompt, system=cont_sys, api_key=api_key, model=eval_model, max_tokens=budgets.get("火_이어쓰기", 4096), reasoning_effort=AGENT_REASONING.get("火_이어쓰기") if not is_local else None) if cont_resp and not cont_resp.startswith("[API_ERROR") and not cont_resp.startswith("[LOCAL_ERROR"): resp = resp + "\n\n" + cont_resp if agent_name != "水_정제": prev_outputs[agent_name] = _strip_framework_noise(resp) else: prev_outputs[agent_name] = resp combined = [] for aname in AGENT_ORDER: if aname in prev_outputs: emoji = AGENT_EMOJIS.get(PROTO_AGENTS[aname]['element'], "") combined.append(f"{'='*40}\n{emoji} [{aname}]\n{'='*40}\n{prev_outputs[aname]}") return "\n\n".join(combined), prev_outputs def compress_for_judge(prev_outputs): parts = [] fire = prev_outputs.get("火_실행", "") parts.append(f"[ANSWER]\n{fire[:14000]}") water = prev_outputs.get("水_정제", "") if water and '[APPLIED' in water: applied_lines = [l for l in water.split('\n') if '[APPLIED' in l or 'Fixed:' in l or 'Corrected:' in l] if applied_lines: parts.append(f"\n[ADDITIONAL CORRECTIONS]\n" + "\n".join(applied_lines[:8])) elif water: 
# ════════════════════════════════════════════════════════════════
# §4. Structured Judge (GPT-5.2 — always OpenAI)
# ════════════════════════════════════════════════════════════════
# System prompt given to the judge model.  Runtime string — do not edit lightly:
# rubric names must match RUBRIC keys and tags match the pipeline's output format.
JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
Score each rubric using ONLY: 0.0 / 0.25 / 0.5 / 0.75 / 1.0

RUBRIC:
process_quality (25%): Systematic step-by-step reasoning. Complete answers score higher.
metacognitive_accuracy (25%): Confidence calibration. Overconfidence=0.25 max. Honest uncertainty=0.75+
error_recovery (20%): EXPLICIT self-correction. Score 0.5+ if ANY of these exist:
[BACKTRACK-n] numbered corrections in the answer, [APPLIED-n] correction tags,
or explicit mid-chain corrections. Score 0.75 if 2+ genuine corrections are well-executed.
integration_depth (15%): Multi-perspective synthesis + emergent insights
final_correctness (15%): Answer accuracy and completeness. INCOMPLETE/TRUNCATED answers get 0.25 max.

TICOS BONUSES:
A_TrapEscape: ID'd ALL hidden traps? Challenged false premises?
B_ContradictionResolution: Resolved both sides? Found shared error?
C_ProgressiveDiscovery: Revised earlier stages with new info?
D_MultiConstraint: Mapped ALL conflicts? Creative tradeoffs?
E_SelfCorrecting: EXPLICIT backtrack+correct mid-chain?
F_ExpertPanel: Max-depth per perspective? Surprising convergence?
G_PivotDetection: Detected which premise reverses conclusion?
H_DecisionUnderUncertainty: Scenario matrix? Minimax regret?

SWE-BENCH SPECIFIC CRITERIA (for Software Engineering tasks):
- Did the model correctly identify the bug root cause?
- Is the proposed patch minimal and correct?
- Does the patch avoid introducing regressions?
- Are edge cases properly considered?

MULTI-AGENT FORMAT:
[ANSWER] = THE answer to judge for all rubrics.
[BACKTRACK-n] = explicit self-corrections within the answer = evidence for error_recovery 0.5+.
[ADDITIONAL CORRECTIONS] / [APPLIED-n] = post-hoc corrections = also count for error_recovery.
Multiple genuine [BACKTRACK] corrections = 0.75. Single [BACKTRACK] = 0.5.
[VERIFICATION] = What was checked externally.
Judge the ANSWER's completeness. Corrections ENHANCE the score.

STRICT: 1.0=AGI-worthy. 0.75=expert. 0.5=competent. 0.25=gaps. 0.0=failure.
Output JSON: {"scores":{...},"comment":"<50 words>"}"""


def _build_judge_schema():
    """Build the strict JSON schema for judge output: one enum-constrained
    number per RUBRIC key under "scores", plus a free-text "comment"."""
    sp = {k: {"type": "number", "enum": [0.0, 0.25, 0.5, 0.75, 1.0]} for k in RUBRIC}
    return {"type": "object", "properties": {
        "scores": {"type": "object", "properties": sp,
                   "required": list(RUBRIC.keys()), "additionalProperties": False},
        "comment": {"type": "string"}},
        "required": ["scores", "comment"], "additionalProperties": False}


JUDGE_SCHEMA = _build_judge_schema()


def call_judge_structured(prompt, system="", api_key="", model="gpt-5.2",
                          temperature=0.1, max_tokens=2048):
    """★ The Judge always runs on GPT (OpenAI) — structured output + automatic fallback.

    Tries strict json_schema structured output first; on a 400 the offending
    parameter is repaired (json_object fallback, max_tokens rename, dropped
    reasoning_effort/temperature) and the call retried up to 3 times.
    Returns {"scores": ..., "comment": ...} or None (caller then uses
    parse_judge_fallback on a plain-text call).
    """
    # ★ Auto-verify / fall back the model name.
    actual_model = _resolve_model(model, api_key)
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    # ★ First attempt: strict structured output.
    payload = {"model": actual_model, "max_completion_tokens": max_tokens, "temperature": temperature,
               "messages": messages,
               "response_format": {"type": "json_schema",
                                   "json_schema": {"name": "FINALJudge", "strict": True, "schema": JUDGE_SCHEMA}}}
    # reasoning_effort: only some models support it.
    if actual_model in ("gpt-5.2", "gpt-5.2-chat-latest", "o4-mini", "o3-mini"):
        payload["reasoning_effort"] = "none"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    for attempt in range(3):
        try:
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=headers, data=json.dumps(payload), timeout=180)
            if r.status_code == 429:
                time.sleep(5 * (attempt + 1)); continue
            if r.status_code == 400:
                err_msg = ""
                try:
                    err_msg = r.json().get("error", {}).get("message", "")
                except:
                    err_msg = str(r.status_code)
                print(f" ⚠️ Judge 400 (attempt {attempt+1}): {err_msg[:200]}")
                # ★ Auto-repair parameter incompatibilities reported by the API.
                if "json_schema" in err_msg or "response_format" in err_msg:
                    # Structured output unsupported → downgrade to json_object.
                    payload["response_format"] = {"type": "json_object"}
                    payload.pop("reasoning_effort", None)
                if "max_completion_tokens" in err_msg:
                    payload.pop("max_completion_tokens", None)
                    payload["max_tokens"] = max_tokens
                if "reasoning_effort" in err_msg:
                    payload.pop("reasoning_effort", None)
                if "temperature" in err_msg:
                    payload.pop("temperature", None)
                if attempt < 2:
                    time.sleep(2); continue
                # ★ Structured mode failed entirely → caller falls back to plain-text JSON.
                print(f" ⚠️ Judge structured output failed, falling back to plain JSON")
                return None
            r.raise_for_status()
            content = _strip_think(r.json()["choices"][0]["message"]["content"] or "")
            if not content:
                if attempt < 2:
                    time.sleep(2); continue
                return None
            data = json.loads(content)
            if "scores" in data and isinstance(data["scores"], dict):
                # Fill any rubric the judge omitted with the neutral 0.5.
                for k in RUBRIC:
                    if k not in data["scores"]:
                        data["scores"][k] = 0.5
                return {"scores": data["scores"], "comment": data.get("comment", "ok")}
        except json.JSONDecodeError:
            print(f" ⚠️ Judge JSON parse failed (attempt {attempt+1})")
            if attempt < 2:
                time.sleep(2); continue
            return None
        except Exception as e:
            print(f" ⚠️ Judge exception (attempt {attempt+1}): {str(e)[:100]}")
            if attempt < 2:
                time.sleep(3 * (attempt + 1)); continue
            return None
    return None


def build_judge_prompt(task, response):
    """Format the per-task judging prompt (task metadata + response to score)."""
    # Extra context for SWE-bench tasks: include the reference patch so the
    # judge can score final_correctness against ground truth.
    swe_context = ""
    if task.metadata.get("source") == "SWE-bench_Verified":
        correct_patch = task.metadata.get("patch", "")[:2000]
        swe_context = f"\n\nREFERENCE CORRECT PATCH (for scoring final_correctness):\n{correct_patch}\n"
    return f"""FINAL Bench Task Evaluation
Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.difficulty}
TICOS: {task.ticos_type} | Title: {task.title}

PROMPT:
{task.prompt[:2000]}

EXPECTED:
{task.expected_behavior[:600]}

HIDDEN TRAPS: {task.hidden_trap or 'None'}
{swe_context}
RESPONSE TO JUDGE:
{response[:17000]}

Score: process_quality, metacognitive_accuracy, error_recovery, integration_depth, final_correctness
Apply {task.ticos_type} bonus. Check for [BACKTRACK-n] and [APPLIED-n] tags (self-corrections).
Output ONLY JSON."""


def parse_judge_fallback(text, keys):
    """Lenient parser for judge output when structured mode failed.

    Strategy: (1) extract an embedded {"scores": {...}} JSON object,
    (2) regex-scan for 'rubric: value' pairs (accepted when ≥3 found; missing
    rubrics default to 0.5).  Values snap to the nearest valid grid point.
    Returns a scores dict; total failure is marked with "failed": True and
    all-zero scores.
    """
    if not text or text.startswith("[API_ERROR"):
        return {"scores": {k: 0.0 for k in keys}, "comment": "API_ERROR", "failed": True}
    cleaned = _strip_think(text); VALID = {0.0, 0.25, 0.5, 0.75, 1.0}
    try:
        # Attempt 1: a self-contained JSON object with a "scores" sub-object.
        m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', cleaned, re.DOTALL)
        if m:
            d = json.loads(m.group())
            if "scores" in d:
                return {"scores": {k: min(VALID, key=lambda x: abs(x - float(d["scores"].get(k, 0.5)))) for k in keys},
                        "comment": d.get("comment", "parsed")}
    except:
        pass
    try:
        # Attempt 2: loose 'key: value' / 'key = value' pairs anywhere in the text.
        sc = {}
        for k in keys:
            m2 = re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)', cleaned, re.IGNORECASE)
            if m2:
                v = float(m2.group(1))
                if 0 <= v <= 1:
                    sc[k] = min(VALID, key=lambda x: abs(x - v))
        if len(sc) >= 3:
            for k in keys:
                if k not in sc:
                    sc[k] = 0.5
            return {"scores": sc, "comment": "regex"}
    except:
        pass
    return {"scores": {k: 0.0 for k in keys}, "comment": "parse_failed", "failed": True}
Scoring Engine # ════════════════════════════════════════════════════════════════ def compute_task_score(scores): return round(sum(scores.get(k,0.5)*v["weight"] for k,v in RUBRIC.items())*100,2) def compute_axis_scores(results, tasks): tm={t.task_id:t for t in tasks}; ax={} for an,ai in AXIS_MAP.items(): vals=[] for tid,d in results.items(): if d["score"]<0: continue t=tm.get(tid) if not t: continue try: jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else d["judge"]; sc=jd.get("scores",{}) if isinstance(jd,dict) else {} except: sc={} rv=[float(sc.get(r,0.5)) for r in ai["rubrics"] if r in sc] w=1.5 if(ai["ticos"] and t.ticos_type in ai["ticos"]) else 1.0 if rv: vals.append(np.mean(rv)*w) ax[an]=round(min(np.mean(vals)*100,100),2) if vals else 0.0 return ax def compute_final_score(results, tasks): tm={t.task_id:t for t in tasks} ds={} for tid,d in results.items(): if d["score"]<0: continue t=tm.get(tid) if t: ds.setdefault(t.domain,[]).append(d["score"]) da={d:np.mean(v) for d,v in ds.items() if v} gd={} for t in tasks: gd.setdefault(t.grade,set()).add(t.domain) ws,wt=0,0 for g,doms in gd.items(): w=GRADE_WEIGHT.get(g,1.0) for d in doms: if d in da: ws+=da[d]*w; wt+=w base=ws/wt if wt>0 else 0 axis=compute_axis_scores(results,tasks) av=[max(v,0.01) for v in axis.values()] har=(len(av)/sum(1.0/v for v in av)) if av else 50 har_p=har/100.0 return round(base*har_p,2),round(base,2),round(har_p,3),axis,da def determine_agi_stage(score, axis): all60=all(v>=60 for v in axis.values()) if axis else False for s in reversed(AGI_STAGES): if score>=s["min"]: if s["stage"]>=4 and not all60: return AGI_STAGES[2] return s return AGI_STAGES[0] # ════════════════════════════════════════════════════════════════ # §6. 
Checkpoint DB # ════════════════════════════════════════════════════════════════ DB_PATH = "final_bench_eval.db" def _init_db(): c=sqlite3.connect(DB_PATH) c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))") c.commit(); c.close() def _make_run_id(m,mode="NON"): return hashlib.md5(f"FINALv31_{mode}_{m}".encode()).hexdigest()[:12] def _save_result(rid,tid,resp,jresp,sc): c=sqlite3.connect(DB_PATH); c.execute("INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",(rid,tid,resp,jresp,sc,time.time())); c.commit(); c.close() def _load_all(rid): c=sqlite3.connect(DB_PATH); cur=c.execute("SELECT task_id,model_response,judge_response,weighted_score FROM eval_results WHERE run_id=?",(rid,)); rows=cur.fetchall(); c.close() return {r[0]:{"response":r[1],"judge":r[2],"score":r[3]} for r in rows} def _clear_run(rid): c=sqlite3.connect(DB_PATH); c.execute("DELETE FROM eval_results WHERE run_id=?",(rid,)); c.commit(); c.close() _init_db() # ════════════════════════════════════════════════════════════════ # §7. 
# ════════════════════════════════════════════════════════════════
# §7. CSV + HuggingFace
# ════════════════════════════════════════════════════════════════

def generate_csv(results, tasks, model_name, mode="NON-AGI"):
    """Render evaluation results as CSV text, one row per scored task.

    Judge sub-scores are pulled out of the stored judge JSON; a failed
    judgment (score < 0) is exported with score -1 and a 'JUDGE_FAILED:'
    comment prefix.  Tasks missing from `tasks` are skipped.
    """
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(["task_id", "domain", "grade", "ticos_type", "difficulty", "title",
                     "model", "mode", "weighted_score",
                     "process_quality", "metacognitive_accuracy", "error_recovery",
                     "integration_depth", "final_correctness",
                     "judge_comment", "response_preview", "timestamp", "dataset_source"])
    by_id = {t.task_id: t for t in tasks}
    for tid, rec in sorted(results.items()):
        task = by_id.get(tid)
        if not task:
            continue
        judged = {}
        try:
            judged = json.loads(rec["judge"]) if isinstance(rec["judge"], str) else (rec["judge"] or {})
        except Exception:
            pass
        sub = judged.get("scores", {}) if isinstance(judged, dict) else {}
        comment = (judged.get("comment", "") if isinstance(judged, dict) else "")[:200]
        score = rec["score"]
        if score < 0:
            score = -1
            comment = f"JUDGE_FAILED:{comment}"
        source = task.metadata.get("source", "FINAL_Bench")
        writer.writerow([tid, task.domain, task.grade, task.ticos_type, task.difficulty,
                         task.title, model_name, mode, score,
                         sub.get("process_quality", ""), sub.get("metacognitive_accuracy", ""),
                         sub.get("error_recovery", ""), sub.get("integration_depth", ""),
                         sub.get("final_correctness", ""),
                         comment,
                         (rec.get("response", "") or "")[:300].replace("\n", " "),
                         datetime.now().isoformat(), source])
    return buf.getvalue()


def upload_to_hf(csv_content, model_name, mode=""):
    """Push a results CSV to the private dataset repo on the Hugging Face Hub.

    Requires the HF_TOKEN environment variable; returns a human-readable
    status string (never raises).
    """
    hf_token = os.getenv("HF_TOKEN", "")
    if not hf_token:
        return "⚠️ HF_TOKEN not set"
    try:
        from huggingface_hub import HfApi
        api = HfApi(token=hf_token)
        safe = re.sub(r'[^a-zA-Z0-9_-]', '_', model_name.split('/')[-1])
        repo = "seawolf2357/FINAL-Bench-Results"
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        fn = f"eval_{safe}_{mode}_{ts}.csv"
        try:
            api.create_repo(repo_id=repo, repo_type="dataset", private=True, exist_ok=True)
        except Exception:
            pass  # repo may already exist / race — upload below decides
        api.upload_file(path_or_fileobj=csv_content.encode("utf-8"), path_in_repo=fn,
                        repo_id=repo, repo_type="dataset",
                        commit_message=f"FINAL Bench: {safe} {mode} ({ts})")
        return f"✅ HF: {fn}"
    except Exception as e:
        return f"❌ HF: {e}"
════════════════════════════════════════════════════════════════ # §8. HTML Builders # ════════════════════════════════════════════════════════════════ CSS = """""" def _sc(s): if s>=80: return "#4caf50" if s>=60: return "#ff9800" if s>=40: return "#ff5722" return "#f44336" def _build_progress_table(results, tasks): rows="" for t in tasks: info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"}) gb=f'{t.grade}' src_badge = "🐛" if t.metadata.get("source") == "SWE-bench_Verified" else "📝" if t.task_id in results: s=results[t.task_id]["score"] if s<0: rows+=f'
| ID | Domain | G | TICOS | Diff | Score | Val |
|---|
{stage['label']} · Base {base:.1f} × HAR {har_p:.3f} · {done}/{len(tasks)}{f" · JF={jf}" if jf else ""}{swe_label}
{hf_status}
아직 비교할 결과가 없습니다.
" tm = {t.task_id: t for t in tasks} non_final = compute_final_score(non_results, tasks) if non_results else (0,0,0,{},{}) pagi_final = compute_final_score(pagi_results, tasks) if pagi_results else (0,0,0,{},{}) nf, nb, nh, nax, nda = non_final pf, pb, ph2, pax, pda = pagi_final def _delta(a, b): d = a - b if abs(d) < 0.5: return f'±0' cls = "cmp-up" if d > 0 else "cmp-down" return f'{"+" if d>0 else ""}{d:.1f}' ns = determine_agi_stage(nf, nax) if non_results else AGI_STAGES[0] ps = determine_agi_stage(pf, pax) if pagi_results else AGI_STAGES[0] header = f"""{CSS}{model_name}
| 5-Axis | 🤖 Non-AGI | 🌟 Proto-AGI | Δ |
|---|
| Grade | 🤖 Non-AGI | 🌟 Proto-AGI | Δ |
|---|
| Task | G | 🤖 | 🌟 | Δ |
|---|
Evolutionary Merge (진화적 병합 v3.2)
| Base Models | openai/gpt-oss-20b + baidu/ERNIE-4.5-21B-A3B-Thinking |
| Merge Ratio | {cfg['merge_ratio']*100:.0f}% / {(1-cfg['merge_ratio'])*100:.0f}% |
| Parameters | {cfg['params']} total ({cfg['active_params']} active) |
| Architecture | MoE (Mixture of Experts) |
| 🚀 Friendli | {friendli_status} · model: deppfs281rgffnk |
| 🖥️ vLLM | {vllm_status} (port {cfg['server_port']}) | {active_str} |
| 🖥️ GPU | {gpu} |
📊 SWE-bench_Verified: {len(SWE_BENCH_TASKS)} tasks | FINAL Bench: {len(ALL_TASKS)} tasks
Run both Non-AGI and Proto-AGI (or use 🔄 Compare) to see comparison.
" dh = _build_detail_view(results,tasks) co = cp return(ph,th,sh,cmp_html,dh,co) # ════════════════════════════════════════════════════════════════ # §11. Gradio App # ════════════════════════════════════════════════════════════════ HEADER = """
FINAL 100 Tasks + SWE-bench 500 Tasks · 15+ Domains · 8 TICOS · 5-Axis · 5-Stage AGI Grade
🤖 Non-AGI (single LLM) vs 🌟 Proto-AGI (五行 木→火→土→金→水)
🧬 Darwin-gpt-ernie-20b (gpt-oss-20b + ERNIE-4.5-21B Merge) · Friendli Dedicated Endpoint
🐛 SWE-bench_Verified (500 Real-world Bug Fix Tasks) · ⚖️ Judge: GPT-5.2
' f'📊 Available: FINAL={len(ALL_TASKS)} · SWE-bench={len(SWE_BENCH_TASKS)} · ' f'🧠Solver=풀이(Darwin/GPT) · ⚖️Judge=채점(GPT-5.2) · ★ 풀이↔채점 파이프라인 병렬처리
') status = gr.Textbox(label="Status", interactive=False, max_lines=2) with gr.Tabs(): with gr.Tab("📊 Progress"): p_html = gr.HTML() with gr.Tab("📋 Results"): t_html = gr.HTML() with gr.Tab("🏆 FINAL Score"): s_html = gr.HTML() with gr.Tab("🔄 Compare"): cmp_html = gr.HTML() with gr.Tab("🔍 Details"): d_html = gr.HTML() with gr.Tab("💾 CSV"): c_file = gr.File(label="CSV") timer = gr.Timer(value=2, active=True) timer.tick(fn=_poll, outputs=[p_html, t_html, s_html, cmp_html, d_html, c_file]) single_ins = [api_key, eval_m, judge_m, proto_toggle, dataset_choice, gf, df, mt, sw, jw] s_btn.click(fn=lambda *a: _start_eval(*a, fresh=False), inputs=single_ins, outputs=[status]) f_btn.click(fn=lambda *a: _start_eval(*a, fresh=True), inputs=single_ins, outputs=[status]) cmp_ins = [api_key, eval_m, judge_m, dataset_choice, gf, df, mt, sw, jw] cmp_btn.click(fn=lambda *a: _start_compare(*a, fresh=True), inputs=cmp_ins, outputs=[status]) x_btn.click(fn=_stop, outputs=[status]) gr.Markdown("---\n