# (Hugging Face Spaces page header — non-code residue removed from extraction)
| """ | |
| FINAL Bench v3.1 — AGI-Level Verification System + Local Model Eval | |
| Frontier Intelligence Nexus for AGI-Level Verification | |
| ★ Non-AGI vs Proto-AGI 비교 평가 | |
| ★ 100 FINAL Tasks + 500 SWE-bench Verified Tasks | |
| ★ GPT-5.2 Eval + GPT-5.2 Structured Output Judge | |
| ★ Proto-AGI 오행 완전체: 木→火→土→金→水 | |
| ★ Local Model Support: Darwin-gpt-ernie-20b (vLLM) | |
| ★ SWE-bench_Verified Dataset Integration | |
| Author: Ginigen AI (지니젠AI) — Choi Sunyoung | |
| License: Apache 2.0 | |
| """ | |
# Standard library
import csv
import hashlib
import html
import io
import json
import os
import re
import signal
import sqlite3
import subprocess
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Dict, List, Optional

# Third-party
import gradio as gr
import numpy as np
import requests
# ════════════════════════════════════════════════════════════════
# §1. Data Structures
# ════════════════════════════════════════════════════════════════
# Display metadata (icon + accent color) per task domain.
DOMAIN_INFO = {
    name: {"icon": icon, "color": color}
    for name, icon, color in [
        # Core FINAL Bench domains
        ("Mathematics & Logic",  "🔢", "#FF6B35"),
        ("Science",              "🔬", "#7B2FF7"),
        ("Philosophy",           "🤔", "#00B4D8"),
        ("Medicine",             "🏥", "#2EC4B6"),
        ("Economics",            "📈", "#E63946"),
        ("History",              "📜", "#F4A261"),
        ("War & Security",       "🛡️", "#264653"),
        ("Space & Physics",      "🚀", "#6C63FF"),
        ("Chemistry & Biology",  "🧬", "#06D6A0"),
        ("Language & Writing",   "✍️", "#EF476F"),
        ("Literature",           "📖", "#8338EC"),
        ("Art",                  "🎨", "#FF006E"),
        ("Religion & Mythology", "🕊️", "#FFD166"),
        ("Ethics",               "⚖️", "#118AB2"),
        ("AI & Technology",      "🤖", "#073B4C"),
        # SWE-bench domains
        ("Software Engineering", "💻", "#00897B"),
        ("Bug Fix",              "🐛", "#D84315"),
        ("Code Patch",           "🔧", "#5E35B1"),
    ]
}

# Scoring multiplier applied per task grade (A = hardest, highest weight).
GRADE_WEIGHT = {"A": 1.5, "B": 1.0, "C": 0.7}

# Judge rubric: dimension → weight (weights sum to 1.0) and description.
RUBRIC = {
    "process_quality":        {"weight": 0.25, "desc": "Systematic reasoning transparency"},
    "metacognitive_accuracy": {"weight": 0.25, "desc": "Confidence calibration + uncertainty honesty"},
    "error_recovery":         {"weight": 0.20, "desc": "Mid-analysis self-correction"},
    "integration_depth":      {"weight": 0.15, "desc": "Multi-perspective synthesis + emergent insights"},
    "final_correctness":      {"weight": 0.15, "desc": "Answer accuracy and completeness"},
}

# Capability axes → which rubric dimensions and TICOS signals feed each axis.
AXIS_MAP = {
    "generalization": {"rubrics": ["process_quality", "final_correctness"],     "ticos": []},
    "reasoning":      {"rubrics": ["process_quality", "error_recovery"],        "ticos": ["E_SelfCorrecting", "C_ProgressiveDiscovery"]},
    "planning":       {"rubrics": ["integration_depth", "process_quality"],     "ticos": ["D_MultiConstraint", "H_DecisionUnderUncertainty"]},
    "reliability":    {"rubrics": ["metacognitive_accuracy"],                   "ticos": ["E_SelfCorrecting", "G_PivotDetection"]},
    "safety":         {"rubrics": ["error_recovery", "metacognitive_accuracy"], "ticos": ["A_TrapEscape", "G_PivotDetection"]},
}

# Overall score (0-100) → AGI maturity stage banding (inclusive min/max).
AGI_STAGES = [
    {"stage": 1, "name": "FINAL-Partial", "label": "Partial Intelligence",     "min": 0,  "max": 39,  "color": "#f44336"},
    {"stage": 2, "name": "FINAL-Proto",   "label": "Proto Intelligence",       "min": 40, "max": 59,  "color": "#ff9800"},
    {"stage": 3, "name": "FINAL-Pre",     "label": "Pre-AGI",                  "min": 60, "max": 79,  "color": "#2196f3"},
    {"stage": 4, "name": "FINAL-Pass",    "label": "AGI Achieved",             "min": 80, "max": 94,  "color": "#4caf50"},
    {"stage": 5, "name": "FINAL-Post",    "label": "Operationally Mature AGI", "min": 95, "max": 100, "color": "#9c27b0"},
]
@dataclass
class FinalTask:
    """A single benchmark task (either a FINAL Bench item or a converted
    SWE-bench_Verified instance).

    BUG FIX: the ``@dataclass`` decorator was missing. Without it,
    ``field(default_factory=...)`` just leaves ``Field`` sentinel objects as
    class attributes, and the keyword constructions used elsewhere in this
    file (``FinalTask(task_id=..., ...)``) raise ``TypeError`` because the
    plain class has no generated ``__init__``.
    """
    task_id: str            # unique id, e.g. "SWE_<instance_id>"
    domain: str             # key into DOMAIN_INFO
    grade: str              # "A" | "B" | "C" (see GRADE_WEIGHT)
    ticos_type: str         # primary TICOS signal type
    difficulty: str         # "easy" | "medium" | "hard" | "expert"
    lens: str               # evaluation lens, e.g. "code_analysis"
    title: str
    prompt: str             # full prompt sent to the evaluated model
    expected_behavior: str  # gold answer / reference patch
    hidden_trap: str        # hidden success criterion the judge checks
    ticos_required: List[str] = field(default_factory=list)
    metadata: Dict = field(default_factory=dict)
| # ════════════════════════════════════════════════════════════════ | |
| # §1.5 SWE-bench Verified Dataset Loader | |
| # ════════════════════════════════════════════════════════════════ | |
# Module-level cache of converted SWE-bench tasks (filled by the loader below
# and reassigned again at module load time in §2).
SWE_BENCH_TASKS: List[FinalTask] = []
def _load_swe_bench_verified():
    """Load the SWE-bench_Verified dataset from HuggingFace and convert each
    instance into a ``FinalTask``.

    Returns the converted list (also stored into ``SWE_BENCH_TASKS``), or []
    when the ``datasets`` library is missing or the download fails.
    Requires network access on first call; ``datasets`` caches afterwards.
    """
    global SWE_BENCH_TASKS
    try:
        from datasets import load_dataset
        print("📦 Loading SWE-bench_Verified from HuggingFace...")
        ds = load_dataset("SWE-bench/SWE-bench_Verified", split="test")
        print(f" ✅ Loaded {len(ds)} SWE-bench instances")
        # Difficulty mapping: annotated time-to-fix → bench difficulty tier
        diff_map = {"15 min fix": "easy", "15 min - 1 hour": "medium",
                    "1-4 hours": "hard", "4+ hours": "expert"}
        # TICOS type mapping, matched to SWE-bench task characteristics;
        # assigned round-robin below.
        ticos_types = [
            "E_SelfCorrecting",        # bug fixing = self-correction
            "D_MultiConstraint",       # multi-constraint resolution
            "C_ProgressiveDiscovery",  # progressive discovery
            "A_TrapEscape",            # trap escape (edge cases)
        ]
        # Grade assignment: based on annotated difficulty
        grade_map = {"15 min fix": "C", "15 min - 1 hour": "B",
                     "1-4 hours": "A", "4+ hours": "A"}
        tasks = []
        for i, item in enumerate(ds):
            instance_id = item.get("instance_id", f"swe_{i:04d}")
            repo = item.get("repo", "unknown")
            problem = item.get("problem_statement", "")
            patch = item.get("patch", "")
            test_patch = item.get("test_patch", "")
            hints = item.get("hints_text", "")
            difficulty_raw = item.get("difficulty", "15 min - 1 hour")
            version = item.get("version", "")
            fail_to_pass = item.get("FAIL_TO_PASS", "")
            base_commit = item.get("base_commit", "")
            # Convert to FinalTask fields
            difficulty = diff_map.get(difficulty_raw, "medium")
            grade = grade_map.get(difficulty_raw, "B")
            ticos = ticos_types[i % len(ticos_types)]
            # Domain selection (repo-based heuristic)
            if "django" in repo.lower():
                domain = "Software Engineering"
            elif "astropy" in repo.lower() or "scipy" in repo.lower() or "sympy" in repo.lower():
                domain = "Science"
            elif "matplotlib" in repo.lower():
                domain = "Art"
            else:
                domain = "Software Engineering"
            # Build the prompt: problem description + code context
            # (problem/hints are truncated to keep prompts within model limits)
            prompt_text = (
                f"## Software Bug Fix Task\n"
                f"**Repository**: {repo} (version {version})\n"
                f"**Base Commit**: {base_commit[:12]}...\n\n"
                f"### Problem Description:\n{problem[:3000]}\n\n"
            )
            if hints:
                prompt_text += f"### Hints:\n{hints[:1000]}\n\n"
            prompt_text += (
                f"### Requirements:\n"
                f"1. Analyze the bug described above\n"
                f"2. Identify the root cause in the codebase\n"
                f"3. Propose a minimal, correct patch\n"
                f"4. Explain why the fix is correct\n"
                f"5. Identify potential edge cases or regressions\n"
                f"6. State your confidence level for each claim\n"
            )
            # Expected behavior = the real (gold) patch
            expected = f"Correct patch:\n{patch[:2000]}"
            # Hidden trap = tests that must flip from FAIL to PASS after the fix
            hidden = f"Tests that must pass after fix: {fail_to_pass[:500]}"
            task = FinalTask(
                task_id=f"SWE_{instance_id}",
                domain=domain,
                grade=grade,
                ticos_type=ticos,
                difficulty=difficulty,
                lens="code_analysis",
                title=f"[{repo}] {instance_id}",
                prompt=prompt_text,
                expected_behavior=expected,
                hidden_trap=hidden,
                ticos_required=[ticos],
                metadata={
                    "source": "SWE-bench_Verified",
                    "repo": repo,
                    "instance_id": instance_id,
                    "base_commit": base_commit,
                    "version": version,
                    "difficulty_raw": difficulty_raw,
                    "patch": patch,            # keep gold patch (untruncated)
                    "test_patch": test_patch,  # keep test patch
                }
            )
            tasks.append(task)
        SWE_BENCH_TASKS = tasks
        print(f" ✅ Converted {len(tasks)} SWE-bench tasks to FinalTask format")
        # Summary statistics: top-10 repos by task count, grade distribution
        repos = {}
        for t in tasks:
            r = t.metadata.get("repo", "?")
            repos[r] = repos.get(r, 0) + 1
        print(f" 📊 Repos: {dict(sorted(repos.items(), key=lambda x:-x[1])[:10])}")
        grades = {}
        for t in tasks:
            grades[t.grade] = grades.get(t.grade, 0) + 1
        print(f" 📊 Grades: {grades}")
        return tasks
    except ImportError:
        print("⚠️ 'datasets' library not installed. Run: pip install datasets")
        return []
    except Exception as e:
        # Best-effort loader: any failure (network, schema) degrades to [].
        print(f"❌ SWE-bench loading failed: {e}")
        return []
| # ════════════════════════════════════════════════════════════════ | |
| # §1.6 Local Model Server (vLLM for Darwin-gpt-ernie-20b) | |
| # ════════════════════════════════════════════════════════════════ | |
# Runtime configuration + mutable state for the local Darwin model server.
# Static metadata first, then server/GPU state that other functions in this
# module read and update in place.
LOCAL_MODEL_CONFIG = dict(
    # -- model identity & requirements --
    model_id="seawolf2357/Darwin-gpt-ernie-20b",
    base_models=["openai/gpt-oss-20b", "baidu/ERNIE-4.5-21B-A3B-Thinking"],
    merge_ratio=0.50,
    params="21B",
    active_params="3.6B (MoE)",
    min_vram="16GB",
    # -- vLLM server state --
    server_port=8000,
    server_process=None,       # subprocess.Popen handle once launched
    server_ready=False,
    server_starting=False,
    # -- GPU detection results (filled by _detect_gpu) --
    gpu_detected=False,
    gpu_count=0,
    gpu_info="",               # first GPU's "name, total, free" line
    gpu_all=[],                # one line per GPU
    total_vram_mb=0,
    # -- launch bookkeeping --
    active_config={},          # the vLLM launch config that actually worked
    auto_start_attempted=False,
)
def _detect_gpu():
    """Detect NVIDIA GPUs via ``nvidia-smi``: count + VRAM totals.

    Populates the ``gpu_*`` / ``total_vram_mb`` keys of LOCAL_MODEL_CONFIG
    and returns True when at least one GPU is visible, False otherwise.
    """
    try:
        result = subprocess.run(["nvidia-smi", "--query-gpu=name,memory.total,memory.free",
                                 "--format=csv,noheader,nounits"],
                                capture_output=True, text=True, timeout=10)
        if result.returncode == 0 and result.stdout.strip():
            # One CSV line per GPU: "name, total_mb, free_mb"
            lines = [l.strip() for l in result.stdout.strip().split('\n') if l.strip()]
            LOCAL_MODEL_CONFIG["gpu_detected"] = True
            LOCAL_MODEL_CONFIG["gpu_count"] = len(lines)
            LOCAL_MODEL_CONFIG["gpu_info"] = lines[0]  # first GPU only
            LOCAL_MODEL_CONFIG["gpu_all"] = lines      # full GPU list
            # Sum total VRAM across all GPUs (MB).
            total_vram = 0
            for line in lines:
                parts = [p.strip() for p in line.split(',')]
                if len(parts) >= 2:
                    try:
                        total_vram += int(parts[1])
                    except ValueError:
                        # Malformed memory field — skip it, keep best-effort total.
                        pass
            LOCAL_MODEL_CONFIG["total_vram_mb"] = total_vram
            print(f" 🖥️ GPU detected: {len(lines)}x {lines[0]}")
            print(f" 💾 Total VRAM: {total_vram/1024:.1f} GB")
            return True
    except Exception:
        # BUG FIX: was a bare `except:` which also swallowed
        # SystemExit/KeyboardInterrupt. nvidia-smi missing/timeout → no GPU.
        pass
    LOCAL_MODEL_CONFIG["gpu_detected"] = False
    LOCAL_MODEL_CONFIG["gpu_count"] = 0
    print(" ⚠️ No GPU detected (nvidia-smi failed)")
    return False
def _probe_vllm_server(port=None):
    """Return True if a vLLM server already answers on ``port``.

    Tries ``/health`` first, then the OpenAI-compatible ``/v1/models``
    endpoint. ``port`` defaults to LOCAL_MODEL_CONFIG["server_port"].
    """
    port = port or LOCAL_MODEL_CONFIG["server_port"]
    # Same probe against both endpoints; a 200 from either means "up".
    for path in ("/health", "/v1/models"):
        try:
            r = requests.get(f"http://localhost:{port}{path}", timeout=5)
            if r.status_code == 200:
                return True
        except Exception:
            # BUG FIX: was a bare `except:`; connection refused/timeout just
            # means the server is not up on this endpoint.
            pass
    return False
def _auto_detect_server():
    """Detect an already-running vLLM server at app startup.

    Marks LOCAL_MODEL_CONFIG["server_ready"] and logs which model(s) the
    server has loaded. Returns True if a server was found.
    """
    port = LOCAL_MODEL_CONFIG["server_port"]
    if _probe_vllm_server(port):
        LOCAL_MODEL_CONFIG["server_ready"] = True
        print(f" ✅ vLLM server auto-detected on port {port}")
        # Best-effort: report the models the server actually serves.
        try:
            r = requests.get(f"http://localhost:{port}/v1/models", timeout=5)
            if r.status_code == 200:
                models = r.json().get("data", [])
                if models:
                    model_ids = [m.get("id", "?") for m in models]
                    print(f" 📦 Loaded models: {model_ids}")
        except Exception:
            # BUG FIX: was a bare `except:`; listing models is optional.
            pass
        return True
    return False
def _start_local_model_server(model_id=None, gpu_memory_utilization=0.95, max_model_len=4096):
    """Start local vLLM serving of Darwin-gpt-ernie-20b — automatic
    tensor-parallel sizing with OOM fallback.

    Tries a ladder of launch configurations (from most capable to most
    memory-frugal, chosen by detected GPU count) until one comes up, waiting
    up to 10 minutes per attempt. Returns a human-readable status string.

    NOTE(review): the ``gpu_memory_utilization`` and ``max_model_len``
    parameters are currently unused — the per-attempt configs below supply
    their own values; confirm whether callers rely on them.
    """
    global LOCAL_MODEL_CONFIG
    if model_id:
        LOCAL_MODEL_CONFIG["model_id"] = model_id
    mid = LOCAL_MODEL_CONFIG["model_id"]
    port = LOCAL_MODEL_CONFIG["server_port"]
    # Already running? Then just mark ready.
    if _probe_vllm_server(port):
        LOCAL_MODEL_CONFIG["server_ready"] = True
        LOCAL_MODEL_CONFIG["server_starting"] = False
        return f"✅ Local model server already running on port {port}"
    # Another call is already starting it → tell the caller to wait.
    if LOCAL_MODEL_CONFIG["server_starting"]:
        return "⏳ Server is starting... please wait"
    # GPU check — vLLM cannot run CPU-only here.
    if not _detect_gpu():
        return "❌ No GPU detected. vLLM requires GPU (nvidia-smi failed)"
    LOCAL_MODEL_CONFIG["server_starting"] = True
    gpu_count = LOCAL_MODEL_CONFIG.get("gpu_count", 1)
    total_vram = LOCAL_MODEL_CONFIG.get("total_vram_mb", 48000)
    print(f"🚀 Starting vLLM server for {mid}...")
    print(f" GPUs: {gpu_count}x | Total VRAM: {total_vram/1024:.1f} GB")
    try:
        # ★ First verify vLLM is importable at all (cheap subprocess probe).
        check = subprocess.run(["python", "-c", "import vllm; print(vllm.__version__)"],
                               capture_output=True, text=True, timeout=30)
        if check.returncode != 0:
            LOCAL_MODEL_CONFIG["server_starting"] = False
            return f"❌ vLLM not available: {check.stderr[:300]}"
        print(f" vLLM version: {check.stdout.strip()}")
        # ★ Candidate configs, ordered progressively toward lower memory use.
        # A 21B model in bf16 ≈ 42GB → a single 48GB GPU leaves little KV-cache room.
        configs = []
        if gpu_count >= 4:
            # 4 GPUs: TP=4, ~10.5GB of weights per GPU → ample KV cache
            configs.append({"tp": 4, "mem": 0.90, "maxlen": 8192, "dtype": "auto", "label": "TP=4, 8K ctx"})
            configs.append({"tp": 4, "mem": 0.95, "maxlen": 4096, "dtype": "auto", "label": "TP=4, 4K ctx"})
            configs.append({"tp": 2, "mem": 0.95, "maxlen": 4096, "dtype": "auto", "label": "TP=2, 4K ctx"})
        elif gpu_count >= 2:
            # 2 GPUs: TP=2, ~21GB of weights per GPU
            configs.append({"tp": 2, "mem": 0.95, "maxlen": 4096, "dtype": "auto", "label": "TP=2, 4K ctx"})
            configs.append({"tp": 2, "mem": 0.95, "maxlen": 2048, "dtype": "auto", "label": "TP=2, 2K ctx"})
        else:
            # 1 GPU @48GB: 21B bf16 is tight → short context (or fp16) fallbacks
            configs.append({"tp": 1, "mem": 0.95, "maxlen": 2048, "dtype": "auto", "label": "1GPU, 2K ctx"})
            configs.append({"tp": 1, "mem": 0.95, "maxlen": 1024, "dtype": "auto", "label": "1GPU, 1K ctx"})
            configs.append({"tp": 1, "mem": 0.95, "maxlen": 2048, "dtype": "half", "label": "1GPU, fp16, 2K"})
        for ci, cfg in enumerate(configs):
            print(f"\n 🔄 Attempt {ci+1}/{len(configs)}: {cfg['label']}")
            cmd = [
                "python", "-m", "vllm.entrypoints.openai.api_server",
                "--model", mid,
                "--port", str(port),
                "--gpu-memory-utilization", str(cfg["mem"]),
                "--max-model-len", str(cfg["maxlen"]),
                "--trust-remote-code",
                "--dtype", cfg["dtype"],
                "--enforce-eager",
            ]
            if cfg["tp"] > 1:
                cmd.extend(["--tensor-parallel-size", str(cfg["tp"])])
            print(f" CMD: {' '.join(cmd)}")
            # Own process group (setsid) so we can SIGTERM/SIGKILL vLLM and
            # all of its worker children together.
            # NOTE(review): PIPEs are only drained after exit; a very chatty
            # server could fill the pipe buffer — confirm in practice.
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                preexec_fn=os.setsid
            )
            LOCAL_MODEL_CONFIG["server_process"] = proc
            # Wait for readiness: 120 polls × 5s = up to 10 minutes.
            max_wait = 120
            started = False
            crashed = False
            crash_msg = ""
            for i in range(max_wait):
                time.sleep(5)
                if proc.poll() is not None:
                    # Process exited → capture tail of output for diagnosis.
                    stderr = proc.stderr.read().decode()[-2000:]
                    stdout = proc.stdout.read().decode()[-1000:]
                    crashed = True
                    crash_msg = stderr + stdout
                    # OOM → fall through to the next (smaller) config.
                    if "No available memory" in crash_msg or "CUDA out of memory" in crash_msg:
                        print(f" ⚠️ OOM with {cfg['label']} — trying next config...")
                    else:
                        print(f" ❌ Crashed (non-OOM): {crash_msg[-300:]}")
                    break
                if _probe_vllm_server(port):
                    started = True
                    LOCAL_MODEL_CONFIG["server_ready"] = True
                    LOCAL_MODEL_CONFIG["server_starting"] = False
                    LOCAL_MODEL_CONFIG["active_config"] = cfg
                    elapsed = (i+1)*5
                    print(f" ✅ vLLM server ready! Config: {cfg['label']} ({elapsed}s)")
                    return (f"✅ Server started: {mid}\n"
                            f"Config: {cfg['label']} | TP={cfg['tp']} | "
                            f"MaxLen={cfg['maxlen']} | {elapsed}s")
                if (i+1) % 12 == 0:
                    # Progress heartbeat every 60s.
                    print(f" ⏳ Still waiting... ({(i+1)*5}s)")
            if started:
                break  # defensive; success path returns above
            if not crashed:
                # Timed out without crashing → terminate the whole group.
                try:
                    os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
                    proc.wait(timeout=10)
                except: pass
                print(f" ⚠️ Timeout with {cfg['label']}")
            # Hard cleanup before the next attempt (free GPU memory).
            try:
                if proc.poll() is None:
                    os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
                    proc.wait(timeout=5)
            except: pass
            time.sleep(3)
        # Every configuration failed.
        LOCAL_MODEL_CONFIG["server_starting"] = False
        return (f"❌ All vLLM configurations failed on {gpu_count}x GPU.\n"
                f"Last error: {crash_msg[-300:]}\n"
                f"💡 Try: (1) Restart Space to free GPU memory, "
                f"(2) Use smaller max-model-len, "
                f"(3) Check model compatibility with vLLM {check.stdout.strip()}")
    except FileNotFoundError:
        LOCAL_MODEL_CONFIG["server_starting"] = False
        return "❌ vLLM not installed. Run: pip install vllm"
    except Exception as e:
        LOCAL_MODEL_CONFIG["server_starting"] = False
        return f"❌ Server start failed: {e}"
def _stop_local_model_server():
    """Stop the vLLM server process group started by _start_local_model_server.

    Returns a human-readable status string; always clears the ready/starting
    flags in LOCAL_MODEL_CONFIG.
    """
    global LOCAL_MODEL_CONFIG
    proc = LOCAL_MODEL_CONFIG.get("server_process")
    if proc and proc.poll() is None:
        try:
            # SIGTERM the whole process group (vLLM spawns worker children).
            os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
            proc.wait(timeout=10)
        except Exception:
            # BUG FIX: was a bare `except:`. Graceful shutdown failed
            # (group already gone, or wait timed out) → kill the parent.
            try:
                proc.kill()
            except Exception:
                pass
        LOCAL_MODEL_CONFIG["server_process"] = None
        LOCAL_MODEL_CONFIG["server_ready"] = False
        LOCAL_MODEL_CONFIG["server_starting"] = False
        return "⏹️ Local model server stopped"
    LOCAL_MODEL_CONFIG["server_ready"] = False
    LOCAL_MODEL_CONFIG["server_starting"] = False
    return "ℹ️ No server running"
def call_local_model(prompt, system="", max_tokens=8192, temperature=0.6):
    """Call the local vLLM server via its OpenAI-compatible chat API.

    Truncates the input to fit the active max_model_len budget, resolves the
    model name vLLM actually loaded, and retries up to 3 times on transient
    errors. Returns the completion text, or a string starting with
    "[LOCAL_ERROR]" on failure.
    """
    port = LOCAL_MODEL_CONFIG["server_port"]
    # ★ Auto re-detect: server_ready may be stale while a server is running.
    if not LOCAL_MODEL_CONFIG["server_ready"]:
        if _probe_vllm_server(port):
            LOCAL_MODEL_CONFIG["server_ready"] = True
            print(f" ✅ vLLM server re-detected on port {port}")
        else:
            # BUG FIX: this literal was missing its f-prefix, so "{port}" was
            # returned verbatim instead of the actual port number.
            return (f"[LOCAL_ERROR] vLLM server not running on port {port}. "
                    "Click '🚀 Start vLLM Server' or check that vLLM process is active.")
    headers = {"Content-Type": "application/json"}
    # ★ Resolve the model name vLLM actually loaded (cached after first call).
    model_name = LOCAL_MODEL_CONFIG.get("_actual_vllm_model")
    if not model_name:
        try:
            r = requests.get(f"http://localhost:{port}/v1/models", timeout=5)
            if r.status_code == 200:
                models = r.json().get("data", [])
                if models:
                    model_name = models[0].get("id", LOCAL_MODEL_CONFIG["model_id"])
                    LOCAL_MODEL_CONFIG["_actual_vllm_model"] = model_name
                    print(f" 📦 Using vLLM model: {model_name}")
        except Exception:
            # BUG FIX: was a bare `except:`; best-effort — fall back below.
            pass
    if not model_name:
        model_name = LOCAL_MODEL_CONFIG["model_id"]
    # ★ Input-length budget: input + output must fit inside max_model_len.
    active_cfg = LOCAL_MODEL_CONFIG.get("active_config", {})
    model_max_len = active_cfg.get("maxlen", 4096)
    # Roughly 4 chars ≈ 1 token; allow 60% of context for input, 40% for output.
    max_input_chars = int(model_max_len * 0.6 * 4)  # e.g., 4096 * 0.6 * 4 = 9830
    max_tokens = min(max_tokens, int(model_max_len * 0.4))
    total_input = (system or "") + prompt
    if len(total_input) > max_input_chars:
        if system and len(system) > max_input_chars // 3:
            system = system[:max_input_chars // 3] + "\n[...truncated...]"
        remaining = max_input_chars - len(system or "")
        if len(prompt) > remaining:
            prompt = prompt[:remaining] + "\n[...truncated for context length...]"
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    payload = {
        "model": model_name,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    for attempt in range(3):
        try:
            r = requests.post(
                f"http://localhost:{port}/v1/chat/completions",
                headers=headers,
                data=json.dumps(payload),
                timeout=600  # up to 10 minutes for long generations
            )
            if r.status_code == 400:
                err = ""
                try:
                    err = r.json().get("error", {}).get("message", r.text[:500])
                except Exception:
                    err = r.text[:500]
                print(f" ⚠️ vLLM 400 (attempt {attempt+1}): {err[:300]}")
                # ★ Token-limit error → halve max_tokens and retry.
                if "max_tokens" in str(err).lower() or "too many tokens" in str(err).lower():
                    payload["max_tokens"] = min(max_tokens // 2, 4096)
                    if attempt < 2:
                        continue
                # ★ Model-name mismatch → re-query /v1/models and retry.
                if "model" in str(err).lower():
                    try:
                        mr = requests.get(f"http://localhost:{port}/v1/models", timeout=5)
                        if mr.status_code == 200:
                            models = mr.json().get("data", [])
                            if models:
                                payload["model"] = models[0]["id"]
                                LOCAL_MODEL_CONFIG["_actual_vllm_model"] = models[0]["id"]
                                if attempt < 2:
                                    continue
                    except Exception:
                        pass
                return f"[LOCAL_ERROR] 400: {err[:300]}"
            if r.status_code == 503:
                # Server overloaded — back off and retry.
                print(f" ⏳ vLLM 503 overloaded (attempt {attempt+1})")
                if attempt < 2:
                    time.sleep(10 * (attempt+1))
                    continue
                return "[LOCAL_ERROR] Server overloaded (503)"
            r.raise_for_status()
            c = r.json()["choices"][0]["message"]["content"]
            return c if c else "[EMPTY]"
        except requests.exceptions.ConnectionError:
            print(f" ⚠️ vLLM connection error (attempt {attempt+1})")
            # Re-probe: transient blip vs. crashed server.
            if _probe_vllm_server(port):
                if attempt < 2:
                    time.sleep(3)
                    continue
            else:
                LOCAL_MODEL_CONFIG["server_ready"] = False
                return "[LOCAL_ERROR] Connection refused. vLLM server may have crashed."
        except requests.exceptions.ReadTimeout:
            print(f" ⚠️ vLLM timeout (attempt {attempt+1})")
            if attempt < 2:
                time.sleep(5)
                continue
            return "[LOCAL_ERROR] Request timeout (600s). Task may be too complex."
        except Exception as e:
            print(f" ⚠️ vLLM exception (attempt {attempt+1}): {str(e)[:200]}")
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[LOCAL_ERROR] {e}"
    # BUG FIX: some retry paths could exhaust the loop without returning
    # (implicit None); callers expect a string.
    return "[LOCAL_ERROR] Request failed after 3 attempts"
| # ═══════════════════════════════════════════════════════════ | |
| # ★ 앱 시작 시 vLLM 자동 감지 + 자동 시작 | |
| # ═══════════════════════════════════════════════════════════ | |
def _auto_boot_vllm():
    """Auto-start the vLLM server at app boot (meant for a background thread).

    Order: (1) skip if a server already answers, (2) skip without a GPU,
    (3) skip if vLLM is not importable, (4) otherwise launch the server and
    log the launch result.
    """
    # 1) Already running?
    if _auto_detect_server():
        return
    # 2) GPU required.
    if not _detect_gpu():
        print(" ℹ️ No GPU → vLLM auto-start skipped")
        return
    # 3) vLLM installed?
    try:
        check = subprocess.run(["python", "-c", "import vllm"],
                               capture_output=True, timeout=10)
        if check.returncode != 0:
            print(" ℹ️ vLLM not installed → auto-start skipped")
            return
    except Exception:
        # BUG FIX: was a bare `except:`. This probe is best-effort; any
        # failure (timeout, missing python) means we simply don't auto-start.
        return
    # 4) Launch.
    print(" 🚀 Auto-starting vLLM server for Darwin model...")
    LOCAL_MODEL_CONFIG["auto_start_attempted"] = True
    result = _start_local_model_server()
    print(f" 📋 Auto-start result: {result}")
# ★ Runs at app start (non-blocking: launch happens in a background thread).
print("🔍 Checking for local vLLM server...")
if _auto_detect_server():
    # A server is already up — nothing to start.
    print(" ✅ vLLM server found and ready!")
elif _detect_gpu():
    # GPU present but no server yet → boot vLLM without blocking app startup.
    print(" 🖥️ GPU available — will auto-start vLLM in background")
    threading.Thread(target=_auto_boot_vllm, daemon=True, name="vLLM-AutoBoot").start()
else:
    # CPU-only environment → remote (OpenAI / HF Inference) models only.
    print(" ℹ️ No GPU detected — using OpenAI/HF Inference models")
| def _test_hf_model(model_id, hf_token): | |
| """HF 모델 사전 검증 — 어떤 API 엔드포인트가 작동하는지 확인""" | |
| if not hf_token: | |
| return False, "no_token", "HF_TOKEN required" | |
| headers = {"Content-Type": "application/json", "Authorization": f"Bearer {hf_token}"} | |
| test_messages = [{"role": "user", "content": "Say OK"}] | |
| test_payload = {"model": model_id, "messages": test_messages, "max_tokens": 20, "stream": False} | |
| # 1) router.huggingface.co (Inference Providers — OpenAI-compatible) | |
| try: | |
| r = requests.post("https://router.huggingface.co/v1/chat/completions", | |
| headers=headers, data=json.dumps(test_payload), timeout=30) | |
| if r.status_code == 200: | |
| return True, "router", "OK" | |
| err = "" | |
| try: | |
| rj = r.json() | |
| e = rj.get("error", "") | |
| err = e.get("message", str(e)) if isinstance(e, dict) else str(e) | |
| except: | |
| try: err = r.text[:300] | |
| except: err = str(r.status_code) | |
| print(f" ℹ️ HF router test ({model_id}): {r.status_code} — {str(err)[:200]}") | |
| except Exception as e: | |
| print(f" ℹ️ HF router exception: {str(e)[:200]}") | |
| # 2) api-inference.huggingface.co (Serverless Inference API) | |
| try: | |
| inf_payload = {"inputs": "Say OK", "parameters": {"max_new_tokens": 20}} | |
| r2 = requests.post(f"https://api-inference.huggingface.co/models/{model_id}", | |
| headers=headers, data=json.dumps(inf_payload), timeout=30) | |
| if r2.status_code == 200: | |
| return True, "serverless", "OK" | |
| err2 = "" | |
| try: | |
| rj2 = r2.json() | |
| e2 = rj2.get("error", "") | |
| err2 = e2.get("message", str(e2)) if isinstance(e2, dict) else str(e2) | |
| except: | |
| try: err2 = r2.text[:300] | |
| except: err2 = str(r2.status_code) | |
| print(f" ℹ️ HF serverless test ({model_id}): {r2.status_code} — {str(err2)[:200]}") | |
| except Exception as e: | |
| print(f" ℹ️ HF serverless exception: {str(e)[:200]}") | |
| # 3) HF Inference Endpoints (dedicated) — 사용자 전용 엔드포인트 | |
| # 이건 URL이 다르므로 여기서는 스킵 | |
| return False, "none", f"Model {model_id} not available on HF Inference" | |
# Cache of discovered endpoint per model: {model_id: "router" | "serverless" | "none"}
_HF_ENDPOINT_CACHE = {}
def call_hf_inference(prompt, system="", model_id=None, hf_token=None, max_tokens=4096):
    """HuggingFace Inference API call with automatic endpoint discovery.

    Discovers (and caches) whether the model is reachable via the router
    (OpenAI-compatible) or the serverless Inference API, truncates long
    prompts, and retries transient failures. Returns the generated text or a
    string starting with "[HF_ERROR]".
    """
    mid = model_id or LOCAL_MODEL_CONFIG["model_id"]
    token = hf_token or os.getenv("HF_TOKEN", "")
    if not token:
        return "[HF_ERROR] HF_TOKEN required for Inference API"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"}
    # ★ Endpoint auto-discovery (cached per model).
    if mid not in _HF_ENDPOINT_CACHE:
        ok, endpoint_type, msg = _test_hf_model(mid, token)
        _HF_ENDPOINT_CACHE[mid] = endpoint_type
        if not ok:
            print(f" ❌ HF model {mid} not available: {msg}")
    endpoint_type = _HF_ENDPOINT_CACHE.get(mid, "none")
    # ★ Prompt-length guard (HF Inference enforces strict input-token limits).
    MAX_PROMPT_CHARS = 12000  # ~3000 tokens
    if len(prompt) > MAX_PROMPT_CHARS:
        prompt = prompt[:MAX_PROMPT_CHARS] + "\n\n[... truncated for API limits ...]"
    if system and len(system) > 2000:
        system = system[:2000]
    # ── Router (OpenAI-compatible) ──
    if endpoint_type == "router":
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        payload = {"model": mid, "messages": messages, "max_tokens": max_tokens, "stream": False}
        for attempt in range(3):
            try:
                r = requests.post("https://router.huggingface.co/v1/chat/completions",
                                  headers=headers, data=json.dumps(payload), timeout=300)
                if r.status_code == 429:
                    time.sleep(5 * (attempt + 1))
                    continue
                if r.status_code >= 400:
                    # BUG FIX: the JSON "error" field may be a dict; slicing it
                    # raised TypeError (silently retried). Coerce to str first.
                    try:
                        err_body = str(r.json().get("error", r.text[:500]))
                    except Exception:
                        err_body = r.text[:500]
                    print(f" ⚠️ HF router {r.status_code} (attempt {attempt+1}): {err_body[:200]}")
                    if attempt < 2:
                        # Shrink the prompt further before retrying.
                        if len(prompt) > 4000:
                            prompt = prompt[:4000] + "\n[... further truncated ...]"
                            payload["messages"][-1]["content"] = prompt
                        time.sleep(3 * (attempt + 1))
                        continue
                    return f"[HF_ERROR] {r.status_code}: {err_body[:200]}"
                r.raise_for_status()
                c = r.json()["choices"][0]["message"]["content"]
                return c if c else "[EMPTY]"
            except Exception as e:
                if attempt < 2:
                    time.sleep(3 * (attempt + 1))
                else:
                    return f"[HF_ERROR] router: {e}"
    # ── Serverless Inference API ──
    elif endpoint_type == "serverless":
        full_prompt = f"{system}\n\n{prompt}" if system else prompt
        payload = {
            "inputs": full_prompt,
            "parameters": {"max_new_tokens": min(max_tokens, 2048), "temperature": 0.6, "return_full_text": False}
        }
        for attempt in range(3):
            try:
                r = requests.post(f"https://api-inference.huggingface.co/models/{mid}",
                                  headers=headers, data=json.dumps(payload), timeout=300)
                if r.status_code == 429:
                    time.sleep(5 * (attempt + 1))
                    continue
                if r.status_code == 503:
                    # Model is cold-loading on HF's side.
                    est = 60
                    try:
                        est = r.json().get("estimated_time", 60)
                    except Exception:
                        pass
                    print(f" ⏳ Model loading... ETA {est}s")
                    if attempt < 2:
                        time.sleep(min(est, 30))
                        continue
                    return f"[HF_ERROR] Model still loading (ETA {est}s)"
                if r.status_code >= 400:
                    # BUG FIX: same dict-vs-str "error" issue as the router path.
                    try:
                        err_body = str(r.json().get("error", r.text[:500]))
                    except Exception:
                        err_body = r.text[:500]
                    print(f" ⚠️ HF serverless {r.status_code}: {err_body[:200]}")
                    if attempt < 2:
                        time.sleep(3 * (attempt + 1))
                        continue
                    return f"[HF_ERROR] {r.status_code}: {err_body[:200]}"
                r.raise_for_status()
                result = r.json()
                if isinstance(result, list) and result:
                    text = result[0].get("generated_text", "")
                elif isinstance(result, dict):
                    text = result.get("generated_text", "")
                else:
                    text = str(result)[:5000]
                return text if text else "[EMPTY]"
            except Exception as e:
                if attempt < 2:
                    time.sleep(3 * (attempt + 1))
                else:
                    return f"[HF_ERROR] serverless: {e}"
    # ── No endpoint works for this model ──
    else:
        return (f"[HF_ERROR] Model '{mid}' is not available on HF Inference API. "
                f"Possible reasons: (1) Model too large for serverless, (2) Not deployed as Inference Provider, "
                f"(3) Needs dedicated Inference Endpoint. Try using OpenAI model (gpt-4o) for evaluation instead.")
    # BUG FIX: repeated 429s could exhaust a retry loop and fall through,
    # returning implicit None; callers expect a string.
    return "[HF_ERROR] Request failed after 3 attempts"
| # ════════════════════════════════════════════════════════════════ | |
| # §2. Load Dataset | |
| # ════════════════════════════════════════════════════════════════ | |
def load_tasks():
    """Load FINAL Bench tasks from the first FINAL_Bench_v3.json found.

    Searches the working directory, the uploads mount, and the script's own
    directory; returns a list of FinalTask (empty if no file exists).
    """
    candidates = [
        "FINAL_Bench_v3.json",
        "/mnt/user-data/uploads/FINAL_Bench_v3.json",
        os.path.join(os.path.dirname(os.path.abspath(__file__)), "FINAL_Bench_v3.json"),
    ]
    data = None
    for path in candidates:
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as fh:
                data = json.load(fh)
            print(f" Loaded from {path}")
            break
    if data is None:
        print("⚠️ FINAL_Bench_v3.json not found — FINAL tasks empty, SWE-bench only mode")
        return []
    tasks = []
    for t in data["tasks"]:
        tasks.append(FinalTask(
            task_id=t["task_id"],
            domain=t["domain"],
            grade=t["grade"],
            ticos_type=t["ticos_type"],
            difficulty=t["difficulty"],
            lens=t.get("lens", ""),
            title=t["title"],
            prompt=t["prompt"],
            expected_behavior=t.get("expected_behavior", ""),
            hidden_trap=t.get("hidden_trap", ""),
            ticos_required=t.get("ticos_required", []),
            metadata=t.get("metadata", {}),
        ))
    return tasks
# Load FINAL Bench tasks at import time; degrade to an empty list on failure.
try:
    ALL_TASKS = load_tasks()
    print(f"✅ FINAL Bench v3.0: {len(ALL_TASKS)} tasks")
except Exception:
    # BUG FIX: was a bare `except:` which also swallowed
    # KeyboardInterrupt/SystemExit during startup.
    ALL_TASKS = []
    print("⚠️ FINAL tasks: 0 (SWE-bench only mode)")
# Load SWE-bench (synchronous, at import time — the original comment claimed
# this was asynchronous, but it blocks until the dataset is converted).
SWE_BENCH_TASKS = _load_swe_bench_verified()
| # ════════════════════════════════════════════════════════════════ | |
| # §3. Model API (OpenAI + Local + HF) | |
| # ════════════════════════════════════════════════════════════════ | |
# Hosted OpenAI evaluation models: API model id → display label.
OPENAI_MODELS = {
    "gpt-5.2":             "GPT-5.2 (flagship)",
    "gpt-5.2-chat-latest": "GPT-5.2 Instant",
    "gpt-5-mini":          "GPT-5 Mini",
    "o4-mini":             "o4-mini",
    "gpt-4.1":             "GPT-4.1",
}

# Locally-evaluated Darwin model, reachable through three different backends.
# Each entry: id (endpoint/model id), type (dispatch key), desc (display text),
# plus backend-specific extras.
LOCAL_MODELS = {
    "Darwin-gpt-ernie-20b (Friendli)": {
        "id": "deppfs281rgffnk",
        "type": "friendli",
        "desc": "21B MoE (Friendli Dedicated Endpoint)",
        "api_url": "https://api.friendli.ai/dedicated/v1/chat/completions",
    },
    "Darwin-gpt-ernie-20b (Local vLLM)": {
        "id": "seawolf2357/Darwin-gpt-ernie-20b",
        "type": "local_vllm",
        "desc": "21B MoE (Local vLLM, GPU required)"
    },
    "Darwin-gpt-ernie-20b (HF Inference)": {
        "id": "seawolf2357/Darwin-gpt-ernie-20b",
        "type": "hf_inference",
        "desc": "HuggingFace Inference API (HF_TOKEN required)"
    },
}

# Flat {name: description} registry used by the UI model picker.
ALL_EVAL_MODELS = {
    **OPENAI_MODELS,
    **{name: meta["desc"] for name, meta in LOCAL_MODELS.items()},
}
| # ════════════════════════════════════════════════════════════════ | |
| # §3.6 Friendli AI Dedicated Endpoint | |
| # ════════════════════════════════════════════════════════════════ | |
| def _test_friendli(model_id=None): | |
| """Friendli API 연결 테스트""" | |
| token = os.getenv("FRIENDLI_TOKEN", "") | |
| if not token: | |
| return False, "FRIENDLI_TOKEN not set" | |
| mid = model_id or "deppfs281rgffnk" | |
| headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"} | |
| payload = { | |
| "model": mid, | |
| "messages": [{"role": "user", "content": "Say OK"}], | |
| "max_tokens": 10, | |
| "temperature": 0, | |
| "stream": False, | |
| } | |
| try: | |
| r = requests.post("https://api.friendli.ai/dedicated/v1/chat/completions", | |
| headers=headers, json=payload, timeout=30) | |
| if r.status_code == 200: | |
| text = r.json()["choices"][0]["message"]["content"] | |
| return True, f"OK ({text[:20]})" | |
| err = "" | |
| try: err = r.json().get("error", {}).get("message", r.text[:300]) | |
| except: err = r.text[:300] | |
| return False, f"{r.status_code}: {err[:200]}" | |
| except Exception as e: | |
| return False, f"Connection error: {str(e)[:200]}" | |
def call_friendli(prompt, system="", model_id=None, max_tokens=8192, temperature=0.6):
    """Call the Friendli AI dedicated endpoint (OpenAI-compatible chat API).

    Retries up to 3 times on rate limits / transient failures, halving
    max_tokens or truncating the prompt on token-limit 400s. Returns the
    completion text, "[EMPTY]", or a "[FRIENDLI_ERROR] ..." string — never
    raises.
    """
    token = os.getenv("FRIENDLI_TOKEN", "")
    if not token:
        return "[FRIENDLI_ERROR] FRIENDLI_TOKEN not set in environment"
    mid = model_id or "deppfs281rgffnk"
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    payload = {
        "model": mid,
        "messages": messages,
        "max_tokens": min(max_tokens, 16384),  # endpoint-side cap
        "temperature": temperature,
        "top_p": 0.95,
        "stream": False,
    }
    for attempt in range(3):
        try:
            r = requests.post("https://api.friendli.ai/dedicated/v1/chat/completions",
                              headers=headers, json=payload, timeout=600)
            if r.status_code == 429:
                wait = 5 * (attempt + 1)
                print(f" ⏳ Friendli rate limit, waiting {wait}s...")
                time.sleep(wait)
                continue
            if r.status_code == 400:
                try:
                    err = r.json().get("error", {}).get("message", r.text[:500])
                except Exception:  # non-JSON error body (was a bare except)
                    err = r.text[:500]
                print(f" ⚠️ Friendli 400 (attempt {attempt+1}): {err[:200]}")
                # Token-limit error → halve max_tokens and retry
                if "max_tokens" in str(err).lower() or "too many" in str(err).lower():
                    payload["max_tokens"] = min(payload["max_tokens"] // 2, 4096)
                    if attempt < 2: continue
                # Input-length error → truncate the prompt and retry
                if "input" in str(err).lower() and ("length" in str(err).lower() or "token" in str(err).lower()):
                    cur_len = len(messages[-1]["content"])
                    messages[-1]["content"] = messages[-1]["content"][:cur_len // 2] + "\n[...truncated...]"
                    payload["messages"] = messages
                    if attempt < 2: continue
                return f"[FRIENDLI_ERROR] 400: {err[:200]}"
            if r.status_code >= 500:
                print(f" ⚠️ Friendli {r.status_code} (attempt {attempt+1})")
                if attempt < 2: time.sleep(3 * (attempt + 1)); continue
                return f"[FRIENDLI_ERROR] Server error {r.status_code}"
            r.raise_for_status()
            c = r.json()["choices"][0]["message"]["content"]
            return c if c else "[EMPTY]"
        except requests.exceptions.Timeout:
            print(f" ⚠️ Friendli timeout (attempt {attempt+1})")
            if attempt < 2: time.sleep(5); continue
            return "[FRIENDLI_ERROR] Request timeout (600s)"
        except requests.exceptions.ConnectionError:
            print(f" ⚠️ Friendli connection error (attempt {attempt+1})")
            if attempt < 2: time.sleep(3 * (attempt + 1)); continue
            return "[FRIENDLI_ERROR] Connection failed"
        except Exception as e:
            print(f" ⚠️ Friendli exception (attempt {attempt+1}): {str(e)[:200]}")
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[FRIENDLI_ERROR] {e}"
    # BUG FIX: the original implicitly returned None when all three attempts
    # were consumed by 429/retry `continue` paths.
    return "[FRIENDLI_ERROR] Rate limited: retries exhausted"
| def _strip_think(text): | |
| if not text: return text | |
| for tag in ['think','thinking','reasoning','reflection']: | |
| text = re.sub(rf'<{tag}>.*?</{tag}>','',text,flags=re.DOTALL) | |
| return text.strip() | |
def call_model(prompt, system="", api_key="", model="gpt-5.2",
               max_tokens=8192, temperature=0.6, reasoning_effort=None):
    """Unified model call — routes to Friendli / local vLLM / HF Inference / OpenAI.

    Local/dedicated models are looked up in LOCAL_MODELS; anything else (or an
    unknown backend type) goes to the OpenAI API.
    """
    minfo = LOCAL_MODELS.get(model)
    if minfo is not None:
        backend = minfo["type"]
        if backend == "friendli":
            return call_friendli(prompt, system=system,
                                 model_id=minfo["id"],
                                 max_tokens=max_tokens, temperature=temperature)
        if backend == "local_vllm":
            return call_local_model(prompt, system=system,
                                    max_tokens=max_tokens, temperature=temperature)
        if backend == "hf_inference":
            return call_hf_inference(prompt, system=system,
                                     model_id=minfo["id"],
                                     max_tokens=min(max_tokens, 4096))
    # Default: OpenAI API
    return call_openai(prompt, system=system, api_key=api_key, model=model,
                       max_tokens=max_tokens, temperature=temperature,
                       reasoning_effort=reasoning_effort)
| def _test_api_connection(api_key, model="gpt-4o-mini"): | |
| """API 연결 + 모델 유효성 빠른 테스트""" | |
| if not api_key: | |
| return False, "❌ API key is empty" | |
| headers = {"Content-Type":"application/json","Authorization":f"Bearer {api_key}"} | |
| payload = {"model":model,"max_completion_tokens":50,"temperature":0, | |
| "messages":[{"role":"user","content":"Say OK"}]} | |
| try: | |
| r = requests.post("https://api.openai.com/v1/chat/completions", | |
| headers=headers,data=json.dumps(payload),timeout=30) | |
| if r.status_code == 200: | |
| return True, f"✅ {model} OK" | |
| err = "" | |
| try: err = r.json().get("error",{}).get("message", r.text[:200]) | |
| except: err = str(r.status_code) | |
| return False, f"❌ {r.status_code}: {err}" | |
| except Exception as e: | |
| return False, f"❌ Connection failed: {e}" | |
# ── Model-name fallback map (unavailable model → ordered list of real
#    alternatives to try, most-preferred first) ──
MODEL_FALLBACK = {
    "gpt-5.2": ["gpt-5.2", "gpt-4.1", "gpt-4o", "gpt-4o-mini"],
    "gpt-5.2-chat-latest": ["gpt-5.2-chat-latest", "gpt-4.1", "gpt-4o"],
    "gpt-5-mini": ["gpt-5-mini", "gpt-4o-mini", "gpt-4o"],
    "o4-mini": ["o4-mini", "o3-mini", "gpt-4o-mini"],
    "gpt-4.1": ["gpt-4.1", "gpt-4o", "gpt-4o-mini"],
}
_VERIFIED_MODELS = {}  # cache: {requested_model: actual_working_model}
def _resolve_model(model, api_key):
    """Return a verified, working model name for `model`, caching the result.

    Order: the requested model itself → its MODEL_FALLBACK chain →
    gpt-4o-mini → give up and return the requested name unchanged.
    """
    cached = _VERIFIED_MODELS.get(model)
    if cached is not None:
        return cached
    ok, msg = _test_api_connection(api_key, model)
    if ok:
        _VERIFIED_MODELS[model] = model
        print(f" ✅ Model verified: {model}")
        return model
    # Walk the fallback chain
    for candidate in MODEL_FALLBACK.get(model, []):
        if candidate == model:
            continue
        cand_ok, _ = _test_api_connection(api_key, candidate)
        if cand_ok:
            _VERIFIED_MODELS[model] = candidate
            print(f" ⚠️ Model {model} unavailable → fallback to {candidate}")
            return candidate
    # Last resort
    last_ok, _ = _test_api_connection(api_key, "gpt-4o-mini")
    if last_ok:
        _VERIFIED_MODELS[model] = "gpt-4o-mini"
        print(f" ⚠️ All fallbacks failed → using gpt-4o-mini")
        return "gpt-4o-mini"
    _VERIFIED_MODELS[model] = model
    print(f" ❌ No working model found for {model}: {msg}")
    return model
def call_openai(prompt, system="", api_key="", model="gpt-5.2",
                max_tokens=8192, temperature=0.6, reasoning_effort=None):
    """OpenAI chat call with automatic model verification/fallback and
    parameter-compatibility self-repair.

    On a 400, retries after stripping/renaming parameters some models reject
    (max_completion_tokens → max_tokens, reasoning_effort, temperature).
    Returns the completion text, "[EMPTY]", or an "[API_ERROR] ..." string —
    never raises.
    """
    # Verify the requested model up front; fall back if it is unavailable.
    actual_model = _resolve_model(model, api_key)
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    payload = {"model": actual_model, "max_completion_tokens": max_tokens,
               "temperature": temperature, "messages": messages}
    if reasoning_effort:  # only some models support this parameter
        payload["reasoning_effort"] = reasoning_effort
    for attempt in range(3):
        try:
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=headers, data=json.dumps(payload), timeout=300)
            if r.status_code == 429:  # rate limited — back off and retry
                time.sleep(5 * (attempt + 1)); continue
            if r.status_code == 400:
                try:
                    err_msg = r.json().get("error", {}).get("message", "")
                except Exception:  # non-JSON error body (was a bare except)
                    err_msg = str(r.status_code)
                print(f" ⚠️ 400 Error (attempt {attempt+1}): {err_msg[:200]}")
                # ★ Self-repair known parameter incompatibilities, then retry
                if "max_completion_tokens" in err_msg:
                    payload.pop("max_completion_tokens", None)
                    payload["max_tokens"] = max_tokens
                if "reasoning_effort" in err_msg or "not supported" in err_msg.lower():
                    payload.pop("reasoning_effort", None)
                if "temperature" in err_msg:
                    payload["temperature"] = 1  # reasoning models may reject custom temperature
                if attempt < 2:
                    time.sleep(2); continue
                return f"[API_ERROR] 400: {err_msg[:200]}"
            r.raise_for_status()
            c = r.json()["choices"][0]["message"]["content"]
            return c if c else "[EMPTY]"
        except requests.exceptions.HTTPError:
            try:
                err = r.json().get("error", {}).get("message", "")
            except Exception:  # non-JSON error body (was a bare except)
                err = str(r.status_code)
            print(f" ⚠️ HTTP Error (attempt {attempt+1}): {err[:200]}")
            if attempt < 2: time.sleep(3 * (attempt + 1)); continue
            return f"[API_ERROR] {err}"
        except Exception as e:
            print(f" ⚠️ Exception (attempt {attempt+1}): {str(e)[:200]}")
            if attempt < 2:
                time.sleep(3 * (attempt + 1))
            else:
                return f"[API_ERROR] {e}"
    # BUG FIX: the original implicitly returned None when all three attempts
    # were consumed by the 429 `continue` path.
    return "[API_ERROR] Rate limited: retries exhausted"
| # ════════════════════════════════════════════════════════════════ | |
| # §3.5 Proto-AGI 오행 멀티에이전트 파이프라인 | |
| # ════════════════════════════════════════════════════════════════ | |
# 5x5 magic square (each row sums to 65) used as the raw inter-agent
# communication weights.
MAGIC_SQUARE_5x5 = np.array([
    [17,24,1,8,15],[23,5,7,14,16],[4,6,13,20,22],[10,12,19,21,3],[11,18,25,2,9]
], dtype=np.float64)
# Row-normalized (each row sums to 1): COMM_MATRIX[i][j] is how much weight
# agent i gives to agent j's previous output (used in _build_agent_prompt).
COMM_MATRIX = MAGIC_SQUARE_5x5 / MAGIC_SQUARE_5x5.sum(axis=1, keepdims=True)
# Per-agent max_tokens budgets; the 火 main solver (and its continuation
# pass "火_이어쓰기") gets by far the largest share.
AGENT_BUDGETS = {
    "木_발상": 512,
    "火_실행": 65536,
    "火_이어쓰기": 32768,
    "土_판단": 512,
    "金_비평": 2048,
    "水_정제": 4096,
}
# Per-agent OpenAI reasoning_effort value (None = omit the parameter).
AGENT_REASONING = {
    "木_발상": "low",
    "火_실행": None,
    "火_이어쓰기": None,
    "土_판단": "low",
    "金_비평": "medium",
    "水_정제": "medium",
}
# Five-element (오행) agent roster. Each agent carries: its role prompt, its
# element, its row index into COMM_MATRIX, the element that "generates" it
# (sheng_from, 상생) and the element it "overcomes" (ke_target, 상극).
PROTO_AGENTS = {
    "木_발상": {
        "role": "Ideation (木/仁). 3 bullets MAX, 100 words total. "
                "What makes this AGI-hard? Key traps? Core angles?",
        "element":"木","index":2,"sheng_from":"水","ke_target":"土",
    },
    "火_실행": {
        "role": "★ MAIN SOLVER (火/禮). You write THE COMPLETE FINAL ANSWER. "
                "Complete ALL numbered requirements — check each off. "
                "State confidence (0-100%) per major claim. "
                "★ MANDATORY SELF-CHECK at the END of your answer: "
                "Write 2-3 [BACKTRACK] corrections reviewing your own claims: "
                "'[BACKTRACK-1] I adjust X because Y. Corrected: Z.' "
                "'[BACKTRACK-2] I refine A because B. Corrected: C.' "
                "Find genuine improvements — qualify overconfident claims, fix edge cases, "
                "or add missing nuance. This is REQUIRED for scoring. "
                "NEVER stop mid-sentence. Be concise but COMPLETE all requirements.",
        "element":"火","index":3,"sheng_from":"木","ke_target":"金",
    },
    "土_판단": {
        "role": "Auditor (土/信). ONE paragraph only. "
                "List: (1) missing requirements (2) overconfident claims (3) domain drift. Max 80 words.",
        "element":"土","index":4,"sheng_from":"火","ke_target":"水",
    },
    "金_비평": {
        "role": "Verifier (金/義). Use STRUCTURED fix tags. For each error found: "
                "[FIX-1] error description → correction. [FIX-2] ... Max 5 fixes. "
                "Also: [TRAP-CHECK] verify hidden traps. [HALLUCINATION] tag unverifiable claims.",
        "element":"金","index":0,"sheng_from":"土","ke_target":"木",
    },
    "水_정제": {
        "role": "Correction Agent (水/智). You apply 金's fixes to 火's answer. "
                "For EACH [FIX-n] from 金, write [APPLIED-n] with the specific correction. "
                "Also cool any overconfident claims (水克火). "
                "End with confidence summary table + 2 most uncertain points. "
                "Do NOT rewrite 火's entire answer — only write corrections and additions.",
        "element":"水","index":1,"sheng_from":"金","ke_target":"火",
    },
}
# Fixed execution order: Wood → Fire → Earth → Metal → Water.
AGENT_ORDER = ["木_발상","火_실행","土_판단","金_비평","水_정제"]
# Display emoji per element (transcript headers).
AGENT_EMOJIS = {"木":"🌳","火":"🔥","土":"🏔️","金":"⚔️","水":"💧"}
# Per-element instruction suffixes appended to each agent's system prompt in
# _build_agent_prompt (keyed by element, not by agent name).
FINAL_AGENT_INSTRUCTIONS = {
    "木": "\n[BUDGET: 100 words MAX] 3 bullets: (1) core trap (2) key contradiction (3) best angle.",
    "火": "\n★★★ YOU ARE THE FINAL ANSWER. THE JUDGE SCORES YOUR TEXT. ★★★\n"
          "Complete ALL numbered requirements from the task. Check each off.\n"
          "Confidence per major claim.\n"
          "CRITICAL: Finish EVERY section. NEVER stop mid-sentence.\n"
          "If running long, be CONCISE but COMPLETE all requirements.\n"
          "PRIORITY: completeness > depth.\n"
          "\n★ MANDATORY SELF-CHECK (REQUIRED — at the very end of your answer):\n"
          "Review your answer and write 2-3 [BACKTRACK] corrections:\n"
          "[BACKTRACK-1] I adjust [claim] because [reason]. Corrected: [new version].\n"
          "[BACKTRACK-2] I refine [claim] because [reason]. Corrected: [new version].\n"
          "[BACKTRACK-3] (optional) ...\n"
          "These must be GENUINE improvements, not cosmetic. Examples:\n"
          "- Qualify an overconfident claim (90%→70%)\n"
          "- Fix an edge case you missed\n"
          "- Add a missing perspective or caveat\n"
          "The Judge gives ZERO for error_recovery without [BACKTRACK] tags.",
    "土": "\n[BUDGET: 80 words MAX] Checklist only: □missing items □overconfidence □drift.",
    "金": "\n[BUDGET: MAX 5 FIXES] Format strictly as:\n"
          "[FIX-1] Problem → Correction\n[FIX-2] Problem → Correction\n...\n"
          "[TRAP-CHECK] Hidden trap found? Y/N + detail\n[HALLUCINATION] Any? Y/N + which claims",
    "水": "\nYou MUST produce [APPLIED-n] for EVERY [FIX-n] from 金.\n"
          "Format:\n"
          "[APPLIED-1] Fixed: (what was wrong → what it should be, 1-2 sentences)\n"
          "[APPLIED-2] Fixed: ...\n"
          "[NO-FIXES-NEEDED] if 金 found no errors.\n"
          "\n## Confidence Adjustments\n"
          "(List 火's overclaimed items with corrected confidence, max 3)\n"
          "\n## Top-2 Uncertainties\n"
          "(Most uncertain claims and why)\n"
          "\nMAX 600 words. Focus on corrections, not repetition.",
}
| def _count_requirements(prompt_text): | |
| count = 0 | |
| for line in prompt_text.split('\n'): | |
| stripped = line.strip() | |
| if stripped and len(stripped) > 3: | |
| if (stripped[0].isdigit() and len(stripped) > 2 and stripped[1] in '.):') or \ | |
| (stripped.startswith('(') and len(stripped) > 3 and stripped[1].isdigit()): | |
| count += 1 | |
| return count | |
| def _detect_truncation(text): | |
| if not text or len(text) < 100: | |
| return True | |
| if '[BACKTRACK' not in text: | |
| return True | |
| return False | |
def _build_agent_prompt(agent_name, task_prompt, prev_outputs, task=None):
    """Assemble (system_prompt, user_prompt) for one five-element agent.

    Injects excerpts of the generating (상생) and overcoming (상극) agents'
    prior outputs, COMM_MATRIX-weighted excerpts of the other agents, the
    per-element instruction suffix, and — for the 火 solver on complex tasks —
    an adaptive conciseness instruction.
    """
    info = PROTO_AGENTS[agent_name]
    elem = info['element']
    idx = info['index']
    # Agent whose element "generates" (sheng) this one — quote up to 200 chars.
    sheng_agent = [a for a in AGENT_ORDER if PROTO_AGENTS[a]['element']==info['sheng_from']]
    sheng_ref = ""
    if sheng_agent and sheng_agent[0] in prev_outputs:
        sheng_ref = f"\n[상생 {sheng_agent[0]}] {prev_outputs[sheng_agent[0]][:200]}\n"
    # Agent this one "overcomes" (ke) — shorter 150-char excerpt.
    ke_agent = [a for a in AGENT_ORDER if PROTO_AGENTS[a]['element']==info['ke_target']]
    ke_ref = ""
    if ke_agent and ke_agent[0] in prev_outputs:
        ke_ref = f"\n[상극 {ke_agent[0]}] {prev_outputs[ke_agent[0]][:150]}\n"
    # Magic-square communication weights: only quote agents whose weight >= 0.15.
    weights = COMM_MATRIX[idx]
    comm_lines = []
    for aname in AGENT_ORDER:
        if aname in prev_outputs and aname != agent_name:
            w = weights[PROTO_AGENTS[aname]['index']]
            if w >= 0.15:
                trunc = min(len(prev_outputs[aname]), 150)
                comm_lines.append(f"[{aname}] {prev_outputs[aname][:trunc]}")
    # 火 (main solver) switches to concise mode on requirement-heavy or
    # multi-perspective tasks so it can cover everything within budget.
    adaptive_instruction = ""
    if elem == "火" and task:
        req_count = _count_requirements(task.prompt)
        prompt_lower = task.prompt.lower()
        is_multi_perspective = any(kw in prompt_lower for kw in [
            'traditions debate', 'positions debate', 'expert panel',
            'each position at maximum depth', 'each response at maximum depth',
            'develop each', 'each tradition'
        ])
        if req_count >= 7 or is_multi_perspective:
            adaptive_instruction = (
                f"\n\n★ COMPLEX TASK detected ({req_count} requirements"
                f"{', multi-perspective' if is_multi_perspective else ''}).\n"
                "Use CONCISE mode: max 100 words per perspective/requirement.\n"
                "Do NOT elaborate beyond what's strictly needed.\n"
                "Cover ALL requirements and perspectives BRIEFLY rather than some deeply.\n"
                "Structure: address each requirement with 2-3 focused sentences, then move on."
            )
    sys_prompt = (
        f"You are {agent_name} — {info['role']}\n"
        f"{sheng_ref}{ke_ref}"
        f"{FINAL_AGENT_INSTRUCTIONS.get(elem, '')}"
        f"{adaptive_instruction}"
    )
    if task:
        sys_prompt += f"\nTICOS: {task.ticos_type} | {task.domain} | {task.difficulty}"
    usr_prompt = task_prompt
    if comm_lines:
        usr_prompt += "\n\n[Previous Agents]\n" + "\n".join(comm_lines)
    return sys_prompt, usr_prompt
| def _strip_framework_noise(text): | |
| if not text: return text | |
| for pat in [r'(?:마방진|상생|상극|오행)[\s\w]{0,30}(?:구조|원리|체계)', | |
| r'Proto-AGI[\s\w]{0,60}[\.。]', | |
| r'(?:저는|나는)\s*(?:Proto-AGI|오행|木_|火_|土_|金_|水_)[\s\w]{0,60}[\.。]']: | |
| text = re.sub(pat, '', text, flags=re.IGNORECASE) | |
| return re.sub(r'\n{3,}', '\n\n', text).strip() | |
def run_proto_agi_pipeline(task_prompt, api_key, eval_model, task=None):
    """★ Proto-AGI v2.8 — run the five agents in AGENT_ORDER via call_model().

    Returns (combined_transcript, {agent_name: output}). The 火 solver gets a
    single continuation pass (the "火_이어쓰기" budget) when its answer looks
    truncated; every agent except 水 has framework meta-talk stripped.
    """
    prev_outputs = {}
    budgets = dict(AGENT_BUDGETS)
    # Adjust token budgets when running on a local / dedicated model
    is_local = eval_model in LOCAL_MODELS
    if is_local:
        minfo = LOCAL_MODELS.get(eval_model, {})
        if minfo.get("type") == "friendli":
            # Friendli: supports max_tokens up to 16384 → generous budgets
            budgets["火_실행"] = min(budgets["火_실행"], 12288)
            budgets["火_이어쓰기"] = min(budgets["火_이어쓰기"], 8192)
            budgets["金_비평"] = min(budgets["金_비평"], 2048)
            budgets["水_정제"] = min(budgets["水_정제"], 4096)
        else:
            # Local vLLM: constrained by the model's context length
            active_cfg = LOCAL_MODEL_CONFIG.get("active_config", {})
            model_maxlen = active_cfg.get("maxlen", 4096)
            max_output = int(model_maxlen * 0.4)  # cap output at 40% of context
            budgets["木_발상"] = min(budgets["木_발상"], 256)
            budgets["火_실행"] = min(budgets["火_실행"], max_output)
            budgets["火_이어쓰기"] = min(budgets["火_이어쓰기"], max_output)
            budgets["土_판단"] = min(budgets["土_판단"], 256)
            budgets["金_비평"] = min(budgets["金_비평"], min(512, max_output))
            budgets["水_정제"] = min(budgets["水_정제"], min(1024, max_output))
    for agent_name in AGENT_ORDER:
        sys_p, usr_p = _build_agent_prompt(agent_name, task_prompt, prev_outputs, task)
        # reasoning_effort only applies to OpenAI-hosted models
        re_effort = AGENT_REASONING.get(agent_name) if not is_local else None
        resp = call_model(usr_p, system=sys_p, api_key=api_key, model=eval_model,
                          max_tokens=budgets[agent_name],
                          reasoning_effort=re_effort)
        # One continuation pass for the main solver if its answer looks cut off
        if agent_name == "火_실행" and _detect_truncation(resp):
            last_chunk = resp[-500:] if len(resp) > 500 else resp
            cont_prompt = (
                f"You were writing an answer but it was CUT OFF. "
                f"Here is the end of what you wrote:\n\n"
                f"---\n{last_chunk}\n---\n\n"
                f"CONTINUE from exactly where you stopped. "
                f"Complete ALL remaining requirements. "
                f"Include your [BACKTRACK] self-check section at the end.\n"
                f"Do NOT repeat what was already written."
            )
            cont_sys = (
                f"You are 火_실행 — MAIN SOLVER continuing an interrupted answer.\n"
                f"TICOS: {task.ticos_type if task else ''} | {task.domain if task else ''}\n"
                f"Original task:\n{task_prompt[:1500]}"
            )
            cont_resp = call_model(cont_prompt, system=cont_sys, api_key=api_key,
                                   model=eval_model,
                                   max_tokens=budgets.get("火_이어쓰기", 4096),
                                   reasoning_effort=AGENT_REASONING.get("火_이어쓰기") if not is_local else None)
            # Only append the continuation if it did not itself error out
            if cont_resp and not cont_resp.startswith("[API_ERROR") and not cont_resp.startswith("[LOCAL_ERROR"):
                resp = resp + "\n\n" + cont_resp
        # 水 keeps its raw output (its correction tags are needed verbatim);
        # every other agent has framework meta-talk stripped.
        if agent_name != "水_정제":
            prev_outputs[agent_name] = _strip_framework_noise(resp)
        else:
            prev_outputs[agent_name] = resp
    combined = []
    for aname in AGENT_ORDER:
        if aname in prev_outputs:
            emoji = AGENT_EMOJIS.get(PROTO_AGENTS[aname]['element'], "")
            combined.append(f"{'='*40}\n{emoji} [{aname}]\n{'='*40}\n{prev_outputs[aname]}")
    return "\n\n".join(combined), prev_outputs
def compress_for_judge(prev_outputs):
    """Condense the multi-agent transcript into a compact judge-facing text:
    the 火 answer, 水's applied corrections, and 金's tagged verification lines."""
    sections = []
    answer = prev_outputs.get("火_실행", "")
    sections.append(f"[ANSWER]\n{answer[:14000]}")
    corrections = prev_outputs.get("水_정제", "")
    if corrections and '[APPLIED' in corrections:
        applied = [ln for ln in corrections.split('\n')
                   if '[APPLIED' in ln or 'Fixed:' in ln or 'Corrected:' in ln]
        if applied:
            sections.append("\n[ADDITIONAL CORRECTIONS]\n" + "\n".join(applied[:8]))
    elif corrections:
        sections.append(f"\n[CORRECTIONS]\n{corrections[:1500]}")
    critique = prev_outputs.get("金_비평", "")
    tagged = [ln for ln in critique.split('\n')
              if any(tag in ln for tag in ('[FIX-', '[TRAP-', '[HALLUCINATION'))]
    if tagged:
        sections.append("\n[VERIFICATION]\n" + "\n".join(tagged[:8]))
    return "\n".join(sections)
| # ════════════════════════════════════════════════════════════════ | |
| # §4. Structured Judge (GPT-5.2 — 항상 OpenAI) | |
| # ════════════════════════════════════════════════════════════════ | |
# System prompt for the judge model: fixed five-rubric scheme, TICOS bonus
# definitions, and the multi-agent transcript conventions produced by
# compress_for_judge. NOTE(review): the percentages below presumably mirror
# the RUBRIC weight mapping used by compute_task_score — verify they agree.
# Output contract: JSON {"scores": {...}, "comment": "..."}.
JUDGE_SYSTEM = """You are a FINAL Bench judge for AGI-Level Verification.
Score each rubric using ONLY: 0.0 / 0.25 / 0.5 / 0.75 / 1.0
RUBRIC:
process_quality (25%): Systematic step-by-step reasoning. Complete answers score higher.
metacognitive_accuracy (25%): Confidence calibration. Overconfidence=0.25 max. Honest uncertainty=0.75+
error_recovery (20%): EXPLICIT self-correction. Score 0.5+ if ANY of these exist: [BACKTRACK-n] numbered corrections in the answer, [APPLIED-n] correction tags, or explicit mid-chain corrections. Score 0.75 if 2+ genuine corrections are well-executed.
integration_depth (15%): Multi-perspective synthesis + emergent insights
final_correctness (15%): Answer accuracy and completeness. INCOMPLETE/TRUNCATED answers get 0.25 max.
TICOS BONUSES:
A_TrapEscape: ID'd ALL hidden traps? Challenged false premises?
B_ContradictionResolution: Resolved both sides? Found shared error?
C_ProgressiveDiscovery: Revised earlier stages with new info?
D_MultiConstraint: Mapped ALL conflicts? Creative tradeoffs?
E_SelfCorrecting: EXPLICIT backtrack+correct mid-chain?
F_ExpertPanel: Max-depth per perspective? Surprising convergence?
G_PivotDetection: Detected which premise reverses conclusion?
H_DecisionUnderUncertainty: Scenario matrix? Minimax regret?
SWE-BENCH SPECIFIC CRITERIA (for Software Engineering tasks):
- Did the model correctly identify the bug root cause?
- Is the proposed patch minimal and correct?
- Does the patch avoid introducing regressions?
- Are edge cases properly considered?
MULTI-AGENT FORMAT:
[ANSWER] = THE answer to judge for all rubrics.
[BACKTRACK-n] = explicit self-corrections within the answer = evidence for error_recovery 0.5+.
[ADDITIONAL CORRECTIONS] / [APPLIED-n] = post-hoc corrections = also count for error_recovery.
Multiple genuine [BACKTRACK] corrections = 0.75. Single [BACKTRACK] = 0.5.
[VERIFICATION] = What was checked externally.
Judge the ANSWER's completeness. Corrections ENHANCE the score.
STRICT: 1.0=AGI-worthy. 0.75=expert. 0.5=competent. 0.25=gaps. 0.0=failure.
Output JSON: {"scores":{...},"comment":"<50 words>"}"""
def _build_judge_schema():
    """Build the strict JSON schema for the judge's structured output:
    one enum-constrained number per rubric plus a free-form comment."""
    grade_enum = {"type": "number", "enum": [0.0, 0.25, 0.5, 0.75, 1.0]}
    score_props = {rubric_name: dict(grade_enum) for rubric_name in RUBRIC}
    return {
        "type": "object",
        "properties": {
            "scores": {
                "type": "object",
                "properties": score_props,
                "required": list(RUBRIC.keys()),
                "additionalProperties": False,
            },
            "comment": {"type": "string"},
        },
        "required": ["scores", "comment"],
        "additionalProperties": False,
    }
JUDGE_SCHEMA = _build_judge_schema()
def call_judge_structured(prompt, system="", api_key="", model="gpt-5.2",
                          temperature=0.1, max_tokens=2048):
    """★ Judge call — always OpenAI, structured output with automatic fallback.

    Tries strict json_schema output first; on incompatibility downgrades to
    json_object mode, repairing other parameter 400s along the way. Returns
    {"scores": {...}, "comment": str} or None (caller should then fall back
    to parse_judge_fallback).
    """
    # ★ Verify/resolve the model first
    actual_model = _resolve_model(model, api_key)
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    # ★ Try strict structured output first
    payload = {"model": actual_model, "max_completion_tokens": max_tokens,
               "temperature": temperature, "messages": messages,
               "response_format": {"type": "json_schema",
                                   "json_schema": {"name": "FINALJudge", "strict": True,
                                                   "schema": JUDGE_SCHEMA}}}
    # reasoning_effort: only some models accept it
    if actual_model in ("gpt-5.2", "gpt-5.2-chat-latest", "o4-mini", "o3-mini"):
        payload["reasoning_effort"] = "none"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    for attempt in range(3):
        try:
            r = requests.post("https://api.openai.com/v1/chat/completions",
                              headers=headers, data=json.dumps(payload), timeout=180)
            if r.status_code == 429:
                time.sleep(5 * (attempt + 1)); continue
            if r.status_code == 400:
                try:
                    err_msg = r.json().get("error", {}).get("message", "")
                except Exception:  # non-JSON error body (was a bare except)
                    err_msg = str(r.status_code)
                print(f" ⚠️ Judge 400 (attempt {attempt+1}): {err_msg[:200]}")
                # ★ Self-repair known parameter incompatibilities
                if "json_schema" in err_msg or "response_format" in err_msg:
                    # structured output unsupported → downgrade to json_object mode
                    payload["response_format"] = {"type": "json_object"}
                    payload.pop("reasoning_effort", None)
                if "max_completion_tokens" in err_msg:
                    payload.pop("max_completion_tokens", None)
                    payload["max_tokens"] = max_tokens
                if "reasoning_effort" in err_msg:
                    payload.pop("reasoning_effort", None)
                if "temperature" in err_msg:
                    payload.pop("temperature", None)
                if attempt < 2:
                    time.sleep(2); continue
                # ★ Structured output failed for good → signal plain-JSON fallback
                print(f" ⚠️ Judge structured output failed, falling back to plain JSON")
                return None
            r.raise_for_status()
            content = _strip_think(r.json()["choices"][0]["message"]["content"] or "")
            if not content:
                if attempt < 2: time.sleep(2); continue
                return None
            data = json.loads(content)
            if "scores" in data and isinstance(data["scores"], dict):
                for k in RUBRIC:
                    if k not in data["scores"]:
                        data["scores"][k] = 0.5  # neutral default for missing rubrics
                return {"scores": data["scores"], "comment": data.get("comment", "ok")}
        except json.JSONDecodeError:
            print(f" ⚠️ Judge JSON parse failed (attempt {attempt+1})")
            if attempt < 2: time.sleep(2); continue
            return None
        except Exception as e:
            print(f" ⚠️ Judge exception (attempt {attempt+1}): {str(e)[:100]}")
            if attempt < 2: time.sleep(3 * (attempt + 1)); continue
            return None
    return None
def build_judge_prompt(task, response):
    """Render the judge's user prompt for one (task, response) pair.

    SWE-bench Verified tasks get the reference patch appended so the judge
    can grade final_correctness against it. Long fields are truncated to keep
    the prompt within context.
    """
    # Reference patch section only for SWE-bench Verified tasks
    ref_section = ""
    if task.metadata.get("source") == "SWE-bench_Verified":
        reference_patch = task.metadata.get("patch", "")[:2000]
        ref_section = f"\n\nREFERENCE CORRECT PATCH (for scoring final_correctness):\n{reference_patch}\n"
    return f"""FINAL Bench Task Evaluation
Task: {task.task_id} | {task.domain} | Grade {task.grade} | {task.difficulty}
TICOS: {task.ticos_type} | Title: {task.title}
PROMPT:
{task.prompt[:2000]}
EXPECTED:
{task.expected_behavior[:600]}
HIDDEN TRAPS: {task.hidden_trap or 'None'}
{ref_section}
RESPONSE TO JUDGE:
{response[:17000]}
Score: process_quality, metacognitive_accuracy, error_recovery, integration_depth, final_correctness
Apply {task.ticos_type} bonus. Check for [BACKTRACK-n] and [APPLIED-n] tags (self-corrections). Output ONLY JSON."""
def parse_judge_fallback(text, keys):
    """Best-effort parser for judge output when structured output failed.

    Tries (1) extracting an embedded {"scores": {...}} JSON object, then
    (2) regex-scraping loose 'rubric: value' pairs, snapping every value to
    the nearest valid grade. On total failure all scores are 0.0 and
    "failed" is set in the returned dict.
    """
    if not text or text.startswith("[API_ERROR"):
        return {"scores": {k: 0.0 for k in keys}, "comment": "API_ERROR", "failed": True}
    cleaned = _strip_think(text)
    VALID = {0.0, 0.25, 0.5, 0.75, 1.0}
    # (1) embedded JSON object containing a "scores" mapping
    try:
        m = re.search(r'\{[^{}]*"scores"\s*:\s*\{[^{}]*\}[^{}]*\}', cleaned, re.DOTALL)
        if m:
            d = json.loads(m.group())
            if "scores" in d:
                # snap each value to the nearest valid grade (0.5 when missing)
                return {"scores": {k: min(VALID, key=lambda x: abs(x - float(d["scores"].get(k, 0.5))))
                                   for k in keys},
                        "comment": d.get("comment", "parsed")}
    except Exception:  # malformed JSON → fall through to regex scraping (was bare except)
        pass
    # (2) loose key:value / key=value pairs anywhere in the text
    try:
        sc = {}
        for k in keys:
            m2 = re.search(rf'["\']?{re.escape(k)}["\']?\s*[:=]\s*([\d.]+)', cleaned, re.IGNORECASE)
            if m2:
                v = float(m2.group(1))
                if 0 <= v <= 1:
                    sc[k] = min(VALID, key=lambda x: abs(x - v))
        if len(sc) >= 3:  # enough signal — default the remaining rubrics to 0.5
            for k in keys:
                if k not in sc:
                    sc[k] = 0.5
            return {"scores": sc, "comment": "regex"}
    except Exception:  # was a bare except
        pass
    return {"scores": {k: 0.0 for k in keys}, "comment": "parse_failed", "failed": True}
| # ════════════════════════════════════════════════════════════════ | |
| # §5. Scoring Engine | |
| # ════════════════════════════════════════════════════════════════ | |
def compute_task_score(scores):
    """Weighted 0-100 task score from per-rubric values (missing rubrics
    default to the neutral 0.5)."""
    weighted = 0
    for rubric_name, info in RUBRIC.items():
        weighted += scores.get(rubric_name, 0.5) * info["weight"]
    return round(weighted * 100, 2)
def compute_axis_scores(results, tasks):
    """Per-axis 0-100 scores.

    For each axis: take the mean of that axis' rubric values over every
    scored task, boosted 1.5x when the task's TICOS type belongs to the axis,
    then average across tasks and cap at 100. Axes with no data score 0.0.
    """
    task_map = {t.task_id: t for t in tasks}
    axes = {}
    for axis_name, axis_info in AXIS_MAP.items():
        vals = []
        for tid, d in results.items():
            if d["score"] < 0:  # unscored/failed tasks are excluded
                continue
            t = task_map.get(tid)
            if not t:
                continue
            # judge payload may be stored as a JSON string or an already-parsed dict
            try:
                jd = json.loads(d["judge"]) if isinstance(d["judge"], str) else d["judge"]
                sc = jd.get("scores", {}) if isinstance(jd, dict) else {}
            except Exception:  # malformed judge JSON → no rubric data (was bare except)
                sc = {}
            rubric_vals = [float(sc.get(r, 0.5)) for r in axis_info["rubrics"] if r in sc]
            # TICOS affinity boost
            w = 1.5 if (axis_info["ticos"] and t.ticos_type in axis_info["ticos"]) else 1.0
            if rubric_vals:
                vals.append(np.mean(rubric_vals) * w)
        axes[axis_name] = round(min(np.mean(vals) * 100, 100), 2) if vals else 0.0
    return axes
def compute_final_score(results, tasks):
    """Compute the overall FINAL score.

    Returns (final, base, harmony, axis_scores, domain_averages):
    base = grade-weighted mean of per-domain score averages,
    harmony = harmonic mean of the axis scores as a 0-1 factor,
    final = base * harmony.
    """
    tm={t.task_id:t for t in tasks}
    ds={}
    for tid,d in results.items():
        if d["score"]<0: continue  # unscored/failed tasks are excluded
        t=tm.get(tid)
        if t: ds.setdefault(t.domain,[]).append(d["score"])
    da={d:np.mean(v) for d,v in ds.items() if v}  # per-domain average score
    gd={}
    for t in tasks: gd.setdefault(t.grade,set()).add(t.domain)  # grade -> its domains
    ws,wt=0,0
    # Grade-weighted average over (grade, domain) pairs that have data
    for g,doms in gd.items():
        w=GRADE_WEIGHT.get(g,1.0)
        for d in doms:
            if d in da: ws+=da[d]*w; wt+=w
    base=ws/wt if wt>0 else 0
    axis=compute_axis_scores(results,tasks)
    av=[max(v,0.01) for v in axis.values()]  # floor avoids division by zero below
    har=(len(av)/sum(1.0/v for v in av)) if av else 50  # harmonic mean penalizes weak axes
    har_p=har/100.0
    return round(base*har_p,2),round(base,2),round(har_p,3),axis,da
def determine_agi_stage(score, axis):
    """Map a final score to an AGI stage.

    Stages >= 4 additionally require every axis score to reach 60; otherwise
    the result is capped at AGI_STAGES[2].
    """
    balanced = bool(axis) and all(v >= 60 for v in axis.values())
    for stage in reversed(AGI_STAGES):
        if score < stage["min"]:
            continue
        if stage["stage"] >= 4 and not balanced:
            return AGI_STAGES[2]
        return stage
    return AGI_STAGES[0]
| # ════════════════════════════════════════════════════════════════ | |
| # §6. Checkpoint DB | |
| # ════════════════════════════════════════════════════════════════ | |
# SQLite checkpoint store: one row per (run_id, task_id) evaluation result.
DB_PATH = "final_bench_eval.db"
def _init_db():
    """Create the eval_results checkpoint table if it does not exist."""
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute("CREATE TABLE IF NOT EXISTS eval_results(run_id TEXT,task_id TEXT,model_response TEXT,judge_response TEXT,weighted_score REAL,timestamp REAL,PRIMARY KEY(run_id,task_id))")
        c.commit()
    finally:
        # original leaked the connection when the DDL raised
        c.close()
| def _make_run_id(m,mode="NON"): return hashlib.md5(f"FINALv31_{mode}_{m}".encode()).hexdigest()[:12] | |
def _save_result(rid, tid, resp, jresp, sc):
    """Upsert one (run, task) result row, timestamped with the current time."""
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute("INSERT OR REPLACE INTO eval_results VALUES(?,?,?,?,?,?)",
                  (rid, tid, resp, jresp, sc, time.time()))
        c.commit()
    finally:
        # original leaked the connection when execute/commit raised
        c.close()
def _load_all(rid):
    """Load every checkpointed result for a run.

    Returns:
        dict: {task_id: {"response", "judge", "score"}} — empty if none saved.
    """
    c = sqlite3.connect(DB_PATH)
    try:
        cur = c.execute("SELECT task_id,model_response,judge_response,weighted_score FROM eval_results WHERE run_id=?", (rid,))
        rows = cur.fetchall()
    finally:
        # original leaked the connection when the query raised
        c.close()
    return {r[0]: {"response": r[1], "judge": r[2], "score": r[3]} for r in rows}
def _clear_run(rid):
    """Delete all checkpointed results for a run (forces a fresh evaluation)."""
    c = sqlite3.connect(DB_PATH)
    try:
        c.execute("DELETE FROM eval_results WHERE run_id=?", (rid,))
        c.commit()
    finally:
        # original leaked the connection when the delete raised
        c.close()
| _init_db() | |
| # ════════════════════════════════════════════════════════════════ | |
| # §7. CSV + HuggingFace | |
| # ════════════════════════════════════════════════════════════════ | |
def generate_csv(results, tasks, model_name, mode="NON-AGI"):
    """Serialize evaluation results to a CSV string (header + one row per scored task).

    Judge-failed tasks (score < 0) are written with score -1 and a
    "JUDGE_FAILED:" comment prefix. Responses are truncated to 300 chars
    with newlines flattened.
    """
    buf = io.StringIO()
    writer = csv.writer(buf)
    writer.writerow(["task_id","domain","grade","ticos_type","difficulty","title","model","mode","weighted_score",
                     "process_quality","metacognitive_accuracy","error_recovery","integration_depth","final_correctness",
                     "judge_comment","response_preview","timestamp","dataset_source"])
    by_id = {t.task_id: t for t in tasks}
    for tid, data in sorted(results.items()):
        task = by_id.get(tid)
        if task is None:
            continue
        judge_doc = {}
        try:
            judge_doc = json.loads(data["judge"]) if isinstance(data["judge"], str) else (data["judge"] or {})
        except:
            pass
        rubric_scores = judge_doc.get("scores", {}) if isinstance(judge_doc, dict) else {}
        comment = (judge_doc.get("comment", "") if isinstance(judge_doc, dict) else "")[:200]
        score = data["score"]
        if score < 0:
            score = -1
            comment = f"JUDGE_FAILED:{comment}"
        origin = task.metadata.get("source", "FINAL_Bench")
        preview = (data.get("response", "") or "")[:300].replace("\n", " ")
        writer.writerow([tid, task.domain, task.grade, task.ticos_type, task.difficulty, task.title,
                         model_name, mode, score,
                         rubric_scores.get("process_quality", ""), rubric_scores.get("metacognitive_accuracy", ""),
                         rubric_scores.get("error_recovery", ""), rubric_scores.get("integration_depth", ""),
                         rubric_scores.get("final_correctness", ""),
                         comment, preview, datetime.now().isoformat(), origin])
    return buf.getvalue()
def upload_to_hf(csv_content, model_name, mode=""):
    """Upload a results CSV to the private HF dataset repo; returns a status string."""
    hf_token = os.getenv("HF_TOKEN", "")
    if not hf_token:
        return "⚠️ HF_TOKEN not set"
    try:
        from huggingface_hub import HfApi
        api = HfApi(token=hf_token)
        safe = re.sub(r'[^a-zA-Z0-9_-]', '_', model_name.split('/')[-1])
        repo = "seawolf2357/FINAL-Bench-Results"
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        fn = f"eval_{safe}_{mode}_{ts}.csv"
        try:
            api.create_repo(repo_id=repo, repo_type="dataset", private=True, exist_ok=True)
        except:
            pass  # best-effort: repo usually exists already
        api.upload_file(path_or_fileobj=csv_content.encode("utf-8"), path_in_repo=fn,
                        repo_id=repo, repo_type="dataset",
                        commit_message=f"FINAL Bench: {safe} {mode} ({ts})")
        return f"✅ HF: {fn}"
    except Exception as e:
        return f"❌ HF: {e}"
| # ════════════════════════════════════════════════════════════════ | |
| # §8. HTML Builders | |
| # ════════════════════════════════════════════════════════════════ | |
# Shared inline stylesheet prepended to every HTML fragment this module renders
# (progress table, summary card, comparison tables, model card).
CSS = """<style>
.eval-table{width:100%;border-collapse:collapse;font-size:0.82em}
.eval-table th{background:#f0f4f8;padding:8px;text-align:left;border-bottom:2px solid #ccc;font-size:0.9em}
.eval-table td{padding:5px 8px;border-bottom:1px solid #eee}
.score-bar{background:#e0e0e0;border-radius:8px;height:16px;overflow:hidden;min-width:70px}
.score-fill{height:100%;border-radius:8px;transition:width .4s}
.summary-card{background:linear-gradient(135deg,#0a0a1a,#1a1a3e);border-radius:16px;padding:24px;color:#fff;margin:8px 0}
.axis-row{display:flex;align-items:center;gap:10px;margin:5px 0}
.axis-bar{flex:1;background:#333;border-radius:6px;height:14px;overflow:hidden}
.axis-fill{height:100%;border-radius:6px}
.stage-badge{display:inline-block;padding:6px 16px;border-radius:20px;font-weight:700;font-size:1.1em;margin:8px 0}
.progress-bar{background:#e0e0e0;border-radius:8px;height:22px;margin:12px 0;overflow:hidden}
.progress-fill{height:100%;border-radius:8px;transition:width .4s;background:linear-gradient(90deg,#1565c0,#00c853)}
.cmp-table{width:100%;border-collapse:collapse;font-size:0.85em;margin:8px 0}
.cmp-table th{background:#1a1a2e;color:#fff;padding:8px;text-align:center}
.cmp-table td{padding:6px 8px;border-bottom:1px solid #eee;text-align:center}
.cmp-up{color:#4caf50;font-weight:700} .cmp-down{color:#f44336;font-weight:700} .cmp-same{color:#888}
.model-card{background:linear-gradient(135deg,#1a237e,#0d47a1);border-radius:12px;padding:16px;color:#fff;margin:8px 0}
</style>"""
def _sc(s):
    """Map a 0-100 score to a traffic-light hex color (green/orange/deep-orange/red)."""
    for cutoff, color in ((80, "#4caf50"), (60, "#ff9800"), (40, "#ff5722")):
        if s >= cutoff:
            return color
    return "#f44336"
def _build_progress_table(results, tasks):
    """Render the per-task progress table: one row per task with domain icon,
    grade badge, TICOS type, difficulty and a score bar.

    Row states: judge-failed (score < 0) → highlighted ❌ row; scored → colored
    score bar; not yet evaluated → dimmed ⏳ row.
    """
    rows=""
    for t in tasks:
        info=DOMAIN_INFO.get(t.domain,{"icon":"?","color":"#999"})
        # Grade badge color: A=red, B=blue, C=purple
        gb=f'<span style="background:{"#c62828" if t.grade=="A" else "#1565c0" if t.grade=="B" else "#6a1b9a"};color:#fff;padding:1px 6px;border-radius:4px;font-size:0.8em">{t.grade}</span>'
        # 🐛 marks SWE-bench tasks, 📝 marks native FINAL Bench tasks
        src_badge = "🐛" if t.metadata.get("source") == "SWE-bench_Verified" else "📝"
        if t.task_id in results:
            s=results[t.task_id]["score"]
            if s<0:
                # judge failure → orange-tinted row, no score bar
                rows+=f'<tr style="background:#fff3e0"><td>{src_badge} {t.task_id[:25]}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td style="color:#ff9800">❌</td><td>—</td></tr>'
            else:
                c=_sc(s)
                rows+=f'<tr><td>{src_badge} {t.task_id[:25]}</td><td>{info["icon"]} {t.domain[:15]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td><div class="score-bar"><div class="score-fill" style="width:{min(s,100)}%;background:{c}"></div></div></td><td style="font-weight:700;color:{c}">{s:.1f}</td></tr>'
        else:
            # pending task → dimmed row
            rows+=f'<tr style="opacity:0.35"><td>{src_badge} {t.task_id[:25]}</td><td>{info["icon"]}</td><td>{gb}</td><td>{t.ticos_type.split("_")[0]}</td><td>{t.difficulty}</td><td>⏳</td><td>—</td></tr>'
    return f'{CSS}<table class="eval-table"><thead><tr><th>ID</th><th>Domain</th><th>G</th><th>TICOS</th><th>Diff</th><th>Score</th><th>Val</th></tr></thead><tbody>{rows}</tbody></table>'
def _build_summary_card(results, tasks, model_name, hf_status, mode_label=""):
    """Render the top-line summary card: AGI stage badge, FINAL score,
    5-axis bars, per-grade averages, AGI pass checks and HF upload status."""
    final,base,har_p,axis,dom_avgs = compute_final_score(results,tasks)
    stage = determine_agi_stage(final,axis)
    labels={"generalization":"🌐 General","reasoning":"🧠 Reason","planning":"📋 Plan","reliability":"🎯 Reliable","safety":"🛡️ Safety"}
    # one colored bar per axis
    ax_html=""
    for an,av in axis.items():
        c=_sc(av)
        ax_html+=f'<div class="axis-row"><span style="width:110px;font-size:0.85em">{labels.get(an,an)}</span><div class="axis-bar"><div class="axis-fill" style="width:{min(av,100)}%;background:{c}"></div></div><span style="width:50px;text-align:right;font-weight:700;color:{c}">{av:.1f}</span></div>'
    # per-grade weighted averages (mean over the grade's domain averages)
    gh=""
    for g in ["A","B","C"]:
        gd=[t.domain for t in tasks if t.grade==g]
        gs=[dom_avgs[d] for d in set(gd) if d in dom_avgs]
        if gs: a=np.mean(gs); gh+=f'<span style="margin-right:14px">{g}×{GRADE_WEIGHT[g]}: <b style="color:{_sc(a)}">{a:.1f}</b></span>'
    done=sum(1 for t in tasks if t.task_id in results)
    jf=sum(1 for t in tasks if t.task_id in results and results[t.task_id]["score"]<0)
    # SWE-bench statistics
    swe_count = sum(1 for t in tasks if t.metadata.get("source") == "SWE-bench_Verified")
    swe_label = f" · SWE-bench: {swe_count}" if swe_count > 0 else ""
    # A-grade average feeds the third pass check
    ad=[t.domain for t in tasks if t.grade=="A"]
    asc=[dom_avgs[d] for d in set(ad) if d in dom_avgs]
    aa=np.mean(asc) if asc else 0
    checks=[("Score≥80",final>=80),("Axes≥60",all(v>=60 for v in axis.values())),(f"A-avg≥75({aa:.0f})",aa>=75)]
    ch="".join([f'<span style="margin-right:8px">{"✅" if ok else "❌"}{lb}</span>' for lb,ok in checks])
    return f"""<div class="summary-card">
<div style="text-align:center">
<div class="stage-badge" style="background:{stage['color']}">{stage['name']}</div>
<h2 style="margin:6px 0;font-size:1.6em">{mode_label} FINAL: {final:.1f}</h2>
<p style="color:#aaa;font-size:0.85em">{stage['label']} · Base {base:.1f} × HAR {har_p:.3f} · {done}/{len(tasks)}{f" · JF={jf}" if jf else ""}{swe_label}</p>
</div><hr style="border-color:#333;margin:12px 0">
<h4 style="color:#aaa;margin:6px 0">🎯 5-Axis</h4>{ax_html}
<hr style="border-color:#333;margin:10px 0">
<div style="font-size:0.88em">{gh}</div>
<div style="font-size:0.82em;margin-top:6px">{ch}</div>
<p style="font-size:0.78em;color:#666;margin-top:8px">{hf_status}</p></div>"""
def _build_comparison_html(non_results, pagi_results, tasks, model_name):
    """Render the Non-AGI vs Proto-AGI comparison report: header with both
    stage badges and overall delta, then 5-axis, per-grade and per-task
    delta tables (task table capped at 50 rows, collapsed by default)."""
    if not non_results and not pagi_results:
        return "<p>아직 비교할 결과가 없습니다.</p>"
    tm = {t.task_id: t for t in tasks}  # NOTE(review): built but never read below
    # fall back to zeroed tuples when one side has not been evaluated yet
    non_final = compute_final_score(non_results, tasks) if non_results else (0,0,0,{},{})
    pagi_final = compute_final_score(pagi_results, tasks) if pagi_results else (0,0,0,{},{})
    nf, nb, nh, nax, nda = non_final
    pf, pb, ph2, pax, pda = pagi_final
    def _delta(a, b):
        # signed delta badge; |d| < 0.5 renders as "no change"
        d = a - b
        if abs(d) < 0.5: return f'<span class="cmp-same">±0</span>'
        cls = "cmp-up" if d > 0 else "cmp-down"
        return f'<span class="{cls}">{"+" if d>0 else ""}{d:.1f}</span>'
    ns = determine_agi_stage(nf, nax) if non_results else AGI_STAGES[0]
    ps = determine_agi_stage(pf, pax) if pagi_results else AGI_STAGES[0]
    header = f"""{CSS}
<div style="background:linear-gradient(135deg,#0d0d2b,#1a1a4e);border-radius:16px;padding:20px;color:#fff;margin:8px 0;text-align:center">
<h2 style="margin:0">🔄 Before vs After Comparison</h2>
<p style="color:#aaa;font-size:0.9em">{model_name}</p>
<div style="display:flex;justify-content:center;gap:40px;margin:16px 0">
<div>
<div style="font-size:0.8em;color:#aaa">🤖 Non-AGI (Before)</div>
<div class="stage-badge" style="background:{ns['color']};font-size:0.9em">{ns['name']}</div>
<div style="font-size:1.5em;font-weight:700">{nf:.1f}</div>
</div>
<div style="font-size:2em;color:#ffd700;align-self:center">→</div>
<div>
<div style="font-size:0.8em;color:#aaa">🌟 Proto-AGI (After)</div>
<div class="stage-badge" style="background:{ps['color']};font-size:0.9em">{ps['name']}</div>
<div style="font-size:1.5em;font-weight:700">{pf:.1f}</div>
</div>
<div style="align-self:center;padding:12px;background:{'#1b5e20' if pf>nf else '#b71c1c' if pf<nf else '#333'};border-radius:12px">
<div style="font-size:0.7em;color:#ccc">Delta</div>
<div style="font-size:1.4em;font-weight:700">{_delta(pf,nf)}</div>
</div>
</div></div>"""
    ax_labels = {"generalization":"🌐 Generalization","reasoning":"🧠 Reasoning","planning":"📋 Planning","reliability":"🎯 Reliability","safety":"🛡️ Safety"}
    ax_rows = ""
    for an in AXIS_MAP:
        nv = nax.get(an, 0)
        pv = pax.get(an, 0)
        ax_rows += f'<tr><td style="text-align:left">{ax_labels.get(an,an)}</td><td style="color:{_sc(nv)}">{nv:.1f}</td><td style="color:{_sc(pv)}">{pv:.1f}</td><td>{_delta(pv,nv)}</td></tr>'
    grade_rows = ""
    for g in ["A","B","C"]:
        gd = list(set(t.domain for t in tasks if t.grade==g))
        n_gs = [nda.get(d,0) for d in gd if d in nda]
        p_gs = [pda.get(d,0) for d in gd if d in pda]
        na = np.mean(n_gs) if n_gs else 0
        pa = np.mean(p_gs) if p_gs else 0
        grade_rows += f'<tr><td>Grade {g} (×{GRADE_WEIGHT[g]})</td><td style="color:{_sc(na)}">{na:.1f}</td><td style="color:{_sc(pa)}">{pa:.1f}</td><td>{_delta(pa,na)}</td></tr>'
    task_rows = ""
    for t in tasks[:50]:  # show at most 50 tasks
        ns_v = non_results.get(t.task_id,{}).get("score",0) if non_results else 0
        ps_v = pagi_results.get(t.task_id,{}).get("score",0) if pagi_results else 0
        # clamp judge failures (-1) to 0 for display
        if ns_v < 0: ns_v = 0
        if ps_v < 0: ps_v = 0
        info = DOMAIN_INFO.get(t.domain,{"icon":"?"})
        src = "🐛" if t.metadata.get("source") == "SWE-bench_Verified" else ""
        task_rows += f'<tr><td style="text-align:left">{info["icon"]}{src} {t.task_id[:25]}</td><td>{t.grade}</td><td style="color:{_sc(ns_v)}">{ns_v:.1f}</td><td style="color:{_sc(ps_v)}">{ps_v:.1f}</td><td>{_delta(ps_v,ns_v)}</td></tr>'
    return f"""{header}
<table class="cmp-table">
<thead><tr><th style="text-align:left">5-Axis</th><th>🤖 Non-AGI</th><th>🌟 Proto-AGI</th><th>Δ</th></tr></thead>
<tbody>{ax_rows}</tbody></table>
<table class="cmp-table">
<thead><tr><th>Grade</th><th>🤖 Non-AGI</th><th>🌟 Proto-AGI</th><th>Δ</th></tr></thead>
<tbody>{grade_rows}</tbody></table>
<details style="margin-top:12px"><summary style="cursor:pointer;font-weight:600;font-size:0.95em">📋 Task-Level Comparison (click)</summary>
<table class="cmp-table" style="margin-top:8px">
<thead><tr><th style="text-align:left">Task</th><th>G</th><th>🤖</th><th>🌟</th><th>Δ</th></tr></thead>
<tbody>{task_rows}</tbody></table></details>"""
def _build_detail_view(results, tasks):
    """Render one collapsible <details> element per evaluated task showing
    rubric scores, judge comment and a 500-char response preview."""
    items=""
    for t in tasks:
        if t.task_id not in results: continue
        d=results[t.task_id]; info=DOMAIN_INFO.get(t.domain,{"icon":"?"})
        s=d["score"]; resp=html.escape((d.get("response","") or "")[:500])
        jc=""; ss=""
        try:
            # judge payload may be a JSON string, a dict, or None
            jd=json.loads(d["judge"]) if isinstance(d["judge"],str) else(d["judge"] or {})
            jc=html.escape((jd.get("comment","") if isinstance(jd,dict) else "")[:200])
            sc=jd.get("scores",{}) if isinstance(jd,dict) else {}
            ss=" · ".join([f"{k.split('_')[0]}={v}" for k,v in sc.items()])
        except: pass
        c=_sc(s) if s>=0 else "#ff9800"  # orange marks a judge failure
        badge=f'{s:.1f}' if s>=0 else "JF"
        src = "🐛 SWE-bench" if t.metadata.get("source") == "SWE-bench_Verified" else "📝 FINAL"
        items+=f'<details style="margin:3px 0;border:1px solid #ddd;border-radius:8px;padding:8px"><summary style="cursor:pointer;font-weight:600">{info["icon"]} {t.task_id[:30]} [{t.grade}] — <span style="color:{c}">{badge}</span> <span style="font-size:0.75em;color:#888">{src}</span></summary><div style="font-size:0.8em;margin-top:6px"><b>{t.title}</b><br>TICOS: {t.ticos_type} | Scores: {ss}<br>Judge: {jc}<br>Response: {resp}...</div></details>'
    return CSS+items
def _build_model_info_html():
    """Render the Darwin-gpt-ernie-20b model card (merge info, Friendli/vLLM
    backend status, detected GPUs, dataset counts)."""
    cfg = LOCAL_MODEL_CONFIG
    # Friendli backend: connected iff the token env var is present
    friendli_token = os.getenv("FRIENDLI_TOKEN", "")
    friendli_status = "🟢 Connected" if friendli_token else "🔴 FRIENDLI_TOKEN not set"
    # vLLM local server status
    if cfg["server_ready"]:
        vllm_status = "🟢 Running"
    elif cfg.get("server_starting"):
        vllm_status = "🟡 Starting..."
    else:
        vllm_status = "🔴 Stopped"
    gpu_count = cfg.get("gpu_count", 0)
    gpu = f"{gpu_count}x {cfg.get('gpu_info', 'Not detected')}" if gpu_count else "Not detected"
    active = cfg.get("active_config", {})
    active_str = f"TP={active.get('tp','?')} MaxLen={active.get('maxlen','?')}" if active else "—"
    return f"""{CSS}
<div class="model-card">
<h3 style="margin:0">🧬 Darwin-gpt-ernie-20b</h3>
<p style="color:#aaa;font-size:0.85em;margin:4px 0">Evolutionary Merge (진화적 병합 v3.2)</p>
<table style="color:#ccc;font-size:0.82em;margin:8px 0">
<tr><td style="padding:2px 12px 2px 0">Base Models</td><td>openai/gpt-oss-20b + baidu/ERNIE-4.5-21B-A3B-Thinking</td></tr>
<tr><td style="padding:2px 12px 2px 0">Merge Ratio</td><td>{cfg['merge_ratio']*100:.0f}% / {(1-cfg['merge_ratio'])*100:.0f}%</td></tr>
<tr><td style="padding:2px 12px 2px 0">Parameters</td><td>{cfg['params']} total ({cfg['active_params']} active)</td></tr>
<tr><td style="padding:2px 12px 2px 0">Architecture</td><td>MoE (Mixture of Experts)</td></tr>
<tr><td style="padding:2px 12px 2px 0">🚀 Friendli</td><td><b>{friendli_status}</b> · model: deppfs281rgffnk</td></tr>
<tr><td style="padding:2px 12px 2px 0">🖥️ vLLM</td><td>{vllm_status} (port {cfg['server_port']}) | {active_str}</td></tr>
<tr><td style="padding:2px 12px 2px 0">🖥️ GPU</td><td>{gpu}</td></tr>
</table>
<p style="font-size:0.75em;color:#888;margin:4px 0">
📊 SWE-bench_Verified: {len(SWE_BENCH_TASKS)} tasks | FINAL Bench: {len(ALL_TASKS)} tasks
</p></div>"""
| # ════════════════════════════════════════════════════════════════ | |
| # §9. Evaluation Engine — ★ Producer-Consumer 파이프라인 | |
| # Stage 1 (Solver Pool): 피평가 모델 호출 → Queue | |
| # Stage 2 (Judge Pool): Queue → GPT-5.2 채점 → 결과 저장 | |
| # 두 풀이 동시 가동되어 Solver가 풀고 있는 동안 Judge도 채점 중 | |
| # ════════════════════════════════════════════════════════════════ | |
| import queue # thread-safe queue | |
| # ── Stage 1: Solver (문제 풀이만 수행) ── | |
def _solve_single(task, api_key, eval_model, state, proto_agi=False):
    """Have the evaluated model solve one task (no grading — response only).

    In proto_agi mode the full 5-phase pipeline runs and a compressed digest
    is prepared for the judge; otherwise a single call_model() call is made.
    Progress counters and the recent-activity ring are updated under
    state["lock"]. Always returns a dict consumable by _judge_single:
    {"task", "model_response", "judge_input", "is_error"}.
    """
    try:
        judge_input = None
        if proto_agi:
            full_output, prev_outputs = run_proto_agi_pipeline(task.prompt, api_key, eval_model, task)
            model_response = full_output
            judge_input = compress_for_judge(prev_outputs)
        else:
            sys_p = (f"You are being evaluated on FINAL Bench.\nTask: {task.ticos_type}\n"
                     f"State confidence (0-100%) for EVERY claim. If wrong, EXPLICITLY backtrack. "
                     f"If unsure, say so honestly.")
            model_response = call_model(task.prompt, system=sys_p, api_key=api_key,
                                        model=eval_model, max_tokens=12288)
        # error responses are signalled in-band by a bracketed prefix
        is_error = any(model_response.startswith(pfx)
                       for pfx in ["[API_ERROR","[LOCAL_ERROR","[HF_ERROR","[FRIENDLI_ERROR","[EMPTY]","[ERROR]"])
        if is_error:
            print(f" ❌ Solver error for {task.task_id[:25]}: {model_response[:150]}")
        with state["lock"]:
            state["solved"] += 1
            info = DOMAIN_INFO.get(task.domain, {"icon":"?"})
            state["solve_active"].append(f'{info["icon"]}🧠 {task.task_id[:18]}')
            if len(state["solve_active"]) > 8:
                # keep only the 8 most recent entries for the activity display
                state["solve_active"] = state["solve_active"][-8:]
            if is_error:
                state["errors"].append(f"Solver:{task.task_id[:15]}:{model_response[:60]}")
        return {
            "task": task,
            "model_response": model_response,
            "judge_input": judge_input,
            "is_error": is_error,
        }
    except Exception as e:
        # never let a worker die: convert the exception into an error result
        with state["lock"]:
            state["solved"] += 1
            state["errors"].append(f"S:{task.task_id[:15]}:{str(e)[:40]}")
        return {
            "task": task,
            "model_response": f"[ERROR] {e}",
            "judge_input": None,
            "is_error": True,
        }
| # ── Stage 2: Judge (채점만 수행) ── | |
def _judge_single(solve_result, run_id, api_key, judge_model, state):
    """Grade one solver result with the judge model (GPT-5.2).

    Solver errors are recorded immediately as score 0 without calling the
    judge. Judge scoring tries structured output first, then a plain-text
    fallback parse; an unparsable judge yields score -1 ("JF"). Every
    outcome is checkpointed via _save_result.

    Returns:
        (task_id, {"response", "judge", "score"})
    """
    task = solve_result["task"]
    model_response = solve_result["model_response"]
    judge_input = solve_result["judge_input"]
    try:
        # solver error → record a zero immediately, skip the judge call
        if solve_result["is_error"]:
            err_preview = model_response[:100]
            print(f" ❌ Judge skip (solver error): {task.task_id[:25]} → {err_preview}")
            _save_result(run_id, task.task_id, model_response, "{}", 0)
            with state["lock"]:
                state["judged"] += 1
                state["errors"].append(f"{task.task_id[:15]}: {model_response[:40]}")
            return task.task_id, {"response": model_response, "judge": "{}", "score": 0}
        # ★ judge call — always GPT-5.2 (OpenAI API)
        # proto-AGI runs send the compressed digest instead of the raw output
        resp_for_judge = judge_input if judge_input else model_response
        jp = build_judge_prompt(task, resp_for_judge)
        jd = call_judge_structured(jp, system=JUDGE_SYSTEM, api_key=api_key, model=judge_model)
        if jd is None:
            # structured output failed → plain completion + regex fallback parse
            jr = call_openai(jp, system=JUDGE_SYSTEM, api_key=api_key, model=judge_model,
                             max_tokens=2048, temperature=0.05, reasoning_effort="none")
            jd = parse_judge_fallback(jr, list(RUBRIC.keys()))
        if jd is None:
            jd = {"scores":{k:0.0 for k in RUBRIC}, "comment":"FAILURE", "failed":True}
        if jd.get("failed"):
            ws = -1.0  # sentinel: judge failure, excluded from aggregates
            jd["comment"] = f"JF:{jd['comment']}"
            print(f" ❌ Judge FAILED for {task.task_id[:25]}: {jd['comment'][:100]}")
        else:
            ws = compute_task_score(jd["scores"])
            print(f" ✅ Judged {task.task_id[:25]}: score={ws:.1f}, scores={jd['scores']}")
            with state["lock"]:
                state["parse_ok"] += 1
        jj = json.dumps(jd, ensure_ascii=False)
        _save_result(run_id, task.task_id, model_response, jj, ws)
        with state["lock"]:
            state["judged"] += 1
            info = DOMAIN_INFO.get(task.domain, {"icon":"?"})
            state["judge_active"].append(f'{info["icon"]}⚖️ {task.task_id[:18]}')
            if len(state["judge_active"]) > 8:
                state["judge_active"] = state["judge_active"][-8:]
        return task.task_id, {"response": model_response, "judge": jj, "score": ws}
    except Exception as e:
        # judge-side crash: still checkpoint the response with score 0
        with state["lock"]:
            state["judged"] += 1
            state["errors"].append(f"J:{task.task_id[:15]}:{str(e)[:40]}")
        _save_result(run_id, task.task_id, model_response, "{}", 0)
        return task.task_id, {"response": model_response, "judge": "{}", "score": 0}
| # ── Stage 1→2 Bridge: Solver가 완료되면 Judge Queue에 투입 ── | |
def _solver_to_queue(task, api_key, eval_model, state, proto_agi, judge_q):
    """Solver worker — solve the task, then hand the result to the judge queue."""
    result = _solve_single(task, api_key, eval_model, state, proto_agi)
    # hand off to the judges; blocks only when the bounded queue is full,
    # otherwise the worker moves straight on to the next task
    judge_q.put(result)
    return result
def _judge_from_queue(judge_q, run_id, api_key, judge_model, state, result_dict, stop_flag):
    """Judge worker (daemon thread) — pull solver results off the queue and grade them.

    Termination: either a None sentinel is received, or the queue is empty
    AND stop_flag (set once the solver pool has fully finished) is set.
    The short get() timeout is what lets the worker re-check stop_flag.
    """
    while True:
        try:
            solve_result = judge_q.get(timeout=2)
        except queue.Empty:
            # queue drained: exit only after the solver pool has signalled completion
            if stop_flag.is_set() and judge_q.empty():
                break
            continue
        if solve_result is None:  # shutdown sentinel
            break
        tid, data = _judge_single(solve_result, run_id, api_key, judge_model, state)
        with state["lock"]:
            result_dict[tid] = data
            task = solve_result["task"]
            state["grade_done"][task.grade] = state["grade_done"].get(task.grade, 0) + 1
        judge_q.task_done()
| # ════════════════════════════════════════════════════════════════ | |
| # §10. State Machine + Pipeline Runner | |
| # ════════════════════════════════════════════════════════════════ | |
# Global evaluation state shared between the Gradio UI thread and the
# solver/judge worker threads. All mutation happens under _EVAL_STATE["lock"].
_EVAL_STATE = {
    "running":False,"stop_requested":False,"finished":False,
    "run_id":"","model":"","mode":"NON",
    # ★ two-stage pipeline counters
    "solved":0, "judged":0, "total":0, "cached":0,
    "errors":[],"solve_active":[],"judge_active":[],
    "parse_ok":0,"parse_fail":0,
    "start_time":0,"results":{},"tasks":[],
    "grade_done":{},"grade_total":{},
    "lock":threading.Lock(),"message":"","csv_path":None,"hf_status":"",
    "n_workers":5,"proto_agi":False,
    # compare mode (Non-AGI vs Proto-AGI two-phase run)
    "compare_mode":False,"compare_phase":"","non_results":{},"pagi_results":{},
    "non_run_id":"","pagi_run_id":"",
    # pipeline settings
    "solver_workers":3,"judge_workers":5,
}
def _reset():
    """Reset the per-run mutable fields of _EVAL_STATE before a new evaluation.

    Run identity (run_id/model), worker counts, comparison run ids and the
    lock itself are intentionally left untouched.
    """
    with _EVAL_STATE["lock"]:
        _EVAL_STATE.update({
            "running":False,"stop_requested":False,"finished":False,
            "solved":0,"judged":0,"cached":0,
            "errors":[],"solve_active":[],"judge_active":[],
            "parse_ok":0,"parse_fail":0,
            "start_time":0,"results":{},"tasks":[],"grade_done":{},"grade_total":{},
            "message":"","csv_path":None,"hf_status":"","mode":"NON","proto_agi":False,
            "compare_mode":False,"compare_phase":"",
        })
def _prog_html(state, pending):
    """★ Render the two-stage pipeline progress panel.

    Shows twin Solver/Judge progress bars, the solved-but-unjudged buffer
    size, per-grade progress bars, the recent-activity chips for both
    stages and the last few errors. `pending` is the total task count for
    this phase.
    """
    solved = state.get("solved", 0)
    judged = state.get("judged", 0)
    total = max(pending, 1)  # guard against division by zero
    pct_solve = min(int(solved / total * 100), 100)
    pct_judge = min(int(judged / total * 100), 100)
    mode_icon = "🌟 Proto-AGI" if state.get("proto_agi") else "🤖 Non-AGI"
    phase = state.get("compare_phase", "")
    if phase:
        mode_icon = f"🔄 Compare: {phase}"
    # ★ twin progress bars: Solver (blue) + Judge (orange)
    pipeline_html = f"""
<div style="margin:8px 0">
<div style="display:flex;justify-content:space-between;font-size:0.95em;margin-bottom:4px">
<span>⚡ <b>{mode_icon}</b></span>
<span style="font-weight:700">🧠 Solve {solved}/{total} · ⚖️ Judge {judged}/{total}</span>
</div>
<div style="display:flex;gap:4px;margin-bottom:2px;font-size:0.78em;color:#666">
<span style="flex:1">🧠 Solver ({state.get('solver_workers',3)}w)</span>
<span style="flex:1;text-align:right">⚖️ Judge ({state.get('judge_workers',5)}w)</span>
</div>
<div style="display:flex;gap:4px">
<div style="flex:1;background:#e0e0e0;border-radius:8px;height:18px;overflow:hidden">
<div style="width:{pct_solve}%;height:100%;border-radius:8px;background:linear-gradient(90deg,#1565c0,#42a5f5);transition:width .4s"></div>
</div>
<div style="flex:1;background:#e0e0e0;border-radius:8px;height:18px;overflow:hidden">
<div style="width:{pct_judge}%;height:100%;border-radius:8px;background:linear-gradient(90deg,#e65100,#ff9800);transition:width .4s"></div>
</div>
</div>
<div style="display:flex;gap:4px;font-size:0.78em;color:#888;margin-top:2px">
<span style="flex:1">🧠 {pct_solve}%</span>
<span style="flex:1;text-align:right">⚖️ {pct_judge}%</span>
</div>"""
    # buffer indicator: solved but not yet judged
    in_buffer = solved - judged
    if in_buffer > 0:
        pipeline_html += f'<div style="text-align:center;font-size:0.8em;color:#1565c0;margin:4px 0">📦 채점 대기 버퍼: {in_buffer}건</div>'
    # per-grade progress bars
    gb = ""
    for g in ["A","B","C"]:
        gt = state["grade_total"].get(g, 0)
        gd = state["grade_done"].get(g, 0)
        if gt == 0: continue
        gp = min(int(gd / gt * 100), 100)
        c = "#4caf50" if gp == 100 else ("#1976d2" if gp > 0 else "#e0e0e0")
        gb += (f'<div style="display:flex;align-items:center;gap:8px;margin:3px 0">'
               f'<span style="width:100px;font-size:0.85em">'
               f'{"🅰️" if g=="A" else "🅱️" if g=="B" else "🅾️"} {g}×{GRADE_WEIGHT[g]}</span>'
               f'<div style="flex:1;background:#e0e0e0;border-radius:6px;height:14px;overflow:hidden">'
               f'<div style="width:{gp}%;height:100%;background:{c};border-radius:6px"></div></div>'
               f'<span style="width:55px;font-size:0.82em;text-align:right;color:{c}">{gd}/{gt}</span></div>')
    pipeline_html += gb
    # recent-activity chips, Solver and Judge shown separately
    sa = state.get("solve_active", [])
    ja = state.get("judge_active", [])
    if sa:
        pipeline_html += '<div style="margin-top:6px;font-size:0.78em">🧠 ' + " ".join(
            [f'<span style="background:#e3f2fd;padding:1px 5px;border-radius:4px">{a}</span>' for a in sa[-6:]]
        ) + '</div>'
    if ja:
        pipeline_html += '<div style="margin-top:3px;font-size:0.78em">⚖️ ' + " ".join(
            [f'<span style="background:#fff3e0;padding:1px 5px;border-radius:4px">{a}</span>' for a in ja[-6:]]
        ) + '</div>'
    # last few errors (escaped, scrollable box)
    er = state.get("errors", [])
    if er:
        pipeline_html += f'<div style="color:#c62828;margin-top:6px;font-size:0.82em;background:#ffebee;padding:6px;border-radius:6px;max-height:120px;overflow-y:auto">'
        pipeline_html += '<b>⚠️ Errors:</b><br>'
        for e in er[-8:]:
            pipeline_html += f'· {html.escape(e[:80])}<br>'
        pipeline_html += '</div>'
    pipeline_html += '</div>'
    return pipeline_html
def _run_phase(api_key, eval_model, judge_model, tasks, run_id, solver_w, judge_w, proto_agi):
    """★ Execute one evaluation phase: SolverPool → Queue → JudgePool in parallel.

    Previously checkpointed tasks (same run_id) are loaded from SQLite and
    skipped. Judge daemon threads are started first; the solver pool then
    feeds them through a bounded queue, so grading overlaps with solving.
    Honors _EVAL_STATE["stop_requested"]. Returns {task_id: result} for the
    whole run including cached entries.
    """
    global _EVAL_STATE
    results = dict(_load_all(run_id))  # resume from checkpoint
    cached = sum(1 for t in tasks if t.task_id in results)
    pending = [t for t in tasks if t.task_id not in results]
    gt = {}
    for t in pending:
        gt.setdefault(t.grade, []).append(t)
    # ★ local/dedicated models: cap solver concurrency to protect the backend
    is_local = eval_model in LOCAL_MODELS
    if is_local:
        minfo = LOCAL_MODELS.get(eval_model, {})
        if minfo.get("type") == "friendli":
            # Friendli API: cloud-hosted, some concurrency is fine
            solver_w = min(solver_w, 3)
            if proto_agi:
                solver_w = min(solver_w, 2)
        else:
            # local vLLM: workers share one GPU
            solver_w = min(solver_w, 2)
            if proto_agi:
                solver_w = 1
    elif proto_agi:
        solver_w = min(solver_w, 3)
    # publish phase bookkeeping for the progress UI
    with _EVAL_STATE["lock"]:
        _EVAL_STATE["results"] = results
        _EVAL_STATE["cached"] = cached
        _EVAL_STATE["total"] = len(pending)
        _EVAL_STATE["grade_total"] = {g: len(ts) for g, ts in gt.items()}
        _EVAL_STATE["grade_done"] = {g: 0 for g in gt}
        _EVAL_STATE["solved"] = 0
        _EVAL_STATE["judged"] = 0
        _EVAL_STATE["errors"] = []
        _EVAL_STATE["solve_active"] = []
        _EVAL_STATE["judge_active"] = []
        _EVAL_STATE["proto_agi"] = proto_agi
        _EVAL_STATE["solver_workers"] = solver_w
        _EVAL_STATE["judge_workers"] = judge_w
    if not pending:
        return results
    # ★ pipeline wiring
    judge_q = queue.Queue(maxsize=solver_w * 3)  # bounded buffer: 3x solver count
    solver_done = threading.Event()  # set once the solver pool has fully finished
    result_dict = dict(results)  # shared with judges; guarded by state lock
    # ── start the judge daemon threads first so they are waiting on the queue ──
    judge_threads = []
    for i in range(judge_w):
        jt = threading.Thread(
            target=_judge_from_queue,
            args=(judge_q, run_id, api_key, judge_model, _EVAL_STATE, result_dict, solver_done),
            daemon=True,
            name=f"Judge-{i}"
        )
        jt.start()
        judge_threads.append(jt)
    # ── run the solver pool; when it finishes, signal the judges to drain & exit ──
    try:
        with ThreadPoolExecutor(max_workers=solver_w, thread_name_prefix="Solver") as solver_pool:
            futs = {}
            for t in pending:
                if _EVAL_STATE["stop_requested"]:
                    break
                fut = solver_pool.submit(
                    _solver_to_queue, t, api_key, eval_model, _EVAL_STATE, proto_agi, judge_q
                )
                futs[fut] = t
            # poll for solver completion (also lets us react to stop requests)
            done_set = set()
            while len(done_set) < len(futs):
                if _EVAL_STATE["stop_requested"]:
                    solver_pool.shutdown(wait=False, cancel_futures=True)
                    break
                for f in list(futs):
                    if f in done_set:
                        continue
                    if f.done():
                        done_set.add(f)
                        try:
                            f.result()  # surface worker exceptions
                        except Exception as e:
                            with _EVAL_STATE["lock"]:
                                _EVAL_STATE["errors"].append(f"SolverEx:{str(e)[:40]}")
                time.sleep(0.3)
    finally:
        # ★ all solvers finished (or stopped) → let the judges drain and exit
        solver_done.set()
    # ── wait for the judges to finish grading the queued backlog ──
    for jt in judge_threads:
        jt.join(timeout=300)  # 5-minute cap per thread; daemons die with the process
    # publish the merged results
    with _EVAL_STATE["lock"]:
        _EVAL_STATE["results"] = result_dict
    return dict(result_dict)
def _bg_single(api_key, eval_model, judge_model, tasks, run_id, solver_w, judge_w, proto_agi):
    """Background driver for a single-mode run: execute the pipeline phase,
    then finalize (CSV export / HF upload) via _finalize_single. Any fatal
    exception is surfaced through _EVAL_STATE["message"] and ends the run."""
    global _EVAL_STATE
    try:
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["start_time"] = time.time()
            mode = "🌟 Proto-AGI" if proto_agi else "🤖 Non-AGI"
            _EVAL_STATE["message"] = f"⚡ {mode} — {len(tasks)} tasks (🧠{solver_w}w ⚖️{judge_w}w)"
        results = _run_phase(api_key, eval_model, judge_model, tasks, run_id, solver_w, judge_w, proto_agi)
        _finalize_single(tasks, results, eval_model, "PAGI" if proto_agi else "NON")
    except Exception as e:
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["message"] = f"❌ Fatal: {str(e)[:100]}"
            _EVAL_STATE["running"] = False; _EVAL_STATE["finished"] = True
def _bg_compare(api_key, eval_model, judge_model, tasks, non_rid, pagi_rid, solver_w, judge_w):
    """Background worker for the two-phase Non-AGI vs Proto-AGI comparison.

    Phase 1 evaluates *tasks* in Non-AGI mode, Phase 2 re-evaluates them in
    Proto-AGI mode.  Both result CSVs are merged (second header row dropped),
    written to /tmp, uploaded to HF, and the score delta is published via the
    shared eval state for the UI poller.
    """
    global _EVAL_STATE
    try:
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["start_time"] = time.time()
            _EVAL_STATE["compare_phase"] = "Phase 1/2 — 🤖 Non-AGI"
            _EVAL_STATE["message"] = f"🔄 Phase 1/2: Non-AGI (🧠{solver_w}w ⚖️{judge_w}w)"
        non_results = _run_phase(api_key, eval_model, judge_model, tasks, non_rid, solver_w, judge_w, proto_agi=False)
        if _EVAL_STATE["stop_requested"]:
            with _EVAL_STATE["lock"]:
                _EVAL_STATE["message"] = "⏹️ Stopped during Phase 1"
                _EVAL_STATE["running"] = False; _EVAL_STATE["finished"] = True
            return
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["non_results"] = non_results
            _EVAL_STATE["compare_phase"] = "Phase 2/2 — 🌟 Proto-AGI"
            _EVAL_STATE["message"] = f"🔄 Phase 2/2: Proto-AGI (🧠{solver_w}w ⚖️{judge_w}w)"
        pagi_results = _run_phase(api_key, eval_model, judge_model, tasks, pagi_rid, solver_w, judge_w, proto_agi=True)
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["pagi_results"] = pagi_results
        csv1 = generate_csv(non_results, tasks, eval_model, "NON-AGI")
        csv2 = generate_csv(pagi_results, tasks, eval_model, "PROTO-AGI")
        # Append csv2 without its header row.  BUG FIX: the previous
        # `csv1 + x if cond else csv2` parsed as `(csv1 + x) if cond else csv2`
        # (conditional expressions bind looser than `+`), so when csv2 had no
        # newline the whole csv1 was silently dropped.  Parentheses restore
        # the intended meaning.
        combined_csv = csv1 + (csv2.split("\n", 1)[1] if "\n" in csv2 else csv2)
        cp = f"/tmp/final_compare_{non_rid}.csv"
        with open(cp, "w", encoding="utf-8") as f:
            f.write(combined_csv)
        hf = upload_to_hf(combined_csv, eval_model, "COMPARE")
        # compute_final_score returns a tuple; index 0 is the FINAL score.
        nf = compute_final_score(non_results, tasks)[0]
        pf = compute_final_score(pagi_results, tasks)[0]
        delta = pf - nf
        elapsed = int(time.time() - _EVAL_STATE["start_time"])
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["csv_path"] = cp; _EVAL_STATE["hf_status"] = hf
            _EVAL_STATE["message"] = (
                f"🏁 Compare Complete! Non-AGI={nf:.1f} → Proto-AGI={pf:.1f} "
                f"(Δ={'+' if delta>0 else ''}{delta:.1f}) · {elapsed}s"
            )
            _EVAL_STATE["running"] = False; _EVAL_STATE["finished"] = True
    except Exception as e:
        with _EVAL_STATE["lock"]:
            _EVAL_STATE["message"] = f"❌ Fatal: {str(e)[:100]}"
            _EVAL_STATE["running"] = False; _EVAL_STATE["finished"] = True
def _finalize_single(tasks, results, eval_model, mode_tag):
    """Score a finished single-mode run, export its CSV, and publish the
    completion message through the shared eval state."""
    final, base, har, axis, _ = compute_final_score(results, tasks)
    stage = determine_agi_stage(final, axis)
    csv_str = generate_csv(results, tasks, eval_model, mode_tag)
    csv_path = f"/tmp/final_{_EVAL_STATE['run_id']}.csv"
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write(csv_str)
    hf_status = upload_to_hf(csv_str, eval_model, mode_tag)
    elapsed = int(time.time() - _EVAL_STATE["start_time"])
    with _EVAL_STATE["lock"]:
        _EVAL_STATE["csv_path"] = csv_path
        _EVAL_STATE["hf_status"] = hf_status
        _EVAL_STATE["message"] = f"🏁 {stage['name']} — FINAL={final:.1f} · {elapsed}s"
        _EVAL_STATE["running"] = False
        _EVAL_STATE["finished"] = True
def _get_selected_tasks(dataset_choice, grade_f, diff_f, max_t):
    """Return the task list for the chosen dataset, filtered by grade and
    difficulty and truncated to at most *max_t* entries."""
    if dataset_choice == "FINAL Bench Only":
        selected = list(ALL_TASKS)
    elif dataset_choice == "SWE-bench Verified Only":
        selected = list(SWE_BENCH_TASKS)
    else:  # both datasets combined
        selected = list(ALL_TASKS) + list(SWE_BENCH_TASKS)
    if grade_f != "All":
        selected = [task for task in selected if task.grade == grade_f]
    if diff_f != "All":
        selected = [task for task in selected if task.difficulty == diff_f]
    return selected[:int(max_t)]
def _start_eval(api_key, eval_model, judge_model, proto_agi, dataset_choice, grade_f, diff_f, max_t, s_w, j_w, fresh):
    """Start a single-mode evaluation run (Non-AGI or Proto-AGI).

    Performs pre-flight validation (Judge API key, eval/judge model
    endpoints, local/Friendli/HF backends), selects the task set, seeds
    the shared eval state, and launches the `_bg_single` background
    thread.  Returns a human-readable status string for the UI.

    Args:
        api_key: OpenAI key; falls back to the OPENAI_API_KEY env var.
        eval_model: model under test (OpenAI name or a LOCAL_MODELS key).
        judge_model: OpenAI model used for grading.
        proto_agi: True → Proto-AGI (五行) pipeline; False → single LLM.
        dataset_choice, grade_f, diff_f, max_t: task selection filters.
        s_w, j_w: solver / judge worker counts.
        fresh: True → clear cached results for this run id first.
    """
    global _EVAL_STATE
    # Only one run at a time.
    if _EVAL_STATE["running"]: return "⚠️ Already running"
    # API key check: the Judge always requires OpenAI.
    api_key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "")
    if not api_key: return "❌ OpenAI API Key required (for Judge)"
    # ★ Pre-flight API connectivity check (OpenAI eval models only).
    if eval_model not in LOCAL_MODELS:
        ok, msg = _test_api_connection(api_key, eval_model)
        if not ok:
            # Try fallback model resolution before giving up.
            actual = _resolve_model(eval_model, api_key)
            if actual == eval_model:
                return f"❌ Eval model API check failed: {msg}"
            else:
                print(f" ℹ️ Eval model {eval_model} → {actual}")
    # Judge model validation (always an OpenAI model).
    ok_j, msg_j = _test_api_connection(api_key, judge_model)
    if not ok_j:
        actual_j = _resolve_model(judge_model, api_key)
        if actual_j == judge_model:
            return f"❌ Judge model API check failed: {msg_j}"
        else:
            print(f" ℹ️ Judge model {judge_model} → {actual_j}")
    # Local / dedicated-endpoint model checks.
    if eval_model in LOCAL_MODELS:
        minfo = LOCAL_MODELS[eval_model]
        if minfo["type"] == "friendli":
            ok_f, msg_f = _test_friendli(minfo["id"])
            if not ok_f:
                return (f"❌ Friendli API check failed: {msg_f}\n"
                        f"💡 Set FRIENDLI_TOKEN in Space secrets")
            print(f" ✅ Friendli API OK: {minfo['id']}")
        elif minfo["type"] == "local_vllm":
            port = LOCAL_MODEL_CONFIG["server_port"]
            # ★ Try auto-detecting an already-running server.
            if not LOCAL_MODEL_CONFIG["server_ready"]:
                if _probe_vllm_server(port):
                    LOCAL_MODEL_CONFIG["server_ready"] = True
                    print(f" ✅ vLLM server detected on port {port}")
            # ★ If the server is still starting, wait for it (up to 2 min).
            if LOCAL_MODEL_CONFIG.get("server_starting") and not LOCAL_MODEL_CONFIG["server_ready"]:
                print(" ⏳ Waiting for vLLM server to finish starting...")
                for _ in range(24):  # 24 × 5 s = 120 s max
                    time.sleep(5)
                    if _probe_vllm_server(port):
                        LOCAL_MODEL_CONFIG["server_ready"] = True
                        break
                    if not LOCAL_MODEL_CONFIG.get("server_starting"):
                        break  # startup was aborted elsewhere
            # ★ Still not ready → ask the user to start it manually.
            if not LOCAL_MODEL_CONFIG["server_ready"]:
                if _detect_gpu():
                    return ("⏳ vLLM server not ready. Click '🚀 Start vLLM Server' to start, "
                            "or wait for auto-start to complete. "
                            f"GPU: {LOCAL_MODEL_CONFIG.get('gpu_info','unknown')}")
                else:
                    return "❌ Local vLLM requires GPU but none detected. Use OpenAI model instead."
        # NOTE(review): plain `if` (not `elif`) — harmless since the types
        # are mutually exclusive, but inconsistent with the branches above.
        if minfo["type"] == "hf_inference":
            hf_token = os.getenv("HF_TOKEN", "")
            if not hf_token:
                return "❌ HF_TOKEN required for HuggingFace Inference API. Set it in Space secrets."
            ok, ep_type, msg = _test_hf_model(minfo["id"], hf_token)
            if not ok:
                return (f"❌ HF Inference not available for {minfo['id']}: {msg}\n"
                        f"💡 Try: (1) Use 'Darwin-gpt-ernie-20b (Local vLLM)' with GPU, "
                        f"(2) Use OpenAI model like gpt-4o instead")
            print(f" ✅ HF model OK: {minfo['id']} via {ep_type}")
    tasks = _get_selected_tasks(dataset_choice, grade_f, diff_f, int(max_t))
    if not tasks:
        return "❌ No tasks found for selected filters"
    mode = "PAGI" if proto_agi else "NON"
    rid = _make_run_id(eval_model, mode)
    if fresh: _clear_run(rid)
    _reset()
    # ★ Record the model names that will actually be used (after fallback).
    actual_eval = _resolve_model(eval_model, api_key) if eval_model not in LOCAL_MODELS else eval_model
    actual_judge = _resolve_model(judge_model, api_key)
    with _EVAL_STATE["lock"]:
        _EVAL_STATE.update({"running":True,"run_id":rid,"model":eval_model,"tasks":tasks,
                            "total":len(tasks),"n_workers":int(s_w),"mode":mode,"proto_agi":proto_agi,
                            "actual_eval_model": actual_eval, "actual_judge_model": actual_judge})
    threading.Thread(target=_bg_single, daemon=True,
                     args=(api_key, eval_model, judge_model, tasks, rid, int(s_w), int(j_w), proto_agi)).start()
    icon = "🌟 Proto-AGI" if proto_agi else "🤖 Non-AGI"
    swe_cnt = sum(1 for t in tasks if t.metadata.get("source") == "SWE-bench_Verified")
    # NOTE(review): final_cnt is computed but never used in the message below.
    final_cnt = len(tasks) - swe_cnt
    model_info = f"{eval_model}" if actual_eval == eval_model else f"{eval_model}→{actual_eval}"
    judge_info = f"{judge_model}" if actual_judge == judge_model else f"{judge_model}→{actual_judge}"
    return f"⚡ {icon} Started ({len(tasks)} tasks · 🧠{int(s_w)}w ⚖️{int(j_w)}w) · Eval:{model_info} Judge:{judge_info}"
def _start_compare(api_key, eval_model, judge_model, dataset_choice, grade_f, diff_f, max_t, s_w, j_w, fresh):
    """Start the two-phase Non-AGI vs Proto-AGI comparison run.

    Performs the same pre-flight checks as `_start_eval` (Judge API key,
    eval/judge model endpoints), selects the task set, seeds the shared
    eval state in compare mode, and launches the `_bg_compare` background
    thread.  Returns a status string for the UI.
    """
    global _EVAL_STATE
    if _EVAL_STATE["running"]: return "⚠️ Already running"
    # The Judge always requires an OpenAI key.
    api_key = (api_key or "").strip() or os.getenv("OPENAI_API_KEY", "")
    if not api_key: return "❌ OpenAI API Key required (for Judge)"
    # ★ Pre-flight validation of the eval model.
    if eval_model in LOCAL_MODELS:
        minfo = LOCAL_MODELS[eval_model]
        if minfo.get("type") == "friendli":
            ok_f, msg_f = _test_friendli(minfo["id"])
            if not ok_f:
                return f"❌ Friendli API check failed: {msg_f}"
    else:
        # FIX: was `elif eval_model not in LOCAL_MODELS:` — that condition is
        # always true when the first branch fails; plain `else` says so.
        ok, msg = _test_api_connection(api_key, eval_model)
        if not ok:
            actual = _resolve_model(eval_model, api_key)
            if actual == eval_model:
                return f"❌ Eval model failed: {msg}"
    # Judge model validation (always OpenAI).
    ok_j, msg_j = _test_api_connection(api_key, judge_model)
    if not ok_j:
        actual_j = _resolve_model(judge_model, api_key)
        if actual_j == judge_model:
            return f"❌ Judge model failed: {msg_j}"
    tasks = _get_selected_tasks(dataset_choice, grade_f, diff_f, int(max_t))
    if not tasks: return "❌ No tasks found"
    # Separate run ids per phase so caching works independently.
    non_rid = _make_run_id(eval_model, "NON")
    pagi_rid = _make_run_id(eval_model, "PAGI")
    if fresh: _clear_run(non_rid); _clear_run(pagi_rid)
    _reset()
    with _EVAL_STATE["lock"]:
        _EVAL_STATE.update({"running":True,"model":eval_model,"tasks":tasks,
                            "total":len(tasks),"n_workers":int(s_w),
                            "compare_mode":True,"non_run_id":non_rid,"pagi_run_id":pagi_rid,
                            "run_id":non_rid})
    threading.Thread(target=_bg_compare, daemon=True,
                     args=(api_key, eval_model, judge_model, tasks, non_rid, pagi_rid, int(s_w), int(j_w))).start()
    return f"🔄 Compare Started! ({len(tasks)} tasks · 🧠{int(s_w)}w ⚖️{int(j_w)}w)"
def _stop():
    """Request cancellation of the active run (no-op when idle)."""
    if not _EVAL_STATE["running"]:
        return "ℹ️ Not running"
    _EVAL_STATE["stop_requested"] = True
    return "⏹️ Stopping..."
def _poll():
    """Timer callback: snapshot the shared eval state and render the six
    UI outputs (progress HTML, results table, score summary, comparison
    view, detail view, CSV file path)."""
    # Snapshot mutable state under the lock; render afterwards.
    with _EVAL_STATE["lock"]:
        running=_EVAL_STATE["running"]; finished=_EVAL_STATE["finished"]
        tasks=_EVAL_STATE.get("tasks",[]); results=dict(_EVAL_STATE.get("results",{}))
        msg=_EVAL_STATE.get("message",""); cp=_EVAL_STATE.get("csv_path")
        compare_mode=_EVAL_STATE.get("compare_mode",False)
        non_results=dict(_EVAL_STATE.get("non_results",{}))
        pagi_results=dict(_EVAL_STATE.get("pagi_results",{}))
    # Idle state: nothing has been started yet.
    if not running and not finished and not results:
        return("ℹ️ Press ▶️ Start, 🌟 Proto-AGI, or 🔄 Compare","","","","",None)
    if running:
        # pending = total minus cache hits.  NOTE(review): reads state
        # outside the lock — racy, but only used for progress display.
        pend = _EVAL_STATE.get("total",0) - _EVAL_STATE.get("cached",0)
        ph=CSS+_prog_html(_EVAL_STATE, pend)
    elif finished:
        ph=f'<div style="background:#e8f5e9;padding:12px;border-radius:8px;font-weight:600">{msg}</div>'
    else: ph=msg
    th=_build_progress_table(results,tasks) if tasks else ""
    sh,dh,cmp_html,co="","","",None
    if finished and tasks:
        model = _EVAL_STATE.get("model","?")
        hf_st = _EVAL_STATE.get("hf_status","")
        if compare_mode:
            # Compare run: two summary cards plus the delta comparison table.
            sh = (_build_summary_card(non_results,tasks,model,hf_st,"🤖 Non-AGI") +
                  _build_summary_card(pagi_results,tasks,model,hf_st,"🌟 Proto-AGI"))
            cmp_html = _build_comparison_html(non_results, pagi_results, tasks, model)
        else:
            sh = _build_summary_card(results,tasks,model,hf_st,
                                     "🌟 Proto-AGI" if _EVAL_STATE.get("proto_agi") else "🤖 Non-AGI")
            # Single-mode run: if persisted results exist for BOTH modes of
            # this model, still offer the comparison view from cache.
            non_rid = _make_run_id(model,"NON")
            pagi_rid = _make_run_id(model,"PAGI")
            old_non = _load_all(non_rid)
            old_pagi = _load_all(pagi_rid)
            if old_non and old_pagi:
                cmp_html = _build_comparison_html(old_non, old_pagi, tasks, model)
            else:
                cmp_html = "<p style='color:#888'>Run both Non-AGI and Proto-AGI (or use 🔄 Compare) to see comparison.</p>"
        dh = _build_detail_view(results,tasks)
        co = cp
    return(ph,th,sh,cmp_html,dh,co)
| # ════════════════════════════════════════════════════════════════ | |
| # §11. Gradio App | |
| # ════════════════════════════════════════════════════════════════ | |
# Static landing-page banner rendered at the top of the Gradio app via
# gr.HTML(HEADER).  The string is passed verbatim to the browser — keep
# its contents (including non-English text) unchanged.
HEADER = """
<div style="text-align:center;padding:16px 0">
<h1 style="margin:0;font-size:1.8em">🏆 FINAL Bench v3.1 + 🧬 Darwin Local Eval</h1>
<h2 style="margin:4px 0;color:#555;font-size:1.05em">Frontier Intelligence Nexus for AGI-Level Verification</h2>
<p style="color:#888;font-size:0.88em;max-width:780px;margin:8px auto">
<b>FINAL 100 Tasks + SWE-bench 500 Tasks · 15+ Domains · 8 TICOS · 5-Axis · 5-Stage AGI Grade</b><br>
🤖 Non-AGI (single LLM) vs 🌟 Proto-AGI (五行 木→火→土→金→水)<br>
🧬 <b>Darwin-gpt-ernie-20b</b> (gpt-oss-20b + ERNIE-4.5-21B Merge) · Friendli Dedicated Endpoint<br>
🐛 <b>SWE-bench_Verified</b> (500 Real-world Bug Fix Tasks) · ⚖️ Judge: GPT-5.2
</p>
<div style="display:flex;justify-content:center;gap:6px;margin-top:8px;flex-wrap:wrap;font-size:0.82em">
<span style="background:#ffcdd2;padding:2px 10px;border-radius:12px">🅰️ A×1.5</span>
<span style="background:#bbdefb;padding:2px 10px;border-radius:12px">🅱️ B×1.0</span>
<span style="background:#e1bee7;padding:2px 10px;border-radius:12px">🅾️ C×0.7</span>
<span style="background:#c8e6c9;padding:2px 10px;border-radius:12px">🐛 SWE-bench</span>
<span style="background:#ffe0b2;padding:2px 10px;border-radius:12px">🧬 Darwin Merge</span>
</div></div>"""
def create_app():
    """Build and return the Gradio Blocks UI.

    Layout: server-control accordion → API key + diagnostics row →
    model/dataset/worker configuration rows → start/stop buttons →
    tabbed output views refreshed every 2 s by a gr.Timer polling
    `_poll`.
    """
    with gr.Blocks(title="FINAL Bench v3.1 + Darwin") as app:
        gr.HTML(HEADER)
        # ── Model server controls (local vLLM for Darwin) ──
        with gr.Accordion("🧬 Darwin-gpt-ernie-20b Local Server", open=False):
            model_info = gr.HTML(_build_model_info_html())
            with gr.Row():
                srv_start = gr.Button("🚀 Start vLLM Server", variant="primary", scale=2)
                srv_stop = gr.Button("⏹️ Stop Server", variant="stop", scale=1)
                srv_refresh = gr.Button("🔄 Refresh Status", scale=1)
            srv_status = gr.Textbox(label="Server Status", interactive=False, max_lines=2)
            srv_start.click(fn=_start_local_model_server, outputs=[srv_status])
            srv_stop.click(fn=_stop_local_model_server, outputs=[srv_status])
            def _refresh_model_info():
                # Re-probe the server before re-rendering the info panel.
                _auto_detect_server()
                return _build_model_info_html()
            srv_refresh.click(fn=_refresh_model_info, outputs=[model_info])
        # ── Evaluation settings ──
        with gr.Row():
            api_key = gr.Textbox(label="🔑 OpenAI API Key (Judge용)", type="password",
                                 placeholder="sk-...", value=os.getenv("OPENAI_API_KEY",""), scale=4)
            diag_btn = gr.Button("🔍 API Test", variant="secondary", scale=1)
        diag_status = gr.Textbox(label="API Diagnostic", interactive=False, max_lines=3, visible=True)
        def _run_diagnostic(key):
            # Connectivity report for every backend; returns one line each.
            key = (key or "").strip() or os.getenv("OPENAI_API_KEY", "")
            results = []
            # Friendli check
            friendli_token = os.getenv("FRIENDLI_TOKEN", "")
            if friendli_token:
                ok_fr, msg_fr = _test_friendli()
                results.append(f"{'✅' if ok_fr else '❌'} Friendli (Darwin): {msg_fr[:60]}")
            else:
                results.append("⚠️ FRIENDLI_TOKEN not set")
            # Local vLLM server check
            port = LOCAL_MODEL_CONFIG["server_port"]
            if _probe_vllm_server(port):
                results.append(f"✅ vLLM server: Running on port {port}")
                try:
                    r = requests.get(f"http://localhost:{port}/v1/models", timeout=5)
                    if r.status_code == 200:
                        models = [m.get("id","?") for m in r.json().get("data",[])]
                        results.append(f" 📦 Models: {', '.join(models)}")
                except: pass  # best-effort: model listing is informational only
            elif LOCAL_MODEL_CONFIG.get("server_starting"):
                results.append(f"🟡 vLLM server: Starting...")
            else:
                results.append(f"🔴 vLLM server: Not running (port {port})")
            # GPU
            gpu_count = LOCAL_MODEL_CONFIG.get("gpu_count", 0)
            gpu = LOCAL_MODEL_CONFIG.get("gpu_info", "")
            if not gpu:
                # Lazily populate GPU info on first use.
                _detect_gpu()
                gpu_count = LOCAL_MODEL_CONFIG.get("gpu_count", 0)
                gpu = LOCAL_MODEL_CONFIG.get("gpu_info", "Not detected")
            total_vram = LOCAL_MODEL_CONFIG.get("total_vram_mb", 0)
            results.append(f"🖥️ GPU: {gpu_count}x {gpu} (Total: {total_vram/1024:.1f}GB)")
            # OpenAI
            if not key:
                results.append("⚠️ OpenAI API Key is empty")
            else:
                for m in ["gpt-5.2", "gpt-4.1", "gpt-4o", "gpt-4o-mini"]:
                    ok, msg = _test_api_connection(key, m)
                    results.append(f"{'✅' if ok else '❌'} {m}: {'OK' if ok else msg.split(':',1)[-1].strip()[:50]}")
            # HF Token
            hf_token = os.getenv("HF_TOKEN", "")
            if hf_token:
                mid = LOCAL_MODEL_CONFIG["model_id"]
                ok_hf, ep, msg_hf = _test_hf_model(mid, hf_token)
                results.append(f"{'✅' if ok_hf else '❌'} HF:{mid} → {ep if ok_hf else msg_hf[:60]}")
            else:
                results.append("ℹ️ HF_TOKEN not set")
            return "\n".join(results)
        diag_btn.click(fn=_run_diagnostic, inputs=[api_key], outputs=[diag_status])
        with gr.Row():
            eval_m = gr.Dropdown(label="🤖 Eval Model (피평가 모델)",
                                 choices=list(ALL_EVAL_MODELS.keys()),
                                 value="Darwin-gpt-ernie-20b (Friendli)",
                                 scale=3)
            judge_m = gr.Dropdown(label="⚖️ Judge Model (채점자)",
                                  choices=list(OPENAI_MODELS.keys()),
                                  value="gpt-5.2", scale=3)
        with gr.Row():
            dataset_choice = gr.Dropdown(
                label="📊 Dataset",
                choices=["FINAL Bench Only", "SWE-bench Verified Only", "Both (FINAL + SWE-bench)"],
                value="SWE-bench Verified Only",
                scale=2
            )
            proto_toggle = gr.Checkbox(label="🌟 Proto-AGI (五行)", value=False, scale=1)
            gf = gr.Dropdown(["All","A","B","C"], value="All", label="Grade", scale=1)
            df = gr.Dropdown(["All","easy","medium","hard","expert","frontier"], value="All", label="Difficulty", scale=1)
        with gr.Row():
            mt = gr.Slider(1, 600, value=30, step=1, label="Max Tasks", scale=2)
            sw = gr.Slider(1, 6, value=2, step=1, label="🧠 Solver Workers", scale=1)
            jw = gr.Slider(1, 10, value=5, step=1, label="⚖️ Judge Workers", scale=1)
        with gr.Row():
            s_btn = gr.Button("▶️ Start (Resume)", variant="primary", size="lg", scale=2)
            f_btn = gr.Button("🚀 Fresh Start", variant="secondary", size="lg", scale=2)
            cmp_btn = gr.Button("🔄 Compare: Non-AGI vs Proto-AGI", variant="primary", size="lg", scale=3)
            x_btn = gr.Button("⏹️ Stop", variant="stop", size="lg", scale=1)
        with gr.Row():
            gr.HTML(f'<p style="color:#888;font-size:0.78em;margin:0">'
                    f'📊 Available: FINAL={len(ALL_TASKS)} · SWE-bench={len(SWE_BENCH_TASKS)} · '
                    f'🧠Solver=풀이(Darwin/GPT) · ⚖️Judge=채점(GPT-5.2) · ★ 풀이↔채점 파이프라인 병렬처리</p>')
        status = gr.Textbox(label="Status", interactive=False, max_lines=2)
        # Tabbed output panels, all fed by the 2-second _poll timer below.
        with gr.Tabs():
            with gr.Tab("📊 Progress"): p_html = gr.HTML()
            with gr.Tab("📋 Results"): t_html = gr.HTML()
            with gr.Tab("🏆 FINAL Score"): s_html = gr.HTML()
            with gr.Tab("🔄 Compare"): cmp_html = gr.HTML()
            with gr.Tab("🔍 Details"): d_html = gr.HTML()
            with gr.Tab("💾 CSV"): c_file = gr.File(label="CSV")
        timer = gr.Timer(value=2, active=True)
        timer.tick(fn=_poll, outputs=[p_html, t_html, s_html, cmp_html, d_html, c_file])
        # Button wiring: lambdas forward UI values plus the `fresh` flag.
        single_ins = [api_key, eval_m, judge_m, proto_toggle, dataset_choice, gf, df, mt, sw, jw]
        s_btn.click(fn=lambda *a: _start_eval(*a, fresh=False), inputs=single_ins, outputs=[status])
        f_btn.click(fn=lambda *a: _start_eval(*a, fresh=True), inputs=single_ins, outputs=[status])
        cmp_ins = [api_key, eval_m, judge_m, dataset_choice, gf, df, mt, sw, jw]
        cmp_btn.click(fn=lambda *a: _start_compare(*a, fresh=True), inputs=cmp_ins, outputs=[status])
        x_btn.click(fn=_stop, outputs=[status])
        gr.Markdown("---\n<center><b>FINAL Bench v3.1</b> · 🧬 Darwin-gpt-ernie-20b (Friendli) + SWE-bench_Verified<br>"
                    "AGI Verification · Non-AGI vs Proto-AGI · 木火土金水<br>"
                    "Apache 2.0 · <b>Ginigen AI</b> — Choi Sunyoung</center>")
    return app
if __name__ == "__main__":
    # ── Startup banner: dataset & model configuration summary ──
    print(f"\n{'='*60}")
    print(f" FINAL Bench v3.1 + Darwin Local Eval")
    print(f"{'='*60}")
    print(f" FINAL tasks: {len(ALL_TASKS)}")
    print(f" SWE-bench tasks: {len(SWE_BENCH_TASKS)}")
    print(f" Total available: {len(ALL_TASKS) + len(SWE_BENCH_TASKS)}")
    print(f" Darwin model: {LOCAL_MODEL_CONFIG['model_id']}")
    print(f" Friendli endpoint: deppfs281rgffnk")
    # ★ Friendli connectivity test (best-effort; failure is only reported).
    _friendli_ok, _friendli_msg = _test_friendli()
    print(f" Friendli status: {'✅' if _friendli_ok else '❌'} {_friendli_msg}")
    print(f" Proto-AGI: 木발상→火실행→土판단→金비평→水정제")
    print(f" Judge: GPT-5.2 (OpenAI)")
    print(f"{'='*60}\n")
    app = create_app()
    app.queue(default_concurrency_limit=2)
    # BUG FIX: `theme=` and `css=` are gr.Blocks() constructor arguments,
    # not Blocks.launch() arguments — passing them to launch() raises
    # TypeError on released Gradio versions (launch() has no **kwargs),
    # so they are removed here.  To style the app, pass
    # gr.Blocks(theme=gr.themes.Soft(), css=...) inside create_app().
    app.launch(server_name="0.0.0.0", server_port=7860)