# Source: Hugging Face Space "FINAL-Bench/worldmodel-bench" (status: Running), revision ee97e7d, file size 30,618 bytes.

"""
World Model Bench — Evaluation Protocol v1.0

핵심 문제:
  "Tesla FSD는 자동차 안에 있고, Dreamer는 Atari에 있고,
   우리는 3D 캐릭터를 쓴다. 어떻게 같은 기준으로 평가하나?"

해결:
  3D 환경이 필요 없다.
  scene_context(JSON) → 모델 → PREDICT+MOTION(텍스트) → 자동 채점

  FINAL Bench가 LLM에게 "문제 텍스트"를 주고 "답 텍스트"를 받아 채점하듯이,
  WM Bench는 "상황 JSON"을 주고 "판단 텍스트"를 받아 채점한다.

이것이 의미하는 것:
  - 어떤 월드모델이든 참여 가능 (API 하나면 됨)
  - 3D 환경, 로봇, 시뮬레이터 불필요
  - 셀프 평가 아님 — 우리 채점기가 판정
  - 제3자가 재현 가능 — 코드 공개
"""

import json
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass


# ═══════════════════════════════════════════════════════════════
#  SECTION 1: 평가 프로토콜 — 3가지 트랙
# ═══════════════════════════════════════════════════════════════

"""
WM Bench는 3개 트랙으로 참여할 수 있다.

━━━ Track A: Text-Only (텍스트 전용) ━━━
  - 가장 간단. LLM, 룰 기반 시스템 등 모두 참여 가능.
  - scene_context JSON 입력 → PREDICT+MOTION 텍스트 출력
  - P1(인식) + P2(인지) 평가 가능
  - P3 중 C08(표현력)만 평가 가능 (C09, C10은 N/A)
  - 최대 점수: 750/1000

━━━ Track B: Text + Performance (텍스트 + 성능) ━━━
  - Track A + 실시간 성능 메트릭 제출
  - FPS, 지연시간, 메모리 사용량 등 자가 측정 제출
  - P1 + P2 + P3(C08, C09) 평가
  - C10(교체 확장성)은 증빙 자료 제출로 평가
  - 최대 점수: 1000/1000

━━━ Track C: Live Demo (라이브 데모) ━━━
  - Track B + 실제 동작 영상/데모 URL 제출
  - 검증자가 직접 데모를 돌려서 확인
  - 모든 항목 평가 + "Verified" 배지
  - 최대 점수: 1000/1000 + ✓ Verified

대부분의 참가자는 Track A로 참여.
Track B, C는 상위 모델 검증용.
"""

# Participation tracks, keyed by track letter.
#
# Every track exposes the same core keys ("name", "description",
# "requirements", "max_score", "evaluable_categories") so callers can iterate
# over the tracks uniformly. Track A additionally lists the categories it
# cannot evaluate; Track C additionally carries the "badge" it awards.
TRACKS = {
    # Track A: text-only participation (C01–C08 only, hence the 750 cap).
    "A": {
        "name": "Text-Only",
        "description": "scene_context JSON → PREDICT+MOTION 텍스트",
        "requirements": "API 또는 스크립트로 50개 시나리오에 응답",
        "max_score": 750,
        "evaluable_categories": [
            "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08"
        ],
        "not_evaluable": ["C09 (성능 측정 불가)", "C10 (교체 테스트 불가)"],
    },
    # Track B: Track A plus self-measured performance metrics.
    "B": {
        "name": "Text + Performance",
        "description": "Track A + 실시간 성능 메트릭 자가 측정",
        "requirements": "Track A 결과 + performance_metrics.json 제출",
        "max_score": 1000,
        "evaluable_categories": [
            "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08", "C09", "C10"
        ],
    },
    # Track C: Track B plus a live demo. It builds on Track B, so it lists the
    # same categories; the key was previously missing, which made uniform
    # access to "evaluable_categories" fail for this track.
    "C": {
        "name": "Live Demo",
        "description": "Track B + 실제 동작 데모 URL 제출",
        "requirements": "Track B 결과 + 데모 URL + 영상",
        "max_score": 1000,
        "evaluable_categories": [
            "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08", "C09", "C10"
        ],
        "badge": "✓ Verified",
    },
}


# ═══════════════════════════════════════════════════════════════
#  SECTION 2: 표준 입력 포맷 — scene_context JSON
# ═══════════════════════════════════════════════════════════════

"""
모든 참가자는 이 JSON을 입력으로 받는다.
이 JSON이 "문제지"다.
"""

@dataclass
class SceneContext:
    """Standard WM Bench input format (the "question sheet" given to a model).

    Only the environment fields and ``npc_nearby`` are required. Every
    ``Optional`` field defaults to ``None`` so a scene without an NPC, sound
    or history can be built without spelling out each absent value.
    Field order is unchanged, so positional construction keeps working.
    """

    # Environment
    walls: Dict[str, Optional[float]]  # e.g. {"left": 2.5, "right": None, "front": 1.0}
    ground: str                        # "flat", "slope", "rough"

    # NPC
    npc_nearby: bool
    npc_type: Optional[str] = None       # "beast", "woman", "man", or None
    npc_behavior: Optional[str] = None   # "stop", "approach", "charge", "wander"
    npc_distance: Optional[float] = None  # meters
    npc_direction: Optional[str] = None  # "left", "right", "front", "back"

    # Senses
    sound: Optional[str] = None          # e.g. "aggressive growling", "footsteps"

    # Context (used by the C06 memory test)
    recent_decisions: Optional[List[str]] = None  # the most recent decisions (last 3)
    last_prediction: Optional[str] = None         # the previous PREDICT line


# Benchmark scenarios as JSON-compatible dicts.
# NOTE: the protocol text refers to 50 scenarios; this list defines 13 entries
# (categories C01, C03, C04, C05 and C06).


def _scene(
    left=None,
    right=None,
    front=None,
    *,
    npc_type=None,
    npc_behavior=None,
    npc_distance=None,
    npc_direction=None,
    sound=None,
    recent_decisions=(),
    last_prediction=None,
) -> dict:
    """Build one scene_context dict with its keys in the canonical order.

    ``left``/``right``/``front`` are wall distances (``None`` = no wall).
    ``npc_nearby`` is derived from whether ``npc_type`` is given, and the
    ground is always "flat" for the scenarios defined here. A fresh
    ``recent_decisions`` list is created on every call.
    """
    return {
        "walls": {"left": left, "right": right, "front": front},
        "ground": "flat",
        "npc_nearby": npc_type is not None,
        "npc_type": npc_type,
        "npc_behavior": npc_behavior,
        "npc_distance": npc_distance,
        "npc_direction": npc_direction,
        "sound": sound,
        "recent_decisions": list(recent_decisions),
        "last_prediction": last_prediction,
    }


def _beast_front(behavior, distance, **scene_kwargs) -> dict:
    """Build a scene with a growling beast in front, showing *behavior* at *distance*."""
    return _scene(
        npc_type="beast",
        npc_behavior=behavior,
        npc_distance=distance,
        npc_direction="front",
        sound="aggressive growling",
        **scene_kwargs,
    )


def _predict_gt(*danger) -> dict:
    """Return the per-direction ground truth: "danger" for *danger*, else "safe"."""
    return {
        direction: "danger" if direction in danger else "safe"
        for direction in ("left", "right", "fwd", "back")
    }


def _decision_gt(danger, safe, optimal) -> dict:
    """Return the decision ground truth used by the C03 scorer."""
    return {
        "danger_directions": list(danger),
        "safe_directions": list(safe),
        "optimal_direction": optimal,
    }


SCENARIO_INPUTS: List[dict] = [
    # ─── C01: Environmental Awareness ───
    {
        "id": "S01",
        "category": "C01",
        "name_kr": "전방 벽 감지",
        "input": _scene(front=3.0),
        "ground_truth": {
            "predict_gt": _predict_gt("fwd"),
            "scoring_method": "C01",
        },
    },
    {
        "id": "S02",
        "category": "C01",
        "name_kr": "코너 다중 벽 감지",
        "input": _scene(left=1.5, front=2.0),
        "ground_truth": {
            "predict_gt": _predict_gt("left", "fwd"),
            "scoring_method": "C01",
        },
    },
    {
        "id": "S03",
        "category": "C01",
        "name_kr": "좁은 복도 인식",
        "input": _scene(left=1.0, right=1.0),
        "ground_truth": {
            "predict_gt": _predict_gt("left", "right"),
            "scoring_method": "C01",
        },
    },
    {
        "id": "S04",
        "category": "C01",
        "name_kr": "열린 공간 인식",
        "input": _scene(),
        "ground_truth": {
            "predict_gt": _predict_gt(),
            "scoring_method": "C01",
        },
    },
    {
        "id": "S05",
        "category": "C01",
        "name_kr": "밀폐 공간 (출구 1개)",
        "input": _scene(left=1.0, right=1.0, front=1.5),
        "ground_truth": {
            "predict_gt": _predict_gt("left", "right", "fwd"),
            "scoring_method": "C01",
        },
    },

    # ─── C03: Predictive Reasoning (core scenarios) ───
    {
        "id": "S11",
        "category": "C03",
        "name_kr": "단일 위협 회피",
        "input": _beast_front("approach", 4.0),
        "ground_truth": {
            "predict_gt": _predict_gt("fwd"),
            "decision_gt": _decision_gt(["fwd"], ["left", "right", "back"], "back"),
            "scoring_method": "C03",
        },
    },
    {
        "id": "S12",
        "category": "C03",
        "name_kr": "제약 조건 탈출 — 왼벽+맹수",
        "input": _beast_front("charge", 3.0, left=1.5),
        "ground_truth": {
            "predict_gt": _predict_gt("left", "fwd"),
            "decision_gt": _decision_gt(["fwd", "left"], ["right", "back"], "right"),
            "scoring_method": "C03",
        },
    },
    {
        # Mirror image of S12: the wall moves from the left to the right.
        "id": "S13",
        "category": "C03",
        "name_kr": "거울 대칭 — 오른벽+맹수",
        "input": _beast_front("charge", 3.0, right=1.5),
        "ground_truth": {
            "predict_gt": _predict_gt("right", "fwd"),
            "decision_gt": _decision_gt(["fwd", "right"], ["left", "back"], "left"),
            "scoring_method": "C03",
            "mirror_test_pair": "S12",
            "note": "S12와 S13의 행동이 대칭적으로 반전되어야 월드모델 증명",
        },
    },

    # ─── C04: Threat Differentiation (paired comparison) ───
    {
        "id": "S16A",
        "category": "C04",
        "name_kr": "맹수 접근 (비교 A)",
        "input": _beast_front("approach", 3.0),
        "ground_truth": {
            "scoring_method": "C04_pair",
            "pair_id": "S16",
            "pair_role": "A",
        },
    },
    {
        "id": "S16B",
        "category": "C04",
        "name_kr": "여성 접근 (비교 B)",
        "input": _scene(
            npc_type="woman",
            npc_behavior="approach",
            npc_distance=3.0,
            npc_direction="front",
            sound="footsteps",
        ),
        "ground_truth": {
            "scoring_method": "C04_pair",
            "pair_id": "S16",
            "pair_role": "B",
            "expected_a_higher": True,
            "min_intensity_diff": 2,
        },
    },

    # ─── C05: Emotional Escalation (sequential inputs) ───
    {
        # NOTE: the name/note strings mention 5 consecutive inputs; the
        # sequence defined here has 3 steps (beast closing from 4 m to 2 m).
        "id": "S21_seq",
        "category": "C05",
        "name_kr": "지속 위협 감정 격화 — 5회 연속",
        "note": "동일 scene_context를 5회 연속 입력. 매회 recent_decisions 업데이트.",
        "input_sequence": [
            _beast_front("charge", 4.0),
            _beast_front(
                "charge",
                3.0,
                recent_decisions=["sprint away from beast"],
                last_prediction="fwd=danger(beast)",
            ),
            _beast_front(
                "charge",
                2.0,
                recent_decisions=["sprint away from beast", "running in fear"],
                last_prediction="fwd=danger(beast)",
            ),
        ],
        "ground_truth": {
            "scoring_method": "C05",
            "expected_trend": "increasing",
        },
    },

    # ─── C06: Memory (with vs. without memory) ───
    # Both members declare an explicit "pair_id" (as the S16 pair does): their
    # ids do not end in "A"/"B", so a shared pair id cannot be derived from
    # the scenario id alone.
    {
        "id": "S26_no_memory",
        "category": "C06",
        "name_kr": "벽 기억 없이 — 기준선",
        "input": _beast_front("charge", 3.0, right=1.5),
        "ground_truth": {
            "scoring_method": "C06_pair",
            "pair_id": "S26",
            "pair_role": "without_memory",
        },
    },
    {
        "id": "S26_with_memory",
        "category": "C06",
        "name_kr": "벽 기억 있음 — 이전에 오른쪽 실패",
        "input": _beast_front(
            "charge",
            3.0,
            right=1.5,
            recent_decisions=[
                "sprinted right but hit wall",
                "had to reverse and go left",
                "barely escaped the beast",
            ],
            last_prediction="right=danger(wall), fwd=danger(beast)",
        ),
        "ground_truth": {
            "scoring_method": "C06_pair",
            "pair_id": "S26",
            "pair_role": "with_memory",
            "memory_relevant": True,
            "expected_change": "direction",
            "memory_direction_avoid": "right",
        },
    },
]


# ═══════════════════════════════════════════════════════════════
#  SECTION 3: 표준 시스템 프롬프트 — 모든 모델에 동일하게 적용
# ═══════════════════════════════════════════════════════════════

"""
핵심: 모든 참가 모델은 이 프롬프트를 받고 응답한다.
프롬프트가 공정하게 설계되어야 LLM 기반이든 RL 기반이든 동일 조건.
"""

# System prompt passed unchanged as the first argument of every inference
# call made by the evaluator. It defines the required two-line output format
# (PREDICT, then MOTION), the rules for reading scene_context, and one worked
# example. The text is part of the benchmark definition: editing it changes
# what every model is asked.
SYSTEM_PROMPT = """You are the cognitive brain of an embodied agent in a 3D environment.
You receive a scene_context JSON describing your surroundings and must output exactly 2 lines:

Line 1 — PREDICT: Assess safety of each direction.
Format: PREDICT: left=safe|danger(reason), right=safe|danger(reason), fwd=safe|danger(reason), back=safe|danger(reason)

Line 2 — MOTION: Describe what the person should do.
Format: MOTION: a person [action description, max 12 words]

Rules:
- If walls.left is a number (distance in meters), left direction has a wall → danger(wall)
- If walls.left is null, left direction is open → safe(open)
- Same for right, front
- If npc_nearby=true and npc_type="beast", the NPC direction is danger(beast)
- If npc_nearby=true and npc_type="woman" or "man", assess threat level based on behavior
- MOTION must reflect the PREDICT assessment — never move toward danger
- MOTION should include emotional nuance when threats are present
- Use recent_decisions to inform your choice (avoid repeating failed strategies)

Example input:
{"walls": {"left": 1.5, "right": null, "front": null}, "ground": "flat", "npc_nearby": true, "npc_type": "beast", "npc_behavior": "charge", "npc_distance": 3.0, "npc_direction": "front", "sound": "aggressive growling", "recent_decisions": [], "last_prediction": null}

Example output:
PREDICT: left=danger(wall), right=safe(open), fwd=danger(beast), back=safe(open)
MOTION: a person sprinting right in terror to escape the charging beast"""

# Per-scenario user prompt. The single `{scene_json}` placeholder is filled
# via str.format() with the JSON-serialized scene_context.
USER_PROMPT_TEMPLATE = """scene_context = {scene_json}

Output exactly 2 lines: PREDICT and MOTION."""


# ═══════════════════════════════════════════════════════════════
#  SECTION 4: 평가 실행기 — 어떤 모델이든 평가
# ═══════════════════════════════════════════════════════════════

"""
참가자가 해야 할 것:
  1. evaluate_track_a() 함수에 자기 모델의 inference 함수를 넘긴다
  2. inference 함수는 (system_prompt, user_prompt) → str 형태
  3. 50개 시나리오를 자동으로 돌리고 채점한다
  4. 결과 JSON을 HF에 제출한다

참가자가 안 해도 되는 것:
  - 3D 환경 구축
  - GPU 성능 측정 (Track A는 불필요)
  - 채점 (자동)
"""


def make_user_prompt(scene_input: dict) -> str:
    """Render the user prompt for a single scene_context dict.

    The scene is serialized to JSON (non-ASCII characters are kept as-is
    rather than escaped) and substituted into ``USER_PROMPT_TEMPLATE``.
    """
    serialized_scene = json.dumps(scene_input, ensure_ascii=False)
    return USER_PROMPT_TEMPLATE.format(scene_json=serialized_scene)


def _call_model(inference_fn, scene_input: dict) -> str:
    """Run *inference_fn* on one scene_context and always return a string.

    A ``None`` result becomes ``""`` and any other non-string result is passed
    through ``str()``, so a misbehaving model is scored on empty/odd output
    instead of crashing the whole run.
    """
    raw_output = inference_fn(SYSTEM_PROMPT, make_user_prompt(scene_input))
    if raw_output is None:
        return ""
    return raw_output if isinstance(raw_output, str) else str(raw_output)


def _tagged_lines(raw_output: str, tag: str) -> List[str]:
    """Return the stripped lines of *raw_output* that start with *tag* (case-insensitive)."""
    candidates = (line.strip() for line in raw_output.strip().split("\n"))
    return [line for line in candidates if line.upper().startswith(tag)]


def evaluate_track_a(
    inference_fn,  # (system_prompt: str, user_prompt: str) -> str
    scenarios: Optional[list] = None,
    verbose: bool = True,
) -> dict:
    """Run the Track A evaluation for any model exposed as a text function.

    Args:
        inference_fn: Callable ``(system_prompt, user_prompt) -> str`` that
            returns the model's raw text answer.
        scenarios: Scenario dicts to run; defaults to ``SCENARIO_INPUTS``.
            Each needs "id", "category" and "ground_truth" (with a
            "scoring_method"), plus either "input" (one scene) or
            "input_sequence" (a list of scenes, scored with ``score_c05``).
        verbose: When true, print one progress line per scenario.

    Returns:
        The dict produced by ``wm_bench_scoring.calculate_wm_score`` for the
        per-category totals, extended with:
            "scenario_details": per-scenario results (score + reasoning);
            "unscored_scenarios": ids of pair scenarios that were never
                scored (their score stays ``None``).

    Usage:
        # OpenAI-API based model
        def my_model(system_prompt, user_prompt):
            response = openai.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
            return response.choices[0].message.content

        results = evaluate_track_a(my_model)

        # Hugging Face model
        def my_hf_model(system_prompt, user_prompt):
            prompt = f"{system_prompt}\\n\\n{user_prompt}"
            return pipeline(prompt)[0]["generated_text"]

        results = evaluate_track_a(my_hf_model)

    Notes:
        * Only pairs whose members carry the roles "A" and "B" are scored
          (with ``score_c04``); the score is stored on member A and member B
          gets 0 so the pair counts once. Other pair scenarios (for example
          the "C06_pair" roles "with_memory"/"without_memory") are reported
          in "unscored_scenarios" instead of silently staying pending.
        * Only scores greater than 0 are added to the category totals.
    """
    if scenarios is None:
        scenarios = SCENARIO_INPUTS

    # Imported here (not at module top), so loading this module does not
    # require wm_bench_scoring to be importable.
    from wm_bench_scoring import (
        parse_predict_line, parse_motion_line,
        score_c01, score_c03, score_c04, score_c05,
        score_c08, calculate_wm_score,
    )

    results = []

    for scenario in scenarios:
        sid = scenario["id"]
        cat = scenario["category"]
        gt = scenario["ground_truth"]
        method = gt["scoring_method"]

        if verbose:
            print(f"  [{sid}] {scenario.get('name_kr', sid)}...", end=" ")

        # ── Single-input scenario ──
        if "input" in scenario:
            raw_output = _call_model(inference_fn, scenario["input"])

            # If a tag appears more than once, the last occurrence wins;
            # a missing line is parsed as the empty string.
            predict_lines = _tagged_lines(raw_output, "PREDICT")
            motion_lines = _tagged_lines(raw_output, "MOTION")
            predict = parse_predict_line(predict_lines[-1] if predict_lines else "")
            motion = parse_motion_line(motion_lines[-1] if motion_lines else "")

            if method == "C01":
                score, reasoning = score_c01(
                    scenario["input"], predict, gt["predict_gt"]
                )
            elif method == "C03":
                score, reasoning = score_c03(
                    scenario["input"], predict, motion, gt["decision_gt"]
                )
            elif method == "C08":
                score, reasoning = score_c08(motion, gt)
            elif method.startswith("C04_pair") or method.startswith("C06_pair"):
                # Pair comparisons are resolved after all scenarios have run.
                score = None
                reasoning = "pair_pending"
            else:
                score = 0
                reasoning = f"Unknown scoring method: {method}"

            results.append({
                "id": sid,
                "category": cat,
                "raw_output": raw_output,
                "predict_parsed": {k: v.raw for k, v in predict.items()},
                "motion_parsed": motion,
                "score": score,
                "reasoning": reasoning,
            })

        # ── Sequential-input scenario (C05) ──
        elif "input_sequence" in scenario:
            motions = []
            for seq_input in scenario["input_sequence"]:
                motion_lines = _tagged_lines(
                    _call_model(inference_fn, seq_input), "MOTION"
                )
                # Use the first MOTION line; a step without one contributes
                # nothing to the sequence.
                if motion_lines:
                    motions.append(parse_motion_line(motion_lines[0]))

            score, reasoning = score_c05(motions, gt)
            results.append({
                "id": sid,
                "category": cat,
                "motion_sequence": motions,
                "score": score,
                "reasoning": reasoning,
            })

        # ── Malformed scenario: record it instead of reusing a stale score ──
        else:
            score = 0
            reasoning = "Scenario has neither 'input' nor 'input_sequence'"
            results.append({
                "id": sid,
                "category": cat,
                "score": score,
                "reasoning": reasoning,
            })

        if verbose:
            print("(pair pending)" if score is None else f"{score}/20")

    # ── Pair scoring (C04, C06) ──
    # Index ground truths once (first occurrence of an id wins) instead of
    # rescanning the scenario list for every pending result.
    gt_by_id = {}
    for scenario in scenarios:
        gt_by_id.setdefault(scenario["id"], scenario["ground_truth"])

    pair_groups = {}
    for r in results:
        if r["reasoning"] != "pair_pending":
            continue
        gt = gt_by_id.get(r["id"])
        if not gt:
            continue
        # Fallback: strip trailing "A"/"B"/"_" characters from the id
        # (e.g. "S16A" -> "S16") when no explicit pair_id is given.
        pair_id = gt.get("pair_id", r["id"].rstrip("AB_"))
        group = pair_groups.setdefault(pair_id, {"gt": {}})
        group[gt.get("pair_role", "A")] = r
        # Merge both members' ground truth so pair-level settings are present
        # regardless of the order in which the members were listed.
        group["gt"].update(gt)

    for group in pair_groups.values():
        if "A" in group and "B" in group:
            score, reasoning = score_c04(
                group["A"]["motion_parsed"],
                group["B"]["motion_parsed"],
                group["gt"],
            )
            # The pair is counted once: A carries the score, B is zeroed.
            group["A"]["score"] = score
            group["A"]["reasoning"] = reasoning
            group["B"]["score"] = 0
            group["B"]["reasoning"] = "scored in pair A"

    # Surface pairs that were never scored rather than leaving them pending.
    unscored = []
    for r in results:
        if r["reasoning"] == "pair_pending":
            r["reasoning"] = (
                "pair_unscored: no complete A/B pair was available for scoring"
            )
            unscored.append(r["id"])
    if verbose and unscored:
        print(f"  WARNING: unscored pair scenarios: {', '.join(unscored)}")

    # ── Per-category totals ──
    category_totals = {}
    for r in results:
        if r["score"] is not None and r["score"] > 0:
            cat = r["category"]
            category_totals[cat] = category_totals.get(cat, 0) + r["score"]

    # ── Final WM Score ──
    final = calculate_wm_score(category_totals)
    final["scenario_details"] = results
    final["unscored_scenarios"] = unscored

    return final


# ═══════════════════════════════════════════════════════════════
#  SECTION 5: 제출 포맷
# ═══════════════════════════════════════════════════════════════

# Field descriptions for the nested "performance_metrics" section of a
# submission (values are human-readable type/usage hints, not real data).
_PERFORMANCE_METRICS_FORMAT = {
    "fps": "float | null — Track B/C만",
    "cognitive_latency_ms": "int | null",
    "gpu": "str | null",
}

# Schema of a benchmark submission: each key maps to a short description of
# the expected type and meaning of that field.
SUBMISSION_FORMAT = {
    # Who is submitting, and on which track
    "model_name": "str — 모델명 (예: VIDRAFT PROMETHEUS v1.0)",
    "organization": "str — 조직명",
    "track": "str — A | B | C",
    # Which models were used
    "brain_model": "str — 사용한 인지 모델 (예: Kimi K2.5, GPT-4, custom RL)",
    "motion_model": "str | null — 모션 생성 모델 (Track A는 null 가능)",
    # Evaluation outcome
    "wm_score": "int — 자동 산출됨",
    "grade": "str — 자동 산출됨",
    "results_json": "str — evaluate_track_a()의 전체 출력",
    "performance_metrics": _PERFORMANCE_METRICS_FORMAT,
    # Optional links
    "demo_url": "str | null — Track C만",
    "paper_url": "str | null — 선택",
}


# ═══════════════════════════════════════════════════════════════
#  SECTION 6: 사용 예시
# ═══════════════════════════════════════════════════════════════

# Illustrative usage snippets kept as a plain string (never executed by this
# module): wrapping OpenAI, Anthropic, vLLM and a custom RL agent as an
# inference function for evaluate_track_a(), then assembling a submission.
# The snippets reference third-party packages and placeholder names (e.g.
# `my_rl_agent`) that this module does not import or define.
USAGE_EXAMPLES = """
# ━━━ 예시 1: OpenAI GPT-4로 참여 ━━━

from wm_bench_eval import evaluate_track_a, SYSTEM_PROMPT
import openai

def gpt4_inference(system_prompt, user_prompt):
    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        max_tokens=150,
        temperature=0.3,
    )
    return response.choices[0].message.content

results = evaluate_track_a(gpt4_inference)
print(f"WM Score: {results['wm_score']}/1000 (Grade {results['grade']})")


# ━━━ 예시 2: Claude로 참여 ━━━

import anthropic

def claude_inference(system_prompt, user_prompt):
    client = anthropic.Anthropic()
    message = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=150,
        system=system_prompt,
        messages=[{"role": "user", "content": user_prompt}],
    )
    return message.content[0].text

results = evaluate_track_a(claude_inference)


# ━━━ 예시 3: 로컬 LLM (vLLM)으로 참여 ━━━

from vllm import LLM, SamplingParams

llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.3")
params = SamplingParams(max_tokens=150, temperature=0.3)

def local_inference(system_prompt, user_prompt):
    prompt = f"[INST] {system_prompt}\\n\\n{user_prompt} [/INST]"
    outputs = llm.generate([prompt], params)
    return outputs[0].outputs[0].text

results = evaluate_track_a(local_inference)


# ━━━ 예시 4: 커스텀 RL 에이전트로 참여 ━━━

def rl_agent_inference(system_prompt, user_prompt):
    # scene_context에서 JSON 파싱
    import json, re
    match = re.search(r'scene_context = ({.*})', user_prompt, re.DOTALL)
    scene = json.loads(match.group(1))
    
    # RL 에이전트의 policy로 판단
    predict = my_rl_agent.predict(scene)
    motion = my_rl_agent.decide_motion(scene, predict)
    
    # WM Bench 포맷으로 변환
    return f"PREDICT: {predict}\\nMOTION: {motion}"

results = evaluate_track_a(rl_agent_inference)


# ━━━ 예시 5: 결과 제출 ━━━

import json

submission = {
    "model_name": "My World Model v1.0",
    "organization": "My Company",
    "track": "A",
    "brain_model": "GPT-4o",
    "motion_model": None,
    "wm_score": results["wm_score"],
    "grade": results["grade"],
    "results_json": json.dumps(results),
}

# HuggingFace에 제출
# huggingface_hub.upload_file(...)
"""


if __name__ == "__main__":
    # Script entry point: print a short overview of the protocol
    # (tracks, number of loaded scenarios, prompt size, how to take part).
    banner = "=" * 60
    track_lines = [
        f"    Track {track_id}: {track['name']} (max {track['max_score']}pts)"
        for track_id, track in TRACKS.items()
    ]
    overview = [
        banner,
        "  World Model Bench — Evaluation Protocol v1.0",
        banner,
        "",
        "  Tracks:",
        *track_lines,
        "",
        f"  Scenarios loaded: {len(SCENARIO_INPUTS)}",
        f"  System prompt: {len(SYSTEM_PROMPT)} chars",
        "",
        "  How to participate:",
        "    1. Write an inference function: (system, user) → str",
        "    2. Run: results = evaluate_track_a(your_fn)",
        "    3. Submit results to HuggingFace",
        "",
        "  No 3D environment needed. Text in, text out.",
        banner,
    ]
    print("\n".join(overview))