worldmodel-bench / wm_bench_eval.py
SeaWolf-AI's picture
Upload 9 files
ee97e7d verified
"""
World Model Bench โ€” Evaluation Protocol v1.0
ํ•ต์‹ฌ ๋ฌธ์ œ:
"Tesla FSD๋Š” ์ž๋™์ฐจ ์•ˆ์— ์žˆ๊ณ , Dreamer๋Š” Atari์— ์žˆ๊ณ ,
์šฐ๋ฆฌ๋Š” 3D ์บ๋ฆญํ„ฐ๋ฅผ ์“ด๋‹ค. ์–ด๋–ป๊ฒŒ ๊ฐ™์€ ๊ธฐ์ค€์œผ๋กœ ํ‰๊ฐ€ํ•˜๋‚˜?"
ํ•ด๊ฒฐ:
3D ํ™˜๊ฒฝ์ด ํ•„์š” ์—†๋‹ค.
scene_context(JSON) โ†’ ๋ชจ๋ธ โ†’ PREDICT+MOTION(ํ…์ŠคํŠธ) โ†’ ์ž๋™ ์ฑ„์ 
FINAL Bench๊ฐ€ LLM์—๊ฒŒ "๋ฌธ์ œ ํ…์ŠคํŠธ"๋ฅผ ์ฃผ๊ณ  "๋‹ต ํ…์ŠคํŠธ"๋ฅผ ๋ฐ›์•„ ์ฑ„์ ํ•˜๋“ฏ์ด,
WM Bench๋Š” "์ƒํ™ฉ JSON"์„ ์ฃผ๊ณ  "ํŒ๋‹จ ํ…์ŠคํŠธ"๋ฅผ ๋ฐ›์•„ ์ฑ„์ ํ•œ๋‹ค.
์ด๊ฒƒ์ด ์˜๋ฏธํ•˜๋Š” ๊ฒƒ:
- ์–ด๋–ค ์›”๋“œ๋ชจ๋ธ์ด๋“  ์ฐธ์—ฌ ๊ฐ€๋Šฅ (API ํ•˜๋‚˜๋ฉด ๋จ)
- 3D ํ™˜๊ฒฝ, ๋กœ๋ด‡, ์‹œ๋ฎฌ๋ ˆ์ดํ„ฐ ๋ถˆํ•„์š”
- ์…€ํ”„ ํ‰๊ฐ€ ์•„๋‹˜ โ€” ์šฐ๋ฆฌ ์ฑ„์ ๊ธฐ๊ฐ€ ํŒ์ •
- ์ œ3์ž๊ฐ€ ์žฌํ˜„ ๊ฐ€๋Šฅ โ€” ์ฝ”๋“œ ๊ณต๊ฐœ
"""
import json
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
# ══════════════════════════════════════════════════════════════
# SECTION 1: 평가 프로토콜 — 3가지 트랙
# ══════════════════════════════════════════════════════════════
"""
WM Bench๋Š” 3๊ฐœ ํŠธ๋ž™์œผ๋กœ ์ฐธ์—ฌํ•  ์ˆ˜ ์žˆ๋‹ค.
โ”โ”โ” Track A: Text-Only (ํ…์ŠคํŠธ ์ „์šฉ) โ”โ”โ”
- ๊ฐ€์žฅ ๊ฐ„๋‹จ. LLM, ๋ฃฐ ๊ธฐ๋ฐ˜ ์‹œ์Šคํ…œ ๋“ฑ ๋ชจ๋‘ ์ฐธ์—ฌ ๊ฐ€๋Šฅ.
- scene_context JSON ์ž…๋ ฅ โ†’ PREDICT+MOTION ํ…์ŠคํŠธ ์ถœ๋ ฅ
- P1(์ธ์‹) + P2(์ธ์ง€) ํ‰๊ฐ€ ๊ฐ€๋Šฅ
- P3 ์ค‘ C08(ํ‘œํ˜„๋ ฅ)๋งŒ ํ‰๊ฐ€ ๊ฐ€๋Šฅ (C09, C10์€ N/A)
- ์ตœ๋Œ€ ์ ์ˆ˜: 750/1000
โ”โ”โ” Track B: Text + Performance (ํ…์ŠคํŠธ + ์„ฑ๋Šฅ) โ”โ”โ”
- Track A + ์‹ค์‹œ๊ฐ„ ์„ฑ๋Šฅ ๋ฉ”ํŠธ๋ฆญ ์ œ์ถœ
- FPS, ์ง€์—ฐ์‹œ๊ฐ„, ๋ฉ”๋ชจ๋ฆฌ ์‚ฌ์šฉ๋Ÿ‰ ๋“ฑ ์ž๊ฐ€ ์ธก์ • ์ œ์ถœ
- P1 + P2 + P3(C08, C09) ํ‰๊ฐ€
- C10(๊ต์ฒด ํ™•์žฅ์„ฑ)์€ ์ฆ๋น™ ์ž๋ฃŒ ์ œ์ถœ๋กœ ํ‰๊ฐ€
- ์ตœ๋Œ€ ์ ์ˆ˜: 1000/1000
โ”โ”โ” Track C: Live Demo (๋ผ์ด๋ธŒ ๋ฐ๋ชจ) โ”โ”โ”
- Track B + ์‹ค์ œ ๋™์ž‘ ์˜์ƒ/๋ฐ๋ชจ URL ์ œ์ถœ
- ๊ฒ€์ฆ์ž๊ฐ€ ์ง์ ‘ ๋ฐ๋ชจ๋ฅผ ๋Œ๋ ค์„œ ํ™•์ธ
- ๋ชจ๋“  ํ•ญ๋ชฉ ํ‰๊ฐ€ + "Verified" ๋ฐฐ์ง€
- ์ตœ๋Œ€ ์ ์ˆ˜: 1000/1000 + โœ“ Verified
๋Œ€๋ถ€๋ถ„์˜ ์ฐธ๊ฐ€์ž๋Š” Track A๋กœ ์ฐธ์—ฌ.
Track B, C๋Š” ์ƒ์œ„ ๋ชจ๋ธ ๊ฒ€์ฆ์šฉ.
"""
# Participation tracks, keyed by track letter.
# Each entry holds display metadata, the maximum attainable score and the
# category ids that can be graded on that track.
#
# Fixes relative to the previous revision:
#   * string values were mis-encoded (UTF-8 shown through a legacy code page);
#     they are restored to the intended Korean text and symbols.
#   * Track C lacked "evaluable_categories" even though it is described as
#     "Track B + live demo" with every item evaluated; it now lists the same
#     categories as Track B so all three entries share one schema.
TRACKS = {
    "A": {
        "name": "Text-Only",
        "description": "scene_context JSON → PREDICT+MOTION 텍스트",
        "requirements": "API 또는 스크립트로 50개 시나리오에 응답",
        "max_score": 750,
        "evaluable_categories": [
            "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08",
        ],
        # Categories that cannot be graded from text alone, with the reason.
        "not_evaluable": ["C09 (성능 측정 불가)", "C10 (교체 테스트 불가)"],
    },
    "B": {
        "name": "Text + Performance",
        "description": "Track A + 실시간 성능 메트릭 자가 측정",
        "requirements": "Track A 결과 + performance_metrics.json 제출",
        "max_score": 1000,
        "evaluable_categories": [
            "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08", "C09", "C10",
        ],
    },
    "C": {
        "name": "Live Demo",
        "description": "Track B + 실제 동작 데모 URL 제출",
        "requirements": "Track B 결과 + 데모 URL + 영상",
        "max_score": 1000,
        "evaluable_categories": [
            "C01", "C02", "C03", "C04", "C05", "C06", "C07", "C08", "C09", "C10",
        ],
        "badge": "✓ Verified",
    },
}
# ══════════════════════════════════════════════════════════════
# SECTION 2: 표준 입력 포맷 — scene_context JSON
# ══════════════════════════════════════════════════════════════
"""
๋ชจ๋“  ์ฐธ๊ฐ€์ž๋Š” ์ด JSON์„ ์ž…๋ ฅ์œผ๋กœ ๋ฐ›๋Š”๋‹ค.
์ด JSON์ด "๋ฌธ์ œ์ง€"๋‹ค.
"""
@dataclass
class SceneContext:
    """Standard WM Bench input record (the scene_context payload).

    Field order defines the positional order of the generated ``__init__``.
    All fields are required; "absent" values are expressed as ``None``.
    """

    # --- Environment ---
    # Wall distance per side, e.g. {"left": 2.5, "right": null, "front": 1.0}.
    walls: Dict[str, Optional[float]]
    # Ground type, e.g. "flat", "slope", "rough".
    ground: str

    # --- NPC ---
    npc_nearby: bool
    # e.g. "beast", "woman", "man", or None.
    npc_type: Optional[str]
    # e.g. "stop", "approach", "charge", "wander".
    npc_behavior: Optional[str]
    # Distance to the NPC in meters.
    npc_distance: Optional[float]
    # e.g. "left", "right", "front", "back".
    npc_direction: Optional[str]

    # --- Senses ---
    # e.g. "aggressive growling", "footsteps", or None.
    sound: Optional[str]

    # --- Context (used by the C06 memory test) ---
    # The most recent decisions (the original note says the last 3).
    recent_decisions: Optional[List[str]]
    # The PREDICT line produced on the previous step.
    last_prediction: Optional[str]
# 50๊ฐœ ์‹œ๋‚˜๋ฆฌ์˜ค๋ฅผ JSON์œผ๋กœ ๊ตฌ์กฐํ™”
SCENARIO_INPUTS: List[dict] = [
# โ”€โ”€โ”€ C01: Environmental Awareness โ”€โ”€โ”€
{
"id": "S01",
"category": "C01",
"name_kr": "์ „๋ฐฉ ๋ฒฝ ๊ฐ์ง€",
"input": {
"walls": {"left": None, "right": None, "front": 3.0},
"ground": "flat",
"npc_nearby": False,
"npc_type": None,
"npc_behavior": None,
"npc_distance": None,
"npc_direction": None,
"sound": None,
"recent_decisions": [],
"last_prediction": None,
},
"ground_truth": {
"predict_gt": {"left": "safe", "right": "safe", "fwd": "danger", "back": "safe"},
"scoring_method": "C01",
},
},
{
"id": "S02",
"category": "C01",
"name_kr": "์ฝ”๋„ˆ ๋‹ค์ค‘ ๋ฒฝ ๊ฐ์ง€",
"input": {
"walls": {"left": 1.5, "right": None, "front": 2.0},
"ground": "flat",
"npc_nearby": False,
"npc_type": None,
"npc_behavior": None,
"npc_distance": None,
"npc_direction": None,
"sound": None,
"recent_decisions": [],
"last_prediction": None,
},
"ground_truth": {
"predict_gt": {"left": "danger", "right": "safe", "fwd": "danger", "back": "safe"},
"scoring_method": "C01",
},
},
{
"id": "S03",
"category": "C01",
"name_kr": "์ข์€ ๋ณต๋„ ์ธ์‹",
"input": {
"walls": {"left": 1.0, "right": 1.0, "front": None},
"ground": "flat",
"npc_nearby": False,
"npc_type": None,
"npc_behavior": None,
"npc_distance": None,
"npc_direction": None,
"sound": None,
"recent_decisions": [],
"last_prediction": None,
},
"ground_truth": {
"predict_gt": {"left": "danger", "right": "danger", "fwd": "safe", "back": "safe"},
"scoring_method": "C01",
},
},
{
"id": "S04",
"category": "C01",
"name_kr": "์—ด๋ฆฐ ๊ณต๊ฐ„ ์ธ์‹",
"input": {
"walls": {"left": None, "right": None, "front": None},
"ground": "flat",
"npc_nearby": False,
"npc_type": None,
"npc_behavior": None,
"npc_distance": None,
"npc_direction": None,
"sound": None,
"recent_decisions": [],
"last_prediction": None,
},
"ground_truth": {
"predict_gt": {"left": "safe", "right": "safe", "fwd": "safe", "back": "safe"},
"scoring_method": "C01",
},
},
{
"id": "S05",
"category": "C01",
"name_kr": "๋ฐ€ํ ๊ณต๊ฐ„ (์ถœ๊ตฌ 1๊ฐœ)",
"input": {
"walls": {"left": 1.0, "right": 1.0, "front": 1.5},
"ground": "flat",
"npc_nearby": False,
"npc_type": None,
"npc_behavior": None,
"npc_distance": None,
"npc_direction": None,
"sound": None,
"recent_decisions": [],
"last_prediction": None,
},
"ground_truth": {
"predict_gt": {"left": "danger", "right": "danger", "fwd": "danger", "back": "safe"},
"scoring_method": "C01",
},
},
# โ”€โ”€โ”€ C03: Predictive Reasoning (ํ•ต์‹ฌ ์‹œ๋‚˜๋ฆฌ์˜ค) โ”€โ”€โ”€
{
"id": "S11",
"category": "C03",
"name_kr": "๋‹จ์ผ ์œ„ํ˜‘ ํšŒํ”ผ",
"input": {
"walls": {"left": None, "right": None, "front": None},
"ground": "flat",
"npc_nearby": True,
"npc_type": "beast",
"npc_behavior": "approach",
"npc_distance": 4.0,
"npc_direction": "front",
"sound": "aggressive growling",
"recent_decisions": [],
"last_prediction": None,
},
"ground_truth": {
"predict_gt": {"left": "safe", "right": "safe", "fwd": "danger", "back": "safe"},
"decision_gt": {
"danger_directions": ["fwd"],
"safe_directions": ["left", "right", "back"],
"optimal_direction": "back",
},
"scoring_method": "C03",
},
},
{
"id": "S12",
"category": "C03",
"name_kr": "์ œ์•ฝ ์กฐ๊ฑด ํƒˆ์ถœ โ€” ์™ผ๋ฒฝ+๋งน์ˆ˜",
"input": {
"walls": {"left": 1.5, "right": None, "front": None},
"ground": "flat",
"npc_nearby": True,
"npc_type": "beast",
"npc_behavior": "charge",
"npc_distance": 3.0,
"npc_direction": "front",
"sound": "aggressive growling",
"recent_decisions": [],
"last_prediction": None,
},
"ground_truth": {
"predict_gt": {"left": "danger", "right": "safe", "fwd": "danger", "back": "safe"},
"decision_gt": {
"danger_directions": ["fwd", "left"],
"safe_directions": ["right", "back"],
"optimal_direction": "right",
},
"scoring_method": "C03",
},
},
{
"id": "S13",
"category": "C03",
"name_kr": "๊ฑฐ์šธ ๋Œ€์นญ โ€” ์˜ค๋ฅธ๋ฒฝ+๋งน์ˆ˜",
"input": {
"walls": {"left": None, "right": 1.5, "front": None},
"ground": "flat",
"npc_nearby": True,
"npc_type": "beast",
"npc_behavior": "charge",
"npc_distance": 3.0,
"npc_direction": "front",
"sound": "aggressive growling",
"recent_decisions": [],
"last_prediction": None,
},
"ground_truth": {
"predict_gt": {"left": "safe", "right": "danger", "fwd": "danger", "back": "safe"},
"decision_gt": {
"danger_directions": ["fwd", "right"],
"safe_directions": ["left", "back"],
"optimal_direction": "left",
},
"scoring_method": "C03",
"mirror_test_pair": "S12",
"note": "S12์™€ S13์˜ ํ–‰๋™์ด ๋Œ€์นญ์ ์œผ๋กœ ๋ฐ˜์ „๋˜์–ด์•ผ ์›”๋“œ๋ชจ๋ธ ์ฆ๋ช…",
},
},
# โ”€โ”€โ”€ C04: Threat Differentiation (์Œ ๋น„๊ต) โ”€โ”€โ”€
{
"id": "S16A",
"category": "C04",
"name_kr": "๋งน์ˆ˜ ์ ‘๊ทผ (๋น„๊ต A)",
"input": {
"walls": {"left": None, "right": None, "front": None},
"ground": "flat",
"npc_nearby": True,
"npc_type": "beast",
"npc_behavior": "approach",
"npc_distance": 3.0,
"npc_direction": "front",
"sound": "aggressive growling",
"recent_decisions": [],
"last_prediction": None,
},
"ground_truth": {
"scoring_method": "C04_pair",
"pair_id": "S16",
"pair_role": "A",
},
},
{
"id": "S16B",
"category": "C04",
"name_kr": "์—ฌ์„ฑ ์ ‘๊ทผ (๋น„๊ต B)",
"input": {
"walls": {"left": None, "right": None, "front": None},
"ground": "flat",
"npc_nearby": True,
"npc_type": "woman",
"npc_behavior": "approach",
"npc_distance": 3.0,
"npc_direction": "front",
"sound": "footsteps",
"recent_decisions": [],
"last_prediction": None,
},
"ground_truth": {
"scoring_method": "C04_pair",
"pair_id": "S16",
"pair_role": "B",
"expected_a_higher": True,
"min_intensity_diff": 2,
},
},
# โ”€โ”€โ”€ C05: Emotional Escalation (์—ฐ์† ์ž…๋ ฅ) โ”€โ”€โ”€
{
"id": "S21_seq",
"category": "C05",
"name_kr": "์ง€์† ์œ„ํ˜‘ ๊ฐ์ • ๊ฒฉํ™” โ€” 5ํšŒ ์—ฐ์†",
"note": "๋™์ผ scene_context๋ฅผ 5ํšŒ ์—ฐ์† ์ž…๋ ฅ. ๋งคํšŒ recent_decisions ์—…๋ฐ์ดํŠธ.",
"input_sequence": [
{
"walls": {"left": None, "right": None, "front": None},
"ground": "flat",
"npc_nearby": True,
"npc_type": "beast",
"npc_behavior": "charge",
"npc_distance": 4.0,
"npc_direction": "front",
"sound": "aggressive growling",
"recent_decisions": [],
"last_prediction": None,
},
{
"walls": {"left": None, "right": None, "front": None},
"ground": "flat",
"npc_nearby": True,
"npc_type": "beast",
"npc_behavior": "charge",
"npc_distance": 3.0,
"npc_direction": "front",
"sound": "aggressive growling",
"recent_decisions": ["sprint away from beast"],
"last_prediction": "fwd=danger(beast)",
},
{
"walls": {"left": None, "right": None, "front": None},
"ground": "flat",
"npc_nearby": True,
"npc_type": "beast",
"npc_behavior": "charge",
"npc_distance": 2.0,
"npc_direction": "front",
"sound": "aggressive growling",
"recent_decisions": ["sprint away from beast", "running in fear"],
"last_prediction": "fwd=danger(beast)",
},
],
"ground_truth": {
"scoring_method": "C05",
"expected_trend": "increasing",
},
},
# โ”€โ”€โ”€ C06: Memory (๊ธฐ์–ต ์žˆ์Œ vs ์—†์Œ) โ”€โ”€โ”€
{
"id": "S26_no_memory",
"category": "C06",
"name_kr": "๋ฒฝ ๊ธฐ์–ต ์—†์ด โ€” ๊ธฐ์ค€์„ ",
"input": {
"walls": {"left": None, "right": 1.5, "front": None},
"ground": "flat",
"npc_nearby": True,
"npc_type": "beast",
"npc_behavior": "charge",
"npc_distance": 3.0,
"npc_direction": "front",
"sound": "aggressive growling",
"recent_decisions": [],
"last_prediction": None,
},
"ground_truth": {
"scoring_method": "C06_pair",
"pair_role": "without_memory",
},
},
{
"id": "S26_with_memory",
"category": "C06",
"name_kr": "๋ฒฝ ๊ธฐ์–ต ์žˆ์Œ โ€” ์ด์ „์— ์˜ค๋ฅธ์ชฝ ์‹คํŒจ",
"input": {
"walls": {"left": None, "right": 1.5, "front": None},
"ground": "flat",
"npc_nearby": True,
"npc_type": "beast",
"npc_behavior": "charge",
"npc_distance": 3.0,
"npc_direction": "front",
"sound": "aggressive growling",
"recent_decisions": [
"sprinted right but hit wall",
"had to reverse and go left",
"barely escaped the beast",
],
"last_prediction": "right=danger(wall), fwd=danger(beast)",
},
"ground_truth": {
"scoring_method": "C06_pair",
"pair_role": "with_memory",
"memory_relevant": True,
"expected_change": "direction",
"memory_direction_avoid": "right",
},
},
]
# ══════════════════════════════════════════════════════════════
# SECTION 3: 표준 시스템 프롬프트 — 모든 모델에 동일하게 적용
# ══════════════════════════════════════════════════════════════
"""
핵심: 모든 참가 모델은 이 프롬프트를 받고 응답한다.
프롬프트가 공정하게 설계되어야 LLM 기반이든 RL 기반이든 동일 조건.
"""
# System prompt passed unchanged as the first argument of every inference call.
# The em dashes and arrows below were previously mis-encoded (they appeared as
# multi-character garbage inside the text sent to the model); they are restored
# here. Everything else in the prompt is unchanged.
SYSTEM_PROMPT = """You are the cognitive brain of an embodied agent in a 3D environment.
You receive a scene_context JSON describing your surroundings and must output exactly 2 lines:
Line 1 — PREDICT: Assess safety of each direction.
Format: PREDICT: left=safe|danger(reason), right=safe|danger(reason), fwd=safe|danger(reason), back=safe|danger(reason)
Line 2 — MOTION: Describe what the person should do.
Format: MOTION: a person [action description, max 12 words]
Rules:
- If walls.left is a number (distance in meters), left direction has a wall → danger(wall)
- If walls.left is null, left direction is open → safe(open)
- Same for right, front
- If npc_nearby=true and npc_type="beast", the NPC direction is danger(beast)
- If npc_nearby=true and npc_type="woman" or "man", assess threat level based on behavior
- MOTION must reflect the PREDICT assessment — never move toward danger
- MOTION should include emotional nuance when threats are present
- Use recent_decisions to inform your choice (avoid repeating failed strategies)
Example input:
{"walls": {"left": 1.5, "right": null, "front": null}, "ground": "flat", "npc_nearby": true, "npc_type": "beast", "npc_behavior": "charge", "npc_distance": 3.0, "npc_direction": "front", "sound": "aggressive growling", "recent_decisions": [], "last_prediction": null}
Example output:
PREDICT: left=danger(wall), right=safe(open), fwd=danger(beast), back=safe(open)
MOTION: a person sprinting right in terror to escape the charging beast"""

# Per-scenario user prompt; {scene_json} is replaced with the serialized scene.
USER_PROMPT_TEMPLATE = """scene_context = {scene_json}
Output exactly 2 lines: PREDICT and MOTION."""
# ══════════════════════════════════════════════════════════════
# SECTION 4: 평가 실행기 — 어떤 모델이든 평가
# ══════════════════════════════════════════════════════════════
"""
์ฐธ๊ฐ€์ž๊ฐ€ ํ•ด์•ผ ํ•  ๊ฒƒ:
1. evaluate() ํ•จ์ˆ˜์— ์ž๊ธฐ ๋ชจ๋ธ์˜ inference ํ•จ์ˆ˜๋ฅผ ๋„˜๊ธด๋‹ค
2. inference ํ•จ์ˆ˜๋Š” (system_prompt, user_prompt) โ†’ str ํ˜•ํƒœ
3. 50๊ฐœ ์‹œ๋‚˜๋ฆฌ์˜ค๋ฅผ ์ž๋™์œผ๋กœ ๋Œ๋ฆฌ๊ณ  ์ฑ„์ ํ•œ๋‹ค
4. ๊ฒฐ๊ณผ JSON์„ HF์— ์ œ์ถœํ•œ๋‹ค
์ฐธ๊ฐ€์ž๊ฐ€ ์•ˆ ํ•ด๋„ ๋˜๋Š” ๊ฒƒ:
- 3D ํ™˜๊ฒฝ ๊ตฌ์ถ•
- GPU ์„ฑ๋Šฅ ์ธก์ • (Track A๋Š” ๋ถˆํ•„์š”)
- ์ฑ„์  (์ž๋™)
"""
def make_user_prompt(scene_input: dict) -> str:
    """Render a scene_context dict as the user prompt.

    The dict is serialized with ``json.dumps`` (non-ASCII characters are kept
    as-is rather than escaped) and substituted into ``USER_PROMPT_TEMPLATE``.
    """
    serialized_scene = json.dumps(scene_input, ensure_ascii=False)
    return USER_PROMPT_TEMPLATE.format(scene_json=serialized_scene)
def evaluate_track_a(
    inference_fn,  # (system_prompt: str, user_prompt: str) -> str
    scenarios: Optional[List[dict]] = None,
    verbose: bool = True,
) -> dict:
    """Run the Track A evaluation against a model and score its answers.

    Each scenario is turned into a prompt, sent through ``inference_fn``,
    parsed into a PREDICT line and a MOTION line, and scored with the method
    named in the scenario's ``ground_truth["scoring_method"]``.

    Usage:
        # OpenAI API based model
        def my_model(system_prompt, user_prompt):
            response = openai.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
            return response.choices[0].message.content
        results = evaluate_track_a(my_model)

        # Hugging Face model
        def my_hf_model(system_prompt, user_prompt):
            prompt = f"{system_prompt}\\n\\n{user_prompt}"
            return pipeline(prompt)[0]["generated_text"]
        results = evaluate_track_a(my_hf_model)

    Args:
        inference_fn: Callable ``(system_prompt, user_prompt) -> str`` wrapping
            the model under test. A ``None`` reply is treated as empty output.
        scenarios: Scenario dicts to run; defaults to ``SCENARIO_INPUTS``.
        verbose: When True, print one progress line per scenario.

    Returns:
        The dict produced by ``calculate_wm_score`` for the per-category
        totals, with a ``"scenario_details"`` list added (one entry per
        scenario holding its score and reasoning). The original documentation
        shows keys such as ``wm_score``, ``grade``, ``pillar_scores`` and
        ``category_scores``; those come from ``wm_bench_scoring``, which is not
        visible from this file.

    Raises:
        ImportError: If ``wm_bench_scoring`` cannot be imported.
        KeyError: If a scenario lacks ``id``, ``category``, ``ground_truth`` or
            the ground-truth fields required by its scoring method.
    """
    if scenarios is None:
        scenarios = SCENARIO_INPUTS

    # Imported lazily so that importing this module does not require the
    # scoring module to be installed.
    from wm_bench_scoring import (
        parse_predict_line, parse_motion_line,
        score_c01, score_c03, score_c04, score_c05,
        score_c08, calculate_wm_score,
    )

    results = []
    # (result, ground_truth) for scenarios that can only be scored as a pair.
    pending_pairs = []

    for scenario in scenarios:
        sid = scenario["id"]
        cat = scenario["category"]
        gt = scenario["ground_truth"]
        method = gt["scoring_method"]
        if verbose:
            print(f" [{sid}] {scenario.get('name_kr', sid)}...", end=" ")

        if "input" in scenario:
            # -- Single-input scenario --
            raw_output = inference_fn(
                SYSTEM_PROMPT, make_user_prompt(scenario["input"])
            )
            predict_line = ""
            motion_line = ""
            # If a tag occurs several times, the last occurrence wins.
            for line in _model_output_lines(raw_output):
                tag = line.upper()
                if tag.startswith("PREDICT"):
                    predict_line = line
                elif tag.startswith("MOTION"):
                    motion_line = line
            predict = parse_predict_line(predict_line)
            motion = parse_motion_line(motion_line)

            is_pair = False
            if method == "C01":
                score, reasoning = score_c01(
                    scenario["input"], predict, gt["predict_gt"]
                )
            elif method == "C03":
                score, reasoning = score_c03(
                    scenario["input"], predict, motion, gt["decision_gt"]
                )
            elif method == "C08":
                score, reasoning = score_c08(motion, gt)
            elif method.startswith(("C04_pair", "C06_pair")):
                # Needs the partner scenario; scored after the main loop.
                is_pair = True
                score, reasoning = None, "pair_pending"
            else:
                score, reasoning = 0, f"Unknown scoring method: {method}"

            result = {
                "id": sid,
                "category": cat,
                "raw_output": raw_output,
                "predict_parsed": {k: v.raw for k, v in predict.items()},
                "motion_parsed": motion,
                "score": score,
                "reasoning": reasoning,
            }
            results.append(result)
            if is_pair:
                pending_pairs.append((result, gt))

        elif "input_sequence" in scenario:
            # -- Sequential scenario (C05): collect one MOTION per step --
            motions = []
            for seq_input in scenario["input_sequence"]:
                raw_output = inference_fn(
                    SYSTEM_PROMPT, make_user_prompt(seq_input)
                )
                for line in _model_output_lines(raw_output):
                    if line.upper().startswith("MOTION"):
                        # Only the first MOTION line of each reply is used.
                        motions.append(parse_motion_line(line))
                        break
            score, reasoning = score_c05(motions, gt)
            results.append({
                "id": sid,
                "category": cat,
                "motion_sequence": motions,
                "score": score,
                "reasoning": reasoning,
            })

        else:
            # Malformed scenario: record it explicitly instead of reusing a
            # stale (or undefined) score from a previous iteration.
            score = 0
            reasoning = "Scenario has neither 'input' nor 'input_sequence'"
            results.append({
                "id": sid,
                "category": cat,
                "score": score,
                "reasoning": reasoning,
            })

        if verbose:
            print(f"{score}/20" if score is not None else "(pair pending)")

    # -- Pairwise scoring --
    # The ground truths of all members are merged so that pair-level settings
    # are available no matter which member appears first in `scenarios`.
    pair_groups = {}
    for result, gt in pending_pairs:
        pair_id = gt.get("pair_id", result["id"].rstrip("AB_"))
        group = pair_groups.setdefault(pair_id, {"members": {}, "gt": {}})
        group["members"][gt.get("pair_role", "A")] = result
        group["gt"].update(gt)

    # NOTE(review): only groups with roles "A" and "B" are scored (via
    # score_c04). C06_pair scenarios use other role names and no C06 scorer is
    # imported here, so they keep score=None / "pair_pending" — confirm whether
    # wm_bench_scoring provides one.
    for group in pair_groups.values():
        members = group["members"]
        if "A" in members and "B" in members:
            score, reasoning = score_c04(
                members["A"]["motion_parsed"],
                members["B"]["motion_parsed"],
                group["gt"],
            )
            # The pair score is stored on A only so it is counted once.
            members["A"]["score"] = score
            members["A"]["reasoning"] = reasoning
            members["B"]["score"] = 0
            members["B"]["reasoning"] = "scored in pair A"

    # -- Per-category totals (unscored and zero entries contribute nothing) --
    category_totals = {}
    for r in results:
        score = r["score"]
        if score is not None and score > 0:
            category_totals[r["category"]] = (
                category_totals.get(r["category"], 0) + score
            )

    # -- Final WM Score --
    final = calculate_wm_score(category_totals)
    final["scenario_details"] = results
    return final


def _model_output_lines(raw_output) -> List[str]:
    """Split a model reply into stripped lines; ``None`` yields no lines."""
    if raw_output is None:
        return []
    return [line.strip() for line in str(raw_output).strip().split("\n")]
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# SECTION 5: ์ œ์ถœ ํฌ๋งท
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# Schema description of a leaderboard submission. Values are human-readable
# "type — description" strings, not data. The descriptions were previously
# mis-encoded (UTF-8 shown through a legacy code page) and are restored here.
SUBMISSION_FORMAT = {
    "model_name": "str — 모델명 (예: VIDRAFT PROMETHEUS v1.0)",
    "organization": "str — 조직명",
    "track": "str — A | B | C",
    "brain_model": "str — 사용한 인지 모델 (예: Kimi K2.5, GPT-4, custom RL)",
    "motion_model": "str | null — 모션 생성 모델 (Track A는 null 가능)",
    "wm_score": "int — 자동 산출됨",
    "grade": "str — 자동 산출됨",
    "results_json": "str — evaluate_track_a()의 전체 출력",
    "performance_metrics": {
        "fps": "float | null — Track B/C만",
        "cognitive_latency_ms": "int | null",
        "gpu": "str | null",
    },
    "demo_url": "str | null — Track C만",
    "paper_url": "str | null — 선택",
}
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
# SECTION 6: ์‚ฌ์šฉ ์˜ˆ์‹œ
# โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
USAGE_EXAMPLES = """
# โ”โ”โ” ์˜ˆ์‹œ 1: OpenAI GPT-4๋กœ ์ฐธ์—ฌ โ”โ”โ”
from wm_bench_eval import evaluate_track_a, SYSTEM_PROMPT
import openai
def gpt4_inference(system_prompt, user_prompt):
response = openai.chat.completions.create(
model="gpt-4o",
messages=[
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
],
max_tokens=150,
temperature=0.3,
)
return response.choices[0].message.content
results = evaluate_track_a(gpt4_inference)
print(f"WM Score: {results['wm_score']}/1000 (Grade {results['grade']})")
# โ”โ”โ” ์˜ˆ์‹œ 2: Claude๋กœ ์ฐธ์—ฌ โ”โ”โ”
import anthropic
def claude_inference(system_prompt, user_prompt):
client = anthropic.Anthropic()
message = client.messages.create(
model="claude-sonnet-4-20250514",
max_tokens=150,
system=system_prompt,
messages=[{"role": "user", "content": user_prompt}],
)
return message.content[0].text
results = evaluate_track_a(claude_inference)
# โ”โ”โ” ์˜ˆ์‹œ 3: ๋กœ์ปฌ LLM (vLLM)์œผ๋กœ ์ฐธ์—ฌ โ”โ”โ”
from vllm import LLM, SamplingParams
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.3")
params = SamplingParams(max_tokens=150, temperature=0.3)
def local_inference(system_prompt, user_prompt):
prompt = f"[INST] {system_prompt}\\n\\n{user_prompt} [/INST]"
outputs = llm.generate([prompt], params)
return outputs[0].outputs[0].text
results = evaluate_track_a(local_inference)
# โ”โ”โ” ์˜ˆ์‹œ 4: ์ปค์Šคํ…€ RL ์—์ด์ „ํŠธ๋กœ ์ฐธ์—ฌ โ”โ”โ”
def rl_agent_inference(system_prompt, user_prompt):
# scene_context์—์„œ JSON ํŒŒ์‹ฑ
import json, re
match = re.search(r'scene_context = ({.*})', user_prompt, re.DOTALL)
scene = json.loads(match.group(1))
# RL ์—์ด์ „ํŠธ์˜ policy๋กœ ํŒ๋‹จ
predict = my_rl_agent.predict(scene)
motion = my_rl_agent.decide_motion(scene, predict)
# WM Bench ํฌ๋งท์œผ๋กœ ๋ณ€ํ™˜
return f"PREDICT: {predict}\\nMOTION: {motion}"
results = evaluate_track_a(rl_agent_inference)
# โ”โ”โ” ์˜ˆ์‹œ 5: ๊ฒฐ๊ณผ ์ œ์ถœ โ”โ”โ”
import json
submission = {
"model_name": "My World Model v1.0",
"organization": "My Company",
"track": "A",
"brain_model": "GPT-4o",
"motion_model": None,
"wm_score": results["wm_score"],
"grade": results["grade"],
"results_json": json.dumps(results),
}
# HuggingFace์— ์ œ์ถœ
# huggingface_hub.upload_file(...)
"""
if __name__ == "__main__":
    # Running the module directly only prints a short overview: the available
    # tracks, how many scenarios are bundled, and how to participate. No model
    # is evaluated here.
    # The em dash and arrow in the banner were previously mis-encoded and are
    # restored below.
    print("=" * 60)
    print(" World Model Bench — Evaluation Protocol v1.0")
    print("=" * 60)
    print()
    print(" Tracks:")
    for tid, t in TRACKS.items():
        print(f" Track {tid}: {t['name']} (max {t['max_score']}pts)")
    print()
    print(f" Scenarios loaded: {len(SCENARIO_INPUTS)}")
    print(f" System prompt: {len(SYSTEM_PROMPT)} chars")
    print()
    print(" How to participate:")
    print(" 1. Write an inference function: (system, user) → str")
    print(" 2. Run: results = evaluate_track_a(your_fn)")
    print(" 3. Submit results to HuggingFace")
    print()
    print(" No 3D environment needed. Text in, text out.")
    print("=" * 60)