diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,271 +1,139 @@ -import json +# app.py import math -import hashlib -from dataclasses import dataclass, asdict +import json +import base64 +import random +from dataclasses import dataclass, asdict, field from typing import Dict, List, Tuple, Optional, Any import numpy as np -from PIL import Image, ImageDraw - -import matplotlib.pyplot as plt -from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas +from PIL import Image, ImageDraw, ImageFont import gradio as gr # ============================================================ -# ZEN AgentLab — Agent POV + Multi-Agent Mini-Sim Arena -# -# Fix included: -# - Matplotlib rendering uses FigureCanvas.buffer_rgba() (HF-safe) -# -# Features: -# - Deterministic gridworld + first-person raycast POV -# - Multiple environments (Chase / CoopVault / MiniCiv) -# - Click-to-edit tiles + pickups + hazards + simple combat tags -# - Full step trace: obs -> action -> reward -> (optional) Q-update -# - Branching timelines (rewind + fork) -# - Batch training (tabular Q-learning) + metrics dashboard -# - Export/import full runs + SHA256 proof hash +# ZEN AgentLab++ — Animated Multi-Map Agent Simulation Arena +# ============================================================ +# Goals: +# - Working, automated "autoplay" simulation (timer-driven) +# - Multiple gameplay types (Predator/Prey "Pacman", CTF, Treasure, Resource Raid) +# - Multiple maps/courses (hand-crafted + procedural mazes) +# - "Cool" UI/UX + animations in the actual environment: +# * Smooth animated top-down via SVG + CSS transitions (browser-side animation) +# * Optional pseudo-3D "POV" panel (simple raycast look) +# * Mini objective HUD + event highlights +# - Fully self-contained: just this app.py + requirements.txt # -# HF Spaces compatible: no timers, no fn_kwargs +# NOTE: The SVG renderer is the secret weapon: +# It updates positions each tick; CSS transitions animate movement smoothly +# without generating tons of frames server-side. # ============================================================ # ----------------------------- -# Global config (shared) +# Grid + Render Config # ----------------------------- -GRID_W, GRID_H = 21, 15 -TILE = 22 +GRID_W, GRID_H = 29, 19 # map resolution (tiles) +TILE = 24 # pixels per tile for SVG +HUD_H = 64 # HUD header height (px) +SVG_W = GRID_W * TILE +SVG_H = GRID_H * TILE + HUD_H -VIEW_W, VIEW_H = 640, 360 -RAY_W = 320 -FOV_DEG = 78 -MAX_DEPTH = 20 +VIEW_W, VIEW_H = 560, 315 # pseudo-3D POV panel +FOV_DEG = 74 +MAX_DEPTH = 22 DIRS = [(1, 0), (0, 1), (-1, 0), (0, -1)] ORI_DEG = [0, 90, 180, 270] +# ----------------------------- # Tiles +# ----------------------------- EMPTY = 0 WALL = 1 -FOOD = 2 -NOISE = 3 -DOOR = 4 -TELE = 5 -KEY = 6 -EXIT = 7 -ARTIFACT = 8 -HAZARD = 9 -WOOD = 10 -ORE = 11 -MEDKIT = 12 -SWITCH = 13 -BASE = 14 +PELLET = 2 +POWER = 3 +FLAG_A = 4 +FLAG_B = 5 +TREASURE = 6 +BASE_A = 7 +BASE_B = 8 +RESOURCE = 9 +HAZARD = 10 +GATE = 11 TILE_NAMES = { EMPTY: "Empty", WALL: "Wall", - FOOD: "Food", - NOISE: "Noise", - DOOR: "Door", - TELE: "Teleporter", - KEY: "Key", - EXIT: "Exit", - ARTIFACT: "Artifact", + PELLET: "Pellet", + POWER: "Power", + FLAG_A: "Flag A", + FLAG_B: "Flag B", + TREASURE: "Treasure", + BASE_A: "Base A", + BASE_B: "Base B", + RESOURCE: "Resource", HAZARD: "Hazard", - WOOD: "Wood", - ORE: "Ore", - MEDKIT: "Medkit", - SWITCH: "Switch", - BASE: "Base", + GATE: "Gate", } +# Palette (kept consistent / readable) +COL_BG = "#0b1020" +COL_PANEL = "#0f1733" +COL_GRIDLINE = "#121a3b" +COL_WALL = "#cdd2e6" +COL_EMPTY = "#19214a" +COL_PELLET = "#ffd17a" +COL_POWER = "#ff7ad9" +COL_FLAG_A = "#7affc8" +COL_FLAG_B = "#ff7a7a" +COL_TREASURE = "#ffb86b" +COL_BASE_A = "#a0ffd9" +COL_BASE_B = "#ffb0b0" +COL_RESOURCE = "#9ab0ff" +COL_HAZARD = "#ff3b3b" +COL_GATE = "#7ad9ff" + AGENT_COLORS = { - "Predator": (255, 120, 90), - "Prey": (120, 255, 160), - "Scout": (120, 190, 255), - "Alpha": (255, 205, 120), - "Bravo": (160, 210, 255), - "Guardian": (255, 120, 220), - "BuilderA": (140, 255, 200), - "BuilderB": (160, 200, 255), - "Raider": (255, 160, 120), + "Predator": "#ff6d6d", + "Prey": "#6dffb0", + "Ghost1": "#ff7ad9", + "Ghost2": "#7ad9ff", + "RunnerA": "#ffd17a", + "RunnerB": "#9ab0ff", + "GuardA": "#7affc8", + "GuardB": "#ffb0b0", + "MinerA": "#a0ffd9", + "MinerB": "#c7d2fe", + "Raider": "#ff9b6b", } -SKY = np.array([14, 16, 26], dtype=np.uint8) -FLOOR_NEAR = np.array([24, 26, 40], dtype=np.uint8) -FLOOR_FAR = np.array([10, 11, 18], dtype=np.uint8) -WALL_BASE = np.array([210, 210, 225], dtype=np.uint8) -WALL_SIDE = np.array([150, 150, 170], dtype=np.uint8) -DOOR_COL = np.array([140, 210, 255], dtype=np.uint8) - -# Small action space for tabular stability -ACTIONS = ["L", "F", "R", "I"] # interact - # ----------------------------- -# Deterministic RNG +# Utility # ----------------------------- -def rng_for(seed: int, step: int, stream: int = 0) -> np.random.Generator: - mix = (seed * 1_000_003) ^ (step * 9_999_937) ^ (stream * 97_531) - return np.random.default_rng(mix & 0xFFFFFFFFFFFFFFFF) - -# ----------------------------- -# Data structures -# ----------------------------- -@dataclass -class Agent: - name: str - x: int - y: int - ori: int - hp: int = 10 - energy: int = 100 - team: str = "A" - brain: str = "q" # q | heuristic | random - inventory: Dict[str, int] = None - - def __post_init__(self): - if self.inventory is None: - self.inventory = {} - -@dataclass -class TrainConfig: - use_q: bool = True - alpha: float = 0.15 - gamma: float = 0.95 - epsilon: float = 0.10 - epsilon_min: float = 0.02 - epsilon_decay: float = 0.995 - - # generic shaping - step_penalty: float = -0.01 - explore_reward: float = 0.015 - damage_penalty: float = -0.20 - heal_reward: float = 0.10 - - # chase - chase_close_coeff: float = 0.03 - chase_catch_reward: float = 3.0 - chase_escaped_reward: float = 0.2 - chase_caught_penalty: float = -3.0 - food_reward: float = 0.6 - - # vault - artifact_pick_reward: float = 1.2 - exit_win_reward: float = 3.0 - guardian_tag_reward: float = 2.0 - tagged_penalty: float = -2.0 - switch_reward: float = 0.8 - key_reward: float = 0.4 - - # civ - resource_pick_reward: float = 0.15 - deposit_reward: float = 0.4 - base_progress_win_reward: float = 3.5 - raider_elim_reward: float = 2.0 - builder_elim_penalty: float = -2.0 - -@dataclass -class GlobalMetrics: - episodes: int = 0 - wins_teamA: int = 0 - wins_teamB: int = 0 - draws: int = 0 - avg_steps: float = 0.0 - rolling_winrate_A: float = 0.0 - epsilon: float = 0.10 - last_outcome: str = "init" - last_steps: int = 0 - -@dataclass -class EpisodeMetrics: - steps: int = 0 - returns: Dict[str, float] = None - action_counts: Dict[str, Dict[str, int]] = None - tiles_discovered: Dict[str, int] = None - - def __post_init__(self): - if self.returns is None: - self.returns = {} - if self.action_counts is None: - self.action_counts = {} - if self.tiles_discovered is None: - self.tiles_discovered = {} - -@dataclass -class WorldState: - seed: int - step: int - env_key: str - grid: List[List[int]] - agents: Dict[str, Agent] - - controlled: str - pov: str - overlay: bool - - done: bool - outcome: str # A_win | B_win | draw | ongoing - - # env state - door_opened_global: bool = False - base_progress: int = 0 - base_target: int = 10 - - # instrumentation - event_log: List[str] = None - trace_log: List[str] = None - - # learning - cfg: TrainConfig = None - q_tables: Dict[str, Dict[str, List[float]]] = None - gmetrics: GlobalMetrics = None - emetrics: EpisodeMetrics = None - - def __post_init__(self): - if self.event_log is None: - self.event_log = [] - if self.trace_log is None: - self.trace_log = [] - if self.cfg is None: - self.cfg = TrainConfig() - if self.q_tables is None: - self.q_tables = {} - if self.gmetrics is None: - self.gmetrics = GlobalMetrics(epsilon=self.cfg.epsilon) - if self.emetrics is None: - self.emetrics = EpisodeMetrics() - -@dataclass -class Snapshot: - branch: str - step: int - env_key: str - grid: List[List[int]] - agents: Dict[str, Dict[str, Any]] - done: bool - outcome: str - door_opened_global: bool - base_progress: int - base_target: int - event_tail: List[str] - trace_tail: List[str] - emetrics: Dict[str, Any] +def clamp(v, lo, hi): + return lo if v < lo else hi if v > hi else v -# ----------------------------- -# Helpers -# ----------------------------- def in_bounds(x: int, y: int) -> bool: return 0 <= x < GRID_W and 0 <= y < GRID_H -def is_blocking(tile: int, door_open: bool = False) -> bool: - if tile == WALL: - return True - if tile == DOOR and not door_open: - return True - return False +def manhattan(a: Tuple[int, int], b: Tuple[int, int]) -> int: + return abs(a[0] - b[0]) + abs(a[1] - b[1]) + +def rng(seed: int) -> random.Random: + r = random.Random() + r.seed(seed & 0xFFFFFFFF) + return r -def manhattan_xy(ax: int, ay: int, bx: int, by: int) -> int: - return abs(ax - bx) + abs(ay - by) +def grid_copy(g: List[List[int]]) -> List[List[int]]: + return [row[:] for row in g] + +def find_all(g: List[List[int]], tile: int) -> List[Tuple[int, int]]: + out = [] + for y in range(GRID_H): + for x in range(GRID_W): + if g[y][x] == tile: + out.append((x, y)) + return out def bresenham_los(grid: List[List[int]], x0: int, y0: int, x1: int, y1: int) -> bool: dx = abs(x1 - x0) @@ -288,1465 +156,1507 @@ def bresenham_los(grid: List[List[int]], x0: int, y0: int, x1: int, y1: int) -> err += dx y += sy -def within_fov(observer: Agent, tx: int, ty: int, fov_deg: float = FOV_DEG) -> bool: - dx = tx - observer.x - dy = ty - observer.y +def within_fov(ax: int, ay: int, ori: int, tx: int, ty: int, fov_deg: float = FOV_DEG) -> bool: + dx = tx - ax + dy = ty - ay if dx == 0 and dy == 0: return True - angle = math.degrees(math.atan2(dy, dx)) % 360 - facing = ORI_DEG[observer.ori] - diff = (angle - facing + 540) % 360 - 180 + ang = (math.degrees(math.atan2(dy, dx)) % 360) + facing = ORI_DEG[ori] + diff = (ang - facing + 540) % 360 - 180 return abs(diff) <= (fov_deg / 2) -def visible(state: WorldState, observer: Agent, target: Agent) -> bool: - if not within_fov(observer, target.x, target.y, FOV_DEG): - return False - return bresenham_los(state.grid, observer.x, observer.y, target.x, target.y) - -def hash_sha256(txt: str) -> str: - return hashlib.sha256(txt.encode("utf-8")).hexdigest() - # ----------------------------- -# Beliefs / fog-of-war +# Data Models # ----------------------------- -def init_beliefs(agent_names: List[str]) -> Dict[str, np.ndarray]: - return {nm: (-1 * np.ones((GRID_H, GRID_W), dtype=np.int16)) for nm in agent_names} +@dataclass +class Agent: + name: str + team: str + x: int + y: int + ori: int = 0 + hp: int = 5 + energy: int = 200 + inventory: Dict[str, int] = field(default_factory=dict) + mode: str = "auto" # auto | manual + brain: str = "heur" # heur | random -def update_belief_for_agent(state: WorldState, belief: np.ndarray, agent: Agent) -> int: - """Returns number of newly discovered tiles this update.""" - before_unknown = int(np.sum(belief == -1)) +@dataclass +class Objective: + title: str + detail: str - belief[agent.y, agent.x] = state.grid[agent.y][agent.x] - base = math.radians(ORI_DEG[agent.ori]) - half = math.radians(FOV_DEG / 2) - rays = 45 if agent.name.lower().startswith("scout") else 33 +@dataclass +class EnvSpec: + key: str + title: str + summary: str + max_steps: int - for i in range(rays): - t = i / (rays - 1) - ang = base + (t * 2 - 1) * half - sin_a, cos_a = math.sin(ang), math.cos(ang) - ox, oy = agent.x + 0.5, agent.y + 0.5 - depth = 0.0 - while depth < MAX_DEPTH: - depth += 0.2 - tx = int(ox + cos_a * depth) - ty = int(oy + sin_a * depth) - if not in_bounds(tx, ty): - break - belief[ty, tx] = state.grid[ty][tx] - tile = state.grid[ty][tx] - if tile == WALL: - break - if tile == DOOR and not state.door_opened_global: - break +@dataclass +class World: + seed: int + step: int + env_key: str + map_key: str - after_unknown = int(np.sum(belief == -1)) - return max(0, before_unknown - after_unknown) + grid: List[List[int]] + agents: Dict[str, Agent] -# ----------------------------- -# Rendering -# ----------------------------- -def raycast_view(state: WorldState, observer: Agent) -> np.ndarray: - img = np.zeros((VIEW_H, VIEW_W, 3), dtype=np.uint8) - img[:, :] = SKY + # gameplay flags / counters + done: bool = False + outcome: str = "ongoing" # A_win | B_win | draw | ongoing - for y in range(VIEW_H // 2, VIEW_H): - t = (y - VIEW_H // 2) / (VIEW_H // 2 + 1e-6) - col = (1 - t) * FLOOR_NEAR + t * FLOOR_FAR - img[y, :] = col.astype(np.uint8) + # pacman-style + power_timer: int = 0 + pellets_left: int = 0 - fov = math.radians(FOV_DEG) - half_fov = fov / 2 + # capture-the-flag + flag_carrier: Optional[str] = None + flag_taken_from: Optional[str] = None - for rx in range(RAY_W): - cam_x = (2 * rx / (RAY_W - 1)) - 1 - ray_ang = math.radians(ORI_DEG[observer.ori]) + cam_x * half_fov + # treasure run + treasure_collected_A: int = 0 + treasure_collected_B: int = 0 - ox, oy = observer.x + 0.5, observer.y + 0.5 - sin_a = math.sin(ray_ang) - cos_a = math.cos(ray_ang) + # resource raid + baseA_progress: int = 0 + baseB_progress: int = 0 + base_target: int = 10 - depth = 0.0 - hit = None # "wall" | "door" - side = 0 + # UX + controlled: str = "" + pov: str = "" + overlay: bool = True + auto_camera: bool = True - while depth < MAX_DEPTH: - depth += 0.05 - tx = int(ox + cos_a * depth) - ty = int(oy + sin_a * depth) - if not in_bounds(tx, ty): - break - tile = state.grid[ty][tx] - if tile == WALL: - hit = "wall" - side = 1 if abs(cos_a) > abs(sin_a) else 0 - break - if tile == DOOR and not state.door_opened_global: - hit = "door" - break - - if hit is None: - continue + # logs + events: List[str] = field(default_factory=list) - depth *= math.cos(ray_ang - math.radians(ORI_DEG[observer.ori])) - depth = max(depth, 0.001) +# ----------------------------- +# Maps / Courses +# ----------------------------- +def base_border_grid() -> List[List[int]]: + g = [[EMPTY for _ in range(GRID_W)] for _ in range(GRID_H)] + for x in range(GRID_W): + g[0][x] = WALL + g[GRID_H - 1][x] = WALL + for y in range(GRID_H): + g[y][0] = WALL + g[y][GRID_W - 1] = WALL + return g - proj_h = int((VIEW_H * 0.9) / depth) - y0 = max(0, VIEW_H // 2 - proj_h // 2) - y1 = min(VIEW_H - 1, VIEW_H // 2 + proj_h // 2) +def carve_maze(seed: int, density: float = 0.66) -> List[List[int]]: + """ + Procedural "course" generator: a DFS maze with a few open plazas. + We generate walls then carve corridors. This produces interesting navigation. + """ + r = rng(seed) + g = [[WALL for _ in range(GRID_W)] for _ in range(GRID_H)] - if hit == "door": - col = DOOR_COL.copy() - else: - col = WALL_BASE.copy() if side == 0 else WALL_SIDE.copy() + # keep borders solid + for y in range(GRID_H): + for x in range(GRID_W): + if x in (0, GRID_W - 1) or y in (0, GRID_H - 1): + g[y][x] = WALL + + # carve from odd cells + def neighbors(cx, cy): + dirs = [(2, 0), (-2, 0), (0, 2), (0, -2)] + r.shuffle(dirs) + for dx, dy in dirs: + nx, ny = cx + dx, cy + dy + if 1 <= nx < GRID_W - 1 and 1 <= ny < GRID_H - 1: + yield nx, ny, dx, dy + + start = (1 + 2 * (r.randint(0, (GRID_W - 3) // 2)), + 1 + 2 * (r.randint(0, (GRID_H - 3) // 2))) + stack = [start] + g[start[1]][start[0]] = EMPTY + + visited = set([start]) + while stack: + cx, cy = stack[-1] + moved = False + for nx, ny, dx, dy in neighbors(cx, cy): + if (nx, ny) in visited: + continue + visited.add((nx, ny)) + g[cy + dy // 2][cx + dx // 2] = EMPTY + g[ny][nx] = EMPTY + stack.append((nx, ny)) + moved = True + break + if not moved: + stack.pop() + + # open up some "plazas" based on density + plazas = int((1.0 - density) * 8) + 2 + for _ in range(plazas): + px = r.randint(3, GRID_W - 4) + py = r.randint(3, GRID_H - 4) + w = r.randint(2, 4) + h = r.randint(2, 3) + for yy in range(py - h, py + h + 1): + for xx in range(px - w, px + w + 1): + if 1 <= xx < GRID_W - 1 and 1 <= yy < GRID_H - 1: + g[yy][xx] = EMPTY - dim = max(0.25, 1.0 - (depth / MAX_DEPTH)) - col = (col * dim).astype(np.uint8) + return g - x0 = int(rx * (VIEW_W / RAY_W)) - x1 = int((rx + 1) * (VIEW_W / RAY_W)) - img[y0:y1, x0:x1] = col +def map_pac_chase(seed: int) -> List[List[int]]: + g = base_border_grid() + # iconic mid-wall with gates + for x in range(4, GRID_W - 4): + g[GRID_H // 2][x] = WALL + gate_x = GRID_W // 2 + g[GRID_H // 2][gate_x] = GATE + g[GRID_H // 2][gate_x - 1] = GATE + g[GRID_H // 2][gate_x + 1] = GATE + + # pellets everywhere open + for y in range(1, GRID_H - 1): + for x in range(1, GRID_W - 1): + if g[y][x] == EMPTY: + g[y][x] = PELLET + + # power pellets at corners + for (x, y) in [(2, 2), (GRID_W - 3, 2), (2, GRID_H - 3), (GRID_W - 3, GRID_H - 3)]: + g[y][x] = POWER + + # a few internal blocks + r = rng(seed) + for _ in range(26): + x = r.randint(2, GRID_W - 3) + y = r.randint(2, GRID_H - 3) + if g[y][x] in (PELLET, EMPTY): + g[y][x] = WALL - # billboards for visible agents - for nm, other in state.agents.items(): - if nm == observer.name or other.hp <= 0: - continue - if visible(state, observer, other): - dx = other.x - observer.x - dy = other.y - observer.y - ang = (math.degrees(math.atan2(dy, dx)) % 360) - facing = ORI_DEG[observer.ori] - diff = (ang - facing + 540) % 360 - 180 - sx = int((diff / (FOV_DEG / 2)) * (VIEW_W / 2) + (VIEW_W / 2)) - dist = math.sqrt(dx * dx + dy * dy) - h = int((VIEW_H * 0.65) / max(dist, 0.75)) - w = max(10, h // 3) - y_mid = VIEW_H // 2 - y0 = max(0, y_mid - h // 2) - y1 = min(VIEW_H - 1, y_mid + h // 2) - x0 = max(0, sx - w // 2) - x1 = min(VIEW_W - 1, sx + w // 2) - col = AGENT_COLORS.get(nm, (255, 200, 120)) - img[y0:y1, x0:x1] = np.array(col, dtype=np.uint8) - - if state.overlay: - cx, cy = VIEW_W // 2, VIEW_H // 2 - img[cy - 1:cy + 2, cx - 10:cx + 10] = np.array([120, 190, 255], dtype=np.uint8) - img[cy - 10:cy + 10, cx - 1:cx + 2] = np.array([120, 190, 255], dtype=np.uint8) + return g - return img +def map_ctf_arena(seed: int) -> List[List[int]]: + g = carve_maze(seed, density=0.60) + # clear some central arena + cx, cy = GRID_W // 2, GRID_H // 2 + for y in range(cy - 3, cy + 4): + for x in range(cx - 5, cx + 6): + if 1 <= x < GRID_W - 1 and 1 <= y < GRID_H - 1: + g[y][x] = EMPTY + + # flags + bases + g[2][2] = FLAG_A + g[GRID_H - 3][GRID_W - 3] = FLAG_B + g[2][GRID_W - 3] = BASE_A + g[GRID_H - 3][2] = BASE_B + + # hazards sprinkled + r = rng(seed + 11) + for _ in range(18): + x = r.randint(2, GRID_W - 3) + y = r.randint(2, GRID_H - 3) + if g[y][x] == EMPTY: + g[y][x] = HAZARD -def render_topdown(grid: np.ndarray, agents: Dict[str, Agent], title: str, show_agents: bool = True) -> Image.Image: - w = grid.shape[1] * TILE - h = grid.shape[0] * TILE - im = Image.new("RGB", (w, h + 28), (10, 12, 18)) - draw = ImageDraw.Draw(im) - - for y in range(grid.shape[0]): - for x in range(grid.shape[1]): - t = int(grid[y, x]) - if t == -1: - col = (18, 20, 32) - elif t == EMPTY: - col = (26, 30, 44) - elif t == WALL: - col = (190, 190, 210) - elif t == FOOD: - col = (255, 210, 120) - elif t == NOISE: - col = (255, 120, 220) - elif t == DOOR: - col = (140, 210, 255) - elif t == TELE: - col = (120, 190, 255) - elif t == KEY: - col = (255, 235, 160) - elif t == EXIT: - col = (120, 255, 220) - elif t == ARTIFACT: - col = (255, 170, 60) - elif t == HAZARD: - col = (255, 90, 90) - elif t == WOOD: - col = (170, 120, 60) - elif t == ORE: - col = (140, 140, 160) - elif t == MEDKIT: - col = (120, 255, 140) - elif t == SWITCH: - col = (200, 180, 255) - elif t == BASE: - col = (220, 220, 240) - else: - col = (80, 80, 90) + return g - x0, y0 = x * TILE, y * TILE + 28 - draw.rectangle([x0, y0, x0 + TILE - 1, y0 + TILE - 1], fill=col) +def map_treasure_run(seed: int) -> List[List[int]]: + g = carve_maze(seed, density=0.70) + # treasures + r = rng(seed + 7) + for _ in range(12): + x = r.randint(2, GRID_W - 3) + y = r.randint(2, GRID_H - 3) + if g[y][x] == EMPTY: + g[y][x] = TREASURE + # bases + g[2][2] = BASE_A + g[GRID_H - 3][GRID_W - 3] = BASE_B + return g - for x in range(grid.shape[1] + 1): - xx = x * TILE - draw.line([xx, 28, xx, h + 28], fill=(12, 14, 22)) - for y in range(grid.shape[0] + 1): - yy = y * TILE + 28 - draw.line([0, yy, w, yy], fill=(12, 14, 22)) +def map_resource_raid(seed: int) -> List[List[int]]: + g = carve_maze(seed, density=0.64) + # resource clusters + r = rng(seed + 23) + for _ in range(22): + x = r.randint(2, GRID_W - 3) + y = r.randint(2, GRID_H - 3) + if g[y][x] == EMPTY: + g[y][x] = RESOURCE + # bases + g[2][2] = BASE_A + g[GRID_H - 3][GRID_W - 3] = BASE_B + return g - if show_agents: - for nm, a in agents.items(): - if a.hp <= 0: - continue - cx = a.x * TILE + TILE // 2 - cy = a.y * TILE + 28 + TILE // 2 - col = AGENT_COLORS.get(nm, (220, 220, 220)) - r = TILE // 3 - draw.ellipse([cx - r, cy - r, cx + r, cy + r], fill=col) - dx, dy = DIRS[a.ori] - draw.line([cx, cy, cx + dx * r, cy + dy * r], fill=(10, 10, 10), width=3) - - draw.rectangle([0, 0, w, 28], fill=(14, 16, 26)) - draw.text((8, 6), title, fill=(230, 230, 240)) - return im +MAP_BUILDERS = { + "Classic Pac-Chase": map_pac_chase, + "CTF Maze Arena": map_ctf_arena, + "Treasure Labyrinth": map_treasure_run, + "Resource Raid Maze": map_resource_raid, + "Procedural Maze (General)": lambda seed: carve_maze(seed, density=0.62), +} # ----------------------------- -# Environments +# Environments (Gameplay Types) # ----------------------------- -def grid_with_border() -> List[List[int]]: - g = [[EMPTY for _ in range(GRID_W)] for _ in range(GRID_H)] - for x in range(GRID_W): - g[0][x] = WALL - g[GRID_H - 1][x] = WALL - for y in range(GRID_H): - g[y][0] = WALL - g[y][GRID_W - 1] = WALL - return g +ENVS: Dict[str, EnvSpec] = { + "pac_chase": EnvSpec( + key="pac_chase", + title="Predator/Prey (Pac-Chase)", + summary="Predator hunts Prey. Prey scores by eating pellets; power flips the chase temporarily.", + max_steps=650, + ), + "ctf": EnvSpec( + key="ctf", + title="Capture The Flag", + summary="Steal the opponent’s flag and return it to your base. Hazards drain HP.", + max_steps=800, + ), + "treasure": EnvSpec( + key="treasure", + title="Treasure Run", + summary="Collect treasures scattered in the maze and deposit at base. First to 6 deposits wins.", + max_steps=750, + ), + "resource": EnvSpec( + key="resource", + title="Resource Raid", + summary="Mine resources, deposit to build base progress. Raider tries to disrupt and tag.", + max_steps=850, + ), +} -def env_chase(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]: - g = grid_with_border() - for x in range(4, 17): - g[7][x] = WALL - g[7][10] = DOOR - - g[3][4] = FOOD - g[11][15] = FOOD - g[4][14] = NOISE - g[12][5] = NOISE - g[2][18] = TELE - g[13][2] = TELE - - agents = { - "Predator": Agent("Predator", 2, 2, 0, hp=10, energy=100, team="A", brain="q"), - "Prey": Agent("Prey", 18, 12, 2, hp=10, energy=100, team="B", brain="q"), - "Scout": Agent("Scout", 10, 3, 1, hp=10, energy=100, team="A", brain="heuristic"), - } - return g, agents - -def env_vault(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]: - g = grid_with_border() - for x in range(3, 18): - g[5][x] = WALL - for x in range(3, 18): - g[9][x] = WALL - g[5][10] = DOOR - g[9][12] = DOOR - - g[2][2] = KEY - g[12][18] = EXIT - g[12][2] = ARTIFACT - g[2][18] = TELE - g[13][2] = TELE - g[7][10] = SWITCH - g[3][15] = HAZARD - g[11][6] = MEDKIT - g[2][12] = FOOD - - agents = { - "Alpha": Agent("Alpha", 2, 12, 0, hp=10, energy=100, team="A", brain="q"), - "Bravo": Agent("Bravo", 3, 12, 0, hp=10, energy=100, team="A", brain="q"), - "Guardian": Agent("Guardian", 18, 2, 2, hp=10, energy=100, team="B", brain="q"), - } - return g, agents - -def env_civ(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]: - g = grid_with_border() - for y in range(3, 12): - g[y][9] = WALL - g[7][9] = DOOR - - g[2][3] = WOOD - g[3][3] = WOOD - g[4][3] = WOOD - g[12][16] = ORE - g[11][16] = ORE - g[10][16] = ORE - g[6][4] = FOOD - g[8][15] = FOOD - - g[13][10] = BASE - g[4][15] = HAZARD - g[10][4] = HAZARD - g[2][18] = TELE - g[13][2] = TELE - g[2][2] = KEY - g[12][6] = SWITCH - - agents = { - "BuilderA": Agent("BuilderA", 3, 12, 0, hp=10, energy=100, team="A", brain="q"), - "BuilderB": Agent("BuilderB", 4, 12, 0, hp=10, energy=100, team="A", brain="q"), - "Raider": Agent("Raider", 18, 2, 2, hp=10, energy=100, team="B", brain="q"), - } - return g, agents - -ENV_BUILDERS = {"chase": env_chase, "vault": env_vault, "civ": env_civ} +def env_objectives(env_key: str) -> List[Objective]: + if env_key == "pac_chase": + return [ + Objective("Prey", "Eat pellets (+) and survive. Power pellet makes Predator vulnerable temporarily."), + Objective("Predator", "Catch the Prey (tag on same tile). Avoid chasing into power windows."), + ] + if env_key == "ctf": + return [ + Objective("Team A", "Grab Flag B and return to Base A."), + Objective("Team B", "Grab Flag A and return to Base B."), + ] + if env_key == "treasure": + return [ + Objective("Both Teams", "Collect Treasures and deposit at your Base. First to 6 deposits wins."), + ] + if env_key == "resource": + return [ + Objective("Builders (A & B)", "Collect Resources and deposit to raise base progress."), + Objective("Raider", "Tag builders (collision) to slow progress; win by eliminating both or forcing timeout."), + ] + return [Objective("Objective", "Explore.")] # ----------------------------- -# Observation / Q-learning +# Spawn / Init # ----------------------------- -def local_tile_ahead(state: WorldState, a: Agent) -> int: - dx, dy = DIRS[a.ori] - nx, ny = a.x + dx, a.y + dy - if not in_bounds(nx, ny): - return WALL - return state.grid[ny][nx] - -def nearest_enemy_vec(state: WorldState, a: Agent) -> Tuple[int, int, int]: - best = None - for _, other in state.agents.items(): - if other.hp <= 0: - continue - if other.team == a.team: - continue - d = manhattan_xy(a.x, a.y, other.x, other.y) - if best is None or d < best[0]: - best = (d, other.x - a.x, other.y - a.y) - if best is None: - return (99, 0, 0) - d, dx, dy = best - return (d, int(np.clip(dx, -6, 6)), int(np.clip(dy, -6, 6))) - -def obs_key(state: WorldState, who: str) -> str: - a = state.agents[who] - d, dx, dy = nearest_enemy_vec(state, a) - ahead = local_tile_ahead(state, a) - keys = a.inventory.get("key", 0) - art = a.inventory.get("artifact", 0) - wood = a.inventory.get("wood", 0) - ore = a.inventory.get("ore", 0) - inv_bucket = f"k{min(keys,2)}a{min(art,1)}w{min(wood,3)}o{min(ore,3)}" - door = 1 if state.door_opened_global else 0 - return f"{state.env_key}|{who}|{a.x},{a.y},{a.ori}|e{d}:{dx},{dy}|t{ahead}|hp{a.hp}|{inv_bucket}|D{door}|bp{state.base_progress}" - -def q_get(q: Dict[str, List[float]], key: str) -> List[float]: - if key not in q: - q[key] = [0.0 for _ in ACTIONS] - return q[key] - -def epsilon_greedy(qvals: List[float], eps: float, r: np.random.Generator) -> int: - if r.random() < eps: - return int(r.integers(0, len(qvals))) - return int(np.argmax(qvals)) - -def q_update(q: Dict[str, List[float]], key: str, a_idx: int, reward: float, next_key: str, - alpha: float, gamma: float) -> Tuple[float, float, float]: - qv = q_get(q, key) - nq = q_get(q, next_key) - old = qv[a_idx] - target = reward + gamma * float(np.max(nq)) - new = old + alpha * (target - old) - qv[a_idx] = new - return old, target, new +def random_empty_cell(g: List[List[int]], r: random.Random) -> Tuple[int, int]: + empties = [(x, y) for y in range(1, GRID_H - 1) for x in range(1, GRID_W - 1) if g[y][x] in (EMPTY, PELLET)] + return r.choice(empties) if empties else (2, 2) + +def init_world(seed: int, env_key: str, map_key: str) -> World: + r = rng(seed) + g = MAP_BUILDERS[map_key](seed) + spec = ENVS[env_key] + + agents: Dict[str, Agent] = {} + + if env_key == "pac_chase": + # Predator + Prey + 2 ghosts (as roaming threats / decoys) + px, py = 2, 2 + qx, qy = GRID_W - 3, GRID_H - 3 + agents["Predator"] = Agent("Predator", "A", px, py, ori=0, hp=6, mode="auto", brain="heur") + agents["Prey"] = Agent("Prey", "B", qx, qy, ori=2, hp=5, mode="auto", brain="heur") + gx1, gy1 = (GRID_W // 2, 2) + gx2, gy2 = (GRID_W // 2, GRID_H - 3) + agents["Ghost1"] = Agent("Ghost1", "A", gx1, gy1, ori=1, hp=4, mode="auto", brain="random") + agents["Ghost2"] = Agent("Ghost2", "A", gx2, gy2, ori=3, hp=4, mode="auto", brain="random") + + pellets = sum(1 for y in range(GRID_H) for x in range(GRID_W) if g[y][x] in (PELLET, POWER)) + controlled = "Prey" + pov = "Prey" + + elif env_key == "ctf": + # 2 runners + 2 guards + ax, ay = 2, GRID_H - 3 + bx, by = GRID_W - 3, 2 + agents["RunnerA"] = Agent("RunnerA", "A", ax, ay, ori=0, hp=6, mode="auto", brain="heur") + agents["GuardA"] = Agent("GuardA", "A", 2, 2, ori=0, hp=7, mode="auto", brain="heur") + agents["RunnerB"] = Agent("RunnerB", "B", bx, by, ori=2, hp=6, mode="auto", brain="heur") + agents["GuardB"] = Agent("GuardB", "B", GRID_W - 3, GRID_H - 3, ori=2, hp=7, mode="auto", brain="heur") + pellets = 0 + controlled = "RunnerA" + pov = "RunnerA" + + elif env_key == "treasure": + agents["RunnerA"] = Agent("RunnerA", "A", 2, 2, ori=0, hp=6, mode="auto", brain="heur") + agents["RunnerB"] = Agent("RunnerB", "B", GRID_W - 3, GRID_H - 3, ori=2, hp=6, mode="auto", brain="heur") + agents["GuardA"] = Agent("GuardA", "A", 2, GRID_H - 3, ori=0, hp=6, mode="auto", brain="heur") + agents["GuardB"] = Agent("GuardB", "B", GRID_W - 3, 2, ori=2, hp=6, mode="auto", brain="heur") + pellets = 0 + controlled = "RunnerA" + pov = "RunnerA" + + else: # resource + agents["MinerA"] = Agent("MinerA", "A", 2, 2, ori=0, hp=6, mode="auto", brain="heur") + agents["MinerB"] = Agent("MinerB", "B", GRID_W - 3, GRID_H - 3, ori=2, hp=6, mode="auto", brain="heur") + agents["Raider"] = Agent("Raider", "R", GRID_W - 3, 2, ori=2, hp=7, mode="auto", brain="heur") + pellets = 0 + controlled = "MinerA" + pov = "MinerA" + + w = World( + seed=seed, + step=0, + env_key=env_key, + map_key=map_key, + grid=g, + agents=agents, + pellets_left=pellets, + controlled=controlled, + pov=pov, + overlay=True, + auto_camera=True, + events=[f"Initialized: env={env_key} ({spec.title}) | map={map_key} | seed={seed}"], + ) + return w # ----------------------------- -# Baseline heuristics +# Pathing + Movement # ----------------------------- -def face_towards(a: Agent, tx: int, ty: int) -> str: +def is_blocking(tile: int) -> bool: + return tile == WALL + +def neighbors4(x: int, y: int) -> List[Tuple[int, int]]: + return [(x + 1, y), (x, y + 1), (x - 1, y), (x, y - 1)] + +def bfs_next_step(grid: List[List[int]], start: Tuple[int, int], goal: Tuple[int, int]) -> Optional[Tuple[int, int]]: + if start == goal: + return None + sx, sy = start + gx, gy = goal + q = [(sx, sy)] + prev = {start: None} + while q: + x, y = q.pop(0) + if (x, y) == (gx, gy): + break + for nx, ny in neighbors4(x, y): + if not in_bounds(nx, ny): + continue + if is_blocking(grid[ny][nx]): + continue + if (nx, ny) not in prev: + prev[(nx, ny)] = (x, y) + q.append((nx, ny)) + if (gx, gy) not in prev: + return None + # backtrack one step from goal to start + cur = (gx, gy) + while prev[cur] != start and prev[cur] is not None: + cur = prev[cur] + return cur + +def face_towards(a: Agent, tx: int, ty: int): dx = tx - a.x dy = ty - a.y - ang = (math.degrees(math.atan2(dy, dx)) % 360) - facing = ORI_DEG[a.ori] - diff = (ang - facing + 540) % 360 - 180 - if diff < -10: - return "L" - if diff > 10: - return "R" - return "F" - -def heuristic_action(state: WorldState, who: str) -> str: - a = state.agents[who] - r = rng_for(state.seed, state.step, stream=900 + hash(who) % 1000) - - # Prioritize interacting on valuable tiles - t_here = state.grid[a.y][a.x] - if t_here in (FOOD, KEY, ARTIFACT, WOOD, ORE, MEDKIT, SWITCH, BASE, EXIT): - return "I" - - # Find nearest enemy - best = None - best_d = 999 - for _, other in state.agents.items(): - if other.hp <= 0 or other.team == a.team: - continue - d = manhattan_xy(a.x, a.y, other.x, other.y) - if d < best_d: - best_d = d - best = other - - if best is not None and best_d <= 6 and visible(state, a, best): - # attackers chase, defenders try to flee - if a.team == "B": - return face_towards(a, best.x, best.y) - - dx = best.x - a.x - dy = best.y - a.y - ang = (math.degrees(math.atan2(dy, dx)) % 360) - facing = ORI_DEG[a.ori] - diff = (ang - facing + 540) % 360 - 180 - diff_away = ((diff + 180) + 540) % 360 - 180 - if diff_away < -10: - return "L" - if diff_away > 10: - return "R" - return "F" - - return r.choice(["F", "F", "L", "R", "I"]) - -def random_action(state: WorldState, who: str) -> str: - r = rng_for(state.seed, state.step, stream=700 + hash(who) % 1000) - return r.choice(ACTIONS) - -# ----------------------------- -# Movement + interaction -# ----------------------------- -def turn_left(a: Agent) -> None: - a.ori = (a.ori - 1) % 4 - -def turn_right(a: Agent) -> None: - a.ori = (a.ori + 1) % 4 + if abs(dx) > abs(dy): + a.ori = 0 if dx > 0 else 2 + else: + a.ori = 1 if dy > 0 else 3 -def move_forward(state: WorldState, a: Agent) -> str: - dx, dy = DIRS[a.ori] - nx, ny = a.x + dx, a.y + dy +def move_to(world: World, a: Agent, nx: int, ny: int) -> bool: if not in_bounds(nx, ny): - return "blocked: bounds" - tile = state.grid[ny][nx] - if is_blocking(tile, door_open=state.door_opened_global): - return "blocked: wall/door" + return False + if is_blocking(world.grid[ny][nx]): + return False a.x, a.y = nx, ny - - if state.grid[ny][nx] == TELE: - teles = [(x, y) for y in range(GRID_H) for x in range(GRID_W) if state.grid[y][x] == TELE] - if len(teles) >= 2: - teles_sorted = sorted(teles) - idx = teles_sorted.index((nx, ny)) - dest = teles_sorted[(idx + 1) % len(teles_sorted)] - a.x, a.y = dest - state.event_log.append(f"t={state.step}: {a.name} teleported.") - return "moved: teleported" - return "moved" - -def try_interact(state: WorldState, a: Agent) -> str: - t = state.grid[a.y][a.x] - - if t == SWITCH: - state.door_opened_global = True - state.grid[a.y][a.x] = EMPTY - a.inventory["switch"] = a.inventory.get("switch", 0) + 1 - return "switch: opened all doors" - - if t == KEY: - a.inventory["key"] = a.inventory.get("key", 0) + 1 - state.grid[a.y][a.x] = EMPTY - return "picked: key" - - if t == ARTIFACT: - a.inventory["artifact"] = a.inventory.get("artifact", 0) + 1 - state.grid[a.y][a.x] = EMPTY - return "picked: artifact" - - if t == FOOD: - a.energy = min(200, a.energy + 35) - state.grid[a.y][a.x] = EMPTY - return "ate: food" - - if t == WOOD: - a.inventory["wood"] = a.inventory.get("wood", 0) + 1 - state.grid[a.y][a.x] = EMPTY - return "picked: wood" - - if t == ORE: - a.inventory["ore"] = a.inventory.get("ore", 0) + 1 - state.grid[a.y][a.x] = EMPTY - return "picked: ore" - - if t == MEDKIT: - a.hp = min(10, a.hp + 3) - state.grid[a.y][a.x] = EMPTY - return "used: medkit" - - if t == BASE: - w = a.inventory.get("wood", 0) - o = a.inventory.get("ore", 0) - dep = min(w, 2) + min(o, 2) - if dep > 0: - a.inventory["wood"] = max(0, w - min(w, 2)) - a.inventory["ore"] = max(0, o - min(o, 2)) - state.base_progress += dep - return f"deposited: +{dep} base_progress" - return "base: nothing to deposit" - - if t == EXIT: - return "at_exit" - - return "interact: none" - -def apply_action(state: WorldState, who: str, action: str) -> str: - a = state.agents[who] - if a.hp <= 0: - return "dead" - if action == "L": - turn_left(a) - return "turned left" - if action == "R": - turn_right(a) - return "turned right" - if action == "F": - return move_forward(state, a) - if action == "I": - return try_interact(state, a) - return "noop" + a.energy = max(0, a.energy - 1) + return True # ----------------------------- -# Hazards / collisions / done +# Core Interactions # ----------------------------- -def resolve_hazards(state: WorldState, a: Agent) -> Tuple[bool, str]: - if a.hp <= 0: - return (False, "") - if state.grid[a.y][a.x] == HAZARD: +def apply_tile_effects(world: World, a: Agent): + t = world.grid[a.y][a.x] + + # hazards drain HP + if t == HAZARD: a.hp -= 1 - return (True, "hazard:-hp") - return (False, "") + world.events.append(f"t={world.step}: {a.name} hit a hazard (-hp).") + + if world.env_key == "pac_chase": + if t == PELLET: + world.grid[a.y][a.x] = EMPTY + world.pellets_left = max(0, world.pellets_left - 1) + a.inventory["pellets"] = a.inventory.get("pellets", 0) + 1 + elif t == POWER: + world.grid[a.y][a.x] = EMPTY + world.pellets_left = max(0, world.pellets_left - 1) + world.power_timer = 26 + world.events.append(f"t={world.step}: POWER ACTIVE — chase flips for a bit.") + + if world.env_key == "ctf": + if t == FLAG_A and a.team == "B" and world.flag_carrier is None: + world.flag_carrier = a.name + world.flag_taken_from = "A" + world.grid[a.y][a.x] = EMPTY + world.events.append(f"t={world.step}: {a.name} stole Flag A!") + if t == FLAG_B and a.team == "A" and world.flag_carrier is None: + world.flag_carrier = a.name + world.flag_taken_from = "B" + world.grid[a.y][a.x] = EMPTY + world.events.append(f"t={world.step}: {a.name} stole Flag B!") + + # return conditions + if world.flag_carrier == a.name: + if a.team == "A" and world.grid[a.y][a.x] == BASE_A and world.flag_taken_from == "B": + world.done = True + world.outcome = "A_win" + world.events.append(f"t={world.step}: Team A captured the flag!") + if a.team == "B" and world.grid[a.y][a.x] == BASE_B and world.flag_taken_from == "A": + world.done = True + world.outcome = "B_win" + world.events.append(f"t={world.step}: Team B captured the flag!") + + if world.env_key == "treasure": + if t == TREASURE: + world.grid[a.y][a.x] = EMPTY + a.inventory["treasure"] = a.inventory.get("treasure", 0) + 1 + world.events.append(f"t={world.step}: {a.name} picked treasure.") + if t == BASE_A and a.team == "A": + dep = a.inventory.get("treasure", 0) + if dep > 0: + a.inventory["treasure"] = 0 + world.treasure_collected_A += dep + world.events.append(f"t={world.step}: Team A deposited {dep} treasure (total={world.treasure_collected_A}).") + if t == BASE_B and a.team == "B": + dep = a.inventory.get("treasure", 0) + if dep > 0: + a.inventory["treasure"] = 0 + world.treasure_collected_B += dep + world.events.append(f"t={world.step}: Team B deposited {dep} treasure (total={world.treasure_collected_B}).") + + if world.env_key == "resource": + if t == RESOURCE: + world.grid[a.y][a.x] = EMPTY + a.inventory["res"] = a.inventory.get("res", 0) + 1 + world.events.append(f"t={world.step}: {a.name} mined resource.") + if t == BASE_A and a.name == "MinerA": + dep = min(2, a.inventory.get("res", 0)) + if dep > 0: + a.inventory["res"] -= dep + world.baseA_progress += dep + world.events.append(f"t={world.step}: MinerA deposited +{dep} (A={world.baseA_progress}/{world.base_target}).") + if t == BASE_B and a.name == "MinerB": + dep = min(2, a.inventory.get("res", 0)) + if dep > 0: + a.inventory["res"] -= dep + world.baseB_progress += dep + world.events.append(f"t={world.step}: MinerB deposited +{dep} (B={world.baseB_progress}/{world.base_target}).") -def resolve_tags(state: WorldState) -> List[str]: - msgs = [] - occupied: Dict[Tuple[int, int], List[str]] = {} - for nm, a in state.agents.items(): +# ----------------------------- +# Collision / Tagging +# ----------------------------- +def resolve_tags(world: World): + # If opposing agents occupy same tile: tag event + pos: Dict[Tuple[int, int], List[str]] = {} + for nm, a in world.agents.items(): if a.hp <= 0: continue - occupied.setdefault((a.x, a.y), []).append(nm) + pos.setdefault((a.x, a.y), []).append(nm) - for (x, y), names in occupied.items(): + for (x, y), names in pos.items(): if len(names) < 2: continue - teams = set(state.agents[n].team for n in names) - if len(teams) >= 2: - for n in names: - state.agents[n].hp -= 1 - msgs.append(f"t={state.step}: collision/tag at ({x},{y}) {names} (-hp all)") - return msgs - -def check_done(state: WorldState) -> None: - if state.env_key == "chase": - pred = state.agents["Predator"] - prey = state.agents["Prey"] - if pred.hp <= 0 and prey.hp <= 0: - state.done = True - state.outcome = "draw" - return - if pred.hp > 0 and prey.hp > 0 and pred.x == prey.x and pred.y == prey.y: - state.done = True - state.outcome = "A_win" - state.event_log.append(f"t={state.step}: CAUGHT (Predator wins).") - return - if state.step >= 300 and prey.hp > 0: - state.done = True - state.outcome = "B_win" - state.event_log.append(f"t={state.step}: ESCAPED (Prey survives).") - return + teams = set(world.agents[n].team for n in names) + if len(teams) <= 1: + continue - if state.env_key == "vault": - for nm in ["Alpha", "Bravo"]: - a = state.agents[nm] - if a.hp > 0 and a.inventory.get("artifact", 0) > 0 and state.grid[a.y][a.x] == EXIT: - state.done = True - state.outcome = "A_win" - state.event_log.append(f"t={state.step}: VAULT CLEARED (Team A wins).") + # pac_chase special: power flips who is vulnerable + if world.env_key == "pac_chase": + if "Predator" in names and "Prey" in names: + if world.power_timer > 0: + # Predator vulnerable + world.agents["Predator"].hp -= 2 + world.events.append(f"t={world.step}: Prey TAGGED Predator during POWER (-2hp Predator).") + else: + world.done = True + world.outcome = "A_win" + world.events.append(f"t={world.step}: Predator CAUGHT Prey.") return - alive_A = any(state.agents[n].hp > 0 for n in ["Alpha", "Bravo"]) - if not alive_A: - state.done = True - state.outcome = "B_win" - state.event_log.append(f"t={state.step}: TEAM A ELIMINATED (Guardian wins).") - return - if state.env_key == "civ": - if state.base_progress >= state.base_target: - state.done = True - state.outcome = "A_win" - state.event_log.append(f"t={state.step}: BASE COMPLETE (Builders win).") - return - alive_A = any(state.agents[n].hp > 0 for n in ["BuilderA", "BuilderB"]) - if not alive_A: - state.done = True - state.outcome = "B_win" - state.event_log.append(f"t={state.step}: BUILDERS ELIMINATED (Raider wins).") - return - if state.step >= 350: - state.done = True - state.outcome = "draw" - state.event_log.append(f"t={state.step}: TIMEOUT (draw).") - return + # otherwise, both lose hp + for n in names: + world.agents[n].hp -= 1 + world.events.append(f"t={world.step}: TAG at ({x},{y}) {names} (-hp).") + + # CTF: drop flag if carrier tagged + if world.env_key == "ctf" and world.flag_carrier in names: + carrier = world.flag_carrier + world.flag_carrier = None + # respawn flag to original side + if world.flag_taken_from == "A": + world.grid[2][2] = FLAG_A + elif world.flag_taken_from == "B": + world.grid[GRID_H - 3][GRID_W - 3] = FLAG_B + world.events.append(f"t={world.step}: {carrier} dropped the flag!") # ----------------------------- -# Rewards +# Done Conditions # ----------------------------- -def reward_for(prev: WorldState, now: WorldState, who: str, outcome_msg: str, took_damage: bool) -> float: - cfg = now.cfg - r = cfg.step_penalty - if outcome_msg.startswith("moved"): - r += cfg.explore_reward - if took_damage: - r += cfg.damage_penalty - if outcome_msg.startswith("used: medkit"): - r += cfg.heal_reward - - if now.env_key == "chase": - pred = now.agents["Predator"] - prey = now.agents["Prey"] - if who == "Predator": - d0 = manhattan_xy(prev.agents["Predator"].x, prev.agents["Predator"].y, - prev.agents["Prey"].x, prev.agents["Prey"].y) - d1 = manhattan_xy(pred.x, pred.y, prey.x, prey.y) - r += cfg.chase_close_coeff * float(d0 - d1) - if now.done and now.outcome == "A_win": - r += cfg.chase_catch_reward - if who == "Prey": - if outcome_msg.startswith("ate: food"): - r += cfg.food_reward - if now.done and now.outcome == "B_win": - r += cfg.chase_escaped_reward - if now.done and now.outcome == "A_win": - r += cfg.chase_caught_penalty - - if now.env_key == "vault": - if outcome_msg.startswith("picked: artifact"): - r += cfg.artifact_pick_reward - if outcome_msg.startswith("picked: key"): - r += cfg.key_reward - if outcome_msg.startswith("switch:"): - r += cfg.switch_reward - if now.done: - if now.outcome == "A_win" and now.agents[who].team == "A": - r += cfg.exit_win_reward - if now.outcome == "B_win" and now.agents[who].team == "B": - r += cfg.guardian_tag_reward - if now.outcome == "B_win" and now.agents[who].team == "A": - r += cfg.tagged_penalty - - if now.env_key == "civ": - if outcome_msg.startswith("picked: wood") or outcome_msg.startswith("picked: ore"): - r += cfg.resource_pick_reward - if outcome_msg.startswith("deposited:"): - r += cfg.deposit_reward - if now.done: - if now.outcome == "A_win" and now.agents[who].team == "A": - r += cfg.base_progress_win_reward - if now.outcome == "B_win" and now.agents[who].team == "B": - r += cfg.raider_elim_reward - if now.outcome == "B_win" and now.agents[who].team == "A": - r += cfg.builder_elim_penalty - - return float(r) +def check_done(world: World): + spec = ENVS[world.env_key] + if world.done: + return -# ----------------------------- -# Policy selection -# ----------------------------- -def choose_action(state: WorldState, who: str, stream: int) -> Tuple[str, str, Optional[Tuple[str, int]]]: - a = state.agents[who] - cfg = state.cfg - r = rng_for(state.seed, state.step, stream=stream) + # timeout / survival + if world.step >= spec.max_steps: + world.done = True + world.outcome = "draw" + world.events.append(f"t={world.step}: TIMEOUT (draw).") + return - if a.brain == "random": - act = random_action(state, who) - return act, "random", None - if a.brain == "heuristic": - act = heuristic_action(state, who) - return act, "heuristic", None - - if cfg.use_q: - key = obs_key(state, who) - qtab = state.q_tables.setdefault(who, {}) - qv = q_get(qtab, key) - a_idx = epsilon_greedy(qv, state.gmetrics.epsilon, r) - return ACTIONS[a_idx], f"Q eps={state.gmetrics.epsilon:.3f} q={np.round(qv,3).tolist()}", (key, a_idx) - - act = heuristic_action(state, who) - return act, "heuristic(fallback)", None + if world.env_key == "pac_chase": + prey = world.agents["Prey"] + pred = world.agents["Predator"] + if prey.hp <= 0: + world.done = True + world.outcome = "A_win" + world.events.append(f"t={world.step}: Prey eliminated — Predator wins.") + return + if pred.hp <= 0: + world.done = True + world.outcome = "B_win" + world.events.append(f"t={world.step}: Predator eliminated — Prey wins.") + return + if world.pellets_left <= 0: + world.done = True + world.outcome = "B_win" + world.events.append(f"t={world.step}: All pellets cleared — Prey wins.") + return -# ----------------------------- -# Init / reset -# ----------------------------- -def init_state(seed: int, env_key: str) -> WorldState: - g, agents = ENV_BUILDERS[env_key](seed) - st = WorldState( - seed=seed, - step=0, - env_key=env_key, - grid=g, - agents=agents, - controlled=list(agents.keys())[0], - pov=list(agents.keys())[0], - overlay=False, - done=False, - outcome="ongoing", - door_opened_global=False, - base_progress=0, - base_target=10, - ) - st.event_log = [f"Initialized env={env_key} seed={seed}."] - return st - -def reset_episode_keep_learning(state: WorldState, seed: Optional[int] = None) -> WorldState: - if seed is None: - seed = state.seed - fresh = init_state(int(seed), state.env_key) - fresh.cfg = state.cfg - fresh.q_tables = state.q_tables - fresh.gmetrics = state.gmetrics - fresh.gmetrics.epsilon = state.gmetrics.epsilon - return fresh - -def wipe_all(seed: int, env_key: str) -> WorldState: - st = init_state(seed, env_key) - st.cfg = TrainConfig() - st.gmetrics = GlobalMetrics(epsilon=st.cfg.epsilon) - st.q_tables = {} - return st + if world.env_key == "ctf": + # done handled on return + # elimination condition + aliveA = any(a.hp > 0 for a in world.agents.values() if a.team == "A") + aliveB = any(a.hp > 0 for a in world.agents.values() if a.team == "B") + if not aliveA and aliveB: + world.done = True + world.outcome = "B_win" + world.events.append(f"t={world.step}: Team A eliminated — Team B wins.") + elif not aliveB and aliveA: + world.done = True + world.outcome = "A_win" + world.events.append(f"t={world.step}: Team B eliminated — Team A wins.") + + if world.env_key == "treasure": + if world.treasure_collected_A >= 6: + world.done = True + world.outcome = "A_win" + world.events.append(f"t={world.step}: Team A reached 6 treasure — wins.") + elif world.treasure_collected_B >= 6: + world.done = True + world.outcome = "B_win" + world.events.append(f"t={world.step}: Team B reached 6 treasure — wins.") + + if world.env_key == "resource": + if world.baseA_progress >= world.base_target: + world.done = True + world.outcome = "A_win" + world.events.append(f"t={world.step}: Base A complete — MinerA wins.") + elif world.baseB_progress >= world.base_target: + world.done = True + world.outcome = "B_win" + world.events.append(f"t={world.step}: Base B complete — MinerB wins.") + # Raider wins by eliminating both miners + alive_miners = sum(1 for nm in ("MinerA", "MinerB") if world.agents.get(nm) and world.agents[nm].hp > 0) + if alive_miners == 0 and world.agents["Raider"].hp > 0: + world.done = True + world.outcome = "B_win" + world.events.append(f"t={world.step}: Miners eliminated — Raider wins.") # ----------------------------- -# History / branching +# Agent "Brains" (Heuristic + Random) # ----------------------------- -TRACE_MAX = 500 -MAX_HISTORY = 1400 - -def snapshot_of(state: WorldState, branch: str) -> Snapshot: - return Snapshot( - branch=branch, - step=state.step, - env_key=state.env_key, - grid=[row[:] for row in state.grid], - agents={k: asdict(v) for k, v in state.agents.items()}, - done=state.done, - outcome=state.outcome, - door_opened_global=state.door_opened_global, - base_progress=state.base_progress, - base_target=state.base_target, - event_tail=state.event_log[-25:], - trace_tail=state.trace_log[-40:], - emetrics=asdict(state.emetrics), - ) - -def restore_into(state: WorldState, snap: Snapshot) -> WorldState: - state.step = snap.step - state.env_key = snap.env_key - state.grid = [row[:] for row in snap.grid] - state.agents = {k: Agent(**d) for k, d in snap.agents.items()} - state.done = snap.done - state.outcome = snap.outcome - state.door_opened_global = snap.door_opened_global - state.base_progress = snap.base_progress - state.base_target = snap.base_target - state.event_log.append(f"Jumped to snapshot t={snap.step} (branch={snap.branch}).") - return state +def choose_target_pac(world: World, who: str) -> Tuple[int, int]: + a = world.agents[who] + prey = world.agents["Prey"] + pred = world.agents["Predator"] + + if who == "Prey": + # survival logic: if power active, prey can bully predator a bit, otherwise flee + if world.power_timer > 0: + return (pred.x, pred.y) # go toward predator (aggressive window) + # otherwise: go toward nearest pellet/power but avoid predator + pellets = find_all(world.grid, PELLET) + find_all(world.grid, POWER) + if pellets: + pellets.sort(key=lambda p: manhattan((a.x, a.y), p)) + return pellets[0] + return (a.x, a.y) + + if who == "Predator": + # if power active, avoid prey (pred vulnerable) + if world.power_timer > 0: + # run away from prey by targeting a far corner + corners = [(2, 2), (GRID_W - 3, 2), (2, GRID_H - 3), (GRID_W - 3, GRID_H - 3)] + corners.sort(key=lambda c: -manhattan((prey.x, prey.y), c)) + return corners[0] + return (prey.x, prey.y) + + # ghosts roam toward prey loosely + return (prey.x, prey.y) + +def choose_target_ctf(world: World, who: str) -> Tuple[int, int]: + a = world.agents[who] + # runners prioritize stealing flags; guards prioritize intercepting carrier / defending + if a.team == "A": + home_base = BASE_A + enemy_flag = FLAG_B + home_base_pos = find_all(world.grid, BASE_A)[0] + else: + home_base = BASE_B + enemy_flag = FLAG_A + home_base_pos = find_all(world.grid, BASE_B)[0] + + # if carrying flag, run home + if world.flag_carrier == who: + return home_base_pos + + # if teammate carrying flag, guard/intercept threats + if world.flag_carrier is not None: + carrier = world.agents[world.flag_carrier] + return (carrier.x, carrier.y) + + # otherwise: runners go to enemy flag; guards hover mid or defend base + if "Runner" in who: + flags = find_all(world.grid, enemy_flag) + if flags: + return flags[0] + return home_base_pos + + # guard: midpoint between base and enemy + enemy_flag_pos = find_all(world.grid, enemy_flag) + if enemy_flag_pos: + ex, ey = enemy_flag_pos[0] + bx, by = home_base_pos + return ((ex + bx) // 2, (ey + by) // 2) + return home_base_pos + +def choose_target_treasure(world: World, who: str) -> Tuple[int, int]: + a = world.agents[who] + base = BASE_A if a.team == "A" else BASE_B + base_pos = find_all(world.grid, base)[0] + + # deposit if holding + if a.inventory.get("treasure", 0) >= 2: + return base_pos + + treasures = find_all(world.grid, TREASURE) + if treasures: + treasures.sort(key=lambda p: manhattan((a.x, a.y), p)) + return treasures[0] + return base_pos + +def choose_target_resource(world: World, who: str) -> Tuple[int, int]: + a = world.agents[who] + if who == "Raider": + # hunt nearest miner + miners = [world.agents[n] for n in ("MinerA", "MinerB") if world.agents.get(n) and world.agents[n].hp > 0] + if miners: + miners.sort(key=lambda m: manhattan((a.x, a.y), (m.x, m.y))) + return (miners[0].x, miners[0].y) + return (a.x, a.y) + + # miners: deposit if holding enough + base_tile = BASE_A if who == "MinerA" else BASE_B + base_pos = find_all(world.grid, base_tile)[0] + if a.inventory.get("res", 0) >= 3: + return base_pos + + res = find_all(world.grid, RESOURCE) + if res: + res.sort(key=lambda p: manhattan((a.x, a.y), p)) + return res[0] + return base_pos + +def choose_target(world: World, who: str) -> Tuple[int, int]: + if world.env_key == "pac_chase": + return choose_target_pac(world, who) + if world.env_key == "ctf": + return choose_target_ctf(world, who) + if world.env_key == "treasure": + return choose_target_treasure(world, who) + if world.env_key == "resource": + return choose_target_resource(world, who) + return (world.agents[who].x, world.agents[who].y) + +def auto_step_agent(world: World, who: str): + a = world.agents[who] + if a.hp <= 0: + return -# ----------------------------- -# Metrics dashboard (HF-safe) -# ----------------------------- -def metrics_dashboard_image(state: WorldState) -> Image.Image: - gm = state.gmetrics - - fig = plt.figure(figsize=(7.0, 2.2), dpi=120) - ax = fig.add_subplot(111) - - x1 = max(1, gm.episodes) - ax.plot([0, x1], [gm.rolling_winrate_A, gm.rolling_winrate_A]) - ax.set_title("Global Metrics Snapshot") - ax.set_xlabel("Episodes") - ax.set_ylabel("Rolling winrate Team A") - ax.set_ylim(-0.05, 1.05) - ax.grid(True) - - txt = ( - f"env={state.env_key} | eps={gm.epsilon:.3f} | episodes={gm.episodes}\n" - f"A_wins={gm.wins_teamA} B_wins={gm.wins_teamB} draws={gm.draws} | avg_steps~{gm.avg_steps:.1f}\n" - f"last_outcome={gm.last_outcome} last_steps={gm.last_steps}" - ) - ax.text(0.01, 0.05, txt, transform=ax.transAxes, fontsize=8, va="bottom") + # choose next move + if a.brain == "random": + cand = [] + for nx, ny in neighbors4(a.x, a.y): + if in_bounds(nx, ny) and not is_blocking(world.grid[ny][nx]): + cand.append((nx, ny)) + if cand: + nx, ny = random.choice(cand) + face_towards(a, nx, ny) + move_to(world, a, nx, ny) + return - fig.tight_layout() + tx, ty = choose_target(world, who) + nxt = bfs_next_step(world.grid, (a.x, a.y), (tx, ty)) + if nxt is None: + # small wander if stuck + cand = [] + for nx, ny in neighbors4(a.x, a.y): + if in_bounds(nx, ny) and not is_blocking(world.grid[ny][nx]): + cand.append((nx, ny)) + if cand: + nx, ny = cand[world.step % len(cand)] + face_towards(a, nx, ny) + move_to(world, a, nx, ny) + return - canvas = FigureCanvas(fig) - canvas.draw() - buf = np.asarray(canvas.buffer_rgba()) # (H,W,4) - img = Image.fromarray(buf, mode="RGBA").convert("RGB") - plt.close(fig) - return img + nx, ny = nxt + face_towards(a, nx, ny) + move_to(world, a, nx, ny) + +def manual_action(world: World, action: str): + """ + Manual control for the 'controlled' agent: + L/R/F/I style minimal actions (Pacman-appropriate). + """ + who = world.controlled + a = world.agents[who] + if a.hp <= 0: + return -def action_entropy(counts: Dict[str, int]) -> float: - total = sum(counts.values()) - if total <= 0: - return 0.0 - p = np.array([c / total for c in counts.values()], dtype=np.float64) - p = np.clip(p, 1e-12, 1.0) - return float(-np.sum(p * np.log2(p))) - -def agent_scoreboard(state: WorldState) -> str: - rows = [] - header = ["agent", "team", "hp", "return", "steps", "entropy", "tiles_disc", "q_states", "inventory"] - rows.append(header) - steps = state.emetrics.steps - - for nm, a in state.agents.items(): - ret = state.emetrics.returns.get(nm, 0.0) - counts = state.emetrics.action_counts.get(nm, {}) - ent = action_entropy(counts) - td = state.emetrics.tiles_discovered.get(nm, 0) - qs = len(state.q_tables.get(nm, {})) - inv = json.dumps(a.inventory, sort_keys=True) - rows.append([nm, a.team, a.hp, f"{ret:.2f}", steps, f"{ent:.2f}", td, qs, inv]) - - col_w = [max(len(str(r[i])) for r in rows) for i in range(len(header))] - lines = [] - for ridx, r in enumerate(rows): - line = " | ".join(str(r[i]).ljust(col_w[i]) for i in range(len(header))) - lines.append(line) - if ridx == 0: - lines.append("-+-".join("-" * w for w in col_w)) - return "\n".join(lines) + if action == "L": + a.ori = (a.ori - 1) % 4 + return + if action == "R": + a.ori = (a.ori + 1) % 4 + return + if action == "F": + dx, dy = DIRS[a.ori] + nx, ny = a.x + dx, a.y + dy + if in_bounds(nx, ny) and not is_blocking(world.grid[ny][nx]): + move_to(world, a, nx, ny) + return + if action == "I": + # In this sim, "I" is effectively "interact": for some envs, that means "pick/drop". + # Most pickups happen automatically via tile effects; so we use I for "drop" in CTF if holding. + if world.env_key == "ctf" and world.flag_carrier == who: + world.flag_carrier = None + # drop flag at current position (simple) + if world.flag_taken_from == "A": + world.grid[a.y][a.x] = FLAG_A + elif world.flag_taken_from == "B": + world.grid[a.y][a.x] = FLAG_B + world.events.append(f"t={world.step}: {who} dropped the flag manually.") + return # ----------------------------- -# Tick / training +# Pseudo-3D POV Renderer (lightweight) # ----------------------------- -def clone_shallow(state: WorldState) -> WorldState: - return WorldState( - seed=state.seed, - step=state.step, - env_key=state.env_key, - grid=[row[:] for row in state.grid], - agents={k: Agent(**asdict(v)) for k, v in state.agents.items()}, - controlled=state.controlled, - pov=state.pov, - overlay=state.overlay, - done=state.done, - outcome=state.outcome, - door_opened_global=state.door_opened_global, - base_progress=state.base_progress, - base_target=state.base_target, - event_log=list(state.event_log), - trace_log=list(state.trace_log), - cfg=state.cfg, - q_tables=state.q_tables, - gmetrics=state.gmetrics, - emetrics=state.emetrics, - ) - -def update_action_counts(state: WorldState, who: str, act: str): - state.emetrics.action_counts.setdefault(who, {}) - state.emetrics.action_counts[who][act] = state.emetrics.action_counts[who].get(act, 0) + 1 +SKY = np.array([12, 14, 26], dtype=np.uint8) +FLOOR1 = np.array([24, 28, 54], dtype=np.uint8) +FLOOR2 = np.array([10, 12, 22], dtype=np.uint8) +WALL1 = np.array([205, 210, 232], dtype=np.uint8) +WALL2 = np.array([160, 168, 195], dtype=np.uint8) +GATEC = np.array([120, 220, 255], dtype=np.uint8) + +def raycast_pov(world: World, who: str) -> np.ndarray: + a = world.agents[who] + img = np.zeros((VIEW_H, VIEW_W, 3), dtype=np.uint8) + img[:, :] = SKY + # floor gradient + for y in range(VIEW_H // 2, VIEW_H): + t = (y - VIEW_H // 2) / max(1, (VIEW_H // 2)) + col = (1 - t) * FLOOR1 + t * FLOOR2 + img[y, :] = col.astype(np.uint8) -def tick(state: WorldState, beliefs: Dict[str, np.ndarray], manual_action: Optional[str] = None) -> None: - if state.done: - return + # rays + ray_cols = VIEW_W + half = math.radians(FOV_DEG / 2) + base = math.radians(ORI_DEG[a.ori]) - prev = clone_shallow(state) - chosen: Dict[str, str] = {} - reasons: Dict[str, str] = {} - qinfo: Dict[str, Optional[Tuple[str, int]]] = {} + for rx in range(ray_cols): + cam = (2 * rx / (ray_cols - 1)) - 1 + ang = base + cam * half + sin_a = math.sin(ang) + cos_a = math.cos(ang) - if manual_action is not None: - chosen[state.controlled] = manual_action - reasons[state.controlled] = "manual" - qinfo[state.controlled] = None + ox, oy = a.x + 0.5, a.y + 0.5 + depth = 0.0 + hit = None + side = 0 - order = list(state.agents.keys()) + while depth < MAX_DEPTH: + depth += 0.06 + tx = int(ox + cos_a * depth) + ty = int(oy + sin_a * depth) + if not in_bounds(tx, ty): + break + tile = world.grid[ty][tx] + if tile == WALL: + hit = "wall" + side = 1 if abs(cos_a) > abs(sin_a) else 0 + break + if tile == GATE: + hit = "gate" + break - for who in order: - if who in chosen: + if hit is None: continue - act, reason, qi = choose_action(state, who, stream=200 + (hash(who) % 1000)) - chosen[who] = act - reasons[who] = reason - qinfo[who] = qi - outcomes: Dict[str, str] = {} - took_damage: Dict[str, bool] = {nm: False for nm in order} + depth *= math.cos(ang - base) + depth = max(depth, 0.001) - for who in order: - outcomes[who] = apply_action(state, who, chosen[who]) + h = int((VIEW_H * 0.92) / depth) + y0 = max(0, VIEW_H // 2 - h // 2) + y1 = min(VIEW_H - 1, VIEW_H // 2 + h // 2) - dmg, msg = resolve_hazards(state, state.agents[who]) - took_damage[who] = dmg - if msg: - state.event_log.append(f"t={state.step}: {who} {msg}") + col = (GATEC.copy() if hit == "gate" else (WALL1.copy() if side == 0 else WALL2.copy())) + dim = max(0.28, 1.0 - depth / MAX_DEPTH) + col = (col * dim).astype(np.uint8) + img[y0:y1, rx:rx + 1] = col - update_action_counts(state, who, chosen[who]) + # simple agent sprites in view if visible + for nm, other in world.agents.items(): + if nm == who or other.hp <= 0: + continue + if not within_fov(a.x, a.y, a.ori, other.x, other.y): + continue + if not bresenham_los(world.grid, a.x, a.y, other.x, other.y): + continue - for m in resolve_tags(state): - state.event_log.append(m) + dx = other.x - a.x + dy = other.y - a.y + ang = math.degrees(math.atan2(dy, dx)) % 360 + facing = ORI_DEG[a.ori] + diff = (ang - facing + 540) % 360 - 180 + sx = int((diff / (FOV_DEG / 2)) * (VIEW_W / 2) + (VIEW_W / 2)) + dist = math.sqrt(dx * dx + dy * dy) + size = int((VIEW_H * 0.55) / max(dist, 1.0)) + size = clamp(size, 10, 110) + ymid = VIEW_H // 2 + x0 = clamp(sx - size // 4, 0, VIEW_W - 1) + x1 = clamp(sx + size // 4, 0, VIEW_W - 1) + y0 = clamp(ymid - size // 2, 0, VIEW_H - 1) + y1 = clamp(ymid + size // 2, 0, VIEW_H - 1) + + # convert agent hex color to rgb + hexcol = AGENT_COLORS.get(nm, "#ffd17a").lstrip("#") + rgb = np.array([int(hexcol[i:i+2], 16) for i in (0, 2, 4)], dtype=np.uint8) + img[y0:y1, x0:x1] = rgb + + # reticle + if world.overlay: + cx, cy = VIEW_W // 2, VIEW_H // 2 + img[cy - 1:cy + 2, cx - 16:cx + 16] = np.array([110, 210, 255], dtype=np.uint8) + img[cy - 16:cy + 16, cx - 1:cx + 2] = np.array([110, 210, 255], dtype=np.uint8) - # belief updates + discovered tiles - for nm, a in state.agents.items(): - if a.hp <= 0: - continue - disc = update_belief_for_agent(state, beliefs[nm], a) - state.emetrics.tiles_discovered[nm] = state.emetrics.tiles_discovered.get(nm, 0) + disc - - check_done(state) - - # rewards + Q - q_lines = [] - for who in order: - if who not in state.emetrics.returns: - state.emetrics.returns[who] = 0.0 - - r = reward_for(prev, state, who, outcomes[who], took_damage[who]) - state.emetrics.returns[who] += r - - if qinfo.get(who) is not None: - key, a_idx = qinfo[who] - next_key = obs_key(state, who) - qtab = state.q_tables.setdefault(who, {}) - old, tgt, new = q_update(qtab, key, a_idx, r, next_key, state.cfg.alpha, state.cfg.gamma) - q_lines.append(f"{who}: old={old:.3f} tgt={tgt:.3f} new={new:.3f} (a={ACTIONS[a_idx]})") - - trace = f"t={state.step} env={state.env_key} done={state.done} outcome={state.outcome}" - for who in order: - a = state.agents[who] - trace += f" | {who}:{chosen[who]} ({outcomes[who]}) hp={a.hp} [{reasons[who]}]" - if q_lines: - trace += " | Q: " + " ; ".join(q_lines) - - state.trace_log.append(trace) - if len(state.trace_log) > TRACE_MAX: - state.trace_log = state.trace_log[-TRACE_MAX:] - - state.step += 1 - state.emetrics.steps = state.step - -def run_episode(state: WorldState, beliefs: Dict[str, np.ndarray], max_steps: int) -> Tuple[str, int]: - while state.step < max_steps and not state.done: - tick(state, beliefs, manual_action=None) - return state.outcome, state.step - -def update_global_metrics_after_episode(state: WorldState, outcome: str, steps: int): - gm = state.gmetrics - gm.episodes += 1 - gm.last_outcome = outcome - gm.last_steps = steps - - if outcome == "A_win": - gm.wins_teamA += 1 - gm.rolling_winrate_A = 0.90 * gm.rolling_winrate_A + 0.10 * 1.0 - elif outcome == "B_win": - gm.wins_teamB += 1 - gm.rolling_winrate_A = 0.90 * gm.rolling_winrate_A + 0.10 * 0.0 - else: - gm.draws += 1 - gm.rolling_winrate_A = 0.90 * gm.rolling_winrate_A + 0.10 * 0.5 - - gm.avg_steps = (0.90 * gm.avg_steps + 0.10 * steps) if gm.avg_steps > 0 else float(steps) - gm.epsilon = max(state.cfg.epsilon_min, gm.epsilon * state.cfg.epsilon_decay) - -def train(state: WorldState, episodes: int, max_steps: int) -> WorldState: - for ep in range(episodes): - ep_seed = (state.seed * 1_000_003 + (state.gmetrics.episodes + ep) * 97_531) & 0xFFFFFFFF - state = reset_episode_keep_learning(state, seed=int(ep_seed)) - beliefs = init_beliefs(list(state.agents.keys())) - outcome, steps = run_episode(state, beliefs, max_steps=max_steps) - update_global_metrics_after_episode(state, outcome, steps) - - state.event_log.append( - f"Training: +{episodes} eps | eps={state.gmetrics.epsilon:.3f} | " - f"A={state.gmetrics.wins_teamA} B={state.gmetrics.wins_teamB} D={state.gmetrics.draws}" - ) - state = reset_episode_keep_learning(state, seed=state.seed) - return state + return img # ----------------------------- -# Export / Import +# SVG Animated Renderer (the "cool UI" core) # ----------------------------- -def export_run(state: WorldState, branches: Dict[str, List[Snapshot]], active_branch: str, rewind_idx: int) -> str: - payload = { - "seed": state.seed, - "env_key": state.env_key, - "controlled": state.controlled, - "pov": state.pov, - "overlay": state.overlay, - "cfg": asdict(state.cfg), - "gmetrics": asdict(state.gmetrics), - "q_tables": state.q_tables, - "branches": {b: [asdict(s) for s in snaps] for b, snaps in branches.items()}, - "active_branch": active_branch, - "rewind_idx": int(rewind_idx), - "grid": state.grid, - "door_opened_global": state.door_opened_global, - "base_progress": state.base_progress, - "base_target": state.base_target, - } - txt = json.dumps(payload, indent=2) - proof = hash_sha256(txt) - return txt + "\n\n" + json.dumps({"proof_sha256": proof}, indent=2) - -def import_run(txt: str) -> Tuple[WorldState, Dict[str, List[Snapshot]], str, int, Dict[str, np.ndarray]]: - parts = txt.strip().split("\n\n") - data = json.loads(parts[0]) - - st = init_state(int(data.get("seed", 1337)), data.get("env_key", "chase")) - st.controlled = data.get("controlled", st.controlled) - st.pov = data.get("pov", st.pov) - st.overlay = bool(data.get("overlay", False)) - st.grid = data.get("grid", st.grid) - st.door_opened_global = bool(data.get("door_opened_global", False)) - st.base_progress = int(data.get("base_progress", 0)) - st.base_target = int(data.get("base_target", 10)) - - st.cfg = TrainConfig(**data.get("cfg", asdict(st.cfg))) - st.gmetrics = GlobalMetrics(**data.get("gmetrics", asdict(st.gmetrics))) - st.q_tables = data.get("q_tables", {}) - - branches_in = data.get("branches", {}) - branches: Dict[str, List[Snapshot]] = {} - for bname, snaps in branches_in.items(): - branches[bname] = [Snapshot(**s) for s in snaps] - - active = data.get("active_branch", "main") - r_idx = int(data.get("rewind_idx", 0)) - - if active in branches and branches[active]: - st = restore_into(st, branches[active][-1]) - st.event_log.append("Imported run (restored last snapshot).") +def tile_color(tile: int) -> str: + return { + EMPTY: COL_EMPTY, + WALL: COL_WALL, + PELLET: COL_PELLET, + POWER: COL_POWER, + FLAG_A: COL_FLAG_A, + FLAG_B: COL_FLAG_B, + TREASURE: COL_TREASURE, + BASE_A: COL_BASE_A, + BASE_B: COL_BASE_B, + RESOURCE: COL_RESOURCE, + HAZARD: COL_HAZARD, + GATE: COL_GATE, + }.get(tile, COL_EMPTY) + +def objective_hud(world: World) -> Tuple[str, str]: + spec = ENVS[world.env_key] + # short headline + detail line + if world.env_key == "pac_chase": + prey_score = world.agents["Prey"].inventory.get("pellets", 0) + headline = f"{spec.title} • pellets_left={world.pellets_left} • prey_score={prey_score} • power={world.power_timer}" + detail = "Prey clears pellets; Predator catches. Power flips vulnerability briefly." + elif world.env_key == "ctf": + carrier = world.flag_carrier or "none" + headline = f"{spec.title} • carrier={carrier} • step={world.step}/{spec.max_steps}" + detail = "Steal opponent flag → return to base. Tagging drops the flag." + elif world.env_key == "treasure": + headline = f"{spec.title} • A={world.treasure_collected_A}/6 • B={world.treasure_collected_B}/6 • step={world.step}/{spec.max_steps}" + detail = "Collect treasures and deposit at base. First to 6 wins." else: - st.event_log.append("Imported run (no snapshots).") + headline = f"{spec.title} • A={world.baseA_progress}/{world.base_target} • B={world.baseB_progress}/{world.base_target} • step={world.step}/{spec.max_steps}" + detail = "Mine resources, deposit to build progress. Raider tags to disrupt." + return headline, detail + +def svg_render(world: World, highlight: Optional[Tuple[int, int]] = None) -> str: + headline, detail = objective_hud(world) + + # CSS transitions: smooth movement + subtle breathing glow + # Note: SVG updates each tick, browser animates between transforms. + css = f""" + + """ + + # HUD panel + svg = [f""" +