ZENLLC committed on
Commit
1f24d62
·
verified ·
1 Parent(s): 0ef25d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -272
app.py CHANGED
@@ -6,23 +6,28 @@ from typing import Dict, List, Tuple, Optional, Any
6
 
7
  import numpy as np
8
  from PIL import Image, ImageDraw
 
9
  import matplotlib.pyplot as plt
 
 
10
  import gradio as gr
11
 
12
  # ============================================================
13
  # ZEN AgentLab — Agent POV + Multi-Agent Mini-Sim Arena
14
- # State-of-the-art evolution of your "ChronoSandbox++" reference.
 
 
15
  #
16
  # Features:
17
  # - Deterministic gridworld + first-person raycast POV
18
  # - Multiple environments (Chase / CoopVault / MiniCiv)
19
- # - Click-to-edit tiles + inventory pickups
20
  # - Full step trace: obs -> action -> reward -> (optional) Q-update
21
- # - Branching timelines (fork from any rewind point)
22
  # - Batch training (tabular Q-learning) + metrics dashboard
23
- # - Export/import full runs + SHA256 proof hash ("Proof-of-Run")
24
  #
25
- # Hugging Face Spaces compatible: no timers, no fn_kwargs
26
  # ============================================================
27
 
28
  # -----------------------------
@@ -74,16 +79,13 @@ TILE_NAMES = {
74
  BASE: "Base",
75
  }
76
 
77
- # Colors
78
  AGENT_COLORS = {
79
  "Predator": (255, 120, 90),
80
  "Prey": (120, 255, 160),
81
  "Scout": (120, 190, 255),
82
-
83
  "Alpha": (255, 205, 120),
84
  "Bravo": (160, 210, 255),
85
  "Guardian": (255, 120, 220),
86
-
87
  "BuilderA": (140, 255, 200),
88
  "BuilderB": (160, 200, 255),
89
  "Raider": (255, 160, 120),
@@ -96,17 +98,16 @@ WALL_BASE = np.array([210, 210, 225], dtype=np.uint8)
96
  WALL_SIDE = np.array([150, 150, 170], dtype=np.uint8)
97
  DOOR_COL = np.array([140, 210, 255], dtype=np.uint8)
98
 
99
- # Keep actions small for tabular stability
100
- ACTIONS = ["L", "F", "R", "I"] # I = interact (pickups/door/key/switch/base)
101
 
102
  # -----------------------------
103
- # Deterministic RNG streams
104
  # -----------------------------
105
def rng_for(seed: int, step: int, stream: int = 0) -> np.random.Generator:
    """Return a deterministic NumPy generator derived from (seed, step, stream).

    The three inputs are mixed with fixed prime multipliers and XOR so each
    (seed, step, stream) triple yields its own reproducible random stream.
    """
    mixed = (seed * 1_000_003) ^ (step * 9_999_937) ^ (stream * 97_531)
    mixed &= 0xFFFFFFFFFFFFFFFF  # clamp to 64 bits before seeding
    return np.random.default_rng(mixed)
108
 
109
-
110
  # -----------------------------
111
  # Data structures
112
  # -----------------------------
@@ -135,20 +136,20 @@ class TrainConfig:
135
  epsilon_min: float = 0.02
136
  epsilon_decay: float = 0.995
137
 
138
- # shaping (generic)
139
  step_penalty: float = -0.01
140
  explore_reward: float = 0.015
141
  damage_penalty: float = -0.20
142
  heal_reward: float = 0.10
143
 
144
- # chase env shaping
145
  chase_close_coeff: float = 0.03
146
  chase_catch_reward: float = 3.0
147
  chase_escaped_reward: float = 0.2
148
  chase_caught_penalty: float = -3.0
149
  food_reward: float = 0.6
150
 
151
- # vault env shaping
152
  artifact_pick_reward: float = 1.2
153
  exit_win_reward: float = 3.0
154
  guardian_tag_reward: float = 2.0
@@ -156,7 +157,7 @@ class TrainConfig:
156
  switch_reward: float = 0.8
157
  key_reward: float = 0.4
158
 
159
- # civ env shaping
160
  resource_pick_reward: float = 0.15
161
  deposit_reward: float = 0.4
162
  base_progress_win_reward: float = 3.5
@@ -181,7 +182,6 @@ class EpisodeMetrics:
181
  returns: Dict[str, float] = None
182
  action_counts: Dict[str, Dict[str, int]] = None
183
  tiles_discovered: Dict[str, int] = None
184
- q_states: Dict[str, int] = None
185
 
186
  def __post_init__(self):
187
  if self.returns is None:
@@ -190,8 +190,6 @@ class EpisodeMetrics:
190
  self.action_counts = {}
191
  if self.tiles_discovered is None:
192
  self.tiles_discovered = {}
193
- if self.q_states is None:
194
- self.q_states = {}
195
 
196
  @dataclass
197
  class WorldState:
@@ -206,7 +204,7 @@ class WorldState:
206
  overlay: bool
207
 
208
  done: bool
209
- outcome: str # "A_win" | "B_win" | "draw" | "ongoing"
210
 
211
  # env state
212
  door_opened_global: bool = False
@@ -219,7 +217,7 @@ class WorldState:
219
 
220
  # learning
221
  cfg: TrainConfig = None
222
- q_tables: Dict[str, Dict[str, List[float]]] = None # per-agent Q
223
  gmetrics: GlobalMetrics = None
224
  emetrics: EpisodeMetrics = None
225
 
@@ -301,25 +299,23 @@ def within_fov(observer: Agent, tx: int, ty: int, fov_deg: float = FOV_DEG) -> b
301
  return abs(diff) <= (fov_deg / 2)
302
 
303
def visible(state: WorldState, observer: Agent, target: Agent) -> bool:
    """Return True when *target* is inside *observer*'s FOV cone with clear line of sight."""
    in_cone = within_fov(observer, target.x, target.y, FOV_DEG)
    if not in_cone:
        return False
    return bresenham_los(state.grid, observer.x, observer.y, target.x, target.y)
309
 
310
def hash_sha256(txt: str) -> str:
    """Return the hex SHA-256 digest of *txt* (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(txt.encode("utf-8"))
    return digest.hexdigest()
312
 
313
  # -----------------------------
314
- # Belief maps / fog-of-war
315
  # -----------------------------
316
def init_beliefs(agent_names: List[str]) -> Dict[str, np.ndarray]:
    """Create one belief grid per agent, filled with -1 ("unknown tile")."""
    return {
        name: np.full((GRID_H, GRID_W), -1, dtype=np.int16)
        for name in agent_names
    }
 
321
 
322
- def update_belief_for_agent(state: WorldState, belief: np.ndarray, agent: Agent) -> None:
323
  belief[agent.y, agent.x] = state.grid[agent.y][agent.x]
324
  base = math.radians(ORI_DEG[agent.ori])
325
  half = math.radians(FOV_DEG / 2)
@@ -344,6 +340,9 @@ def update_belief_for_agent(state: WorldState, belief: np.ndarray, agent: Agent)
344
  if tile == DOOR and not state.door_opened_global:
345
  break
346
 
 
 
 
347
  # -----------------------------
348
  # Rendering
349
  # -----------------------------
@@ -410,7 +409,7 @@ def raycast_view(state: WorldState, observer: Agent) -> np.ndarray:
410
 
411
  # billboards for visible agents
412
  for nm, other in state.agents.items():
413
- if nm == observer.name:
414
  continue
415
  if visible(state, observer, other):
416
  dx = other.x - observer.x
@@ -508,7 +507,7 @@ def render_topdown(grid: np.ndarray, agents: Dict[str, Agent], title: str, show_
508
  return im
509
 
510
  # -----------------------------
511
- # Environments (3 modes)
512
  # -----------------------------
513
  def grid_with_border() -> List[List[int]]:
514
  g = [[EMPTY for _ in range(GRID_W)] for _ in range(GRID_H)]
@@ -522,12 +521,10 @@ def grid_with_border() -> List[List[int]]:
522
 
523
  def env_chase(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
524
  g = grid_with_border()
525
- # mid-wall + door
526
  for x in range(4, 17):
527
  g[7][x] = WALL
528
  g[7][10] = DOOR
529
 
530
- # objects
531
  g[3][4] = FOOD
532
  g[11][15] = FOOD
533
  g[4][14] = NOISE
@@ -544,7 +541,6 @@ def env_chase(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
544
 
545
  def env_vault(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
546
  g = grid_with_border()
547
- # internal maze
548
  for x in range(3, 18):
549
  g[5][x] = WALL
550
  for x in range(3, 18):
@@ -552,7 +548,6 @@ def env_vault(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
552
  g[5][10] = DOOR
553
  g[9][12] = DOOR
554
 
555
- # special tiles
556
  g[2][2] = KEY
557
  g[12][18] = EXIT
558
  g[12][2] = ARTIFACT
@@ -572,13 +567,10 @@ def env_vault(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
572
 
573
  def env_civ(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
574
  g = grid_with_border()
575
-
576
- # walls forming zones
577
  for y in range(3, 12):
578
  g[y][9] = WALL
579
  g[7][9] = DOOR
580
 
581
- # resources
582
  g[2][3] = WOOD
583
  g[3][3] = WOOD
584
  g[4][3] = WOOD
@@ -588,7 +580,6 @@ def env_civ(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
588
  g[6][4] = FOOD
589
  g[8][15] = FOOD
590
 
591
- # base + hazards + switch/key
592
  g[13][10] = BASE
593
  g[4][15] = HAZARD
594
  g[10][4] = HAZARD
@@ -596,7 +587,6 @@ def env_civ(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
596
  g[13][2] = TELE
597
  g[2][2] = KEY
598
  g[12][6] = SWITCH
599
- g[7][9] = DOOR
600
 
601
  agents = {
602
  "BuilderA": Agent("BuilderA", 3, 12, 0, hp=10, energy=100, team="A", brain="q"),
@@ -605,14 +595,10 @@ def env_civ(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
605
  }
606
  return g, agents
607
 
608
- ENV_BUILDERS = {
609
- "chase": env_chase,
610
- "vault": env_vault,
611
- "civ": env_civ,
612
- }
613
 
614
  # -----------------------------
615
- # Observation encoding (compact)
616
  # -----------------------------
617
  def local_tile_ahead(state: WorldState, a: Agent) -> int:
618
  dx, dy = DIRS[a.ori]
@@ -623,7 +609,7 @@ def local_tile_ahead(state: WorldState, a: Agent) -> int:
623
 
624
  def nearest_enemy_vec(state: WorldState, a: Agent) -> Tuple[int, int, int]:
625
  best = None
626
- for nm, other in state.agents.items():
627
  if other.hp <= 0:
628
  continue
629
  if other.team == a.team:
@@ -638,15 +624,12 @@ def nearest_enemy_vec(state: WorldState, a: Agent) -> Tuple[int, int, int]:
638
 
639
def obs_key(state: WorldState, who: str) -> str:
    """Build the compact, hashable observation string used as a tabular-Q state key.

    Encodes environment, agent identity/pose, coarse nearest-enemy vector, the
    tile directly ahead, HP, a bucketed inventory summary, the global door
    flag, and base progress.
    """
    agent = state.agents[who]
    d, dx, dy = nearest_enemy_vec(state, agent)
    ahead = local_tile_ahead(state, agent)
    inv = agent.inventory
    keys = inv.get("key", 0)
    art = inv.get("artifact", 0)
    wood = inv.get("wood", 0)
    ore = inv.get("ore", 0)
    # bucket counts so the state space stays small for tabular learning
    inv_bucket = f"k{min(keys,2)}a{min(art,1)}w{min(wood,3)}o{min(ore,3)}"
    door = 1 if state.door_opened_global else 0
    return (
        f"{state.env_key}|{who}|{agent.x},{agent.y},{agent.ori}"
        f"|e{d}:{dx},{dy}|t{ahead}|hp{agent.hp}|{inv_bucket}|D{door}"
        f"|bp{state.base_progress}"
    )
@@ -672,7 +655,7 @@ def q_update(q: Dict[str, List[float]], key: str, a_idx: int, reward: float, nex
672
  return old, target, new
673
 
674
  # -----------------------------
675
- # Heuristic baselines
676
  # -----------------------------
677
  def face_towards(a: Agent, tx: int, ty: int) -> str:
678
  dx = tx - a.x
@@ -690,30 +673,27 @@ def heuristic_action(state: WorldState, who: str) -> str:
690
  a = state.agents[who]
691
  r = rng_for(state.seed, state.step, stream=900 + hash(who) % 1000)
692
 
693
- # simple: if enemy visible, chase if team B (raider/guardian/predator) else flee-ish
694
- # also, prioritize interact if standing on something valuable
695
- tile_here = state.grid[a.y][a.x]
696
- if tile_here in (FOOD, KEY, ARTIFACT, WOOD, ORE, MEDKIT, SWITCH, BASE, EXIT):
697
  return "I"
698
 
699
- # find nearest enemy
700
- best_nm = None
701
- best_d = 999
702
  best = None
703
- for nm, other in state.agents.items():
 
704
  if other.hp <= 0 or other.team == a.team:
705
  continue
706
  d = manhattan_xy(a.x, a.y, other.x, other.y)
707
  if d < best_d:
708
  best_d = d
709
- best_nm = nm
710
  best = other
711
 
712
  if best is not None and best_d <= 6 and visible(state, a, best):
713
- # attackers chase
714
  if a.team == "B":
715
  return face_towards(a, best.x, best.y)
716
- # defenders flee: turn away from enemy vector
717
  dx = best.x - a.x
718
  dy = best.y - a.y
719
  ang = (math.degrees(math.atan2(dy, dx)) % 360)
@@ -726,7 +706,6 @@ def heuristic_action(state: WorldState, who: str) -> str:
726
  return "R"
727
  return "F"
728
 
729
- # mild exploration bias: try forward more
730
  return r.choice(["F", "F", "L", "R", "I"])
731
 
732
  def random_action(state: WorldState, who: str) -> str:
@@ -766,7 +745,6 @@ def move_forward(state: WorldState, a: Agent) -> str:
766
  def try_interact(state: WorldState, a: Agent) -> str:
767
  t = state.grid[a.y][a.x]
768
 
769
- # door open global via key or switch
770
  if t == SWITCH:
771
  state.door_opened_global = True
772
  state.grid[a.y][a.x] = EMPTY
@@ -804,7 +782,6 @@ def try_interact(state: WorldState, a: Agent) -> str:
804
  return "used: medkit"
805
 
806
  if t == BASE:
807
- # deposit resources into base_progress
808
  w = a.inventory.get("wood", 0)
809
  o = a.inventory.get("ore", 0)
810
  dep = min(w, 2) + min(o, 2)
@@ -837,22 +814,19 @@ def apply_action(state: WorldState, who: str, action: str) -> str:
837
  return "noop"
838
 
839
  # -----------------------------
840
- # Combat / hazards / win conditions
841
  # -----------------------------
842
  def resolve_hazards(state: WorldState, a: Agent) -> Tuple[bool, str]:
843
- # returns (took_damage, msg)
844
  if a.hp <= 0:
845
  return (False, "")
846
- t = state.grid[a.y][a.x]
847
- if t == HAZARD:
848
  a.hp -= 1
849
  return (True, "hazard:-hp")
850
  return (False, "")
851
 
852
  def resolve_tags(state: WorldState) -> List[str]:
853
- # if two opposing agents occupy same tile: team B "tags" team A
854
  msgs = []
855
- occupied = {}
856
  for nm, a in state.agents.items():
857
  if a.hp <= 0:
858
  continue
@@ -863,14 +837,12 @@ def resolve_tags(state: WorldState) -> List[str]:
863
  continue
864
  teams = set(state.agents[n].team for n in names)
865
  if len(teams) >= 2:
866
- # tag: both sides take 1 hp damage, but log who collided
867
  for n in names:
868
  state.agents[n].hp -= 1
869
  msgs.append(f"t={state.step}: collision/tag at ({x},{y}) {names} (-hp all)")
870
  return msgs
871
 
872
  def check_done(state: WorldState) -> None:
873
- # Determine environment-specific terminal conditions
874
  if state.env_key == "chase":
875
  pred = state.agents["Predator"]
876
  prey = state.agents["Prey"]
@@ -878,12 +850,11 @@ def check_done(state: WorldState) -> None:
878
  state.done = True
879
  state.outcome = "draw"
880
  return
881
- if pred.x == prey.x and pred.y == prey.y and pred.hp > 0 and prey.hp > 0:
882
  state.done = True
883
- state.outcome = "A_win" # Predator team A
884
  state.event_log.append(f"t={state.step}: CAUGHT (Predator wins).")
885
  return
886
- # prey "escape" win if survives long enough with food? Use energy threshold
887
  if state.step >= 300 and prey.hp > 0:
888
  state.done = True
889
  state.outcome = "B_win"
@@ -891,7 +862,6 @@ def check_done(state: WorldState) -> None:
891
  return
892
 
893
  if state.env_key == "vault":
894
- # Team A wins if any A has artifact and reaches exit
895
  for nm in ["Alpha", "Bravo"]:
896
  a = state.agents[nm]
897
  if a.hp > 0 and a.inventory.get("artifact", 0) > 0 and state.grid[a.y][a.x] == EXIT:
@@ -899,7 +869,6 @@ def check_done(state: WorldState) -> None:
899
  state.outcome = "A_win"
900
  state.event_log.append(f"t={state.step}: VAULT CLEARED (Team A wins).")
901
  return
902
- # Team B wins if all A agents eliminated
903
  alive_A = any(state.agents[n].hp > 0 for n in ["Alpha", "Bravo"])
904
  if not alive_A:
905
  state.done = True
@@ -908,20 +877,17 @@ def check_done(state: WorldState) -> None:
908
  return
909
 
910
  if state.env_key == "civ":
911
- # Team A wins if base_progress reaches target
912
  if state.base_progress >= state.base_target:
913
  state.done = True
914
  state.outcome = "A_win"
915
  state.event_log.append(f"t={state.step}: BASE COMPLETE (Builders win).")
916
  return
917
- # Team B wins if both builders eliminated
918
  alive_A = any(state.agents[n].hp > 0 for n in ["BuilderA", "BuilderB"])
919
  if not alive_A:
920
  state.done = True
921
  state.outcome = "B_win"
922
  state.event_log.append(f"t={state.step}: BUILDERS ELIMINATED (Raider wins).")
923
  return
924
- # draw if too long
925
  if state.step >= 350:
926
  state.done = True
927
  state.outcome = "draw"
@@ -931,88 +897,72 @@ def check_done(state: WorldState) -> None:
931
  # -----------------------------
932
  # Rewards
933
  # -----------------------------
934
- def reward_for(state_prev: WorldState, state_now: WorldState, who: str, outcome_msg: str,
935
- took_damage: bool, interacted: bool) -> float:
936
- cfg = state_now.cfg
937
- a0 = state_prev.agents[who]
938
- a1 = state_now.agents[who]
939
-
940
  r = cfg.step_penalty
941
-
942
- # exploration reward: if agent discovered new tiles in belief (we approximate via emetrics tiles_discovered)
943
- # we update this outside; here just tiny reward if moved
944
  if outcome_msg.startswith("moved"):
945
  r += cfg.explore_reward
946
-
947
  if took_damage:
948
  r += cfg.damage_penalty
949
-
950
- # heal reward if used medkit
951
  if outcome_msg.startswith("used: medkit"):
952
  r += cfg.heal_reward
953
 
954
- # environment shaping
955
- if state_now.env_key == "chase":
956
- pred = state_now.agents["Predator"]
957
- prey = state_now.agents["Prey"]
958
  if who == "Predator":
959
- d0 = manhattan_xy(state_prev.agents["Predator"].x, state_prev.agents["Predator"].y,
960
- state_prev.agents["Prey"].x, state_prev.agents["Prey"].y)
961
  d1 = manhattan_xy(pred.x, pred.y, prey.x, prey.y)
962
  r += cfg.chase_close_coeff * float(d0 - d1)
963
- if state_now.done and state_now.outcome == "A_win":
964
  r += cfg.chase_catch_reward
965
  if who == "Prey":
966
  if outcome_msg.startswith("ate: food"):
967
  r += cfg.food_reward
968
- if state_now.done and state_now.outcome == "B_win":
969
  r += cfg.chase_escaped_reward
970
- if state_now.done and state_now.outcome == "A_win":
971
  r += cfg.chase_caught_penalty
972
 
973
- if state_now.env_key == "vault":
974
  if outcome_msg.startswith("picked: artifact"):
975
  r += cfg.artifact_pick_reward
976
  if outcome_msg.startswith("picked: key"):
977
  r += cfg.key_reward
978
  if outcome_msg.startswith("switch:"):
979
  r += cfg.switch_reward
980
- if state_now.done:
981
- if state_now.outcome == "A_win" and state_now.agents[who].team == "A":
982
  r += cfg.exit_win_reward
983
- if state_now.outcome == "B_win" and state_now.agents[who].team == "B":
984
  r += cfg.guardian_tag_reward
985
- if state_now.outcome == "B_win" and state_now.agents[who].team == "A":
986
  r += cfg.tagged_penalty
987
 
988
- if state_now.env_key == "civ":
989
  if outcome_msg.startswith("picked: wood") or outcome_msg.startswith("picked: ore"):
990
  r += cfg.resource_pick_reward
991
  if outcome_msg.startswith("deposited:"):
992
  r += cfg.deposit_reward
993
- if state_now.done:
994
- if state_now.outcome == "A_win" and state_now.agents[who].team == "A":
995
  r += cfg.base_progress_win_reward
996
- if state_now.outcome == "B_win" and state_now.agents[who].team == "B":
997
  r += cfg.raider_elim_reward
998
- if state_now.outcome == "B_win" and state_now.agents[who].team == "A":
999
  r += cfg.builder_elim_penalty
1000
 
1001
  return float(r)
1002
 
1003
  # -----------------------------
1004
- # Q / policy selection
1005
  # -----------------------------
1006
  def choose_action(state: WorldState, who: str, stream: int) -> Tuple[str, str, Optional[Tuple[str, int]]]:
1007
- """
1008
- Returns (action, reason, q_info)
1009
- q_info: (obs_key, action_index) if Q-based else None
1010
- """
1011
  a = state.agents[who]
1012
  cfg = state.cfg
1013
  r = rng_for(state.seed, state.step, stream=stream)
1014
 
1015
- # manual control handled outside
1016
  if a.brain == "random":
1017
  act = random_action(state, who)
1018
  return act, "random", None
@@ -1020,7 +970,6 @@ def choose_action(state: WorldState, who: str, stream: int) -> Tuple[str, str, O
1020
  act = heuristic_action(state, who)
1021
  return act, "heuristic", None
1022
 
1023
- # Q learning
1024
  if cfg.use_q:
1025
  key = obs_key(state, who)
1026
  qtab = state.q_tables.setdefault(who, {})
@@ -1032,11 +981,10 @@ def choose_action(state: WorldState, who: str, stream: int) -> Tuple[str, str, O
1032
  return act, "heuristic(fallback)", None
1033
 
1034
  # -----------------------------
1035
- # Episode initialization / reset
1036
  # -----------------------------
1037
  def init_state(seed: int, env_key: str) -> WorldState:
1038
  g, agents = ENV_BUILDERS[env_key](seed)
1039
-
1040
  st = WorldState(
1041
  seed=seed,
1042
  step=0,
@@ -1059,7 +1007,6 @@ def reset_episode_keep_learning(state: WorldState, seed: Optional[int] = None) -
1059
  if seed is None:
1060
  seed = state.seed
1061
  fresh = init_state(int(seed), state.env_key)
1062
- # carry learning + global metrics
1063
  fresh.cfg = state.cfg
1064
  fresh.q_tables = state.q_tables
1065
  fresh.gmetrics = state.gmetrics
@@ -1110,52 +1057,52 @@ def restore_into(state: WorldState, snap: Snapshot) -> WorldState:
1110
  return state
1111
 
1112
  # -----------------------------
1113
- # Metrics / dashboard
1114
  # -----------------------------
1115
def update_action_counts(state: WorldState, who: str, act: str):
    """Increment the per-agent tally for action *act* in the episode metrics."""
    per_agent = state.emetrics.action_counts.setdefault(who, {})
    per_agent[act] = per_agent.get(act, 0) + 1
1118
-
1119
def action_entropy(counts: Dict[str, int]) -> float:
    """Shannon entropy (bits) of an action-count distribution; 0.0 for empty counts."""
    total = sum(counts.values())
    if total <= 0:
        return 0.0
    probs = np.array([c / total for c in counts.values()], dtype=np.float64)
    # clip away zeros so log2 stays finite
    probs = np.clip(probs, 1e-12, 1.0)
    return float(-np.sum(probs * np.log2(probs)))
1126
-
1127
  def metrics_dashboard_image(state: WorldState) -> Image.Image:
1128
  gm = state.gmetrics
 
1129
  fig = plt.figure(figsize=(7.0, 2.2), dpi=120)
1130
  ax = fig.add_subplot(111)
1131
- ax.plot([0, gm.episodes], [gm.rolling_winrate_A, gm.rolling_winrate_A])
 
 
1132
  ax.set_title("Global Metrics Snapshot")
1133
- ax.set_xlabel("Episodes (scalar)")
1134
  ax.set_ylabel("Rolling winrate Team A")
 
1135
  ax.grid(True)
1136
 
1137
- # annotate
1138
  txt = (
1139
  f"env={state.env_key} | eps={gm.epsilon:.3f} | episodes={gm.episodes}\n"
1140
- f"A_wins={gm.wins_teamA} B_wins={gm.wins_teamB} draws={gm.draws} | "
1141
- f"avg_steps~{gm.avg_steps:.1f}\n"
1142
  f"last_outcome={gm.last_outcome} last_steps={gm.last_steps}"
1143
  )
1144
  ax.text(0.01, 0.05, txt, transform=ax.transAxes, fontsize=8, va="bottom")
1145
 
1146
  fig.tight_layout()
1147
- fig.canvas.draw()
1148
- w, h = fig.canvas.get_width_height()
1149
- img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8).reshape(h, w, 3)
 
 
1150
  plt.close(fig)
1151
- return Image.fromarray(img)
 
 
 
 
 
 
 
 
1152
 
1153
  def agent_scoreboard(state: WorldState) -> str:
1154
  rows = []
1155
  header = ["agent", "team", "hp", "return", "steps", "entropy", "tiles_disc", "q_states", "inventory"]
1156
  rows.append(header)
1157
-
1158
  steps = state.emetrics.steps
 
1159
  for nm, a in state.agents.items():
1160
  ret = state.emetrics.returns.get(nm, 0.0)
1161
  counts = state.emetrics.action_counts.get(nm, {})
@@ -1165,7 +1112,6 @@ def agent_scoreboard(state: WorldState) -> str:
1165
  inv = json.dumps(a.inventory, sort_keys=True)
1166
  rows.append([nm, a.team, a.hp, f"{ret:.2f}", steps, f"{ent:.2f}", td, qs, inv])
1167
 
1168
- # pretty format as fixed-width table
1169
  col_w = [max(len(str(r[i])) for r in rows) for i in range(len(header))]
1170
  lines = []
1171
  for ridx, r in enumerate(rows):
@@ -1176,11 +1122,10 @@ def agent_scoreboard(state: WorldState) -> str:
1176
  return "\n".join(lines)
1177
 
1178
  # -----------------------------
1179
- # Tick (core simulation step)
1180
  # -----------------------------
1181
  def clone_shallow(state: WorldState) -> WorldState:
1182
- # minimal clone to compute rewards
1183
- st = WorldState(
1184
  seed=state.seed,
1185
  step=state.step,
1186
  env_key=state.env_key,
@@ -1201,15 +1146,16 @@ def clone_shallow(state: WorldState) -> WorldState:
1201
  gmetrics=state.gmetrics,
1202
  emetrics=state.emetrics,
1203
  )
1204
- return st
 
 
 
1205
 
1206
  def tick(state: WorldState, beliefs: Dict[str, np.ndarray], manual_action: Optional[str] = None) -> None:
1207
  if state.done:
1208
  return
1209
 
1210
  prev = clone_shallow(state)
1211
-
1212
- # pick actions
1213
  chosen: Dict[str, str] = {}
1214
  reasons: Dict[str, str] = {}
1215
  qinfo: Dict[str, Optional[Tuple[str, int]]] = {}
@@ -1219,8 +1165,9 @@ def tick(state: WorldState, beliefs: Dict[str, np.ndarray], manual_action: Optio
1219
  reasons[state.controlled] = "manual"
1220
  qinfo[state.controlled] = None
1221
 
1222
- # others choose
1223
- for who in list(state.agents.keys()):
 
1224
  if who in chosen:
1225
  continue
1226
  act, reason, qi = choose_action(state, who, stream=200 + (hash(who) % 1000))
@@ -1228,50 +1175,38 @@ def tick(state: WorldState, beliefs: Dict[str, np.ndarray], manual_action: Optio
1228
  reasons[who] = reason
1229
  qinfo[who] = qi
1230
 
1231
- # apply actions in fixed order (deterministic)
1232
- order = list(state.agents.keys())
1233
  outcomes: Dict[str, str] = {}
1234
  took_damage: Dict[str, bool] = {nm: False for nm in order}
1235
- interacted: Dict[str, bool] = {nm: False for nm in order}
1236
 
1237
  for who in order:
1238
- before_tile = state.grid[state.agents[who].y][state.agents[who].x] if state.agents[who].hp > 0 else EMPTY
1239
  outcomes[who] = apply_action(state, who, chosen[who])
1240
- if chosen[who] == "I" and outcomes[who] != "interact: none":
1241
- interacted[who] = True
1242
 
1243
  dmg, msg = resolve_hazards(state, state.agents[who])
1244
  took_damage[who] = dmg
1245
  if msg:
1246
  state.event_log.append(f"t={state.step}: {who} {msg}")
1247
 
1248
- # track action counts
1249
  update_action_counts(state, who, chosen[who])
1250
 
1251
- # collisions/tags after movement
1252
- tag_msgs = resolve_tags(state)
1253
- for m in tag_msgs:
1254
  state.event_log.append(m)
1255
 
1256
- # update beliefs / tiles discovered metric
1257
  for nm, a in state.agents.items():
1258
  if a.hp <= 0:
1259
  continue
1260
- before_unknown = int(np.sum(beliefs[nm] == -1))
1261
- update_belief_for_agent(state, beliefs[nm], a)
1262
- after_unknown = int(np.sum(beliefs[nm] == -1))
1263
- discovered = max(0, before_unknown - after_unknown)
1264
- state.emetrics.tiles_discovered[nm] = state.emetrics.tiles_discovered.get(nm, 0) + discovered
1265
 
1266
- # check done conditions
1267
  check_done(state)
1268
 
1269
- # rewards + Q updates + returns
1270
  q_lines = []
1271
  for who in order:
1272
  if who not in state.emetrics.returns:
1273
  state.emetrics.returns[who] = 0.0
1274
- r = reward_for(prev, state, who, outcomes[who], took_damage[who], interacted[who])
 
1275
  state.emetrics.returns[who] += r
1276
 
1277
  if qinfo.get(who) is not None:
@@ -1281,7 +1216,6 @@ def tick(state: WorldState, beliefs: Dict[str, np.ndarray], manual_action: Optio
1281
  old, tgt, new = q_update(qtab, key, a_idx, r, next_key, state.cfg.alpha, state.cfg.gamma)
1282
  q_lines.append(f"{who}: old={old:.3f} tgt={tgt:.3f} new={new:.3f} (a={ACTIONS[a_idx]})")
1283
 
1284
- # trace
1285
  trace = f"t={state.step} env={state.env_key} done={state.done} outcome={state.outcome}"
1286
  for who in order:
1287
  a = state.agents[who]
@@ -1296,9 +1230,6 @@ def tick(state: WorldState, beliefs: Dict[str, np.ndarray], manual_action: Optio
1296
  state.step += 1
1297
  state.emetrics.steps = state.step
1298
 
1299
- # -----------------------------
1300
- # Training
1301
- # -----------------------------
1302
  def run_episode(state: WorldState, beliefs: Dict[str, np.ndarray], max_steps: int) -> Tuple[str, int]:
1303
  while state.step < max_steps and not state.done:
1304
  tick(state, beliefs, manual_action=None)
@@ -1321,23 +1252,20 @@ def update_global_metrics_after_episode(state: WorldState, outcome: str, steps:
1321
  gm.rolling_winrate_A = 0.90 * gm.rolling_winrate_A + 0.10 * 0.5
1322
 
1323
  gm.avg_steps = (0.90 * gm.avg_steps + 0.10 * steps) if gm.avg_steps > 0 else float(steps)
1324
-
1325
- # epsilon decay
1326
  gm.epsilon = max(state.cfg.epsilon_min, gm.epsilon * state.cfg.epsilon_decay)
1327
 
1328
def train(state: WorldState, episodes: int, max_steps: int) -> WorldState:
    """Run *episodes* training episodes, then return a freshly reset episode.

    Each episode reseeds deterministically from the base seed and the global
    episode counter, so repeated training runs reproduce exactly.  Learning
    state (Q-tables, global metrics) is carried across episode resets.
    """
    for idx in range(episodes):
        episode_seed = (
            state.seed * 1_000_003 + (state.gmetrics.episodes + idx) * 97_531
        ) & 0xFFFFFFFF
        state = reset_episode_keep_learning(state, seed=int(episode_seed))
        beliefs = init_beliefs(list(state.agents.keys()))
        outcome, steps = run_episode(state, beliefs, max_steps=max_steps)
        update_global_metrics_after_episode(state, outcome, steps)

    state.event_log.append(
        f"Training: +{episodes} eps | eps={state.gmetrics.epsilon:.3f} | "
        f"A={state.gmetrics.wins_teamA} B={state.gmetrics.wins_teamB} D={state.gmetrics.draws}"
    )
    # hand back a clean episode at the original seed, learning intact
    state = reset_episode_keep_learning(state, seed=state.seed)
    return state
1343
 
@@ -1356,7 +1284,7 @@ def export_run(state: WorldState, branches: Dict[str, List[Snapshot]], active_br
1356
  "q_tables": state.q_tables,
1357
  "branches": {b: [asdict(s) for s in snaps] for b, snaps in branches.items()},
1358
  "active_branch": active_branch,
1359
- "rewind_idx": rewind_idx,
1360
  "grid": state.grid,
1361
  "door_opened_global": state.door_opened_global,
1362
  "base_progress": state.base_progress,
@@ -1367,7 +1295,6 @@ def export_run(state: WorldState, branches: Dict[str, List[Snapshot]], active_br
1367
  return txt + "\n\n" + json.dumps({"proof_sha256": proof}, indent=2)
1368
 
1369
  def import_run(txt: str) -> Tuple[WorldState, Dict[str, List[Snapshot]], str, int, Dict[str, np.ndarray]]:
1370
- # allow trailing proof block
1371
  parts = txt.strip().split("\n\n")
1372
  data = json.loads(parts[0])
1373
 
@@ -1392,7 +1319,6 @@ def import_run(txt: str) -> Tuple[WorldState, Dict[str, List[Snapshot]], str, in
1392
  active = data.get("active_branch", "main")
1393
  r_idx = int(data.get("rewind_idx", 0))
1394
 
1395
- # restore last snap of active branch if exists
1396
  if active in branches and branches[active]:
1397
  st = restore_into(st, branches[active][-1])
1398
  st.event_log.append("Imported run (restored last snapshot).")
@@ -1406,21 +1332,19 @@ def import_run(txt: str) -> Tuple[WorldState, Dict[str, List[Snapshot]], str, in
1406
  # UI helpers
1407
  # -----------------------------
1408
  def build_views(state: WorldState, beliefs: Dict[str, np.ndarray]) -> Tuple[np.ndarray, Image.Image, Image.Image, Image.Image, Image.Image, str, str, str, str]:
1409
- # update beliefs
1410
  for nm, a in state.agents.items():
1411
  if a.hp > 0:
1412
  update_belief_for_agent(state, beliefs[nm], a)
1413
 
1414
  pov = raycast_view(state, state.agents[state.pov])
1415
  truth_np = np.array(state.grid, dtype=np.int16)
1416
- truth_img = render_topdown(truth_np, state.agents, f"Truth Map — env={state.env_key} t={state.step} seed={state.seed}", show_agents=True)
1417
 
1418
  ctrl = state.controlled
1419
- # pick "other belief" as someone else
1420
  others = [k for k in state.agents.keys() if k != ctrl]
1421
  other = others[0] if others else ctrl
1422
- b_ctrl = render_topdown(beliefs[ctrl], state.agents, f"{ctrl} Belief", show_agents=True)
1423
- b_other = render_topdown(beliefs[other], state.agents, f"{other} Belief", show_agents=True)
1424
 
1425
  dash = metrics_dashboard_image(state)
1426
 
@@ -1444,7 +1368,6 @@ def grid_click_to_tile(evt: gr.SelectData, selected_tile: int, state: WorldState
1444
  gy = int(y_px // TILE)
1445
  if not in_bounds(gx, gy):
1446
  return state
1447
- # keep border walls fixed
1448
  if gx == 0 or gy == 0 or gx == GRID_W - 1 or gy == GRID_H - 1:
1449
  return state
1450
  state.grid[gy][gx] = selected_tile
@@ -1452,23 +1375,23 @@ def grid_click_to_tile(evt: gr.SelectData, selected_tile: int, state: WorldState
1452
  return state
1453
 
1454
  # -----------------------------
1455
- # Gradio App
1456
  # -----------------------------
1457
  TITLE = "ZEN AgentLab — Agent POV + Multi-Agent Mini-Sim Arena"
1458
 
1459
  with gr.Blocks(title=TITLE) as demo:
1460
  gr.Markdown(
1461
  f"## {TITLE}\n"
1462
- "A multi-environment agent observatory with POV, belief maps, branching timelines, training, and metrics.\n"
1463
- "**Controls:** No timers. Use Tick / Run / Train for deterministic experiments."
1464
  )
1465
 
1466
- # Core state
1467
- st = gr.State(init_state(1337, "chase"))
1468
- branches = gr.State({"main": [snapshot_of(init_state(1337, "chase"), "main")]})
1469
  active_branch = gr.State("main")
1470
  rewind_idx = gr.State(0)
1471
- beliefs = gr.State(init_beliefs(list(init_state(1337, "chase").agents.keys())))
1472
 
1473
  with gr.Row():
1474
  pov_img = gr.Image(label="POV (Pseudo-3D)", type="numpy", width=VIEW_W, height=VIEW_H)
@@ -1486,7 +1409,7 @@ with gr.Blocks(title=TITLE) as demo:
1486
 
1487
  with gr.Row():
1488
  events = gr.Textbox(label="Event Log", lines=10)
1489
- trace = gr.Textbox(label="Step Trace (why it happened)", lines=10)
1490
 
1491
  with gr.Row():
1492
  with gr.Column(scale=2):
@@ -1523,14 +1446,14 @@ with gr.Blocks(title=TITLE) as demo:
1523
  with gr.Column(scale=3):
1524
  gr.Markdown("### Training Controls (Tabular Q-learning)")
1525
  use_q = gr.Checkbox(True, label="Use Q-learning (agents with brain='q')")
1526
- alpha = gr.Slider(0.01, 0.5, value=0.15, step=0.01, label="alpha (learn rate)")
1527
- gamma = gr.Slider(0.5, 0.99, value=0.95, step=0.01, label="gamma (discount)")
1528
- eps = gr.Slider(0.0, 0.5, value=0.10, step=0.01, label="epsilon (exploration)")
1529
  eps_decay = gr.Slider(0.90, 0.999, value=0.995, step=0.001, label="epsilon decay")
1530
  eps_min = gr.Slider(0.0, 0.2, value=0.02, step=0.01, label="epsilon min")
1531
 
1532
  episodes = gr.Number(value=50, label="Train episodes", precision=0)
1533
- max_steps = gr.Number(value=260, label="Max steps per episode", precision=0)
1534
  btn_train = gr.Button("Train")
1535
 
1536
  btn_reset = gr.Button("Reset Episode (keep learning)")
@@ -1554,19 +1477,16 @@ with gr.Blocks(title=TITLE) as demo:
1554
  import_box = gr.Textbox(label="Import JSON", lines=8)
1555
  btn_import = gr.Button("Import")
1556
 
1557
- # ---------- UI glue ----------
1558
  def refresh(state: WorldState, branches_d: Dict[str, List[Snapshot]], active: str, bel: Dict[str, np.ndarray], r: int):
1559
  snaps = branches_d.get(active, [])
1560
  r_max = max(0, len(snaps) - 1)
1561
  r = max(0, min(int(r), r_max))
1562
  pov, tr, ba, bb, dimg, stxt, etxt, ttxt, sb = build_views(state, bel)
1563
-
1564
  branch_choices = sorted(list(branches_d.keys()))
1565
  return (
1566
- pov, tr, ba, bb, dimg,
1567
- stxt, sb, etxt, ttxt,
1568
- gr.update(maximum=r_max, value=r),
1569
- r,
1570
  gr.update(choices=branch_choices, value=active),
1571
  gr.update(choices=branch_choices, value=active),
1572
  )
@@ -1664,9 +1584,7 @@ with gr.Blocks(title=TITLE) as demo:
1664
  branches_d[new_name].append(snapshot_of(state, new_name))
1665
  else:
1666
  idx = max(0, min(int(r), len(snaps) - 1))
1667
- # fork snapshots up to idx (inclusive)
1668
  branches_d[new_name] = [Snapshot(**asdict(s)) for s in snaps[:idx + 1]]
1669
- # restore state at fork point (last fork snap)
1670
  state = restore_into(state, branches_d[new_name][-1])
1671
  active = new_name
1672
  state.event_log.append(f"Forked branch -> {new_name}")
@@ -1680,17 +1598,15 @@ with gr.Blocks(title=TITLE) as demo:
1680
  if br not in branches_d:
1681
  branches_d[br] = [snapshot_of(state, br)]
1682
  active = br
1683
- # restore latest state on that branch
1684
  if branches_d[active]:
1685
  state = restore_into(state, branches_d[active][-1])
1686
  bel = init_beliefs(list(state.agents.keys()))
1687
- out = refresh(state, branches_d, active, bel, len(branches_d[active]) - 1)
1688
  r = len(branches_d[active]) - 1
 
1689
  return out + (state, branches_d, active, bel, r)
1690
 
1691
  def change_env(state, branches_d, active, bel, r, env_key):
1692
  env_key = env_key or "chase"
1693
- # reset episode but keep learning tables (they are per-agent key so safe)
1694
  state.env_key = env_key
1695
  state = reset_episode_keep_learning(state, seed=state.seed)
1696
  bel = init_beliefs(list(state.agents.keys()))
@@ -1735,130 +1651,102 @@ with gr.Blocks(title=TITLE) as demo:
1735
 
1736
  def import_fn(txt):
1737
  state, branches_d, active, r, bel = import_run(txt)
1738
- # ensure at least a snapshot
1739
  branches_d.setdefault(active, [])
1740
  if not branches_d[active]:
1741
  branches_d[active].append(snapshot_of(state, active))
1742
  out = refresh(state, branches_d, active, bel, r)
1743
  return out + (state, branches_d, active, bel, r)
1744
 
1745
- # ----- Wire buttons (no fn_kwargs) -----
 
 
 
 
 
 
1746
  btn_L.click(lambda s,b,a,bel,r: do_manual(s,b,a,bel,r,"L"),
1747
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1748
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1749
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1750
- queue=True)
1751
 
1752
  btn_F.click(lambda s,b,a,bel,r: do_manual(s,b,a,bel,r,"F"),
1753
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1754
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1755
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1756
- queue=True)
1757
 
1758
  btn_R.click(lambda s,b,a,bel,r: do_manual(s,b,a,bel,r,"R"),
1759
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1760
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1761
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1762
- queue=True)
1763
 
1764
  btn_I.click(lambda s,b,a,bel,r: do_manual(s,b,a,bel,r,"I"),
1765
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1766
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1767
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1768
- queue=True)
1769
 
1770
  btn_tick.click(do_tick,
1771
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1772
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1773
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1774
- queue=True)
1775
 
1776
  btn_run.click(do_run,
1777
  inputs=[st, branches, active_branch, beliefs, rewind_idx, run_steps],
1778
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1779
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1780
- queue=True)
1781
 
1782
  btn_toggle_control.click(toggle_control,
1783
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1784
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1785
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1786
- queue=True)
1787
 
1788
  btn_toggle_pov.click(toggle_pov,
1789
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1790
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1791
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1792
- queue=True)
1793
 
1794
  overlay.change(set_overlay,
1795
  inputs=[st, branches, active_branch, beliefs, rewind_idx, overlay],
1796
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1797
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1798
- queue=True)
1799
 
1800
  env_pick.change(change_env,
1801
  inputs=[st, branches, active_branch, beliefs, rewind_idx, env_pick],
1802
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1803
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1804
- queue=True)
1805
 
1806
  truth.select(click_truth,
1807
  inputs=[tile_pick, st, branches, active_branch, beliefs, rewind_idx],
1808
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1809
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1810
- queue=True)
1811
 
1812
  btn_jump.click(jump,
1813
  inputs=[st, branches, active_branch, beliefs, rewind_idx, rewind],
1814
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1815
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1816
- queue=True)
1817
 
1818
  btn_fork.click(fork_branch,
1819
  inputs=[st, branches, active_branch, beliefs, rewind_idx, new_branch_name],
1820
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1821
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1822
- queue=True)
1823
 
1824
  btn_set_branch.click(set_active_branch,
1825
  inputs=[st, branches, active_branch, beliefs, rewind_idx, branch_pick],
1826
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1827
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1828
- queue=True)
1829
 
1830
  btn_reset.click(reset_ep,
1831
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1832
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1833
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1834
- queue=True)
1835
 
1836
  btn_reset_all.click(reset_all,
1837
  inputs=[st, branches, active_branch, beliefs, rewind_idx, env_pick],
1838
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1839
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1840
- queue=True)
1841
 
1842
  btn_train.click(do_train,
1843
  inputs=[st, branches, active_branch, beliefs, rewind_idx,
1844
  use_q, alpha, gamma, eps, eps_decay, eps_min,
1845
  episodes, max_steps],
1846
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1847
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1848
- queue=True)
1849
 
1850
  btn_export.click(export_fn, inputs=[st, branches, active_branch, rewind_idx], outputs=[export_box], queue=True)
1851
 
1852
  btn_import.click(import_fn,
1853
  inputs=[import_box],
1854
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1855
- rewind, rewind_idx, branch_pick, branch_pick, st, branches, active_branch, beliefs, rewind_idx],
1856
- queue=True)
1857
 
1858
  demo.load(refresh,
1859
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1860
- outputs=[pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1861
- rewind, rewind_idx, branch_pick, branch_pick],
 
 
1862
  queue=True)
1863
 
1864
- demo.queue().launch()
 
 
6
 
7
  import numpy as np
8
  from PIL import Image, ImageDraw
9
+
10
  import matplotlib.pyplot as plt
11
+ from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
12
+
13
  import gradio as gr
14
 
15
  # ============================================================
16
  # ZEN AgentLab — Agent POV + Multi-Agent Mini-Sim Arena
17
+ #
18
+ # Fix included:
19
+ # - Matplotlib rendering uses FigureCanvas.buffer_rgba() (HF-safe)
20
  #
21
  # Features:
22
  # - Deterministic gridworld + first-person raycast POV
23
  # - Multiple environments (Chase / CoopVault / MiniCiv)
24
+ # - Click-to-edit tiles + pickups + hazards + simple combat tags
25
  # - Full step trace: obs -> action -> reward -> (optional) Q-update
26
+ # - Branching timelines (rewind + fork)
27
  # - Batch training (tabular Q-learning) + metrics dashboard
28
+ # - Export/import full runs + SHA256 proof hash
29
  #
30
+ # HF Spaces compatible: no timers, no fn_kwargs
31
  # ============================================================
32
 
33
  # -----------------------------
 
79
  BASE: "Base",
80
  }
81
 
 
82
  AGENT_COLORS = {
83
  "Predator": (255, 120, 90),
84
  "Prey": (120, 255, 160),
85
  "Scout": (120, 190, 255),
 
86
  "Alpha": (255, 205, 120),
87
  "Bravo": (160, 210, 255),
88
  "Guardian": (255, 120, 220),
 
89
  "BuilderA": (140, 255, 200),
90
  "BuilderB": (160, 200, 255),
91
  "Raider": (255, 160, 120),
 
98
  WALL_SIDE = np.array([150, 150, 170], dtype=np.uint8)
99
  DOOR_COL = np.array([140, 210, 255], dtype=np.uint8)
100
 
101
+ # Small action space for tabular stability
102
+ ACTIONS = ["L", "F", "R", "I"] # interact
103
 
104
  # -----------------------------
105
+ # Deterministic RNG
106
  # -----------------------------
107
  def rng_for(seed: int, step: int, stream: int = 0) -> np.random.Generator:
108
  mix = (seed * 1_000_003) ^ (step * 9_999_937) ^ (stream * 97_531)
109
  return np.random.default_rng(mix & 0xFFFFFFFFFFFFFFFF)
110
 
 
111
  # -----------------------------
112
  # Data structures
113
  # -----------------------------
 
136
  epsilon_min: float = 0.02
137
  epsilon_decay: float = 0.995
138
 
139
+ # generic shaping
140
  step_penalty: float = -0.01
141
  explore_reward: float = 0.015
142
  damage_penalty: float = -0.20
143
  heal_reward: float = 0.10
144
 
145
+ # chase
146
  chase_close_coeff: float = 0.03
147
  chase_catch_reward: float = 3.0
148
  chase_escaped_reward: float = 0.2
149
  chase_caught_penalty: float = -3.0
150
  food_reward: float = 0.6
151
 
152
+ # vault
153
  artifact_pick_reward: float = 1.2
154
  exit_win_reward: float = 3.0
155
  guardian_tag_reward: float = 2.0
 
157
  switch_reward: float = 0.8
158
  key_reward: float = 0.4
159
 
160
+ # civ
161
  resource_pick_reward: float = 0.15
162
  deposit_reward: float = 0.4
163
  base_progress_win_reward: float = 3.5
 
182
  returns: Dict[str, float] = None
183
  action_counts: Dict[str, Dict[str, int]] = None
184
  tiles_discovered: Dict[str, int] = None
 
185
 
186
  def __post_init__(self):
187
  if self.returns is None:
 
190
  self.action_counts = {}
191
  if self.tiles_discovered is None:
192
  self.tiles_discovered = {}
 
 
193
 
194
  @dataclass
195
  class WorldState:
 
204
  overlay: bool
205
 
206
  done: bool
207
+ outcome: str # A_win | B_win | draw | ongoing
208
 
209
  # env state
210
  door_opened_global: bool = False
 
217
 
218
  # learning
219
  cfg: TrainConfig = None
220
+ q_tables: Dict[str, Dict[str, List[float]]] = None
221
  gmetrics: GlobalMetrics = None
222
  emetrics: EpisodeMetrics = None
223
 
 
299
  return abs(diff) <= (fov_deg / 2)
300
 
301
  def visible(state: WorldState, observer: Agent, target: Agent) -> bool:
 
302
  if not within_fov(observer, target.x, target.y, FOV_DEG):
303
  return False
 
304
  return bresenham_los(state.grid, observer.x, observer.y, target.x, target.y)
305
 
306
  def hash_sha256(txt: str) -> str:
307
  return hashlib.sha256(txt.encode("utf-8")).hexdigest()
308
 
309
  # -----------------------------
310
+ # Beliefs / fog-of-war
311
  # -----------------------------
312
  def init_beliefs(agent_names: List[str]) -> Dict[str, np.ndarray]:
313
+ return {nm: (-1 * np.ones((GRID_H, GRID_W), dtype=np.int16)) for nm in agent_names}
314
+
315
+ def update_belief_for_agent(state: WorldState, belief: np.ndarray, agent: Agent) -> int:
316
+ """Returns number of newly discovered tiles this update."""
317
+ before_unknown = int(np.sum(belief == -1))
318
 
 
319
  belief[agent.y, agent.x] = state.grid[agent.y][agent.x]
320
  base = math.radians(ORI_DEG[agent.ori])
321
  half = math.radians(FOV_DEG / 2)
 
340
  if tile == DOOR and not state.door_opened_global:
341
  break
342
 
343
+ after_unknown = int(np.sum(belief == -1))
344
+ return max(0, before_unknown - after_unknown)
345
+
346
  # -----------------------------
347
  # Rendering
348
  # -----------------------------
 
409
 
410
  # billboards for visible agents
411
  for nm, other in state.agents.items():
412
+ if nm == observer.name or other.hp <= 0:
413
  continue
414
  if visible(state, observer, other):
415
  dx = other.x - observer.x
 
507
  return im
508
 
509
  # -----------------------------
510
+ # Environments
511
  # -----------------------------
512
  def grid_with_border() -> List[List[int]]:
513
  g = [[EMPTY for _ in range(GRID_W)] for _ in range(GRID_H)]
 
521
 
522
  def env_chase(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
523
  g = grid_with_border()
 
524
  for x in range(4, 17):
525
  g[7][x] = WALL
526
  g[7][10] = DOOR
527
 
 
528
  g[3][4] = FOOD
529
  g[11][15] = FOOD
530
  g[4][14] = NOISE
 
541
 
542
  def env_vault(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
543
  g = grid_with_border()
 
544
  for x in range(3, 18):
545
  g[5][x] = WALL
546
  for x in range(3, 18):
 
548
  g[5][10] = DOOR
549
  g[9][12] = DOOR
550
 
 
551
  g[2][2] = KEY
552
  g[12][18] = EXIT
553
  g[12][2] = ARTIFACT
 
567
 
568
  def env_civ(seed: int) -> Tuple[List[List[int]], Dict[str, Agent]]:
569
  g = grid_with_border()
 
 
570
  for y in range(3, 12):
571
  g[y][9] = WALL
572
  g[7][9] = DOOR
573
 
 
574
  g[2][3] = WOOD
575
  g[3][3] = WOOD
576
  g[4][3] = WOOD
 
580
  g[6][4] = FOOD
581
  g[8][15] = FOOD
582
 
 
583
  g[13][10] = BASE
584
  g[4][15] = HAZARD
585
  g[10][4] = HAZARD
 
587
  g[13][2] = TELE
588
  g[2][2] = KEY
589
  g[12][6] = SWITCH
 
590
 
591
  agents = {
592
  "BuilderA": Agent("BuilderA", 3, 12, 0, hp=10, energy=100, team="A", brain="q"),
 
595
  }
596
  return g, agents
597
 
598
+ ENV_BUILDERS = {"chase": env_chase, "vault": env_vault, "civ": env_civ}
 
 
 
 
599
 
600
  # -----------------------------
601
+ # Observation / Q-learning
602
  # -----------------------------
603
  def local_tile_ahead(state: WorldState, a: Agent) -> int:
604
  dx, dy = DIRS[a.ori]
 
609
 
610
  def nearest_enemy_vec(state: WorldState, a: Agent) -> Tuple[int, int, int]:
611
  best = None
612
+ for _, other in state.agents.items():
613
  if other.hp <= 0:
614
  continue
615
  if other.team == a.team:
 
624
 
625
  def obs_key(state: WorldState, who: str) -> str:
626
  a = state.agents[who]
 
 
627
  d, dx, dy = nearest_enemy_vec(state, a)
628
  ahead = local_tile_ahead(state, a)
629
  keys = a.inventory.get("key", 0)
630
  art = a.inventory.get("artifact", 0)
631
  wood = a.inventory.get("wood", 0)
632
  ore = a.inventory.get("ore", 0)
 
633
  inv_bucket = f"k{min(keys,2)}a{min(art,1)}w{min(wood,3)}o{min(ore,3)}"
634
  door = 1 if state.door_opened_global else 0
635
  return f"{state.env_key}|{who}|{a.x},{a.y},{a.ori}|e{d}:{dx},{dy}|t{ahead}|hp{a.hp}|{inv_bucket}|D{door}|bp{state.base_progress}"
 
655
  return old, target, new
656
 
657
  # -----------------------------
658
+ # Baseline heuristics
659
  # -----------------------------
660
  def face_towards(a: Agent, tx: int, ty: int) -> str:
661
  dx = tx - a.x
 
673
  a = state.agents[who]
674
  r = rng_for(state.seed, state.step, stream=900 + hash(who) % 1000)
675
 
676
+ # Prioritize interacting on valuable tiles
677
+ t_here = state.grid[a.y][a.x]
678
+ if t_here in (FOOD, KEY, ARTIFACT, WOOD, ORE, MEDKIT, SWITCH, BASE, EXIT):
 
679
  return "I"
680
 
681
+ # Find nearest enemy
 
 
682
  best = None
683
+ best_d = 999
684
+ for _, other in state.agents.items():
685
  if other.hp <= 0 or other.team == a.team:
686
  continue
687
  d = manhattan_xy(a.x, a.y, other.x, other.y)
688
  if d < best_d:
689
  best_d = d
 
690
  best = other
691
 
692
  if best is not None and best_d <= 6 and visible(state, a, best):
693
+ # attackers chase, defenders try to flee
694
  if a.team == "B":
695
  return face_towards(a, best.x, best.y)
696
+
697
  dx = best.x - a.x
698
  dy = best.y - a.y
699
  ang = (math.degrees(math.atan2(dy, dx)) % 360)
 
706
  return "R"
707
  return "F"
708
 
 
709
  return r.choice(["F", "F", "L", "R", "I"])
710
 
711
  def random_action(state: WorldState, who: str) -> str:
 
745
  def try_interact(state: WorldState, a: Agent) -> str:
746
  t = state.grid[a.y][a.x]
747
 
 
748
  if t == SWITCH:
749
  state.door_opened_global = True
750
  state.grid[a.y][a.x] = EMPTY
 
782
  return "used: medkit"
783
 
784
  if t == BASE:
 
785
  w = a.inventory.get("wood", 0)
786
  o = a.inventory.get("ore", 0)
787
  dep = min(w, 2) + min(o, 2)
 
814
  return "noop"
815
 
816
  # -----------------------------
817
+ # Hazards / collisions / done
818
  # -----------------------------
819
  def resolve_hazards(state: WorldState, a: Agent) -> Tuple[bool, str]:
 
820
  if a.hp <= 0:
821
  return (False, "")
822
+ if state.grid[a.y][a.x] == HAZARD:
 
823
  a.hp -= 1
824
  return (True, "hazard:-hp")
825
  return (False, "")
826
 
827
  def resolve_tags(state: WorldState) -> List[str]:
 
828
  msgs = []
829
+ occupied: Dict[Tuple[int, int], List[str]] = {}
830
  for nm, a in state.agents.items():
831
  if a.hp <= 0:
832
  continue
 
837
  continue
838
  teams = set(state.agents[n].team for n in names)
839
  if len(teams) >= 2:
 
840
  for n in names:
841
  state.agents[n].hp -= 1
842
  msgs.append(f"t={state.step}: collision/tag at ({x},{y}) {names} (-hp all)")
843
  return msgs
844
 
845
  def check_done(state: WorldState) -> None:
 
846
  if state.env_key == "chase":
847
  pred = state.agents["Predator"]
848
  prey = state.agents["Prey"]
 
850
  state.done = True
851
  state.outcome = "draw"
852
  return
853
+ if pred.hp > 0 and prey.hp > 0 and pred.x == prey.x and pred.y == prey.y:
854
  state.done = True
855
+ state.outcome = "A_win"
856
  state.event_log.append(f"t={state.step}: CAUGHT (Predator wins).")
857
  return
 
858
  if state.step >= 300 and prey.hp > 0:
859
  state.done = True
860
  state.outcome = "B_win"
 
862
  return
863
 
864
  if state.env_key == "vault":
 
865
  for nm in ["Alpha", "Bravo"]:
866
  a = state.agents[nm]
867
  if a.hp > 0 and a.inventory.get("artifact", 0) > 0 and state.grid[a.y][a.x] == EXIT:
 
869
  state.outcome = "A_win"
870
  state.event_log.append(f"t={state.step}: VAULT CLEARED (Team A wins).")
871
  return
 
872
  alive_A = any(state.agents[n].hp > 0 for n in ["Alpha", "Bravo"])
873
  if not alive_A:
874
  state.done = True
 
877
  return
878
 
879
  if state.env_key == "civ":
 
880
  if state.base_progress >= state.base_target:
881
  state.done = True
882
  state.outcome = "A_win"
883
  state.event_log.append(f"t={state.step}: BASE COMPLETE (Builders win).")
884
  return
 
885
  alive_A = any(state.agents[n].hp > 0 for n in ["BuilderA", "BuilderB"])
886
  if not alive_A:
887
  state.done = True
888
  state.outcome = "B_win"
889
  state.event_log.append(f"t={state.step}: BUILDERS ELIMINATED (Raider wins).")
890
  return
 
891
  if state.step >= 350:
892
  state.done = True
893
  state.outcome = "draw"
 
897
  # -----------------------------
898
  # Rewards
899
  # -----------------------------
900
+ def reward_for(prev: WorldState, now: WorldState, who: str, outcome_msg: str, took_damage: bool) -> float:
901
+ cfg = now.cfg
 
 
 
 
902
  r = cfg.step_penalty
 
 
 
903
  if outcome_msg.startswith("moved"):
904
  r += cfg.explore_reward
 
905
  if took_damage:
906
  r += cfg.damage_penalty
 
 
907
  if outcome_msg.startswith("used: medkit"):
908
  r += cfg.heal_reward
909
 
910
+ if now.env_key == "chase":
911
+ pred = now.agents["Predator"]
912
+ prey = now.agents["Prey"]
 
913
  if who == "Predator":
914
+ d0 = manhattan_xy(prev.agents["Predator"].x, prev.agents["Predator"].y,
915
+ prev.agents["Prey"].x, prev.agents["Prey"].y)
916
  d1 = manhattan_xy(pred.x, pred.y, prey.x, prey.y)
917
  r += cfg.chase_close_coeff * float(d0 - d1)
918
+ if now.done and now.outcome == "A_win":
919
  r += cfg.chase_catch_reward
920
  if who == "Prey":
921
  if outcome_msg.startswith("ate: food"):
922
  r += cfg.food_reward
923
+ if now.done and now.outcome == "B_win":
924
  r += cfg.chase_escaped_reward
925
+ if now.done and now.outcome == "A_win":
926
  r += cfg.chase_caught_penalty
927
 
928
+ if now.env_key == "vault":
929
  if outcome_msg.startswith("picked: artifact"):
930
  r += cfg.artifact_pick_reward
931
  if outcome_msg.startswith("picked: key"):
932
  r += cfg.key_reward
933
  if outcome_msg.startswith("switch:"):
934
  r += cfg.switch_reward
935
+ if now.done:
936
+ if now.outcome == "A_win" and now.agents[who].team == "A":
937
  r += cfg.exit_win_reward
938
+ if now.outcome == "B_win" and now.agents[who].team == "B":
939
  r += cfg.guardian_tag_reward
940
+ if now.outcome == "B_win" and now.agents[who].team == "A":
941
  r += cfg.tagged_penalty
942
 
943
+ if now.env_key == "civ":
944
  if outcome_msg.startswith("picked: wood") or outcome_msg.startswith("picked: ore"):
945
  r += cfg.resource_pick_reward
946
  if outcome_msg.startswith("deposited:"):
947
  r += cfg.deposit_reward
948
+ if now.done:
949
+ if now.outcome == "A_win" and now.agents[who].team == "A":
950
  r += cfg.base_progress_win_reward
951
+ if now.outcome == "B_win" and now.agents[who].team == "B":
952
  r += cfg.raider_elim_reward
953
+ if now.outcome == "B_win" and now.agents[who].team == "A":
954
  r += cfg.builder_elim_penalty
955
 
956
  return float(r)
957
 
958
  # -----------------------------
959
+ # Policy selection
960
  # -----------------------------
961
  def choose_action(state: WorldState, who: str, stream: int) -> Tuple[str, str, Optional[Tuple[str, int]]]:
 
 
 
 
962
  a = state.agents[who]
963
  cfg = state.cfg
964
  r = rng_for(state.seed, state.step, stream=stream)
965
 
 
966
  if a.brain == "random":
967
  act = random_action(state, who)
968
  return act, "random", None
 
970
  act = heuristic_action(state, who)
971
  return act, "heuristic", None
972
 
 
973
  if cfg.use_q:
974
  key = obs_key(state, who)
975
  qtab = state.q_tables.setdefault(who, {})
 
981
  return act, "heuristic(fallback)", None
982
 
983
  # -----------------------------
984
+ # Init / reset
985
  # -----------------------------
986
  def init_state(seed: int, env_key: str) -> WorldState:
987
  g, agents = ENV_BUILDERS[env_key](seed)
 
988
  st = WorldState(
989
  seed=seed,
990
  step=0,
 
1007
  if seed is None:
1008
  seed = state.seed
1009
  fresh = init_state(int(seed), state.env_key)
 
1010
  fresh.cfg = state.cfg
1011
  fresh.q_tables = state.q_tables
1012
  fresh.gmetrics = state.gmetrics
 
1057
  return state
1058
 
1059
  # -----------------------------
1060
+ # Metrics dashboard (HF-safe)
1061
  # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
1062
  def metrics_dashboard_image(state: WorldState) -> Image.Image:
1063
  gm = state.gmetrics
1064
+
1065
  fig = plt.figure(figsize=(7.0, 2.2), dpi=120)
1066
  ax = fig.add_subplot(111)
1067
+
1068
+ x1 = max(1, gm.episodes)
1069
+ ax.plot([0, x1], [gm.rolling_winrate_A, gm.rolling_winrate_A])
1070
  ax.set_title("Global Metrics Snapshot")
1071
+ ax.set_xlabel("Episodes")
1072
  ax.set_ylabel("Rolling winrate Team A")
1073
+ ax.set_ylim(-0.05, 1.05)
1074
  ax.grid(True)
1075
 
 
1076
  txt = (
1077
  f"env={state.env_key} | eps={gm.epsilon:.3f} | episodes={gm.episodes}\n"
1078
+ f"A_wins={gm.wins_teamA} B_wins={gm.wins_teamB} draws={gm.draws} | avg_steps~{gm.avg_steps:.1f}\n"
 
1079
  f"last_outcome={gm.last_outcome} last_steps={gm.last_steps}"
1080
  )
1081
  ax.text(0.01, 0.05, txt, transform=ax.transAxes, fontsize=8, va="bottom")
1082
 
1083
  fig.tight_layout()
1084
+
1085
+ canvas = FigureCanvas(fig)
1086
+ canvas.draw()
1087
+ buf = np.asarray(canvas.buffer_rgba()) # (H,W,4)
1088
+ img = Image.fromarray(buf, mode="RGBA").convert("RGB")
1089
  plt.close(fig)
1090
+ return img
1091
+
1092
+ def action_entropy(counts: Dict[str, int]) -> float:
1093
+ total = sum(counts.values())
1094
+ if total <= 0:
1095
+ return 0.0
1096
+ p = np.array([c / total for c in counts.values()], dtype=np.float64)
1097
+ p = np.clip(p, 1e-12, 1.0)
1098
+ return float(-np.sum(p * np.log2(p)))
1099
 
1100
  def agent_scoreboard(state: WorldState) -> str:
1101
  rows = []
1102
  header = ["agent", "team", "hp", "return", "steps", "entropy", "tiles_disc", "q_states", "inventory"]
1103
  rows.append(header)
 
1104
  steps = state.emetrics.steps
1105
+
1106
  for nm, a in state.agents.items():
1107
  ret = state.emetrics.returns.get(nm, 0.0)
1108
  counts = state.emetrics.action_counts.get(nm, {})
 
1112
  inv = json.dumps(a.inventory, sort_keys=True)
1113
  rows.append([nm, a.team, a.hp, f"{ret:.2f}", steps, f"{ent:.2f}", td, qs, inv])
1114
 
 
1115
  col_w = [max(len(str(r[i])) for r in rows) for i in range(len(header))]
1116
  lines = []
1117
  for ridx, r in enumerate(rows):
 
1122
  return "\n".join(lines)
1123
 
1124
  # -----------------------------
1125
+ # Tick / training
1126
  # -----------------------------
1127
  def clone_shallow(state: WorldState) -> WorldState:
1128
+ return WorldState(
 
1129
  seed=state.seed,
1130
  step=state.step,
1131
  env_key=state.env_key,
 
1146
  gmetrics=state.gmetrics,
1147
  emetrics=state.emetrics,
1148
  )
1149
+
1150
+ def update_action_counts(state: WorldState, who: str, act: str):
1151
+ state.emetrics.action_counts.setdefault(who, {})
1152
+ state.emetrics.action_counts[who][act] = state.emetrics.action_counts[who].get(act, 0) + 1
1153
 
1154
  def tick(state: WorldState, beliefs: Dict[str, np.ndarray], manual_action: Optional[str] = None) -> None:
1155
  if state.done:
1156
  return
1157
 
1158
  prev = clone_shallow(state)
 
 
1159
  chosen: Dict[str, str] = {}
1160
  reasons: Dict[str, str] = {}
1161
  qinfo: Dict[str, Optional[Tuple[str, int]]] = {}
 
1165
  reasons[state.controlled] = "manual"
1166
  qinfo[state.controlled] = None
1167
 
1168
+ order = list(state.agents.keys())
1169
+
1170
+ for who in order:
1171
  if who in chosen:
1172
  continue
1173
  act, reason, qi = choose_action(state, who, stream=200 + (hash(who) % 1000))
 
1175
  reasons[who] = reason
1176
  qinfo[who] = qi
1177
 
 
 
1178
  outcomes: Dict[str, str] = {}
1179
  took_damage: Dict[str, bool] = {nm: False for nm in order}
 
1180
 
1181
  for who in order:
 
1182
  outcomes[who] = apply_action(state, who, chosen[who])
 
 
1183
 
1184
  dmg, msg = resolve_hazards(state, state.agents[who])
1185
  took_damage[who] = dmg
1186
  if msg:
1187
  state.event_log.append(f"t={state.step}: {who} {msg}")
1188
 
 
1189
  update_action_counts(state, who, chosen[who])
1190
 
1191
+ for m in resolve_tags(state):
 
 
1192
  state.event_log.append(m)
1193
 
1194
+ # belief updates + discovered tiles
1195
  for nm, a in state.agents.items():
1196
  if a.hp <= 0:
1197
  continue
1198
+ disc = update_belief_for_agent(state, beliefs[nm], a)
1199
+ state.emetrics.tiles_discovered[nm] = state.emetrics.tiles_discovered.get(nm, 0) + disc
 
 
 
1200
 
 
1201
  check_done(state)
1202
 
1203
+ # rewards + Q
1204
  q_lines = []
1205
  for who in order:
1206
  if who not in state.emetrics.returns:
1207
  state.emetrics.returns[who] = 0.0
1208
+
1209
+ r = reward_for(prev, state, who, outcomes[who], took_damage[who])
1210
  state.emetrics.returns[who] += r
1211
 
1212
  if qinfo.get(who) is not None:
 
1216
  old, tgt, new = q_update(qtab, key, a_idx, r, next_key, state.cfg.alpha, state.cfg.gamma)
1217
  q_lines.append(f"{who}: old={old:.3f} tgt={tgt:.3f} new={new:.3f} (a={ACTIONS[a_idx]})")
1218
 
 
1219
  trace = f"t={state.step} env={state.env_key} done={state.done} outcome={state.outcome}"
1220
  for who in order:
1221
  a = state.agents[who]
 
1230
  state.step += 1
1231
  state.emetrics.steps = state.step
1232
 
 
 
 
1233
  def run_episode(state: WorldState, beliefs: Dict[str, np.ndarray], max_steps: int) -> Tuple[str, int]:
1234
  while state.step < max_steps and not state.done:
1235
  tick(state, beliefs, manual_action=None)
 
1252
  gm.rolling_winrate_A = 0.90 * gm.rolling_winrate_A + 0.10 * 0.5
1253
 
1254
  gm.avg_steps = (0.90 * gm.avg_steps + 0.10 * steps) if gm.avg_steps > 0 else float(steps)
 
 
1255
  gm.epsilon = max(state.cfg.epsilon_min, gm.epsilon * state.cfg.epsilon_decay)
1256
 
1257
def train(state: WorldState, episodes: int, max_steps: int) -> WorldState:
    """Run a batch of tabular Q-learning episodes and return the refreshed state.

    Every episode receives a deterministic seed derived from the base world
    seed and the global episode counter, so repeated training runs with the
    same configuration are reproducible.
    """
    for ep_i in range(episodes):
        # Fold the base seed and episode index into a 32-bit per-episode seed.
        ep_seed = (state.seed * 1_000_003 + (state.gmetrics.episodes + ep_i) * 97_531) & 0xFFFFFFFF
        state = reset_episode_keep_learning(state, seed=int(ep_seed))
        beliefs = init_beliefs(list(state.agents.keys()))
        outcome, steps = run_episode(state, beliefs, max_steps=max_steps)
        update_global_metrics_after_episode(state, outcome, steps)

    # One summary entry for the whole batch, then return to the base seed.
    state.event_log.append(
        f"Training: +{episodes} eps | eps={state.gmetrics.epsilon:.3f} | "
        f"A={state.gmetrics.wins_teamA} B={state.gmetrics.wins_teamB} D={state.gmetrics.draws}"
    )
    state = reset_episode_keep_learning(state, seed=state.seed)
    return state
1271
 
 
1284
  "q_tables": state.q_tables,
1285
  "branches": {b: [asdict(s) for s in snaps] for b, snaps in branches.items()},
1286
  "active_branch": active_branch,
1287
+ "rewind_idx": int(rewind_idx),
1288
  "grid": state.grid,
1289
  "door_opened_global": state.door_opened_global,
1290
  "base_progress": state.base_progress,
 
1295
  return txt + "\n\n" + json.dumps({"proof_sha256": proof}, indent=2)
1296
 
1297
  def import_run(txt: str) -> Tuple[WorldState, Dict[str, List[Snapshot]], str, int, Dict[str, np.ndarray]]:
 
1298
  parts = txt.strip().split("\n\n")
1299
  data = json.loads(parts[0])
1300
 
 
1319
  active = data.get("active_branch", "main")
1320
  r_idx = int(data.get("rewind_idx", 0))
1321
 
 
1322
  if active in branches and branches[active]:
1323
  st = restore_into(st, branches[active][-1])
1324
  st.event_log.append("Imported run (restored last snapshot).")
 
1332
  # UI helpers
1333
  # -----------------------------
1334
  def build_views(state: WorldState, beliefs: Dict[str, np.ndarray]) -> Tuple[np.ndarray, Image.Image, Image.Image, Image.Image, Image.Image, str, str, str, str]:
 
1335
  for nm, a in state.agents.items():
1336
  if a.hp > 0:
1337
  update_belief_for_agent(state, beliefs[nm], a)
1338
 
1339
  pov = raycast_view(state, state.agents[state.pov])
1340
  truth_np = np.array(state.grid, dtype=np.int16)
1341
+ truth_img = render_topdown(truth_np, state.agents, f"Truth Map — env={state.env_key} t={state.step} seed={state.seed}", True)
1342
 
1343
  ctrl = state.controlled
 
1344
  others = [k for k in state.agents.keys() if k != ctrl]
1345
  other = others[0] if others else ctrl
1346
+ b_ctrl = render_topdown(beliefs[ctrl], state.agents, f"{ctrl} Belief", True)
1347
+ b_other = render_topdown(beliefs[other], state.agents, f"{other} Belief", True)
1348
 
1349
  dash = metrics_dashboard_image(state)
1350
 
 
1368
  gy = int(y_px // TILE)
1369
  if not in_bounds(gx, gy):
1370
  return state
 
1371
  if gx == 0 or gy == 0 or gx == GRID_W - 1 or gy == GRID_H - 1:
1372
  return state
1373
  state.grid[gy][gx] = selected_tile
 
1375
  return state
1376
 
1377
  # -----------------------------
1378
+ # Gradio app
1379
  # -----------------------------
1380
  TITLE = "ZEN AgentLab — Agent POV + Multi-Agent Mini-Sim Arena"
1381
 
1382
  with gr.Blocks(title=TITLE) as demo:
1383
  gr.Markdown(
1384
  f"## {TITLE}\n"
1385
+ "Multi-environment agent sandbox with POV, belief maps, branching timelines, training, and metrics.\n"
1386
+ "**No timers** use Tick / Run / Train for deterministic experiments."
1387
  )
1388
 
1389
+ st0 = init_state(1337, "chase")
1390
+ st = gr.State(st0)
1391
+ branches = gr.State({"main": [snapshot_of(st0, "main")]})
1392
  active_branch = gr.State("main")
1393
  rewind_idx = gr.State(0)
1394
+ beliefs = gr.State(init_beliefs(list(st0.agents.keys())))
1395
 
1396
  with gr.Row():
1397
  pov_img = gr.Image(label="POV (Pseudo-3D)", type="numpy", width=VIEW_W, height=VIEW_H)
 
1409
 
1410
  with gr.Row():
1411
  events = gr.Textbox(label="Event Log", lines=10)
1412
+ trace = gr.Textbox(label="Step Trace", lines=10)
1413
 
1414
  with gr.Row():
1415
  with gr.Column(scale=2):
 
1446
  with gr.Column(scale=3):
1447
  gr.Markdown("### Training Controls (Tabular Q-learning)")
1448
  use_q = gr.Checkbox(True, label="Use Q-learning (agents with brain='q')")
1449
+ alpha = gr.Slider(0.01, 0.5, value=0.15, step=0.01, label="alpha")
1450
+ gamma = gr.Slider(0.5, 0.99, value=0.95, step=0.01, label="gamma")
1451
+ eps = gr.Slider(0.0, 0.5, value=0.10, step=0.01, label="epsilon")
1452
  eps_decay = gr.Slider(0.90, 0.999, value=0.995, step=0.001, label="epsilon decay")
1453
  eps_min = gr.Slider(0.0, 0.2, value=0.02, step=0.01, label="epsilon min")
1454
 
1455
  episodes = gr.Number(value=50, label="Train episodes", precision=0)
1456
+ max_steps = gr.Number(value=260, label="Max steps/episode", precision=0)
1457
  btn_train = gr.Button("Train")
1458
 
1459
  btn_reset = gr.Button("Reset Episode (keep learning)")
 
1477
  import_box = gr.Textbox(label="Import JSON", lines=8)
1478
  btn_import = gr.Button("Import")
1479
 
1480
+ # ---------- glue ----------
1481
  def refresh(state: WorldState, branches_d: Dict[str, List[Snapshot]], active: str, bel: Dict[str, np.ndarray], r: int):
1482
  snaps = branches_d.get(active, [])
1483
  r_max = max(0, len(snaps) - 1)
1484
  r = max(0, min(int(r), r_max))
1485
  pov, tr, ba, bb, dimg, stxt, etxt, ttxt, sb = build_views(state, bel)
 
1486
  branch_choices = sorted(list(branches_d.keys()))
1487
  return (
1488
+ pov, tr, ba, bb, dimg, stxt, sb, etxt, ttxt,
1489
+ gr.update(maximum=r_max, value=r), r,
 
 
1490
  gr.update(choices=branch_choices, value=active),
1491
  gr.update(choices=branch_choices, value=active),
1492
  )
 
1584
  branches_d[new_name].append(snapshot_of(state, new_name))
1585
  else:
1586
  idx = max(0, min(int(r), len(snaps) - 1))
 
1587
  branches_d[new_name] = [Snapshot(**asdict(s)) for s in snaps[:idx + 1]]
 
1588
  state = restore_into(state, branches_d[new_name][-1])
1589
  active = new_name
1590
  state.event_log.append(f"Forked branch -> {new_name}")
 
1598
  if br not in branches_d:
1599
  branches_d[br] = [snapshot_of(state, br)]
1600
  active = br
 
1601
  if branches_d[active]:
1602
  state = restore_into(state, branches_d[active][-1])
1603
  bel = init_beliefs(list(state.agents.keys()))
 
1604
  r = len(branches_d[active]) - 1
1605
+ out = refresh(state, branches_d, active, bel, r)
1606
  return out + (state, branches_d, active, bel, r)
1607
 
1608
  def change_env(state, branches_d, active, bel, r, env_key):
1609
  env_key = env_key or "chase"
 
1610
  state.env_key = env_key
1611
  state = reset_episode_keep_learning(state, seed=state.seed)
1612
  bel = init_beliefs(list(state.agents.keys()))
 
1651
 
1652
  def import_fn(txt):
1653
  state, branches_d, active, r, bel = import_run(txt)
 
1654
  branches_d.setdefault(active, [])
1655
  if not branches_d[active]:
1656
  branches_d[active].append(snapshot_of(state, active))
1657
  out = refresh(state, branches_d, active, bel, r)
1658
  return out + (state, branches_d, active, bel, r)
1659
 
1660
+ # ---- wire events (no fn_kwargs) ----
1661
+ common_outputs = [
1662
+ pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1663
+ rewind, rewind_idx, branch_pick, branch_pick,
1664
+ st, branches, active_branch, beliefs, rewind_idx
1665
+ ]
1666
+
1667
  btn_L.click(lambda s,b,a,bel,r: do_manual(s,b,a,bel,r,"L"),
1668
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1669
+ outputs=common_outputs, queue=True)
 
 
1670
 
1671
  btn_F.click(lambda s,b,a,bel,r: do_manual(s,b,a,bel,r,"F"),
1672
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1673
+ outputs=common_outputs, queue=True)
 
 
1674
 
1675
  btn_R.click(lambda s,b,a,bel,r: do_manual(s,b,a,bel,r,"R"),
1676
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1677
+ outputs=common_outputs, queue=True)
 
 
1678
 
1679
  btn_I.click(lambda s,b,a,bel,r: do_manual(s,b,a,bel,r,"I"),
1680
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1681
+ outputs=common_outputs, queue=True)
 
 
1682
 
1683
  btn_tick.click(do_tick,
1684
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1685
+ outputs=common_outputs, queue=True)
 
 
1686
 
1687
  btn_run.click(do_run,
1688
  inputs=[st, branches, active_branch, beliefs, rewind_idx, run_steps],
1689
+ outputs=common_outputs, queue=True)
 
 
1690
 
1691
  btn_toggle_control.click(toggle_control,
1692
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1693
+ outputs=common_outputs, queue=True)
 
 
1694
 
1695
  btn_toggle_pov.click(toggle_pov,
1696
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1697
+ outputs=common_outputs, queue=True)
 
 
1698
 
1699
  overlay.change(set_overlay,
1700
  inputs=[st, branches, active_branch, beliefs, rewind_idx, overlay],
1701
+ outputs=common_outputs, queue=True)
 
 
1702
 
1703
  env_pick.change(change_env,
1704
  inputs=[st, branches, active_branch, beliefs, rewind_idx, env_pick],
1705
+ outputs=common_outputs, queue=True)
 
 
1706
 
1707
  truth.select(click_truth,
1708
  inputs=[tile_pick, st, branches, active_branch, beliefs, rewind_idx],
1709
+ outputs=common_outputs, queue=True)
 
 
1710
 
1711
  btn_jump.click(jump,
1712
  inputs=[st, branches, active_branch, beliefs, rewind_idx, rewind],
1713
+ outputs=common_outputs, queue=True)
 
 
1714
 
1715
  btn_fork.click(fork_branch,
1716
  inputs=[st, branches, active_branch, beliefs, rewind_idx, new_branch_name],
1717
+ outputs=common_outputs, queue=True)
 
 
1718
 
1719
  btn_set_branch.click(set_active_branch,
1720
  inputs=[st, branches, active_branch, beliefs, rewind_idx, branch_pick],
1721
+ outputs=common_outputs, queue=True)
 
 
1722
 
1723
  btn_reset.click(reset_ep,
1724
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1725
+ outputs=common_outputs, queue=True)
 
 
1726
 
1727
  btn_reset_all.click(reset_all,
1728
  inputs=[st, branches, active_branch, beliefs, rewind_idx, env_pick],
1729
+ outputs=common_outputs, queue=True)
 
 
1730
 
1731
  btn_train.click(do_train,
1732
  inputs=[st, branches, active_branch, beliefs, rewind_idx,
1733
  use_q, alpha, gamma, eps, eps_decay, eps_min,
1734
  episodes, max_steps],
1735
+ outputs=common_outputs, queue=True)
 
 
1736
 
1737
  btn_export.click(export_fn, inputs=[st, branches, active_branch, rewind_idx], outputs=[export_box], queue=True)
1738
 
1739
  btn_import.click(import_fn,
1740
  inputs=[import_box],
1741
+ outputs=common_outputs, queue=True)
 
 
1742
 
1743
  demo.load(refresh,
1744
  inputs=[st, branches, active_branch, beliefs, rewind_idx],
1745
+ outputs=[
1746
+ pov_img, truth, belief_a, belief_b, dash, status, scoreboard, events, trace,
1747
+ rewind, rewind_idx, branch_pick, branch_pick
1748
+ ],
1749
  queue=True)
1750
 
1751
+ # HF sometimes enables SSR by default; disable for maximum compatibility
1752
+ demo.queue().launch(ssr_mode=False)