import json
import math
from dataclasses import dataclass, asdict
from typing import Dict, List, Tuple, Optional

import numpy as np
from PIL import Image, ImageDraw

import gradio as gr

# ============================================================
# ChronoSandbox++ — Instrumented Training Arena
# - Deterministic gridworld + first-person raycast view
# - Click-to-edit environment (tiles)
# - Full step trace: obs -> action -> reward -> q-update rationale
# - Optional Q-learning (tabular) for Predator + Prey
# - Batch training: run episodes fast, track metrics
# - Export/import: environment, history, Q-tables, metrics
#
# Compatibility: avoids fn_kwargs + avoids gr.Timer
# ============================================================

# -----------------------------
# Config
# -----------------------------
GRID_W, GRID_H = 21, 15
TILE = 22

VIEW_W, VIEW_H = 640, 360
RAY_W = 320
FOV_DEG = 78
MAX_DEPTH = 20

DIRS = [(1, 0), (0, 1), (-1, 0), (0, -1)]
ORI_DEG = [0, 90, 180, 270]

EMPTY = 0
WALL = 1
FOOD = 2
NOISE = 3
DOOR = 4
TELE = 5

TILE_NAMES = {
    EMPTY: "Empty",
    WALL: "Wall",
    FOOD: "Food",
    NOISE: "Noise",
    DOOR: "Door",
    TELE: "Teleporter",
}

AGENT_COLORS = {
    "Predator": (255, 120, 90),
    "Prey": (120, 255, 160),
    "Scout": (120, 190, 255),
}

SKY = np.array([14, 16, 26], dtype=np.uint8)
FLOOR_NEAR = np.array([24, 26, 40], dtype=np.uint8)
FLOOR_FAR = np.array([10, 11, 18], dtype=np.uint8)
WALL_BASE = np.array([210, 210, 225], dtype=np.uint8)
WALL_SIDE = np.array([150, 150, 170], dtype=np.uint8)
DOOR_COL = np.array([180, 210, 255], dtype=np.uint8)

ACTIONS = ["L", "F", "R"]  # keep small for tabular learning stability

# -----------------------------
# Deterministic RNG streams
# -----------------------------
def rng_for(seed: int, step: int, stream: int = 0) -> np.random.Generator:
    mix = (seed * 1_000_003) ^ (step * 9_999_937) ^ (stream * 97_531)
    return np.random.default_rng(mix & 0xFFFFFFFFFFFFFFFF)

# -----------------------------
# Data structures
# -----------------------------
@dataclass
class Agent:
    name: str
    x: int
    y: int
    ori: int
    energy: int = 100

@dataclass
class TrainConfig:
    use_q_pred: bool = True
    use_q_prey: bool = True
    alpha: float = 0.15
    gamma: float = 0.95
    epsilon: float = 0.10
    epsilon_min: float = 0.02
    epsilon_decay: float = 0.995

    # reward shaping
    pred_step_penalty: float = -0.02
    pred_dist_coeff: float = 0.03
    pred_catch_reward: float = 3.0

    prey_step_penalty: float = -0.02
    prey_food_reward: float = 0.6
    prey_survive_reward: float = 0.02
    prey_caught_penalty: float = -3.0

@dataclass
class Metrics:
    episodes: int = 0
    catches: int = 0
    avg_steps_to_catch: float = 0.0
    avg_path_efficiency: float = 0.0  # optimal / actual (0..1)
    last_episode_steps: int = 0
    last_episode_eff: float = 0.0
    epsilon: float = 0.10

@dataclass
class WorldState:
    seed: int
    step: int
    grid: List[List[int]]
    agents: Dict[str, Agent]
    controlled: str
    pov: str
    overlay: bool

    caught: bool
    branches: Dict[str, int]

    # instrumentation
    event_log: List[str]
    trace_log: List[str]  # more detailed step trace (bounded)

    # training
    cfg: TrainConfig
    q_pred: Dict[str, List[float]]
    q_prey: Dict[str, List[float]]
    metrics: Metrics

@dataclass
class Snapshot:
    step: int
    agents: Dict[str, Dict]
    grid: List[List[int]]
    caught: bool
    event_log_tail: List[str]
    trace_tail: List[str]

# -----------------------------
# Environment
# -----------------------------
def default_grid() -> List[List[int]]:
    g = [[EMPTY for _ in range(GRID_W)] for _ in range(GRID_H)]
    for x in range(GRID_W):
        g[0][x] = WALL
        g[GRID_H - 1][x] = WALL
    for y in range(GRID_H):
        g[y][0] = WALL
        g[y][GRID_W - 1] = WALL

    for x in range(4, 17):
        g[7][x] = WALL
    g[7][10] = DOOR

    g[3][4] = FOOD
    g[11][15] = FOOD
    g[4][14] = NOISE
    g[12][5] = NOISE
    g[2][18] = TELE
    g[13][2] = TELE
    return g

def init_state(seed: int) -> WorldState:
    agents = {
        "Predator": Agent("Predator", 2, 2, 0, 100),
        "Prey":     Agent("Prey", 18, 12, 2, 100),
        "Scout":    Agent("Scout", 10, 3, 1, 100),
    }
    cfg = TrainConfig()
    return WorldState(
        seed=seed,
        step=0,
        grid=default_grid(),
        agents=agents,
        controlled="Predator",
        pov="Predator",
        overlay=False,
        caught=False,
        branches={"main": 0},
        event_log=["Initialized world."],
        trace_log=[],
        cfg=cfg,
        q_pred={},
        q_prey={},
        metrics=Metrics(epsilon=cfg.epsilon),
    )

# -----------------------------
# Belief maps
# -----------------------------
def init_belief() -> Dict[str, np.ndarray]:
    b = {}
    for nm in ["Predator", "Prey", "Scout"]:
        b[nm] = -1 * np.ones((GRID_H, GRID_W), dtype=np.int16)
    return b

# -----------------------------
# Helpers
# -----------------------------
def in_bounds(x: int, y: int) -> bool:
    return 0 <= x < GRID_W and 0 <= y < GRID_H

def is_blocking(tile: int) -> bool:
    return tile == WALL

def manhattan(a: Agent, b: Agent) -> int:
    return abs(a.x - b.x) + abs(a.y - b.y)

def bresenham_los(grid: List[List[int]], x0: int, y0: int, x1: int, y1: int) -> bool:
    dx = abs(x1 - x0)
    dy = abs(y1 - y0)
    sx = 1 if x0 < x1 else -1
    sy = 1 if y0 < y1 else -1
    err = dx - dy
    x, y = x0, y0
    while True:
        if (x, y) != (x0, y0) and (x, y) != (x1, y1):
            if grid[y][x] == WALL:
                return False
        if x == x1 and y == y1:
            return True
        e2 = 2 * err
        if e2 > -dy:
            err -= dy
            x += sx
        if e2 < dx:
            err += dx
            y += sy

def within_fov(observer: Agent, tx: int, ty: int, fov_deg: float = FOV_DEG) -> bool:
    dx = tx - observer.x
    dy = ty - observer.y
    if dx == 0 and dy == 0:
        return True
    angle = math.degrees(math.atan2(dy, dx)) % 360
    facing = ORI_DEG[observer.ori]
    diff = (angle - facing + 540) % 360 - 180
    return abs(diff) <= (fov_deg / 2)

def visible(observer: Agent, target: Agent, grid: List[List[int]]) -> bool:
    return within_fov(observer, target.x, target.y, FOV_DEG) and bresenham_los(grid, observer.x, observer.y, target.x, target.y)

# -----------------------------
# Movement
# -----------------------------
def turn_left(a: Agent) -> None:
    a.ori = (a.ori - 1) % 4

def turn_right(a: Agent) -> None:
    a.ori = (a.ori + 1) % 4

def move_forward(state: WorldState, a: Agent) -> str:
    dx, dy = DIRS[a.ori]
    nx, ny = a.x + dx, a.y + dy
    if not in_bounds(nx, ny):
        return "blocked: bounds"
    if is_blocking(state.grid[ny][nx]):
        return "blocked: wall"
    if state.grid[ny][nx] == DOOR:
        state.grid[ny][nx] = EMPTY
        state.event_log.append(f"t={state.step}: {a.name} opened a door.")
    a.x, a.y = nx, ny

    if state.grid[ny][nx] == TELE:
        teles = [(x, y) for y in range(GRID_H) for x in range(GRID_W) if state.grid[y][x] == TELE]
        if len(teles) >= 2:
            teles_sorted = sorted(teles)
            idx = teles_sorted.index((nx, ny))
            dest = teles_sorted[(idx + 1) % len(teles_sorted)]
            a.x, a.y = dest
            state.event_log.append(f"t={state.step}: {a.name} teleported.")
            return "moved: teleported"
    return "moved"

def apply_action(state: WorldState, agent_name: str, action: str) -> str:
    a = state.agents[agent_name]
    if action == "L":
        turn_left(a)
        return "turned left"
    if action == "R":
        turn_right(a)
        return "turned right"
    if action == "F":
        return move_forward(state, a)
    return "noop"

# -----------------------------
# Rendering
# -----------------------------
def raycast_view(state: WorldState, observer: Agent) -> np.ndarray:
    img = np.zeros((VIEW_H, VIEW_W, 3), dtype=np.uint8)
    img[:, :] = SKY

    for y in range(VIEW_H // 2, VIEW_H):
        t = (y - VIEW_H // 2) / (VIEW_H // 2 + 1e-6)
        col = (1 - t) * FLOOR_NEAR + t * FLOOR_FAR
        img[y, :] = col.astype(np.uint8)

    fov = math.radians(FOV_DEG)
    half_fov = fov / 2

    for rx in range(RAY_W):
        cam_x = (2 * rx / (RAY_W - 1)) - 1
        ray_ang = math.radians(ORI_DEG[observer.ori]) + cam_x * half_fov

        ox, oy = observer.x + 0.5, observer.y + 0.5
        sin_a = math.sin(ray_ang)
        cos_a = math.cos(ray_ang)

        depth = 0.0
        hit = None  # None, "wall", "door"
        side = 0

        while depth < MAX_DEPTH:
            depth += 0.05
            tx = int(ox + cos_a * depth)
            ty = int(oy + sin_a * depth)
            if not in_bounds(tx, ty):
                break
            tile = state.grid[ty][tx]
            if tile == WALL:
                hit = "wall"
                side = 1 if abs(cos_a) > abs(sin_a) else 0
                break
            if tile == DOOR:
                hit = "door"
                break

        if hit is None:
            continue

        depth *= math.cos(ray_ang - math.radians(ORI_DEG[observer.ori]))
        depth = max(depth, 0.001)

        proj_h = int((VIEW_H * 0.9) / depth)
        y0 = max(0, VIEW_H // 2 - proj_h // 2)
        y1 = min(VIEW_H - 1, VIEW_H // 2 + proj_h // 2)

        if hit == "door":
            col = DOOR_COL.copy()
        else:
            col = WALL_BASE.copy() if side == 0 else WALL_SIDE.copy()

        dim = max(0.25, 1.0 - (depth / MAX_DEPTH))
        col = (col * dim).astype(np.uint8)

        x0 = int(rx * (VIEW_W / RAY_W))
        x1 = int((rx + 1) * (VIEW_W / RAY_W))
        img[y0:y1, x0:x1] = col

    # billboards for visible agents
    for nm, other in state.agents.items():
        if nm == observer.name:
            continue
        if visible(observer, other, state.grid):
            dx = other.x - observer.x
            dy = other.y - observer.y
            ang = (math.degrees(math.atan2(dy, dx)) % 360)
            facing = ORI_DEG[observer.ori]
            diff = (ang - facing + 540) % 360 - 180
            sx = int((diff / (FOV_DEG / 2)) * (VIEW_W / 2) + (VIEW_W / 2))
            dist = math.sqrt(dx * dx + dy * dy)
            h = int((VIEW_H * 0.65) / max(dist, 0.75))
            w = max(10, h // 3)
            y_mid = VIEW_H // 2
            y0 = max(0, y_mid - h // 2)
            y1 = min(VIEW_H - 1, y_mid + h // 2)
            x0 = max(0, sx - w // 2)
            x1 = min(VIEW_W - 1, sx + w // 2)
            col = AGENT_COLORS.get(nm, (255, 200, 120))
            img[y0:y1, x0:x1] = np.array(col, dtype=np.uint8)

    if state.overlay:
        cx, cy = VIEW_W // 2, VIEW_H // 2
        img[cy - 1:cy + 2, cx - 10:cx + 10] = np.array([120, 190, 255], dtype=np.uint8)
        img[cy - 10:cy + 10, cx - 1:cx + 2] = np.array([120, 190, 255], dtype=np.uint8)

    return img

def render_topdown(grid: np.ndarray, agents: Dict[str, Agent], title: str, show_agents: bool = True) -> Image.Image:
    w = grid.shape[1] * TILE
    h = grid.shape[0] * TILE
    im = Image.new("RGB", (w, h + 28), (10, 12, 18))
    draw = ImageDraw.Draw(im)

    for y in range(grid.shape[0]):
        for x in range(grid.shape[1]):
            t = int(grid[y, x])
            if t == -1:
                col = (18, 20, 32)
            elif t == EMPTY:
                col = (26, 30, 44)
            elif t == WALL:
                col = (190, 190, 210)
            elif t == FOOD:
                col = (255, 210, 120)
            elif t == NOISE:
                col = (255, 120, 220)
            elif t == DOOR:
                col = (140, 210, 255)
            elif t == TELE:
                col = (120, 190, 255)
            else:
                col = (80, 80, 90)

            x0, y0 = x * TILE, y * TILE + 28
            draw.rectangle([x0, y0, x0 + TILE - 1, y0 + TILE - 1], fill=col)

    for x in range(grid.shape[1] + 1):
        xx = x * TILE
        draw.line([xx, 28, xx, h + 28], fill=(12, 14, 22))
    for y in range(grid.shape[0] + 1):
        yy = y * TILE + 28
        draw.line([0, yy, w, yy], fill=(12, 14, 22))

    if show_agents:
        for nm, a in agents.items():
            cx = a.x * TILE + TILE // 2
            cy = a.y * TILE + 28 + TILE // 2
            col = AGENT_COLORS.get(nm, (220, 220, 220))
            r = TILE // 3
            draw.ellipse([cx - r, cy - r, cx + r, cy + r], fill=col)
            dx, dy = DIRS[a.ori]
            draw.line([cx, cy, cx + dx * r, cy + dy * r], fill=(10, 10, 10), width=3)

    draw.rectangle([0, 0, w, 28], fill=(14, 16, 26))
    draw.text((8, 6), title, fill=(230, 230, 240))
    return im

# -----------------------------
# Belief updates
# -----------------------------
def update_belief_for_agent(state: WorldState, belief: np.ndarray, agent: Agent) -> None:
    belief[agent.y, agent.x] = state.grid[agent.y][agent.x]
    base = math.radians(ORI_DEG[agent.ori])
    half = math.radians(FOV_DEG / 2)
    rays = 33 if agent.name != "Scout" else 45

    for i in range(rays):
        t = i / (rays - 1)
        ang = base + (t * 2 - 1) * half
        sin_a, cos_a = math.sin(ang), math.cos(ang)
        ox, oy = agent.x + 0.5, agent.y + 0.5
        depth = 0.0
        while depth < MAX_DEPTH:
            depth += 0.2
            tx = int(ox + cos_a * depth)
            ty = int(oy + sin_a * depth)
            if not in_bounds(tx, ty):
                break
            belief[ty, tx] = state.grid[ty][tx]
            if state.grid[ty][tx] == WALL:
                break

# -----------------------------
# Optimal distance (BFS) for efficiency metric
# -----------------------------
def bfs_distance(grid: List[List[int]], sx: int, sy: int, gx: int, gy: int) -> Optional[int]:
    if (sx, sy) == (gx, gy):
        return 0
    q = [(sx, sy)]
    dist = { (sx, sy): 0 }
    head = 0
    while head < len(q):
        x, y = q[head]; head += 1
        for dx, dy in DIRS:
            nx, ny = x + dx, y + dy
            if not in_bounds(nx, ny):
                continue
            if grid[ny][nx] == WALL:
                continue
            if (nx, ny) in dist:
                continue
            dist[(nx, ny)] = dist[(x, y)] + 1
            if (nx, ny) == (gx, gy):
                return dist[(nx, ny)]
            q.append((nx, ny))
    return None

# -----------------------------
# Observation encoding (compact state key)
# -----------------------------
def obs_key(state: WorldState, who: str) -> str:
    pred = state.agents["Predator"]
    prey = state.agents["Prey"]
    a = state.agents[who]
    # relative position coarse-binned to keep table smaller
    dx = prey.x - pred.x
    dy = prey.y - pred.y
    dx_bin = int(np.clip(dx, -6, 6))
    dy_bin = int(np.clip(dy, -6, 6))
    vis = 1 if visible(pred, prey, state.grid) else 0
    # include own orientation and role
    if who == "Predator":
        return f"P|{pred.x},{pred.y},{pred.ori}|d{dx_bin},{dy_bin}|v{vis}"
    if who == "Prey":
        # prey cares if predator is visible to it
        vis2 = 1 if visible(prey, pred, state.grid) else 0
        ddx = pred.x - prey.x
        ddy = pred.y - prey.y
        ddx_bin = int(np.clip(ddx, -6, 6))
        ddy_bin = int(np.clip(ddy, -6, 6))
        return f"R|{prey.x},{prey.y},{prey.ori}|d{ddx_bin},{ddy_bin}|v{vis2}|e{int(prey.energy//25)}"
    # Scout: simple
    return f"S|{a.x},{a.y},{a.ori}"

def q_get(q: Dict[str, List[float]], key: str) -> List[float]:
    if key not in q:
        q[key] = [0.0, 0.0, 0.0]
    return q[key]

def epsilon_greedy(qvals: List[float], eps: float, r: np.random.Generator) -> int:
    if r.random() < eps:
        return int(r.integers(0, len(qvals)))
    return int(np.argmax(qvals))

def q_update(q: Dict[str, List[float]], key: str, a_idx: int, reward: float, next_key: str, alpha: float, gamma: float) -> Tuple[float, float, float]:
    qv = q_get(q, key)
    nq = q_get(q, next_key)
    old = qv[a_idx]
    target = reward + gamma * float(np.max(nq))
    new = old + alpha * (target - old)
    qv[a_idx] = new
    return old, target, new

# -----------------------------
# Baseline heuristic policies (for Scout + fallback)
# -----------------------------
def heuristic_pred_action(state: WorldState) -> str:
    pred = state.agents["Predator"]
    prey = state.agents["Prey"]
    if visible(pred, prey, state.grid):
        dx = prey.x - pred.x
        dy = prey.y - pred.y
        ang = (math.degrees(math.atan2(dy, dx)) % 360)
        facing = ORI_DEG[pred.ori]
        diff = (ang - facing + 540) % 360 - 180
        if diff < -10:
            return "L"
        if diff > 10:
            return "R"
        return "F"
    r = rng_for(state.seed, state.step, stream=11)
    return r.choice(ACTIONS)

def heuristic_prey_action(state: WorldState) -> str:
    prey = state.agents["Prey"]
    pred = state.agents["Predator"]
    if visible(prey, pred, state.grid):
        dx = pred.x - prey.x
        dy = pred.y - prey.y
        ang = (math.degrees(math.atan2(dy, dx)) % 360)
        facing = ORI_DEG[prey.ori]
        diff = (ang - facing + 540) % 360 - 180
        diff_away = ((diff + 180) + 540) % 360 - 180
        if diff_away < -10:
            return "L"
        if diff_away > 10:
            return "R"
        return "F"
    r = rng_for(state.seed, state.step, stream=12)
    return r.choice(ACTIONS)

def heuristic_scout_action(state: WorldState) -> str:
    r = rng_for(state.seed, state.step, stream=13)
    return r.choice(ACTIONS)

# -----------------------------
# Reward shaping
# -----------------------------
def pred_reward(state_prev: WorldState, state_now: WorldState) -> float:
    cfg = state_now.cfg
    pred0 = state_prev.agents["Predator"]
    prey0 = state_prev.agents["Prey"]
    pred1 = state_now.agents["Predator"]
    prey1 = state_now.agents["Prey"]
    d0 = abs(pred0.x - prey0.x) + abs(pred0.y - prey0.y)
    d1 = abs(pred1.x - prey1.x) + abs(pred1.y - prey1.y)
    r = cfg.pred_step_penalty + cfg.pred_dist_coeff * (d0 - d1)  # reward closing distance
    if state_now.caught:
        r += cfg.pred_catch_reward
    return float(r)

def prey_reward(state_prev: WorldState, state_now: WorldState, ate_food: bool) -> float:
    cfg = state_now.cfg
    r = cfg.prey_step_penalty + cfg.prey_survive_reward
    if ate_food:
        r += cfg.prey_food_reward
    if state_now.caught:
        r += cfg.prey_caught_penalty
    return float(r)

# -----------------------------
# Core simulation tick (with instrumentation + optional learning)
# -----------------------------
TRACE_MAX = 400

def clone_shallow(state: WorldState) -> WorldState:
    # clone for reward computation, minimal fields
    return WorldState(
        seed=state.seed,
        step=state.step,
        grid=[row[:] for row in state.grid],
        agents={k: Agent(**asdict(v)) for k, v in state.agents.items()},
        controlled=state.controlled,
        pov=state.pov,
        overlay=state.overlay,
        caught=state.caught,
        branches=dict(state.branches),
        event_log=list(state.event_log),
        trace_log=list(state.trace_log),
        cfg=state.cfg,
        q_pred=state.q_pred,
        q_prey=state.q_prey,
        metrics=state.metrics,
    )

def check_catch(state: WorldState) -> None:
    pred = state.agents["Predator"]
    prey = state.agents["Prey"]
    if pred.x == prey.x and pred.y == prey.y:
        state.caught = True
        state.event_log.append(f"t={state.step}: CAUGHT.")

def consume_food(state: WorldState) -> bool:
    prey = state.agents["Prey"]
    if state.grid[prey.y][prey.x] == FOOD:
        prey.energy = min(200, prey.energy + 35)
        state.grid[prey.y][prey.x] = EMPTY
        state.event_log.append(f"t={state.step}: Prey ate food (+energy).")
        return True
    return False

def choose_action(state: WorldState, who: str, stream: int) -> Tuple[str, str, Optional[Tuple[str,int]]]:
    """
    Returns (action, reason, q_info)
    q_info: (obs_key, action_index) if chosen by Q, else None
    """
    cfg = state.cfg
    r = rng_for(state.seed, state.step, stream=stream)

    if who == "Predator" and cfg.use_q_pred:
        k = obs_key(state, "Predator")
        qv = q_get(state.q_pred, k)
        a_idx = epsilon_greedy(qv, state.metrics.epsilon, r)
        return ACTIONS[a_idx], f"Q(pred) eps={state.metrics.epsilon:.3f} q={np.round(qv,3).tolist()}", (k, a_idx)

    if who == "Prey" and cfg.use_q_prey:
        k = obs_key(state, "Prey")
        qv = q_get(state.q_prey, k)
        a_idx = epsilon_greedy(qv, state.metrics.epsilon, r)
        return ACTIONS[a_idx], f"Q(prey) eps={state.metrics.epsilon:.3f} q={np.round(qv,3).tolist()}", (k, a_idx)

    # fallbacks
    if who == "Predator":
        a = heuristic_pred_action(state)
        return a, "heuristic(pred)", None
    if who == "Prey":
        a = heuristic_prey_action(state)
        return a, "heuristic(prey)", None
    a = heuristic_scout_action(state)
    return a, "heuristic(scout)", None

def tick(state: WorldState, manual_action: Optional[str] = None) -> None:
    if state.caught:
        return

    prev = clone_shallow(state)

    # record optimal distance for efficiency stats
    pred = state.agents["Predator"]
    prey = state.agents["Prey"]
    opt_dist = bfs_distance(state.grid, pred.x, pred.y, prey.x, prey.y)
    if opt_dist is None:
        opt_dist = 999

    # Action selection
    chosen = {}
    reasons = {}
    qinfo = {}

    # manual action applies to controlled agent
    if manual_action:
        chosen[state.controlled] = manual_action
        reasons[state.controlled] = "manual"
        qinfo[state.controlled] = None

    # others choose
    for who in ["Predator", "Prey", "Scout"]:
        if who in chosen:
            continue
        act, reason, q_i = choose_action(state, who, stream={"Predator":21,"Prey":22,"Scout":23}[who])
        chosen[who] = act
        reasons[who] = reason
        qinfo[who] = q_i

    # Apply actions (deterministic order)
    outcomes = {}
    for who in ["Predator", "Prey", "Scout"]:
        outcomes[who] = apply_action(state, who, chosen[who])

    ate = consume_food(state)
    check_catch(state)

    # Rewards + Q-updates
    pred_r = pred_reward(prev, state)
    prey_r = prey_reward(prev, state, ate_food=ate)

    q_lines = []
    if qinfo["Predator"] is not None:
        k, a_idx = qinfo["Predator"]
        nk = obs_key(state, "Predator")
        old, target, new = q_update(state.q_pred, k, a_idx, pred_r, nk, state.cfg.alpha, state.cfg.gamma)
        q_lines.append(f"Qpred: {k} a={ACTIONS[a_idx]} old={old:.3f} tgt={target:.3f} new={new:.3f}")

    if qinfo["Prey"] is not None:
        k, a_idx = qinfo["Prey"]
        nk = obs_key(state, "Prey")
        old, target, new = q_update(state.q_prey, k, a_idx, prey_r, nk, state.cfg.alpha, state.cfg.gamma)
        q_lines.append(f"Qprey: {k} a={ACTIONS[a_idx]} old={old:.3f} tgt={target:.3f} new={new:.3f}")

    # Trace line
    dist_now = manhattan(state.agents["Predator"], state.agents["Prey"])
    eff = (opt_dist / max(1, dist_now)) if dist_now > 0 else 1.0
    trace = (
        f"t={state.step} optDist~{opt_dist} distNow={dist_now} "
        f"| Pred:{chosen['Predator']} ({outcomes['Predator']}) [{reasons['Predator']}] r={pred_r:+.3f} "
        f"| Prey:{chosen['Prey']} ({outcomes['Prey']}) [{reasons['Prey']}] r={prey_r:+.3f} "
        f"| Scout:{chosen['Scout']} ({outcomes['Scout']}) [{reasons['Scout']}] "
        f"| ateFood={ate} caught={state.caught}"
    )
    if q_lines:
        trace += " | " + " ; ".join(q_lines)

    state.trace_log.append(trace)
    if len(state.trace_log) > TRACE_MAX:
        state.trace_log = state.trace_log[-TRACE_MAX:]

    state.step += 1

# -----------------------------
# Episode reset + training
# -----------------------------
def reset_episode(state: WorldState, seed: Optional[int] = None) -> None:
    # Keep Q-tables + cfg + metrics; reset world + logs
    if seed is None:
        seed = state.seed
    fresh = init_state(seed)
    fresh.cfg = state.cfg
    fresh.q_pred = state.q_pred
    fresh.q_prey = state.q_prey
    fresh.metrics = state.metrics
    fresh.metrics.epsilon = state.metrics.epsilon
    state.seed = fresh.seed
    state.step = 0
    state.grid = fresh.grid
    state.agents = fresh.agents
    state.controlled = fresh.controlled
    state.pov = fresh.pov
    state.overlay = fresh.overlay
    state.caught = False
    state.branches = fresh.branches
    state.event_log = ["Episode reset."]
    state.trace_log = []

def run_episode(state: WorldState, max_steps: int) -> Tuple[bool, int, float]:
    # returns (caught, steps, path_eff)
    start_pred = state.agents["Predator"]
    start_prey = state.agents["Prey"]
    opt = bfs_distance(state.grid, start_pred.x, start_pred.y, start_prey.x, start_prey.y)
    if opt is None:
        opt = 999
    steps = 0
    while steps < max_steps and not state.caught:
        tick(state, manual_action=None)
        steps += 1
    caught = state.caught
    eff = float(opt / max(1, steps)) if opt < 999 else 0.0
    return caught, steps, eff

def train(state: WorldState, episodes: int, max_steps: int) -> None:
    m = state.metrics
    cfg = state.cfg
    catches = 0
    total_steps_catch = 0
    total_eff = 0.0

    for ep in range(episodes):
        # deterministically vary episode seed so it doesn't memorize one map-layout only
        ep_seed = (state.seed * 1_000_003 + (m.episodes + ep) * 97_531) & 0xFFFFFFFF
        reset_episode(state, seed=int(ep_seed))

        caught, steps, eff = run_episode(state, max_steps=max_steps)
        total_eff += eff

        if caught:
            catches += 1
            total_steps_catch += steps

        # epsilon decay
        m.epsilon = max(cfg.epsilon_min, m.epsilon * cfg.epsilon_decay)

    # Update metrics
    m.episodes += episodes
    m.catches += catches
    m.last_episode_steps = steps
    m.last_episode_eff = eff
    if catches > 0:
        # moving average by episode count for stability
        avg_steps = total_steps_catch / catches
        m.avg_steps_to_catch = (
            0.85 * m.avg_steps_to_catch + 0.15 * avg_steps
            if m.avg_steps_to_catch > 0 else avg_steps
        )
    avg_eff = total_eff / max(1, episodes)
    m.avg_path_efficiency = (
        0.85 * m.avg_path_efficiency + 0.15 * avg_eff
        if m.avg_path_efficiency > 0 else avg_eff
    )

    state.event_log.append(
        f"Training: +{episodes} eps | catches={catches}/{episodes} | "
        f"avgStepsToCatch~{m.avg_steps_to_catch:.2f} | avgEff~{m.avg_path_efficiency:.2f} | eps={m.epsilon:.3f}"
    )

# -----------------------------
# History / snapshots
# -----------------------------
MAX_HISTORY = 1200

def snapshot_of(state: WorldState) -> Snapshot:
    return Snapshot(
        step=state.step,
        agents={k: asdict(v) for k, v in state.agents.items()},
        grid=[row[:] for row in state.grid],
        caught=state.caught,
        event_log_tail=state.event_log[-20:],
        trace_tail=state.trace_log[-40:],
    )

def restore_into(state: WorldState, snap: Snapshot) -> None:
    state.step = snap.step
    state.grid = [row[:] for row in snap.grid]
    for k, d in snap.agents.items():
        state.agents[k] = Agent(**d)
    state.caught = snap.caught
    state.event_log.append(f"Jumped to snapshot t={snap.step}.")

# -----------------------------
# Export / import
# -----------------------------
def export_run(state: WorldState, history: List[Snapshot]) -> str:
    payload = {
        "seed": state.seed,
        "controlled": state.controlled,
        "pov": state.pov,
        "overlay": state.overlay,
        "cfg": asdict(state.cfg),
        "metrics": asdict(state.metrics),
        "q_pred": state.q_pred,
        "q_prey": state.q_prey,
        "history": [asdict(s) for s in history],
        "grid": state.grid,
    }
    return json.dumps(payload, indent=2)

def import_run(txt: str) -> Tuple[WorldState, List[Snapshot], Dict[str, np.ndarray], int]:
    data = json.loads(txt)
    st = init_state(int(data.get("seed", 1337)))
    st.controlled = data.get("controlled", st.controlled)
    st.pov = data.get("pov", st.pov)
    st.overlay = bool(data.get("overlay", False))
    st.grid = data.get("grid", st.grid)

    st.cfg = TrainConfig(**data.get("cfg", asdict(st.cfg)))
    st.metrics = Metrics(**data.get("metrics", asdict(st.metrics)))

    st.q_pred = data.get("q_pred", {})
    st.q_prey = data.get("q_prey", {})

    hist = [Snapshot(**s) for s in data.get("history", [])]
    bel = init_belief()
    r_idx = max(0, len(hist) - 1)

    if hist:
        restore_into(st, hist[-1])
    st.event_log.append("Imported run.")
    return st, hist, bel, r_idx

# -----------------------------
# UI glue
# -----------------------------
def build_views(state: WorldState, beliefs: Dict[str, np.ndarray]) -> Tuple[np.ndarray, Image.Image, Image.Image, Image.Image, str, str, str]:
    for nm, a in state.agents.items():
        update_belief_for_agent(state, beliefs[nm], a)

    pov = raycast_view(state, state.agents[state.pov])
    truth_np = np.array(state.grid, dtype=np.int16)
    truth_img = render_topdown(truth_np, state.agents, f"Truth Map — t={state.step} seed={state.seed}", show_agents=True)

    ctrl = state.controlled
    other = "Prey" if ctrl == "Predator" else "Predator"
    b_ctrl = render_topdown(beliefs[ctrl], state.agents, f"{ctrl} Belief", show_agents=True)
    b_other = render_topdown(beliefs[other], state.agents, f"{other} Belief", show_agents=True)

    m = state.metrics
    pred = state.agents["Predator"]
    prey = state.agents["Prey"]
    scout = state.agents["Scout"]

    status = (
        f"Controlled={state.controlled} | POV={state.pov} | caught={state.caught} | eps={m.epsilon:.3f}\n"
        f"Episodes={m.episodes} | catches={m.catches} | avgStepsToCatch~{m.avg_steps_to_catch:.2f} | avgEff~{m.avg_path_efficiency:.2f}\n"
        f"Pred({pred.x},{pred.y}) o={pred.ori} | Prey({prey.x},{prey.y}) o={prey.ori} e={prey.energy} | Scout({scout.x},{scout.y}) o={scout.ori}"
    )
    events = "\n".join(state.event_log[-18:])
    trace = "\n".join(state.trace_log[-18:])
    return pov, truth_img, b_ctrl, b_other, status, events, trace

def grid_click_to_tile(evt: gr.SelectData, selected_tile: int, state: WorldState) -> WorldState:
    x_px, y_px = evt.index
    y_px -= 28
    if y_px < 0:
        return state
    gx = int(x_px // TILE)
    gy = int(y_px // TILE)
    if not in_bounds(gx, gy):
        return state
    if gx == 0 or gy == 0 or gx == GRID_W - 1 or gy == GRID_H - 1:
        return state
    state.grid[gy][gx] = selected_tile
    state.event_log.append(f"t={state.step}: Tile ({gx},{gy}) -> {TILE_NAMES.get(selected_tile)}")
    return state

# -----------------------------
# Gradio App
# -----------------------------
with gr.Blocks(title="Agent POV") as demo:
    gr.Markdown(
        "## Agent-POV by ZEN AI Co.\n"
        "Track every interaction, train policies, and audit why outcomes happened.\n"
        "No timers (compatibility). Use Tick/Run/Train for controlled experiments."
    )

    st = gr.State(init_state(1337))
    history = gr.State([snapshot_of(init_state(1337))])
    beliefs = gr.State(init_belief())
    rewind_idx = gr.State(0)

    with gr.Row():
        pov_img = gr.Image(label="POV (Pseudo-3D)", type="numpy", width=VIEW_W, height=VIEW_H)
        with gr.Column():
            status = gr.Textbox(label="Status + Metrics", lines=4)
            events = gr.Textbox(label="Event Log", lines=10)
            trace = gr.Textbox(label="Step Trace (why it happened)", lines=10)

    with gr.Row():
        truth = gr.Image(label="Truth Map (click to edit tiles)", type="pil")
        belief_a = gr.Image(label="Belief (Controlled)", type="pil")
        belief_b = gr.Image(label="Belief (Other)", type="pil")

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Manual Controls")
            with gr.Row():
                btn_L = gr.Button("L")
                btn_F = gr.Button("F")
                btn_R = gr.Button("R")
            with gr.Row():
                btn_tick = gr.Button("Tick")
                run_steps = gr.Number(value=25, label="Run N steps", precision=0)
                btn_run = gr.Button("Run")
            with gr.Row():
                btn_toggle_control = gr.Button("Toggle Controlled")
                btn_toggle_pov = gr.Button("Toggle POV")
                overlay = gr.Checkbox(False, label="Overlay reticle")

            tile_pick = gr.Radio(
                choices=[(TILE_NAMES[k], k) for k in [EMPTY, WALL, FOOD, NOISE, DOOR, TELE]],
                value=WALL,
                label="Paint tile type"
            )

        with gr.Column(scale=3):
            gr.Markdown("### Training Controls (Q-learning)")
            use_q_pred = gr.Checkbox(True, label="Use Q-learning: Predator")
            use_q_prey = gr.Checkbox(True, label="Use Q-learning: Prey")
            alpha = gr.Slider(0.01, 0.5, value=0.15, step=0.01, label="alpha (learn rate)")
            gamma = gr.Slider(0.5, 0.99, value=0.95, step=0.01, label="gamma (discount)")
            eps = gr.Slider(0.0, 0.5, value=0.10, step=0.01, label="epsilon (exploration)")
            eps_decay = gr.Slider(0.90, 0.999, value=0.995, step=0.001, label="epsilon decay")
            eps_min = gr.Slider(0.0, 0.2, value=0.02, step=0.01, label="epsilon min")

            episodes = gr.Number(value=50, label="Train episodes", precision=0)
            max_steps = gr.Number(value=250, label="Max steps per episode", precision=0)
            btn_train = gr.Button("Train")

            btn_reset = gr.Button("Reset Episode")
            btn_reset_all = gr.Button("Reset ALL (wipe Q + metrics)")

    with gr.Row():
        with gr.Column():
            rewind = gr.Slider(0, 0, value=0, step=1, label="Rewind (history index)")
            btn_jump = gr.Button("Jump")
        with gr.Column():
            export_box = gr.Textbox(label="Export JSON", lines=10)
            btn_export = gr.Button("Export")
        with gr.Column():
            import_box = gr.Textbox(label="Import JSON", lines=10)
            btn_import = gr.Button("Import")

    def refresh(state: WorldState, hist: List[Snapshot], bel: Dict[str, np.ndarray], r: int):
        r_max = max(0, len(hist) - 1)
        r = max(0, min(int(r), r_max))
        pov, tr, ba, bb, stxt, etxt, ttxt = build_views(state, bel)
        return (
            pov, tr, ba, bb,
            stxt, etxt, ttxt,
            gr.update(maximum=r_max, value=r),
            r
        )

    def push_hist(state: WorldState, hist: List[Snapshot]) -> List[Snapshot]:
        hist.append(snapshot_of(state))
        if len(hist) > MAX_HISTORY:
            hist.pop(0)
        return hist

    def set_cfg(state: WorldState, uq_pred: bool, uq_prey: bool, a: float, g: float, e: float, ed: float, emin: float):
        state.cfg.use_q_pred = bool(uq_pred)
        state.cfg.use_q_prey = bool(uq_prey)
        state.cfg.alpha = float(a)
        state.cfg.gamma = float(g)
        state.metrics.epsilon = float(e)
        state.cfg.epsilon_decay = float(ed)
        state.cfg.epsilon_min = float(emin)
        return state

    def do_manual(state, hist, bel, r, act):
        tick(state, manual_action=act)
        hist = push_hist(state, hist)
        r = len(hist) - 1
        out = refresh(state, hist, bel, r)
        return out + (state, hist, bel, r)

    def do_tick(state, hist, bel, r):
        tick(state, manual_action=None)
        hist = push_hist(state, hist)
        r = len(hist) - 1
        out = refresh(state, hist, bel, r)
        return out + (state, hist, bel, r)

    def do_run(state, hist, bel, r, n):
        n = max(1, int(n))
        for _ in range(n):
            if state.caught:
                break
            tick(state, manual_action=None)
        hist = push_hist(state, hist)
        r = len(hist) - 1
        out = refresh(state, hist, bel, r)
        return out + (state, hist, bel, r)

    def toggle_control(state, hist, bel, r):
        order = ["Predator", "Prey", "Scout"]
        i = order.index(state.controlled)
        state.controlled = order[(i + 1) % len(order)]
        state.event_log.append(f"Controlled -> {state.controlled}")
        hist = push_hist(state, hist)
        r = len(hist) - 1
        out = refresh(state, hist, bel, r)
        return out + (state, hist, bel, r)

    def toggle_pov(state, hist, bel, r):
        order = ["Predator", "Prey", "Scout"]
        i = order.index(state.pov)
        state.pov = order[(i + 1) % len(order)]
        state.event_log.append(f"POV -> {state.pov}")
        hist = push_hist(state, hist)
        r = len(hist) - 1
        out = refresh(state, hist, bel, r)
        return out + (state, hist, bel, r)

    def set_overlay(state, hist, bel, r, ov):
        state.overlay = bool(ov)
        out = refresh(state, hist, bel, r)
        return out + (state, hist, bel, r)

    def click_truth(tile, state, hist, bel, r, evt: gr.SelectData):
        state = grid_click_to_tile(evt, int(tile), state)
        hist = push_hist(state, hist)
        r = len(hist) - 1
        out = refresh(state, hist, bel, r)
        return out + (state, hist, bel, r)

    def jump(state, hist, bel, r, idx):
        if not hist:
            out = refresh(state, hist, bel, r)
            return out + (state, hist, bel, r)
        idx = max(0, min(int(idx), len(hist) - 1))
        restore_into(state, hist[idx])
        r = idx
        out = refresh(state, hist, bel, r)
        return out + (state, hist, bel, r)

    def reset_ep(state, hist, bel, r):
        reset_episode(state, seed=state.seed)
        hist = [snapshot_of(state)]
        r = 0
        bel = init_belief()
        out = refresh(state, hist, bel, r)
        return out + (state, hist, bel, r)

    def reset_all(state, hist, bel, r):
        seed = state.seed
        state = init_state(seed)
        hist = [snapshot_of(state)]
        bel = init_belief()
        r = 0
        out = refresh(state, hist, bel, r)
        return out + (state, hist, bel, r)

    def do_train(state, hist, bel, r,
                 uq_pred, uq_prey, a, g, e, ed, emin,
                 eps_count, max_s):
        state = set_cfg(state, uq_pred, uq_prey, a, g, e, ed, emin)
        train(state, episodes=max(1, int(eps_count)), max_steps=max(10, int(max_s)))
        # After training, reset to a clean episode so user sees improved behavior
        reset_episode(state, seed=state.seed)
        hist = [snapshot_of(state)]
        bel = init_belief()
        r = 0
        out = refresh(state, hist, bel, r)
        return out + (state, hist, bel, r)

    def export_fn(state, hist):
        return export_run(state, hist)

    def import_fn(txt):
        state, hist, bel, r = import_run(txt)
        pov, tr, ba, bb, stxt, etxt, ttxt = build_views(state, bel)
        r_max = max(0, len(hist) - 1)
        return (
            pov, tr, ba, bb, stxt, etxt, ttxt,
            gr.update(maximum=r_max, value=r),
            state, hist, bel, r
        )

    # --- Wire buttons (no fn_kwargs; use lambdas) ---
    btn_L.click(lambda s,h,b,r: do_manual(s,h,b,r,"L"),
                inputs=[st, history, beliefs, rewind_idx],
                outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                queue=True)

    btn_F.click(lambda s,h,b,r: do_manual(s,h,b,r,"F"),
                inputs=[st, history, beliefs, rewind_idx],
                outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                queue=True)

    btn_R.click(lambda s,h,b,r: do_manual(s,h,b,r,"R"),
                inputs=[st, history, beliefs, rewind_idx],
                outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                queue=True)

    btn_tick.click(do_tick,
                   inputs=[st, history, beliefs, rewind_idx],
                   outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                   queue=True)

    btn_run.click(do_run,
                  inputs=[st, history, beliefs, rewind_idx, run_steps],
                  outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                  queue=True)

    btn_toggle_control.click(toggle_control,
                             inputs=[st, history, beliefs, rewind_idx],
                             outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                             queue=True)

    btn_toggle_pov.click(toggle_pov,
                         inputs=[st, history, beliefs, rewind_idx],
                         outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                         queue=True)

    overlay.change(set_overlay,
                   inputs=[st, history, beliefs, rewind_idx, overlay],
                   outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                   queue=True)

    truth.select(click_truth,
                 inputs=[tile_pick, st, history, beliefs, rewind_idx],
                 outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                 queue=True)

    btn_jump.click(jump,
                   inputs=[st, history, beliefs, rewind_idx, rewind],
                   outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                   queue=True)

    btn_reset.click(reset_ep,
                    inputs=[st, history, beliefs, rewind_idx],
                    outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                    queue=True)

    btn_reset_all.click(reset_all,
                        inputs=[st, history, beliefs, rewind_idx],
                        outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                        queue=True)

    btn_train.click(do_train,
                    inputs=[st, history, beliefs, rewind_idx,
                            use_q_pred, use_q_prey, alpha, gamma, eps, eps_decay, eps_min,
                            episodes, max_steps],
                    outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx, st, history, beliefs, rewind_idx],
                    queue=True)

    btn_export.click(export_fn, inputs=[st, history], outputs=[export_box], queue=True)

    btn_import.click(import_fn,
                     inputs=[import_box],
                     outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, st, history, beliefs, rewind_idx],
                     queue=True)

    demo.load(refresh,
              inputs=[st, history, beliefs, rewind_idx],
              outputs=[pov_img, truth, belief_a, belief_b, status, events, trace, rewind, rewind_idx],
              queue=True)

demo.queue().launch()