Spaces:

Dash10107
/

rl_maze_solver

Sleeping

Daksh C Jain Claude Sonnet 4.6 committed on May 13

Commit

34aeb9a

1 Parent(s): a6e59e3

Transform into interactive RL playground for all audiences

- 4-tab UI: Welcome, Playground, Algorithm Race, How it Works
- DFS maze generator (guaranteed solvable corridors) + open random style
- Animated GIF replay of agent solving the maze step by step
- Q-Learning, SARSA, Monte Carlo — all with plain English descriptions
- Algorithm Race tab: head-to-head convergence charts + winner announcement
- Q-value heatmap showing what the agent learned
- Difficulty presets: Tiny 5x5 to XL 17x17
- No jargon: human labels for all controls, story-driven onboarding
- Modular structure: maze/, agents/, viz/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (12) hide show

.gitignore +2 -0
agents/__init__.py +0 -0
agents/base.py +24 -0
agents/montecarlo.py +42 -0
agents/qlearning.py +31 -0
agents/sarsa.py +32 -0
app.py +631 -388
maze/__init__.py +0 -0
maze/env.py +62 -0
maze/generator.py +83 -0
viz/__init__.py +0 -0
viz/renderer.py +319 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ __pycache__/
2	+ *.pyc

agents/__init__.py ADDED Viewed

File without changes

agents/base.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from __future__ import annotations
+import numpy as np
+class TabularAgent:
+    """Epsilon-greedy agent backed by a dense ``(n_states, n_actions)`` Q-table.
+
+    The table starts as float32 zeros. ``alpha`` and ``gamma`` are stored as
+    attributes; no method of this class reads them.
+    """
+    def __init__(self, n_states: int, n_actions: int,
+                 alpha: float = 0.1, gamma: float = 0.95, epsilon: float = 1.0):
+        """Record the hyperparameters and allocate an all-zero Q-table."""
+        self.n_states, self.n_actions = n_states, n_actions
+        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
+        self.Q = np.zeros(shape=(n_states, n_actions), dtype=np.float32)
+    def choose_action(self, state: int, rng: np.random.Generator) -> int:
+        """Return a random action with probability ``epsilon``, else the greedy one.
+
+        One uniform sample is always drawn from ``rng`` first, so the random
+        stream advances even when the greedy branch is taken.
+        """
+        exploit = not (rng.random() < self.epsilon)
+        if exploit:
+            return self.greedy_action(state)
+        return int(rng.integers(self.n_actions))
+    def greedy_action(self, state: int) -> int:
+        """Return the index of the largest Q-value in row ``state``."""
+        best = np.argmax(self.Q[state])
+        return int(best)
+    def decay_epsilon(self, rate: float):
+        """Multiply ``epsilon`` by ``rate``, never letting it drop below 0.01."""
+        decayed = self.epsilon * rate
+        self.epsilon = decayed if decayed > 0.01 else 0.01

agents/montecarlo.py ADDED Viewed

	@@ -0,0 +1,42 @@

+from __future__ import annotations
+import numpy as np
+from agents.base import TabularAgent
+from maze.env import MazeEnv
+def train_montecarlo(
+    env: MazeEnv, episodes: int, alpha: float, gamma: float,
+    decay: float, seed: int = 0,
+) -> tuple[TabularAgent, list[float]]:
+    """Train a tabular agent with first-visit Monte Carlo control.
+
+    Each episode is rolled out epsilon-greedily for at most
+    ``env.n_states * 4`` steps. Afterwards every (state, action) pair is
+    credited with the discounted return that followed its *first* occurrence
+    in that episode, and its Q-value becomes the running average of all
+    returns credited to it so far.
+
+    Args:
+        env: Maze environment; must provide ``n_states``, ``action_space.n``,
+            ``reset()`` and ``step(action)``.
+        episodes: Number of training episodes.
+        alpha: Passed to ``TabularAgent`` only. The update below is a sample
+            average, so the step size does not influence learning here.
+        gamma: Discount factor applied to future rewards.
+        decay: Multiplicative epsilon decay applied after every episode.
+        seed: Seed for the action-selection random generator.
+
+    Returns:
+        The trained agent and the undiscounted reward total of each episode.
+    """
+    agent = TabularAgent(env.n_states, env.action_space.n, alpha, gamma)
+    rng = np.random.default_rng(seed)
+    # Keep the accumulators out of the Q-table's dtype so long runs neither
+    # lose precision in the running sum nor saturate the visit counter.
+    returns_sum = np.zeros(agent.Q.shape, dtype=np.float64)
+    returns_cnt = np.zeros(agent.Q.shape, dtype=np.int64)
+    rewards = []
+    for _ in range(episodes):
+        state, _ = env.reset()
+        episode: list[tuple[int, int, float]] = []
+        for _ in range(env.n_states * 4):
+            action = agent.choose_action(state, rng)
+            next_state, reward, done, _, _ = env.step(action)
+            episode.append((state, action, reward))
+            state = next_state
+            if done:
+                break
+        # Earliest time step at which each (state, action) pair occurs.
+        first_seen: dict[tuple[int, int], int] = {}
+        for t, (s, a, _r) in enumerate(episode):
+            first_seen.setdefault((s, a), t)
+        # Walk backwards so G is always the discounted return from step t on,
+        # but only record it at the pair's first occurrence. Marking pairs as
+        # "visited" while iterating in reverse would credit the LAST
+        # occurrence instead, which is not first-visit Monte Carlo.
+        G = 0.0
+        for t in range(len(episode) - 1, -1, -1):
+            s, a, r = episode[t]
+            G = gamma * G + r
+            if first_seen[(s, a)] == t:
+                returns_sum[s, a] += G
+                returns_cnt[s, a] += 1
+                agent.Q[s, a] = returns_sum[s, a] / returns_cnt[s, a]
+        agent.decay_epsilon(decay)
+        rewards.append(sum(r for _, _, r in episode))
+    return agent, rewards

agents/qlearning.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from __future__ import annotations
+import numpy as np
+from agents.base import TabularAgent
+from maze.env import MazeEnv
+def train_qlearning(
+    env: MazeEnv, episodes: int, alpha: float, gamma: float,
+    decay: float, seed: int = 0,
+) -> tuple[TabularAgent, list[float]]:
+    """Train a tabular agent with one-step Q-learning.
+
+    Every episode runs for at most ``env.n_states * 4`` steps; epsilon is
+    decayed once per episode.
+
+    Args:
+        env: Maze environment; must provide ``n_states``, ``action_space.n``,
+            ``reset()`` and ``step(action)``.
+        episodes: Number of training episodes.
+        alpha: Step size of the TD update.
+        gamma: Discount factor.
+        decay: Multiplicative epsilon decay applied after every episode.
+        seed: Seed for the action-selection random generator.
+
+    Returns:
+        The trained agent and the undiscounted reward total of each episode.
+    """
+    agent = TabularAgent(env.n_states, env.action_space.n, alpha, gamma)
+    rng = np.random.default_rng(seed)
+    q_table = agent.Q
+    episode_returns = []
+    for _episode in range(episodes):
+        obs, _ = env.reset()
+        episode_return = 0.0
+        for _step in range(env.n_states * 4):
+            act = agent.choose_action(obs, rng)
+            next_obs, reward, done, _, _ = env.step(act)
+            # Off-policy target: bootstrap from the best next action,
+            # whatever the behaviour policy does next. (1 - done) removes the
+            # bootstrap term on the terminal transition.
+            best_next = np.max(q_table[next_obs])
+            td_error = reward + gamma * best_next * (1 - done) - q_table[obs, act]
+            q_table[obs, act] += alpha * td_error
+            episode_return += reward
+            obs = next_obs
+            if done:
+                break
+        agent.decay_epsilon(decay)
+        episode_returns.append(episode_return)
+    return agent, episode_returns

agents/sarsa.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from __future__ import annotations
+import numpy as np
+from agents.base import TabularAgent
+from maze.env import MazeEnv
+def train_sarsa(
+    env: MazeEnv, episodes: int, alpha: float, gamma: float,
+    decay: float, seed: int = 0,
+) -> tuple[TabularAgent, list[float]]:
+    """Train a tabular agent with one-step SARSA.
+
+    Every episode runs for at most ``env.n_states * 4`` steps; epsilon is
+    decayed once per episode.
+
+    Args:
+        env: Maze environment; must provide ``n_states``, ``action_space.n``,
+            ``reset()`` and ``step(action)``.
+        episodes: Number of training episodes.
+        alpha: Step size of the TD update.
+        gamma: Discount factor.
+        decay: Multiplicative epsilon decay applied after every episode.
+        seed: Seed for the action-selection random generator.
+
+    Returns:
+        The trained agent and the undiscounted reward total of each episode.
+    """
+    agent = TabularAgent(env.n_states, env.action_space.n, alpha, gamma)
+    rng = np.random.default_rng(seed)
+    q_table = agent.Q
+    episode_returns = []
+    for _episode in range(episodes):
+        obs, _ = env.reset()
+        act = agent.choose_action(obs, rng)
+        episode_return = 0.0
+        for _step in range(env.n_states * 4):
+            next_obs, reward, done, _, _ = env.step(act)
+            # On-policy: the action sampled here is both the bootstrap target
+            # and the move actually executed on the next iteration.
+            next_act = agent.choose_action(next_obs, rng)
+            bootstrap = q_table[next_obs, next_act]
+            td_error = reward + gamma * bootstrap * (1 - done) - q_table[obs, act]
+            q_table[obs, act] += alpha * td_error
+            episode_return += reward
+            obs, act = next_obs, next_act
+            if done:
+                break
+        agent.decay_epsilon(decay)
+        episode_returns.append(episode_return)
+    return agent, episode_returns

app.py CHANGED Viewed

@@ -1,394 +1,637 @@
 import numpy as np
-import gymnasium as gym
-from gymnasium import spaces
-import random
 import gradio as gr
-import matplotlib.pyplot as plt
-import matplotlib.patches as mpatches
-import io
-import base64
-import seaborn as sns
-from PIL import Image
-# ─────────────────────────────────────────────
-#  MAZE ENVIRONMENT
-# ─────────────────────────────────────────────
-class MazeEnv(gym.Env):
-    """
-    A grid-based maze environment.
-    States  : row * maze_size + col  (integer)
-    Actions : 0=Up, 1=Down, 2=Left, 3=Right
-    """
-    metadata = {"render_modes": []}
-    def __init__(self, maze_size: int = 5, num_walls: int = None, seed: int = None):
-        super().__init__()
-        self.maze_size  = maze_size
-        self.observation_space = spaces.Discrete(maze_size * maze_size)
-        self.action_space      = spaces.Discrete(4)
-        self.start_pos = (0, 0)
-        self.goal_pos  = (maze_size - 1, maze_size - 1)
-        self.seed      = seed
-        self.num_walls = num_walls if num_walls is not None else int(maze_size * maze_size * 0.15)
-        self.maze  = self._generate_maze()
-        self.state = self.start_pos
-    # ── maze generation ──────────────────────
-    def _generate_maze(self) -> np.ndarray:
-        maze = np.zeros((self.maze_size, self.maze_size), dtype=np.int8)
-        rng  = random.Random(self.seed)
-        placed = 0
-        max_attempts = self.num_walls * 100
-        attempts = 0
-        while placed < self.num_walls and attempts < max_attempts:
-            r = rng.randint(0, self.maze_size - 1)
-            c = rng.randint(0, self.maze_size - 1)
-            if (r, c) not in (self.start_pos, self.goal_pos) and maze[r, c] == 0:
-                maze[r, c] = 1
-                placed += 1
-            attempts += 1
-        return maze
-    # ── gym interface ─────────────────────────
-    def reset(self, seed=None, options=None):
-        super().reset(seed=seed)
-        if seed is not None and seed != self.seed:
-            self.seed = seed
-            self.maze = self._generate_maze()
-        self.state = self.start_pos
-        return self._get_obs(), {}
-    def _get_obs(self) -> int:
-        return self.state[0] * self.maze_size + self.state[1]
-    def step(self, action: int):
-        r, c = self.state
-        moves = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}
-        dr, dc     = moves[action]
-        nr, nc     = r + dr, c + dc
-        # Clamp to grid boundaries
-        nr = max(0, min(nr, self.maze_size - 1))
-        nc = max(0, min(nc, self.maze_size - 1))
-        next_pos   = (nr, nc)
-        # Wall collision → stay put, penalise
-        if self.maze[nr, nc] == 1:
-            reward = -10
-            # state unchanged
-        else:
-            self.state = next_pos
-            if self.state == self.goal_pos:
-                reward = 100
-                return self._get_obs(), reward, True, False, {}
-            else:
-                reward = -1
-        return self._get_obs(), reward, False, False, {}
-# ─────────────────────────────────────────────
-#  Q-LEARNING AGENT
-# ─────────────────────────────────────────────
-class QLearningAgent:
-    def __init__(self, num_states: int, num_actions: int,
-                 alpha: float = 0.1, gamma: float = 0.9, epsilon: float = 1.0):
-        self.num_states  = num_states
-        self.num_actions = num_actions
-        self.alpha       = alpha
-        self.gamma       = gamma
-        self.epsilon     = epsilon
-        self.q_table     = np.zeros((num_states, num_actions))
-    def choose_action(self, state: int) -> int:
-        if random.random() < self.epsilon:
-            return random.randint(0, self.num_actions - 1)
-        return int(np.argmax(self.q_table[state]))
-    def update(self, state: int, action: int, reward: float, next_state: int):
-        best_next = np.max(self.q_table[next_state])
-        td_target = reward + self.gamma * best_next
-        self.q_table[state, action] += self.alpha * (td_target - self.q_table[state, action])
-# ─────────────────────────────────────────────
-#  MONTE CARLO AGENT  (first-visit, every-episode update)
-# ─────────────────────────────────────────────
-class MonteCarloAgent:
-    def __init__(self, num_states: int, num_actions: int,
-                 alpha: float = 0.1, gamma: float = 0.9, epsilon: float = 1.0):
-        self.num_states    = num_states
-        self.num_actions   = num_actions
-        self.alpha         = alpha
-        self.gamma         = gamma
-        self.epsilon       = epsilon
-        self.q_table       = np.zeros((num_states, num_actions))
-        self.returns_sum   = np.zeros((num_states, num_actions))
-        self.returns_count = np.zeros((num_states, num_actions))
-    def choose_action(self, state: int) -> int:
-        if random.random() < self.epsilon:
-            return random.randint(0, self.num_actions - 1)
-        return int(np.argmax(self.q_table[state]))
-    def update(self, episode_history: list):
-        """First-visit Monte Carlo update."""
-        G = 0.0
-        visited = set()
-        for state, action, reward in reversed(episode_history):
-            G = self.gamma * G + reward
-            if (state, action) not in visited:
-                visited.add((state, action))
-                self.returns_sum[state, action]   += G
-                self.returns_count[state, action] += 1
-                self.q_table[state, action] = (
-                    self.returns_sum[state, action] / self.returns_count[state, action]
-                )
-# ─────────────────────────────────────────────
-#  TRAINING ROUTINES
-# ─────────────────────────────────────────────
-def train_q_agent(env, agent, num_episodes: int,
-                  min_epsilon: float = 0.01, decay_rate: float = 0.995,
-                  max_steps: int = 500) -> list:
-    rewards_history = []
-    for _ in range(num_episodes):
-        state, _ = env.reset()
-        total_reward = 0
-        for _ in range(max_steps):
-            action                           = agent.choose_action(state)
-            next_state, reward, done, _, _   = env.step(action)
-            agent.update(state, action, reward, next_state)
-            state        = next_state
-            total_reward += reward
-            if done:
-                break
-        agent.epsilon = max(min_epsilon, agent.epsilon * decay_rate)
-        rewards_history.append(total_reward)
-    return rewards_history
-def train_mc_agent(env, agent, num_episodes: int,
-                   min_epsilon: float = 0.01, decay_rate: float = 0.995,
-                   max_steps: int = 500) -> list:
-    rewards_history = []
-    for _ in range(num_episodes):
-        episode_history = []
-        state, _ = env.reset()
-        total_reward = 0
-        for _ in range(max_steps):
-            action                          = agent.choose_action(state)
-            next_state, reward, done, _, _  = env.step(action)
-            episode_history.append((state, action, reward))
-            state        = next_state
-            total_reward += reward
-            if done:
-                break
-        agent.update(episode_history)
-        agent.epsilon = max(min_epsilon, agent.epsilon * decay_rate)
-        rewards_history.append(total_reward)
-    return rewards_history
-# ─────────────────────────────────────────────
-#  VISUALISATION HELPER
-# ─────────────────────────────────────────────
-def render_maze(env, path: list, agent_type: str) -> Image.Image:
-    """Return a PIL Image of the maze with the solved path overlaid."""
-    ms   = env.maze_size
-    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
-    fig.patch.set_facecolor('#1a1a2e')
-    # ── left: maze + path ────────────────────
-    ax = axes[0]
-    ax.set_facecolor('#16213e')
-    # Draw cells
-    for r in range(ms):
-        for c in range(ms):
-            if env.maze[r, c] == 1:
-                color = '#e94560'   # wall
-            elif (r, c) == env.start_pos:
-                color = '#0f3460'
-            elif (r, c) == env.goal_pos:
-                color = '#533483'
-            else:
-                color = '#16213e'
-            rect = mpatches.FancyBboxPatch(
-                (c, ms - 1 - r), 1, 1,
-                boxstyle="round,pad=0.05",
-                linewidth=1.5, edgecolor='#0f3460',
-                facecolor=color
-            )
-            ax.add_patch(rect)
-    # Draw path
-    if len(path) > 1:
-        path_coords = [(s % ms + 0.5, ms - 1 - s // ms + 0.5) for s in path]
-        xs, ys = zip(*path_coords)
-        ax.plot(xs, ys, color='#00d2ff', linewidth=2.5,
-                alpha=0.85, zorder=5, marker='o',
-                markersize=5, markerfacecolor='#00d2ff')
-    # Emoji labels
-    sx, sy = env.start_pos[1] + 0.5, ms - 1 - env.start_pos[0] + 0.5
-    gx, gy = env.goal_pos[1]  + 0.5, ms - 1 - env.goal_pos[0]  + 0.5
-    ax.text(sx, sy, '🐀', ha='center', va='center', fontsize=18, zorder=10)
-    ax.text(gx, gy, '🧀', ha='center', va='center', fontsize=18, zorder=10)
-    ax.set_xlim(0, ms)
-    ax.set_ylim(0, ms)
-    ax.set_aspect('equal')
-    ax.set_xticks(range(ms))
-    ax.set_yticks(range(ms))
-    ax.tick_params(colors='#aaaacc')
-    ax.set_title(f'Maze — {agent_type}', color='white', fontsize=13, pad=10)
-    for spine in ax.spines.values():
-        spine.set_edgecolor('#0f3460')
-    # ── right: Q-value heatmap ────────────────
-    ax2 = axes[1]
-    q_max = np.max(env_ref.agent_q_table, axis=1).reshape(ms, ms) \
-        if hasattr(env_ref, 'agent_q_table') else np.zeros((ms, ms))
-    sns.heatmap(
-        q_max, ax=ax2, cmap='magma', linewidths=0.5, linecolor='#1a1a2e',
-        cbar_kws={'label': 'Max Q-value', 'shrink': 0.8},
-        annot=(ms <= 8), fmt='.1f', annot_kws={'color': 'white', 'size': 8}
-    )
-    ax2.set_title('Max Q-value per Cell', color='white', fontsize=13, pad=10)
-    ax2.tick_params(colors='#aaaacc')
-    ax2.set_facecolor('#16213e')
-    plt.setp(ax2.get_xticklabels(), color='#aaaacc')
-    plt.setp(ax2.get_yticklabels(), color='#aaaacc')
-    ax2.collections[0].colorbar.ax.yaxis.label.set_color('white')
-    ax2.collections[0].colorbar.ax.tick_params(colors='white')
-    plt.tight_layout(pad=2)
-    buf = io.BytesIO()
-    plt.savefig(buf, format='png', dpi=120, bbox_inches='tight',
-                facecolor=fig.get_facecolor())
-    plt.close(fig)
-    buf.seek(0)
-    return Image.open(buf).copy()
-# Tiny global to pass Q-table into render helper without refactoring signature
-class _EnvRef:
-    agent_q_table = None
-env_ref = _EnvRef()
-# ─────────────────────────────────────────────
-#  MAIN CALLABLE  (Gradio fn)
-# ─────────────────────────────────────────────
-def create_and_solve_maze(
-    maze_size: int,
-    num_walls: int,
-    agent_type: str,
-    num_episodes: int,
-    epsilon_decay: float,
-    learning_rate: float,
-    discount_factor: float,
-) -> tuple:
-    """Train an RL agent and return (maze image, stats text)."""
-    seed = random.randint(0, 10_000)
-    env  = MazeEnv(maze_size=maze_size, num_walls=num_walls, seed=seed)
-    n_s  = env.observation_space.n
-    n_a  = env.action_space.n
-    # ── build & train ─────────────────────────
-    if agent_type == 'Q-Learning':
-        agent   = QLearningAgent(n_s, n_a, alpha=learning_rate,
-                                 gamma=discount_factor, epsilon=1.0)
-        history = train_q_agent(env, agent, num_episodes,
-                                decay_rate=epsilon_decay)
-    else:  # Monte Carlo
-        agent   = MonteCarloAgent(n_s, n_a, alpha=learning_rate,
-                                  gamma=discount_factor, epsilon=1.0)
-        history = train_mc_agent(env, agent, num_episodes,
-                                 decay_rate=epsilon_decay)
-    # pass Q-table to renderer
-    env_ref.agent_q_table = agent.q_table
-    # ── greedy rollout ────────────────────────
-    state, _ = env.reset(seed=seed)
-    path      = [state]
-    max_steps = maze_size * maze_size * 3
-    done      = False
-    while not done and len(path) < max_steps:
-        action                        = int(np.argmax(agent.q_table[state]))
-        next_state, _, term, trunc, _ = env.step(action)
-        path.append(next_state)
-        state = next_state
-        done  = term or trunc
-    solved   = (env.state == env.goal_pos)
-    img      = render_maze(env, path, agent_type)
-    # ── stats string ──────────────────────────
-    avg_last = np.mean(history[-100:]) if len(history) >= 100 else np.mean(history)
-    status   = "✅ Goal reached!" if solved else "❌ Did not reach goal."
-    stats    = (
-        f"{status}\n"
-        f"Path length       : {len(path)} steps\n"
-        f"Avg reward (last 100 ep): {avg_last:.1f}\n"
-        f"Final epsilon     : {agent.epsilon:.4f}\n"
-        f"Episodes trained  : {num_episodes}"
-    )
-    return img, stats
-# ─────────────────────────────────────────────
-#  GRADIO INTERFACE
-# ─────────────────────────────────────────────
-custom_css = """
-body { background: #1a1a2e; }
-.gradio-container { background: #1a1a2e !important; color: #e0e0ff; font-family: 'Segoe UI', sans-serif; }
-.gr-button-primary { background: #e94560 !important; border: none !important; }
-.gr-button-primary:hover { background: #c73652 !important; }
-label { color: #aaaacc !important; }
 """
-with gr.Blocks(css=custom_css, title="RL Maze Solver") as iface:
-    gr.Markdown(
-        """
-        # 🐀 RL Rat & Cheese Maze Solver
-        Train a **Q-Learning** or **Monte Carlo** agent to navigate a randomly generated maze.
-        Adjust the parameters and hit **Solve Maze** to watch the agent learn!
-        """
-    )
-    with gr.Row():
-        with gr.Column(scale=1):
-            maze_size      = gr.Slider(5, 15, step=1,   value=7,    label="Maze Size")
-            num_walls      = gr.Slider(0, 50,  step=1,   value=10,   label="Number of Walls")
-            agent_type     = gr.Radio(["Q-Learning", "Monte Carlo"],
-                                      value="Q-Learning", label="Agent Type")
-            num_episodes   = gr.Slider(100, 5000, step=100, value=1000,  label="Training Episodes")
-            epsilon_decay  = gr.Slider(0.90, 0.999, step=0.001, value=0.995, label="Epsilon Decay Rate")
-            learning_rate  = gr.Slider(0.01, 0.5,  step=0.01, value=0.1,   label="Learning Rate (α)")
-            discount_factor= gr.Slider(0.5,  0.99, step=0.01, value=0.9,   label="Discount Factor (γ)")
-            solve_btn      = gr.Button("🚀 Solve Maze", variant="primary")
-        with gr.Column(scale=2):
-            maze_img = gr.Image(type="pil", label="Solved Maze + Q-value Heatmap")
-            stats_box = gr.Textbox(label="Training Stats", lines=6, interactive=False)
-    solve_btn.click(
-        fn=create_and_solve_maze,
-        inputs=[maze_size, num_walls, agent_type, num_episodes,
-                epsilon_decay, learning_rate, discount_factor],
-        outputs=[maze_img, stats_box],
-    )
 if __name__ == "__main__":
-    iface.launch(share=True)

+"""
+🤖 Maze Runner — RL Playground
+An interactive, fun maze-solving playground powered by Reinforcement Learning.
+Anyone can build a maze, pick a brain, and watch the bot learn to escape.
+"""
+from __future__ import annotations
 import numpy as np
 import gradio as gr
+from maze.generator import generate_dfs_maze, generate_open_maze
+from maze.env import MazeEnv
+from agents.qlearning import train_qlearning
+from agents.sarsa import train_sarsa
+from agents.montecarlo import train_montecarlo
+from viz.renderer import (
+    make_solution_gif, make_training_chart,
+    make_qvalue_heatmap, make_race_chart, score_run,
+)
+# ── Helpers ───────────────────────────────────────────────────────────────────
+# Dropdown label shown in the UI -> internal algorithm key used by _train().
+ALGO_MAP = {
+    "🧠 Q-Learning  (recommended)": "qlearning",
+    "🎯 SARSA  (cautious)":          "sarsa",
+    "🎲 Monte Carlo  (explorer)":    "montecarlo",
+}
+# Difficulty label shown in the UI -> maze size handed to the generators.
+DIFFICULTY = {
+    "🐣 Tiny  (5×5)":    5,
+    "🐇 Small  (7×7)":   7,
+    "🐢 Medium  (9×9)":  9,
+    "🦊 Large  (13×13)": 13,
+    "🐉 XL  (17×17)":    17,
+}
+# Maze-style label shown in the UI -> generator selector read by _make_env().
+MAZE_STYLE = {
+    "🏰 Corridors  (DFS)": "dfs",
+    "🌿 Open Field  (random walls)": "open",
+}
+def _make_env(size: int, style: str, seed: int) -> MazeEnv:
+    """Generate a maze grid and wrap it in a ``MazeEnv``.
+
+    ``style == "dfs"`` selects the DFS generator; any other value selects the
+    open-field generator with a wall fraction of 0.18.
+    """
+    grid = (
+        generate_dfs_maze(size, seed=seed)
+        if style == "dfs"
+        else generate_open_maze(size, wall_frac=0.18, seed=seed)
+    )
+    return MazeEnv(grid)
+def _train(env: MazeEnv, algo: str, episodes: int, alpha: float,
+           gamma: float, decay: float, seed: int):
+    """Run the training routine registered under ``algo`` and return its result.
+
+    Raises:
+        KeyError: If ``algo`` is not "qlearning", "sarsa" or "montecarlo".
+    """
+    trainers = {
+        "qlearning": train_qlearning,
+        "sarsa": train_sarsa,
+        "montecarlo": train_montecarlo,
+    }
+    trainer = trainers[algo]
+    return trainer(env, episodes, alpha, gamma, decay, seed)
+def _collect_path(env: MazeEnv, agent) -> list[tuple[int, ...]]:
+    """Follow the agent's greedy policy and record the positions it visits.
+
+    The rollout stops when the environment reports ``done`` or after
+    ``env.n_states * 3`` steps, whichever comes first. The first entry is
+    ``env.start``; each later entry is ``env._from_state(state)`` for the
+    state returned by that step.
+    """
+    state, _ = env.reset()
+    visited: list[tuple[int, ...]] = [env.start]
+    steps_left = env.n_states * 3
+    while steps_left > 0:
+        steps_left -= 1
+        state, _, done, _, _ = env.step(agent.greedy_action(state))
+        visited.append(env._from_state(state))
+        if done:
+            break
+    return visited
+# ── Main Playground callback ──────────────────────────────────────────────────
+def cb_solve(
+    difficulty: str,
+    maze_style: str,
+    algo_label: str,
+    episodes: int,
+    alpha: float,
+    gamma: float,
+    decay: float,
+    seed: int,
+    progress: gr.Progress = gr.Progress(),
+):
+    """Playground callback: build a maze, train one agent, render the results.
+
+    A fresh environment is built from the same (size, style, seed) for each
+    stage (training, GIF, path scoring, heatmap), so no stage sees state left
+    behind by another.
+
+    Args:
+        difficulty: Key of ``DIFFICULTY``.
+        maze_style: Key of ``MAZE_STYLE``.
+        algo_label: Key of ``ALGO_MAP``; the text before "(" is used as the
+            display name.
+        episodes, alpha, gamma, decay, seed: Training settings from the UI;
+            coerced to ``int`` / ``float`` before use.
+        progress: Gradio progress reporter.
+
+    Returns:
+        ``(gif_path, train_fig, heatmap_fig, stats_md)``.
+    """
+    progress(0.05, desc="Building maze…")
+    size = DIFFICULTY[difficulty]
+    style = MAZE_STYLE[maze_style]
+    algo = ALGO_MAP[algo_label]
+    maze_seed = int(seed)
+    env = _make_env(size, style, maze_seed)
+    display_name = algo_label.split("(")[0].strip()
+    progress(0.15, desc=f"Training {display_name}…")
+    agent, rewards = _train(
+        env, algo, int(episodes), float(alpha),
+        float(gamma), float(decay), maze_seed,
+    )
+    progress(0.75, desc="Rendering solution…")
+    gif_env = _make_env(size, style, maze_seed)
+    gif_path = make_solution_gif(gif_env, agent, fps=7, label=display_name)
+    progress(0.85, desc="Building charts…")
+    score_env = _make_env(size, style, maze_seed)
+    path = _collect_path(score_env, agent)
+    sc = score_run(path, score_env.goal, rewards, score_env.n_states)
+    train_fig = make_training_chart({display_name: rewards})
+    heatmap_env = _make_env(size, style, maze_seed)
+    heatmap_fig = make_qvalue_heatmap(heatmap_env, agent)
+    stats_md = f"""
+### {sc['grade']} — {sc['verdict']}
+| | |
+|---|---|
+| **Solved** | {"✅ Yes" if sc['solved'] else "❌ No"} |
+| **Steps taken** | `{sc['steps']}` |
+| **Efficiency score** | `{sc['efficiency']}%` |
+| **Avg reward (final 20%)** | `{sc['avg_reward']:.1f}` |
+| **Episodes trained** | `{int(episodes)}` |
+| **Maze size** | `{env.shape[0]} × {env.shape[1]}` cells |
+> **Efficiency** compares your bot's path length to the ideal shortest path.
+> 100% = perfect. 0% = didn't make it.
+"""
+    progress(1.0, desc="Done!")
+    return gif_path, train_fig, heatmap_fig, stats_md
+# ── Algorithm Race callback ───────────────────────────────────────────────────
+def cb_race(
+    difficulty: str,
+    maze_style: str,
+    episodes: int,
+    run_mc: bool,
+    progress: gr.Progress = gr.Progress(),
+):
+    """Race callback: train the algorithms on one maze and announce a winner.
+
+    Q-Learning and SARSA always run; Monte Carlo runs only when ``run_mc`` is
+    true. Each gets a fresh maze built from the fixed seed 77 and the same
+    hyperparameters (alpha 0.1, gamma 0.95, decay 0.995). An algorithm's score
+    is the mean reward over the last fifth of its episodes (at least one).
+
+    Returns:
+        ``(fig, result_md)``: the race chart and a markdown results table.
+    """
+    size = DIFFICULTY[difficulty]
+    style = MAZE_STYLE[maze_style]
+    seed = 77
+    def run_one(algo_key):
+        # Fresh maze per contestant so nobody inherits environment state.
+        _, history = _train(_make_env(size, style, seed), algo_key,
+                            int(episodes), 0.1, 0.95, 0.995, seed)
+        return history
+    def tail_mean(history):
+        return float(np.mean(history[-max(1, len(history)//5):]))
+    progress(0.1, desc="Training Q-Learning…")
+    rq = run_one("qlearning")
+    progress(0.4, desc="Training SARSA…")
+    rs = run_one("sarsa")
+    rc, name_c = None, ""
+    if run_mc:
+        progress(0.65, desc="Training Monte Carlo…")
+        rc = run_one("montecarlo")
+        name_c = "Monte Carlo"
+    progress(0.9, desc="Building race chart…")
+    fig = make_race_chart(rq, "Q-Learning", rs, "SARSA", rc, name_c)
+    # Winner: highest tail-mean reward.
+    scores = {"Q-Learning": tail_mean(rq), "SARSA": tail_mean(rs)}
+    if rc:
+        scores["Monte Carlo"] = tail_mean(rc)
+    winner = max(scores, key=scores.get)
+    table_rows = "".join(
+        f"| {'🥇 ' if name == winner else ''}{name} | `{score:.1f}` |" + chr(10)
+        for name, score in scores.items()
+    )
+    result_md = f"""
+### 🏆 Race Result
+| Algorithm | Final Score |
+|---|---|
+{table_rows}
+**Winner: {winner}** with a final average reward of `{scores[winner]:.1f}`
+> All algorithms trained on the same maze with identical hyperparameters.
+> Final score = average reward over the last 20% of episodes.
 """
+    progress(1.0)
+    return fig, result_md
+# ── CSS ───────────────────────────────────────────────────────────────────────
+CSS = """
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&family=JetBrains+Mono:wght@400;600&display=swap');
+*, *::before, *::after { box-sizing: border-box; }
+body, .gradio-container {
+    background: #0d1117 !important;
+    color: #c9d1d9 !important;
+    font-family: 'Inter', sans-serif !important;
+}
+.gradio-container { max-width: 1100px !important; margin: 0 auto !important; }
+/* Hero */
+.hero { text-align:center; padding:2rem 1rem 1rem; }
+.hero-title {
+    font-size: clamp(2rem, 5vw, 3rem); font-weight: 700;
+    background: linear-gradient(135deg, #3fb950, #58a6ff, #ffa657);
+    -webkit-background-clip: text; -webkit-text-fill-color: transparent;
+    margin: 0 0 0.4rem;
+}
+.hero-sub { color: #484f58; font-size: 0.95rem; letter-spacing:0.02em; }
+/* Tabs */
+.tab-nav { border-bottom: 1px solid #21262d !important; background: transparent !important; }
+.tab-nav button {
+    font-family: 'Inter', sans-serif !important; font-size: 0.85rem !important;
+    font-weight: 500 !important; color: #484f58 !important;
+    background: transparent !important; border: none !important;
+    padding: 0.7rem 1.1rem !important;
+}
+.tab-nav button.selected { color: #3fb950 !important; border-bottom: 2px solid #3fb950 !important; }
+.tab-nav button:hover { color: #8b949e !important; }
+/* Cards */
+.info-card {
+    background: #161b22; border: 1px solid #21262d; border-radius: 10px;
+    padding: 1.1rem;
+}
+.info-card-icon { font-size: 1.8rem; margin-bottom:0.4rem; }
+.info-card-title { font-weight: 600; color: #e6edf3; font-size: 0.95rem; margin-bottom:0.3rem; }
+.info-card-body { color: #8b949e; font-size: 0.83rem; line-height: 1.6; }
+/* Algo cards */
+.algo-card {
+    background: #161b22; border: 1px solid #21262d; border-radius: 10px;
+    padding: 1rem; margin-bottom: 0.5rem;
+}
+.algo-name { font-weight: 600; color: #e6edf3; font-size: 0.9rem; }
+.algo-desc { color: #8b949e; font-size: 0.8rem; line-height:1.5; margin-top:0.2rem; }
+.algo-tag {
+    display: inline-block; font-size: 0.68rem; padding: 2px 8px;
+    border-radius: 20px; margin-top: 0.4rem;
+}
+.tag-green  { background:#0d2d17; color:#3fb950; border:1px solid #1a5c2e; }
+.tag-blue   { background:#0d1f38; color:#58a6ff; border:1px solid #1a4a7a; }
+.tag-orange { background:#2d1c06; color:#ffa657; border:1px solid #5c3a12; }
+/* Grade badge */
+.grade-badge {
+    display:inline-block; font-size:2.5rem; font-weight:700;
+    font-family:'JetBrains Mono',monospace;
+}
+/* Buttons */
+button.primary {
+    font-family: 'Inter', sans-serif !important; font-weight: 600 !important;
+    background: linear-gradient(135deg, #238636, #2ea043) !important;
+    color: #ffffff !important; border: none !important;
+    border-radius: 6px !important; font-size: 0.9rem !important;
+    transition: opacity 0.2s !important;
+}
+button.primary:hover { opacity: 0.85 !important; }
+button.secondary {
+    background: #161b22 !important; color: #58a6ff !important;
+    border: 1px solid #30363d !important; border-radius: 6px !important;
+    font-family: 'Inter', sans-serif !important;
+}
+button.stop {
+    background: #1c0d0d !important; color: #f78166 !important;
+    border: 1px solid #6e2b2b !important; border-radius: 6px !important;
+}
+/* Labels */
+label span { font-family:'Inter',sans-serif !important;
+             font-size:0.82rem !important; color:#8b949e !important; }
+/* Slider */
+input[type=range] { -webkit-appearance:none; height:4px;
+                    background:#21262d; border-radius:2px; }
+input[type=range]::-webkit-slider-thumb {
+    -webkit-appearance:none; width:16px; height:16px;
+    border-radius:50%; background:#3fb950; cursor:pointer; border:2px solid #0d1117;
+}
+/* Textarea */
+textarea { font-family:'JetBrains Mono',monospace !important;
+           font-size:0.82rem !important; background:#0d1117 !important;
+           color:#3fb950 !important; border:1px solid #21262d !important;
+           border-radius:6px !important; }
+/* Markdown */
+.gradio-container h2 { color: #3fb950 !important; }
+.gradio-container h3 { color: #58a6ff !important; }
+.gradio-container p  { color: #8b949e !important; }
+table { width:100%; border-collapse:collapse; }
+th { background:#161b22; color:#3fb950; font-size:0.78rem;
+     text-align:left; padding:8px 12px; border-bottom:1px solid #21262d; }
+td { padding:8px 12px; border-bottom:1px solid #0d1117;
+     color:#e6edf3; font-size:0.85rem; }
+blockquote { border-left:3px solid #3fb950; padding-left:1rem;
+             color:#484f58 !important; margin:0.5rem 0; }
+footer { display:none !important; }
+.gradio-container .block { background:transparent !important; border:none !important; }
+"""
+# ── Build UI ──────────────────────────────────────────────────────────────────
+with gr.Blocks(title="🤖 Maze Runner — RL Playground") as demo:
+    gr.HTML("""
+    <div class="hero">
+        <div class="hero-title">🤖 Maze Runner</div>
+        <div class="hero-sub">An AI that learns to escape mazes — watch it happen in real time</div>
+    </div>
+    """)
+    with gr.Tabs():
+        # ══════════════════════════════════════════════════════════════════
+        # Tab 1 — Welcome
+        # ══════════════════════════════════════════════════════════════════
+        with gr.Tab("🏠 Welcome"):
+            gr.HTML("""
+            <div style="text-align:center;padding:0.5rem 0 1.5rem;">
+                <p style="color:#8b949e;font-size:1rem;max-width:580px;margin:0 auto;">
+                    A tiny AI robot is dropped into a maze. It knows nothing.
+                    Through thousands of attempts — hitting walls, finding dead ends,
+                    occasionally stumbling upon the exit — it slowly builds a mental map
+                    and learns the perfect escape route.
+                </p>
+            </div>
+            """)
+            gr.HTML("""
+            <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1rem;margin-bottom:1.5rem;">
+                <div class="info-card">
+                    <div class="info-card-icon">🗺️</div>
+                    <div class="info-card-title">The Maze</div>
+                    <div class="info-card-body">
+                        A grid of corridors and walls. The bot starts at
+                        <strong style="color:#58a6ff">S</strong> and must reach
+                        <strong style="color:#f78166">G</strong>.
+                        It can only see its own position — no map, no cheating.
+                    </div>
+                </div>
+                <div class="info-card">
+                    <div class="info-card-icon">🤖</div>
+                    <div class="info-card-title">The Bot</div>
+                    <div class="info-card-body">
+                        At each step it chooses: go up, down, left, or right.
+                        Hit a wall? Penalty. Reach the goal? Big reward!
+                        It remembers what worked and what didn't.
+                    </div>
+                </div>
+                <div class="info-card">
+                    <div class="info-card-icon">🧠</div>
+                    <div class="info-card-title">The Learning</div>
+                    <div class="info-card-body">
+                        Each attempt updates a "score table" for every
+                        position and move. After enough tries, the bot
+                        always picks the move with the highest score — the optimal path.
+                    </div>
+                </div>
+            </div>
+            """)
+            gr.HTML("""
+            <div style="background:#161b22;border:1px solid #21262d;border-radius:10px;padding:1.2rem;margin-bottom:1rem;">
+                <div style="font-weight:600;color:#e6edf3;margin-bottom:1rem;">🧠 Choose your Bot's Brain</div>
+                <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:0.8rem;">
+                    <div class="algo-card">
+                        <div class="algo-name">Q-Learning</div>
+                        <div class="algo-desc">
+                            Updates its score table <em>immediately</em> after every step.
+                            Fast learner. Best for most mazes.
+                        </div>
+                        <span class="algo-tag tag-green">⚡ Recommended</span>
+                    </div>
+                    <div class="algo-card">
+                        <div class="algo-name">SARSA</div>
+                        <div class="algo-desc">
+                            Updates based on the move it <em>actually took</em> next,
+                            not just the best possible. More cautious, avoids risky paths.
+                        </div>
+                        <span class="algo-tag tag-blue">🎯 Cautious</span>
+                    </div>
+                    <div class="algo-card">
+                        <div class="algo-name">Monte Carlo</div>
+                        <div class="algo-desc">
+                            Plays out the <em>entire episode</em> first, then
+                            updates everything at once. Needs more episodes to converge.
+                        </div>
+                        <span class="algo-tag tag-orange">🎲 Explorer</span>
+                    </div>
+                </div>
+            </div>
+            """)
+            gr.HTML("""
+            <div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;">
+                <div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:10px;padding:1rem;">
+                    <div style="font-weight:600;color:#3fb950;margin-bottom:0.4rem;">🗺️ How to use this app</div>
+                    <ol style="color:#8b949e;font-size:0.85rem;line-height:1.8;margin:0;padding-left:1.2rem;">
+                        <li>Go to <strong style="color:#e6edf3">🎮 Playground</strong> tab</li>
+                        <li>Pick a difficulty and maze style</li>
+                        <li>Choose a brain and hit <strong style="color:#3fb950">Train & Watch!</strong></li>
+                        <li>Watch the animated replay</li>
+                        <li>Try <strong style="color:#e6edf3">🏁 Algorithm Race</strong> to compare all three</li>
+                    </ol>
+                </div>
+                <div style="background:#0d1f38;border:1px solid #1a4a7a;border-radius:10px;padding:1rem;">
+                    <div style="font-weight:600;color:#58a6ff;margin-bottom:0.4rem;">💡 Fun facts</div>
+                    <ul style="color:#8b949e;font-size:0.85rem;line-height:1.8;margin:0;padding-left:1.2rem;">
+                        <li>This same idea trains robots, game AIs, and self-driving cars</li>
+                        <li>DeepMind's AlphaGo used a version of Q-Learning</li>
+                        <li>A 17×17 maze has 289 possible positions to learn</li>
+                        <li>The bot gets worse before it gets better — that's normal!</li>
+                    </ul>
+                </div>
+            </div>
+            """)
+        # ══════════════════════════════════════════════════════════════════
+        # Tab 2 — Playground
+        # ══════════════════════════════════════════════════════════════════
+        with gr.Tab("🎮 Playground"):
+            gr.HTML("""
+            <div style="padding:0.3rem 0 1rem;">
+                <div style="font-size:1.05rem;font-weight:600;color:#e6edf3;">
+                    Build a maze, pick a brain, watch it learn
+                </div>
+                <div style="color:#484f58;font-size:0.85rem;margin-top:0.2rem;">
+                    The animated replay shows the final learned path after training.
+                </div>
+            </div>
+            """)
+            with gr.Row():
+                # ── Controls ──────────────────────────────────────────────
+                with gr.Column(scale=1, min_width=300):
+                    gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin-bottom:0.5rem;">🗺️ MAZE SETUP</div>')
+                    difficulty = gr.Radio(
+                        list(DIFFICULTY.keys()),
+                        value="🐢 Medium  (9×9)",
+                        label="Difficulty",
+                    )
+                    maze_style = gr.Radio(
+                        list(MAZE_STYLE.keys()),
+                        value="🏰 Corridors  (DFS)",
+                        label="Maze style",
+                        info="Corridors = proper winding paths · Open = random walls"
+                    )
+                    gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin:0.8rem 0 0.5rem;">🧠 BOT BRAIN</div>')
+                    algo = gr.Radio(
+                        list(ALGO_MAP.keys()),
+                        value="🧠 Q-Learning  (recommended)",
+                        label="Algorithm",
+                    )
+                    gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin:0.8rem 0 0.5rem;">⚙️ TRAINING</div>')
+                    episodes = gr.Slider(100, 3000, value=800, step=100,
+                                         label="Training episodes",
+                                         info="More = smarter bot, but slower")
+                    with gr.Accordion("🔬 Advanced settings", open=False):
+                        alpha  = gr.Slider(0.01, 0.5,  value=0.1,   step=0.01, label="Learning speed (α)")
+                        gamma  = gr.Slider(0.5,  0.99, value=0.95,  step=0.01, label="Future planning (γ)")
+                        decay  = gr.Slider(0.90, 0.999,value=0.995, step=0.001,label="Exploration decay")
+                        seed   = gr.Slider(0,    100,  value=42,    step=1,    label="Random seed")
+                    btn_solve = gr.Button("🚀 Train & Watch!", variant="primary")
+                # ── Outputs ───────────────────────────────────────────────
+                with gr.Column(scale=2):
+                    play_stats = gr.Markdown("*Configure your maze and hit Train & Watch!*")
+                    with gr.Row():
+                        play_gif = gr.Image(
+                            label="🎬 Bot solving the maze (animated)",
+                            type="filepath", height=360,
+                        )
+            with gr.Row():
+                play_train_fig  = gr.Plot(label="📈 Training progress")
+                play_heatmap    = gr.Plot(label="🌡️ Q-value map (what the bot learned)")
+            # hidden state defaults for advanced
+            alpha_h = gr.State(0.1)
+            gamma_h = gr.State(0.95)
+            decay_h = gr.State(0.995)
+            seed_h  = gr.State(42)
+            btn_solve.click(
+                cb_solve,
+                inputs=[difficulty, maze_style, algo, episodes,
+                        alpha, gamma, decay, seed],
+                outputs=[play_gif, play_train_fig, play_heatmap, play_stats],
+            )
+        # ══════════════════════════════════════════════════════════════════
+        # Tab 3 — Algorithm Race
+        # ══════════════════════════════════════════════════════════════════
+        # Algorithm Race tab: a static intro, one summary card per algorithm, the race
+        # controls, and a single button that runs cb_race.
+        with gr.Tab("🏁 Algorithm Race"):
+            gr.HTML("""
+            <div style="padding:0.3rem 0 1rem;">
+                <div style="font-size:1.05rem;font-weight:600;color:#e6edf3;">
+                    Head-to-head: which brain learns fastest?
+                </div>
+                <div style="color:#484f58;font-size:0.85rem;margin-top:0.2rem;">
+                    All algorithms train on the same maze with identical settings —
+                    the only variable is the learning strategy.
+                </div>
+            </div>
+            """)
+            # Three colour-coded cards; purely decorative, no components are bound to them.
+            gr.HTML("""
+            <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:0.8rem;margin-bottom:1rem;">
+                <div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:8px;padding:0.8rem;text-align:center;">
+                    <div style="color:#3fb950;font-size:1.2rem;font-weight:700;">Q-Learning</div>
+                    <div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">Off-policy · Fast update · Optimistic</div>
+                </div>
+                <div style="background:#0d1f38;border:1px solid #1a4a7a;border-radius:8px;padding:0.8rem;text-align:center;">
+                    <div style="color:#58a6ff;font-size:1.2rem;font-weight:700;">SARSA</div>
+                    <div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">On-policy · Careful update · Conservative</div>
+                </div>
+                <div style="background:#2d1c06;border:1px solid #5c3a12;border-radius:8px;padding:0.8rem;text-align:center;">
+                    <div style="color:#ffa657;font-size:1.2rem;font-weight:700;">Monte Carlo</div>
+                    <div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">Episodic · Full return · Unbiased</div>
+                </div>
+            </div>
+            """)
+            with gr.Row():
+                # Left column: race settings. Choices reuse the DIFFICULTY and MAZE_STYLE
+                # dict keys, so the default values must match those keys exactly.
+                with gr.Column(scale=1, min_width=260):
+                    race_diff  = gr.Radio(list(DIFFICULTY.keys()),
+                                          value="🐢 Medium  (9×9)", label="Maze difficulty")
+                    race_style = gr.Radio(list(MAZE_STYLE.keys()),
+                                          value="🏰 Corridors  (DFS)", label="Maze style")
+                    race_eps   = gr.Slider(200, 2000, value=600, step=100,
+                                           label="Episodes per algorithm")
+                    # Monte Carlo can be switched off from the race via this checkbox.
+                    race_mc    = gr.Checkbox(label="Include Monte Carlo (slower)", value=True)
+                    btn_race   = gr.Button("🏁 Start Race!", variant="primary")
+                # Right column: Markdown area that cb_race fills with its result text.
+                with gr.Column(scale=2):
+                    race_result = gr.Markdown("*Click Start Race to run the comparison.*")
+            race_fig = gr.Plot(label="Race Results")
+            # cb_race receives the four controls in this order and returns (figure, markdown).
+            btn_race.click(
+                cb_race,
+                inputs=[race_diff, race_style, race_eps, race_mc],
+                outputs=[race_fig, race_result],
+            )
+        # ══════════════════════════════════════════════════════════════════
+        # Tab 4 — How it Works
+        # ══════════════════════════════════════════════════════════════════
+        with gr.Tab("🧠 How it Works"):
+            gr.HTML("""
+            <div style="max-width:700px;margin:0 auto;padding:1rem 0;">
+            <h2 style="color:#3fb950;font-size:1.3rem;margin-bottom:0.3rem;">The Big Idea</h2>
+            <p style="color:#8b949e;line-height:1.7;">
+                The bot doesn't know anything about the maze at the start. It just knows
+                4 possible moves and gets a number (reward) after each step.
+                <strong style="color:#e6edf3">Negative number = bad move. Positive = good move.</strong>
+                The goal: find the sequence of moves that gets the most reward.
+            </p>
+            <h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">The Score Table (Q-Table)</h2>
+            <p style="color:#8b949e;line-height:1.7;">
+                The bot keeps a table with one row per maze cell and 4 columns (one per direction).
+                Each entry stores <em>how good it thinks that move is from that cell</em>.
+                At the start, everything is 0. After training, the table holds the bot's
+                entire learned strategy. The Q-value heatmap in the Playground shows this table visually.
+            </p>
+            <div style="background:#161b22;border:1px solid #21262d;border-radius:8px;padding:1rem;margin:1rem 0;font-family:'JetBrains Mono',monospace;font-size:0.82rem;color:#3fb950;">
+Q[current_cell][move] += learning_speed × (<br>
+&nbsp;&nbsp;reward_got + future_discount × best_Q[next_cell] − Q[current_cell][move]<br>
+)
+            </div>
+            <h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">Exploration vs Exploitation</h2>
+            <p style="color:#8b949e;line-height:1.7;">
+                Early in training, the bot tries <strong style="color:#e6edf3">random moves</strong> (exploration)
+                — it doesn't know enough to trust its table yet. Over time, it relies more on what
+                it's learned (exploitation). This is controlled by <strong style="color:#e6edf3">epsilon (ε)</strong>,
+                which starts near 1.0 (100% random) and decays toward 0 (always use best known move).
+            </p>
+            <h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">Why does reward go negative first?</h2>
+            <p style="color:#8b949e;line-height:1.7;">
+                Each step costs −1 (time penalty) and hitting a wall costs −5.
+                A random bot hits a <em>lot</em> of walls and takes forever to find the exit,
+                so early rewards are very negative. As it learns, fewer walls are hit and
+                the path shortens — reward climbs toward 0 and eventually turns positive when
+                it reliably reaches the goal.
+            </p>
+            <div style="display:grid;grid-template-columns:1fr 1fr;gap:0.8rem;margin-top:1.2rem;">
+                <div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:8px;padding:0.9rem;">
+                    <div style="color:#3fb950;font-weight:600;margin-bottom:0.4rem;">Q-Learning vs SARSA</div>
+                    <div style="color:#8b949e;font-size:0.82rem;line-height:1.6;">
+                        Q-Learning always updates toward the <em>best possible</em> next action —
+                        even if it wouldn't actually take that action. SARSA updates toward
+                        the action it <em>will actually take</em>. This makes SARSA more cautious near walls.
+                    </div>
+                </div>
+                <div style="background:#2d1c06;border:1px solid #5c3a12;border-radius:8px;padding:0.9rem;">
+                    <div style="color:#ffa657;font-weight:600;margin-bottom:0.4rem;">Why Monte Carlo is slow</div>
+                    <div style="color:#8b949e;font-size:0.82rem;line-height:1.6;">
+                        MC waits until the episode <em>ends</em> before updating any scores.
+                        On large mazes where early episodes never reach the goal,
+                        it gets zero signal for a long time. But once it starts solving,
+                        its estimates are very accurate.
+                    </div>
+                </div>
+            </div>
+            </div>
+            """)
+    gr.HTML("""
+    <div style="text-align:center;color:#21262d;font-size:0.75rem;
+                padding:1.5rem 0 0.5rem;border-top:1px solid #161b22;margin-top:1rem;">
+        Built with Q-Learning · SARSA · Monte Carlo · Gymnasium · Gradio
+    </div>
+    """)
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, css=CSS)

maze/__init__.py ADDED Viewed

File without changes

maze/env.py ADDED Viewed

	@@ -0,0 +1,62 @@

+"""
+Enhanced MazeEnv — works with both DFS corridor mazes and open random mazes.
+"""
+from __future__ import annotations
+import numpy as np
+import gymnasium as gym
+from gymnasium import spaces
+class MazeEnv(gym.Env):
+    metadata = {"render_modes": []}
+    def __init__(self, grid: np.ndarray):
+        super().__init__()
+        self.grid = grid.copy()
+        self.H, self.W = grid.shape
+        self.n_states = self.H * self.W
+        self.start = (0, 0)
+        self.goal = (self.H - 1, self.W - 1)
+        # Ensure start/goal are open
+        self.grid[self.start] = 0
+        self.grid[self.goal] = 0
+        self.observation_space = spaces.Discrete(self.n_states)
+        self.action_space = spaces.Discrete(4)  # up down left right
+        self._MOVES = [(-1, 0), (1, 0), (0, -1), (0, 1)]
+        self.agent_pos = self.start
+    def _to_state(self, r: int, c: int) -> int:
+        return r * self.W + c
+    def _from_state(self, s: int) -> tuple[int, int]:
+        return divmod(s, self.W)
+    def reset(self, *, seed=None, options=None):
+        super().reset(seed=seed)
+        self.agent_pos = self.start
+        return self._to_state(*self.agent_pos), {}
+    def step(self, action: int):
+        dr, dc = self._MOVES[action]
+        r, c = self.agent_pos
+        nr, nc = r + dr, c + dc
+        # Boundary / wall check
+        if 0 <= nr < self.H and 0 <= nc < self.W and self.grid[nr, nc] == 0:
+            self.agent_pos = (nr, nc)
+            reward = -1
+        else:
+            reward = -5  # bump penalty (less harsh than before so agent explores)
+        done = self.agent_pos == self.goal
+        if done:
+            reward = 100
+        return self._to_state(*self.agent_pos), reward, done, False, {}
+    @property
+    def shape(self) -> tuple[int, int]:
+        return self.H, self.W

maze/generator.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""
+Maze generation using recursive DFS backtracker.
+Guarantees a fully connected, solvable maze with proper corridors.
+"""
+from __future__ import annotations
+import numpy as np
+from enum import IntEnum
+class Cell(IntEnum):
+    """Integer codes for the kinds of maze cell."""
+    OPEN = 0   # same value the generators in this module write for open cells
+    WALL = 1   # same value the generators in this module write for walls
+    # NOTE(review): the generators below never write START/GOAL — confirm who uses them.
+    START = 2
+    GOAL = 3
+# Size presets keyed by display label; each entry gives the grid "size" and a
+# "wall_density" (0.0 for every preset).
+# NOTE(review): these labels use a single space before "(" while the UI defaults
+# elsewhere use two — presumably the UI reads a different dict; verify before reuse.
+PRESETS = {
+    "🐣 Tiny (5×5)":    {"size": 5,  "wall_density": 0.0},
+    "🐇 Small (7×7)":   {"size": 7,  "wall_density": 0.0},
+    "🐢 Medium (9×9)":  {"size": 9,  "wall_density": 0.0},
+    "🦊 Large (13×13)": {"size": 13, "wall_density": 0.0},
+    "🐉 XL (17×17)":    {"size": 17, "wall_density": 0.0},
+}
+# Colour themes keyed by display label; each maps "wall", "open" and "path" to a hex colour.
+THEMES = {
+    "🏰 Classic":      {"wall": "#2d3561", "open": "#f5f0e8", "path": "#e94560"},
+    "🌲 Forest":       {"wall": "#1b4332", "open": "#d8f3dc", "path": "#f77f00"},
+    "🌌 Space":        {"wall": "#03045e", "open": "#0d1b2a", "path": "#00b4d8"},
+    "🔥 Lava":         {"wall": "#370617", "open": "#ffd166", "path": "#ef233c"},
+}
+def generate_dfs_maze(size: int, seed: int = 42) -> np.ndarray:
+    """
+    DFS recursive backtracker on a (size×size) grid.
+    Works on a logical grid where passages exist between cells.
+    Returns a (2*size-1) × (2*size-1) wall grid where 0=open, 1=wall.
+    """
+    rng = np.random.default_rng(seed)
+    # Full grid size (cells + walls between them)
+    H = W = 2 * size - 1
+    grid = np.ones((H, W), dtype=np.int8)  # start all walls
+    # Mark logical cells as open
+    for r in range(size):
+        for c in range(size):
+            grid[2*r, 2*c] = 0
+    visited = np.zeros((size, size), dtype=bool)
+    stack = [(0, 0)]
+    visited[0, 0] = True
+    while stack:
+        r, c = stack[-1]
+        neighbors = []
+        for dr, dc in [(-1,0),(1,0),(0,-1),(0,1)]:
+            nr, nc = r+dr, c+dc
+            if 0 <= nr < size and 0 <= nc < size and not visited[nr, nc]:
+                neighbors.append((nr, nc, dr, dc))
+        if neighbors:
+            nr, nc, dr, dc = neighbors[rng.integers(len(neighbors))]
+            # Carve passage
+            grid[2*r+dr, 2*c+dc] = 0
+            visited[nr, nc] = True
+            stack.append((nr, nc))
+        else:
+            stack.pop()
+    return grid
+def generate_open_maze(size: int, wall_frac: float = 0.20, seed: int = 42) -> np.ndarray:
+    """Simple random-wall maze — fast, less structured."""
+    rng = np.random.default_rng(seed)
+    grid = np.zeros((size, size), dtype=np.int8)
+    n_walls = int(size * size * wall_frac)
+    cells = [(r, c) for r in range(size) for c in range(size)
+             if not (r == 0 and c == 0) and not (r == size-1 and c == size-1)]
+    rng.shuffle(cells)
+    for r, c in cells[:n_walls]:
+        grid[r, c] = 1
+    return grid

viz/__init__.py ADDED Viewed

File without changes

viz/renderer.py ADDED Viewed

	@@ -0,0 +1,319 @@

+"""
+All visual output: animated GIF of the agent solving the maze,
+training curve chart, Q-value heatmap, algorithm race chart.
+"""
+from __future__ import annotations
+import io
+import tempfile
+import numpy as np
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+import matplotlib.cm as mpl_cm
+from matplotlib.figure import Figure as MplFigure
+from matplotlib.axes import Axes as MplAxes
+from matplotlib.colors import LinearSegmentedColormap
+import PIL.Image
+import PIL.ImageDraw
+from maze.env import MazeEnv
+from agents.base import TabularAgent
+# ── Palette ───────────────────────────────────────────────────────────────────
+BG       = "#0d1117"
+BG2      = "#161b22"
+GRID_C   = "#21262d"
+WALL_C   = "#1f6feb"
+OPEN_C   = "#0d1117"
+PATH_C   = "#3fb950"
+START_C  = "#58a6ff"
+GOAL_C   = "#f78166"
+AGENT_C  = "#ffa657"
+TEXT_C   = "#c9d1d9"
+DIM_C    = "#484f58"
+def _fig_to_pil(fig: MplFigure) -> PIL.Image.Image:
+    buf = io.BytesIO()
+    fig.savefig(buf, format="png", dpi=120, bbox_inches="tight",
+                facecolor=fig.get_facecolor())
+    buf.seek(0)
+    plt.close(fig)
+    return PIL.Image.open(buf).convert("RGB")
+def _draw_maze_frame(
+    ax: MplAxes,
+    grid: np.ndarray,
+    path: list[tuple[int, int]],
+    current_step: int,
+    H: int, W: int,
+    show_heatmap: np.ndarray | None = None,
+) -> None:
+    ax.set_facecolor(BG)
+    ax.set_xlim(-0.5, W - 0.5)
+    ax.set_ylim(H - 0.5, -0.5)
+    ax.set_aspect("equal")
+    ax.axis("off")
+    # Draw cells
+    for r in range(H):
+        for c in range(W):
+            is_wall = grid[r, c] == 1
+            if show_heatmap is not None and not is_wall:
+                val = float(show_heatmap[r, c])
+                intensity = np.clip(val, 0, 1)
+                color = mpl_cm.get_cmap("YlOrRd")(0.2 + 0.8 * intensity)
+            else:
+                color = "#2d333b" if is_wall else BG2
+            rect = patches.FancyBboxPatch(
+                (c - 0.45, r - 0.45), 0.90, 0.90,
+                boxstyle="round,pad=0.04",
+                facecolor=color,
+                edgecolor=GRID_C, linewidth=0.4,
+            )
+            ax.add_patch(rect)
+    # Start & goal
+    ax.add_patch(patches.Circle((0, 0), 0.35, color=START_C, zorder=3))
+    ax.add_patch(patches.Circle((W-1, H-1), 0.35, color=GOAL_C, zorder=3))
+    ax.text(0, 0, "S", ha="center", va="center", color="white",
+            fontsize=7, fontweight="bold", zorder=4)
+    ax.text(W-1, H-1, "G", ha="center", va="center", color="white",
+            fontsize=7, fontweight="bold", zorder=4)
+    # Walked path (up to current_step)
+    walked = path[:current_step]
+    if len(walked) > 1:
+        xs = [c for _, c in walked]
+        ys = [r for r, _ in walked]
+        ax.plot(xs, ys, color=PATH_C, linewidth=2.5, alpha=0.7,
+                zorder=2, solid_capstyle="round")
+    # Current agent position
+    if walked:
+        ar, ac = walked[-1]
+        ax.add_patch(patches.Circle((ac, ar), 0.32, color=AGENT_C, zorder=5))
+        ax.text(ac, ar, "●", ha="center", va="center",
+                color="white", fontsize=8, zorder=6)
+def make_solution_gif(
+    env: MazeEnv,
+    agent: TabularAgent,
+    fps: int = 8,
+    label: str = "",
+) -> str:
+    """Greedy rollout → animated GIF of agent walking the maze."""
+    # Collect path
+    state, _ = env.reset()
+    path: list[tuple[int, int]] = [env.start]
+    for _ in range(env.n_states * 3):
+        action = agent.greedy_action(state)
+        state, _, done, _, _ = env.step(action)
+        path.append(env._from_state(state))
+        if done:
+            break
+    H, W = env.shape
+    pil_frames: list[PIL.Image.Image] = []
+    for step in range(1, len(path) + 1):
+        fig, ax = plt.subplots(figsize=(max(4, W * 0.55), max(4, H * 0.55)))
+        fig.patch.set_facecolor(BG)
+        _draw_maze_frame(ax, env.grid, path, step, H, W)
+        status = f"Step {step}/{len(path)-1}"
+        if step == len(path) and path[-1] == env.goal:
+            status = f"🎉 Solved in {len(path)-1} steps!"
+        ax.set_title(f"{label}  {status}", color=TEXT_C, fontsize=9,
+                     pad=6, fontfamily="monospace")
+        pil_frames.append(_fig_to_pil(fig))
+    tmp = tempfile.NamedTemporaryFile(suffix=".gif", delete=False)
+    pil_frames[0].save(
+        tmp.name, save_all=True, append_images=pil_frames[1:],
+        duration=int(1000 / fps), loop=0, optimize=False,
+    )
+    return tmp.name
+def make_training_chart(
+    rewards_dict: dict[str, list[float]],
+    colors: dict[str, str] | None = None,
+) -> MplFigure:
+    """Reward-per-episode curves for one or more algorithms."""
+    default_colors = {"Q-Learning": "#3fb950", "SARSA": "#58a6ff", "Monte Carlo": "#ffa657"}
+    colors = colors or default_colors
+    fig, ax = plt.subplots(figsize=(10, 4))
+    fig.patch.set_facecolor(BG)
+    ax.set_facecolor(BG2)
+    for name, rewards in rewards_dict.items():
+        col = colors.get(name, "#ffffff")
+        eps = list(range(len(rewards)))
+        ax.plot(eps, rewards, color=col, linewidth=0.8, alpha=0.3)
+        if len(eps) > 20:
+            k = max(5, len(eps) // 40)
+            smooth = np.convolve(rewards, np.ones(k) / k, "valid")
+            ax.plot(range(k - 1, len(eps)), smooth, color=col, linewidth=2.2, label=name)
+        else:
+            ax.plot(eps, rewards, color=col, linewidth=2.0, label=name)
+    ax.set_xlabel("Episode", color=DIM_C, fontsize=9)
+    ax.set_ylabel("Total Reward", color=DIM_C, fontsize=9)
+    ax.set_title("Training Progress — Reward per Episode", color=TEXT_C,
+                 fontsize=11, pad=10, fontfamily="monospace")
+    ax.tick_params(colors=DIM_C, labelsize=8)
+    for spine in ax.spines.values():
+        spine.set_color(GRID_C)
+    ax.grid(color=GRID_C, linewidth=0.5, linestyle="--", alpha=0.5)
+    if len(rewards_dict) > 1:
+        ax.legend(facecolor=BG2, edgecolor=GRID_C, labelcolor=TEXT_C, fontsize=9)
+    fig.tight_layout()
+    return fig
+def make_qvalue_heatmap(env: MazeEnv, agent: TabularAgent, title: str = "") -> MplFigure:
+    """Q-value heatmap overlaid on the maze grid."""
+    H, W = env.shape
+    max_q = np.max(agent.Q, axis=1).reshape(H, W)
+    # Normalise to [0, 1] for colour mapping
+    qmin, qmax = max_q.min(), max_q.max()
+    norm_q = (max_q - qmin) / max(qmax - qmin, 1e-8)
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, max(4, H * 0.6)))
+    fig.patch.set_facecolor(BG)
+    # Left: maze with learned path
+    ax1.set_facecolor(BG)
+    state, _ = env.reset()
+    path: list[tuple[int, int]] = [env.start]
+    for _ in range(env.n_states * 3):
+        action = agent.greedy_action(state)
+        state, _, done, _, _ = env.step(action)
+        path.append(env._from_state(state))
+        if done:
+            break
+    _draw_maze_frame(ax1, env.grid, path, len(path), H, W)
+    solved = path[-1] == env.goal
+    ax1.set_title(
+        f"{'Solved' if solved else 'Not solved'} — {len(path)-1} steps",
+        color=TEXT_C, fontsize=10, pad=8, fontfamily="monospace",
+    )
+    # Right: Q-value heatmap
+    ax2.set_facecolor(BG)
+    ax2.set_xlim(-0.5, W - 0.5)
+    ax2.set_ylim(H - 0.5, -0.5)
+    ax2.set_aspect("equal")
+    ax2.axis("off")
+    cmap = LinearSegmentedColormap.from_list("qmap", ["#0d1117", "#1f6feb", "#3fb950"])
+    for r in range(H):
+        for c in range(W):
+            if env.grid[r, c] == 1:
+                color = "#2d333b"
+            else:
+                color = cmap(norm_q[r, c])
+            rect = patches.FancyBboxPatch(
+                (c - 0.45, r - 0.45), 0.90, 0.90,
+                boxstyle="round,pad=0.04",
+                facecolor=color, edgecolor=GRID_C, linewidth=0.4,
+            )
+            ax2.add_patch(rect)
+            if env.grid[r, c] == 0 and H <= 11:
+                ax2.text(c, r, f"{max_q[r,c]:.0f}",
+                         ha="center", va="center", fontsize=5.5,
+                         color="white" if norm_q[r, c] > 0.4 else DIM_C)
+    ax2.set_title("Q-Value Map (brighter = agent prefers this cell)",
+                  color=TEXT_C, fontsize=10, pad=8, fontfamily="monospace")
+    if title:
+        fig.suptitle(title, color=TEXT_C, fontsize=12, fontfamily="monospace", y=1.02)
+    fig.tight_layout()
+    return fig
def make_race_chart(
    rewards_a: list[float], name_a: str,
    rewards_b: list[float], name_b: str,
    rewards_c: list[float] | None = None, name_c: str = "",
) -> MplFigure:
    """Head-to-head convergence race chart.

    Builds a two-panel figure comparing two (optionally three) training runs.

    Left panel: every run's per-episode reward is drawn as a faint raw curve.
    Runs longer than 20 episodes additionally get a bold moving-average curve
    (window ``max(5, episodes // 40)``) which carries the legend entry; shorter
    runs are labelled on the raw curve instead so they still show up in the
    legend.

    Right panel: one bar per run with the mean reward over the last fifth of
    its episodes (at least one episode). A run with no episodes is shown as
    ``0.0`` rather than NaN.

    Args:
        rewards_a: Per-episode rewards of the first run.
        name_a: Display name of the first run.
        rewards_b: Per-episode rewards of the second run.
        name_b: Display name of the second run.
        rewards_c: Optional third run; skipped when ``None`` or empty.
        name_c: Display name of the third run.

    Returns:
        The assembled figure.
    """
    fig, axes = plt.subplots(1, 2, figsize=(13, 4))
    fig.patch.set_facecolor(BG)

    pairs = [(rewards_a, name_a, "#3fb950"), (rewards_b, name_b, "#58a6ff")]
    if rewards_c:
        pairs.append((rewards_c, name_c, "#ffa657"))

    # Left: raw + smoothed reward curves
    ax = axes[0]
    ax.set_facecolor(BG2)
    for rewards, name, col in pairs:
        n_eps = len(rewards)
        eps = list(range(n_eps))
        if n_eps > 20:
            ax.plot(eps, rewards, color=col, linewidth=0.6, alpha=0.25)
            k = max(5, n_eps // 40)
            smooth = np.convolve(rewards, np.ones(k) / k, "valid")
            # "valid" drops the first k-1 points, so the x axis starts at k-1.
            ax.plot(range(k - 1, n_eps), smooth, color=col, linewidth=2.2, label=name)
        else:
            # Too short to smooth: label the raw curve so the run is not
            # missing from the legend.
            ax.plot(eps, rewards, color=col, linewidth=0.6, alpha=0.25, label=name)
    ax.set_title("Reward Convergence", color=TEXT_C, fontsize=10, fontfamily="monospace")
    ax.set_xlabel("Episode", color=DIM_C, fontsize=8)
    ax.set_ylabel("Reward", color=DIM_C, fontsize=8)
    ax.tick_params(colors=DIM_C, labelsize=7)
    for spine in ax.spines.values():
        spine.set_color(GRID_C)
    ax.grid(color=GRID_C, linewidth=0.5, linestyle="--", alpha=0.5)
    # Only draw a legend when something is actually labelled (names may be empty).
    handles, _ = ax.get_legend_handles_labels()
    if handles:
        ax.legend(facecolor=BG2, edgecolor=GRID_C, labelcolor=TEXT_C, fontsize=8)

    # Right: bar chart of final performance
    ax2 = axes[1]
    ax2.set_facecolor(BG2)
    names = [name for _, name, _ in pairs]
    cols = [col for _, _, col in pairs]
    finals = [
        float(np.mean(rewards[-max(1, len(rewards) // 5):])) if len(rewards) else 0.0
        for rewards, _, _ in pairs
    ]
    bars = ax2.bar(names, finals, color=cols, edgecolor=BG, linewidth=0.8, width=0.5)
    # Offset of the value labels scales with the magnitude of the lowest bar.
    label_offset = abs(min(finals)) * 0.02
    for bar, val in zip(bars, finals):
        ax2.text(bar.get_x() + bar.get_width() / 2,
                 bar.get_height() + label_offset,
                 f"{val:.1f}", ha="center", va="bottom",
                 color=TEXT_C, fontsize=9, fontweight="bold")
    ax2.set_title("Final Performance (avg last 20%)", color=TEXT_C,
                  fontsize=10, fontfamily="monospace")
    ax2.tick_params(colors=DIM_C, labelsize=8)
    for spine in ax2.spines.values():
        spine.set_color(GRID_C)
    ax2.grid(axis="y", color=GRID_C, linewidth=0.5, linestyle="--", alpha=0.5)

    fig.tight_layout(pad=2.0)
    return fig
def score_run(path: list[tuple[int, ...]], goal: tuple[int, int],
              rewards: list[float], n_states: int,
              *, optimal_steps: float | None = None) -> dict:
    """Grade a replayed run and summarise it for display.

    Args:
        path: Cells visited during the replay, in order. The run counts as
            solved when the last cell equals ``goal``.
        goal: Target cell.
        rewards: Per-episode training rewards; the mean of the last fifth
            (at least one episode) is reported as ``avg_reward``.
        n_states: Number of states in the maze. Used only for the default
            optimal-length estimate ``sqrt(n_states)``, which is a rough floor.
        optimal_steps: Optional known shortest-path length. When given it
            replaces the ``sqrt(n_states)`` estimate in the efficiency score.

    Returns:
        A dict with keys ``solved`` (bool), ``steps`` (int, never negative),
        ``grade`` ("S", "A", "B", "C" or "F"), ``verdict`` (display text),
        ``efficiency`` (int percentage 0-100, 0 when unsolved) and
        ``avg_reward`` (float).
    """
    solved = bool(path and path[-1] == goal)
    # An empty path means zero moves, not -1.
    steps = max(len(path) - 1, 0)

    optimal_approx = n_states ** 0.5 if optimal_steps is None else optimal_steps
    if solved:
        efficiency = min(100, int(optimal_approx / max(steps, 1) * 100))
    else:
        efficiency = 0

    if rewards:
        tail = max(1, len(rewards) // 5)
        avg_r = float(np.mean(rewards[-tail:]))
    else:
        avg_r = 0.0

    if not solved:
        grade, verdict = "F", "Bot got lost — try more episodes!"
    elif efficiency >= 80:
        grade, verdict = "S", "Perfect pathfinding! Optimal route."
    elif efficiency >= 60:
        grade, verdict = "A", "Excellent! Near-optimal path."
    elif efficiency >= 40:
        grade, verdict = "B", "Good solve — some detours taken."
    else:
        grade, verdict = "C", "Solved but took the scenic route!"

    return {"solved": solved, "steps": steps, "grade": grade,
            "verdict": verdict, "efficiency": efficiency, "avg_reward": avg_r}