Daksh C Jain Claude Sonnet 4.6 committed on
Commit
34aeb9a
·
1 Parent(s): a6e59e3

Transform into interactive RL playground for all audiences

Browse files

- 4-tab UI: Welcome, Playground, Algorithm Race, How it Works
- DFS maze generator (guaranteed solvable corridors) + open random style
- Animated GIF replay of agent solving the maze step by step
- Q-Learning, SARSA, Monte Carlo — all with plain English descriptions
- Algorithm Race tab: head-to-head convergence charts + winner announcement
- Q-value heatmap showing what the agent learned
- Difficulty presets: Tiny 5x5 to XL 17x17
- No jargon: human labels for all controls, story-driven onboarding
- Modular structure: maze/, agents/, viz/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__/
2
+ *.pyc
agents/__init__.py ADDED
File without changes
agents/base.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import numpy as np
3
+
4
+
5
class TabularAgent:
    """Epsilon-greedy agent backed by a dense Q-table.

    The class only stores hyper-parameters and the table ``Q`` (shape
    ``(n_states, n_actions)``, ``float32``, initialised to zero) and selects
    actions. It contains no learning rule of its own; callers write to ``Q``
    directly.
    """

    def __init__(self, n_states: int, n_actions: int,
                 alpha: float = 0.1, gamma: float = 0.95, epsilon: float = 1.0):
        """Store the hyper-parameters and allocate a zero-filled Q-table.

        Args:
            n_states: Number of rows of the Q-table.
            n_actions: Number of columns of the Q-table.
            alpha: Learning rate, stored for use by the caller.
            gamma: Discount factor, stored for use by the caller.
            epsilon: Initial probability of picking a random action.
        """
        self.n_states = n_states
        self.n_actions = n_actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.Q = np.zeros((n_states, n_actions), dtype=np.float32)

    def choose_action(self, state: int, rng: np.random.Generator) -> int:
        """Return an epsilon-greedy action for ``state``.

        With probability ``self.epsilon`` a uniformly random action is drawn
        from ``rng``; otherwise the greedy action is returned.
        """
        if rng.random() < self.epsilon:
            return int(rng.integers(self.n_actions))
        # Delegate so the greedy rule is defined in exactly one place.
        return self.greedy_action(state)

    def greedy_action(self, state: int) -> int:
        """Return the index of the largest Q-value in row ``state``.

        ``np.argmax`` resolves ties in favour of the lowest action index.
        """
        return int(np.argmax(self.Q[state]))

    def decay_epsilon(self, rate: float, min_epsilon: float = 0.01) -> None:
        """Multiply ``epsilon`` by ``rate``, never dropping below ``min_epsilon``.

        Args:
            rate: Multiplicative decay factor applied to the current epsilon.
            min_epsilon: Lower bound for epsilon. Defaults to ``0.01``, the
                floor that was previously hard-coded.
        """
        self.epsilon = max(min_epsilon, self.epsilon * rate)
agents/montecarlo.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import numpy as np
3
+ from agents.base import TabularAgent
4
+ from maze.env import MazeEnv
5
+
6
+
7
def train_montecarlo(
    env: MazeEnv, episodes: int, alpha: float, gamma: float,
    decay: float, seed: int = 0,
) -> tuple[TabularAgent, list[float]]:
    """Train a tabular agent with first-visit Monte Carlo control.

    Each episode is rolled out with the agent's epsilon-greedy policy for at
    most ``env.n_states * 4`` steps. After the episode, every (state, action)
    pair is updated once, using the discounted return that follows its
    *first* occurrence in the episode. ``Q`` is the running average of those
    returns across episodes.

    Args:
        env: Environment providing ``n_states``, ``action_space.n``,
            ``reset()`` and ``step(action)``.
        episodes: Number of episodes to run.
        alpha: Passed to the agent; the sample-average update here does not
            use it.
        gamma: Discount factor used when accumulating returns.
        decay: Multiplicative epsilon decay applied once per episode.
        seed: Seed for the NumPy generator that drives exploration.

    Returns:
        ``(agent, rewards)`` where ``rewards[i]`` is the undiscounted sum of
        rewards collected in episode ``i``.
    """
    agent = TabularAgent(env.n_states, env.action_space.n, alpha, gamma)
    rng = np.random.default_rng(seed)
    returns_sum = np.zeros_like(agent.Q)
    returns_cnt = np.zeros_like(agent.Q)
    rewards: list[float] = []

    for _ in range(episodes):
        state, _info = env.reset()
        episode: list[tuple[int, int, float]] = []
        for _step in range(env.n_states * 4):
            action = agent.choose_action(state, rng)
            next_state, reward, done, _trunc, _step_info = env.step(action)
            episode.append((state, action, reward))
            state = next_state
            if done:
                break

        # Time index at which each (state, action) pair first appears.
        # Walking the episode backwards with a "seen" set would instead pick
        # the LAST occurrence, which is not first-visit MC.
        first_seen: dict[tuple[int, int], int] = {}
        for t, (s, a, _reward) in enumerate(episode):
            first_seen.setdefault((s, a), t)

        # Accumulate the return backwards; update only at first visits.
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            s, a, r = episode[t]
            G = gamma * G + r
            if first_seen[(s, a)] == t:
                returns_sum[s, a] += G
                returns_cnt[s, a] += 1
                agent.Q[s, a] = returns_sum[s, a] / returns_cnt[s, a]

        agent.decay_epsilon(decay)
        rewards.append(sum(r for _s, _a, r in episode))

    return agent, rewards
agents/qlearning.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import numpy as np
3
+ from agents.base import TabularAgent
4
+ from maze.env import MazeEnv
5
+
6
+
7
def train_qlearning(
    env: MazeEnv, episodes: int, alpha: float, gamma: float,
    decay: float, seed: int = 0,
) -> tuple[TabularAgent, list[float]]:
    """Train a tabular agent with one-step Q-learning.

    Every episode runs for at most ``env.n_states * 4`` steps. Actions are
    chosen epsilon-greedily, and each transition updates ``Q`` towards
    ``reward + gamma * max_a Q[next_state, a]``. The bootstrap term is zeroed
    when the step reports ``done``. Epsilon decays once per episode.

    Returns:
        ``(agent, rewards)`` where ``rewards[i]`` is the undiscounted reward
        total of episode ``i``.
    """
    learner = TabularAgent(env.n_states, env.action_space.n, alpha, gamma)
    generator = np.random.default_rng(seed)
    episode_totals = []

    for _episode in range(episodes):
        current, _info = env.reset()
        running_total = 0.0
        step_budget = env.n_states * 4
        for _step in range(step_budget):
            move = learner.choose_action(current, generator)
            following, reward, done, _trunc, _step_info = env.step(move)

            # Off-policy target: bootstrap from the best next action,
            # regardless of which action will actually be taken next.
            best_next = np.max(learner.Q[following])
            td_target = reward + gamma * best_next * (1 - done)
            td_error = td_target - learner.Q[current, move]
            learner.Q[current, move] += alpha * td_error

            current = following
            running_total += reward
            if done:
                break

        learner.decay_epsilon(decay)
        episode_totals.append(running_total)

    return learner, episode_totals
agents/sarsa.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import numpy as np
3
+ from agents.base import TabularAgent
4
+ from maze.env import MazeEnv
5
+
6
+
7
def train_sarsa(
    env: MazeEnv, episodes: int, alpha: float, gamma: float,
    decay: float, seed: int = 0,
) -> tuple[TabularAgent, list[float]]:
    """Train a tabular agent with one-step SARSA.

    Every episode runs for at most ``env.n_states * 4`` steps. The next
    action is drawn from the epsilon-greedy policy *before* the update, and
    the update bootstraps from that action's Q-value. The bootstrap term is
    zeroed when the step reports ``done``. Epsilon decays once per episode.

    Returns:
        ``(agent, rewards)`` where ``rewards[i]`` is the undiscounted reward
        total of episode ``i``.
    """
    learner = TabularAgent(env.n_states, env.action_space.n, alpha, gamma)
    generator = np.random.default_rng(seed)
    episode_totals = []

    for _episode in range(episodes):
        current, _info = env.reset()
        move = learner.choose_action(current, generator)
        running_total = 0.0
        step_budget = env.n_states * 4
        for _step in range(step_budget):
            following, reward, done, _trunc, _step_info = env.step(move)
            following_move = learner.choose_action(following, generator)

            # On-policy target: bootstrap from the action that will really
            # be executed next, not from the greedy one.
            next_q = learner.Q[following, following_move]
            td_target = reward + gamma * next_q * (1 - done)
            td_error = td_target - learner.Q[current, move]
            learner.Q[current, move] += alpha * td_error

            current, move = following, following_move
            running_total += reward
            if done:
                break

        learner.decay_epsilon(decay)
        episode_totals.append(running_total)

    return learner, episode_totals
app.py CHANGED
@@ -1,394 +1,637 @@
 
 
 
 
 
 
 
1
  import numpy as np
2
- import gymnasium as gym
3
- from gymnasium import spaces
4
- import random
5
  import gradio as gr
6
- import matplotlib.pyplot as plt
7
- import matplotlib.patches as mpatches
8
- import io
9
- import base64
10
- import seaborn as sns
11
- from PIL import Image
12
-
13
- # ─────────────────────────────────────────────
14
- # MAZE ENVIRONMENT
15
- # ─────────────────────────────────────────────
16
- class MazeEnv(gym.Env):
17
- """
18
- A grid-based maze environment.
19
- States : row * maze_size + col (integer)
20
- Actions : 0=Up, 1=Down, 2=Left, 3=Right
21
- """
22
- metadata = {"render_modes": []}
23
-
24
- def __init__(self, maze_size: int = 5, num_walls: int = None, seed: int = None):
25
- super().__init__()
26
- self.maze_size = maze_size
27
- self.observation_space = spaces.Discrete(maze_size * maze_size)
28
- self.action_space = spaces.Discrete(4)
29
-
30
- self.start_pos = (0, 0)
31
- self.goal_pos = (maze_size - 1, maze_size - 1)
32
-
33
- self.seed = seed
34
- self.num_walls = num_walls if num_walls is not None else int(maze_size * maze_size * 0.15)
35
-
36
- self.maze = self._generate_maze()
37
- self.state = self.start_pos
38
-
39
- # ── maze generation ──────────────────────
40
- def _generate_maze(self) -> np.ndarray:
41
- maze = np.zeros((self.maze_size, self.maze_size), dtype=np.int8)
42
- rng = random.Random(self.seed)
43
- placed = 0
44
- max_attempts = self.num_walls * 100
45
- attempts = 0
46
- while placed < self.num_walls and attempts < max_attempts:
47
- r = rng.randint(0, self.maze_size - 1)
48
- c = rng.randint(0, self.maze_size - 1)
49
- if (r, c) not in (self.start_pos, self.goal_pos) and maze[r, c] == 0:
50
- maze[r, c] = 1
51
- placed += 1
52
- attempts += 1
53
- return maze
54
-
55
- # ── gym interface ─────────────────────────
56
- def reset(self, seed=None, options=None):
57
- super().reset(seed=seed)
58
- if seed is not None and seed != self.seed:
59
- self.seed = seed
60
- self.maze = self._generate_maze()
61
- self.state = self.start_pos
62
- return self._get_obs(), {}
63
-
64
- def _get_obs(self) -> int:
65
- return self.state[0] * self.maze_size + self.state[1]
66
-
67
- def step(self, action: int):
68
- r, c = self.state
69
- moves = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}
70
- dr, dc = moves[action]
71
- nr, nc = r + dr, c + dc
72
-
73
- # Clamp to grid boundaries
74
- nr = max(0, min(nr, self.maze_size - 1))
75
- nc = max(0, min(nc, self.maze_size - 1))
76
- next_pos = (nr, nc)
77
-
78
- # Wall collision → stay put, penalise
79
- if self.maze[nr, nc] == 1:
80
- reward = -10
81
- # state unchanged
82
- else:
83
- self.state = next_pos
84
- if self.state == self.goal_pos:
85
- reward = 100
86
- return self._get_obs(), reward, True, False, {}
87
- else:
88
- reward = -1
89
-
90
- return self._get_obs(), reward, False, False, {}
91
-
92
-
93
- # ─────────────────────────────────────────────
94
- # Q-LEARNING AGENT
95
- # ─────────────────────────────────────────────
96
- class QLearningAgent:
97
- def __init__(self, num_states: int, num_actions: int,
98
- alpha: float = 0.1, gamma: float = 0.9, epsilon: float = 1.0):
99
- self.num_states = num_states
100
- self.num_actions = num_actions
101
- self.alpha = alpha
102
- self.gamma = gamma
103
- self.epsilon = epsilon
104
- self.q_table = np.zeros((num_states, num_actions))
105
-
106
- def choose_action(self, state: int) -> int:
107
- if random.random() < self.epsilon:
108
- return random.randint(0, self.num_actions - 1)
109
- return int(np.argmax(self.q_table[state]))
110
-
111
- def update(self, state: int, action: int, reward: float, next_state: int):
112
- best_next = np.max(self.q_table[next_state])
113
- td_target = reward + self.gamma * best_next
114
- self.q_table[state, action] += self.alpha * (td_target - self.q_table[state, action])
115
-
116
-
117
- # ─────────────────────────────────────────────
118
- # MONTE CARLO AGENT (first-visit, every-episode update)
119
- # ─────────────────────────────────────────────
120
- class MonteCarloAgent:
121
- def __init__(self, num_states: int, num_actions: int,
122
- alpha: float = 0.1, gamma: float = 0.9, epsilon: float = 1.0):
123
- self.num_states = num_states
124
- self.num_actions = num_actions
125
- self.alpha = alpha
126
- self.gamma = gamma
127
- self.epsilon = epsilon
128
- self.q_table = np.zeros((num_states, num_actions))
129
- self.returns_sum = np.zeros((num_states, num_actions))
130
- self.returns_count = np.zeros((num_states, num_actions))
131
-
132
- def choose_action(self, state: int) -> int:
133
- if random.random() < self.epsilon:
134
- return random.randint(0, self.num_actions - 1)
135
- return int(np.argmax(self.q_table[state]))
136
-
137
- def update(self, episode_history: list):
138
- """First-visit Monte Carlo update."""
139
- G = 0.0
140
- visited = set()
141
- for state, action, reward in reversed(episode_history):
142
- G = self.gamma * G + reward
143
- if (state, action) not in visited:
144
- visited.add((state, action))
145
- self.returns_sum[state, action] += G
146
- self.returns_count[state, action] += 1
147
- self.q_table[state, action] = (
148
- self.returns_sum[state, action] / self.returns_count[state, action]
149
- )
150
-
151
-
152
- # ─────────────────────────────────────────────
153
- # TRAINING ROUTINES
154
- # ─────────────────────────────────────────────
155
- def train_q_agent(env, agent, num_episodes: int,
156
- min_epsilon: float = 0.01, decay_rate: float = 0.995,
157
- max_steps: int = 500) -> list:
158
- rewards_history = []
159
- for _ in range(num_episodes):
160
- state, _ = env.reset()
161
- total_reward = 0
162
- for _ in range(max_steps):
163
- action = agent.choose_action(state)
164
- next_state, reward, done, _, _ = env.step(action)
165
- agent.update(state, action, reward, next_state)
166
- state = next_state
167
- total_reward += reward
168
- if done:
169
- break
170
- agent.epsilon = max(min_epsilon, agent.epsilon * decay_rate)
171
- rewards_history.append(total_reward)
172
- return rewards_history
173
-
174
-
175
- def train_mc_agent(env, agent, num_episodes: int,
176
- min_epsilon: float = 0.01, decay_rate: float = 0.995,
177
- max_steps: int = 500) -> list:
178
- rewards_history = []
179
- for _ in range(num_episodes):
180
- episode_history = []
181
- state, _ = env.reset()
182
- total_reward = 0
183
- for _ in range(max_steps):
184
- action = agent.choose_action(state)
185
- next_state, reward, done, _, _ = env.step(action)
186
- episode_history.append((state, action, reward))
187
- state = next_state
188
- total_reward += reward
189
- if done:
190
- break
191
- agent.update(episode_history)
192
- agent.epsilon = max(min_epsilon, agent.epsilon * decay_rate)
193
- rewards_history.append(total_reward)
194
- return rewards_history
195
-
196
-
197
- # ─────────────────────────────────────────────
198
- # VISUALISATION HELPER
199
- # ─────────────────────────────────────────────
200
- def render_maze(env, path: list, agent_type: str) -> Image.Image:
201
- """Return a PIL Image of the maze with the solved path overlaid."""
202
- ms = env.maze_size
203
- fig, axes = plt.subplots(1, 2, figsize=(14, 6))
204
- fig.patch.set_facecolor('#1a1a2e')
205
-
206
- # ── left: maze + path ────────────────────
207
- ax = axes[0]
208
- ax.set_facecolor('#16213e')
209
-
210
- # Draw cells
211
- for r in range(ms):
212
- for c in range(ms):
213
- if env.maze[r, c] == 1:
214
- color = '#e94560' # wall
215
- elif (r, c) == env.start_pos:
216
- color = '#0f3460'
217
- elif (r, c) == env.goal_pos:
218
- color = '#533483'
219
- else:
220
- color = '#16213e'
221
- rect = mpatches.FancyBboxPatch(
222
- (c, ms - 1 - r), 1, 1,
223
- boxstyle="round,pad=0.05",
224
- linewidth=1.5, edgecolor='#0f3460',
225
- facecolor=color
226
- )
227
- ax.add_patch(rect)
228
-
229
- # Draw path
230
- if len(path) > 1:
231
- path_coords = [(s % ms + 0.5, ms - 1 - s // ms + 0.5) for s in path]
232
- xs, ys = zip(*path_coords)
233
- ax.plot(xs, ys, color='#00d2ff', linewidth=2.5,
234
- alpha=0.85, zorder=5, marker='o',
235
- markersize=5, markerfacecolor='#00d2ff')
236
-
237
- # Emoji labels
238
- sx, sy = env.start_pos[1] + 0.5, ms - 1 - env.start_pos[0] + 0.5
239
- gx, gy = env.goal_pos[1] + 0.5, ms - 1 - env.goal_pos[0] + 0.5
240
- ax.text(sx, sy, '🐀', ha='center', va='center', fontsize=18, zorder=10)
241
- ax.text(gx, gy, '🧀', ha='center', va='center', fontsize=18, zorder=10)
242
-
243
- ax.set_xlim(0, ms)
244
- ax.set_ylim(0, ms)
245
- ax.set_aspect('equal')
246
- ax.set_xticks(range(ms))
247
- ax.set_yticks(range(ms))
248
- ax.tick_params(colors='#aaaacc')
249
- ax.set_title(f'Maze — {agent_type}', color='white', fontsize=13, pad=10)
250
- for spine in ax.spines.values():
251
- spine.set_edgecolor('#0f3460')
252
-
253
- # ── right: Q-value heatmap ────────────────
254
- ax2 = axes[1]
255
- q_max = np.max(env_ref.agent_q_table, axis=1).reshape(ms, ms) \
256
- if hasattr(env_ref, 'agent_q_table') else np.zeros((ms, ms))
257
-
258
- sns.heatmap(
259
- q_max, ax=ax2, cmap='magma', linewidths=0.5, linecolor='#1a1a2e',
260
- cbar_kws={'label': 'Max Q-value', 'shrink': 0.8},
261
- annot=(ms <= 8), fmt='.1f', annot_kws={'color': 'white', 'size': 8}
262
- )
263
- ax2.set_title('Max Q-value per Cell', color='white', fontsize=13, pad=10)
264
- ax2.tick_params(colors='#aaaacc')
265
- ax2.set_facecolor('#16213e')
266
- plt.setp(ax2.get_xticklabels(), color='#aaaacc')
267
- plt.setp(ax2.get_yticklabels(), color='#aaaacc')
268
- ax2.collections[0].colorbar.ax.yaxis.label.set_color('white')
269
- ax2.collections[0].colorbar.ax.tick_params(colors='white')
270
-
271
- plt.tight_layout(pad=2)
272
- buf = io.BytesIO()
273
- plt.savefig(buf, format='png', dpi=120, bbox_inches='tight',
274
- facecolor=fig.get_facecolor())
275
- plt.close(fig)
276
- buf.seek(0)
277
- return Image.open(buf).copy()
278
-
279
-
280
- # Tiny global to pass Q-table into render helper without refactoring signature
281
- class _EnvRef:
282
- agent_q_table = None
283
-
284
- env_ref = _EnvRef()
285
-
286
-
287
- # ─────────────────────────────────────────────
288
- # MAIN CALLABLE (Gradio fn)
289
- # ─────────────────────────────────────────────
290
- def create_and_solve_maze(
291
- maze_size: int,
292
- num_walls: int,
293
- agent_type: str,
294
- num_episodes: int,
295
- epsilon_decay: float,
296
- learning_rate: float,
297
- discount_factor: float,
298
- ) -> tuple:
299
- """Train an RL agent and return (maze image, stats text)."""
300
-
301
- seed = random.randint(0, 10_000)
302
- env = MazeEnv(maze_size=maze_size, num_walls=num_walls, seed=seed)
303
- n_s = env.observation_space.n
304
- n_a = env.action_space.n
305
-
306
- # ── build & train ─────────────────────────
307
- if agent_type == 'Q-Learning':
308
- agent = QLearningAgent(n_s, n_a, alpha=learning_rate,
309
- gamma=discount_factor, epsilon=1.0)
310
- history = train_q_agent(env, agent, num_episodes,
311
- decay_rate=epsilon_decay)
312
- else: # Monte Carlo
313
- agent = MonteCarloAgent(n_s, n_a, alpha=learning_rate,
314
- gamma=discount_factor, epsilon=1.0)
315
- history = train_mc_agent(env, agent, num_episodes,
316
- decay_rate=epsilon_decay)
317
-
318
- # pass Q-table to renderer
319
- env_ref.agent_q_table = agent.q_table
320
-
321
- # ── greedy rollout ────────────────────────
322
- state, _ = env.reset(seed=seed)
323
- path = [state]
324
- max_steps = maze_size * maze_size * 3
325
- done = False
326
-
327
- while not done and len(path) < max_steps:
328
- action = int(np.argmax(agent.q_table[state]))
329
- next_state, _, term, trunc, _ = env.step(action)
330
- path.append(next_state)
331
- state = next_state
332
- done = term or trunc
333
-
334
- solved = (env.state == env.goal_pos)
335
- img = render_maze(env, path, agent_type)
336
-
337
- # ── stats string ──────────────────────────
338
- avg_last = np.mean(history[-100:]) if len(history) >= 100 else np.mean(history)
339
- status = "✅ Goal reached!" if solved else "❌ Did not reach goal."
340
- stats = (
341
- f"{status}\n"
342
- f"Path length : {len(path)} steps\n"
343
- f"Avg reward (last 100 ep): {avg_last:.1f}\n"
344
- f"Final epsilon : {agent.epsilon:.4f}\n"
345
- f"Episodes trained : {num_episodes}"
346
- )
347
- return img, stats
348
-
349
-
350
- # ─────────────────────────────────────────────
351
- # GRADIO INTERFACE
352
- # ─────────────────────────────────────────────
353
- custom_css = """
354
- body { background: #1a1a2e; }
355
- .gradio-container { background: #1a1a2e !important; color: #e0e0ff; font-family: 'Segoe UI', sans-serif; }
356
- .gr-button-primary { background: #e94560 !important; border: none !important; }
357
- .gr-button-primary:hover { background: #c73652 !important; }
358
- label { color: #aaaacc !important; }
359
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
- with gr.Blocks(css=custom_css, title="RL Maze Solver") as iface:
362
- gr.Markdown(
363
- """
364
- # 🐀 RL Rat & Cheese Maze Solver
365
- Train a **Q-Learning** or **Monte Carlo** agent to navigate a randomly generated maze.
366
- Adjust the parameters and hit **Solve Maze** to watch the agent learn!
367
- """
368
- )
369
-
370
- with gr.Row():
371
- with gr.Column(scale=1):
372
- maze_size = gr.Slider(5, 15, step=1, value=7, label="Maze Size")
373
- num_walls = gr.Slider(0, 50, step=1, value=10, label="Number of Walls")
374
- agent_type = gr.Radio(["Q-Learning", "Monte Carlo"],
375
- value="Q-Learning", label="Agent Type")
376
- num_episodes = gr.Slider(100, 5000, step=100, value=1000, label="Training Episodes")
377
- epsilon_decay = gr.Slider(0.90, 0.999, step=0.001, value=0.995, label="Epsilon Decay Rate")
378
- learning_rate = gr.Slider(0.01, 0.5, step=0.01, value=0.1, label="Learning Rate (α)")
379
- discount_factor= gr.Slider(0.5, 0.99, step=0.01, value=0.9, label="Discount Factor (γ)")
380
- solve_btn = gr.Button("🚀 Solve Maze", variant="primary")
381
-
382
- with gr.Column(scale=2):
383
- maze_img = gr.Image(type="pil", label="Solved Maze + Q-value Heatmap")
384
- stats_box = gr.Textbox(label="Training Stats", lines=6, interactive=False)
385
-
386
- solve_btn.click(
387
- fn=create_and_solve_maze,
388
- inputs=[maze_size, num_walls, agent_type, num_episodes,
389
- epsilon_decay, learning_rate, discount_factor],
390
- outputs=[maze_img, stats_box],
391
- )
392
 
393
  if __name__ == "__main__":
394
- iface.launch(share=True)
 
1
+ """
2
+ 🤖 Maze Runner — RL Playground
3
+ An interactive, fun maze-solving playground powered by Reinforcement Learning.
4
+ Anyone can build a maze, pick a brain, and watch the bot learn to escape.
5
+ """
6
+
7
+ from __future__ import annotations
8
  import numpy as np
 
 
 
9
  import gradio as gr
10
+
11
+ from maze.generator import generate_dfs_maze, generate_open_maze
12
+ from maze.env import MazeEnv
13
+ from agents.qlearning import train_qlearning
14
+ from agents.sarsa import train_sarsa
15
+ from agents.montecarlo import train_montecarlo
16
+ from viz.renderer import (
17
+ make_solution_gif, make_training_chart,
18
+ make_qvalue_heatmap, make_race_chart, score_run,
19
+ )
20
+
21
+ # ── Helpers ───────────────────────────────────────────────────────────────────
22
+
23
# Human-readable algorithm label -> internal key understood by ``_train``.
ALGO_MAP: dict[str, str] = {
    "🧠 Q-Learning (recommended)": "qlearning",
    "🎯 SARSA (cautious)": "sarsa",
    "🎲 Monte Carlo (explorer)": "montecarlo",
}

# Human-readable difficulty label -> maze side length passed to ``_make_env``.
DIFFICULTY: dict[str, int] = {
    "🐣 Tiny (5×5)": 5,
    "🐇 Small (7×7)": 7,
    "🐢 Medium (9×9)": 9,
    "🦊 Large (13×13)": 13,
    "🐉 XL (17×17)": 17,
}

# Human-readable maze-style label -> generator key used by ``_make_env``.
MAZE_STYLE: dict[str, str] = {
    "🏰 Corridors (DFS)": "dfs",
    "🌿 Open Field (random walls)": "open",
}
41
+
42
+
43
def _make_env(size: int, style: str, seed: int) -> MazeEnv:
    """Generate a maze grid and wrap it in a new ``MazeEnv``.

    ``style == "dfs"`` uses ``generate_dfs_maze``; any other value uses
    ``generate_open_maze`` with ``wall_frac=0.18``.
    """
    grid = (
        generate_dfs_maze(size, seed=seed)
        if style == "dfs"
        else generate_open_maze(size, wall_frac=0.18, seed=seed)
    )
    return MazeEnv(grid)
49
+
50
+
51
def _train(env: MazeEnv, algo: str, episodes: int, alpha: float,
           gamma: float, decay: float, seed: int):
    """Run the training routine registered under ``algo``.

    ``algo`` must be ``"qlearning"``, ``"sarsa"`` or ``"montecarlo"``; any
    other key raises ``KeyError``. Returns whatever the selected trainer
    returns.
    """
    trainers = {
        "qlearning": train_qlearning,
        "sarsa": train_sarsa,
        "montecarlo": train_montecarlo,
    }
    trainer = trainers[algo]
    return trainer(env, episodes, alpha, gamma, decay, seed)
57
+
58
+
59
def _collect_path(env: MazeEnv, agent) -> list[tuple[int, ...]]:
    """Roll out the agent's greedy policy and return the cells it visits.

    The path starts with ``env.start`` and gains one entry per step,
    converted from the state index via ``env._from_state``. The rollout stops
    when the environment reports ``done`` or after ``env.n_states * 3`` steps.
    """
    observation, _info = env.reset()
    visited: list[tuple[int, ...]] = [env.start]
    step_limit = env.n_states * 3
    for _step in range(step_limit):
        chosen = agent.greedy_action(observation)
        observation, _reward, done, _trunc, _step_info = env.step(chosen)
        visited.append(env._from_state(observation))
        if done:
            break
    return visited
69
+
70
+
71
+ # ── Main Playground callback ──────────────────────────────────────────────────
72
+
73
def cb_solve(
    difficulty: str,
    maze_style: str,
    algo_label: str,
    episodes: int,
    alpha: float,
    gamma: float,
    decay: float,
    seed: int,
    progress: gr.Progress = gr.Progress(),
):
    """Train one agent on one maze and build every Playground output.

    The label arguments are translated through ``DIFFICULTY``, ``MAZE_STYLE``
    and ``ALGO_MAP``. A fresh environment is built from the same
    size/style/seed for training and for each rendering step.

    Returns:
        ``(gif_path, train_fig, heatmap_fig, stats_md)``: the solution GIF
        from ``make_solution_gif``, the training chart, the Q-value heatmap,
        and a Markdown summary of the run.
    """
    progress(0.05, desc="Building maze…")
    size = DIFFICULTY[difficulty]
    style = MAZE_STYLE[maze_style]
    algo = ALGO_MAP[algo_label]
    seed_value = int(seed)

    env = _make_env(size, style, seed_value)

    # Label text before the first "(" — reused for progress, GIF and chart.
    short_name = algo_label.split("(")[0].strip()

    progress(0.15, desc=f"Training {short_name}…")
    agent, rewards = _train(env, algo, int(episodes), float(alpha),
                            float(gamma), float(decay), seed_value)

    progress(0.75, desc="Rendering solution…")
    gif_env = _make_env(size, style, seed_value)
    gif_path = make_solution_gif(gif_env, agent, fps=7, label=short_name)

    progress(0.85, desc="Building charts…")
    path_env = _make_env(size, style, seed_value)
    path = _collect_path(path_env, agent)
    sc = score_run(path, path_env.goal, rewards, path_env.n_states)

    train_fig = make_training_chart({short_name: rewards})

    heat_env = _make_env(size, style, seed_value)
    heatmap_fig = make_qvalue_heatmap(heat_env, agent)

    solved_cell = "✅ Yes" if sc['solved'] else "❌ No"
    n_rows = env.shape[0]
    n_cols = env.shape[1]
    stats_md = f"""
### {sc['grade']} {sc['verdict']}

| | |
|---|---|
| **Solved** | {solved_cell} |
| **Steps taken** | `{sc['steps']}` |
| **Efficiency score** | `{sc['efficiency']}%` |
| **Avg reward (final 20%)** | `{sc['avg_reward']:.1f}` |
| **Episodes trained** | `{int(episodes)}` |
| **Maze size** | `{n_rows} × {n_cols}` cells |

> **Efficiency** compares your bot's path length to the ideal shortest path.
> 100% = perfect. 0% = didn't make it.
"""
    progress(1.0, desc="Done!")
    return gif_path, train_fig, heatmap_fig, stats_md
126
+
127
+
128
+ # ── Algorithm Race callback ───────────────────────────────────────────────────
129
+
130
def cb_race(
    difficulty: str,
    maze_style: str,
    episodes: int,
    run_mc: bool,
    progress: gr.Progress = gr.Progress(),
):
    """Train the algorithms on the same maze and report which scored best.

    Q-Learning and SARSA always run; Monte Carlo runs only when ``run_mc`` is
    true. Every contestant gets a fresh environment built from the same
    size/style and the fixed seed 77, and the same hyper-parameters
    (alpha 0.1, gamma 0.95, epsilon decay 0.995). The score is the mean
    reward over the last fifth of the episodes (at least one episode).

    Returns:
        ``(fig, result_md)``: the chart from ``make_race_chart`` and a
        Markdown table naming the winner.
    """
    size = DIFFICULTY[difficulty]
    style = MAZE_STYLE[maze_style]
    seed = 77
    shared_hparams = (0.1, 0.95, 0.995)  # alpha, gamma, epsilon decay

    def tail_mean(curve) -> float:
        # Mean of the final 20% of the curve, never fewer than one episode.
        window = max(1, len(curve) // 5)
        return float(np.mean(curve[-window:]))

    progress(0.1, desc="Training Q-Learning…")
    env_q = _make_env(size, style, seed)
    _, rq = _train(env_q, "qlearning", int(episodes), *shared_hparams, seed)

    progress(0.4, desc="Training SARSA…")
    env_s = _make_env(size, style, seed)
    _, rs = _train(env_s, "sarsa", int(episodes), *shared_hparams, seed)

    rc, name_c = None, ""
    if run_mc:
        progress(0.65, desc="Training Monte Carlo…")
        env_m = _make_env(size, style, seed)
        _, rc = _train(env_m, "montecarlo", int(episodes), *shared_hparams, seed)
        name_c = "Monte Carlo"

    progress(0.9, desc="Building race chart…")
    fig = make_race_chart(rq, "Q-Learning", rs, "SARSA", rc, name_c)

    # Winner: highest tail-mean reward; ties go to the earliest entry.
    scores = {"Q-Learning": tail_mean(rq), "SARSA": tail_mean(rs)}
    if rc:
        scores["Monte Carlo"] = tail_mean(rc)
    winner = max(scores, key=scores.get)

    table_rows = "".join(
        f"| {'🥇 ' if name == winner else ''}{name} | `{score:.1f}` |\n"
        for name, score in scores.items()
    )
    result_md = f"""
### 🏆 Race Result

| Algorithm | Final Score |
|---|---|
{table_rows}

**Winner: {winner}** with a final average reward of `{scores[winner]:.1f}`

> All algorithms trained on the same maze with identical hyperparameters.
> Final score = average reward over the last 20% of episodes.
"""
    progress(1.0)
    return fig, result_md
181
+
182
+
183
+ # ── CSS ───────────────────────────────────────────────────────────────────────
184
+
185
+ CSS = """
186
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&family=JetBrains+Mono:wght@400;600&display=swap');
187
+
188
+ *, *::before, *::after { box-sizing: border-box; }
189
+
190
+ body, .gradio-container {
191
+ background: #0d1117 !important;
192
+ color: #c9d1d9 !important;
193
+ font-family: 'Inter', sans-serif !important;
194
+ }
195
+ .gradio-container { max-width: 1100px !important; margin: 0 auto !important; }
196
+
197
+ /* Hero */
198
+ .hero { text-align:center; padding:2rem 1rem 1rem; }
199
+ .hero-title {
200
+ font-size: clamp(2rem, 5vw, 3rem); font-weight: 700;
201
+ background: linear-gradient(135deg, #3fb950, #58a6ff, #ffa657);
202
+ -webkit-background-clip: text; -webkit-text-fill-color: transparent;
203
+ margin: 0 0 0.4rem;
204
+ }
205
+ .hero-sub { color: #484f58; font-size: 0.95rem; letter-spacing:0.02em; }
206
+
207
+ /* Tabs */
208
+ .tab-nav { border-bottom: 1px solid #21262d !important; background: transparent !important; }
209
+ .tab-nav button {
210
+ font-family: 'Inter', sans-serif !important; font-size: 0.85rem !important;
211
+ font-weight: 500 !important; color: #484f58 !important;
212
+ background: transparent !important; border: none !important;
213
+ padding: 0.7rem 1.1rem !important;
214
+ }
215
+ .tab-nav button.selected { color: #3fb950 !important; border-bottom: 2px solid #3fb950 !important; }
216
+ .tab-nav button:hover { color: #8b949e !important; }
217
+
218
+ /* Cards */
219
+ .info-card {
220
+ background: #161b22; border: 1px solid #21262d; border-radius: 10px;
221
+ padding: 1.1rem;
222
+ }
223
+ .info-card-icon { font-size: 1.8rem; margin-bottom:0.4rem; }
224
+ .info-card-title { font-weight: 600; color: #e6edf3; font-size: 0.95rem; margin-bottom:0.3rem; }
225
+ .info-card-body { color: #8b949e; font-size: 0.83rem; line-height: 1.6; }
226
+
227
+ /* Algo cards */
228
+ .algo-card {
229
+ background: #161b22; border: 1px solid #21262d; border-radius: 10px;
230
+ padding: 1rem; margin-bottom: 0.5rem;
231
+ }
232
+ .algo-name { font-weight: 600; color: #e6edf3; font-size: 0.9rem; }
233
+ .algo-desc { color: #8b949e; font-size: 0.8rem; line-height:1.5; margin-top:0.2rem; }
234
+ .algo-tag {
235
+ display: inline-block; font-size: 0.68rem; padding: 2px 8px;
236
+ border-radius: 20px; margin-top: 0.4rem;
237
+ }
238
+ .tag-green { background:#0d2d17; color:#3fb950; border:1px solid #1a5c2e; }
239
+ .tag-blue { background:#0d1f38; color:#58a6ff; border:1px solid #1a4a7a; }
240
+ .tag-orange { background:#2d1c06; color:#ffa657; border:1px solid #5c3a12; }
241
+
242
+ /* Grade badge */
243
+ .grade-badge {
244
+ display:inline-block; font-size:2.5rem; font-weight:700;
245
+ font-family:'JetBrains Mono',monospace;
246
+ }
247
+
248
+ /* Buttons */
249
+ button.primary {
250
+ font-family: 'Inter', sans-serif !important; font-weight: 600 !important;
251
+ background: linear-gradient(135deg, #238636, #2ea043) !important;
252
+ color: #ffffff !important; border: none !important;
253
+ border-radius: 6px !important; font-size: 0.9rem !important;
254
+ transition: opacity 0.2s !important;
255
+ }
256
+ button.primary:hover { opacity: 0.85 !important; }
257
+ button.secondary {
258
+ background: #161b22 !important; color: #58a6ff !important;
259
+ border: 1px solid #30363d !important; border-radius: 6px !important;
260
+ font-family: 'Inter', sans-serif !important;
261
+ }
262
+ button.stop {
263
+ background: #1c0d0d !important; color: #f78166 !important;
264
+ border: 1px solid #6e2b2b !important; border-radius: 6px !important;
265
+ }
266
+
267
+ /* Labels */
268
+ label span { font-family:'Inter',sans-serif !important;
269
+ font-size:0.82rem !important; color:#8b949e !important; }
270
+
271
+ /* Slider */
272
+ input[type=range] { -webkit-appearance:none; height:4px;
273
+ background:#21262d; border-radius:2px; }
274
+ input[type=range]::-webkit-slider-thumb {
275
+ -webkit-appearance:none; width:16px; height:16px;
276
+ border-radius:50%; background:#3fb950; cursor:pointer; border:2px solid #0d1117;
277
+ }
278
+
279
+ /* Textarea */
280
+ textarea { font-family:'JetBrains Mono',monospace !important;
281
+ font-size:0.82rem !important; background:#0d1117 !important;
282
+ color:#3fb950 !important; border:1px solid #21262d !important;
283
+ border-radius:6px !important; }
284
+
285
+ /* Markdown */
286
+ .gradio-container h2 { color: #3fb950 !important; }
287
+ .gradio-container h3 { color: #58a6ff !important; }
288
+ .gradio-container p { color: #8b949e !important; }
289
+ table { width:100%; border-collapse:collapse; }
290
+ th { background:#161b22; color:#3fb950; font-size:0.78rem;
291
+ text-align:left; padding:8px 12px; border-bottom:1px solid #21262d; }
292
+ td { padding:8px 12px; border-bottom:1px solid #0d1117;
293
+ color:#e6edf3; font-size:0.85rem; }
294
+ blockquote { border-left:3px solid #3fb950; padding-left:1rem;
295
+ color:#484f58 !important; margin:0.5rem 0; }
296
+
297
+ footer { display:none !important; }
298
+ .gradio-container .block { background:transparent !important; border:none !important; }
299
+ """
300
+
301
+ # ── Build UI ──────────────────────────────────────────────────────────────────
302
+
303
+ with gr.Blocks(title="🤖 Maze Runner — RL Playground") as demo:
304
+
305
+ gr.HTML("""
306
+ <div class="hero">
307
+ <div class="hero-title">🤖 Maze Runner</div>
308
+ <div class="hero-sub">An AI that learns to escape mazes — watch it happen in real time</div>
309
+ </div>
310
+ """)
311
+
312
+ with gr.Tabs():
313
+
314
+ # ══════════════════════════════════════════════════════════════════
315
+ # Tab 1 — Welcome
316
+ # ══════════════════════════════════════════════════════════════════
317
+ with gr.Tab("🏠 Welcome"):
318
+
319
+ gr.HTML("""
320
+ <div style="text-align:center;padding:0.5rem 0 1.5rem;">
321
+ <p style="color:#8b949e;font-size:1rem;max-width:580px;margin:0 auto;">
322
+ A tiny AI robot is dropped into a maze. It knows nothing.
323
+ Through thousands of attempts — hitting walls, finding dead ends,
324
+ occasionally stumbling upon the exit — it slowly builds a mental map
325
+ and learns the perfect escape route.
326
+ </p>
327
+ </div>
328
+ """)
329
+
330
+ gr.HTML("""
331
+ <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1rem;margin-bottom:1.5rem;">
332
+ <div class="info-card">
333
+ <div class="info-card-icon">🗺️</div>
334
+ <div class="info-card-title">The Maze</div>
335
+ <div class="info-card-body">
336
+ A grid of corridors and walls. The bot starts at
337
+ <strong style="color:#58a6ff">S</strong> and must reach
338
+ <strong style="color:#f78166">G</strong>.
339
+ It can only see its own position — no map, no cheating.
340
+ </div>
341
+ </div>
342
+ <div class="info-card">
343
+ <div class="info-card-icon">🤖</div>
344
+ <div class="info-card-title">The Bot</div>
345
+ <div class="info-card-body">
346
+ At each step it chooses: go up, down, left, or right.
347
+ Hit a wall? Penalty. Reach the goal? Big reward!
348
+ It remembers what worked and what didn't.
349
+ </div>
350
+ </div>
351
+ <div class="info-card">
352
+ <div class="info-card-icon">🧠</div>
353
+ <div class="info-card-title">The Learning</div>
354
+ <div class="info-card-body">
355
+ Each attempt updates a "score table" for every
356
+ position and move. After enough tries, the bot
357
+ always picks the move with the highest score — the optimal path.
358
+ </div>
359
+ </div>
360
+ </div>
361
+ """)
362
+
363
+ gr.HTML("""
364
+ <div style="background:#161b22;border:1px solid #21262d;border-radius:10px;padding:1.2rem;margin-bottom:1rem;">
365
+ <div style="font-weight:600;color:#e6edf3;margin-bottom:1rem;">🧠 Choose your Bot's Brain</div>
366
+ <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:0.8rem;">
367
+ <div class="algo-card">
368
+ <div class="algo-name">Q-Learning</div>
369
+ <div class="algo-desc">
370
+ Updates its score table <em>immediately</em> after every step.
371
+ Fast learner. Best for most mazes.
372
+ </div>
373
+ <span class="algo-tag tag-green">⚡ Recommended</span>
374
+ </div>
375
+ <div class="algo-card">
376
+ <div class="algo-name">SARSA</div>
377
+ <div class="algo-desc">
378
+ Updates based on the move it <em>actually took</em> next,
379
+ not just the best possible. More cautious, avoids risky paths.
380
+ </div>
381
+ <span class="algo-tag tag-blue">🎯 Cautious</span>
382
+ </div>
383
+ <div class="algo-card">
384
+ <div class="algo-name">Monte Carlo</div>
385
+ <div class="algo-desc">
386
+ Plays out the <em>entire episode</em> first, then
387
+ updates everything at once. Needs more episodes to converge.
388
+ </div>
389
+ <span class="algo-tag tag-orange">🎲 Explorer</span>
390
+ </div>
391
+ </div>
392
+ </div>
393
+ """)
394
+
395
+ gr.HTML("""
396
+ <div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;">
397
+ <div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:10px;padding:1rem;">
398
+ <div style="font-weight:600;color:#3fb950;margin-bottom:0.4rem;">🗺️ How to use this app</div>
399
+ <ol style="color:#8b949e;font-size:0.85rem;line-height:1.8;margin:0;padding-left:1.2rem;">
400
+ <li>Go to <strong style="color:#e6edf3">🎮 Playground</strong> tab</li>
401
+ <li>Pick a difficulty and maze style</li>
402
+ <li>Choose a brain and hit <strong style="color:#3fb950">Train & Watch!</strong></li>
403
+ <li>Watch the animated replay</li>
404
+ <li>Try <strong style="color:#e6edf3">🏁 Algorithm Race</strong> to compare all three</li>
405
+ </ol>
406
+ </div>
407
+ <div style="background:#0d1f38;border:1px solid #1a4a7a;border-radius:10px;padding:1rem;">
408
+ <div style="font-weight:600;color:#58a6ff;margin-bottom:0.4rem;">💡 Fun facts</div>
409
+ <ul style="color:#8b949e;font-size:0.85rem;line-height:1.8;margin:0;padding-left:1.2rem;">
410
+ <li>This same idea trains robots, game AIs, and self-driving cars</li>
411
+ <li>DeepMind's Atari-playing DQN agent used a deep-learning version of Q-Learning</li>
412
+ <li>A 17×17 maze has 289 possible positions to learn</li>
413
+ <li>The bot gets worse before it gets better — that's normal!</li>
414
+ </ul>
415
+ </div>
416
+ </div>
417
+ """)
418
+
419
+ # ══════════════════════════════════════════════════════════════════
420
+ # Tab 2 — Playground
421
+ # ══════════════════════════════════════════════════════════════════
422
+ with gr.Tab("🎮 Playground"):
423
+
424
+ gr.HTML("""
425
+ <div style="padding:0.3rem 0 1rem;">
426
+ <div style="font-size:1.05rem;font-weight:600;color:#e6edf3;">
427
+ Build a maze, pick a brain, watch it learn
428
+ </div>
429
+ <div style="color:#484f58;font-size:0.85rem;margin-top:0.2rem;">
430
+ The animated replay shows the final learned path after training.
431
+ </div>
432
+ </div>
433
+ """)
434
+
435
+ with gr.Row():
436
+ # ── Controls ──────────────────────────────────────────────
437
+ with gr.Column(scale=1, min_width=300):
438
+
439
+ gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin-bottom:0.5rem;">🗺️ MAZE SETUP</div>')
440
+
441
+ difficulty = gr.Radio(
442
+ list(DIFFICULTY.keys()),
443
+ value="🐢 Medium (9×9)",
444
+ label="Difficulty",
445
+ )
446
+ maze_style = gr.Radio(
447
+ list(MAZE_STYLE.keys()),
448
+ value="🏰 Corridors (DFS)",
449
+ label="Maze style",
450
+ info="Corridors = proper winding paths · Open = random walls"
451
+ )
452
+
453
+ gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin:0.8rem 0 0.5rem;">🧠 BOT BRAIN</div>')
454
+
455
+ algo = gr.Radio(
456
+ list(ALGO_MAP.keys()),
457
+ value="🧠 Q-Learning (recommended)",
458
+ label="Algorithm",
459
+ )
460
+
461
+ gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin:0.8rem 0 0.5rem;">⚙️ TRAINING</div>')
462
+
463
+ episodes = gr.Slider(100, 3000, value=800, step=100,
464
+ label="Training episodes",
465
+ info="More = smarter bot, but slower")
466
+
467
+ with gr.Accordion("🔬 Advanced settings", open=False):
468
+ alpha = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Learning speed (α)")
469
+ gamma = gr.Slider(0.5, 0.99, value=0.95, step=0.01, label="Future planning (γ)")
470
+ decay = gr.Slider(0.90, 0.999,value=0.995, step=0.001,label="Exploration decay")
471
+ seed = gr.Slider(0, 100, value=42, step=1, label="Random seed")
472
+
473
+ btn_solve = gr.Button("🚀 Train & Watch!", variant="primary")
474
+
475
+ # ── Outputs ───────────────────────────────────────────────
476
+ with gr.Column(scale=2):
477
+ play_stats = gr.Markdown("*Configure your maze and hit Train & Watch!*")
478
+
479
+ with gr.Row():
480
+ play_gif = gr.Image(
481
+ label="🎬 Bot solving the maze (animated)",
482
+ type="filepath", height=360,
483
+ )
484
+
485
+ with gr.Row():
486
+ play_train_fig = gr.Plot(label="📈 Training progress")
487
+ play_heatmap = gr.Plot(label="🌡️ Q-value map (what the bot learned)")
488
+
489
+ # hidden state defaults for advanced
490
+ alpha_h = gr.State(0.1)
491
+ gamma_h = gr.State(0.95)
492
+ decay_h = gr.State(0.995)
493
+ seed_h = gr.State(42)
494
+
495
+ btn_solve.click(
496
+ cb_solve,
497
+ inputs=[difficulty, maze_style, algo, episodes,
498
+ alpha, gamma, decay, seed],
499
+ outputs=[play_gif, play_train_fig, play_heatmap, play_stats],
500
+ )
501
+
502
+ # ══════════════════════════════════════════════════════════════════
503
+ # Tab 3 — Algorithm Race
504
+ # ══════════════════════════════════════════════════════════════════
505
+ with gr.Tab("🏁 Algorithm Race"):
506
+
507
+ gr.HTML("""
508
+ <div style="padding:0.3rem 0 1rem;">
509
+ <div style="font-size:1.05rem;font-weight:600;color:#e6edf3;">
510
+ Head-to-head: which brain learns fastest?
511
+ </div>
512
+ <div style="color:#484f58;font-size:0.85rem;margin-top:0.2rem;">
513
+ All algorithms train on the same maze with identical settings —
514
+ the only variable is the learning strategy.
515
+ </div>
516
+ </div>
517
+ """)
518
+
519
+ gr.HTML("""
520
+ <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:0.8rem;margin-bottom:1rem;">
521
+ <div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:8px;padding:0.8rem;text-align:center;">
522
+ <div style="color:#3fb950;font-size:1.2rem;font-weight:700;">Q-Learning</div>
523
+ <div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">Off-policy · Fast update · Optimistic</div>
524
+ </div>
525
+ <div style="background:#0d1f38;border:1px solid #1a4a7a;border-radius:8px;padding:0.8rem;text-align:center;">
526
+ <div style="color:#58a6ff;font-size:1.2rem;font-weight:700;">SARSA</div>
527
+ <div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">On-policy · Careful update · Conservative</div>
528
+ </div>
529
+ <div style="background:#2d1c06;border:1px solid #5c3a12;border-radius:8px;padding:0.8rem;text-align:center;">
530
+ <div style="color:#ffa657;font-size:1.2rem;font-weight:700;">Monte Carlo</div>
531
+ <div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">Episodic · Full return · Unbiased</div>
532
+ </div>
533
+ </div>
534
+ """)
535
+
536
+ with gr.Row():
537
+ with gr.Column(scale=1, min_width=260):
538
+ race_diff = gr.Radio(list(DIFFICULTY.keys()),
539
+ value="🐢 Medium (9×9)", label="Maze difficulty")
540
+ race_style = gr.Radio(list(MAZE_STYLE.keys()),
541
+ value="🏰 Corridors (DFS)", label="Maze style")
542
+ race_eps = gr.Slider(200, 2000, value=600, step=100,
543
+ label="Episodes per algorithm")
544
+ race_mc = gr.Checkbox(label="Include Monte Carlo (slower)", value=True)
545
+ btn_race = gr.Button("🏁 Start Race!", variant="primary")
546
+
547
+ with gr.Column(scale=2):
548
+ race_result = gr.Markdown("*Click Start Race to run the comparison.*")
549
+
550
+ race_fig = gr.Plot(label="Race Results")
551
+
552
+ btn_race.click(
553
+ cb_race,
554
+ inputs=[race_diff, race_style, race_eps, race_mc],
555
+ outputs=[race_fig, race_result],
556
+ )
557
+
558
+ # ══════════════════════════════════════════════════════════════════
559
+ # Tab 4 — How it Works
560
+ # ══════════════════════════════════════════════════════════════════
561
+ with gr.Tab("🧠 How it Works"):
562
+
563
+ gr.HTML("""
564
+ <div style="max-width:700px;margin:0 auto;padding:1rem 0;">
565
+
566
+ <h2 style="color:#3fb950;font-size:1.3rem;margin-bottom:0.3rem;">The Big Idea</h2>
567
+ <p style="color:#8b949e;line-height:1.7;">
568
+ The bot doesn't know anything about the maze at the start. It just knows
569
+ 4 possible moves and gets a number (reward) after each step.
570
+ <strong style="color:#e6edf3">Negative number = bad move. Positive = good move.</strong>
571
+ The goal: find the sequence of moves that gets the most reward.
572
+ </p>
573
+
574
+ <h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">The Score Table (Q-Table)</h2>
575
+ <p style="color:#8b949e;line-height:1.7;">
576
+ The bot keeps a table with one row per maze cell and 4 columns (one per direction).
577
+ Each entry stores <em>how good it thinks that move is from that cell</em>.
578
+ At the start, everything is 0. After training, the table holds the bot's
579
+ entire learned strategy. The Q-value heatmap in the Playground shows this table visually.
580
+ </p>
581
+
582
+ <div style="background:#161b22;border:1px solid #21262d;border-radius:8px;padding:1rem;margin:1rem 0;font-family:'JetBrains Mono',monospace;font-size:0.82rem;color:#3fb950;">
583
+ Q[current_cell][move] += learning_speed × (<br>
584
+ &nbsp;&nbsp;reward_got + future_discount × best_Q[next_cell] − Q[current_cell][move]<br>
585
+ )
586
+ </div>
587
+
588
+ <h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">Exploration vs Exploitation</h2>
589
+ <p style="color:#8b949e;line-height:1.7;">
590
+ Early in training, the bot tries <strong style="color:#e6edf3">random moves</strong> (exploration)
591
+ — it doesn't know enough to trust its table yet. Over time, it relies more on what
592
+ it's learned (exploitation). This is controlled by <strong style="color:#e6edf3">epsilon (ε)</strong>,
593
+ which starts near 1.0 (100% random) and decays toward 0 (always use best known move).
594
+ </p>
595
+
596
+ <h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">Why does reward go negative first?</h2>
597
+ <p style="color:#8b949e;line-height:1.7;">
598
+ Each step costs −1 (time penalty) and hitting a wall costs −5.
599
+ A random bot hits a <em>lot</em> of walls and takes forever to find the exit,
600
+ so early rewards are very negative. As it learns, fewer walls are hit and
601
+ the path shortens — reward climbs toward 0 and eventually turns positive when
602
+ it reliably reaches the goal.
603
+ </p>
604
+
605
+ <div style="display:grid;grid-template-columns:1fr 1fr;gap:0.8rem;margin-top:1.2rem;">
606
+ <div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:8px;padding:0.9rem;">
607
+ <div style="color:#3fb950;font-weight:600;margin-bottom:0.4rem;">Q-Learning vs SARSA</div>
608
+ <div style="color:#8b949e;font-size:0.82rem;line-height:1.6;">
609
+ Q-Learning always updates toward the <em>best possible</em> next action —
610
+ even if it wouldn't actually take that action. SARSA updates toward
611
+ the action it <em>will actually take</em>. This makes SARSA more cautious near walls.
612
+ </div>
613
+ </div>
614
+ <div style="background:#2d1c06;border:1px solid #5c3a12;border-radius:8px;padding:0.9rem;">
615
+ <div style="color:#ffa657;font-weight:600;margin-bottom:0.4rem;">Why Monte Carlo is slow</div>
616
+ <div style="color:#8b949e;font-size:0.82rem;line-height:1.6;">
617
+ MC waits until the episode <em>ends</em> before updating any scores.
618
+ On large mazes where early episodes never reach the goal,
619
+ it gets zero signal for a long time. But once it starts solving,
620
+ its estimates are very accurate.
621
+ </div>
622
+ </div>
623
+ </div>
624
+
625
+ </div>
626
+ """)
627
+
628
+ gr.HTML("""
629
+ <div style="text-align:center;color:#21262d;font-size:0.75rem;
630
+ padding:1.5rem 0 0.5rem;border-top:1px solid #161b22;margin-top:1rem;">
631
+ Built with Q-Learning · SARSA · Monte Carlo · Gymnasium · Gradio
632
+ </div>
633
+ """)
634
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
 
636
  if __name__ == "__main__":
637
+ demo.launch(server_name="0.0.0.0", server_port=7860, share=False, css=CSS)
maze/__init__.py ADDED
File without changes
maze/env.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced MazeEnv — works with both DFS corridor mazes and open random mazes.
3
+ """
4
+
5
+ from __future__ import annotations
6
+ import numpy as np
7
+ import gymnasium as gym
8
+ from gymnasium import spaces
9
+
10
+
11
class MazeEnv(gym.Env):
    """Grid-maze environment with a discrete state per cell and four moves.

    The agent starts in the top-left cell and the episode terminates when it
    stands on the bottom-right cell. Grid value 0 is walkable; any other
    value blocks movement.
    """

    metadata = {"render_modes": []}

    def __init__(self, grid: np.ndarray):
        """Build the environment from a 2-D wall grid (the grid is copied)."""
        super().__init__()
        self.H, self.W = grid.shape
        self.n_states = self.H * self.W
        self.start = (0, 0)
        self.goal = (self.H - 1, self.W - 1)

        # Work on a private copy and force both endpoints to be walkable.
        self.grid = grid.copy()
        for endpoint in (self.start, self.goal):
            self.grid[endpoint] = 0

        # Action index -> (row delta, col delta): up, down, left, right.
        self._MOVES = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Discrete(self.n_states)
        self.agent_pos = self.start

    def _to_state(self, r: int, c: int) -> int:
        """Flatten a (row, col) position into a row-major state index."""
        return self.W * r + c

    def _from_state(self, s: int) -> tuple[int, int]:
        """Inverse of ``_to_state``: recover (row, col) from a state index."""
        return divmod(s, self.W)

    def reset(self, *, seed=None, options=None):
        """Put the agent back on the start cell; return ``(state, info)``."""
        super().reset(seed=seed)
        self.agent_pos = self.start
        start_state = self._to_state(*self.agent_pos)
        return start_state, {}

    def step(self, action: int):
        """Apply one move and return ``(state, reward, terminated, truncated, info)``.

        A legal move costs -1, a blocked move (wall or grid edge) costs -5 and
        leaves the agent in place, and standing on the goal yields +100.
        """
        d_row, d_col = self._MOVES[action]
        row, col = self.agent_pos
        target = (row + d_row, col + d_col)

        inside = 0 <= target[0] < self.H and 0 <= target[1] < self.W
        if inside and self.grid[target] == 0:
            self.agent_pos = target
            reward = -1
        else:
            # Bumped into a wall or the boundary: stay put, pay the penalty.
            reward = -5

        at_goal = self.agent_pos == self.goal
        if at_goal:
            reward = 100

        return self._to_state(*self.agent_pos), reward, at_goal, False, {}

    @property
    def shape(self) -> tuple[int, int]:
        """Grid dimensions as ``(rows, cols)``."""
        return self.H, self.W
maze/generator.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Maze generation using recursive DFS backtracker.
3
+ Guarantees a fully connected, solvable maze with proper corridors.
4
+ """
5
+
6
+ from __future__ import annotations
7
+ import numpy as np
8
+ from enum import IntEnum
9
+
10
+
11
class Cell(IntEnum):
    """Integer codes for the kinds of cell a maze grid can contain."""

    OPEN = 0   # walkable cell (generators write 0 for open)
    WALL = 1   # blocked cell (generators write 1 for wall)
    START = 2  # marker code for the start cell
    GOAL = 3   # marker code for the goal cell
16
+
17
+
18
# Difficulty presets: display label -> generator settings.
# Every preset currently uses a wall density of 0.0.
PRESETS = {
    label: {"size": side, "wall_density": 0.0}
    for label, side in (
        ("🐣 Tiny (5×5)", 5),
        ("🐇 Small (7×7)", 7),
        ("🐢 Medium (9×9)", 9),
        ("🦊 Large (13×13)", 13),
        ("🐉 XL (17×17)", 17),
    )
}

# Colour themes: display label -> hex colours for walls, open cells and path.
THEMES = {
    label: {"wall": wall_hex, "open": open_hex, "path": path_hex}
    for label, wall_hex, open_hex, path_hex in (
        ("🏰 Classic", "#2d3561", "#f5f0e8", "#e94560"),
        ("🌲 Forest", "#1b4332", "#d8f3dc", "#f77f00"),
        ("🌌 Space", "#03045e", "#0d1b2a", "#00b4d8"),
        ("🔥 Lava", "#370617", "#ffd166", "#ef233c"),
    )
}
32
+
33
+
34
def generate_dfs_maze(size: int, seed: int = 42) -> np.ndarray:
    """Carve a corridor maze with an iterative depth-first backtracker.

    The maze is built over ``size × size`` logical cells. The returned array
    has shape ``(2*size-1, 2*size-1)``: logical cells sit on even/even
    coordinates and the slots between them are either carved passages or
    walls. Values are 0 for open and 1 for wall.
    """
    rng = np.random.default_rng(seed)

    side = 2 * size - 1
    grid = np.ones((side, side), dtype=np.int8)
    # Every logical cell (even row, even column) starts out open.
    grid[::2, ::2] = 0

    seen = np.zeros((size, size), dtype=bool)
    seen[0, 0] = True
    trail = [(0, 0)]
    offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))

    while trail:
        row, col = trail[-1]
        candidates = [
            (row + dr, col + dc, dr, dc)
            for dr, dc in offsets
            if 0 <= row + dr < size
            and 0 <= col + dc < size
            and not seen[row + dr, col + dc]
        ]
        if not candidates:
            # Dead end: backtrack to the previous cell on the trail.
            trail.pop()
            continue

        next_row, next_col, dr, dc = candidates[rng.integers(len(candidates))]
        # Open the wall slot lying between the current cell and the chosen one.
        grid[2 * row + dr, 2 * col + dc] = 0
        seen[next_row, next_col] = True
        trail.append((next_row, next_col))

    return grid
71
+
72
+
73
def generate_open_maze(size: int, wall_frac: float = 0.20, seed: int = 42) -> np.ndarray:
    """Return a ``size × size`` grid with randomly scattered walls.

    Roughly ``wall_frac`` of the cells become walls (1); the rest are open (0).
    The top-left and bottom-right corners are never walled. If the random
    placement cuts the goal corner off from the start corner, the most
    recently placed walls are removed one at a time until a route exists, so
    the returned maze is always solvable. Placements that were already
    solvable are returned unchanged.

    Args:
        size: Side length of the square grid.
        wall_frac: Target fraction of cells to turn into walls. Negative
            values are treated as 0.
        seed: Seed for the NumPy random generator (same seed, same maze).
    """
    from collections import deque

    rng = np.random.default_rng(seed)
    grid = np.zeros((size, size), dtype=np.int8)
    # Clamp at zero: a negative count would become a negative slice bound
    # below and wall off almost every cell.
    n_walls = max(0, int(size * size * wall_frac))
    cells = [(r, c) for r in range(size) for c in range(size)
             if not (r == 0 and c == 0) and not (r == size - 1 and c == size - 1)]
    rng.shuffle(cells)
    walls = cells[:n_walls]
    for r, c in walls:
        grid[r, c] = 1

    def _goal_reachable() -> bool:
        """Breadth-first search from the start corner to the goal corner."""
        goal = (size - 1, size - 1)
        seen = {(0, 0)}
        queue = deque([(0, 0)])
        while queue:
            r, c = queue.popleft()
            if (r, c) == goal:
                return True
            for dr, dc in ((-1, 0), (1, 0), (0, -1), (0, 1)):
                nr, nc = r + dr, c + dc
                if (0 <= nr < size and 0 <= nc < size
                        and grid[nr, nc] == 0 and (nr, nc) not in seen):
                    seen.add((nr, nc))
                    queue.append((nr, nc))
        return False

    # Undo wall placements (last placed first) until the maze is solvable.
    # This terminates because a grid with no walls is always connected.
    while walls and not _goal_reachable():
        r, c = walls.pop()
        grid[r, c] = 0

    return grid
viz/__init__.py ADDED
File without changes
viz/renderer.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ All visual output: animated GIF of the agent solving the maze,
3
+ training curve chart, Q-value heatmap, algorithm race chart.
4
+ """
5
+
6
+ from __future__ import annotations
7
+ import io
8
+ import tempfile
9
+ import numpy as np
10
+ import matplotlib
11
+ matplotlib.use("Agg")
12
+ import matplotlib.pyplot as plt
13
+ import matplotlib.patches as patches
14
+ import matplotlib.cm as mpl_cm
15
+ from matplotlib.figure import Figure as MplFigure
16
+ from matplotlib.axes import Axes as MplAxes
17
+ from matplotlib.colors import LinearSegmentedColormap
18
+ import PIL.Image
19
+ import PIL.ImageDraw
20
+
21
+ from maze.env import MazeEnv
22
+ from agents.base import TabularAgent
23
+
24
+ # ── Palette ───────────────────────────────────────────────────────────────────
25
# Backgrounds and grid lines.
BG, BG2, GRID_C = "#0d1117", "#161b22", "#21262d"
# Maze cell colours.
WALL_C, OPEN_C, PATH_C = "#1f6feb", "#0d1117", "#3fb950"
# Start, goal and agent markers.
START_C, GOAL_C, AGENT_C = "#58a6ff", "#f78166", "#ffa657"
# Primary and dimmed text.
TEXT_C, DIM_C = "#c9d1d9", "#484f58"
36
+
37
+
38
def _fig_to_pil(fig: MplFigure) -> PIL.Image.Image:
    """Rasterise *fig* to an in-memory PNG and return it as an RGB PIL image.

    The figure is closed even when rendering fails, so a failed save does not
    leave the figure registered with pyplot.
    """
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format="png", dpi=120, bbox_inches="tight",
                    facecolor=fig.get_facecolor())
    finally:
        plt.close(fig)
    buf.seek(0)
    # convert() forces the lazy PIL loader to decode the PNG from the buffer.
    return PIL.Image.open(buf).convert("RGB")
45
+
46
+
47
def _draw_maze_frame(
    ax: MplAxes,
    grid: np.ndarray,
    path: list[tuple[int, int]],
    current_step: int,
    H: int, W: int,
    show_heatmap: np.ndarray | None = None,
) -> None:
    """Draw one frame of the maze onto *ax*.

    Args:
        ax: Axes to draw on; its limits, aspect and frame are reset here.
        grid: Wall grid; cells equal to 1 are drawn as walls.
        path: Sequence of (row, col) positions visited by the agent.
        current_step: Number of leading ``path`` entries to show; the last of
            them is drawn as the agent's current position.
        H, W: Grid height and width.
        show_heatmap: Optional per-cell values; when given, non-wall cells are
            coloured by the value clipped to [0, 1].
    """
    ax.set_facecolor(BG)
    ax.set_xlim(-0.5, W - 0.5)
    ax.set_ylim(H - 0.5, -0.5)  # inverted y so row 0 is at the top
    ax.set_aspect("equal")
    ax.axis("off")

    # Look the colormap up once, not per cell. plt.get_cmap is used because
    # matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
    heat_cmap = plt.get_cmap("YlOrRd") if show_heatmap is not None else None

    # Draw cells
    for r in range(H):
        for c in range(W):
            is_wall = grid[r, c] == 1
            if heat_cmap is not None and not is_wall:
                intensity = np.clip(float(show_heatmap[r, c]), 0, 1)
                color = heat_cmap(0.2 + 0.8 * intensity)
            else:
                color = "#2d333b" if is_wall else BG2
            ax.add_patch(patches.FancyBboxPatch(
                (c - 0.45, r - 0.45), 0.90, 0.90,
                boxstyle="round,pad=0.04",
                facecolor=color,
                edgecolor=GRID_C, linewidth=0.4,
            ))

    # Start & goal markers sit on the top-left and bottom-right corners.
    ax.add_patch(patches.Circle((0, 0), 0.35, color=START_C, zorder=3))
    ax.add_patch(patches.Circle((W - 1, H - 1), 0.35, color=GOAL_C, zorder=3))
    ax.text(0, 0, "S", ha="center", va="center", color="white",
            fontsize=7, fontweight="bold", zorder=4)
    ax.text(W - 1, H - 1, "G", ha="center", va="center", color="white",
            fontsize=7, fontweight="bold", zorder=4)

    # Walked path (up to current_step)
    walked = path[:current_step]
    if len(walked) > 1:
        xs = [c for _, c in walked]
        ys = [r for r, _ in walked]
        ax.plot(xs, ys, color=PATH_C, linewidth=2.5, alpha=0.7,
                zorder=2, solid_capstyle="round")

    # Current agent position
    if walked:
        ar, ac = walked[-1]
        ax.add_patch(patches.Circle((ac, ar), 0.32, color=AGENT_C, zorder=5))
        ax.text(ac, ar, "●", ha="center", va="center",
                color="white", fontsize=8, zorder=6)
101
+
102
+
103
def make_solution_gif(
    env: MazeEnv,
    agent: TabularAgent,
    fps: int = 8,
    label: str = "",
) -> str:
    """Run a greedy rollout and save it as an animated GIF.

    The agent follows its greedy policy from the start cell for at most
    ``env.n_states * 3`` moves, stopping early if the environment reports
    termination. One frame is rendered per visited position, including the
    starting position.

    Args:
        env: Maze environment to roll out in (it is reset first).
        agent: Agent whose ``greedy_action`` picks each move.
        fps: Playback speed in frames per second; values below 1 are
            treated as 1.
        label: Text shown in each frame title before the step counter.

    Returns:
        Path of the GIF file. The file is created with ``delete=False``, so
        the caller is responsible for removing it.
    """
    # Collect path
    state, _ = env.reset()
    path: list[tuple[int, int]] = [env.start]
    for _ in range(env.n_states * 3):
        action = agent.greedy_action(state)
        state, _, done, _, _ = env.step(action)
        path.append(env._from_state(state))
        if done:
            break

    H, W = env.shape
    total_moves = len(path) - 1
    solved = path[-1] == env.goal
    pil_frames: list[PIL.Image.Image] = []

    for shown in range(1, len(path) + 1):
        fig, ax = plt.subplots(figsize=(max(4, W * 0.55), max(4, H * 0.55)))
        fig.patch.set_facecolor(BG)
        _draw_maze_frame(ax, env.grid, path, shown, H, W)

        # A frame showing `shown` positions reflects `shown - 1` moves, so
        # the counter runs 0..total_moves and never exceeds the total.
        moves_done = shown - 1
        if solved and moves_done == total_moves:
            status = f"🎉 Solved in {total_moves} steps!"
        else:
            status = f"Step {moves_done}/{total_moves}"
        ax.set_title(f"{label}  {status}", color=TEXT_C, fontsize=9,
                     pad=6, fontfamily="monospace")
        pil_frames.append(_fig_to_pil(fig))

    # Reserve a unique file name, then close the handle before PIL writes to
    # it, so no descriptor is leaked and the path can be reopened.
    with tempfile.NamedTemporaryFile(suffix=".gif", delete=False) as tmp:
        gif_path = tmp.name
    pil_frames[0].save(
        gif_path, save_all=True, append_images=pil_frames[1:],
        duration=int(1000 / max(fps, 1)), loop=0, optimize=False,
    )
    return gif_path
141
+
142
+
143
def make_training_chart(
    rewards_dict: dict[str, list[float]],
    colors: dict[str, str] | None = None,
) -> MplFigure:
    """Plot reward-per-episode curves, one series per algorithm name.

    Each series is drawn as a faint raw line. Series longer than 20 episodes
    also get a bold moving-average line; shorter ones get the raw data
    redrawn in bold. A legend appears only when several series are plotted.
    """
    palette = colors if colors else {
        "Q-Learning": "#3fb950",
        "SARSA": "#58a6ff",
        "Monte Carlo": "#ffa657",
    }

    fig, ax = plt.subplots(figsize=(10, 4))
    fig.patch.set_facecolor(BG)
    ax.set_facecolor(BG2)

    for name, rewards in rewards_dict.items():
        line_color = palette.get(name, "#ffffff")
        n_eps = len(rewards)
        episodes = list(range(n_eps))
        ax.plot(episodes, rewards, color=line_color, linewidth=0.8, alpha=0.3)

        if n_eps <= 20:
            # Too short to smooth meaningfully: emphasise the raw curve.
            ax.plot(episodes, rewards, color=line_color, linewidth=2.0, label=name)
            continue

        window = max(5, n_eps // 40)
        kernel = np.ones(window) / window
        smoothed = np.convolve(rewards, kernel, "valid")
        # "valid" mode drops window-1 samples; align to each window's end.
        ax.plot(range(window - 1, n_eps), smoothed,
                color=line_color, linewidth=2.2, label=name)

    ax.set_title("Training Progress — Reward per Episode", color=TEXT_C,
                 fontsize=11, pad=10, fontfamily="monospace")
    ax.set_xlabel("Episode", color=DIM_C, fontsize=9)
    ax.set_ylabel("Total Reward", color=DIM_C, fontsize=9)
    ax.tick_params(colors=DIM_C, labelsize=8)
    for border in ax.spines.values():
        border.set_color(GRID_C)
    ax.grid(color=GRID_C, linewidth=0.5, linestyle="--", alpha=0.5)

    several_series = len(rewards_dict) > 1
    if several_series:
        ax.legend(facecolor=BG2, edgecolor=GRID_C, labelcolor=TEXT_C, fontsize=9)

    fig.tight_layout()
    return fig
178
+
179
+
180
def make_qvalue_heatmap(env: MazeEnv, agent: TabularAgent, title: str = "") -> MplFigure:
    """Draw the greedy path next to a per-cell heatmap of the best Q-value.

    The left panel shows the maze with the agent's greedy rollout (at most
    ``env.n_states * 3`` moves). The right panel colours every open cell by
    ``max_a Q[cell, a]``, and prints the value when the grid has at most 11
    rows.

    Args:
        env: Maze environment (it is reset and stepped for the rollout).
        agent: Agent providing ``Q`` and ``greedy_action``.
        title: Optional figure-level title.
    """
    H, W = env.shape
    max_q = np.max(agent.Q, axis=1).reshape(H, W)

    # Normalise to [0, 1] using open cells only. MazeEnv.step never moves the
    # agent into a wall cell, so a zero-initialised Q row for a wall state is
    # never the current state of an update; including those zeros would
    # distort the colour scale.
    open_vals = max_q[env.grid == 0]
    qmin, qmax = open_vals.min(), open_vals.max()
    norm_q = np.clip((max_q - qmin) / max(qmax - qmin, 1e-8), 0, 1)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, max(4, H * 0.6)))
    fig.patch.set_facecolor(BG)

    # Left: maze with learned path
    ax1.set_facecolor(BG)
    state, _ = env.reset()
    path: list[tuple[int, int]] = [env.start]
    for _ in range(env.n_states * 3):
        action = agent.greedy_action(state)
        state, _, done, _, _ = env.step(action)
        path.append(env._from_state(state))
        if done:
            break
    _draw_maze_frame(ax1, env.grid, path, len(path), H, W)
    solved = path[-1] == env.goal
    ax1.set_title(
        f"{'Solved' if solved else 'Not solved'} — {len(path)-1} steps",
        color=TEXT_C, fontsize=10, pad=8, fontfamily="monospace",
    )

    # Right: Q-value heatmap
    ax2.set_facecolor(BG)
    ax2.set_xlim(-0.5, W - 0.5)
    ax2.set_ylim(H - 0.5, -0.5)  # inverted y so row 0 is at the top
    ax2.set_aspect("equal")
    ax2.axis("off")

    cmap = LinearSegmentedColormap.from_list("qmap", ["#0d1117", "#1f6feb", "#3fb950"])
    show_numbers = H <= 11  # value labels become unreadable on larger grids
    for r in range(H):
        for c in range(W):
            is_wall = env.grid[r, c] == 1
            color = "#2d333b" if is_wall else cmap(norm_q[r, c])
            ax2.add_patch(patches.FancyBboxPatch(
                (c - 0.45, r - 0.45), 0.90, 0.90,
                boxstyle="round,pad=0.04",
                facecolor=color, edgecolor=GRID_C, linewidth=0.4,
            ))
            if env.grid[r, c] == 0 and show_numbers:
                ax2.text(c, r, f"{max_q[r,c]:.0f}",
                         ha="center", va="center", fontsize=5.5,
                         color="white" if norm_q[r, c] > 0.4 else DIM_C)

    ax2.set_title("Q-Value Map  (brighter = agent prefers this cell)",
                  color=TEXT_C, fontsize=10, pad=8, fontfamily="monospace")

    if title:
        fig.suptitle(title, color=TEXT_C, fontsize=12, fontfamily="monospace", y=1.02)
    fig.tight_layout()
    return fig
241
+
242
+
243
def make_race_chart(
    rewards_a: list[float], name_a: str,
    rewards_b: list[float], name_b: str,
    rewards_c: list[float] | None = None, name_c: str = "",
) -> MplFigure:
    """Build the head-to-head comparison figure for two or three algorithms.

    The left panel overlays each algorithm's reward curve (raw, plus a moving
    average when there are more than 20 episodes). The right panel is a bar
    chart of each algorithm's mean reward over its last 20% of episodes.

    Args:
        rewards_a, rewards_b: Per-episode rewards of the first two algorithms.
        name_a, name_b: Their display names.
        rewards_c: Optional rewards of a third algorithm; omitted from the
            chart when ``None`` or empty.
        name_c: Display name of the third algorithm.
    """
    fig, axes = plt.subplots(1, 2, figsize=(13, 4))
    fig.patch.set_facecolor(BG)

    pairs = [(rewards_a, name_a, "#3fb950"), (rewards_b, name_b, "#58a6ff")]
    if rewards_c:
        pairs.append((rewards_c, name_c, "#ffa657"))

    # Left: raw + smoothed reward curves
    ax = axes[0]
    ax.set_facecolor(BG2)
    for rewards, name, col in pairs:
        eps = list(range(len(rewards)))
        ax.plot(eps, rewards, color=col, linewidth=0.6, alpha=0.25)
        if len(eps) > 20:
            k = max(5, len(eps) // 40)
            smooth = np.convolve(rewards, np.ones(k) / k, "valid")
            ax.plot(range(k - 1, len(eps)), smooth, color=col, linewidth=2.2, label=name)
        else:
            # Short runs get a bold raw line so every series has a legend
            # entry, matching make_training_chart.
            ax.plot(eps, rewards, color=col, linewidth=2.0, label=name)
    ax.set_title("Reward Convergence", color=TEXT_C, fontsize=10, fontfamily="monospace")
    ax.set_xlabel("Episode", color=DIM_C, fontsize=8)
    ax.set_ylabel("Reward", color=DIM_C, fontsize=8)
    ax.tick_params(colors=DIM_C, labelsize=7)
    for spine in ax.spines.values():
        spine.set_color(GRID_C)
    ax.grid(color=GRID_C, linewidth=0.5, linestyle="--", alpha=0.5)
    ax.legend(facecolor=BG2, edgecolor=GRID_C, labelcolor=TEXT_C, fontsize=8)

    # Right: bar chart of final performance
    ax2 = axes[1]
    ax2.set_facecolor(BG2)
    names = [p[1] for p in pairs]
    # Mean of the last fifth of each run; an empty run scores 0.0, not NaN.
    finals = [
        float(np.mean(p[0][-max(1, len(p[0]) // 5):])) if len(p[0]) else 0.0
        for p in pairs
    ]
    cols = [p[2] for p in pairs]
    bars = ax2.bar(names, finals, color=cols, edgecolor=BG, linewidth=0.8, width=0.5)

    # Put each label just beyond the free end of its bar: above positive
    # bars, below negative ones.
    offset = (max(abs(v) for v in finals) or 1.0) * 0.02
    for bar, val in zip(bars, finals):
        above = val >= 0
        ax2.text(bar.get_x() + bar.get_width() / 2,
                 val + offset if above else val - offset,
                 f"{val:.1f}", ha="center", va="bottom" if above else "top",
                 color=TEXT_C, fontsize=9, fontweight="bold")
    ax2.set_title("Final Performance (avg last 20%)", color=TEXT_C,
                  fontsize=10, fontfamily="monospace")
    ax2.tick_params(colors=DIM_C, labelsize=8)
    for spine in ax2.spines.values():
        spine.set_color(GRID_C)
    ax2.grid(axis="y", color=GRID_C, linewidth=0.5, linestyle="--", alpha=0.5)

    fig.tight_layout(pad=2.0)
    return fig
297
+
298
+
299
def score_run(path: list[tuple[int, ...]], goal: tuple[int, int],
              rewards: list[float], n_states: int) -> dict:
    """Grade a rollout by whether it reached the goal and how direct it was.

    Efficiency compares the number of moves taken with the Manhattan distance
    from the (0, 0) start corner to ``goal``. That distance is a true lower
    bound on the move count, so a shortest route through an unobstructed grid
    scores 100. Mazes whose walls force detours will score lower even for
    their best possible route.

    Args:
        path: Visited (row, col) positions, starting position included.
        goal: (row, col) of the goal cell.
        rewards: Per-episode training rewards; the last 20% are averaged.
        n_states: Number of maze cells. Kept for interface compatibility;
            the efficiency estimate no longer uses it.

    Returns:
        Dict with keys ``solved``, ``steps``, ``grade``, ``verdict``,
        ``efficiency`` (0-100) and ``avg_reward``.
    """
    solved = bool(path and path[-1] == goal)
    steps = max(len(path) - 1, 0)  # an empty path means zero moves, not -1

    # Fewest moves that could connect (0, 0) to the goal, floored at 1 to
    # keep the ratio defined when the goal is the start cell.
    min_moves = max(int(goal[0]) + int(goal[1]), 1)
    efficiency = min(100, int(min_moves / max(steps, 1) * 100)) if solved else 0
    avg_r = float(np.mean(rewards[-max(1, len(rewards) // 5):])) if rewards else 0.0

    if not solved:
        grade, verdict = "F", "Bot got lost — try more episodes!"
    elif efficiency >= 80:
        grade, verdict = "S", "Perfect pathfinding! Optimal route."
    elif efficiency >= 60:
        grade, verdict = "A", "Excellent! Near-optimal path."
    elif efficiency >= 40:
        grade, verdict = "B", "Good solve — some detours taken."
    else:
        grade, verdict = "C", "Solved but took the scenic route!"

    return {"solved": solved, "steps": steps, "grade": grade,
            "verdict": verdict, "efficiency": efficiency, "avg_reward": avg_r}