""" ๐ค Maze Runner โ RL Playground An interactive, fun maze-solving playground powered by Reinforcement Learning. Anyone can build a maze, pick a brain, and watch the bot learn to escape. """ from __future__ import annotations import numpy as np import gradio as gr from maze.generator import generate_dfs_maze, generate_open_maze from maze.env import MazeEnv from agents.qlearning import train_qlearning from agents.sarsa import train_sarsa from agents.montecarlo import train_montecarlo from viz.renderer import ( make_solution_gif, make_training_chart, make_qvalue_heatmap, make_race_chart, score_run, ) # โโ Helpers โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ ALGO_MAP = { "๐ง Q-Learning (recommended)": "qlearning", "๐ฏ SARSA (cautious)": "sarsa", "๐ฒ Monte Carlo (explorer)": "montecarlo", } DIFFICULTY = { "๐ฃ Tiny (5ร5)": 5, "๐ Small (7ร7)": 7, "๐ข Medium (9ร9)": 9, "๐ฆ Large (13ร13)": 13, "๐ XL (17ร17)": 17, } MAZE_STYLE = { "๐ฐ Corridors (DFS)": "dfs", "๐ฟ Open Field (random walls)": "open", } def _make_env(size: int, style: str, seed: int) -> MazeEnv: if style == "dfs": grid = generate_dfs_maze(size, seed=seed) else: grid = generate_open_maze(size, wall_frac=0.18, seed=seed) return MazeEnv(grid) def _train(env: MazeEnv, algo: str, episodes: int, alpha: float, gamma: float, decay: float, seed: int): fn = {"qlearning": train_qlearning, "sarsa": train_sarsa, "montecarlo": train_montecarlo}[algo] return fn(env, episodes, alpha, gamma, decay, seed) def _collect_path(env: MazeEnv, agent) -> list[tuple[int, ...]]: state, _ = env.reset() path: list[tuple[int, ...]] = [env.start] for _ in range(env.n_states * 3): action = agent.greedy_action(state) state, _, done, _, _ = env.step(action) path.append(env._from_state(state)) if done: break return path # โโ Main Playground callback โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ def cb_solve( difficulty: str, maze_style: str, algo_label: str, episodes: int, alpha: float, gamma: float, decay: float, 
seed: int, progress: gr.Progress = gr.Progress(), ): progress(0.05, desc="Building mazeโฆ") size = DIFFICULTY[difficulty] style = MAZE_STYLE[maze_style] algo = ALGO_MAP[algo_label] env = _make_env(size, style, int(seed)) progress(0.15, desc=f"Training {algo_label.split('(')[0].strip()}โฆ") agent, rewards = _train(env, algo, int(episodes), float(alpha), float(gamma), float(decay), int(seed)) progress(0.75, desc="Rendering solutionโฆ") env2 = _make_env(size, style, int(seed)) gif_path = make_solution_gif(env2, agent, fps=7, label=algo_label.split("(")[0].strip()) progress(0.85, desc="Building chartsโฆ") env3 = _make_env(size, style, int(seed)) path = _collect_path(env3, agent) sc = score_run(path, env3.goal, rewards, env3.n_states) train_fig = make_training_chart({algo_label.split("(")[0].strip(): rewards}) env4 = _make_env(size, style, int(seed)) heatmap_fig = make_qvalue_heatmap(env4, agent) stats_md = f""" ### {sc['grade']} โ {sc['verdict']} | | | |---|---| | **Solved** | {"โ Yes" if sc['solved'] else "โ No"} | | **Steps taken** | `{sc['steps']}` | | **Efficiency score** | `{sc['efficiency']}%` | | **Avg reward (final 20%)** | `{sc['avg_reward']:.1f}` | | **Episodes trained** | `{int(episodes)}` | | **Maze size** | `{env.shape[0]} ร {env.shape[1]}` cells | > **Efficiency** compares your bot's path length to the ideal shortest path. > 100% = perfect. 0% = didn't make it. 
""" progress(1.0, desc="Done!") return gif_path, train_fig, heatmap_fig, stats_md # โโ Algorithm Race callback โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ def cb_race( difficulty: str, maze_style: str, episodes: int, run_mc: bool, progress: gr.Progress = gr.Progress(), ): size = DIFFICULTY[difficulty] style = MAZE_STYLE[maze_style] seed = 77 progress(0.1, desc="Training Q-Learningโฆ") env_q = _make_env(size, style, seed) _, rq = _train(env_q, "qlearning", int(episodes), 0.1, 0.95, 0.995, seed) progress(0.4, desc="Training SARSAโฆ") env_s = _make_env(size, style, seed) _, rs = _train(env_s, "sarsa", int(episodes), 0.1, 0.95, 0.995, seed) rc, name_c = None, "" if run_mc: progress(0.65, desc="Training Monte Carloโฆ") env_m = _make_env(size, style, seed) _, rc = _train(env_m, "montecarlo", int(episodes), 0.1, 0.95, 0.995, seed) name_c = "Monte Carlo" progress(0.9, desc="Building race chartโฆ") fig = make_race_chart(rq, "Q-Learning", rs, "SARSA", rc, name_c) # Winner final_q = float(np.mean(rq[-max(1, len(rq)//5):])) final_s = float(np.mean(rs[-max(1, len(rs)//5):])) scores = {"Q-Learning": final_q, "SARSA": final_s} if rc: scores["Monte Carlo"] = float(np.mean(rc[-max(1, len(rc)//5):])) winner = max(scores, key=lambda k: scores[k]) result_md = f""" ### ๐ Race Result | Algorithm | Final Score | |---|---| {"".join(f"| {'๐ฅ ' if k==winner else ''}{k} | `{v:.1f}` |" + chr(10) for k,v in scores.items())} **Winner: {winner}** with a final average reward of `{scores[winner]:.1f}` > All algorithms trained on the same maze with identical hyperparameters. > Final score = average reward over the last 20% of episodes. 
""" progress(1.0) return fig, result_md # โโ CSS โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ CSS = """ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&family=JetBrains+Mono:wght@400;600&display=swap'); *, *::before, *::after { box-sizing: border-box; } body, .gradio-container { background: #0d1117 !important; color: #c9d1d9 !important; font-family: 'Inter', sans-serif !important; } .gradio-container { max-width: 1100px !important; margin: 0 auto !important; } /* Hero */ .hero { text-align:center; padding:2rem 1rem 1rem; } .hero-title { font-size: clamp(2rem, 5vw, 3rem); font-weight: 700; background: linear-gradient(135deg, #3fb950, #58a6ff, #ffa657); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0 0 0.4rem; } .hero-sub { color: #484f58; font-size: 0.95rem; letter-spacing:0.02em; } /* Tabs */ .tab-nav { border-bottom: 1px solid #21262d !important; background: transparent !important; } .tab-nav button { font-family: 'Inter', sans-serif !important; font-size: 0.85rem !important; font-weight: 500 !important; color: #484f58 !important; background: transparent !important; border: none !important; padding: 0.7rem 1.1rem !important; } .tab-nav button.selected { color: #3fb950 !important; border-bottom: 2px solid #3fb950 !important; } .tab-nav button:hover { color: #8b949e !important; } /* Cards */ .info-card { background: #161b22; border: 1px solid #21262d; border-radius: 10px; padding: 1.1rem; } .info-card-icon { font-size: 1.8rem; margin-bottom:0.4rem; } .info-card-title { font-weight: 600; color: #e6edf3; font-size: 0.95rem; margin-bottom:0.3rem; } .info-card-body { color: #8b949e; font-size: 0.83rem; line-height: 1.6; } /* Algo cards */ .algo-card { background: #161b22; border: 1px solid #21262d; border-radius: 10px; padding: 1rem; margin-bottom: 0.5rem; } .algo-name { font-weight: 600; color: #e6edf3; font-size: 0.9rem; } .algo-desc { color: #8b949e; font-size: 0.8rem; 
line-height:1.5; margin-top:0.2rem; } .algo-tag { display: inline-block; font-size: 0.68rem; padding: 2px 8px; border-radius: 20px; margin-top: 0.4rem; } .tag-green { background:#0d2d17; color:#3fb950; border:1px solid #1a5c2e; } .tag-blue { background:#0d1f38; color:#58a6ff; border:1px solid #1a4a7a; } .tag-orange { background:#2d1c06; color:#ffa657; border:1px solid #5c3a12; } /* Grade badge */ .grade-badge { display:inline-block; font-size:2.5rem; font-weight:700; font-family:'JetBrains Mono',monospace; } /* Buttons */ button.primary { font-family: 'Inter', sans-serif !important; font-weight: 600 !important; background: linear-gradient(135deg, #238636, #2ea043) !important; color: #ffffff !important; border: none !important; border-radius: 6px !important; font-size: 0.9rem !important; transition: opacity 0.2s !important; } button.primary:hover { opacity: 0.85 !important; } button.secondary { background: #161b22 !important; color: #58a6ff !important; border: 1px solid #30363d !important; border-radius: 6px !important; font-family: 'Inter', sans-serif !important; } button.stop { background: #1c0d0d !important; color: #f78166 !important; border: 1px solid #6e2b2b !important; border-radius: 6px !important; } /* Labels */ label span { font-family:'Inter',sans-serif !important; font-size:0.82rem !important; color:#8b949e !important; } /* Slider */ input[type=range] { -webkit-appearance:none; height:4px; background:#21262d; border-radius:2px; } input[type=range]::-webkit-slider-thumb { -webkit-appearance:none; width:16px; height:16px; border-radius:50%; background:#3fb950; cursor:pointer; border:2px solid #0d1117; } /* Textarea */ textarea { font-family:'JetBrains Mono',monospace !important; font-size:0.82rem !important; background:#0d1117 !important; color:#3fb950 !important; border:1px solid #21262d !important; border-radius:6px !important; } /* Markdown */ .gradio-container h2 { color: #3fb950 !important; } .gradio-container h3 { color: #58a6ff !important; } 
.gradio-container p { color: #8b949e !important; } table { width:100%; border-collapse:collapse; } th { background:#161b22; color:#3fb950; font-size:0.78rem; text-align:left; padding:8px 12px; border-bottom:1px solid #21262d; } td { padding:8px 12px; border-bottom:1px solid #0d1117; color:#e6edf3; font-size:0.85rem; } blockquote { border-left:3px solid #3fb950; padding-left:1rem; color:#484f58 !important; margin:0.5rem 0; } footer { display:none !important; } .gradio-container .block { background:transparent !important; border:none !important; } """ # โโ Build UI โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ with gr.Blocks(title="๐ค Maze Runner โ RL Playground") as demo: gr.HTML("""
A tiny AI robot is dropped into a maze. It knows nothing. Through thousands of attempts — hitting walls, finding dead ends, occasionally stumbling upon the exit — it slowly builds a mental map and learns the perfect escape route.
The bot doesn't know anything about the maze at the start. It just knows 4 possible moves and gets a number (reward) after each step. Negative number = bad move. Positive = good move. The goal: find the sequence of moves that gets the most reward.
The bot keeps a table with one row per maze cell and 4 columns (one per direction). Each entry stores how good it thinks that move is from that cell. At the start, everything is 0. After training, the table holds the bot's entire learned strategy. The Q-value heatmap in the Playground shows this table visually.
Early in training, the bot tries random moves (exploration) — it doesn't know enough to trust its table yet. Over time, it relies more on what it's learned (exploitation). This is controlled by epsilon (ε), which starts near 1.0 (100% random) and decays toward 0 (always use best known move).
Each step costs −1 (time penalty) and hitting a wall costs −5. A random bot hits a lot of walls and takes forever to find the exit, so early rewards are very negative. As it learns, fewer walls are hit and the path shortens — reward climbs toward 0 and eventually turns positive when it reliably reaches the goal.