"""
🤖 Maze Runner — RL Playground

An interactive, fun maze-solving playground powered by Reinforcement Learning.
Anyone can build a maze, pick a brain, and watch the bot learn to escape.
"""
from __future__ import annotations

import numpy as np
import gradio as gr

from maze.generator import generate_dfs_maze, generate_open_maze
from maze.env import MazeEnv
from agents.qlearning import train_qlearning
from agents.sarsa import train_sarsa
from agents.montecarlo import train_montecarlo
from viz.renderer import (
    make_solution_gif,
    make_training_chart,
    make_qvalue_heatmap,
    make_race_chart,
    score_run,
)

# -- Helpers ------------------------------------------------------------------

# NOTE(review): the non-ASCII prefixes in the three label mappings below look
# mis-encoded (UTF-8 text decoded with a Thai/Latin code page). They are kept
# verbatim on purpose: the callbacks look these labels up as exact dict keys.

# UI label -> internal algorithm id understood by `_train`.
ALGO_MAP = {
    "๐Ÿง  Q-Learning (recommended)": "qlearning",
    "๐ŸŽฏ SARSA (cautious)": "sarsa",
    "๐ŸŽฒ Monte Carlo (explorer)": "montecarlo",
}

# UI label -> maze size passed to the generators.
DIFFICULTY = {
    "๐Ÿฃ Tiny (5ร—5)": 5,
    "๐Ÿ‡ Small (7ร—7)": 7,
    "๐Ÿข Medium (9ร—9)": 9,
    "๐ŸฆŠ Large (13ร—13)": 13,
    "๐Ÿ‰ XL (17ร—17)": 17,
}

# UI label -> internal maze style id understood by `_make_env`.
MAZE_STYLE = {
    "๐Ÿฐ Corridors (DFS)": "dfs",
    "๐ŸŒฟ Open Field (random walls)": "open",
}


def _make_env(size: int, style: str, seed: int) -> MazeEnv:
    """Generate a maze grid and wrap it in a ``MazeEnv``.

    ``style == "dfs"`` uses ``generate_dfs_maze``; any other value falls back
    to ``generate_open_maze`` with ``wall_frac=0.18``.
    """
    if style == "dfs":
        return MazeEnv(generate_dfs_maze(size, seed=seed))
    return MazeEnv(generate_open_maze(size, wall_frac=0.18, seed=seed))


def _train(env: MazeEnv, algo: str, episodes: int,
           alpha: float, gamma: float, decay: float, seed: int):
    """Run the training routine registered under ``algo`` and return its result.

    Raises:
        KeyError: If ``algo`` is not "qlearning", "sarsa" or "montecarlo".
    """
    trainers = {
        "qlearning": train_qlearning,
        "sarsa": train_sarsa,
        "montecarlo": train_montecarlo,
    }
    train_fn = trainers[algo]
    return train_fn(env, episodes, alpha, gamma, decay, seed)


def _collect_path(env: MazeEnv, agent) -> list[tuple[int, ...]]:
    """Follow the agent's greedy actions from a reset and record visited cells.

    The walk starts at ``env.start`` and stops when ``env.step`` reports done
    (third element of its return value) or after ``env.n_states * 3`` steps,
    whichever comes first.
    """
    state, _ = env.reset()
    visited: list[tuple[int, ...]] = [env.start]
    step_budget = env.n_states * 3  # hard cap so a looping policy still ends
    steps_taken = 0
    while steps_taken < step_budget:
        steps_taken += 1
        chosen = agent.greedy_action(state)
        state, _reward, done, _flag, _info = env.step(chosen)
        visited.append(env._from_state(state))
        if done:
            break
    return visited


# -- Main Playground callback -------------------------------------------------
def cb_solve(
    difficulty: str,
    maze_style: str,
    algo_label: str,
    episodes: int,
    alpha: float,
    gamma: float,
    decay: float,
    seed: int,
    progress: gr.Progress = gr.Progress(),
):
    """Train one agent on a generated maze and build every Playground output.

    Args:
        difficulty: A key of ``DIFFICULTY`` (selects the maze size).
        maze_style: A key of ``MAZE_STYLE`` (selects the maze generator).
        algo_label: A key of ``ALGO_MAP`` (selects the training routine).
        episodes: Number of training episodes; coerced with ``int``.
        alpha: Coerced with ``float`` and forwarded to ``_train``.
        gamma: Coerced with ``float`` and forwarded to ``_train``.
        decay: Coerced with ``float`` and forwarded to ``_train``.
        seed: Coerced with ``int``; used for maze generation and training.
        progress: Gradio progress tracker, updated between stages.

    Returns:
        ``(gif_path, train_fig, heatmap_fig, stats_md)``, matching the order
        of the ``outputs`` list wired to the Train button.

    Raises:
        KeyError: If a label is not a key of its mapping.
    """
    progress(0.05, desc="Building maze…")
    size = DIFFICULTY[difficulty]
    style = MAZE_STYLE[maze_style]
    algo = ALGO_MAP[algo_label]
    # Label text before the first "(" - reused for progress, GIF and chart.
    algo_name = algo_label.split("(")[0].strip()
    n_episodes = int(episodes)
    seed = int(seed)

    env = _make_env(size, style, seed)

    progress(0.15, desc=f"Training {algo_name}…")
    agent, rewards = _train(
        env, algo, n_episodes, float(alpha), float(gamma), float(decay), seed
    )

    # Each consumer below gets its own identically seeded environment, exactly
    # as the original did (presumably so stepping in one does not affect the
    # next - confirm against MazeEnv).
    progress(0.75, desc="Rendering solution…")
    gif_path = make_solution_gif(
        _make_env(size, style, seed), agent, fps=7, label=algo_name
    )

    progress(0.85, desc="Building charts…")
    replay_env = _make_env(size, style, seed)
    path = _collect_path(replay_env, agent)
    sc = score_run(path, replay_env.goal, rewards, replay_env.n_states)
    train_fig = make_training_chart({algo_name: rewards})
    heatmap_fig = make_qvalue_heatmap(_make_env(size, style, seed), agent)

    solved_text = "✅ Yes" if sc["solved"] else "❌ No"
    stats_md = f"""
### {sc['grade']} — {sc['verdict']}

| | |
|---|---|
| **Solved** | {solved_text} |
| **Steps taken** | `{sc['steps']}` |
| **Efficiency score** | `{sc['efficiency']}%` |
| **Avg reward (final 20%)** | `{sc['avg_reward']:.1f}` |
| **Episodes trained** | `{n_episodes}` |
| **Maze size** | `{env.shape[0]} × {env.shape[1]}` cells |

> **Efficiency** compares your bot's path length to the ideal shortest path.
> 100% = perfect. 0% = didn't make it.
"""
    progress(1.0, desc="Done!")
    return gif_path, train_fig, heatmap_fig, stats_md


# -- Algorithm Race callback --------------------------------------------------

def _tail_mean(rewards) -> float:
    """Return the mean of the last fifth of ``rewards`` (at least one entry)."""
    tail = max(1, len(rewards) // 5)
    return float(np.mean(rewards[-tail:]))


def cb_race(
    difficulty: str,
    maze_style: str,
    episodes: int,
    run_mc: bool,
    progress: gr.Progress = gr.Progress(),
):
    """Train the algorithms on the same maze and report which scored best.

    Q-Learning and SARSA always run; Monte Carlo runs only when ``run_mc`` is
    true. All runs use seed 77 and the hyperparameters 0.1 / 0.95 / 0.995
    (alpha / gamma / decay). The winner is the algorithm with the highest mean
    reward over the last 20% of its episodes.

    Returns:
        ``(fig, result_md)``: the race chart and a Markdown summary table.

    Raises:
        KeyError: If ``difficulty`` or ``maze_style`` is not a known label.
    """
    size = DIFFICULTY[difficulty]
    style = MAZE_STYLE[maze_style]
    seed = 77
    n_episodes = int(episodes)
    hyper = (0.1, 0.95, 0.995)  # alpha, gamma, decay - shared by every runner

    progress(0.1, desc="Training Q-Learning…")
    _, rq = _train(_make_env(size, style, seed), "qlearning", n_episodes, *hyper, seed)

    progress(0.4, desc="Training SARSA…")
    _, rs = _train(_make_env(size, style, seed), "sarsa", n_episodes, *hyper, seed)

    rc, name_c = None, ""
    if run_mc:
        progress(0.65, desc="Training Monte Carlo…")
        _, rc = _train(_make_env(size, style, seed), "montecarlo", n_episodes, *hyper, seed)
        name_c = "Monte Carlo"

    progress(0.9, desc="Building race chart…")
    fig = make_race_chart(rq, "Q-Learning", rs, "SARSA", rc, name_c)

    # Winner
    scores = {"Q-Learning": _tail_mean(rq), "SARSA": _tail_mean(rs)}
    # Explicit None/length test: `if rc:` raises ValueError when the rewards
    # are a multi-element NumPy array and is only accidentally right for lists.
    if rc is not None and len(rc) > 0:
        scores["Monte Carlo"] = _tail_mean(rc)
    winner = max(scores, key=scores.get)

    table_rows = "\n".join(
        f"| {'🥇 ' if name == winner else ''}{name} | `{score:.1f}` |"
        for name, score in scores.items()
    )
    result_md = f"""
### 🏆 Race Result

| Algorithm | Final Score |
|---|---|
{table_rows}

**Winner: {winner}** with a final average reward of `{scores[winner]:.1f}`

> All algorithms trained on the same maze with identical hyperparameters.
> Final score = average reward over the last 20% of episodes.
"""
    progress(1.0)
    return fig, result_md


# -- CSS ----------------------------------------------------------------------

# Dark theme stylesheet handed to Gradio in the __main__ guard at the bottom.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&family=JetBrains+Mono:wght@400;600&display=swap');

*, *::before, *::after { box-sizing: border-box; }
body, .gradio-container { background: #0d1117 !important; color: #c9d1d9 !important; font-family: 'Inter', sans-serif !important; }
.gradio-container { max-width: 1100px !important; margin: 0 auto !important; }

/* Hero */
.hero { text-align:center; padding:2rem 1rem 1rem; }
.hero-title { font-size: clamp(2rem, 5vw, 3rem); font-weight: 700; background: linear-gradient(135deg, #3fb950, #58a6ff, #ffa657); -webkit-background-clip: text; -webkit-text-fill-color: transparent; margin: 0 0 0.4rem; }
.hero-sub { color: #484f58; font-size: 0.95rem; letter-spacing:0.02em; }

/* Tabs */
.tab-nav { border-bottom: 1px solid #21262d !important; background: transparent !important; }
.tab-nav button { font-family: 'Inter', sans-serif !important; font-size: 0.85rem !important; font-weight: 500 !important; color: #484f58 !important; background: transparent !important; border: none !important; padding: 0.7rem 1.1rem !important; }
.tab-nav button.selected { color: #3fb950 !important; border-bottom: 2px solid #3fb950 !important; }
.tab-nav button:hover { color: #8b949e !important; }

/* Cards */
.info-card { background: #161b22; border: 1px solid #21262d; border-radius: 10px; padding: 1.1rem; }
.info-card-icon { font-size: 1.8rem; margin-bottom:0.4rem; }
.info-card-title { font-weight: 600; color: #e6edf3; font-size: 0.95rem; margin-bottom:0.3rem; }
.info-card-body { color: #8b949e; font-size: 0.83rem; line-height: 1.6; }

/* Algo cards */
.algo-card { background: #161b22; border: 1px solid #21262d; border-radius: 10px; padding: 1rem; margin-bottom: 0.5rem; }
.algo-name { font-weight: 600; color: #e6edf3; font-size: 0.9rem; }
.algo-desc { color: #8b949e; font-size: 0.8rem; line-height:1.5; margin-top:0.2rem; }
.algo-tag { display: inline-block; font-size: 0.68rem; padding: 2px 8px; border-radius: 20px; margin-top: 0.4rem; }
.tag-green { background:#0d2d17; color:#3fb950; border:1px solid #1a5c2e; }
.tag-blue { background:#0d1f38; color:#58a6ff; border:1px solid #1a4a7a; }
.tag-orange { background:#2d1c06; color:#ffa657; border:1px solid #5c3a12; }

/* Grade badge */
.grade-badge { display:inline-block; font-size:2.5rem; font-weight:700; font-family:'JetBrains Mono',monospace; }

/* Buttons */
button.primary { font-family: 'Inter', sans-serif !important; font-weight: 600 !important; background: linear-gradient(135deg, #238636, #2ea043) !important; color: #ffffff !important; border: none !important; border-radius: 6px !important; font-size: 0.9rem !important; transition: opacity 0.2s !important; }
button.primary:hover { opacity: 0.85 !important; }
button.secondary { background: #161b22 !important; color: #58a6ff !important; border: 1px solid #30363d !important; border-radius: 6px !important; font-family: 'Inter', sans-serif !important; }
button.stop { background: #1c0d0d !important; color: #f78166 !important; border: 1px solid #6e2b2b !important; border-radius: 6px !important; }

/* Labels */
label span { font-family:'Inter',sans-serif !important; font-size:0.82rem !important; color:#8b949e !important; }

/* Slider */
input[type=range] { -webkit-appearance:none; height:4px; background:#21262d; border-radius:2px; }
input[type=range]::-webkit-slider-thumb { -webkit-appearance:none; width:16px; height:16px; border-radius:50%; background:#3fb950; cursor:pointer; border:2px solid #0d1117; }

/* Textarea */
textarea { font-family:'JetBrains Mono',monospace !important; font-size:0.82rem !important; background:#0d1117 !important; color:#3fb950 !important; border:1px solid #21262d !important; border-radius:6px !important; }

/* Markdown */
.gradio-container h2 { color: #3fb950 !important; }
.gradio-container h3 { color: #58a6ff !important; }
.gradio-container p { color: #8b949e !important; }
table { width:100%; border-collapse:collapse; }
th { background:#161b22; color:#3fb950; font-size:0.78rem; text-align:left; padding:8px 12px; border-bottom:1px solid #21262d; }
td { padding:8px 12px; border-bottom:1px solid #0d1117; color:#e6edf3; font-size:0.85rem; }
blockquote { border-left:3px solid #3fb950; padding-left:1rem; color:#484f58 !important; margin:0.5rem 0; }
footer { display:none !important; }
.gradio-container .block { background:transparent !important; border:none !important; }
"""


# -- Static page content ------------------------------------------------------
# NOTE(review): these literals reached review as plain text - the HTML tags
# that would use the classes in CSS (.hero, .info-card, .algo-card, ...) seem
# to have been stripped. The text is kept as found; restore the markup from
# version control rather than guessing it.

_HERO_HTML = """
🤖 Maze Runner
An AI that learns to escape mazes — watch it happen in real time
"""

_WELCOME_INTRO_HTML = """

A tiny AI robot is dropped into a maze. It knows nothing. Through thousands of attempts — hitting walls, finding dead ends, occasionally stumbling upon the exit — it slowly builds a mental map and learns the perfect escape route.

"""

_WELCOME_CARDS_HTML = """
🗺️
The Maze
A grid of corridors and walls. The bot starts at S and must reach G. It can only see its own position — no map, no cheating.
🤖
The Bot
At each step it chooses: go up, down, left, or right. Hit a wall? Penalty. Reach the goal? Big reward! It remembers what worked and what didn't.
🧠
The Learning
Each attempt updates a "score table" for every position and move. After enough tries, the bot always picks the move with the highest score — the optimal path.
"""

_WELCOME_BRAINS_HTML = """
🧠 Choose your Bot's Brain
Q-Learning
Updates its score table immediately after every step. Fast learner. Best for most mazes.
⚡ Recommended
SARSA
Updates based on the move it actually took next, not just the best possible. More cautious, avoids risky paths.
🎯 Cautious
Monte Carlo
Plays out the entire episode first, then updates everything at once. Needs more episodes to converge.
🎲 Explorer
"""

_WELCOME_HOWTO_HTML = """
🗺️ How to use this app
  1. Go to 🎮 Playground tab
  2. Pick a difficulty and maze style
  3. Choose a brain and hit Train & Watch!
  4. Watch the animated replay
  5. Try 🏁 Algorithm Race to compare all three
💡 Fun facts
"""

_PLAYGROUND_INTRO_HTML = """
Build a maze, pick a brain, watch it learn
The animated replay shows the final learned path after training.
"""

_RACE_INTRO_HTML = """
Head-to-head: which brain learns fastest?
All algorithms train on the same maze with identical settings — the only variable is the learning strategy.
"""

_RACE_CARDS_HTML = """
Q-Learning
Off-policy · Fast update · Optimistic
SARSA
On-policy · Careful update · Conservative
Monte Carlo
Episodic · Full return · Unbiased
"""

_HOW_IT_WORKS_HTML = """

The Big Idea

The bot doesn't know anything about the maze at the start. It just knows 4 possible moves and gets a number (reward) after each step. Negative number = bad move. Positive = good move. The goal: find the sequence of moves that gets the most reward.

The Score Table (Q-Table)

The bot keeps a table with one row per maze cell and 4 columns (one per direction). Each entry stores how good it thinks that move is from that cell. At the start, everything is 0. After training, the table holds the bot's entire learned strategy. The Q-value heatmap in the Playground shows this table visually.

Q[current_cell][move] += learning_speed × (
  reward_got + future_discount × best_Q[next_cell] − Q[current_cell][move]
)

Exploration vs Exploitation

Early in training, the bot tries random moves (exploration) — it doesn't know enough to trust its table yet. Over time, it relies more on what it's learned (exploitation). This is controlled by epsilon (ε), which starts near 1.0 (100% random) and decays toward 0 (always use best known move).

Why does reward go negative first?

Each step costs −1 (time penalty) and hitting a wall costs −5. A random bot hits a lot of walls and takes forever to find the exit, so early rewards are very negative. As it learns, fewer walls are hit and the path shortens — reward climbs toward 0 and eventually turns positive when it reliably reaches the goal.

Q-Learning vs SARSA
Q-Learning always updates toward the best possible next action — even if it wouldn't actually take that action. SARSA updates toward the action it will actually take. This makes SARSA more cautious near walls.
Why Monte Carlo is slow
MC waits until the episode ends before updating any scores. On large mazes where early episodes never reach the goal, it gets zero signal for a long time. But once it starts solving, its estimates are very accurate.
"""

_FOOTER_HTML = """
Built with Q-Learning · SARSA · Monte Carlo · Gymnasium · Gradio
"""


def _label_for(mapping: dict, value) -> str:
    """Return the first label in ``mapping`` whose value equals ``value``.

    Used for Radio defaults so they cannot drift from the mapping's keys.

    Raises:
        ValueError: If no entry maps to ``value``.
    """
    for label, mapped in mapping.items():
        if mapped == value:
            return label
    raise ValueError(f"no label maps to {value!r}")


# -- Build UI -----------------------------------------------------------------
# NOTE(review): the nesting of the `with` blocks below was reconstructed from
# a whitespace-collapsed source - confirm the layout against the running app.

with gr.Blocks(title="🤖 Maze Runner — RL Playground") as demo:

    gr.HTML(_HERO_HTML)

    with gr.Tabs():

        # ---- Tab 1 - Welcome ------------------------------------------------
        with gr.Tab("🏠 Welcome"):
            gr.HTML(_WELCOME_INTRO_HTML)
            gr.HTML(_WELCOME_CARDS_HTML)
            gr.HTML(_WELCOME_BRAINS_HTML)
            gr.HTML(_WELCOME_HOWTO_HTML)

        # ---- Tab 2 - Playground ---------------------------------------------
        with gr.Tab("🎮 Playground"):
            gr.HTML(_PLAYGROUND_INTRO_HTML)

            with gr.Row():
                # -- Controls --
                with gr.Column(scale=1, min_width=300):
                    gr.HTML("🗺️ MAZE SETUP")
                    difficulty = gr.Radio(
                        list(DIFFICULTY),
                        value=_label_for(DIFFICULTY, 9),
                        label="Difficulty",
                    )
                    maze_style = gr.Radio(
                        list(MAZE_STYLE),
                        value=_label_for(MAZE_STYLE, "dfs"),
                        label="Maze style",
                        info="Corridors = proper winding paths · Open = random walls",
                    )

                    gr.HTML("🧠 BOT BRAIN")
                    algo = gr.Radio(
                        list(ALGO_MAP),
                        value=_label_for(ALGO_MAP, "qlearning"),
                        label="Algorithm",
                    )

                    gr.HTML("⚙️ TRAINING")
                    episodes = gr.Slider(
                        100, 3000, value=800, step=100,
                        label="Training episodes",
                        info="More = smarter bot, but slower",
                    )

                    with gr.Accordion("🔬 Advanced settings", open=False):
                        alpha = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Learning speed (α)")
                        gamma = gr.Slider(0.5, 0.99, value=0.95, step=0.01, label="Future planning (γ)")
                        decay = gr.Slider(0.90, 0.999, value=0.995, step=0.001, label="Exploration decay")
                        seed = gr.Slider(0, 100, value=42, step=1, label="Random seed")

                    btn_solve = gr.Button("🚀 Train & Watch!", variant="primary")

                # -- Outputs --
                with gr.Column(scale=2):
                    play_stats = gr.Markdown("*Configure your maze and hit Train & Watch!*")
                    with gr.Row():
                        play_gif = gr.Image(
                            label="🎬 Bot solving the maze (animated)",
                            type="filepath",
                            height=360,
                        )
                    with gr.Row():
                        play_train_fig = gr.Plot(label="📈 Training progress")
                        play_heatmap = gr.Plot(label="🌡️ Q-value map (what the bot learned)")

            # hidden state defaults for advanced
            # NOTE(review): none of these four is wired to an event in the
            # visible code - candidates for removal.
            alpha_h = gr.State(0.1)
            gamma_h = gr.State(0.95)
            decay_h = gr.State(0.995)
            seed_h = gr.State(42)

            btn_solve.click(
                cb_solve,
                inputs=[difficulty, maze_style, algo, episodes, alpha, gamma, decay, seed],
                outputs=[play_gif, play_train_fig, play_heatmap, play_stats],
            )

        # ---- Tab 3 - Algorithm Race -----------------------------------------
        with gr.Tab("🏁 Algorithm Race"):
            gr.HTML(_RACE_INTRO_HTML)
            gr.HTML(_RACE_CARDS_HTML)

            with gr.Row():
                with gr.Column(scale=1, min_width=260):
                    race_diff = gr.Radio(
                        list(DIFFICULTY),
                        value=_label_for(DIFFICULTY, 9),
                        label="Maze difficulty",
                    )
                    race_style = gr.Radio(
                        list(MAZE_STYLE),
                        value=_label_for(MAZE_STYLE, "dfs"),
                        label="Maze style",
                    )
                    race_eps = gr.Slider(200, 2000, value=600, step=100, label="Episodes per algorithm")
                    race_mc = gr.Checkbox(label="Include Monte Carlo (slower)", value=True)
                    btn_race = gr.Button("🏁 Start Race!", variant="primary")

                with gr.Column(scale=2):
                    race_result = gr.Markdown("*Click Start Race to run the comparison.*")
                    race_fig = gr.Plot(label="Race Results")

            btn_race.click(
                cb_race,
                inputs=[race_diff, race_style, race_eps, race_mc],
                outputs=[race_fig, race_result],
            )

        # ---- Tab 4 - How it Works -------------------------------------------
        with gr.Tab("🧠 How it Works"):
            gr.HTML(_HOW_IT_WORKS_HTML)

    gr.HTML(_FOOTER_HTML)


if __name__ == "__main__":
    # NOTE(review): passing `css` to launch() presumably targets a Gradio
    # release that accepts it there; older releases take it on gr.Blocks(...)
    # instead - confirm against the pinned Gradio version.
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, css=CSS)