# Spaces: Running  (page-header text captured together with the source; not part of the program)
"""
🤖 Maze Runner — RL Playground
An interactive, fun maze-solving playground powered by Reinforcement Learning.
Anyone can build a maze, pick a brain, and watch the bot learn to escape.
"""
| from __future__ import annotations | |
| import numpy as np | |
| import gradio as gr | |
| from maze.generator import generate_dfs_maze, generate_open_maze | |
| from maze.env import MazeEnv | |
| from agents.qlearning import train_qlearning | |
| from agents.sarsa import train_sarsa | |
| from agents.montecarlo import train_montecarlo | |
| from viz.renderer import ( | |
| make_solution_gif, make_training_chart, | |
| make_qvalue_heatmap, make_race_chart, score_run, | |
| ) | |
# ── Helpers ───────────────────────────────────────────────────────────────────

# UI label shown in the "Algorithm" radio group → short key used to pick a trainer.
ALGO_MAP = {
    "🧠 Q-Learning (recommended)": "qlearning",
    "🎯 SARSA (cautious)": "sarsa",
    "🎲 Monte Carlo (explorer)": "montecarlo",
}

# UI label → maze side length. The label is derived from the size so the text
# "(N×N)" can never drift from the value it maps to. Insertion order is the
# order in which the radio buttons are listed.
DIFFICULTY = {
    f"{icon} {name} ({side}×{side})": side
    for icon, name, side in (
        ("🐣", "Tiny", 5),
        ("🐇", "Small", 7),
        ("🐢", "Medium", 9),
        ("🦊", "Large", 13),
        ("🐉", "XL", 17),
    )
}

# UI label → maze generator key ("dfs" or "open").
MAZE_STYLE = {
    "🏰 Corridors (DFS)": "dfs",
    "🌿 Open Field (random walls)": "open",
}
def _make_env(size: int, style: str, seed: int) -> MazeEnv:
    """Generate a maze grid of the requested style and wrap it in a ``MazeEnv``.

    Args:
        size: Size argument forwarded to the maze generator.
        style: ``"dfs"`` selects ``generate_dfs_maze``; any other value
            selects ``generate_open_maze`` with ``wall_frac=0.18``.
        seed: Seed forwarded to the generator.

    Returns:
        A new ``MazeEnv`` built from the generated grid.
    """
    if style != "dfs":
        return MazeEnv(generate_open_maze(size, wall_frac=0.18, seed=seed))
    return MazeEnv(generate_dfs_maze(size, seed=seed))
def _train(env: MazeEnv, algo: str, episodes: int, alpha: float,
           gamma: float, decay: float, seed: int):
    """Dispatch to the training routine named by ``algo`` and return its result.

    Args:
        env: Environment to train on.
        algo: One of ``"qlearning"``, ``"sarsa"`` or ``"montecarlo"``.
        episodes, alpha, gamma, decay, seed: Forwarded positionally, in this
            order, to the selected trainer.

    Returns:
        Whatever the selected trainer returns (callers in this file unpack it
        as ``(agent, rewards)``).

    Raises:
        KeyError: If ``algo`` is not one of the known keys.
    """
    trainers = {
        "qlearning": train_qlearning,
        "sarsa": train_sarsa,
        "montecarlo": train_montecarlo,
    }
    trainer = trainers[algo]
    return trainer(env, episodes, alpha, gamma, decay, seed)
def _collect_path(env: MazeEnv, agent) -> list[tuple[int, ...]]:
    """Roll out the agent's greedy policy and record every position visited.

    The environment is reset first; the returned path always begins with
    ``env.start`` and gains one entry per step taken.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` (returning a
            five-element tuple), ``start``, ``n_states`` and ``_from_state``.
        agent: Object exposing ``greedy_action(state)``.

    Returns:
        The list of positions, as produced by ``env._from_state``, in visit order.
    """
    state, _ = env.reset()
    path: list[tuple[int, ...]] = [env.start]
    # Hard cap of three steps per state so a policy that loops still returns.
    for _ in range(env.n_states * 3):
        action = agent.greedy_action(state)
        state, _, terminated, truncated, _ = env.step(action)
        path.append(env._from_state(state))
        # Stop on either end-of-episode flag: stepping an environment after it
        # reported truncation is outside the step contract, so the fourth
        # element must not be ignored.
        if terminated or truncated:
            break
    return path
# ── Main Playground callback ──────────────────────────────────────────────────
def cb_solve(
    difficulty: str,
    maze_style: str,
    algo_label: str,
    episodes: int,
    alpha: float,
    gamma: float,
    decay: float,
    seed: int,
    progress: gr.Progress = gr.Progress(),
):
    """Build a maze, train the chosen algorithm on it and render the results.

    Args:
        difficulty: Key of ``DIFFICULTY`` (maze size label).
        maze_style: Key of ``MAZE_STYLE`` (generator label).
        algo_label: Key of ``ALGO_MAP`` (algorithm label).
        episodes: Number of training episodes; coerced with ``int``.
        alpha, gamma, decay: Training hyperparameters; coerced with ``float``.
        seed: Seed for maze generation and training; coerced with ``int``.
        progress: Gradio progress reporter, called at each stage.

    Returns:
        ``(gif_path, train_fig, heatmap_fig, stats_md)`` — the solution
        animation, the training chart, the Q-value heatmap and a Markdown
        summary of the run.

    Raises:
        KeyError: If any of the three labels is not a known key.
    """
    progress(0.05, desc="Building maze…")
    size = DIFFICULTY[difficulty]
    style = MAZE_STYLE[maze_style]
    algo = ALGO_MAP[algo_label]
    seed = int(seed)
    # Label without its parenthesised hint, e.g. "🧠 Q-Learning".
    short_label = algo_label.split("(")[0].strip()

    env = _make_env(size, style, seed)
    progress(0.15, desc=f"Training {short_label}…")
    agent, rewards = _train(env, algo, int(episodes), float(alpha),
                            float(gamma), float(decay), seed)

    # Each consumer below receives its own identically seeded environment, as
    # in the original flow — presumably so one rollout cannot leave state
    # behind for the next; confirm against MazeEnv before sharing an instance.
    progress(0.75, desc="Rendering solution…")
    gif_path = make_solution_gif(_make_env(size, style, seed), agent,
                                 fps=7, label=short_label)

    progress(0.85, desc="Building charts…")
    score_env = _make_env(size, style, seed)
    path = _collect_path(score_env, agent)
    sc = score_run(path, score_env.goal, rewards, score_env.n_states)
    train_fig = make_training_chart({short_label: rewards})
    heatmap_fig = make_qvalue_heatmap(_make_env(size, style, seed), agent)

    # The (empty) header row needs two cells to match the two-column delimiter
    # row; with a single cell the block is not recognised as a table.
    stats_md = f"""
### {sc['grade']} — {sc['verdict']}
| | |
|---|---|
| **Solved** | {"✅ Yes" if sc['solved'] else "❌ No"} |
| **Steps taken** | `{sc['steps']}` |
| **Efficiency score** | `{sc['efficiency']}%` |
| **Avg reward (final 20%)** | `{sc['avg_reward']:.1f}` |
| **Episodes trained** | `{int(episodes)}` |
| **Maze size** | `{env.shape[0]} × {env.shape[1]}` cells |
> **Efficiency** compares your bot's path length to the ideal shortest path.
> 100% = perfect. 0% = didn't make it.
"""
    progress(1.0, desc="Done!")
    return gif_path, train_fig, heatmap_fig, stats_md
# ── Algorithm Race callback ───────────────────────────────────────────────────
def cb_race(
    difficulty: str,
    maze_style: str,
    episodes: int,
    run_mc: bool,
    progress: gr.Progress = gr.Progress(),
):
    """Train the algorithms on the same maze and report which one scored best.

    Every run uses seed 77 and the same hyperparameters (alpha 0.1, gamma 0.95,
    decay 0.995). Q-Learning and SARSA always run; Monte Carlo only when
    ``run_mc`` is true. An algorithm's final score is the mean reward over the
    last fifth of its episodes (at least one episode).

    Args:
        difficulty: Key of ``DIFFICULTY`` (maze size label).
        maze_style: Key of ``MAZE_STYLE`` (generator label).
        episodes: Episodes per algorithm; coerced with ``int``.
        run_mc: Whether to include Monte Carlo in the race.
        progress: Gradio progress reporter, called at each stage.

    Returns:
        ``(fig, result_md)`` — the race chart and a Markdown results table.

    Raises:
        KeyError: If ``difficulty`` or ``maze_style`` is not a known key.
    """
    size = DIFFICULTY[difficulty]
    style = MAZE_STYLE[maze_style]
    seed = 77
    n_episodes = int(episodes)
    alpha, gamma, decay = 0.1, 0.95, 0.995

    def _rewards_for(algo):
        # A fresh environment per algorithm, built from the same size/style/seed.
        env = _make_env(size, style, seed)
        _, rewards = _train(env, algo, n_episodes, alpha, gamma, decay, seed)
        return rewards

    def _final_score(rewards):
        tail = max(1, len(rewards) // 5)
        return float(np.mean(rewards[-tail:]))

    progress(0.1, desc="Training Q-Learning…")
    rq = _rewards_for("qlearning")
    progress(0.4, desc="Training SARSA…")
    rs = _rewards_for("sarsa")

    rc, name_c = None, ""
    if run_mc:
        progress(0.65, desc="Training Monte Carlo…")
        rc = _rewards_for("montecarlo")
        name_c = "Monte Carlo"

    progress(0.9, desc="Building race chart…")
    fig = make_race_chart(rq, "Q-Learning", rs, "SARSA", rc, name_c)

    # Winner
    scores = {"Q-Learning": _final_score(rq), "SARSA": _final_score(rs)}
    # Explicit None/length test: plain truthiness raises ValueError when the
    # rewards come back as a NumPy array with more than one element.
    if rc is not None and len(rc) > 0:
        scores["Monte Carlo"] = _final_score(rc)
    winner = max(scores, key=scores.get)

    rows = "".join(
        f"| {'🥇 ' if name == winner else ''}{name} | `{score:.1f}` |\n"
        for name, score in scores.items()
    )
    result_md = f"""
### 🏆 Race Result
| Algorithm | Final Score |
|---|---|
{rows}
**Winner: {winner}** with a final average reward of `{scores[winner]:.1f}`
> All algorithms trained on the same maze with identical hyperparameters.
> Final score = average reward over the last 20% of episodes.
"""
    progress(1.0)
    return fig, result_md
# ── CSS ───────────────────────────────────────────────────────────────────────
# Dark-theme stylesheet for the app. Sections, in order: web fonts and page
# base, hero banner, tab bar, info/algorithm cards, grade badge, buttons,
# form controls, and Markdown output (headings, tables, blockquotes).
# The Inter import lists weight 500 because `.tab-nav button` requests it.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;600&display=swap');
*, *::before, *::after { box-sizing: border-box; }
body, .gradio-container {
    background: #0d1117 !important;
    color: #c9d1d9 !important;
    font-family: 'Inter', sans-serif !important;
}
.gradio-container { max-width: 1100px !important; margin: 0 auto !important; }
/* Hero */
.hero { text-align:center; padding:2rem 1rem 1rem; }
.hero-title {
    font-size: clamp(2rem, 5vw, 3rem); font-weight: 700;
    background: linear-gradient(135deg, #3fb950, #58a6ff, #ffa657);
    -webkit-background-clip: text; -webkit-text-fill-color: transparent;
    margin: 0 0 0.4rem;
}
.hero-sub { color: #484f58; font-size: 0.95rem; letter-spacing:0.02em; }
/* Tabs */
.tab-nav { border-bottom: 1px solid #21262d !important; background: transparent !important; }
.tab-nav button {
    font-family: 'Inter', sans-serif !important; font-size: 0.85rem !important;
    font-weight: 500 !important; color: #484f58 !important;
    background: transparent !important; border: none !important;
    padding: 0.7rem 1.1rem !important;
}
.tab-nav button.selected { color: #3fb950 !important; border-bottom: 2px solid #3fb950 !important; }
.tab-nav button:hover { color: #8b949e !important; }
/* Cards */
.info-card {
    background: #161b22; border: 1px solid #21262d; border-radius: 10px;
    padding: 1.1rem;
}
.info-card-icon { font-size: 1.8rem; margin-bottom:0.4rem; }
.info-card-title { font-weight: 600; color: #e6edf3; font-size: 0.95rem; margin-bottom:0.3rem; }
.info-card-body { color: #8b949e; font-size: 0.83rem; line-height: 1.6; }
/* Algo cards */
.algo-card {
    background: #161b22; border: 1px solid #21262d; border-radius: 10px;
    padding: 1rem; margin-bottom: 0.5rem;
}
.algo-name { font-weight: 600; color: #e6edf3; font-size: 0.9rem; }
.algo-desc { color: #8b949e; font-size: 0.8rem; line-height:1.5; margin-top:0.2rem; }
.algo-tag {
    display: inline-block; font-size: 0.68rem; padding: 2px 8px;
    border-radius: 20px; margin-top: 0.4rem;
}
.tag-green { background:#0d2d17; color:#3fb950; border:1px solid #1a5c2e; }
.tag-blue { background:#0d1f38; color:#58a6ff; border:1px solid #1a4a7a; }
.tag-orange { background:#2d1c06; color:#ffa657; border:1px solid #5c3a12; }
/* Grade badge */
.grade-badge {
    display:inline-block; font-size:2.5rem; font-weight:700;
    font-family:'JetBrains Mono',monospace;
}
/* Buttons */
button.primary {
    font-family: 'Inter', sans-serif !important; font-weight: 600 !important;
    background: linear-gradient(135deg, #238636, #2ea043) !important;
    color: #ffffff !important; border: none !important;
    border-radius: 6px !important; font-size: 0.9rem !important;
    transition: opacity 0.2s !important;
}
button.primary:hover { opacity: 0.85 !important; }
button.secondary {
    background: #161b22 !important; color: #58a6ff !important;
    border: 1px solid #30363d !important; border-radius: 6px !important;
    font-family: 'Inter', sans-serif !important;
}
button.stop {
    background: #1c0d0d !important; color: #f78166 !important;
    border: 1px solid #6e2b2b !important; border-radius: 6px !important;
}
/* Labels */
label span { font-family:'Inter',sans-serif !important;
    font-size:0.82rem !important; color:#8b949e !important; }
/* Slider */
input[type=range] { -webkit-appearance:none; height:4px;
    background:#21262d; border-radius:2px; }
input[type=range]::-webkit-slider-thumb {
    -webkit-appearance:none; width:16px; height:16px;
    border-radius:50%; background:#3fb950; cursor:pointer; border:2px solid #0d1117;
}
/* Textarea */
textarea { font-family:'JetBrains Mono',monospace !important;
    font-size:0.82rem !important; background:#0d1117 !important;
    color:#3fb950 !important; border:1px solid #21262d !important;
    border-radius:6px !important; }
/* Markdown */
.gradio-container h2 { color: #3fb950 !important; }
.gradio-container h3 { color: #58a6ff !important; }
.gradio-container p { color: #8b949e !important; }
table { width:100%; border-collapse:collapse; }
th { background:#161b22; color:#3fb950; font-size:0.78rem;
    text-align:left; padding:8px 12px; border-bottom:1px solid #21262d; }
td { padding:8px 12px; border-bottom:1px solid #0d1117;
    color:#e6edf3; font-size:0.85rem; }
blockquote { border-left:3px solid #3fb950; padding-left:1rem;
    color:#484f58 !important; margin:0.5rem 0; }
footer { display:none !important; }
.gradio-container .block { background:transparent !important; border:none !important; }
"""
# ── Build UI ──────────────────────────────────────────────────────────────────
# Layout: a hero banner, four tabs (Welcome, Playground, Algorithm Race,
# How it Works) and a footer line. Only the Playground and Race tabs contain
# interactive components; their buttons are wired to `cb_solve` and `cb_race`.
with gr.Blocks(title="🤖 Maze Runner — RL Playground") as demo:
    gr.HTML("""
    <div class="hero">
    <div class="hero-title">🤖 Maze Runner</div>
    <div class="hero-sub">An AI that learns to escape mazes — watch it happen in real time</div>
    </div>
    """)
    with gr.Tabs():
        # ══════════════════════════════════════════════════════════════════
        # Tab 1 — Welcome
        # ══════════════════════════════════════════════════════════════════
        with gr.Tab("🏠 Welcome"):
            gr.HTML("""
            <div style="text-align:center;padding:0.5rem 0 1.5rem;">
            <p style="color:#8b949e;font-size:1rem;max-width:580px;margin:0 auto;">
            A tiny AI robot is dropped into a maze. It knows nothing.
            Through thousands of attempts — hitting walls, finding dead ends,
            occasionally stumbling upon the exit — it slowly builds a mental map
            and learns the perfect escape route.
            </p>
            </div>
            """)
            gr.HTML("""
            <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1rem;margin-bottom:1.5rem;">
            <div class="info-card">
            <div class="info-card-icon">🗺️</div>
            <div class="info-card-title">The Maze</div>
            <div class="info-card-body">
            A grid of corridors and walls. The bot starts at
            <strong style="color:#58a6ff">S</strong> and must reach
            <strong style="color:#f78166">G</strong>.
            It can only see its own position — no map, no cheating.
            </div>
            </div>
            <div class="info-card">
            <div class="info-card-icon">🤖</div>
            <div class="info-card-title">The Bot</div>
            <div class="info-card-body">
            At each step it chooses: go up, down, left, or right.
            Hit a wall? Penalty. Reach the goal? Big reward!
            It remembers what worked and what didn't.
            </div>
            </div>
            <div class="info-card">
            <div class="info-card-icon">🧠</div>
            <div class="info-card-title">The Learning</div>
            <div class="info-card-body">
            Each attempt updates a "score table" for every
            position and move. After enough tries, the bot
            always picks the move with the highest score — the optimal path.
            </div>
            </div>
            </div>
            """)
            gr.HTML("""
            <div style="background:#161b22;border:1px solid #21262d;border-radius:10px;padding:1.2rem;margin-bottom:1rem;">
            <div style="font-weight:600;color:#e6edf3;margin-bottom:1rem;">🧠 Choose your Bot's Brain</div>
            <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:0.8rem;">
            <div class="algo-card">
            <div class="algo-name">Q-Learning</div>
            <div class="algo-desc">
            Updates its score table <em>immediately</em> after every step.
            Fast learner. Best for most mazes.
            </div>
            <span class="algo-tag tag-green">⚡ Recommended</span>
            </div>
            <div class="algo-card">
            <div class="algo-name">SARSA</div>
            <div class="algo-desc">
            Updates based on the move it <em>actually took</em> next,
            not just the best possible. More cautious, avoids risky paths.
            </div>
            <span class="algo-tag tag-blue">🎯 Cautious</span>
            </div>
            <div class="algo-card">
            <div class="algo-name">Monte Carlo</div>
            <div class="algo-desc">
            Plays out the <em>entire episode</em> first, then
            updates everything at once. Needs more episodes to converge.
            </div>
            <span class="algo-tag tag-orange">🎲 Explorer</span>
            </div>
            </div>
            </div>
            """)
            gr.HTML("""
            <div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;">
            <div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:10px;padding:1rem;">
            <div style="font-weight:600;color:#3fb950;margin-bottom:0.4rem;">🗺️ How to use this app</div>
            <ol style="color:#8b949e;font-size:0.85rem;line-height:1.8;margin:0;padding-left:1.2rem;">
            <li>Go to <strong style="color:#e6edf3">🎮 Playground</strong> tab</li>
            <li>Pick a difficulty and maze style</li>
            <li>Choose a brain and hit <strong style="color:#3fb950">Train & Watch!</strong></li>
            <li>Watch the animated replay</li>
            <li>Try <strong style="color:#e6edf3">🏁 Algorithm Race</strong> to compare all three</li>
            </ol>
            </div>
            <div style="background:#0d1f38;border:1px solid #1a4a7a;border-radius:10px;padding:1rem;">
            <div style="font-weight:600;color:#58a6ff;margin-bottom:0.4rem;">💡 Fun facts</div>
            <ul style="color:#8b949e;font-size:0.85rem;line-height:1.8;margin:0;padding-left:1.2rem;">
            <li>This same idea trains robots, game AIs, and self-driving cars</li>
            <li>DeepMind's Atari-playing DQN agent used a version of Q-Learning</li>
            <li>A 17×17 maze has 289 possible positions to learn</li>
            <li>The bot gets worse before it gets better — that's normal!</li>
            </ul>
            </div>
            </div>
            """)
        # ══════════════════════════════════════════════════════════════════
        # Tab 2 — Playground
        # ══════════════════════════════════════════════════════════════════
        with gr.Tab("🎮 Playground"):
            gr.HTML("""
            <div style="padding:0.3rem 0 1rem;">
            <div style="font-size:1.05rem;font-weight:600;color:#e6edf3;">
            Build a maze, pick a brain, watch it learn
            </div>
            <div style="color:#484f58;font-size:0.85rem;margin-top:0.2rem;">
            The animated replay shows the final learned path after training.
            </div>
            </div>
            """)
            with gr.Row():
                # ── Controls ──────────────────────────────────────────────
                with gr.Column(scale=1, min_width=300):
                    gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin-bottom:0.5rem;">🗺️ MAZE SETUP</div>')
                    difficulty = gr.Radio(
                        list(DIFFICULTY.keys()),
                        value="🐢 Medium (9×9)",
                        label="Difficulty",
                    )
                    maze_style = gr.Radio(
                        list(MAZE_STYLE.keys()),
                        value="🏰 Corridors (DFS)",
                        label="Maze style",
                        info="Corridors = proper winding paths · Open = random walls"
                    )
                    gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin:0.8rem 0 0.5rem;">🧠 BOT BRAIN</div>')
                    algo = gr.Radio(
                        list(ALGO_MAP.keys()),
                        value="🧠 Q-Learning (recommended)",
                        label="Algorithm",
                    )
                    gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin:0.8rem 0 0.5rem;">⚙️ TRAINING</div>')
                    episodes = gr.Slider(100, 3000, value=800, step=100,
                                         label="Training episodes",
                                         info="More = smarter bot, but slower")
                    with gr.Accordion("🔬 Advanced settings", open=False):
                        alpha = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Learning speed (α)")
                        gamma = gr.Slider(0.5, 0.99, value=0.95, step=0.01, label="Future planning (γ)")
                        decay = gr.Slider(0.90, 0.999, value=0.995, step=0.001, label="Exploration decay")
                        seed = gr.Slider(0, 100, value=42, step=1, label="Random seed")
                    btn_solve = gr.Button("🚀 Train & Watch!", variant="primary")
                # ── Outputs ───────────────────────────────────────────────
                with gr.Column(scale=2):
                    play_stats = gr.Markdown("*Configure your maze and hit Train & Watch!*")
                    with gr.Row():
                        play_gif = gr.Image(
                            label="🎬 Bot solving the maze (animated)",
                            type="filepath", height=360,
                        )
                    with gr.Row():
                        play_train_fig = gr.Plot(label="📈 Training progress")
                        play_heatmap = gr.Plot(label="🌡️ Q-value map (what the bot learned)")
            # The advanced sliders are passed to the callback directly, so no
            # separate hidden-state components are needed for their defaults.
            # Input order must match cb_solve's positional parameters; output
            # order must match its return tuple.
            btn_solve.click(
                cb_solve,
                inputs=[difficulty, maze_style, algo, episodes,
                        alpha, gamma, decay, seed],
                outputs=[play_gif, play_train_fig, play_heatmap, play_stats],
            )
        # ══════════════════════════════════════════════════════════════════
        # Tab 3 — Algorithm Race
        # ══════════════════════════════════════════════════════════════════
        with gr.Tab("🏁 Algorithm Race"):
            gr.HTML("""
            <div style="padding:0.3rem 0 1rem;">
            <div style="font-size:1.05rem;font-weight:600;color:#e6edf3;">
            Head-to-head: which brain learns fastest?
            </div>
            <div style="color:#484f58;font-size:0.85rem;margin-top:0.2rem;">
            All algorithms train on the same maze with identical settings —
            the only variable is the learning strategy.
            </div>
            </div>
            """)
            gr.HTML("""
            <div style="display:grid;grid-template-columns:repeat(3,1fr);gap:0.8rem;margin-bottom:1rem;">
            <div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:8px;padding:0.8rem;text-align:center;">
            <div style="color:#3fb950;font-size:1.2rem;font-weight:700;">Q-Learning</div>
            <div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">Off-policy · Fast update · Optimistic</div>
            </div>
            <div style="background:#0d1f38;border:1px solid #1a4a7a;border-radius:8px;padding:0.8rem;text-align:center;">
            <div style="color:#58a6ff;font-size:1.2rem;font-weight:700;">SARSA</div>
            <div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">On-policy · Careful update · Conservative</div>
            </div>
            <div style="background:#2d1c06;border:1px solid #5c3a12;border-radius:8px;padding:0.8rem;text-align:center;">
            <div style="color:#ffa657;font-size:1.2rem;font-weight:700;">Monte Carlo</div>
            <div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">Episodic · Full return · Unbiased</div>
            </div>
            </div>
            """)
            with gr.Row():
                with gr.Column(scale=1, min_width=260):
                    race_diff = gr.Radio(list(DIFFICULTY.keys()),
                                         value="🐢 Medium (9×9)", label="Maze difficulty")
                    race_style = gr.Radio(list(MAZE_STYLE.keys()),
                                          value="🏰 Corridors (DFS)", label="Maze style")
                    race_eps = gr.Slider(200, 2000, value=600, step=100,
                                         label="Episodes per algorithm")
                    race_mc = gr.Checkbox(label="Include Monte Carlo (slower)", value=True)
                    btn_race = gr.Button("🏁 Start Race!", variant="primary")
                with gr.Column(scale=2):
                    race_result = gr.Markdown("*Click Start Race to run the comparison.*")
                    race_fig = gr.Plot(label="Race Results")
            # Input order must match cb_race's positional parameters; output
            # order must match its return tuple.
            btn_race.click(
                cb_race,
                inputs=[race_diff, race_style, race_eps, race_mc],
                outputs=[race_fig, race_result],
            )
        # ══════════════════════════════════════════════════════════════════
        # Tab 4 — How it Works
        # ══════════════════════════════════════════════════════════════════
        with gr.Tab("🧠 How it Works"):
            gr.HTML("""
            <div style="max-width:700px;margin:0 auto;padding:1rem 0;">
            <h2 style="color:#3fb950;font-size:1.3rem;margin-bottom:0.3rem;">The Big Idea</h2>
            <p style="color:#8b949e;line-height:1.7;">
            The bot doesn't know anything about the maze at the start. It just knows
            4 possible moves and gets a number (reward) after each step.
            <strong style="color:#e6edf3">Negative number = bad move. Positive = good move.</strong>
            The goal: find the sequence of moves that gets the most reward.
            </p>
            <h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">The Score Table (Q-Table)</h2>
            <p style="color:#8b949e;line-height:1.7;">
            The bot keeps a table with one row per maze cell and 4 columns (one per direction).
            Each entry stores <em>how good it thinks that move is from that cell</em>.
            At the start, everything is 0. After training, the table holds the bot's
            entire learned strategy. The Q-value heatmap in the Playground shows this table visually.
            </p>
            <div style="background:#161b22;border:1px solid #21262d;border-radius:8px;padding:1rem;margin:1rem 0;font-family:'JetBrains Mono',monospace;font-size:0.82rem;color:#3fb950;">
            Q[current_cell][move] += learning_speed × (<br>
            reward_got + future_discount × best_Q[next_cell] − Q[current_cell][move]<br>
            )
            </div>
            <h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">Exploration vs Exploitation</h2>
            <p style="color:#8b949e;line-height:1.7;">
            Early in training, the bot tries <strong style="color:#e6edf3">random moves</strong> (exploration)
            — it doesn't know enough to trust its table yet. Over time, it relies more on what
            it's learned (exploitation). This is controlled by <strong style="color:#e6edf3">epsilon (ε)</strong>,
            which starts near 1.0 (100% random) and decays toward 0 (always use best known move).
            </p>
            <h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">Why does reward go negative first?</h2>
            <p style="color:#8b949e;line-height:1.7;">
            Each step costs −1 (time penalty) and hitting a wall costs −5.
            A random bot hits a <em>lot</em> of walls and takes forever to find the exit,
            so early rewards are very negative. As it learns, fewer walls are hit and
            the path shortens — reward climbs toward 0 and eventually turns positive when
            it reliably reaches the goal.
            </p>
            <div style="display:grid;grid-template-columns:1fr 1fr;gap:0.8rem;margin-top:1.2rem;">
            <div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:8px;padding:0.9rem;">
            <div style="color:#3fb950;font-weight:600;margin-bottom:0.4rem;">Q-Learning vs SARSA</div>
            <div style="color:#8b949e;font-size:0.82rem;line-height:1.6;">
            Q-Learning always updates toward the <em>best possible</em> next action —
            even if it wouldn't actually take that action. SARSA updates toward
            the action it <em>will actually take</em>. This makes SARSA more cautious near walls.
            </div>
            </div>
            <div style="background:#2d1c06;border:1px solid #5c3a12;border-radius:8px;padding:0.9rem;">
            <div style="color:#ffa657;font-weight:600;margin-bottom:0.4rem;">Why Monte Carlo is slow</div>
            <div style="color:#8b949e;font-size:0.82rem;line-height:1.6;">
            MC waits until the episode <em>ends</em> before updating any scores.
            On large mazes where early episodes never reach the goal,
            it gets zero signal for a long time. But once it starts solving,
            its estimates are very accurate.
            </div>
            </div>
            </div>
            </div>
            """)
    gr.HTML("""
    <div style="text-align:center;color:#21262d;font-size:0.75rem;
    padding:1.5rem 0 0.5rem;border-top:1px solid #161b22;margin-top:1rem;">
    Built with Q-Learning · SARSA · Monte Carlo · Gymnasium · Gradio
    </div>
    """)
if __name__ == "__main__":
    # Serve on all interfaces at port 7860, without a public share link, and
    # hand the stylesheet to Gradio at launch time.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "css": CSS,
    }
    demo.launch(**launch_options)