# rl_maze_solver / app.py
# Daksh C Jain
# Transform into interactive RL playground for all audiences
# 34aeb9a
"""
🤖 Maze Runner — RL Playground
An interactive, fun maze-solving playground powered by Reinforcement Learning.
Anyone can build a maze, pick a brain, and watch the bot learn to escape.
"""
from __future__ import annotations
import numpy as np
import gradio as gr
from maze.generator import generate_dfs_maze, generate_open_maze
from maze.env import MazeEnv
from agents.qlearning import train_qlearning
from agents.sarsa import train_sarsa
from agents.montecarlo import train_montecarlo
from viz.renderer import (
make_solution_gif, make_training_chart,
make_qvalue_heatmap, make_race_chart, score_run,
)
# ── Helpers ───────────────────────────────────────────────────────────────────
# UI label → internal algorithm key. The values are the keys `_train`
# dispatches on; the labels double as the Radio choices in the Playground tab.
ALGO_MAP = {
    "🧠 Q-Learning (recommended)": "qlearning",
    "🎯 SARSA (cautious)": "sarsa",
    "🎲 Monte Carlo (explorer)": "montecarlo",
}
# UI label → `size` argument handed to the maze generators via `_make_env`.
DIFFICULTY = {
    "🐣 Tiny (5×5)": 5,
    "🐇 Small (7×7)": 7,
    "🐢 Medium (9×9)": 9,
    "🦊 Large (13×13)": 13,
    "🐉 XL (17×17)": 17,
}
# UI label → generator style. `_make_env` treats "dfs" specially and sends
# every other value to the open-field generator.
MAZE_STYLE = {
    "🏰 Corridors (DFS)": "dfs",
    "🌿 Open Field (random walls)": "open",
}
def _make_env(size: int, style: str, seed: int) -> MazeEnv:
    """Generate a maze grid and wrap it in a ``MazeEnv``.

    Args:
        size: Size argument forwarded to the generator.
        style: ``"dfs"`` selects ``generate_dfs_maze``; any other value
            selects ``generate_open_maze`` with ``wall_frac=0.18``.
        seed: Seed forwarded to the generator.

    Returns:
        A new ``MazeEnv`` built over the generated grid.
    """
    grid = (
        generate_dfs_maze(size, seed=seed)
        if style == "dfs"
        else generate_open_maze(size, wall_frac=0.18, seed=seed)
    )
    return MazeEnv(grid)
def _train(env: MazeEnv, algo: str, episodes: int, alpha: float,
           gamma: float, decay: float, seed: int):
    """Run the trainer registered under ``algo`` and return its result.

    Args:
        env: Environment to train on.
        algo: One of ``"qlearning"``, ``"sarsa"`` or ``"montecarlo"``.
        episodes, alpha, gamma, decay, seed: Forwarded positionally, in this
            order, to the selected trainer.

    Returns:
        Whatever the selected trainer returns (callers in this file unpack
        it as ``agent, rewards``).

    Raises:
        KeyError: If ``algo`` is not one of the three known keys.
    """
    trainers = {
        "qlearning": train_qlearning,
        "sarsa": train_sarsa,
        "montecarlo": train_montecarlo,
    }
    trainer = trainers[algo]
    return trainer(env, episodes, alpha, gamma, decay, seed)
def _collect_path(env: MazeEnv, agent) -> list[tuple[int, ...]]:
    """Roll out the agent's greedy policy and record the cells it visits.

    The rollout starts from ``env.reset()`` and stops as soon as the
    environment reports ``done``, or after ``env.n_states * 3`` steps so a
    policy that loops forever still terminates.

    Returns:
        The visited positions, beginning with ``env.start``; each later entry
        is ``env._from_state(state)`` for the state reached by that step.
    """
    state, _ = env.reset()
    visited: list[tuple[int, ...]] = [env.start]
    step_budget = env.n_states * 3
    for _ in range(step_budget):
        state, _, done, _, _ = env.step(agent.greedy_action(state))
        visited.append(env._from_state(state))
        if done:
            break
    return visited
# ── Main Playground callback ──────────────────────────────────────────────────
def cb_solve(
    difficulty: str,
    maze_style: str,
    algo_label: str,
    episodes: int,
    alpha: float,
    gamma: float,
    decay: float,
    seed: int,
    progress: gr.Progress = gr.Progress(),
):
    """Playground callback: train one agent and build every output widget.

    Args:
        difficulty: A key of ``DIFFICULTY``.
        maze_style: A key of ``MAZE_STYLE``.
        algo_label: A key of ``ALGO_MAP``; the text before the first ``(``
            is reused as the display name in the progress bar, GIF and chart.
        episodes, alpha, gamma, decay, seed: Training settings from the UI;
            cast to ``int``/``float`` before use.
        progress: Gradio progress tracker.

    Returns:
        ``(gif_path, train_fig, heatmap_fig, stats_md)`` in the order the
        click handler's outputs expect.
    """
    progress(0.05, desc="Building maze…")
    size = DIFFICULTY[difficulty]
    style = MAZE_STYLE[maze_style]
    algo = ALGO_MAP[algo_label]
    seed_value = int(seed)
    short_name = algo_label.split("(")[0].strip()

    def fresh_env():
        # Same size/style/seed every time, so each consumer below gets its
        # own environment over an identically generated maze.
        return _make_env(size, style, seed_value)

    train_env = fresh_env()
    progress(0.15, desc=f"Training {short_name}…")
    n_episodes = int(episodes)
    agent, rewards = _train(
        train_env, algo, n_episodes,
        float(alpha), float(gamma), float(decay), seed_value,
    )

    progress(0.75, desc="Rendering solution…")
    gif_path = make_solution_gif(fresh_env(), agent, fps=7, label=short_name)

    progress(0.85, desc="Building charts…")
    score_env = fresh_env()
    path = _collect_path(score_env, agent)
    sc = score_run(path, score_env.goal, rewards, score_env.n_states)
    train_fig = make_training_chart({short_name: rewards})
    heatmap_fig = make_qvalue_heatmap(fresh_env(), agent)

    solved_text = "✅ Yes" if sc['solved'] else "❌ No"
    stats_md = f"""
### {sc['grade']}{sc['verdict']}
| | |
|---|---|
| **Solved** | {solved_text} |
| **Steps taken** | `{sc['steps']}` |
| **Efficiency score** | `{sc['efficiency']}%` |
| **Avg reward (final 20%)** | `{sc['avg_reward']:.1f}` |
| **Episodes trained** | `{n_episodes}` |
| **Maze size** | `{train_env.shape[0]} × {train_env.shape[1]}` cells |
> **Efficiency** compares your bot's path length to the ideal shortest path.
> 100% = perfect. 0% = didn't make it.
"""
    progress(1.0, desc="Done!")
    return gif_path, train_fig, heatmap_fig, stats_md
# ── Algorithm Race callback ───────────────────────────────────────────────────
def _tail_mean(rewards) -> float:
    """Return the mean of the last 20% of ``rewards`` (at least one value)."""
    window = max(1, len(rewards) // 5)
    return float(np.mean(rewards[-window:]))


def cb_race(
    difficulty: str,
    maze_style: str,
    episodes: int,
    run_mc: bool,
    progress: gr.Progress = gr.Progress(),
):
    """Algorithm Race callback: train the algorithms side by side and rank them.

    Q-Learning and SARSA always run; Monte Carlo runs only when ``run_mc`` is
    true. Every algorithm gets its own environment built from the same size,
    style and fixed seed (77), and the same alpha/gamma/decay.

    Args:
        difficulty: A key of ``DIFFICULTY``.
        maze_style: A key of ``MAZE_STYLE``.
        episodes: Episodes per algorithm; cast to ``int``.
        run_mc: Whether to include Monte Carlo.
        progress: Gradio progress tracker.

    Returns:
        ``(fig, result_md)``: the race chart and a Markdown summary whose
        score is the mean reward over the last 20% of episodes.
    """
    size = DIFFICULTY[difficulty]
    style = MAZE_STYLE[maze_style]
    seed = 77
    n_episodes = int(episodes)
    hyper = (0.1, 0.95, 0.995)  # alpha, gamma, decay shared by every contestant

    progress(0.1, desc="Training Q-Learning…")
    _, rq = _train(_make_env(size, style, seed), "qlearning", n_episodes, *hyper, seed)

    progress(0.4, desc="Training SARSA…")
    _, rs = _train(_make_env(size, style, seed), "sarsa", n_episodes, *hyper, seed)

    rc, name_c = None, ""
    if run_mc:
        progress(0.65, desc="Training Monte Carlo…")
        _, rc = _train(_make_env(size, style, seed), "montecarlo", n_episodes, *hyper, seed)
        name_c = "Monte Carlo"

    progress(0.9, desc="Building race chart…")
    fig = make_race_chart(rq, "Q-Learning", rs, "SARSA", rc, name_c)

    # Winner
    scores = {"Q-Learning": _tail_mean(rq), "SARSA": _tail_mean(rs)}
    # Explicit None/length test: a bare `if rc:` raises ValueError when the
    # trainer hands back a NumPy array with more than one element.
    if rc is not None and len(rc) > 0:
        scores["Monte Carlo"] = _tail_mean(rc)
    winner = max(scores, key=lambda k: scores[k])

    # One Markdown table row per algorithm, each newline-terminated.
    rows = "".join(
        f"| {'🥇 ' if name == winner else ''}{name} | `{value:.1f}` |\n"
        for name, value in scores.items()
    )
    result_md = f"""
### 🏆 Race Result
| Algorithm | Final Score |
|---|---|
{rows}
**Winner: {winner}** with a final average reward of `{scores[winner]:.1f}`
> All algorithms trained on the same maze with identical hyperparameters.
> Final score = average reward over the last 20% of episodes.
"""
    progress(1.0)
    return fig, result_md
# ── CSS ───────────────────────────────────────────────────────────────────────
# Global stylesheet for the app, handed to Gradio via `demo.launch(css=CSS)`.
# It pulls the Inter and JetBrains Mono web fonts, applies a dark palette to
# the Gradio container and its tabs/buttons/sliders/tables, defines the
# `.hero`, `.info-card`, `.algo-card` and `.tag-*` classes used by the inline
# HTML in the layout, and hides the default footer.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&family=JetBrains+Mono:wght@400;600&display=swap');
*, *::before, *::after { box-sizing: border-box; }
body, .gradio-container {
background: #0d1117 !important;
color: #c9d1d9 !important;
font-family: 'Inter', sans-serif !important;
}
.gradio-container { max-width: 1100px !important; margin: 0 auto !important; }
/* Hero */
.hero { text-align:center; padding:2rem 1rem 1rem; }
.hero-title {
font-size: clamp(2rem, 5vw, 3rem); font-weight: 700;
background: linear-gradient(135deg, #3fb950, #58a6ff, #ffa657);
-webkit-background-clip: text; -webkit-text-fill-color: transparent;
margin: 0 0 0.4rem;
}
.hero-sub { color: #484f58; font-size: 0.95rem; letter-spacing:0.02em; }
/* Tabs */
.tab-nav { border-bottom: 1px solid #21262d !important; background: transparent !important; }
.tab-nav button {
font-family: 'Inter', sans-serif !important; font-size: 0.85rem !important;
font-weight: 500 !important; color: #484f58 !important;
background: transparent !important; border: none !important;
padding: 0.7rem 1.1rem !important;
}
.tab-nav button.selected { color: #3fb950 !important; border-bottom: 2px solid #3fb950 !important; }
.tab-nav button:hover { color: #8b949e !important; }
/* Cards */
.info-card {
background: #161b22; border: 1px solid #21262d; border-radius: 10px;
padding: 1.1rem;
}
.info-card-icon { font-size: 1.8rem; margin-bottom:0.4rem; }
.info-card-title { font-weight: 600; color: #e6edf3; font-size: 0.95rem; margin-bottom:0.3rem; }
.info-card-body { color: #8b949e; font-size: 0.83rem; line-height: 1.6; }
/* Algo cards */
.algo-card {
background: #161b22; border: 1px solid #21262d; border-radius: 10px;
padding: 1rem; margin-bottom: 0.5rem;
}
.algo-name { font-weight: 600; color: #e6edf3; font-size: 0.9rem; }
.algo-desc { color: #8b949e; font-size: 0.8rem; line-height:1.5; margin-top:0.2rem; }
.algo-tag {
display: inline-block; font-size: 0.68rem; padding: 2px 8px;
border-radius: 20px; margin-top: 0.4rem;
}
.tag-green { background:#0d2d17; color:#3fb950; border:1px solid #1a5c2e; }
.tag-blue { background:#0d1f38; color:#58a6ff; border:1px solid #1a4a7a; }
.tag-orange { background:#2d1c06; color:#ffa657; border:1px solid #5c3a12; }
/* Grade badge */
.grade-badge {
display:inline-block; font-size:2.5rem; font-weight:700;
font-family:'JetBrains Mono',monospace;
}
/* Buttons */
button.primary {
font-family: 'Inter', sans-serif !important; font-weight: 600 !important;
background: linear-gradient(135deg, #238636, #2ea043) !important;
color: #ffffff !important; border: none !important;
border-radius: 6px !important; font-size: 0.9rem !important;
transition: opacity 0.2s !important;
}
button.primary:hover { opacity: 0.85 !important; }
button.secondary {
background: #161b22 !important; color: #58a6ff !important;
border: 1px solid #30363d !important; border-radius: 6px !important;
font-family: 'Inter', sans-serif !important;
}
button.stop {
background: #1c0d0d !important; color: #f78166 !important;
border: 1px solid #6e2b2b !important; border-radius: 6px !important;
}
/* Labels */
label span { font-family:'Inter',sans-serif !important;
font-size:0.82rem !important; color:#8b949e !important; }
/* Slider */
input[type=range] { -webkit-appearance:none; height:4px;
background:#21262d; border-radius:2px; }
input[type=range]::-webkit-slider-thumb {
-webkit-appearance:none; width:16px; height:16px;
border-radius:50%; background:#3fb950; cursor:pointer; border:2px solid #0d1117;
}
/* Textarea */
textarea { font-family:'JetBrains Mono',monospace !important;
font-size:0.82rem !important; background:#0d1117 !important;
color:#3fb950 !important; border:1px solid #21262d !important;
border-radius:6px !important; }
/* Markdown */
.gradio-container h2 { color: #3fb950 !important; }
.gradio-container h3 { color: #58a6ff !important; }
.gradio-container p { color: #8b949e !important; }
table { width:100%; border-collapse:collapse; }
th { background:#161b22; color:#3fb950; font-size:0.78rem;
text-align:left; padding:8px 12px; border-bottom:1px solid #21262d; }
td { padding:8px 12px; border-bottom:1px solid #0d1117;
color:#e6edf3; font-size:0.85rem; }
blockquote { border-left:3px solid #3fb950; padding-left:1rem;
color:#484f58 !important; margin:0.5rem 0; }
footer { display:none !important; }
.gradio-container .block { background:transparent !important; border:none !important; }
"""
# ── Build UI ──────────────────────────────────────────────────────────────────
# Page layout: hero banner, four tabs (Welcome, Playground, Algorithm Race,
# How it Works) and a footer. Components created inside a `with` block are
# attached to that enclosing Gradio container, so nesting here *is* the layout.
# Multi-line HTML literals are kept flush-left so their contents stay
# independent of the Python indentation.
with gr.Blocks(title="🤖 Maze Runner — RL Playground") as demo:
    gr.HTML("""
<div class="hero">
<div class="hero-title">🤖 Maze Runner</div>
<div class="hero-sub">An AI that learns to escape mazes — watch it happen in real time</div>
</div>
""")
    with gr.Tabs():
        # ══════════════════════════════════════════════════════════════════
        # Tab 1 — Welcome
        # ══════════════════════════════════════════════════════════════════
        with gr.Tab("🏠 Welcome"):
            gr.HTML("""
<div style="text-align:center;padding:0.5rem 0 1.5rem;">
<p style="color:#8b949e;font-size:1rem;max-width:580px;margin:0 auto;">
A tiny AI robot is dropped into a maze. It knows nothing.
Through thousands of attempts — hitting walls, finding dead ends,
occasionally stumbling upon the exit — it slowly builds a mental map
and learns the perfect escape route.
</p>
</div>
""")
            gr.HTML("""
<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1rem;margin-bottom:1.5rem;">
<div class="info-card">
<div class="info-card-icon">🗺️</div>
<div class="info-card-title">The Maze</div>
<div class="info-card-body">
A grid of corridors and walls. The bot starts at
<strong style="color:#58a6ff">S</strong> and must reach
<strong style="color:#f78166">G</strong>.
It can only see its own position — no map, no cheating.
</div>
</div>
<div class="info-card">
<div class="info-card-icon">🤖</div>
<div class="info-card-title">The Bot</div>
<div class="info-card-body">
At each step it chooses: go up, down, left, or right.
Hit a wall? Penalty. Reach the goal? Big reward!
It remembers what worked and what didn't.
</div>
</div>
<div class="info-card">
<div class="info-card-icon">🧠</div>
<div class="info-card-title">The Learning</div>
<div class="info-card-body">
Each attempt updates a "score table" for every
position and move. After enough tries, the bot
always picks the move with the highest score — the optimal path.
</div>
</div>
</div>
""")
            gr.HTML("""
<div style="background:#161b22;border:1px solid #21262d;border-radius:10px;padding:1.2rem;margin-bottom:1rem;">
<div style="font-weight:600;color:#e6edf3;margin-bottom:1rem;">🧠 Choose your Bot's Brain</div>
<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:0.8rem;">
<div class="algo-card">
<div class="algo-name">Q-Learning</div>
<div class="algo-desc">
Updates its score table <em>immediately</em> after every step.
Fast learner. Best for most mazes.
</div>
<span class="algo-tag tag-green">⚡ Recommended</span>
</div>
<div class="algo-card">
<div class="algo-name">SARSA</div>
<div class="algo-desc">
Updates based on the move it <em>actually took</em> next,
not just the best possible. More cautious, avoids risky paths.
</div>
<span class="algo-tag tag-blue">🎯 Cautious</span>
</div>
<div class="algo-card">
<div class="algo-name">Monte Carlo</div>
<div class="algo-desc">
Plays out the <em>entire episode</em> first, then
updates everything at once. Needs more episodes to converge.
</div>
<span class="algo-tag tag-orange">🎲 Explorer</span>
</div>
</div>
</div>
""")
            gr.HTML("""
<div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;">
<div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:10px;padding:1rem;">
<div style="font-weight:600;color:#3fb950;margin-bottom:0.4rem;">🗺️ How to use this app</div>
<ol style="color:#8b949e;font-size:0.85rem;line-height:1.8;margin:0;padding-left:1.2rem;">
<li>Go to <strong style="color:#e6edf3">🎮 Playground</strong> tab</li>
<li>Pick a difficulty and maze style</li>
<li>Choose a brain and hit <strong style="color:#3fb950">Train & Watch!</strong></li>
<li>Watch the animated replay</li>
<li>Try <strong style="color:#e6edf3">🏁 Algorithm Race</strong> to compare all three</li>
</ol>
</div>
<div style="background:#0d1f38;border:1px solid #1a4a7a;border-radius:10px;padding:1rem;">
<div style="font-weight:600;color:#58a6ff;margin-bottom:0.4rem;">💡 Fun facts</div>
<ul style="color:#8b949e;font-size:0.85rem;line-height:1.8;margin:0;padding-left:1.2rem;">
<li>This same idea trains robots, game AIs, and self-driving cars</li>
<li>DeepMind's DQN agent mastered Atari games with a deep version of Q-Learning</li>
<li>A 17×17 maze has 289 possible positions to learn</li>
<li>The bot gets worse before it gets better — that's normal!</li>
</ul>
</div>
</div>
""")
        # ══════════════════════════════════════════════════════════════════
        # Tab 2 — Playground
        # ══════════════════════════════════════════════════════════════════
        with gr.Tab("🎮 Playground"):
            gr.HTML("""
<div style="padding:0.3rem 0 1rem;">
<div style="font-size:1.05rem;font-weight:600;color:#e6edf3;">
Build a maze, pick a brain, watch it learn
</div>
<div style="color:#484f58;font-size:0.85rem;margin-top:0.2rem;">
The animated replay shows the final learned path after training.
</div>
</div>
""")
            with gr.Row():
                # ── Controls ──────────────────────────────────────────────
                with gr.Column(scale=1, min_width=300):
                    gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin-bottom:0.5rem;">🗺️ MAZE SETUP</div>')
                    difficulty = gr.Radio(
                        list(DIFFICULTY.keys()),
                        value="🐢 Medium (9×9)",
                        label="Difficulty",
                    )
                    maze_style = gr.Radio(
                        list(MAZE_STYLE.keys()),
                        value="🏰 Corridors (DFS)",
                        label="Maze style",
                        info="Corridors = proper winding paths · Open = random walls"
                    )
                    gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin:0.8rem 0 0.5rem;">🧠 BOT BRAIN</div>')
                    algo = gr.Radio(
                        list(ALGO_MAP.keys()),
                        value="🧠 Q-Learning (recommended)",
                        label="Algorithm",
                    )
                    gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin:0.8rem 0 0.5rem;">⚙️ TRAINING</div>')
                    episodes = gr.Slider(100, 3000, value=800, step=100,
                                         label="Training episodes",
                                         info="More = smarter bot, but slower")
                    # Collapsed by default; the sliders still feed cb_solve
                    # with their default values when left untouched.
                    with gr.Accordion("🔬 Advanced settings", open=False):
                        alpha = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Learning speed (α)")
                        gamma = gr.Slider(0.5, 0.99, value=0.95, step=0.01, label="Future planning (γ)")
                        decay = gr.Slider(0.90, 0.999, value=0.995, step=0.001, label="Exploration decay")
                        seed = gr.Slider(0, 100, value=42, step=1, label="Random seed")
                    btn_solve = gr.Button("🚀 Train & Watch!", variant="primary")
                # ── Outputs ───────────────────────────────────────────────
                with gr.Column(scale=2):
                    play_stats = gr.Markdown("*Configure your maze and hit Train & Watch!*")
                    with gr.Row():
                        play_gif = gr.Image(
                            label="🎬 Bot solving the maze (animated)",
                            type="filepath", height=360,
                        )
                    with gr.Row():
                        play_train_fig = gr.Plot(label="📈 Training progress")
                        play_heatmap = gr.Plot(label="🌡️ Q-value map (what the bot learned)")
            # Inputs are listed in the order of cb_solve's parameters; outputs
            # in the order of its return tuple.
            btn_solve.click(
                cb_solve,
                inputs=[difficulty, maze_style, algo, episodes,
                        alpha, gamma, decay, seed],
                outputs=[play_gif, play_train_fig, play_heatmap, play_stats],
            )
        # ══════════════════════════════════════════════════════════════════
        # Tab 3 — Algorithm Race
        # ══════════════════════════════════════════════════════════════════
        with gr.Tab("🏁 Algorithm Race"):
            gr.HTML("""
<div style="padding:0.3rem 0 1rem;">
<div style="font-size:1.05rem;font-weight:600;color:#e6edf3;">
Head-to-head: which brain learns fastest?
</div>
<div style="color:#484f58;font-size:0.85rem;margin-top:0.2rem;">
All algorithms train on the same maze with identical settings —
the only variable is the learning strategy.
</div>
</div>
""")
            gr.HTML("""
<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:0.8rem;margin-bottom:1rem;">
<div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:8px;padding:0.8rem;text-align:center;">
<div style="color:#3fb950;font-size:1.2rem;font-weight:700;">Q-Learning</div>
<div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">Off-policy · Fast update · Optimistic</div>
</div>
<div style="background:#0d1f38;border:1px solid #1a4a7a;border-radius:8px;padding:0.8rem;text-align:center;">
<div style="color:#58a6ff;font-size:1.2rem;font-weight:700;">SARSA</div>
<div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">On-policy · Careful update · Conservative</div>
</div>
<div style="background:#2d1c06;border:1px solid #5c3a12;border-radius:8px;padding:0.8rem;text-align:center;">
<div style="color:#ffa657;font-size:1.2rem;font-weight:700;">Monte Carlo</div>
<div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">Episodic · Full return · Unbiased</div>
</div>
</div>
""")
            with gr.Row():
                with gr.Column(scale=1, min_width=260):
                    race_diff = gr.Radio(list(DIFFICULTY.keys()),
                                         value="🐢 Medium (9×9)", label="Maze difficulty")
                    race_style = gr.Radio(list(MAZE_STYLE.keys()),
                                          value="🏰 Corridors (DFS)", label="Maze style")
                    race_eps = gr.Slider(200, 2000, value=600, step=100,
                                         label="Episodes per algorithm")
                    race_mc = gr.Checkbox(label="Include Monte Carlo (slower)", value=True)
                    btn_race = gr.Button("🏁 Start Race!", variant="primary")
                with gr.Column(scale=2):
                    race_result = gr.Markdown("*Click Start Race to run the comparison.*")
                    race_fig = gr.Plot(label="Race Results")
            # cb_race returns (fig, result_md), hence figure first in outputs.
            btn_race.click(
                cb_race,
                inputs=[race_diff, race_style, race_eps, race_mc],
                outputs=[race_fig, race_result],
            )
        # ══════════════════════════════════════════════════════════════════
        # Tab 4 — How it Works
        # ══════════════════════════════════════════════════════════════════
        with gr.Tab("🧠 How it Works"):
            gr.HTML("""
<div style="max-width:700px;margin:0 auto;padding:1rem 0;">
<h2 style="color:#3fb950;font-size:1.3rem;margin-bottom:0.3rem;">The Big Idea</h2>
<p style="color:#8b949e;line-height:1.7;">
The bot doesn't know anything about the maze at the start. It just knows
4 possible moves and gets a number (reward) after each step.
<strong style="color:#e6edf3">Negative number = bad move. Positive = good move.</strong>
The goal: find the sequence of moves that gets the most reward.
</p>
<h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">The Score Table (Q-Table)</h2>
<p style="color:#8b949e;line-height:1.7;">
The bot keeps a table with one row per maze cell and 4 columns (one per direction).
Each entry stores <em>how good it thinks that move is from that cell</em>.
At the start, everything is 0. After training, the table holds the bot's
entire learned strategy. The Q-value heatmap in the Playground shows this table visually.
</p>
<div style="background:#161b22;border:1px solid #21262d;border-radius:8px;padding:1rem;margin:1rem 0;font-family:'JetBrains Mono',monospace;font-size:0.82rem;color:#3fb950;">
Q[current_cell][move] += learning_speed × (<br>
&nbsp;&nbsp;reward_got + future_discount × best_Q[next_cell] − Q[current_cell][move]<br>
)
</div>
<h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">Exploration vs Exploitation</h2>
<p style="color:#8b949e;line-height:1.7;">
Early in training, the bot tries <strong style="color:#e6edf3">random moves</strong> (exploration)
— it doesn't know enough to trust its table yet. Over time, it relies more on what
it's learned (exploitation). This is controlled by <strong style="color:#e6edf3">epsilon (ε)</strong>,
which starts near 1.0 (100% random) and decays toward 0 (always use best known move).
</p>
<h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">Why does reward go negative first?</h2>
<p style="color:#8b949e;line-height:1.7;">
Each step costs −1 (time penalty) and hitting a wall costs −5.
A random bot hits a <em>lot</em> of walls and takes forever to find the exit,
so early rewards are very negative. As it learns, fewer walls are hit and
the path shortens — reward climbs toward 0 and eventually turns positive when
it reliably reaches the goal.
</p>
<div style="display:grid;grid-template-columns:1fr 1fr;gap:0.8rem;margin-top:1.2rem;">
<div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:8px;padding:0.9rem;">
<div style="color:#3fb950;font-weight:600;margin-bottom:0.4rem;">Q-Learning vs SARSA</div>
<div style="color:#8b949e;font-size:0.82rem;line-height:1.6;">
Q-Learning always updates toward the <em>best possible</em> next action —
even if it wouldn't actually take that action. SARSA updates toward
the action it <em>will actually take</em>. This makes SARSA more cautious near walls.
</div>
</div>
<div style="background:#2d1c06;border:1px solid #5c3a12;border-radius:8px;padding:0.9rem;">
<div style="color:#ffa657;font-weight:600;margin-bottom:0.4rem;">Why Monte Carlo is slow</div>
<div style="color:#8b949e;font-size:0.82rem;line-height:1.6;">
MC waits until the episode <em>ends</em> before updating any scores.
On large mazes where early episodes never reach the goal,
it gets zero signal for a long time. But once it starts solving,
its estimates are very accurate.
</div>
</div>
</div>
</div>
""")
    gr.HTML("""
<div style="text-align:center;color:#21262d;font-size:0.75rem;
padding:1.5rem 0 0.5rem;border-top:1px solid #161b22;margin-top:1rem;">
Built with Q-Learning · SARSA · Monte Carlo · Gymnasium · Gradio
</div>
""")
if __name__ == "__main__":
    # Launch settings gathered in one place; the stylesheet is supplied here
    # rather than on the Blocks constructor.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "css": CSS,
    }
    demo.launch(**launch_options)