Spaces:

Dash10107
/

rl_maze_solver

Running

Daksh C Jain

Transform into interactive RL playground for all audiences

34aeb9a 15 days ago

30.7 kB

	"""
	🤖 Maze Runner — RL Playground
	An interactive, fun maze-solving playground powered by Reinforcement Learning.
	Anyone can build a maze, pick a brain, and watch the bot learn to escape.
	"""

	from __future__ import annotations
	import numpy as np
	import gradio as gr

	from maze.generator import generate_dfs_maze, generate_open_maze
	from maze.env import MazeEnv
	from agents.qlearning import train_qlearning
	from agents.sarsa import train_sarsa
	from agents.montecarlo import train_montecarlo
	from viz.renderer import (
	make_solution_gif, make_training_chart,
	make_qvalue_heatmap, make_race_chart, score_run,
	)

	# ── Helpers ───────────────────────────────────────────────────────────────────

	# UI label -> algorithm key understood by ``_train``.
	# Insertion order is the order in which the radio buttons are listed.
	ALGO_MAP = dict([
	    ("🧠 Q-Learning (recommended)", "qlearning"),
	    ("🎯 SARSA (cautious)", "sarsa"),
	    ("🎲 Monte Carlo (explorer)", "montecarlo"),
	])

	# UI label -> maze size handed to the maze generators via ``_make_env``.
	DIFFICULTY = dict([
	    ("🐣 Tiny (5×5)", 5),
	    ("🐇 Small (7×7)", 7),
	    ("🐢 Medium (9×9)", 9),
	    ("🦊 Large (13×13)", 13),
	    ("🐉 XL (17×17)", 17),
	])

	# UI label -> maze style key understood by ``_make_env``.
	MAZE_STYLE = dict([
	    ("🏰 Corridors (DFS)", "dfs"),
	    ("🌿 Open Field (random walls)", "open"),
	])


	def _make_env(size: int, style: str, seed: int) -> MazeEnv:
	    """Build a new ``MazeEnv`` for the requested maze style.

	    Args:
	        size: Maze size forwarded to the generator.
	        style: ``"dfs"`` selects ``generate_dfs_maze``; every other value
	            falls through to ``generate_open_maze`` with ``wall_frac=0.18``.
	        seed: Seed forwarded to the generator.

	    Returns:
	        A ``MazeEnv`` wrapping the freshly generated grid.
	    """
	    if style != "dfs":
	        return MazeEnv(generate_open_maze(size, wall_frac=0.18, seed=seed))
	    return MazeEnv(generate_dfs_maze(size, seed=seed))


	def _train(env: MazeEnv, algo: str, episodes: int, alpha: float,
	           gamma: float, decay: float, seed: int):
	    """Run the trainer selected by ``algo`` and return its result unchanged.

	    Args:
	        env: Environment to train on.
	        algo: One of ``"qlearning"``, ``"sarsa"`` or ``"montecarlo"``.
	        episodes, alpha, gamma, decay, seed: Forwarded positionally, in this
	            order, to the selected trainer.

	    Returns:
	        Whatever the trainer returns; callers in this file unpack it as
	        ``(agent, rewards)``.

	    Raises:
	        KeyError: If ``algo`` is not one of the three known keys.
	    """
	    if algo == "qlearning":
	        trainer = train_qlearning
	    elif algo == "sarsa":
	        trainer = train_sarsa
	    elif algo == "montecarlo":
	        trainer = train_montecarlo
	    else:
	        raise KeyError(algo)
	    return trainer(env, episodes, alpha, gamma, decay, seed)


	def _collect_path(env: MazeEnv, agent) -> list[tuple[int, ...]]:
	    """Roll out the agent's greedy policy once and record the visited cells.

	    Args:
	        env: Environment to replay on; it is reset before the rollout.
	        agent: Object exposing ``greedy_action(state)``.

	    Returns:
	        The visited positions, starting with ``env.start`` followed by one
	        entry per step taken (each decoded with ``env._from_state``).
	    """
	    state, _ = env.reset()
	    path: list[tuple[int, ...]] = [env.start]
	    # Hard cap on the rollout so a policy that cycles cannot loop forever.
	    for _ in range(env.n_states * 3):
	        state, _, terminated, truncated, _ = env.step(agent.greedy_action(state))
	        path.append(env._from_state(state))
	        # Stop on either end-of-episode flag: stepping an environment after
	        # it reported truncation is outside the step/reset contract.
	        if terminated or truncated:
	            break
	    return path


	# ── Main Playground callback ──────────────────────────────────────────────────

	def cb_solve(
	    difficulty: str,
	    maze_style: str,
	    algo_label: str,
	    episodes: int,
	    alpha: float,
	    gamma: float,
	    decay: float,
	    seed: int,
	    progress: gr.Progress = gr.Progress(),
	):
	    """Train one agent on one maze and build every Playground output.

	    Args:
	        difficulty: Key of ``DIFFICULTY``.
	        maze_style: Key of ``MAZE_STYLE``.
	        algo_label: Key of ``ALGO_MAP``; the text before ``"("`` is used as
	            the display name.
	        episodes: Number of training episodes (cast to ``int``).
	        alpha, gamma, decay: Hyperparameters forwarded to the trainer
	            (cast to ``float``).
	        seed: Seed for maze generation and training (cast to ``int``).
	        progress: Gradio progress reporter.

	    Returns:
	        ``(gif_path, train_fig, heatmap_fig, stats_md)`` in the order the
	        click handler's ``outputs`` list expects.

	    Raises:
	        KeyError: If any of the three labels is not a known key.
	    """
	    progress(0.05, desc="Building maze…")
	    size = DIFFICULTY[difficulty]
	    style = MAZE_STYLE[maze_style]
	    algo = ALGO_MAP[algo_label]

	    # Normalise the numeric inputs once instead of at every use.
	    seed = int(seed)
	    episodes = int(episodes)
	    # "🧠 Q-Learning (recommended)" -> "🧠 Q-Learning"
	    brain = algo_label.split("(")[0].strip()

	    env = _make_env(size, style, seed)

	    progress(0.15, desc=f"Training {brain}…")
	    agent, rewards = _train(env, algo, episodes, float(alpha),
	                            float(gamma), float(decay), seed)

	    # Each consumer below gets its own freshly built (identical) maze, as in
	    # the original flow — presumably so no consumer sees state left behind
	    # by another; confirm before sharing one instance.
	    progress(0.75, desc="Rendering solution…")
	    gif_path = make_solution_gif(_make_env(size, style, seed), agent,
	                                 fps=7, label=brain)

	    progress(0.85, desc="Building charts…")
	    replay_env = _make_env(size, style, seed)
	    path = _collect_path(replay_env, agent)
	    sc = score_run(path, replay_env.goal, rewards, replay_env.n_states)

	    train_fig = make_training_chart({brain: rewards})
	    heatmap_fig = make_qvalue_heatmap(_make_env(size, style, seed), agent)

	    # Assemble the Markdown line by line so that neither source indentation
	    # nor stray escape sequences end up in the rendered table.
	    solved = "✅ Yes" if sc['solved'] else "❌ No"
	    stats_md = "\n".join([
	        "",
	        f"### {sc['grade']} — {sc['verdict']}",
	        "",
	        "| | |",
	        "|---|---|",
	        f"| Solved | {solved} |",
	        f"| Steps taken | `{sc['steps']}` |",
	        f"| Efficiency score | `{sc['efficiency']}%` |",
	        f"| Avg reward (final 20%) | `{sc['avg_reward']:.1f}` |",
	        f"| Episodes trained | `{episodes}` |",
	        f"| Maze size | `{env.shape[0]} × {env.shape[1]}` cells |",
	        "",
	        "> Efficiency compares your bot's path length to the ideal shortest path.",
	        "> 100% = perfect. 0% = didn't make it.",
	        "",
	    ])
	    progress(1.0, desc="Done!")
	    return gif_path, train_fig, heatmap_fig, stats_md


	# ── Algorithm Race callback ───────────────────────────────────────────────────

	def _final_score(rewards) -> float:
	    """Return the mean reward over the last 20% of episodes (at least one)."""
	    tail = max(1, len(rewards) // 5)
	    return float(np.mean(rewards[-tail:]))


	def cb_race(
	    difficulty: str,
	    maze_style: str,
	    episodes: int,
	    run_mc: bool,
	    progress: gr.Progress = gr.Progress(),
	):
	    """Train the algorithms on the same maze and report which scored best.

	    Q-Learning and SARSA always run; Monte Carlo runs only when ``run_mc``
	    is true. All runs use seed 77 and the hyperparameters 0.1 / 0.95 / 0.995
	    (alpha / gamma / decay), each on a freshly built copy of the maze.

	    Args:
	        difficulty: Key of ``DIFFICULTY``.
	        maze_style: Key of ``MAZE_STYLE``.
	        episodes: Episodes per algorithm (cast to ``int``).
	        run_mc: Whether to include Monte Carlo.
	        progress: Gradio progress reporter.

	    Returns:
	        ``(fig, result_md)``: the race chart and a Markdown summary.

	    Raises:
	        KeyError: If a label is not a known key.
	    """
	    size = DIFFICULTY[difficulty]
	    style = MAZE_STYLE[maze_style]
	    episodes = int(episodes)
	    seed = 77
	    alpha, gamma, decay = 0.1, 0.95, 0.995

	    def race(algo: str):
	        """Train ``algo`` on a fresh copy of the maze; return its rewards."""
	        _, rewards = _train(_make_env(size, style, seed), algo, episodes,
	                            alpha, gamma, decay, seed)
	        return rewards

	    progress(0.1, desc="Training Q-Learning…")
	    rq = race("qlearning")

	    progress(0.4, desc="Training SARSA…")
	    rs = race("sarsa")

	    rc, name_c = None, ""
	    if run_mc:
	        progress(0.65, desc="Training Monte Carlo…")
	        rc = race("montecarlo")
	        name_c = "Monte Carlo"

	    progress(0.9, desc="Building race chart…")
	    fig = make_race_chart(rq, "Q-Learning", rs, "SARSA", rc, name_c)

	    # Winner
	    scores = {"Q-Learning": _final_score(rq), "SARSA": _final_score(rs)}
	    # Compare against None explicitly: truth-testing the rewards would raise
	    # for a multi-element NumPy array and would skip an empty sequence.
	    if rc is not None:
	        scores["Monte Carlo"] = _final_score(rc)
	    winner = max(scores, key=scores.get)

	    # Rows are built outside the f-string so the code parses on Python
	    # versions before 3.12 and the pipes reach Markdown unescaped.
	    rows = [
	        f"| {'🥇 ' if name == winner else ''}{name} | `{score:.1f}` |"
	        for name, score in scores.items()
	    ]
	    result_md = "\n".join([
	        "",
	        "### 🏆 Race Result",
	        "",
	        "| Algorithm | Final Score |",
	        "|---|---|",
	        *rows,
	        "",
	        f"Winner: {winner} with a final average reward of `{scores[winner]:.1f}`",
	        "",
	        "> All algorithms trained on the same maze with identical hyperparameters.",
	        "> Final score = average reward over the last 20% of episodes.",
	        "",
	    ])
	    progress(1.0)
	    return fig, result_md


	# ── CSS ───────────────────────────────────────────────────────────────────────

	# Custom dark-theme stylesheet for the whole app; handed to Gradio at launch.
	# The first rule must start with the universal selector: a selector list
	# that begins with a bare comma is invalid and the whole rule is dropped.
	CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;600;700&family=JetBrains+Mono:wght@400;600&display=swap');

	*, *::before, *::after { box-sizing: border-box; }

	body, .gradio-container {
	background: #0d1117 !important;
	color: #c9d1d9 !important;
	font-family: 'Inter', sans-serif !important;
	}
	.gradio-container { max-width: 1100px !important; margin: 0 auto !important; }

	/* Hero */
	.hero { text-align:center; padding:2rem 1rem 1rem; }
	.hero-title {
	font-size: clamp(2rem, 5vw, 3rem); font-weight: 700;
	background: linear-gradient(135deg, #3fb950, #58a6ff, #ffa657);
	-webkit-background-clip: text; -webkit-text-fill-color: transparent;
	margin: 0 0 0.4rem;
	}
	.hero-sub { color: #484f58; font-size: 0.95rem; letter-spacing:0.02em; }

	/* Tabs */
	.tab-nav { border-bottom: 1px solid #21262d !important; background: transparent !important; }
	.tab-nav button {
	font-family: 'Inter', sans-serif !important; font-size: 0.85rem !important;
	font-weight: 500 !important; color: #484f58 !important;
	background: transparent !important; border: none !important;
	padding: 0.7rem 1.1rem !important;
	}
	.tab-nav button.selected { color: #3fb950 !important; border-bottom: 2px solid #3fb950 !important; }
	.tab-nav button:hover { color: #8b949e !important; }

	/* Cards */
	.info-card {
	background: #161b22; border: 1px solid #21262d; border-radius: 10px;
	padding: 1.1rem;
	}
	.info-card-icon { font-size: 1.8rem; margin-bottom:0.4rem; }
	.info-card-title { font-weight: 600; color: #e6edf3; font-size: 0.95rem; margin-bottom:0.3rem; }
	.info-card-body { color: #8b949e; font-size: 0.83rem; line-height: 1.6; }

	/* Algo cards */
	.algo-card {
	background: #161b22; border: 1px solid #21262d; border-radius: 10px;
	padding: 1rem; margin-bottom: 0.5rem;
	}
	.algo-name { font-weight: 600; color: #e6edf3; font-size: 0.9rem; }
	.algo-desc { color: #8b949e; font-size: 0.8rem; line-height:1.5; margin-top:0.2rem; }
	.algo-tag {
	display: inline-block; font-size: 0.68rem; padding: 2px 8px;
	border-radius: 20px; margin-top: 0.4rem;
	}
	.tag-green { background:#0d2d17; color:#3fb950; border:1px solid #1a5c2e; }
	.tag-blue { background:#0d1f38; color:#58a6ff; border:1px solid #1a4a7a; }
	.tag-orange { background:#2d1c06; color:#ffa657; border:1px solid #5c3a12; }

	/* Grade badge */
	.grade-badge {
	display:inline-block; font-size:2.5rem; font-weight:700;
	font-family:'JetBrains Mono',monospace;
	}

	/* Buttons */
	button.primary {
	font-family: 'Inter', sans-serif !important; font-weight: 600 !important;
	background: linear-gradient(135deg, #238636, #2ea043) !important;
	color: #ffffff !important; border: none !important;
	border-radius: 6px !important; font-size: 0.9rem !important;
	transition: opacity 0.2s !important;
	}
	button.primary:hover { opacity: 0.85 !important; }
	button.secondary {
	background: #161b22 !important; color: #58a6ff !important;
	border: 1px solid #30363d !important; border-radius: 6px !important;
	font-family: 'Inter', sans-serif !important;
	}
	button.stop {
	background: #1c0d0d !important; color: #f78166 !important;
	border: 1px solid #6e2b2b !important; border-radius: 6px !important;
	}

	/* Labels */
	label span { font-family:'Inter',sans-serif !important;
	font-size:0.82rem !important; color:#8b949e !important; }

	/* Slider */
	input[type=range] { -webkit-appearance:none; height:4px;
	background:#21262d; border-radius:2px; }
	input[type=range]::-webkit-slider-thumb {
	-webkit-appearance:none; width:16px; height:16px;
	border-radius:50%; background:#3fb950; cursor:pointer; border:2px solid #0d1117;
	}

	/* Textarea */
	textarea { font-family:'JetBrains Mono',monospace !important;
	font-size:0.82rem !important; background:#0d1117 !important;
	color:#3fb950 !important; border:1px solid #21262d !important;
	border-radius:6px !important; }

	/* Markdown */
	.gradio-container h2 { color: #3fb950 !important; }
	.gradio-container h3 { color: #58a6ff !important; }
	.gradio-container p { color: #8b949e !important; }
	table { width:100%; border-collapse:collapse; }
	th { background:#161b22; color:#3fb950; font-size:0.78rem;
	text-align:left; padding:8px 12px; border-bottom:1px solid #21262d; }
	td { padding:8px 12px; border-bottom:1px solid #0d1117;
	color:#e6edf3; font-size:0.85rem; }
	blockquote { border-left:3px solid #3fb950; padding-left:1rem;
	color:#484f58 !important; margin:0.5rem 0; }

	footer { display:none !important; }
	.gradio-container .block { background:transparent !important; border:none !important; }
	"""

	# ── Build UI ──────────────────────────────────────────────────────────────────

	# Root of the UI: everything down to the footer HTML is declared inside
	# this Blocks context and is exposed as ``demo`` for launching.
	with gr.Blocks(title="🤖 Maze Runner — RL Playground") as demo:

	# Hero banner; the ``hero*`` classes are styled by the CSS constant.
	gr.HTML("""
	<div class="hero">
	<div class="hero-title">🤖 Maze Runner</div>
	<div class="hero-sub">An AI that learns to escape mazes — watch it happen in real time</div>
	</div>
	""")

	# Tab strip holding the four pages (Welcome, Playground, Race, How it Works).
	with gr.Tabs():

	# ══════════════════════════════════════════════════════════════════
	# Tab 1 — Welcome
	# ══════════════════════════════════════════════════════════════════
	# Welcome tab: static HTML only (no inputs, no callbacks).
	# The HTML literals are kept flush with the file's left margin so their
	# content is unchanged apart from the corrected fun fact.
	with gr.Tab("🏠 Welcome"):

	    # One-paragraph pitch.
	    gr.HTML("""
	<div style="text-align:center;padding:0.5rem 0 1.5rem;">
	<p style="color:#8b949e;font-size:1rem;max-width:580px;margin:0 auto;">
	A tiny AI robot is dropped into a maze. It knows nothing.
	Through thousands of attempts — hitting walls, finding dead ends,
	occasionally stumbling upon the exit — it slowly builds a mental map
	and learns the perfect escape route.
	</p>
	</div>
	""")

	    # Three concept cards: maze, bot, learning.
	    gr.HTML("""
	<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:1rem;margin-bottom:1.5rem;">
	<div class="info-card">
	<div class="info-card-icon">🗺️</div>
	<div class="info-card-title">The Maze</div>
	<div class="info-card-body">
	A grid of corridors and walls. The bot starts at
	<strong style="color:#58a6ff">S</strong> and must reach
	<strong style="color:#f78166">G</strong>.
	It can only see its own position — no map, no cheating.
	</div>
	</div>
	<div class="info-card">
	<div class="info-card-icon">🤖</div>
	<div class="info-card-title">The Bot</div>
	<div class="info-card-body">
	At each step it chooses: go up, down, left, or right.
	Hit a wall? Penalty. Reach the goal? Big reward!
	It remembers what worked and what didn't.
	</div>
	</div>
	<div class="info-card">
	<div class="info-card-icon">🧠</div>
	<div class="info-card-title">The Learning</div>
	<div class="info-card-body">
	Each attempt updates a "score table" for every
	position and move. After enough tries, the bot
	always picks the move with the highest score — the optimal path.
	</div>
	</div>
	</div>
	""")

	    # One card per selectable algorithm.
	    gr.HTML("""
	<div style="background:#161b22;border:1px solid #21262d;border-radius:10px;padding:1.2rem;margin-bottom:1rem;">
	<div style="font-weight:600;color:#e6edf3;margin-bottom:1rem;">🧠 Choose your Bot's Brain</div>
	<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:0.8rem;">
	<div class="algo-card">
	<div class="algo-name">Q-Learning</div>
	<div class="algo-desc">
	Updates its score table <em>immediately</em> after every step.
	Fast learner. Best for most mazes.
	</div>
	<span class="algo-tag tag-green">⚡ Recommended</span>
	</div>
	<div class="algo-card">
	<div class="algo-name">SARSA</div>
	<div class="algo-desc">
	Updates based on the move it <em>actually took</em> next,
	not just the best possible. More cautious, avoids risky paths.
	</div>
	<span class="algo-tag tag-blue">🎯 Cautious</span>
	</div>
	<div class="algo-card">
	<div class="algo-name">Monte Carlo</div>
	<div class="algo-desc">
	Plays out the <em>entire episode</em> first, then
	updates everything at once. Needs more episodes to converge.
	</div>
	<span class="algo-tag tag-orange">🎲 Explorer</span>
	</div>
	</div>
	</div>
	""")

	    # Usage steps and fun facts. The Q-Learning fact names DQN (the
	    # Atari agent): AlphaGo was not trained with Q-Learning.
	    gr.HTML("""
	<div style="display:grid;grid-template-columns:1fr 1fr;gap:1rem;">
	<div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:10px;padding:1rem;">
	<div style="font-weight:600;color:#3fb950;margin-bottom:0.4rem;">🗺️ How to use this app</div>
	<ol style="color:#8b949e;font-size:0.85rem;line-height:1.8;margin:0;padding-left:1.2rem;">
	<li>Go to <strong style="color:#e6edf3">🎮 Playground</strong> tab</li>
	<li>Pick a difficulty and maze style</li>
	<li>Choose a brain and hit <strong style="color:#3fb950">Train & Watch!</strong></li>
	<li>Watch the animated replay</li>
	<li>Try <strong style="color:#e6edf3">🏁 Algorithm Race</strong> to compare all three</li>
	</ol>
	</div>
	<div style="background:#0d1f38;border:1px solid #1a4a7a;border-radius:10px;padding:1rem;">
	<div style="font-weight:600;color:#58a6ff;margin-bottom:0.4rem;">💡 Fun facts</div>
	<ul style="color:#8b949e;font-size:0.85rem;line-height:1.8;margin:0;padding-left:1.2rem;">
	<li>This same idea trains robots, game AIs, and self-driving cars</li>
	<li>DeepMind's Atari-playing DQN agent is built on Q-Learning</li>
	<li>A 17×17 maze has 289 possible positions to learn</li>
	<li>The bot gets worse before it gets better — that's normal!</li>
	</ul>
	</div>
	</div>
	""")

	# ══════════════════════════════════════════════════════════════════
	# Tab 2 — Playground
	# ══════════════════════════════════════════════════════════════════
	# Playground tab: maze/algorithm controls on the left, results on the
	# right, wired to ``cb_solve``. HTML literals are kept flush with the
	# file's left margin so their content is unchanged.
	with gr.Tab("🎮 Playground"):

	    gr.HTML("""
	<div style="padding:0.3rem 0 1rem;">
	<div style="font-size:1.05rem;font-weight:600;color:#e6edf3;">
	Build a maze, pick a brain, watch it learn
	</div>
	<div style="color:#484f58;font-size:0.85rem;margin-top:0.2rem;">
	The animated replay shows the final learned path after training.
	</div>
	</div>
	""")

	    with gr.Row():
	        # ── Controls ──────────────────────────────────────────────
	        with gr.Column(scale=1, min_width=300):

	            gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin-bottom:0.5rem;">🗺️ MAZE SETUP</div>')

	            difficulty = gr.Radio(
	                list(DIFFICULTY),
	                value="🐢 Medium (9×9)",
	                label="Difficulty",
	            )
	            maze_style = gr.Radio(
	                list(MAZE_STYLE),
	                value="🏰 Corridors (DFS)",
	                label="Maze style",
	                info="Corridors = proper winding paths · Open = random walls",
	            )

	            gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin:0.8rem 0 0.5rem;">🧠 BOT BRAIN</div>')

	            algo = gr.Radio(
	                list(ALGO_MAP),
	                value="🧠 Q-Learning (recommended)",
	                label="Algorithm",
	            )

	            gr.HTML('<div style="font-size:0.75rem;font-weight:600;color:#484f58;text-transform:uppercase;letter-spacing:0.1em;margin:0.8rem 0 0.5rem;">⚙️ TRAINING</div>')

	            episodes = gr.Slider(
	                100, 3000, value=800, step=100,
	                label="Training episodes",
	                info="More = smarter bot, but slower",
	            )

	            # Hyperparameters stay collapsed by default; their slider
	            # values are still sent to the callback.
	            with gr.Accordion("🔬 Advanced settings", open=False):
	                alpha = gr.Slider(0.01, 0.5, value=0.1, step=0.01, label="Learning speed (α)")
	                gamma = gr.Slider(0.5, 0.99, value=0.95, step=0.01, label="Future planning (γ)")
	                decay = gr.Slider(0.90, 0.999, value=0.995, step=0.001, label="Exploration decay")
	                seed = gr.Slider(0, 100, value=42, step=1, label="Random seed")

	            btn_solve = gr.Button("🚀 Train & Watch!", variant="primary")

	        # ── Outputs ───────────────────────────────────────────────
	        with gr.Column(scale=2):
	            play_stats = gr.Markdown("Configure your maze and hit Train & Watch!")

	            with gr.Row():
	                play_gif = gr.Image(
	                    label="🎬 Bot solving the maze (animated)",
	                    type="filepath", height=360,
	                )

	            with gr.Row():
	                play_train_fig = gr.Plot(label="📈 Training progress")
	                play_heatmap = gr.Plot(label="🌡️ Q-value map (what the bot learned)")

	    # The click handler reads the advanced sliders directly, so no hidden
	    # gr.State defaults are needed for them.
	    btn_solve.click(
	        cb_solve,
	        inputs=[difficulty, maze_style, algo, episodes,
	                alpha, gamma, decay, seed],
	        outputs=[play_gif, play_train_fig, play_heatmap, play_stats],
	    )

	# ══════════════════════════════════════════════════════════════════
	# Tab 3 — Algorithm Race
	# ══════════════════════════════════════════════════════════════════
	# Algorithm Race tab: static intro HTML, a small set of controls, and the
	# chart/summary outputs, wired to ``cb_race``.
	# NOTE(review): the nesting of the Row/Column containers below (in
	# particular whether ``race_fig`` sits inside the second column or below
	# the row) cannot be read off these lines — confirm before re-indenting.
	with gr.Tab("🏁 Algorithm Race"):

	# Heading and one-line description of the comparison.
	gr.HTML("""
	<div style="padding:0.3rem 0 1rem;">
	<div style="font-size:1.05rem;font-weight:600;color:#e6edf3;">
	Head-to-head: which brain learns fastest?
	</div>
	<div style="color:#484f58;font-size:0.85rem;margin-top:0.2rem;">
	All algorithms train on the same maze with identical settings —
	the only variable is the learning strategy.
	</div>
	</div>
	""")

	# One coloured badge per competing algorithm.
	gr.HTML("""
	<div style="display:grid;grid-template-columns:repeat(3,1fr);gap:0.8rem;margin-bottom:1rem;">
	<div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:8px;padding:0.8rem;text-align:center;">
	<div style="color:#3fb950;font-size:1.2rem;font-weight:700;">Q-Learning</div>
	<div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">Off-policy · Fast update · Optimistic</div>
	</div>
	<div style="background:#0d1f38;border:1px solid #1a4a7a;border-radius:8px;padding:0.8rem;text-align:center;">
	<div style="color:#58a6ff;font-size:1.2rem;font-weight:700;">SARSA</div>
	<div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">On-policy · Careful update · Conservative</div>
	</div>
	<div style="background:#2d1c06;border:1px solid #5c3a12;border-radius:8px;padding:0.8rem;text-align:center;">
	<div style="color:#ffa657;font-size:1.2rem;font-weight:700;">Monte Carlo</div>
	<div style="color:#484f58;font-size:0.78rem;margin-top:0.2rem;">Episodic · Full return · Unbiased</div>
	</div>
	</div>
	""")

	with gr.Row():
	# Controls: maze choice, episode budget, optional Monte Carlo, start button.
	with gr.Column(scale=1, min_width=260):
	race_diff = gr.Radio(list(DIFFICULTY.keys()),
	value="🐢 Medium (9×9)", label="Maze difficulty")
	race_style = gr.Radio(list(MAZE_STYLE.keys()),
	value="🏰 Corridors (DFS)", label="Maze style")
	race_eps = gr.Slider(200, 2000, value=600, step=100,
	label="Episodes per algorithm")
	race_mc = gr.Checkbox(label="Include Monte Carlo (slower)", value=True)
	btn_race = gr.Button("🏁 Start Race!", variant="primary")

	# Outputs: Markdown summary and the race chart.
	with gr.Column(scale=2):
	race_result = gr.Markdown("Click Start Race to run the comparison.")

	race_fig = gr.Plot(label="Race Results")

	# Input order matches cb_race's parameters; output order matches its
	# (fig, result_md) return value.
	btn_race.click(
	cb_race,
	inputs=[race_diff, race_style, race_eps, race_mc],
	outputs=[race_fig, race_result],
	)

	# ══════════════════════════════════════════════════════════════════
	# Tab 4 — How it Works
	# ══════════════════════════════════════════════════════════════════
	# How it Works tab: a single static HTML explainer (no inputs, no callbacks).
	# NOTE(review): the reward figures quoted in the text (−1 per step, −5 per
	# wall hit) and "everything starts at 0" are prose only — confirm they
	# match the environment and agents before changing either side.
	with gr.Tab("🧠 How it Works"):

	gr.HTML("""
	<div style="max-width:700px;margin:0 auto;padding:1rem 0;">

	<h2 style="color:#3fb950;font-size:1.3rem;margin-bottom:0.3rem;">The Big Idea</h2>
	<p style="color:#8b949e;line-height:1.7;">
	The bot doesn't know anything about the maze at the start. It just knows
	4 possible moves and gets a number (reward) after each step.
	<strong style="color:#e6edf3">Negative number = bad move. Positive = good move.</strong>
	The goal: find the sequence of moves that gets the most reward.
	</p>

	<h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">The Score Table (Q-Table)</h2>
	<p style="color:#8b949e;line-height:1.7;">
	The bot keeps a table with one row per maze cell and 4 columns (one per direction).
	Each entry stores <em>how good it thinks that move is from that cell</em>.
	At the start, everything is 0. After training, the table holds the bot's
	entire learned strategy. The Q-value heatmap in the Playground shows this table visually.
	</p>

	<div style="background:#161b22;border:1px solid #21262d;border-radius:8px;padding:1rem;margin:1rem 0;font-family:'JetBrains Mono',monospace;font-size:0.82rem;color:#3fb950;">
	Q[current_cell][move] += learning_speed × (<br>
	reward_got + future_discount × best_Q[next_cell] − Q[current_cell][move]<br>
	)
	</div>

	<h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">Exploration vs Exploitation</h2>
	<p style="color:#8b949e;line-height:1.7;">
	Early in training, the bot tries <strong style="color:#e6edf3">random moves</strong> (exploration)
	— it doesn't know enough to trust its table yet. Over time, it relies more on what
	it's learned (exploitation). This is controlled by <strong style="color:#e6edf3">epsilon (ε)</strong>,
	which starts near 1.0 (100% random) and decays toward 0 (always use best known move).
	</p>

	<h2 style="color:#3fb950;font-size:1.3rem;margin:1.2rem 0 0.3rem;">Why does reward go negative first?</h2>
	<p style="color:#8b949e;line-height:1.7;">
	Each step costs −1 (time penalty) and hitting a wall costs −5.
	A random bot hits a <em>lot</em> of walls and takes forever to find the exit,
	so early rewards are very negative. As it learns, fewer walls are hit and
	the path shortens — reward climbs toward 0 and eventually turns positive when
	it reliably reaches the goal.
	</p>

	<div style="display:grid;grid-template-columns:1fr 1fr;gap:0.8rem;margin-top:1.2rem;">
	<div style="background:#0d2d17;border:1px solid #1a5c2e;border-radius:8px;padding:0.9rem;">
	<div style="color:#3fb950;font-weight:600;margin-bottom:0.4rem;">Q-Learning vs SARSA</div>
	<div style="color:#8b949e;font-size:0.82rem;line-height:1.6;">
	Q-Learning always updates toward the <em>best possible</em> next action —
	even if it wouldn't actually take that action. SARSA updates toward
	the action it <em>will actually take</em>. This makes SARSA more cautious near walls.
	</div>
	</div>
	<div style="background:#2d1c06;border:1px solid #5c3a12;border-radius:8px;padding:0.9rem;">
	<div style="color:#ffa657;font-weight:600;margin-bottom:0.4rem;">Why Monte Carlo is slow</div>
	<div style="color:#8b949e;font-size:0.82rem;line-height:1.6;">
	MC waits until the episode <em>ends</em> before updating any scores.
	On large mazes where early episodes never reach the goal,
	it gets zero signal for a long time. But once it starts solving,
	its estimates are very accurate.
	</div>
	</div>
	</div>

	</div>
	""")

	# Credits line rendered after the tabs. Gradio's own <footer> element is
	# hidden by the CSS constant, so this HTML block is the only footer text.
	gr.HTML("""
	<div style="text-align:center;color:#21262d;font-size:0.75rem;
	padding:1.5rem 0 0.5rem;border-top:1px solid #161b22;margin-top:1rem;">
	Built with Q-Learning · SARSA · Monte Carlo · Gymnasium · Gradio
	</div>
	""")


	if __name__ == "__main__":
	    # Serve on all interfaces at port 7860 without a public share link.
	    # NOTE(review): the stylesheet is passed to launch() rather than to
	    # gr.Blocks(); this needs a Gradio release whose launch() accepts
	    # ``css`` — confirm against the pinned version.
	    demo.launch(
	        server_name="0.0.0.0",
	        server_port=7860,
	        share=False,
	        css=CSS,
	    )