Update app.py

app.py CHANGED
@@ -1,11 +1,12 @@
 # app.py
 # Gridworld RL (Q-learning) with:
-# ✅ Original visualization + layout
+# ✅ Original visualization + layout for the demo section (unchanged)
 # ✅ Non-flickering learning curve (always visible)
 # ✅ Obstacle density slider (auto-generate more/less blocks)
 # ✅ Train uses epsilon decay (converges); Play shows deterministic route (epsilon=0)
 # ✅ Same obstacle layout is reused for Play (stored in state)
-# ✅ …
+# ✅ Header updated: text on the LEFT, photo on the RIGHT
+# ✅ Removed the extra "Reinforcement Learning in een Gridworld..." block as requested
 
 import time
 import numpy as np
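The header notes that Train uses epsilon decay while Play runs fully greedy (epsilon=0). The decay schedule itself falls outside this diff's hunks; the sketch below shows the usual pattern, with `eps_min` and `eps_decay` as illustrative names rather than identifiers from app.py.

```python
import numpy as np

rng = np.random.default_rng(0)

# Illustrative epsilon-greedy selection with multiplicative decay
# (a sketch of the pattern the header describes, not app.py's code).
epsilon, eps_min, eps_decay = 1.0, 0.05, 0.995

def act_eps_greedy(q_row, epsilon):
    # Explore with probability epsilon, otherwise exploit the best-known action.
    if rng.random() < epsilon:
        return int(rng.integers(len(q_row)))
    return int(np.argmax(q_row))

# After each training episode: decay toward eps_min.
epsilon = max(eps_min, epsilon * eps_decay)
```

Decaying epsilon per episode moves the agent from exploring early to exploiting later, which is why Play can afford a deterministic epsilon=0 policy.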
@@ -28,10 +29,14 @@ ACTION_DELTAS = {
 }
 
 def _neighbors(r, c, n):
-    if r > 0:
-        …
-    if …
-        …
+    if r > 0:
+        yield (r - 1, c)
+    if r < n - 1:
+        yield (r + 1, c)
+    if c > 0:
+        yield (r, c - 1)
+    if c < n - 1:
+        yield (r, c + 1)
 
 def _has_path(size, start, goal, blocked):
     """BFS to ensure there's at least one safe path from start to goal."""
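The hunk fills in `_neighbors` but shows only the signature and docstring of `_has_path`; its body is unchanged and outside the diff. A BFS sketch consistent with that docstring (the body shown here is an assumption, reusing the `_neighbors` generator from the hunk above):

```python
from collections import deque

def _neighbors(r, c, n):
    # As added in this commit: yields the 4-connected in-bounds neighbors.
    if r > 0:
        yield (r - 1, c)
    if r < n - 1:
        yield (r + 1, c)
    if c > 0:
        yield (r, c - 1)
    if c < n - 1:
        yield (r, c + 1)

def _has_path(size, start, goal, blocked):
    """BFS to ensure there's at least one safe path from start to goal."""
    # `blocked` is assumed to be a set of (r, c) cells (walls and lava).
    if goal in blocked:
        return False
    seen = {start}
    queue = deque([start])
    while queue:
        cell = queue.popleft()
        if cell == goal:
            return True
        for nxt in _neighbors(*cell, size):
            if nxt not in blocked and nxt not in seen:
                seen.add(nxt)
                queue.append(nxt)
    return False
```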
@@ -100,14 +105,17 @@ class GridWorld:
         r, c = self.pos
         nr, nc = r + dr, c + dc
 
+        # bounds check
         if nr < 0 or nr >= self.size or nc < 0 or nc >= self.size:
             nr, nc = r, c
 
+        # wall check
         if (nr, nc) in self.walls:
             nr, nc = r, c
 
         self.pos = (nr, nc)
 
+        # rewards
         if self.pos == self.goal:
             return self.pos, +10.0, True
         if self.pos in self.lava:
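The step method returns `(state, reward, done)` tuples, with +10.0 on reaching the goal and (per the hunk's cut-off tail) a lava case after it. For context, the one-step tabular Q-learning update an agent typically applies to such transitions is sketched below; `alpha` and `gamma` are assumed names, since QAgent's internals are untouched by this commit.

```python
import numpy as np

def q_update(Q, s, a, r, s_next, done, alpha=0.1, gamma=0.99):
    # One-step tabular Q-learning: nudge Q[s][a] toward the bootstrapped target.
    target = r if done else r + gamma * float(np.max(Q[s_next]))
    Q[s][a] += alpha * (target - Q[s][a])

# e.g. a (size, size, 4) table indexed by (row, col) states, as draw_grid suggests:
Q = np.zeros((8, 8, 4))
q_update(Q, (0, 0), 2, -0.1, (1, 0), False)
```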
@@ -166,7 +174,7 @@ def draw_grid(env: GridWorld, agent: QAgent = None, show_q=False, episode=None,
 
     for r in range(n):
         for c in range(n):
-            x, y = c, n - 1 - r
+            x, y = c, n - 1 - r  # invert y so (0,0) is top-left visually
 
             tile_color = "#121a33"
             if (r, c) == env.goal:
@@ -183,17 +191,33 @@ def draw_grid(env: GridWorld, agent: QAgent = None, show_q=False, episode=None,
                     linewidth=1.0,
                     edgecolor="#2a355f",
                     facecolor=tile_color,
-                    alpha=0.95
+                    alpha=0.95,
                 )
             )
 
             if show_q and agent is not None and (r, c) not in env.walls:
                 best_a = int(np.argmax(agent.Q[r, c]))
                 qv = float(np.max(agent.Q[r, c]))
-                ax.text(
-                    …
-                    …
-                    …
+                ax.text(
+                    x + 0.5,
+                    y + 0.55,
+                    ACTIONS[best_a],
+                    ha="center",
+                    va="center",
+                    fontsize=14,
+                    color="#d7e3ff",
+                    alpha=0.65,
+                )
+                ax.text(
+                    x + 0.5,
+                    y + 0.30,
+                    f"{qv:+.2f}",
+                    ha="center",
+                    va="center",
+                    fontsize=9,
+                    color="#a9b7e6",
+                    alpha=0.55,
+                )
 
     def put_icon(rc, icon, color="#ffffff"):
         r, c = rc
@@ -253,8 +277,13 @@ def draw_learning_curve(returns, successes, window=25):
     ma = moving_average(returns, window=window)
     if len(ma) > 0:
         xs_ma = np.arange(len(returns) - len(ma) + 1, len(returns) + 1)
-        ax.plot(
-            …
+        ax.plot(
+            xs_ma,
+            ma,
+            linewidth=2.5,
+            alpha=0.95,
+            label=f"Moving avg ({min(int(window), len(returns))})",
+        )
 
     ax2 = ax.twinx()
     ax2.tick_params(colors="#c9d6ff")
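`draw_learning_curve` delegates to a `moving_average` helper that this commit does not touch. A common NumPy implementation that matches the call site (note `xs_ma` expects a valid-mode window of `len(returns) - len(ma) + 1` alignment) would be the assumed sketch below.

```python
import numpy as np

def moving_average(values, window=25):
    # Valid-mode convolution: returns len(values) - window + 1 points,
    # matching the xs_ma index range computed at the call site.
    w = min(int(window), len(values))
    if w <= 0:
        return np.array([])
    kernel = np.ones(w) / w
    return np.convolve(np.asarray(values, dtype=float), kernel, mode="valid")
```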
@@ -318,7 +347,7 @@ def train_stream(
 
     frame = draw_grid(env, agent, show_q=show_q_overlay, episode=0, step_i=0, total_reward=0.0)
     last_curve = draw_learning_curve(returns, successes, window=int(curve_window))
-    status = f"Klaar om te trainen. Obstacle density={float(obstacle_density):.2f}.…
+    status = f"Klaar om te trainen. Obstacle density={float(obstacle_density):.2f}."
     yield frame, last_curve, agent.Q, env_state, status
 
     CURVE_UPDATE_EVERY_STEPS = 8
@@ -362,7 +391,7 @@ def train_stream(
 
     frame = draw_grid(env, agent, show_q=show_q_overlay, episode=episodes, step_i=None, total_reward=None)
     last_curve = draw_learning_curve(returns, successes, window=int(curve_window))
-    status = "Training klaar ✅ Klik nu op ‘Play learned policy…
+    status = "Training klaar ✅ Klik nu op ‘Play learned policy’."
     yield frame, last_curve, agent.Q, env_state, status
 
 def play_stream(q_table, env_state, speed, show_q_overlay, max_steps):
@@ -389,7 +418,7 @@ def play_stream(q_table, env_state, speed, show_q_overlay, max_steps):
 
     curve = draw_learning_curve([], [], window=25)
     frame = draw_grid(env, agent, show_q=show_q_overlay, episode="PLAY", step_i=0, total_reward=total_r)
-    yield frame, curve, "Play • epsilon=0.0 (deterministisch)…
+    yield frame, curve, "Play • epsilon=0.0 (deterministisch)"
 
     for t in range(1, int(max_steps) + 1):
         a = agent.act_greedy(s)
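Play mode repeatedly calls `agent.act_greedy(s)`. With epsilon pinned at 0 this reduces to an argmax over the state's Q-values; the class below is an illustrative stand-in, not the app's actual QAgent, which is unchanged in this diff.

```python
import numpy as np

class QAgentSketch:
    """Illustrative stand-in for the app's QAgent (not the real class)."""

    def __init__(self, n_states, n_actions):
        self.Q = np.zeros((n_states, n_actions))

    def act_greedy(self, s):
        # Deterministic policy: always pick the highest-valued action.
        return int(np.argmax(self.Q[s]))
```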
@@ -406,7 +435,7 @@ def play_stream(q_table, env_state, speed, show_q_overlay, max_steps):
             break
 
     if env.pos == env.goal:
-        end = f"🏁 Goal bereikt! return {total_r:+.2f}…
+        end = f"🏁 Goal bereikt! return {total_r:+.2f}"
     elif env.pos in env.lava:
         end = "🔥 In lava beland. Tip: train langer of zet density lager."
     else:
@@ -416,53 +445,39 @@ def play_stream(q_table, env_state, speed, show_q_overlay, max_steps):
     yield frame, curve, end
 
 # -----------------------------
-# Gradio UI
+# Gradio UI
 # -----------------------------
 with gr.Blocks(theme=gr.themes.Soft(), title="RL Gridworld (Q-learning)") as demo:
-    # …
-    gr.…
-    …
-    …
-    …
-    )
-
-    # NEW: intro text (kept short and friendly)
-    gr.Markdown(
-        """
+    # Header block: text LEFT, image RIGHT
+    with gr.Row():
+        with gr.Column(scale=3):
+            gr.Markdown(
+                """
 ### 🤖 Een robot in het magazijn
 
 Stel je voor: je werkt in een groot magazijn.
-Tussen de stellingen rijdt een robot rond die bestellingen moet ophalen en naar het inpakstation brengen.
-…
-…
-…
-In deze demo zie je hoe zo’n robot **zelf leert** wat slim gedrag is.
+Tussen de stellingen rijdt een robot rond die bestellingen moet ophalen en naar het inpakstation brengen.
+Die robot krijgt geen kaart, geen regels en geen instructies over wat de snelste route is.
+In deze demo zie je hoe zo’n robot zelf leert wat slim gedrag is.
 
 In het begin rijdt hij willekeurig rond en maakt hij fouten.
-Maar naarmate hij meer ervaring opdoet, ontdekt hij vanzelf:
-**hoe hij veilig, efficiënt en zo snel mogelijk door het magazijn kan bewegen.**
+Maar naarmate hij meer ervaring opdoet, ontdekt hij vanzelf: hoe hij veilig, efficiënt en zo snel mogelijk door het magazijn kan bewegen.
 
 Boven zie je de robot rijden tussen stellingen en gevaarlijke zones.
 Onder zie je hoe zijn prestaties verbeteren naarmate hij leert.
 
 👉 Probeer het zelf: maak het magazijn makkelijker of moeilijker, train de robot,
 en laat daarna zien wat hij geleerd heeft.
-…
-…
-…
-…
-…
-…
-…
-…
-- **Obstacle density**: hoeveel 🧱/🔥 er in het grid staan (meer = moeilijker)
-- **Train**: agent leert (epsilon decays: eerst ontdekken, later benutten)
-- **Play learned policy**: toont wat hij geleerd heeft (**epsilon=0**)
-
-Rechts zie je een **learning curve** (return + moving average + success rate) die **niet knippert**.
-"""
-    )
+                """
+            )
+        with gr.Column(scale=2):
+            gr.Image(
+                value="humanoid-robot-apptronic-1024x684.jpg.webp",
+                show_label=False,
+                height=340,
+            )
 
+    # ---- Everything below here is your demo layout (kept the same) ----
     q_state = gr.State(None)
    env_state = gr.State(None)
 