Spaces:
Running
Running
| from __future__ import annotations | |
| import numpy as np | |
| from agents.base import TabularAgent | |
| from maze.env import MazeEnv | |
| def train_qlearning( | |
| env: MazeEnv, episodes: int, alpha: float, gamma: float, | |
| decay: float, seed: int = 0, | |
| ) -> tuple[TabularAgent, list[float]]: | |
| agent = TabularAgent(env.n_states, env.action_space.n, alpha, gamma) | |
| rng = np.random.default_rng(seed) | |
| rewards = [] | |
| for _ in range(episodes): | |
| state, _ = env.reset() | |
| total = 0.0 | |
| for _ in range(env.n_states * 4): | |
| action = agent.choose_action(state, rng) | |
| next_state, reward, done, _, _ = env.step(action) | |
| # Q-Learning: off-policy TD update | |
| td_target = reward + gamma * np.max(agent.Q[next_state]) * (1 - done) | |
| agent.Q[state, action] += alpha * (td_target - agent.Q[state, action]) | |
| state = next_state | |
| total += reward | |
| if done: | |
| break | |
| agent.decay_epsilon(decay) | |
| rewards.append(total) | |
| return agent, rewards | |