# Spaces: Sleeping
# File size: 6,536 Bytes
# b00d5d5
"""
Tabular Q-Learning Agent.
Implements Q(s,a) ← Q(s,a) + α [r + γ·max_a' Q(s',a') − Q(s,a)]
Because Q-learning requires a finite state space, the continuous
observation is discretised into equal-width bins per dimension.
Key results from PROJECT_EXPLANATION.md:
• Mean reward: −916.97 (best among all methods)
• 5-feature state + 10 bins per dimension performs well
• Epsilon-greedy exploration with decay 0.995/episode
"""
import numpy as np
from .base_agent import BaseAgent
class QLearningAgent(BaseAgent):
    """
    Tabular Q-Learning with adaptive state discretisation.

    The Q-table is stored as a sparse dictionary
    {(discrete_state_tuple, action): q_value} for memory efficiency;
    entries that were never written read back as 0.0.
    """

    def __init__(self, state_size: int, action_size: int, config: dict):
        """
        Build an agent with an empty Q-table.

        Args:
            state_size: Number of dimensions in an observation vector.
            action_size: Number of discrete actions.
            config: Hyperparameters. Recognised keys (with defaults):
                ``learning_rate`` (0.1), ``gamma`` (0.99),
                ``epsilon_start`` (1.0), ``epsilon_end`` (0.01),
                ``epsilon_decay`` (0.995), ``num_bins`` (10).
        """
        super().__init__(state_size, action_size, config)

        # Hyperparameters
        self.learning_rate = config.get("learning_rate", 0.1)
        self.gamma = config.get("gamma", 0.99)
        self.epsilon = config.get("epsilon_start", 1.0)
        self.epsilon_end = config.get("epsilon_end", 0.01)
        self.epsilon_decay = config.get("epsilon_decay", 0.995)
        self.num_bins = config.get("num_bins", 10)

        # Adaptive bounds for normalisation: start at [0, 1] per dimension
        # and only ever widen as new observations arrive (see _discretise).
        self.state_mins = np.zeros(state_size, dtype=np.float32)
        self.state_maxs = np.ones(state_size, dtype=np.float32)

        # Sparse Q-table
        self.q_table: dict = {}

        # Stats
        self.steps = 0
        self.episodes = 0

        print(f"[Q-Learning] Initialised state={state_size} "
              f"actions={action_size} bins={self.num_bins} "
              f"lr={self.learning_rate} gamma={self.gamma}")

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _npy_path(filepath) -> str:
        """Return the path ``np.save`` actually writes for *filepath*.

        ``np.save`` appends ``.npy`` to string/path targets that do not
        already end with it; this mirrors that rule so ``save`` and ``load``
        agree on the file name.
        """
        import os

        path = os.fspath(filepath)
        return path if path.endswith(".npy") else path + ".npy"

    def _discretise(self, state: np.ndarray) -> tuple:
        """Convert continuous state → discrete tuple (hashable dict key).

        Side effect: widens ``state_mins`` / ``state_maxs`` to include
        *state*, so the same observation can map to a different tuple once
        the bounds have grown.
        """
        state = np.asarray(state, dtype=np.float32)

        # Update running bounds
        self.state_mins = np.minimum(self.state_mins, state)
        self.state_maxs = np.maximum(self.state_maxs, state)

        # Floor the range so a constant dimension never divides by zero.
        ranges = np.maximum(self.state_maxs - self.state_mins, 1e-8)
        normalised = np.clip((state - self.state_mins) / ranges, 0.0, 1.0)

        # Truncation towards zero: index num_bins - 1 is reached only when a
        # value equals the running maximum.
        # NOTE(review): kept as-is so previously saved Q-tables stay valid.
        indices = (normalised * (self.num_bins - 1)).astype(np.int32)

        # Plain Python ints hash and compare equal to the np.int32 scalars
        # used previously, so existing keys still match, and they pickle
        # more compactly.
        return tuple(indices.tolist())

    def _get_q(self, discrete_state: tuple, action: int) -> float:
        """Return Q(discrete_state, action); unseen pairs default to 0.0."""
        return self.q_table.get((discrete_state, action), 0.0)

    def _set_q(self, discrete_state: tuple, action: int, value: float):
        """Store Q(discrete_state, action) as a plain Python float."""
        self.q_table[(discrete_state, action)] = float(value)

    # ------------------------------------------------------------------
    # BaseAgent interface
    # ------------------------------------------------------------------
    def select_action(self, state, training: bool = True) -> int:
        """Epsilon-greedy action selection.

        Args:
            state: Continuous observation vector.
            training: When True, a uniformly random action is taken with
                probability ``epsilon``; when False the choice is greedy.

        Returns:
            Index of the chosen action. Ties between equally valued greedy
            actions are broken uniformly at random.
        """
        # Discretise first (even on the exploration path) so the running
        # bounds are updated for every observed state.
        ds = self._discretise(state)

        if training and np.random.random() < self.epsilon:
            return int(np.random.randint(0, self.action_size))

        q_values = [self._get_q(ds, a) for a in range(self.action_size)]
        max_q = max(q_values)
        best = [a for a, q in enumerate(q_values) if q == max_q]
        return int(np.random.choice(best))

    def train_step(self, state, action, reward, next_state, done):
        """
        One Bellman update:
        Q(s,a) ← Q(s,a) + α [r + γ·max_a' Q(s',a') − Q(s,a)].

        On a terminal transition (``done``) the target is the reward alone,
        epsilon is decayed (floored at ``epsilon_end``) and the episode
        counter advances.

        Returns:
            td_error (float): Temporal-difference error for this update.
        """
        ds = self._discretise(state)
        dns = self._discretise(next_state)
        action = int(action)
        reward = float(reward)
        done = bool(done)

        current_q = self._get_q(ds, action)
        if done:
            target_q = reward
        else:
            next_qs = [self._get_q(dns, a) for a in range(self.action_size)]
            target_q = reward + self.gamma * max(next_qs)

        td_error = target_q - current_q
        self._set_q(ds, action, current_q + self.learning_rate * td_error)

        if done:
            self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)
            self.episodes += 1
        self.steps += 1
        return float(td_error)

    def save(self, filepath: str):
        """Serialise Q-table to a .npy file.

        ``np.save`` appends ``.npy`` when *filepath* lacks it; the logged
        path is the file actually written.
        """
        payload = {
            "q_table": dict(self.q_table),
            "state_mins": self.state_mins.tolist(),
            "state_maxs": self.state_maxs.tolist(),
            "epsilon": self.epsilon,
            "steps": self.steps,
            "episodes": self.episodes,
            "num_bins": self.num_bins,
        }
        target = self._npy_path(filepath)
        np.save(target, payload, allow_pickle=True)
        print(f"[Q-Learning] Saved Q-table ({len(self.q_table)} entries) -> {target}")

    def load(self, filepath: str):
        """Deserialise Q-table from a .npy file.

        Accepts either the exact file name or the name that was passed to
        ``save`` (without the ``.npy`` suffix ``np.save`` added).

        Raises:
            FileNotFoundError: If neither candidate path exists.
        """
        import os

        path = os.fspath(filepath)
        if not os.path.exists(path):
            # save() may have written "<filepath>.npy"; try that name.
            path = self._npy_path(path)

        # SECURITY: allow_pickle=True executes arbitrary pickled code.
        # Only load files from trusted sources.
        payload = np.load(path, allow_pickle=True).item()

        self.q_table = payload["q_table"]
        self.state_mins = np.array(payload["state_mins"], dtype=np.float32)
        self.state_maxs = np.array(payload["state_maxs"], dtype=np.float32)
        self.epsilon = payload["epsilon"]
        self.steps = payload["steps"]
        self.episodes = payload["episodes"]
        self.num_bins = payload["num_bins"]
        print(f"[Q-Learning] Loaded Q-table ({len(self.q_table)} entries) <- {path}")

    # ------------------------------------------------------------------
    # Diagnostics
    # ------------------------------------------------------------------
    def stats(self) -> dict:
        """Summarise the Q-table and exploration state.

        ``entries``, ``unique_states``, ``epsilon`` and ``episodes`` are
        always present; ``mean_q`` / ``max_q`` / ``min_q`` are included only
        when the table is non-empty.
        """
        summary = {
            "entries": len(self.q_table),
            "unique_states": 0,
        }
        if self.q_table:
            states = {s for s, _ in self.q_table}
            vals = list(self.q_table.values())
            summary["unique_states"] = len(states)
            summary["mean_q"] = float(np.mean(vals))
            summary["max_q"] = float(np.max(vals))
            summary["min_q"] = float(np.min(vals))
        # Reported for the empty table too, so callers can rely on these keys.
        summary["epsilon"] = round(self.epsilon, 4)
        summary["episodes"] = self.episodes
        return summary

    def __repr__(self):
        """Return a compact one-line description; safe on an empty table."""
        # Read the two values directly rather than via stats(): avoids the
        # full-table scan and cannot hit a missing key.
        return (
            f"QLearningAgent(state={self.state_size}, actions={self.action_size}, "
            f"bins={self.num_bins}, entries={len(self.q_table)}, "
            f"ε={round(self.epsilon, 4)})"
        )
|