Spaces:
Running on T4
Running on T4
Upload folder using huggingface_hub
Browse files
server/meta_optimizer_environment.py
CHANGED
|
@@ -29,6 +29,8 @@ from .tasks import TRAIN_TASK_IDS, get_task, task_spec_from_dict, TaskSpec
|
|
| 29 |
LOSS_THRESHOLD = 0.1
|
| 30 |
MAX_STEPS = 100
|
| 31 |
BATCH_SIZE = 32
|
|
|
|
|
|
|
| 32 |
|
| 33 |
|
| 34 |
def _build_model(spec: TaskSpec) -> nn.Module:
|
|
@@ -200,7 +202,9 @@ def run_meta_optimizer_trajectory(
|
|
| 200 |
class MetaOptimizerEnvironment(Environment[MetaOptimizerAction, MetaOptimizerObservation, State]):
|
| 201 |
"""
|
| 202 |
Meta-learning optimizer environment: agent chooses LR scale, momentum, grad clip, weight decay per step.
|
| 203 |
-
Reward =
|
|
|
|
|
|
|
| 204 |
"""
|
| 205 |
|
| 206 |
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
|
@@ -222,6 +226,7 @@ class MetaOptimizerEnvironment(Environment[MetaOptimizerAction, MetaOptimizerObs
|
|
| 222 |
self._velocities: Optional[List[torch.Tensor]] = None
|
| 223 |
self._step_count: int = 0
|
| 224 |
self._current_loss: float = 0.0
|
|
|
|
| 225 |
self._steps_to_threshold: Optional[int] = None
|
| 226 |
self._action_log: List[Dict[str, Any]] = []
|
| 227 |
self._episode_id: Optional[str] = None
|
|
@@ -254,6 +259,7 @@ class MetaOptimizerEnvironment(Environment[MetaOptimizerAction, MetaOptimizerObs
|
|
| 254 |
with torch.no_grad():
|
| 255 |
out = self._model(X)
|
| 256 |
self._current_loss = nn.functional.mse_loss(out, y).item()
|
|
|
|
| 257 |
|
| 258 |
return self._observation(reward=None, grad_norm=None)
|
| 259 |
|
|
@@ -264,6 +270,7 @@ class MetaOptimizerEnvironment(Environment[MetaOptimizerAction, MetaOptimizerObs
|
|
| 264 |
**kwargs: Any,
|
| 265 |
) -> MetaOptimizerObservation:
|
| 266 |
assert self._model is not None and self._task_spec is not None
|
|
|
|
| 267 |
lr = action.lr_scale
|
| 268 |
momentum = action.momentum_coef
|
| 269 |
clip = action.grad_clip_threshold
|
|
@@ -310,11 +317,17 @@ class MetaOptimizerEnvironment(Environment[MetaOptimizerAction, MetaOptimizerObs
|
|
| 310 |
if self._steps_to_threshold is None and self._current_loss < self.loss_threshold:
|
| 311 |
self._steps_to_threshold = self._step_count
|
| 312 |
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
if done:
|
| 315 |
-
|
|
|
|
| 316 |
else:
|
| 317 |
-
reward =
|
| 318 |
|
| 319 |
return self._observation(reward=reward, grad_norm=grad_norm, done=done)
|
| 320 |
|
|
|
|
| 29 |
LOSS_THRESHOLD = 0.1
|
| 30 |
MAX_STEPS = 100
|
| 31 |
BATCH_SIZE = 32
|
| 32 |
+
# Dense reward scale: each step's reward includes DENSE_REWARD_SCALE * (prev_loss - current_loss), i.e. the scaled loss decrease (potential-based shaping; helps credit assignment)
|
| 33 |
+
DENSE_REWARD_SCALE = 0.2
|
| 34 |
|
| 35 |
|
| 36 |
def _build_model(spec: TaskSpec) -> nn.Module:
|
|
|
|
| 202 |
class MetaOptimizerEnvironment(Environment[MetaOptimizerAction, MetaOptimizerObservation, State]):
|
| 203 |
"""
|
| 204 |
Meta-learning optimizer environment: agent chooses LR scale, momentum, grad clip, weight decay per step.
|
| 205 |
+
Reward: dense term = DENSE_REWARD_SCALE * (prev_loss - current_loss) each step (loss decrease); terminal = -steps_to_threshold (or -max_steps if the threshold was never reached), added
|
| 206 |
+
when episode ends. Episode ends at max_steps or as soon as loss < threshold (early termination). Supports 50 train
|
| 207 |
+
tasks and held-out eval.
|
| 208 |
"""
|
| 209 |
|
| 210 |
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
|
|
|
| 226 |
self._velocities: Optional[List[torch.Tensor]] = None
|
| 227 |
self._step_count: int = 0
|
| 228 |
self._current_loss: float = 0.0
|
| 229 |
+
self._prev_loss: float = 0.0 # for dense reward (loss decrease)
|
| 230 |
self._steps_to_threshold: Optional[int] = None
|
| 231 |
self._action_log: List[Dict[str, Any]] = []
|
| 232 |
self._episode_id: Optional[str] = None
|
|
|
|
| 259 |
with torch.no_grad():
|
| 260 |
out = self._model(X)
|
| 261 |
self._current_loss = nn.functional.mse_loss(out, y).item()
|
| 262 |
+
self._prev_loss = self._current_loss
|
| 263 |
|
| 264 |
return self._observation(reward=None, grad_norm=None)
|
| 265 |
|
|
|
|
| 270 |
**kwargs: Any,
|
| 271 |
) -> MetaOptimizerObservation:
|
| 272 |
assert self._model is not None and self._task_spec is not None
|
| 273 |
+
prev_loss = self._prev_loss
|
| 274 |
lr = action.lr_scale
|
| 275 |
momentum = action.momentum_coef
|
| 276 |
clip = action.grad_clip_threshold
|
|
|
|
| 317 |
if self._steps_to_threshold is None and self._current_loss < self.loss_threshold:
|
| 318 |
self._steps_to_threshold = self._step_count
|
| 319 |
|
| 320 |
+
# Dense reward: reward loss decrease (potential-based shaping, does not change optimal policy)
|
| 321 |
+
dense_reward = DENSE_REWARD_SCALE * (prev_loss - self._current_loss)
|
| 322 |
+
self._prev_loss = self._current_loss
|
| 323 |
+
|
| 324 |
+
# End episode when we hit max_steps or when loss first crosses threshold (early termination)
|
| 325 |
+
done = self._step_count >= self.max_steps or self._steps_to_threshold is not None
|
| 326 |
if done:
|
| 327 |
+
terminal = -(self._steps_to_threshold if self._steps_to_threshold is not None else self.max_steps)
|
| 328 |
+
reward = dense_reward + terminal
|
| 329 |
else:
|
| 330 |
+
reward = dense_reward
|
| 331 |
|
| 332 |
return self._observation(reward=reward, grad_norm=grad_norm, done=done)
|
| 333 |
|