SavirD committed
Commit ab2a5b9 · verified · 1 Parent(s): 578a236

Upload folder using huggingface_hub

server/meta_optimizer_environment.py CHANGED
@@ -29,6 +29,8 @@ from .tasks import TRAIN_TASK_IDS, get_task, task_spec_from_dict, TaskSpec
LOSS_THRESHOLD = 0.1
MAX_STEPS = 100
BATCH_SIZE = 32
+# Dense reward scale: reward += DENSE_REWARD_SCALE * (prev_loss - current_loss) each step (potential-based, helps credit assignment)
+DENSE_REWARD_SCALE = 0.2


def _build_model(spec: TaskSpec) -> nn.Module:
@@ -200,7 +202,9 @@ def run_meta_optimizer_trajectory(
class MetaOptimizerEnvironment(Environment[MetaOptimizerAction, MetaOptimizerObservation, State]):
    """
    Meta-learning optimizer environment: agent chooses LR scale, momentum, grad clip, weight decay per step.
-    Reward = -steps_to_reach_threshold (convergence speed). Supports 50 train tasks and held-out eval.
+    Reward: dense term = scale * (prev_loss - current_loss) each step (loss decrease); terminal = -steps_to_threshold
+    when episode ends. Episode ends at max_steps or as soon as loss < threshold (early termination). Supports 50 train
+    tasks and held-out eval.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True
@@ -222,6 +226,7 @@ class MetaOptimizerEnvironment(Environment[MetaOptimizerAction, MetaOptimizerObservation, State]):
        self._velocities: Optional[List[torch.Tensor]] = None
        self._step_count: int = 0
        self._current_loss: float = 0.0
+        self._prev_loss: float = 0.0  # for dense reward (loss decrease)
        self._steps_to_threshold: Optional[int] = None
        self._action_log: List[Dict[str, Any]] = []
        self._episode_id: Optional[str] = None
@@ -254,6 +259,7 @@ class MetaOptimizerEnvironment(Environment[MetaOptimizerAction, MetaOptimizerObservation, State]):
        with torch.no_grad():
            out = self._model(X)
        self._current_loss = nn.functional.mse_loss(out, y).item()
+        self._prev_loss = self._current_loss

        return self._observation(reward=None, grad_norm=None)

@@ -264,6 +270,7 @@ class MetaOptimizerEnvironment(Environment[MetaOptimizerAction, MetaOptimizerObservation, State]):
        **kwargs: Any,
    ) -> MetaOptimizerObservation:
        assert self._model is not None and self._task_spec is not None
+        prev_loss = self._prev_loss
        lr = action.lr_scale
        momentum = action.momentum_coef
        clip = action.grad_clip_threshold
@@ -310,11 +317,17 @@ class MetaOptimizerEnvironment(Environment[MetaOptimizerAction, MetaOptimizerObservation, State]):
        if self._steps_to_threshold is None and self._current_loss < self.loss_threshold:
            self._steps_to_threshold = self._step_count

-        done = self._step_count >= self.max_steps
+        # Dense reward: reward loss decrease (potential-based shaping, does not change optimal policy)
+        dense_reward = DENSE_REWARD_SCALE * (prev_loss - self._current_loss)
+        self._prev_loss = self._current_loss
+
+        # End episode when we hit max_steps or when loss first crosses threshold (early termination)
+        done = self._step_count >= self.max_steps or self._steps_to_threshold is not None
        if done:
-            reward = -(self._steps_to_threshold if self._steps_to_threshold is not None else self.max_steps)
+            terminal = -(self._steps_to_threshold if self._steps_to_threshold is not None else self.max_steps)
+            reward = dense_reward + terminal
        else:
-            reward = 0.0
+            reward = dense_reward

        return self._observation(reward=reward, grad_norm=grad_norm, done=done)

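
For readers skimming the diff, a minimal standalone sketch of the reward scheme this commit introduces. The constants mirror the module-level values added in the diff; the step_reward helper itself is hypothetical and not part of the repository's API.

from typing import Optional, Tuple

# Constants mirror the diff's module-level values.
DENSE_REWARD_SCALE = 0.2
MAX_STEPS = 100

def step_reward(prev_loss: float, current_loss: float,
                step_count: int,
                steps_to_threshold: Optional[int]) -> Tuple[float, bool]:
    """Hypothetical standalone version of the per-step reward logic in the diff."""
    # Dense term: scaled loss decrease (potential-based shaping).
    dense = DENSE_REWARD_SCALE * (prev_loss - current_loss)
    # Episode ends at max_steps or once the loss threshold has been crossed.
    done = step_count >= MAX_STEPS or steps_to_threshold is not None
    if not done:
        return dense, False
    # Terminal term: fewer steps to reach the threshold means a less negative reward.
    terminal = -(steps_to_threshold if steps_to_threshold is not None else MAX_STEPS)
    return dense + terminal, True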
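
The "does not change optimal policy" comment refers to the dense term being a difference of a potential (roughly, Phi = -DENSE_REWARD_SCALE * loss with discount 1), so it telescopes over an episode. A quick sanity check with made-up loss values illustrates this:

losses = [1.0, 0.6, 0.3, 0.08]  # hypothetical loss trajectory; final value is below the 0.1 threshold
dense_terms = [DENSE_REWARD_SCALE * (prev - cur) for prev, cur in zip(losses, losses[1:])]
# The cumulative dense reward depends only on the first and last loss, so the
# shaping only redistributes credit across steps; the terminal -steps_to_threshold
# term still determines which policies are preferred.
assert abs(sum(dense_terms) - DENSE_REWARD_SCALE * (losses[0] - losses[-1])) < 1e-9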