Fix LR warmup ordering and align adam_eps with Meta LLaMA
Move scheduler.set_lr() before optimizer.step() so the first training
step uses the correct warmup LR instead of the full peak LR, which
could perturb pretrained weights during CPT. Change adam_eps from 1e-8
to 1e-5 to match Meta LLaMA's value for better bf16 numerical stability.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
llm_lab/config/train_config.py
CHANGED
|
@@ -37,7 +37,9 @@ class TrainConfig:
|
|
| 37 |
"""Adam momentum coefficients. β2=0.95 is more stable than β2=0.999 for LLM training.
|
| 38 |
With large batches and long training, a β2 that is too large slows adaptation."""
|
| 39 |
|
| 40 |
-
adam_eps: float = 1e-8
|
|
|
|
|
|
|
| 41 |
grad_clip: float = 1.0
|
| 42 |
"""Gradient Clipping: rescales gradients when their norm exceeds 1.0.
|
| 43 |
Prevents gradient spikes that occur during early training or with noisy data."""
|
|
@@ -112,8 +114,7 @@ class TrainConfig:
|
|
| 112 |
|
| 113 |
@property
|
| 114 |
def tokens_per_step(self) -> int:
|
| 115 |
-
"""Number of tokens processed per optimizer step."""
|
| 116 |
-
# max_seq_len is injected externally (see ModelConfig)
|
| 117 |
return self.effective_batch_size * 2048
|
| 118 |
|
| 119 |
@property
|
|
|
|
| 37 |
"""Adam momentum coefficients. β2=0.95 is more stable than β2=0.999 for LLM training.
|
| 38 |
With large batches and long training, a β2 that is too large slows adaptation."""
|
| 39 |
|
| 40 |
+
adam_eps: float = 1e-5
|
| 41 |
+
"""Adam epsilon. LLaMA uses 1e-5 (not PyTorch default 1e-8) for
|
| 42 |
+
numerical stability with bf16, which has fewer mantissa bits than fp32."""
|
| 43 |
grad_clip: float = 1.0
|
| 44 |
"""Gradient Clipping: rescales gradients when their norm exceeds 1.0.
|
| 45 |
Prevents gradient spikes that occur during early training or with noisy data."""
|
|
|
|
| 114 |
|
| 115 |
@property
|
| 116 |
def tokens_per_step(self) -> int:
|
| 117 |
+
"""Number of tokens processed per optimizer step (assumes max_seq_len=2048)."""
|
|
|
|
| 118 |
return self.effective_batch_size * 2048
|
| 119 |
|
| 120 |
@property
|
llm_lab/training/trainer.py
CHANGED
|
@@ -172,12 +172,14 @@ class Trainer:
|
|
| 172 |
max_norm=self.config.grad_clip,
|
| 173 |
).item()
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
# ── Optimizer Step ──
|
| 176 |
self.optimizer.step()
|
| 177 |
|
| 178 |
-
# ── LR Update ──
|
| 179 |
-
self.scheduler.set_lr(self.optimizer, self.global_step)
|
| 180 |
-
|
| 181 |
avg_loss = total_loss / self.config.gradient_accumulation_steps
|
| 182 |
return avg_loss, grad_norm
|
| 183 |
|
|
|
|
| 172 |
max_norm=self.config.grad_clip,
|
| 173 |
).item()
|
| 174 |
|
| 175 |
+
# ── LR Update (before optimizer step) ──
|
| 176 |
+
# Must set LR before step() so the very first step uses warmup LR (not peak LR).
|
| 177 |
+
# Otherwise step 0 would use the full peak LR, perturbing pretrained weights.
|
| 178 |
+
self.scheduler.set_lr(self.optimizer, self.global_step)
|
| 179 |
+
|
| 180 |
# ── Optimizer Step ──
|
| 181 |
self.optimizer.step()
|
| 182 |
|
|
|
|
|
|
|
|
|
|
| 183 |
avg_loss = total_loss / self.config.gradient_accumulation_steps
|
| 184 |
return avg_loss, grad_norm
|
| 185 |
|