add micro_batch_size config key + per-micro inner loop in train step (fixes OOM for fp32+seq2048)

Browse files

Files changed (6) hide show

configs/base.toml +1 -0
configs/grow40_simple.toml +1 -0
configs/grow40_winning.toml +1 -0
configs/replicate_zero4.toml +1 -0
configs/zero_14_17.toml +1 -0
distill.py +28 -16

configs/base.toml CHANGED Viewed

@@ -25,6 +25,7 @@ grad_clip            = 1.0
 betas                = [0.9, 0.95]
 eps                  = 1.0e-8
 samples_per_step     = 4
 max_steps            = 5
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"

 betas                = [0.9, 0.95]
 eps                  = 1.0e-8
 samples_per_step     = 4
+micro_batch_size     = 4
 max_steps            = 5
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"

configs/grow40_simple.toml CHANGED Viewed

@@ -26,6 +26,7 @@ grad_clip            = 1.0
 betas                = [0.9, 0.95]
 eps                  = 1.0e-8
 samples_per_step     = 8
 max_steps            = 2000
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"

 betas                = [0.9, 0.95]
 eps                  = 1.0e-8
 samples_per_step     = 8
+micro_batch_size     = 8
 max_steps            = 2000
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"

configs/grow40_winning.toml CHANGED Viewed

@@ -26,6 +26,7 @@ grad_clip            = 1.0
 betas                = [0.9, 0.999]
 eps                  = 1.0e-3
 samples_per_step     = 4
 max_steps            = 2000
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"

 betas                = [0.9, 0.999]
 eps                  = 1.0e-3
 samples_per_step     = 4
+micro_batch_size     = 1
 max_steps            = 2000
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"

configs/replicate_zero4.toml CHANGED Viewed

@@ -25,6 +25,7 @@ grad_clip            = 1.0
 betas                = [0.9, 0.999]
 eps                  = 1.0e-3
 samples_per_step     = 4
 max_steps            = 2000
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"

 betas                = [0.9, 0.999]
 eps                  = 1.0e-3
 samples_per_step     = 4
+micro_batch_size     = 1
 max_steps            = 2000
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"

configs/zero_14_17.toml CHANGED Viewed

@@ -26,6 +26,7 @@ grad_clip            = 1.0
 betas                = [0.9, 0.95]
 eps                  = 1.0e-8
 samples_per_step     = 8
 max_steps            = 2000
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"

 betas                = [0.9, 0.95]
 eps                  = 1.0e-8
 samples_per_step     = 8
+micro_batch_size     = 8
 max_steps            = 2000
 grad_checkpointing   = true
 attn_implementation  = "flash_attention_2"

distill.py CHANGED Viewed

@@ -71,6 +71,7 @@ REQUIRED_KEYS = {
         "teacher_dtype",
         "mixed_precision",
         "kl_chunk_size",
     ),
     "eval": ("every_steps", "samples", "seed"),
     "log": ("wandb", "wandb_project", "wandb_run", "log_every", "output_dir"),
@@ -580,6 +581,7 @@ def main():
     # ---- Train loop
     samples_per_step = cfg["train"]["samples_per_step"]
     grad_clip = cfg["train"]["grad_clip"]
     kl_start_pos = cfg["data"]["kl_start_pos"]
     kl_chunk_size = cfg["train"]["kl_chunk_size"]
@@ -590,7 +592,8 @@ def main():
     if accelerator.is_main_process:
         log.info(
             f"=== Training: max_steps={max_steps}, samples_per_step={samples_per_step} "
-            f"(per rank), effective batch={samples_per_step * accelerator.num_processes}"
         )
     student.train()
@@ -604,20 +607,29 @@ def main():
             log.warning(f"rank {accelerator.process_index}: data exhausted")
             break
-        ids, mask = collate_pad(batch, pad_id)
-        ids = ids.to(accelerator.device)
-        mask = mask.to(accelerator.device)
-        with torch.no_grad():
-            t_logits = teacher_forward(teacher, ids, mask)
-        s_logits = student(input_ids=ids, attention_mask=mask).logits
-        loss = kl_loss_masked(
-            s_logits, t_logits, mask,
-            start_pos=kl_start_pos, chunk_size=kl_chunk_size,
-        )
         optimizer.zero_grad()
-        accelerator.backward(loss)
         if grad_clip > 0:
             accelerator.clip_grad_norm_(student.parameters(), grad_clip)
         optimizer.step()
@@ -625,9 +637,9 @@ def main():
         global_step += 1
         elapsed = time.time() - t0
-        kl_local = loss.detach()
         kl_avg = accelerator.gather(kl_local.unsqueeze(0)).mean().item()
-        del t_logits, s_logits, loss, kl_local
         if accelerator.is_main_process and global_step % log_every == 0:
             lr_now = scheduler.get_last_lr()[0]

         "teacher_dtype",
         "mixed_precision",
         "kl_chunk_size",
+        "micro_batch_size",
     ),
     "eval": ("every_steps", "samples", "seed"),
     "log": ("wandb", "wandb_project", "wandb_run", "log_every", "output_dir"),
     # ---- Train loop
     samples_per_step = cfg["train"]["samples_per_step"]
+    micro_batch_size = cfg["train"]["micro_batch_size"]
     grad_clip = cfg["train"]["grad_clip"]
     kl_start_pos = cfg["data"]["kl_start_pos"]
     kl_chunk_size = cfg["train"]["kl_chunk_size"]
     if accelerator.is_main_process:
         log.info(
             f"=== Training: max_steps={max_steps}, samples_per_step={samples_per_step} "
+            f"(per rank, micro={micro_batch_size}), "
+            f"effective batch={samples_per_step * accelerator.num_processes}"
         )
     student.train()
             log.warning(f"rank {accelerator.process_index}: data exhausted")
             break
         optimizer.zero_grad()
+        batch_n = len(batch)
+        kl_sum = 0.0
+        for mb_start in range(0, batch_n, micro_batch_size):
+            micro = batch[mb_start : mb_start + micro_batch_size]
+            mb_n = len(micro)
+            ids, mask = collate_pad(micro, pad_id)
+            ids = ids.to(accelerator.device)
+            mask = mask.to(accelerator.device)
+            with torch.no_grad():
+                t_logits = teacher_forward(teacher, ids, mask)
+            s_logits = student(input_ids=ids, attention_mask=mask).logits
+            loss = kl_loss_masked(
+                s_logits, t_logits, mask,
+                start_pos=kl_start_pos, chunk_size=kl_chunk_size,
+            )
+            # Weight by micro size so summing micros gives the batch mean
+            scaled = loss * (mb_n / batch_n)
+            accelerator.backward(scaled)
+            kl_sum += loss.item() * mb_n
+            del t_logits, s_logits, loss, scaled
         if grad_clip > 0:
             accelerator.clip_grad_norm_(student.parameters(), grad_clip)
         optimizer.step()
         global_step += 1
         elapsed = time.time() - t0
+        kl_local = torch.tensor(kl_sum / batch_n, device=accelerator.device)
         kl_avg = accelerator.gather(kl_local.unsqueeze(0)).mean().item()
+        del kl_local
         if accelerator.is_main_process and global_step % log_every == 0:
             lr_now = scheduler.get_last_lr()[0]