Commit: 4e97b2c
Parent(s): 986639c
Commit message: Fixes
Files changed (1): Model_Architecture/train.py (+21, −14)
Model_Architecture/train.py
CHANGED
|
@@ -368,8 +368,13 @@ def main():
|
|
| 368 |
step = ckpt["step"]
|
| 369 |
print(f"✅ Resumed from step {step}\n")
|
| 370 |
|
| 371 |
-
#
|
| 372 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
|
| 374 |
# Expert rotation
|
| 375 |
current_expert = 0
|
|
@@ -377,11 +382,10 @@ def main():
|
|
| 377 |
model.set_active_expert(current_expert)
|
| 378 |
print(f"🎯 Training expert {current_expert}/{model_args.n_routed_experts - 1}")
|
| 379 |
|
| 380 |
-
#
|
| 381 |
accum_steps = config["training"]["gradient_accumulation_steps"]
|
| 382 |
total_steps = config["training"]["total_steps"]
|
| 383 |
grad_clip = config["training"]["grad_clip"]
|
| 384 |
-
dtype_bf16 = config["training"]["dtype"] == "bf16"
|
| 385 |
|
| 386 |
print("\n" + "="*70)
|
| 387 |
print("TRAINING STARTED")
|
|
@@ -389,7 +393,7 @@ def main():
|
|
| 389 |
|
| 390 |
model.train()
|
| 391 |
|
| 392 |
-
#
|
| 393 |
while step < total_steps:
|
| 394 |
step_start = time.time()
|
| 395 |
|
|
@@ -407,7 +411,7 @@ def main():
|
|
| 407 |
train_iter = iter(train_loader)
|
| 408 |
batch = next(train_iter)
|
| 409 |
|
| 410 |
-
#
|
| 411 |
input_ids, target_ids = batch
|
| 412 |
batch_size = input_ids.size(0)
|
| 413 |
micro_batch_size = batch_size // accum_steps
|
|
@@ -416,12 +420,12 @@ def main():
|
|
| 416 |
lm_loss_accum = 0.0
|
| 417 |
lb_loss_accum = 0.0
|
| 418 |
|
| 419 |
-
#
|
| 420 |
for accum_step in range(accum_steps):
|
| 421 |
# Calculate slice indices
|
| 422 |
start_idx = micro_batch_size * accum_step
|
| 423 |
|
| 424 |
-
# Handle last micro-batch
|
| 425 |
if accum_step == accum_steps - 1:
|
| 426 |
end_idx = batch_size
|
| 427 |
else:
|
|
@@ -436,22 +440,25 @@ def main():
|
|
| 436 |
model, input_mb, target_mb, device, config, scaler
|
| 437 |
)
|
| 438 |
|
| 439 |
-
# Accumulate losses
|
| 440 |
lm_loss_accum += lm_loss / accum_steps
|
| 441 |
lb_loss_accum += lb_loss / accum_steps
|
| 442 |
|
| 443 |
-
# Gradient clipping
|
| 444 |
if grad_clip > 0:
|
| 445 |
-
|
|
|
|
| 446 |
scaler.unscale_(optimizer)
|
| 447 |
torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
|
| 448 |
|
| 449 |
-
#
|
| 450 |
if dtype_bf16:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
scaler.step(optimizer)
|
| 452 |
scaler.update()
|
| 453 |
-
else:
|
| 454 |
-
optimizer.step()
|
| 455 |
|
| 456 |
optimizer.zero_grad(set_to_none=True)
|
| 457 |
|
|
|
|
| 368 |
step = ckpt["step"]
|
| 369 |
print(f"✅ Resumed from step {step}\n")
|
| 370 |
|
| 371 |
+
# ✅ FIX: Only create scaler for FP16, not BF16
|
| 372 |
+
dtype_bf16 = config["training"]["dtype"] == "bf16"
|
| 373 |
+
if dtype_bf16:
|
| 374 |
+
scaler = None
|
| 375 |
+
print("⚠️ BF16 mode: Disabling GradScaler (not needed/supported)\n")
|
| 376 |
+
else:
|
| 377 |
+
scaler = torch.amp.GradScaler(device='cuda', enabled=True)
|
| 378 |
|
| 379 |
# Expert rotation
|
| 380 |
current_expert = 0
|
|
|
|
| 382 |
model.set_active_expert(current_expert)
|
| 383 |
print(f"🎯 Training expert {current_expert}/{model_args.n_routed_experts - 1}")
|
| 384 |
|
| 385 |
+
# Define variables
|
| 386 |
accum_steps = config["training"]["gradient_accumulation_steps"]
|
| 387 |
total_steps = config["training"]["total_steps"]
|
| 388 |
grad_clip = config["training"]["grad_clip"]
|
|
|
|
| 389 |
|
| 390 |
print("\n" + "="*70)
|
| 391 |
print("TRAINING STARTED")
|
|
|
|
| 393 |
|
| 394 |
model.train()
|
| 395 |
|
| 396 |
+
# MAIN TRAINING LOOP
|
| 397 |
while step < total_steps:
|
| 398 |
step_start = time.time()
|
| 399 |
|
|
|
|
| 411 |
train_iter = iter(train_loader)
|
| 412 |
batch = next(train_iter)
|
| 413 |
|
| 414 |
+
# Split batch
|
| 415 |
input_ids, target_ids = batch
|
| 416 |
batch_size = input_ids.size(0)
|
| 417 |
micro_batch_size = batch_size // accum_steps
|
|
|
|
| 420 |
lm_loss_accum = 0.0
|
| 421 |
lb_loss_accum = 0.0
|
| 422 |
|
| 423 |
+
# Gradient accumulation loop
|
| 424 |
for accum_step in range(accum_steps):
|
| 425 |
# Calculate slice indices
|
| 426 |
start_idx = micro_batch_size * accum_step
|
| 427 |
|
| 428 |
+
# Handle last micro-batch
|
| 429 |
if accum_step == accum_steps - 1:
|
| 430 |
end_idx = batch_size
|
| 431 |
else:
|
|
|
|
| 440 |
model, input_mb, target_mb, device, config, scaler
|
| 441 |
)
|
| 442 |
|
| 443 |
+
# Accumulate losses
|
| 444 |
lm_loss_accum += lm_loss / accum_steps
|
| 445 |
lb_loss_accum += lb_loss / accum_steps
|
| 446 |
|
| 447 |
+
# Gradient clipping (if enabled)
|
| 448 |
if grad_clip > 0:
|
| 449 |
+
# Skip unscale for BF16
|
| 450 |
+
if not dtype_bf16:
|
| 451 |
scaler.unscale_(optimizer)
|
| 452 |
torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
|
| 453 |
|
| 454 |
+
# ✅ FIX: Conditional optimizer step
|
| 455 |
if dtype_bf16:
|
| 456 |
+
# BF16: Direct step
|
| 457 |
+
optimizer.step()
|
| 458 |
+
else:
|
| 459 |
+
# FP16: Scaled step
|
| 460 |
scaler.step(optimizer)
|
| 461 |
scaler.update()
|
|
|
|
|
|
|
| 462 |
|
| 463 |
optimizer.zero_grad(set_to_none=True)
|
| 464 |
|