natmin322
/

Continual

Model card Files Files and versions

xet

Community

natmin322 commited on Mar 16

Commit

64fec92

1 Parent(s): cc6f149

fixbug

Browse files

Files changed (2) hide show

improve_gainlora/requirements.txt +2 -2
improve_gainlora/src/cl_trainer_specroute_llama.py +35 -19

improve_gainlora/requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:147d29ab8e8a0f0da4b02ff8d050374648c53963177e439b6af0f9e41c3d13ea
-size 407

 version https://git-lfs.github.com/spec/v1
+oid sha256:e99f4dac41391676def68a9b2e6e0d577cbb57931fedcdac8f70bc1904fc71a9
+size 422

improve_gainlora/src/cl_trainer_specroute_llama.py CHANGED Viewed

@@ -127,27 +127,43 @@ class SpecRoute_Trainer(Seq2SeqTrainer):
         # One-time gradient check after first backward
         if not self._grad_check_done:
             self._grad_check_done = True
-            n_with_grad, n_zero_grad, n_none_grad = 0, 0, 0
-            sample_name, sample_norm = None, None
-            for name, p in self.model.named_parameters():
-                if p.requires_grad:
-                    if p.grad is not None:
-                        gn = p.grad.norm().item()
-                        if gn > 0:
-                            n_with_grad += 1
-                            if sample_name is None:
-                                sample_name, sample_norm = name, gn
-                        else:
-                            n_zero_grad += 1
-                    else:
-                        n_none_grad += 1
             print("=" * 60)
-            print(f"[GRAD CHECK] params with grad>0: {n_with_grad}, "
-                  f"grad==0: {n_zero_grad}, grad=None: {n_none_grad}")
-            if sample_name:
-                print(f"[GRAD CHECK] sample: {sample_name} grad_norm={sample_norm:.6e}")
             else:
-                print("[GRAD CHECK] WARNING: NO trainable param has non-zero gradient!")
             print("=" * 60)
         return loss.detach()

         # One-time gradient check after first backward
         if not self._grad_check_done:
             self._grad_check_done = True
             print("=" * 60)
+            print(f"[GRAD CHECK] loss={loss.item():.6f}")
+            if self.is_deepspeed_enabled:
+                # With DeepSpeed ZeRO Stage 2 + bf16, gradients are stored in
+                # DeepSpeed's internal fp32 optimizer buffers, NOT in p.grad.
+                # p.grad IS None here — this is NORMAL. Check via optimizer.
+                n_trainable = sum(1 for p in self.model.parameters() if p.requires_grad)
+                print(f"[GRAD CHECK] DeepSpeed mode: {n_trainable} trainable params (lora_B)")
+                print(f"[GRAD CHECK] p.grad=None is NORMAL for DeepSpeed ZeRO Stage 2 + bf16")
+                print(f"[GRAD CHECK] Training health: loss={loss.item():.6f} (should decrease over steps)")
+                if loss.item() != loss.item():  # NaN check
+                    print("[GRAD CHECK] WARNING: loss is NaN — check bf16 overflow or model init!")
+                elif loss.item() > 100:
+                    print("[GRAD CHECK] WARNING: loss very high — check tokenization / data loading")
+                else:
+                    print("[GRAD CHECK] OK: loss is finite, training proceeding normally")
             else:
+                n_with_grad, n_zero_grad, n_none_grad = 0, 0, 0
+                sample_name, sample_norm = None, None
+                for name, p in self.model.named_parameters():
+                    if p.requires_grad:
+                        if p.grad is not None:
+                            gn = p.grad.norm().item()
+                            if gn > 0:
+                                n_with_grad += 1
+                                if sample_name is None:
+                                    sample_name, sample_norm = name, gn
+                            else:
+                                n_zero_grad += 1
+                        else:
+                            n_none_grad += 1
+                print(f"[GRAD CHECK] params with grad>0: {n_with_grad}, "
+                      f"grad==0: {n_zero_grad}, grad=None: {n_none_grad}")
+                if sample_name:
+                    print(f"[GRAD CHECK] sample: {sample_name} grad_norm={sample_norm:.6e}")
+                else:
+                    print("[GRAD CHECK] WARNING: NO trainable param has non-zero gradient!")
             print("=" * 60)
         return loss.detach()