fixbug
Browse files
improve_gainlora/requirements.txt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e99f4dac41391676def68a9b2e6e0d577cbb57931fedcdac8f70bc1904fc71a9
|
| 3 |
+
size 422
|
improve_gainlora/src/cl_trainer_specroute_llama.py
CHANGED
|
@@ -127,27 +127,43 @@ class SpecRoute_Trainer(Seq2SeqTrainer):
|
|
| 127 |
# One-time gradient check after first backward
|
| 128 |
if not self._grad_check_done:
|
| 129 |
self._grad_check_done = True
|
| 130 |
-
n_with_grad, n_zero_grad, n_none_grad = 0, 0, 0
|
| 131 |
-
sample_name, sample_norm = None, None
|
| 132 |
-
for name, p in self.model.named_parameters():
|
| 133 |
-
if p.requires_grad:
|
| 134 |
-
if p.grad is not None:
|
| 135 |
-
gn = p.grad.norm().item()
|
| 136 |
-
if gn > 0:
|
| 137 |
-
n_with_grad += 1
|
| 138 |
-
if sample_name is None:
|
| 139 |
-
sample_name, sample_norm = name, gn
|
| 140 |
-
else:
|
| 141 |
-
n_zero_grad += 1
|
| 142 |
-
else:
|
| 143 |
-
n_none_grad += 1
|
| 144 |
print("=" * 60)
|
| 145 |
-
print(f"[GRAD CHECK]
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
else:
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
print("=" * 60)
|
| 152 |
|
| 153 |
return loss.detach()
|
|
|
|
| 127 |
# One-time gradient check after first backward
|
| 128 |
if not self._grad_check_done:
|
| 129 |
self._grad_check_done = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
print("=" * 60)
|
| 131 |
+
print(f"[GRAD CHECK] loss={loss.item():.6f}")
|
| 132 |
+
if self.is_deepspeed_enabled:
|
| 133 |
+
# With DeepSpeed ZeRO Stage 2 + bf16, gradients are stored in
|
| 134 |
+
# DeepSpeed's internal fp32 optimizer buffers, NOT in p.grad.
|
| 135 |
+
# p.grad IS None here — this is NORMAL, so report a loss-based health check instead.
|
| 136 |
+
n_trainable = sum(1 for p in self.model.parameters() if p.requires_grad)
|
| 137 |
+
print(f"[GRAD CHECK] DeepSpeed mode: {n_trainable} trainable params (lora_B)")
|
| 138 |
+
print(f"[GRAD CHECK] p.grad=None is NORMAL for DeepSpeed ZeRO Stage 2 + bf16")
|
| 139 |
+
print(f"[GRAD CHECK] Training health: loss={loss.item():.6f} (should decrease over steps)")
|
| 140 |
+
if loss.item() != loss.item(): # NaN check
|
| 141 |
+
print("[GRAD CHECK] WARNING: loss is NaN — check bf16 overflow or model init!")
|
| 142 |
+
elif loss.item() > 100:
|
| 143 |
+
print("[GRAD CHECK] WARNING: loss very high — check tokenization / data loading")
|
| 144 |
+
else:
|
| 145 |
+
print("[GRAD CHECK] OK: loss is finite, training proceeding normally")
|
| 146 |
else:
|
| 147 |
+
n_with_grad, n_zero_grad, n_none_grad = 0, 0, 0
|
| 148 |
+
sample_name, sample_norm = None, None
|
| 149 |
+
for name, p in self.model.named_parameters():
|
| 150 |
+
if p.requires_grad:
|
| 151 |
+
if p.grad is not None:
|
| 152 |
+
gn = p.grad.norm().item()
|
| 153 |
+
if gn > 0:
|
| 154 |
+
n_with_grad += 1
|
| 155 |
+
if sample_name is None:
|
| 156 |
+
sample_name, sample_norm = name, gn
|
| 157 |
+
else:
|
| 158 |
+
n_zero_grad += 1
|
| 159 |
+
else:
|
| 160 |
+
n_none_grad += 1
|
| 161 |
+
print(f"[GRAD CHECK] params with grad>0: {n_with_grad}, "
|
| 162 |
+
f"grad==0: {n_zero_grad}, grad=None: {n_none_grad}")
|
| 163 |
+
if sample_name:
|
| 164 |
+
print(f"[GRAD CHECK] sample: {sample_name} grad_norm={sample_norm:.6e}")
|
| 165 |
+
else:
|
| 166 |
+
print("[GRAD CHECK] WARNING: NO trainable param has non-zero gradient!")
|
| 167 |
print("=" * 60)
|
| 168 |
|
| 169 |
return loss.detach()
|