natmin322 committed on
Commit
64fec92
·
1 Parent(s): cc6f149
improve_gainlora/requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:147d29ab8e8a0f0da4b02ff8d050374648c53963177e439b6af0f9e41c3d13ea
3
- size 407
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e99f4dac41391676def68a9b2e6e0d577cbb57931fedcdac8f70bc1904fc71a9
3
+ size 422
improve_gainlora/src/cl_trainer_specroute_llama.py CHANGED
@@ -127,27 +127,43 @@ class SpecRoute_Trainer(Seq2SeqTrainer):
127
  # One-time gradient check after first backward
128
  if not self._grad_check_done:
129
  self._grad_check_done = True
130
- n_with_grad, n_zero_grad, n_none_grad = 0, 0, 0
131
- sample_name, sample_norm = None, None
132
- for name, p in self.model.named_parameters():
133
- if p.requires_grad:
134
- if p.grad is not None:
135
- gn = p.grad.norm().item()
136
- if gn > 0:
137
- n_with_grad += 1
138
- if sample_name is None:
139
- sample_name, sample_norm = name, gn
140
- else:
141
- n_zero_grad += 1
142
- else:
143
- n_none_grad += 1
144
  print("=" * 60)
145
- print(f"[GRAD CHECK] params with grad>0: {n_with_grad}, "
146
- f"grad==0: {n_zero_grad}, grad=None: {n_none_grad}")
147
- if sample_name:
148
- print(f"[GRAD CHECK] sample: {sample_name} grad_norm={sample_norm:.6e}")
 
 
 
 
 
 
 
 
 
 
 
149
  else:
150
- print("[GRAD CHECK] WARNING: NO trainable param has non-zero gradient!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  print("=" * 60)
152
 
153
  return loss.detach()
 
127
  # One-time gradient check after first backward
128
  if not self._grad_check_done:
129
  self._grad_check_done = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  print("=" * 60)
131
+ print(f"[GRAD CHECK] loss={loss.item():.6f}")
132
+ if self.is_deepspeed_enabled:
133
+ # With DeepSpeed ZeRO Stage 2 + bf16, gradients are stored in
134
+ # DeepSpeed's internal fp32 optimizer buffers, NOT in p.grad.
135
+ # p.grad IS None here — this is NORMAL. Check via optimizer.
136
+ n_trainable = sum(1 for p in self.model.parameters() if p.requires_grad)
137
+ print(f"[GRAD CHECK] DeepSpeed mode: {n_trainable} trainable params (lora_B)")
138
+ print(f"[GRAD CHECK] p.grad=None is NORMAL for DeepSpeed ZeRO Stage 2 + bf16")
139
+ print(f"[GRAD CHECK] Training health: loss={loss.item():.6f} (should decrease over steps)")
140
+ if loss.item() != loss.item(): # NaN check
141
+ print("[GRAD CHECK] WARNING: loss is NaN — check bf16 overflow or model init!")
142
+ elif loss.item() > 100:
143
+ print("[GRAD CHECK] WARNING: loss very high — check tokenization / data loading")
144
+ else:
145
+ print("[GRAD CHECK] OK: loss is finite, training proceeding normally")
146
  else:
147
+ n_with_grad, n_zero_grad, n_none_grad = 0, 0, 0
148
+ sample_name, sample_norm = None, None
149
+ for name, p in self.model.named_parameters():
150
+ if p.requires_grad:
151
+ if p.grad is not None:
152
+ gn = p.grad.norm().item()
153
+ if gn > 0:
154
+ n_with_grad += 1
155
+ if sample_name is None:
156
+ sample_name, sample_norm = name, gn
157
+ else:
158
+ n_zero_grad += 1
159
+ else:
160
+ n_none_grad += 1
161
+ print(f"[GRAD CHECK] params with grad>0: {n_with_grad}, "
162
+ f"grad==0: {n_zero_grad}, grad=None: {n_none_grad}")
163
+ if sample_name:
164
+ print(f"[GRAD CHECK] sample: {sample_name} grad_norm={sample_norm:.6e}")
165
+ else:
166
+ print("[GRAD CHECK] WARNING: NO trainable param has non-zero gradient!")
167
  print("=" * 60)
168
 
169
  return loss.detach()