Vjeong Claude Opus 4.6 committed on
Commit
6c7b430
·
1 Parent(s): 38fd260

Improve LOSS_BOUNCE detection with pre-computed bounce metrics

Browse files

Pre-compute the minimum-loss index and the bounce amount ahead of the
classification chain, and replace the half-average comparison with tighter
bounce conditions (minimum reached before 85% of training, bounce > 0.2)
to reduce false positives.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. llm_lab/training/debugger.py +13 -5
llm_lab/training/debugger.py CHANGED
@@ -175,6 +175,16 @@ class LossDebugger:
175
  else:
176
  val_trend = "flat"
177
 
 
 
 
 
 
 
 
 
 
 
178
  # ── Classify ──
179
  status = STATUS_NORMAL
180
  severity = "green"
@@ -226,15 +236,13 @@ class LossDebugger:
226
  recommended_levels = [3, 2]
227
 
228
  # Check 5: Loss bounce (decreased then increased again)
229
- elif loss_change > 0.1 and second_half_avg > first_half_avg:
230
- min_loss = min(train_losses)
231
- bounce_amount = last_loss - min_loss
232
  status = STATUS_LOSS_BOUNCE
233
  severity = "yellow"
234
  details = (
235
  f"Loss decreased then bounced back up: "
236
- f"{first_loss:.4f} -> min {min_loss:.4f} -> {last_loss:.4f} "
237
- f"(bounce={bounce_amount:.4f}). "
238
  f"Possible LR too high, data issue, or overfitting."
239
  )
240
  recommended_levels = [3, 4]
 
175
  else:
176
  val_trend = "flat"
177
 
178
+ # Pre-compute bounce detection
179
+ _min_loss = min(train_losses)
180
+ _min_idx = train_losses.index(_min_loss)
181
+ _bounce_amount = last_loss - _min_loss
182
+ _has_bounce = (
183
+ loss_change > 0.1
184
+ and _min_idx < len(train_losses) * 0.85
185
+ and _bounce_amount > 0.2
186
+ )
187
+
188
  # ── Classify ──
189
  status = STATUS_NORMAL
190
  severity = "green"
 
236
  recommended_levels = [3, 2]
237
 
238
  # Check 5: Loss bounce (decreased then increased again)
239
+ elif _has_bounce:
 
 
240
  status = STATUS_LOSS_BOUNCE
241
  severity = "yellow"
242
  details = (
243
  f"Loss decreased then bounced back up: "
244
+ f"{first_loss:.4f} -> min {_min_loss:.4f} -> {last_loss:.4f} "
245
+ f"(bounce={_bounce_amount:.4f}). "
246
  f"Possible LR too high, data issue, or overfitting."
247
  )
248
  recommended_levels = [3, 4]