Spaces:

FaiziRBLX
/

NousAPI

Sleeping

App Files Community

FaiziRBLX committed 24 days ago

Commit

f040bfd

verified ·

1 Parent(s): 110b8ce

Update best.py

Browse files

Files changed (1) hide show

best.py +17 -23

best.py CHANGED Viewed

@@ -261,9 +261,13 @@ class GroupedQueryAttention(nn.Module):
             query_states = (query_states * cos_full) + (rotate_half(query_states) * sin_full)
             key_states   = (key_states   * cos_full) + (rotate_half(key_states)   * sin_full)
         present_kv = (key_states, value_states) if use_cache else None
-        # FIX: only expand KV if groups > 1 (skip no-op repeat when groups==1)
         if self.num_key_value_groups > 1:
             key_states   = key_states  .repeat_interleave(self.num_key_value_groups, dim=1)
             value_states = value_states.repeat_interleave(self.num_key_value_groups, dim=1)
@@ -353,10 +357,10 @@ class DecoderLayer(nn.Module):
 class LabelSmoothingCrossEntropy(nn.Module):
     """
-    Cross-entropy with label smoothing that correctly ignores positions
-    where label == ignore_index. PyTorch's built-in label_smoothing
-    distributes probability mass to ALL vocab entries including padding —
-    this implementation does not.
     """
     def __init__(self, vocab_size: int, smoothing: float = 0.1, ignore_index: int = -100):
@@ -364,26 +368,17 @@ class LabelSmoothingCrossEntropy(nn.Module):
         self.vocab_size   = vocab_size
         self.smoothing    = smoothing
         self.ignore_index = ignore_index
-        self.confidence   = 1.0 - smoothing
     def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
         # logits: [N, V]  targets: [N]
-        mask = targets != self.ignore_index
-        if mask.sum() == 0:
-            return logits.sum() * 0.0  # differentiable zero
-        logits  = logits[mask]
-        targets = targets[mask]
-        log_probs = F.log_softmax(logits, dim=-1)           # [N_valid, V]
-        # Smooth target distribution
-        smooth_val = self.smoothing / (self.vocab_size - 1)
-        smooth_dist = torch.full_like(log_probs, smooth_val)
-        smooth_dist.scatter_(1, targets.unsqueeze(1), self.confidence)
-        loss = -(smooth_dist * log_probs).sum(dim=-1).mean()
-        return loss
 # ============================================================================
@@ -900,7 +895,6 @@ def train_model(
         betas=(config.adam_beta1, config.adam_beta2),
         eps=config.adam_epsilon,
         weight_decay=config.weight_decay,
-        fused=True if (device.type == 'cuda' and hasattr(torch.optim.AdamW, '__init__')) else False,
     )
     total_steps = sum(

             query_states = (query_states * cos_full) + (rotate_half(query_states) * sin_full)
             key_states   = (key_states   * cos_full) + (rotate_half(key_states)   * sin_full)
+        # Store pre-expand KV in cache (shape [B, num_kv_heads, T, D]).
+        # Must happen BEFORE repeat_interleave — otherwise cached keys have
+        # num_heads channels instead of num_kv_heads, and every decode step
+        # re-expands them again, corrupting attention.
         present_kv = (key_states, value_states) if use_cache else None
+        # Expand KV heads for full attention computation
         if self.num_key_value_groups > 1:
             key_states   = key_states  .repeat_interleave(self.num_key_value_groups, dim=1)
             value_states = value_states.repeat_interleave(self.num_key_value_groups, dim=1)
 class LabelSmoothingCrossEntropy(nn.Module):
     """
+    Cross-entropy with label smoothing.
+    Filters ignore_index=-100 first, then uses F.cross_entropy with smoothing.
+    This keeps the exact same loss scale as the original nn.CrossEntropyLoss
+    so the LR schedule pacing is unchanged.
     """
     def __init__(self, vocab_size: int, smoothing: float = 0.1, ignore_index: int = -100):
         self.vocab_size   = vocab_size
         self.smoothing    = smoothing
         self.ignore_index = ignore_index
     def forward(self, logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
         # logits: [N, V]  targets: [N]
+        # F.cross_entropy with label_smoothing and ignore_index is correct in
+        # PyTorch >= 1.10 — it does NOT distribute to ignored positions.
+        return F.cross_entropy(
+            logits,
+            targets,
+            ignore_index=self.ignore_index,
+            label_smoothing=self.smoothing,
+        )
 # ============================================================================
         betas=(config.adam_beta1, config.adam_beta2),
         eps=config.adam_epsilon,
         weight_decay=config.weight_decay,
     )
     total_steps = sum(