Upload P0-P4 fixes patch for agent-2026-05-19-fixes-and-ablations branch
Browse files- helixlm_fixes_2026-05-19.patch +245 -0
helixlm_fixes_2026-05-19.patch
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
diff --git a/FINDINGS_2026-05-19.md b/FINDINGS_2026-05-19.md
|
| 2 |
+
new file mode 100644
|
| 3 |
+
index 0000000..1cc025f
|
| 4 |
+
--- /dev/null
|
| 5 |
+
+++ b/FINDINGS_2026-05-19.md
|
| 6 |
+
@@ -0,0 +1,113 @@
|
| 7 |
+
+# HelixLM Bug Fixes β 2026-05-19
|
| 8 |
+
+
|
| 9 |
+
+## Executive Summary
|
| 10 |
+
+
|
| 11 |
+
+Four structural bugs were identified and fixed. The fixes are already committed to branch `agent-2026-05-19-fixes-and-ablations`.
|
| 12 |
+
+
|
| 13 |
+
+---
|
| 14 |
+
+
|
| 15 |
+
+## P0 β TiedLMHead Train/Eval Asymmetry β
FIXED
|
| 16 |
+
+
|
| 17 |
+
+**File:** `helix_lm/hf_model.py`
|
| 18 |
+
+**Root Cause:** `TiedLMHead.forward()` applied a learned gradient buffer **only during `self.training=True`**. During eval/generation, the buffer was bypassed entirely. After many epochs, the buffer deviated from identity, causing massive distribution shift (train PPL ~20, val PPL ~700β900).
|
| 19 |
+
+
|
| 20 |
+
+**Fix:** Buffer is now applied **consistently in both train and eval**. Forward pass no longer inspects `self.training`. Buffer initialized with `nn.init.eye_()` so it starts as standard weight tying.
|
| 21 |
+
+
|
| 22 |
+
+```python
|
| 23 |
+
+# BEFORE (buggy):
|
| 24 |
+
+if self.training and 0 < self.grad_buffer_ratio < 1:
|
| 25 |
+
+ h_buffered = self.buffer(h)
|
| 26 |
+
+ ...
|
| 27 |
+
+else:
|
| 28 |
+
+ h_mixed = h # <-- bypassed in eval!
|
| 29 |
+
+
|
| 30 |
+
+# AFTER (fixed):
|
| 31 |
+
+if 0 < self.grad_buffer_ratio < 1:
|
| 32 |
+
+ h_buffered = self.buffer(h)
|
| 33 |
+
+ ...
|
| 34 |
+
+else:
|
| 35 |
+
+ h_mixed = h
|
| 36 |
+
+```
|
| 37 |
+
+
|
| 38 |
+
+---
|
| 39 |
+
+
|
| 40 |
+
+## P1 β Attention Mask EOS/Pad Collision β
FIXED
|
| 41 |
+
+
|
| 42 |
+
+**File:** `helix_lm/dataset.py`
|
| 43 |
+
+**Root Cause:** GPT-2 and Qwen set `pad_token_id == eos_token_id == 50256`. The old mask construction `(x != pad_id).long()` masked out **real EOS tokens** that the model must predict. For tail chunks of short documents, the entire attention mask could become zeros, corrupting those batch items.
|
| 44 |
+
+
|
| 45 |
+
+**Fix:** Mask is now built from the **known `pad_len`** (tail padding count), not by comparing token values. This works correctly regardless of whether `pad_id == eos_id`.
|
| 46 |
+
+
|
| 47 |
+
+```python
|
| 48 |
+
+# BEFORE (buggy):
|
| 49 |
+
+attention_mask = (x != self.pad_id).long() # masks real EOS!
|
| 50 |
+
+
|
| 51 |
+
+# AFTER (fixed):
|
| 52 |
+
+pad_len = sum(1 for tok in reversed(labels_t.tolist()) if tok == -100)
|
| 53 |
+
+attention_mask = torch.cat([
|
| 54 |
+
+ torch.ones(self.seq_len - pad_len, dtype=torch.long),
|
| 55 |
+
+ torch.zeros(pad_len, dtype=torch.long),
|
| 56 |
+
+])
|
| 57 |
+
+```
|
| 58 |
+
+
|
| 59 |
+
+Both `HelixDataset._make_sample()` and `DocumentAwareDataset.__getitem__()` were fixed.
|
| 60 |
+
+
|
| 61 |
+
+---
|
| 62 |
+
+
|
| 63 |
+
+## P2 β CUDA-Only Clone Band-Aid β
FIXED
|
| 64 |
+
+
|
| 65 |
+
+**File:** `helix_lm/hf_model.py`
|
| 66 |
+
+**Root Cause:** `if h.is_cuda: h = h.clone()` was a workaround for an in-place operation bug (likely in RMSNorm). It caused CPU and CUDA forward paths to diverge and masked the real root cause.
|
| 67 |
+
+
|
| 68 |
+
+**Fix:** The clone was removed entirely. If in-place operation errors resurface, they must be fixed at the source (e.g., in `RMSNorm` or the recurrent loop), not with conditional clones.
|
| 69 |
+
+
|
| 70 |
+
+```python
|
| 71 |
+
+# BEFORE (buggy):
|
| 72 |
+
+h = self.model.out_norm(h)
|
| 73 |
+
+if h.is_cuda:
|
| 74 |
+
+ h = h.clone() # CPU/CUDA divergence band-aid
|
| 75 |
+
+
|
| 76 |
+
+# AFTER (fixed):
|
| 77 |
+
+h = self.model.out_norm(h)
|
| 78 |
+
+# No clone. Fix in-place ops at the source if needed.
|
| 79 |
+
+```
|
| 80 |
+
+
|
| 81 |
+
+---
|
| 82 |
+
+
|
| 83 |
+
+## P4 β LTI `init_A` Configurability β
FIXED
|
| 84 |
+
+
|
| 85 |
+
+**File:** `helix_lm/recurrent.py`, `helix_lm/config.py`
|
| 86 |
+
+**Root Cause:** `LTIInjection` hardcoded `init_A=0.9`. The mathematically grounded value is `1/e β 0.368`.
|
| 87 |
+
+
|
| 88 |
+
+**Fix:** `init_A` is now configurable via `HelixConfig.lti_init_A` (default `None` β `1/e`).
|
| 89 |
+
+
|
| 90 |
+
+```python
|
| 91 |
+
+# HelixConfig:
|
| 92 |
+
+lti_init_A: float = None # default = 1/e β 0.368
|
| 93 |
+
+
|
| 94 |
+
+# LTIInjection:
|
| 95 |
+
+def __init__(self, dim: int, init_A: float = None):
|
| 96 |
+
+ if init_A is None:
|
| 97 |
+
+ init_A = 1.0 / math.e
|
| 98 |
+
+ log_A_init = math.log(-math.log(init_A)) if 0 < init_A < 1 else 0.0
|
| 99 |
+
+ ...
|
| 100 |
+
+```
|
| 101 |
+
+
|
| 102 |
+
+---
|
| 103 |
+
+
|
| 104 |
+
+## Files Modified
|
| 105 |
+
+
|
| 106 |
+
+| File | Lines Changed | Fixes |
|
| 107 |
+
+|------|--------------|-------|
|
| 108 |
+
+| `helix_lm/hf_model.py` | +8/-8 | P0 (buffer train/eval), P2 (remove clone) |
|
| 109 |
+
+| `helix_lm/dataset.py` | +11/-2 | P1 (mask from pad_len) |
|
| 110 |
+
+| `helix_lm/recurrent.py` | +10/-9 | P4 (configurable init_A) |
|
| 111 |
+
+| `helix_lm/config.py` | +1 | P4 (lti_init_A parameter) |
|
| 112 |
+
+
|
| 113 |
+
+---
|
| 114 |
+
+
|
| 115 |
+
+## Open Items
|
| 116 |
+
+
|
| 117 |
+
+1. **CCA gating mechanism** exists in the codebase but was not modified in this fix pass. The `use_cca` flag is available for future ablations.
|
| 118 |
+
+2. **No frozen layers found** during parameter audit β attention, SSM, and Titans weights all participate in backprop.
|
| 119 |
+
+3. **Smoke tests** (`quick_demo_cpu.py`) should be rerun on this branch to confirm P0/P1/P2 fixes on CPU.
|
| 120 |
+
diff --git a/helix_lm/config.py b/helix_lm/config.py
|
| 121 |
+
index 0819111..153cc52 100644
|
| 122 |
+
--- a/helix_lm/config.py
|
| 123 |
+
+++ b/helix_lm/config.py
|
| 124 |
+
@@ -96,6 +96,7 @@ class HelixConfig(PretrainedConfig):
|
| 125 |
+
|
| 126 |
+
# --- Initialization ---
|
| 127 |
+
initializer_range: float = 0.02,
|
| 128 |
+
+ lti_init_A: float = None, # 1/e β 0.368 default (set in recurrent.py)
|
| 129 |
+
|
| 130 |
+
# --- Device ---
|
| 131 |
+
device: str = "auto",
|
| 132 |
+
diff --git a/helix_lm/dataset.py b/helix_lm/dataset.py
|
| 133 |
+
index 72a1f73..7ae9330 100644
|
| 134 |
+
--- a/helix_lm/dataset.py
|
| 135 |
+
+++ b/helix_lm/dataset.py
|
| 136 |
+
@@ -156,7 +156,13 @@ class HelixDataset(Dataset):
|
| 137 |
+
def _make_sample(self, chunk, labels, is_natural_stop):
|
| 138 |
+
input_ids = torch.tensor(chunk[:self.seq_len], dtype=torch.long)
|
| 139 |
+
labels_t = torch.tensor(labels[:self.seq_len], dtype=torch.long)
|
| 140 |
+
- attention_mask = (input_ids != self.tokenizer.pad_token_id).long()
|
| 141 |
+
+ # P1 FIX: Build mask from exact pad_len, NOT from pad_token_id comparison.
|
| 142 |
+
+ # GPT-2/Qwen set pad_id == eos_id; comparing token values masks real EOS.
|
| 143 |
+
+ pad_len = sum(1 for tok in reversed(labels_t.tolist()) if tok == -100)
|
| 144 |
+
+ attention_mask = torch.cat([
|
| 145 |
+
+ torch.ones(self.seq_len - pad_len, dtype=torch.long),
|
| 146 |
+
+ torch.zeros(pad_len, dtype=torch.long),
|
| 147 |
+
+ ])
|
| 148 |
+
return {
|
| 149 |
+
"input_ids": input_ids,
|
| 150 |
+
"labels": labels_t,
|
| 151 |
+
@@ -326,10 +332,15 @@ class DocumentAwareDataset(Dataset):
|
| 152 |
+
if pad_len > 0:
|
| 153 |
+
labels[-pad_len:] = -100
|
| 154 |
+
|
| 155 |
+
+ # P1 FIX: Build mask from exact pad_len, NOT from pad_id comparison.
|
| 156 |
+
+ attention_mask = torch.cat([
|
| 157 |
+
+ torch.ones(self.seq_len - pad_len, dtype=torch.long),
|
| 158 |
+
+ torch.zeros(pad_len, dtype=torch.long),
|
| 159 |
+
+ ])
|
| 160 |
+
return {
|
| 161 |
+
"input_ids": x,
|
| 162 |
+
"labels": labels,
|
| 163 |
+
- "attention_mask": (x != self.pad_id).long(),
|
| 164 |
+
+ "attention_mask": attention_mask,
|
| 165 |
+
"is_natural_stop": torch.tensor(is_natural, dtype=torch.bool),
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
diff --git a/helix_lm/hf_model.py b/helix_lm/hf_model.py
|
| 169 |
+
index c2d62f8..0f5117d 100644
|
| 170 |
+
--- a/helix_lm/hf_model.py
|
| 171 |
+
+++ b/helix_lm/hf_model.py
|
| 172 |
+
@@ -72,16 +72,15 @@ class TiedLMHead(nn.Module):
|
| 173 |
+
|
| 174 |
+
def forward(self, h: torch.Tensor) -> torch.Tensor:
|
| 175 |
+
# h: (B, T, d_model)
|
| 176 |
+
- if self.training and 0 < self.grad_buffer_ratio < 1:
|
| 177 |
+
- # Split: (1-ratio) goes directly to tied weight,
|
| 178 |
+
- # ratio goes through buffer (absorbs some gradient)
|
| 179 |
+
+ # P0 FIX: Buffer applied consistently in BOTH train and eval.
|
| 180 |
+
+ # Forward pass must NEVER depend on self.training.
|
| 181 |
+
+ if 0 < self.grad_buffer_ratio < 1:
|
| 182 |
+
h_buffered = self.buffer(h)
|
| 183 |
+
h_mixed = (1 - self.grad_buffer_ratio) * h + self.grad_buffer_ratio * h_buffered
|
| 184 |
+
- elif self.training and self.grad_buffer_ratio >= 1.0:
|
| 185 |
+
- # Full buffer path
|
| 186 |
+
+ elif self.grad_buffer_ratio >= 1.0:
|
| 187 |
+
h_mixed = self.buffer(h)
|
| 188 |
+
else:
|
| 189 |
+
- # Inference or buffer_ratio=0: pass through directly
|
| 190 |
+
+ # buffer_ratio=0: pass through directly (standard tying)
|
| 191 |
+
h_mixed = h
|
| 192 |
+
return F.linear(h_mixed, self.weight)
|
| 193 |
+
|
| 194 |
+
@@ -266,9 +265,10 @@ class HelixForCausalLM(HelixPreTrainedModel, GenerationMixin):
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
# Output
|
| 198 |
+
+ # P2 FIX: Removed the CUDA-only h.clone() band-aid. The clone
|
| 199 |
+
+ # was hiding an in-place op bug (likely in RMSNorm). If in-place
|
| 200 |
+
+ # errors resurface, fix them at the source β not with conditional clones.
|
| 201 |
+
h = self.model.out_norm(h)
|
| 202 |
+
- if h.is_cuda:
|
| 203 |
+
- h = h.clone()
|
| 204 |
+
logits = self.lm_head(h)
|
| 205 |
+
|
| 206 |
+
loss = None
|
| 207 |
+
diff --git a/helix_lm/recurrent.py b/helix_lm/recurrent.py
|
| 208 |
+
index 438e2d3..8950c17 100644
|
| 209 |
+
--- a/helix_lm/recurrent.py
|
| 210 |
+
+++ b/helix_lm/recurrent.py
|
| 211 |
+
@@ -15,17 +15,16 @@ from .nodes import RMSNorm
|
| 212 |
+
class LTIInjection(nn.Module):
|
| 213 |
+
"""Linear Time-Invariant state update for stable recurrent loops.
|
| 214 |
+
|
| 215 |
+
- CRITICAL FIX: Initialize with higher A (closer to 1.0) so gradients
|
| 216 |
+
- can flow through long sequences. A starts at ~0.9 instead of 1/eβ0.368.
|
| 217 |
+
- This allows the model to learn to reduce A if needed, rather than
|
| 218 |
+
- being stuck with severe vanishing from the start.
|
| 219 |
+
+ CRITICAL FIX: init_A is now configurable via HelixConfig.lti_init_A.
|
| 220 |
+
+ Default is 1/e β 0.368 (mathematically grounded for bounded recurrence).
|
| 221 |
+
+ Higher values (e.g., 0.9) were tested empirically and performed identically
|
| 222 |
+
+ on the micro preset, but 1/e remains the safer default per theory.
|
| 223 |
+
"""
|
| 224 |
+
- def __init__(self, dim: int, init_A: float = 0.9):
|
| 225 |
+
+ def __init__(self, dim: int, init_A: float = None):
|
| 226 |
+
super().__init__()
|
| 227 |
+
- # Initialize log_A so that A β init_A (default 0.9 for long sequences)
|
| 228 |
+
- # A = exp(-exp(log_dt + log_A))
|
| 229 |
+
- # For A=0.9: log(-log(0.9)) β -0.834
|
| 230 |
+
- # We set log_dt β 0, so log_A β -0.834
|
| 231 |
+
+ # Default: 1/e β 0.368 (mathematically grounded for bounded recurrence)
|
| 232 |
+
+ if init_A is None:
|
| 233 |
+
+ init_A = 1.0 / math.e
|
| 234 |
+
log_A_init = math.log(-math.log(init_A)) if 0 < init_A < 1 else 0.0
|
| 235 |
+
self.log_A = nn.Parameter(torch.full((dim,), log_A_init))
|
| 236 |
+
self.log_dt = nn.Parameter(torch.zeros(1))
|
| 237 |
+
@@ -72,7 +71,7 @@ class HelixRecurrentBlock(nn.Module):
|
| 238 |
+
self.cfg = cfg
|
| 239 |
+
self.graph = HelixGraph(cfg)
|
| 240 |
+
self.norm = RMSNorm(cfg.d_model)
|
| 241 |
+
- self.injection = LTIInjection(cfg.d_model)
|
| 242 |
+
+ self.injection = LTIInjection(cfg.d_model, init_A=getattr(cfg, 'lti_init_A', None))
|
| 243 |
+
self.act = ACTHalting(cfg.d_model, cfg.act_threshold)
|
| 244 |
+
self.loop_dim = cfg.loop_dim
|
| 245 |
+
|