smithblack-0
/

SHRAM-dev

@@ -24,7 +24,7 @@
   "rope_mode": "main_sequence",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
-  "transformers_version": "5.9.0",
   "use_cache": true,
   "vocab_size": 50277,
   "window_size": 128

   "rope_mode": "main_sequence",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
+  "transformers_version": "5.10.1",
   "use_cache": true,
   "vocab_size": 50277,
   "window_size": 128

huggingface.py CHANGED Viewed

@@ -1284,6 +1284,13 @@ class ShramCache(Cache):
     layer have materially different update semantics; callers must update sub-caches directly
     via cache.layers[layer_idx].sliding_window_cache or cache.layers[layer_idx].mosrah_cache.
     Args:
         config: ShramConfig instance. All layer counts, buffer sizes, and sub-cache
             dimensions are derived from config so that a single source of truth governs
@@ -1310,11 +1317,19 @@ class ShramCache(Cache):
         ]
         super().__init__(layers=layers)
     # ---------------------------------------------------------------------------
     # Cache — composite-meaningful methods
     # ---------------------------------------------------------------------------
     #
-    # reset(): Inherited. Iterates all layer caches and calls reset() on each.
     #
     # reorder_cache(beam_idx): Inherited. Iterates all layer caches and reorders each.
     #
@@ -1322,6 +1337,40 @@ class ShramCache(Cache):
     #   Since ShramLayerCache.is_initialized is True from construction, this is True
     #   immediately after ShramCache.__init__ returns.
     def get_seq_length(self, layer_idx: int = 0) -> int:  # type: ignore[override]
         """Return the cumulative sequence length for the specified layer.
@@ -2191,6 +2240,7 @@ class BottleneckedEnsembleAttention(nn.Module):
              key_length=key_states.shape[2],
              device=packed_embeddings.device,
          )
         attended_states = flex_attention(
              rotated_query_states,
              key_states,
@@ -2836,7 +2886,7 @@ class MoSRAHRouter(nn.Module):
                outputs.
             capacity_scalar: Static upper bound on n; used to derive topk k as
                min(tensor.shape[dim], capacity_scalar). Must be a Python int
-               for compile compatibility.
         Returns:
             Boolean mask of the same shape as tensor.
@@ -4055,19 +4105,49 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
         return attention_mask.to(dtype=torch.bool)
     def _resolve_current_position_ids(
-        self,
-        input_ids: torch.Tensor,
-        position_ids: torch.Tensor | None,
-        full_attention_mask: torch.BoolTensor,
     ) -> torch.LongTensor:
-        """Resolve concrete current-step position IDs for the backbone."""
         if position_ids is not None:
             return position_ids.to(dtype=torch.long)
-        full_position_ids = full_attention_mask.to(dtype=torch.long).cumsum(dim=-1) - 1
-        full_position_ids = full_position_ids.masked_fill(~full_attention_mask, 0)
         current_length = input_ids.shape[1]
-        return full_position_ids[:, -current_length:]
     def forward(
         self,
@@ -4172,12 +4252,13 @@ class ShramForCausalLM(PreTrainedModel, GenerationMixin):
         )
         current_length: int = input_ids.shape[1]
         current_active_mask: torch.BoolTensor = full_attention_mask[:, -current_length:]
         current_position_ids: torch.LongTensor = self._resolve_current_position_ids(
             input_ids=input_ids,
             position_ids=position_ids,
-            full_attention_mask=full_attention_mask,
         )
-        shram_cache: ShramCache | None = past_key_values if use_cache else None
         if shram_cache is None:
             positions_start_sane = torch.all(current_position_ids[:, 0] == 0)

     layer have materially different update semantics; callers must update sub-caches directly
     via cache.layers[layer_idx].sliding_window_cache or cache.layers[layer_idx].mosrah_cache.
+    ShramCache also tracks per-batch cumulative active token counts via
+    ``_active_token_counts``. ``total_active_tokens(active_mask)`` returns the accumulated
+    count before the current step and updates the buffer in-place; the caller uses this as a
+    per-batch position bias for contiguous arange-based position ID resolution. All counter
+    updates are in-place to satisfy CUDAGraph fixed-memory requirements. ``reset()``
+    zeroes the buffer along with all layer caches.
     Args:
         config: ShramConfig instance. All layer counts, buffer sizes, and sub-cache
             dimensions are derived from config so that a single source of truth governs
         ]
         super().__init__(layers=layers)
+        # Active token counter for position ID resolution (Unit 23.B). Pre-allocated
+        # at construction so all updates remain in-place across forward passes,
+        # satisfying CUDAGraph fixed-memory requirements.
+        self._active_token_counts: torch.Tensor = torch.zeros(
+            batch_size, dtype=torch.long, device=device
+        )
     # ---------------------------------------------------------------------------
     # Cache — composite-meaningful methods
     # ---------------------------------------------------------------------------
     #
+    # reset(): Overridden. Zeroes _active_token_counts in-place, then delegates to
+    #   the inherited implementation to reset all layer caches.
     #
     # reorder_cache(beam_idx): Inherited. Iterates all layer caches and reorders each.
     #
     #   Since ShramLayerCache.is_initialized is True from construction, this is True
     #   immediately after ShramCache.__init__ returns.
+    def total_active_tokens(self, active_mask: torch.BoolTensor) -> torch.Tensor:
+        """Return the per-batch accumulated active token count before this step, then update.
+        Reads the current per-batch accumulated count as a position bias for the caller,
+        then increments the internal counter in-place by the number of active tokens in
+        ``active_mask`` for each batch item. The pre-update count is returned so the
+        caller can offset an arange-based position tensor to the correct starting position
+        for this forward pass.
+        All updates are in-place to satisfy CUDAGraph fixed-memory requirements. The
+        counter persists across forward passes until ``reset()`` is called.
+        Side effect: every call advances the counter, so this method is not idempotent;
+        call it exactly once per forward step.
+        Args:
+            active_mask: Boolean mask of shape ``(B, N)`` for the current forward step,
+                where True marks an active (non-padding) token position.
+                NOTE(review): presumably ``B`` must match the length of the counter
+                buffer allocated at construction — confirm against ``__init__``.
+        Returns:
+            Integer tensor of shape ``(B,)`` — the accumulated count before this update.
+            This is a ``clone()`` of the internal buffer, so it is unaffected by the
+            in-place increment performed here or by later calls.
+        """
+        # Snapshot first: returning the buffer itself would alias the in-place add below
+        # and hand the caller the post-update counts instead of the pre-update ones.
+        prior_counts = self._active_token_counts.clone()
+        # Row-wise count of True entries, accumulated into the existing buffer.
+        self._active_token_counts.add_(active_mask.sum(dim=-1))
+        return prior_counts
+    def reset(self) -> None:
+        """Clear all layer caches and reset the active token counter.
+        Zeroes ``_active_token_counts`` in-place, then delegates to the inherited
+        implementation to reset all ShramLayerCache instances. In-place mutation of
+        the counter is required for CUDAGraph compatibility — the buffer must remain
+        at the same memory address across steps.
+        After this call, the next ``total_active_tokens()`` call returns zeros.
+        """
+        # zero_() mutates the existing tensor; rebinding the attribute to a new tensor
+        # would replace the buffer rather than clear it.
+        self._active_token_counts.zero_()
+        super().reset()
     def get_seq_length(self, layer_idx: int = 0) -> int:  # type: ignore[override]
         """Return the cumulative sequence length for the specified layer.
              key_length=key_states.shape[2],
              device=packed_embeddings.device,
          )
         attended_states = flex_attention(
              rotated_query_states,
              key_states,
                outputs.
             capacity_scalar: Static upper bound on n; used to derive topk k as
                min(tensor.shape[dim], capacity_scalar). Must be a Python int
+               for compile compatibility.
         Returns:
             Boolean mask of the same shape as tensor.
         return attention_mask.to(dtype=torch.bool)
     def _resolve_current_position_ids(
+            self,
+            input_ids: torch.Tensor,
+            position_ids: torch.Tensor | None,
+            current_active_mask: torch.BoolTensor,
+            cache: ShramCache | None,
     ) -> torch.LongTensor:
+        """Resolve concrete current-step position IDs for the backbone.
+        When ``position_ids`` is not supplied, builds a fresh contiguous allocation via
+        arange + per-batch bias. No cumsum or stride-based views are produced on this
+        path; the returned tensor is a new allocation safe for Inductor tracing at the
+        FlexAttention boundary. Explicitly supplied ``position_ids`` are instead passed
+        through ``.to(dtype=torch.long)``, which does not copy a tensor that is already
+        long, so on that path the caller's tensor may be returned as-is.
+        When a cache is present, ``total_active_tokens()`` provides the per-batch
+        accumulated active token count as a position bias. Uncached calls use a zero
+        bias. In both cases positions are ``bias + arange(current_length)``, with
+        inactive positions masked to 0.
+        NOTE(review): inactive slots still consume an arange index, while the cache
+        counter advances only by the number of active tokens. For a row such as
+        ``[pad, pad, a, b]`` with zero bias this yields ``[0, 0, 2, 3]`` and the next
+        step starts at 2. Presumably callers only pad on the right within a step —
+        confirm.
+        Args:
+            input_ids: Current token IDs of shape ``(B, N)``.
+            position_ids: Explicit positions if supplied by the caller; returned
+                unchanged (cast to long). Bias computation is skipped entirely, so
+                ``cache.total_active_tokens()`` is not called and the cache's
+                active-token counter is not advanced for this step.
+            current_active_mask: Boolean mask of shape ``(B, N)`` for the current step.
+            cache: Active ``ShramCache``, or ``None`` for uncached forward passes.
+                When given (and ``position_ids`` is ``None``), its active-token counter
+                is advanced as a side effect of this call.
+        Returns:
+            Long tensor of shape ``(B, N)`` — position index per token, 0 for inactive.
+        """
         if position_ids is not None:
+            # Early return happens before cache.total_active_tokens(), so the counter
+            # is left untouched on this path.
             return position_ids.to(dtype=torch.long)
         current_length = input_ids.shape[1]
+        if cache is not None:
+            # Reads the pre-step count and advances the cache counter in one call.
+            position_bias = cache.total_active_tokens(current_active_mask)
+        else:
+            position_bias = torch.zeros(
+                input_ids.shape[0], dtype=torch.long, device=input_ids.device
+            )
+        # (B, 1) bias broadcast against an (N,) arange yields a (B, N) tensor.
+        positions = position_bias.unsqueeze(1) + torch.arange(
+            current_length, device=input_ids.device, dtype=torch.long
+        )
+        return positions.masked_fill(~current_active_mask, 0)
     def forward(
         self,
         )
         current_length: int = input_ids.shape[1]
         current_active_mask: torch.BoolTensor = full_attention_mask[:, -current_length:]
+        shram_cache: ShramCache | None = past_key_values if use_cache else None
         current_position_ids: torch.LongTensor = self._resolve_current_position_ids(
             input_ids=input_ids,
             position_ids=position_ids,
+            current_active_mask=current_active_mask,
+            cache=shram_cache,
         )
         if shram_cache is None:
             positions_start_sane = torch.all(current_position_ids[:, 0] == 0)