Update architecture and tokenizer
- __attention__sliding_window_attention.py +16 -15
- __cache__sliding_window_cache.py +41 -7
- config.json +1 -1
__attention__sliding_window_attention.py
CHANGED

@@ -120,12 +120,15 @@ class SlidingWindowAttention(nn.Module):
         # The cache returns the current-step visible local frame, not merely the
         # retained next-step cache buffer.
         if cache is not None:
-            k_full, v_full, full_active_mask = cache.update(k, v, active_mask)
+            k_full, v_full, full_active_mask, full_positions = cache.update(
+                k, v, active_mask, position_ids
+            )
         else:
-            k_full, v_full, full_active_mask = k, v, active_mask
+            k_full, v_full, full_active_mask, full_positions = k, v, active_mask, position_ids
 
         block_mask = self._make_block_mask(
             active_mask=full_active_mask,
+            positions=full_positions,
             batch_size=batch_size,
             num_heads=self.num_heads,
             query_len=query_len,
@@ -182,6 +185,7 @@ class SlidingWindowAttention(nn.Module):
     def _make_block_mask(
         self,
         active_mask: torch.Tensor,
+        positions: torch.Tensor,
         batch_size: int,
         num_heads: int,
         query_len: int,
@@ -191,17 +195,14 @@ class SlidingWindowAttention(nn.Module):
     ) -> Any:
         """Create the FlexAttention block mask for masked local continuation.
 
-        The returned local frame is chronological in raw buffer order; dead
-        positions may remain inside it.
-
-
-
-
-        is used to locate query rows. Semantic active-token positions are then
-        used to decide causality and sliding-window distance.
+        The returned local frame is chronological in raw buffer order; dead
+        positions may remain inside it. Liveness is carried by `active_mask`.
+        Causality and window distance are determined from `positions`, which
+        holds the absolute sequence position of every slot in the composite
+        frame. Using absolute positions rather than a cumsum over the active
+        mask eliminates the data-dependent computation that blocks torch.compile.
         """
         query_offset = kv_len - query_len
-        semantic_positions = active_mask.long().cumsum(dim=-1) - 1
 
         def sliding_window_mask(
             batch_idx: torch.Tensor,
@@ -215,11 +216,11 @@ class SlidingWindowAttention(nn.Module):
             query_is_active = active_mask[batch_idx, q_abs]
             key_is_active = active_mask[batch_idx, kv_idx]
 
-            q_sem = semantic_positions[batch_idx, q_abs]
-            k_sem = semantic_positions[batch_idx, kv_idx]
+            q_pos = positions[batch_idx, q_abs]
+            k_pos = positions[batch_idx, kv_idx]
 
-            is_causal = k_sem <= q_sem
-            in_window = (q_sem - k_sem) < window_size
+            is_causal = k_pos <= q_pos
+            in_window = (q_pos - k_pos) < window_size
 
             return query_is_active & key_is_active & is_causal & in_window
 
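For readers unfamiliar with the FlexAttention side of this change: the new predicate only gathers from precomputed tensors, so the whole mask construction stays traceable. Below is a minimal runnable sketch of the same idea, assuming the `torch.nn.attention.flex_attention.create_block_mask` API from PyTorch 2.5+; the sizes, the random `active_mask`, and the contiguous `positions` are illustrative stand-ins for what the cache returns, not this repository's actual wiring.

```python
import torch
from torch.nn.attention.flex_attention import create_block_mask

B = 2
window_size = 128                  # matches config.json's "window_size"
query_len = 128                    # T_new, the current chunk
kv_len = window_size + query_len   # composite frame: retained window + chunk
query_offset = kv_len - query_len

# Stand-ins for what the cache returns: a liveness flag and the absolute
# sequence position of every slot in the composite frame. Real frames may
# hold stale zeros in inactive slots; active_mask is what makes that safe.
active_mask = torch.rand(B, kv_len) > 0.1
positions = torch.arange(kv_len).repeat(B, 1)

def sliding_window_mask(batch_idx, head_idx, q_idx, kv_idx):
    q_abs = q_idx + query_offset  # query rows sit at the tail of the frame
    query_is_active = active_mask[batch_idx, q_abs]
    key_is_active = active_mask[batch_idx, kv_idx]
    # Plain gathers from precomputed tensors; no cumsum, nothing data-dependent.
    q_pos = positions[batch_idx, q_abs]
    k_pos = positions[batch_idx, kv_idx]
    is_causal = k_pos <= q_pos
    in_window = (q_pos - k_pos) < window_size
    return query_is_active & key_is_active & is_causal & in_window

block_mask = create_block_mask(
    sliding_window_mask, B=B, H=None, Q_LEN=query_len, KV_LEN=kv_len, device="cpu"
)
```

The resulting `block_mask` is what a `flex_attention(q, k, v, block_mask=block_mask)` call would consume. With the old approach, `semantic_positions` had to be recomputed from the mask's values each step via cumsum, which is exactly the data-dependent computation the new docstring says was blocking torch.compile.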
__cache__sliding_window_cache.py
CHANGED

@@ -92,6 +92,15 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
             device=device,
         )
 
+        # Absolute sequence positions of each retained slot. Inactive slots
+        # retain zero; correctness is carried by active_mask.
+        self.positions = torch.zeros(
+            batch_size,
+            sliding_window,
+            dtype=torch.long,
+            device=device,
+        )
+
         self.is_initialized = True
 
         # Cumulative count of all token positions presented through update() for
@@ -104,8 +113,9 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
         key_states: torch.Tensor,
         value_states: torch.Tensor,
         active_mask: torch.Tensor,
+        positions: torch.Tensor,
         cache_kwargs: dict | None = None,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """Return the current-step local frame and retain the next-step window.
 
         Args:
@@ -115,6 +125,8 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
                 current chunk.
             active_mask: Shape `(B, T_new)` bool. `True` means the
                 corresponding token position in the current chunk is active.
+            positions: Shape `(B, T_new)` long. Absolute sequence position of
+                each token in the current chunk.
             cache_kwargs: Present only to satisfy the `CacheLayerMixin`
                 interface. Unused by this cache.
 
@@ -123,6 +135,7 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
             - visible_keys: `(B, H, sliding_window + T_new, D)`
             - visible_values: `(B, H, sliding_window + T_new, D)`
             - visible_active_mask: `(B, sliding_window + T_new)`
+            - visible_positions: `(B, sliding_window + T_new)`
 
         These are the tensors the local attention path should consume
         directly for the current step.
@@ -134,10 +147,11 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
 
         # The current-step local frame is just retained cache state followed by
         # the current chunk in chronological order.
-        composite_keys, composite_values, composite_mask = self._make_composite_frame(
+        composite_keys, composite_values, composite_mask, composite_positions = self._make_composite_frame(
            key_states=key_states,
            value_states=value_states,
            active_mask=active_mask,
+            positions=positions,
        )
 
         # The cache remembers only the last raw sliding-window positions of that
@@ -147,11 +161,12 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
             composite_keys=composite_keys,
             composite_values=composite_values,
             composite_mask=composite_mask,
+            composite_positions=composite_positions,
         )
 
         self._total_processed += key_states.shape[2]
 
-        return composite_keys, composite_values, composite_mask
+        return composite_keys, composite_values, composite_mask, composite_positions
 
     def _ensure_state_compatibility(
         self,
@@ -185,17 +200,25 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
                 non_blocking=True,
             )
 
+        if self.positions.device != key_states.device:
+            self.positions = self.positions.to(
+                key_states.device,
+                non_blocking=True,
+            )
+
     def _make_composite_frame(
         self,
         key_states: torch.Tensor,
         value_states: torch.Tensor,
         active_mask: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        positions: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """Build the current-step local frame in chronological order."""
         return (
             torch.cat([self.keys, key_states], dim=-2),
             torch.cat([self.values, value_states], dim=-2),
             torch.cat([self.active_mask, active_mask], dim=-1),
+            torch.cat([self.positions, positions], dim=-1),
         )
 
     def _retain_next_window(
@@ -203,15 +226,17 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
         composite_keys: torch.Tensor,
         composite_values: torch.Tensor,
         composite_mask: torch.Tensor,
+        composite_positions: torch.Tensor,
     ) -> None:
         """Remember the next-step retained local state.
 
         This is a raw positional trim to the last `sliding_window` positions, not
         a semantic live-token trim.
         """
-        self.keys = composite_keys[:, :, -self.sliding_window :, :]
-        self.values = composite_values[:, :, -self.sliding_window :, :]
-        self.active_mask = composite_mask[:, -self.sliding_window :]
+        self.keys[:] = composite_keys[:, :, -self.sliding_window :, :]
+        self.values[:] = composite_values[:, :, -self.sliding_window :, :]
+        self.active_mask[:] = composite_mask[:, -self.sliding_window :]
+        self.positions[:] = composite_positions[:, -self.sliding_window :]
 
     def get_seq_length(self) -> int:
         """Return the cumulative number of token positions processed by this cache.
@@ -239,6 +264,7 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
         self.keys.zero_()
         self.values.zero_()
         self.active_mask.zero_()
+        self.positions.zero_()
         self._total_processed = 0
 
     def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
@@ -246,12 +272,14 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
         self.keys = self.keys[beam_idx]
         self.values = self.values[beam_idx]
         self.active_mask = self.active_mask[beam_idx]
+        self.positions = self.positions[beam_idx]
 
     def batch_repeat_interleave(self, repeats: int) -> None:
         """Expand the batch dimension for beam-search initialisation."""
         self.keys = self.keys.repeat_interleave(repeats, dim=0)
         self.values = self.values.repeat_interleave(repeats, dim=0)
         self.active_mask = self.active_mask.repeat_interleave(repeats, dim=0)
+        self.positions = self.positions.repeat_interleave(repeats, dim=0)
         self.batch_size = self.batch_size * repeats
 
     def batch_select_indices(self, indices: torch.Tensor) -> None:
@@ -259,12 +287,14 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
         self.keys = self.keys[indices]
         self.values = self.values[indices]
         self.active_mask = self.active_mask[indices]
+        self.positions = self.positions[indices]
         self.batch_size = int(indices.shape[0])
 
     def offload(self) -> None:
         """Offload cache tensors to CPU."""
         super().offload()
         self.active_mask = self.active_mask.to("cpu", non_blocking=True)
+        self.positions = self.positions.to("cpu", non_blocking=True)
 
     def prefetch(self) -> None:
         """Move cache tensors back to the model device ahead of time."""
@@ -274,6 +304,10 @@ class LocalSlidingWindowLayerCache(CacheLayerMixin):
             self.keys.device,
             non_blocking=True,
         )
+        self.positions = self.positions.to(
+            self.keys.device,
+            non_blocking=True,
+        )
 
     def crop(self, max_length: int) -> None:
         raise NotImplementedError(
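A second, easy-to-miss change in this file: `_retain_next_window` now writes with slice assignment (`self.keys[:] = ...`) instead of rebinding the attributes. The commit does not say why, but a plausible reading is that in-place writes keep the preallocated `(B, H, sliding_window, D)` buffers at a stable storage address across decode steps, which torch.compile and CUDA-graph-style capture benefit from. A small sketch of the difference, with illustrative shapes:

```python
import torch

B, H, D = 1, 4, 64
sliding_window, t_new = 128, 16

# Preallocated retained buffer, as in __init__ (zero-filled at full length).
keys = torch.zeros(B, H, sliding_window, D)

# One update step: composite frame = retained buffer + current chunk.
composite = torch.cat([keys, torch.randn(B, H, t_new, D)], dim=-2)

# Old behaviour: rebinding. `keys` becomes a view into `composite`, i.e. a
# fresh allocation every step.
rebound = composite[:, :, -sliding_window:, :]
assert rebound.data_ptr() != keys.data_ptr()

# New behaviour: in-place slice assignment. Data is copied back into the
# original buffer; its storage address never changes.
ptr_before = keys.data_ptr()
keys[:] = composite[:, :, -sliding_window:, :]
assert keys.data_ptr() == ptr_before
```

This only works because the retained slice always spans exactly `sliding_window` slots: the buffers are zero-initialised at full window length, so the composite frame is never shorter than the window and the trailing slice always matches the buffer shape.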
config.json
CHANGED

@@ -21,7 +21,7 @@
   "rope_mode": "main_sequence",
   "tie_word_embeddings": false,
   "training_sequence_length": 1024,
-  "transformers_version": "5.
+  "transformers_version": "5.8.0",
   "use_cache": true,
   "vocab_size": 50277,
   "window_size": 128