Upload modeling_loopstral.py with huggingface_hub
Browse files- modeling_loopstral.py +56 -49
modeling_loopstral.py
CHANGED
|
@@ -4,6 +4,7 @@
|
|
| 4 |
# the file from the modular. If any change should be done, please apply the change to the
|
| 5 |
# modular_mistral.py file directly. One of our CI enforces this.
|
| 6 |
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
|
|
|
| 7 |
from typing import Callable, Optional, Union
|
| 8 |
|
| 9 |
import torch
|
|
@@ -13,7 +14,7 @@ from torch.nn import CrossEntropyLoss
|
|
| 13 |
#from transformers.modeling_utils import check_model_inputs
|
| 14 |
|
| 15 |
from transformers.activations import ACT2FN
|
| 16 |
-
from transformers.cache_utils import Cache, DynamicCache
|
| 17 |
from transformers.generation import GenerationMixin
|
| 18 |
from transformers.integrations import use_kernel_forward_from_hub
|
| 19 |
from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
|
|
@@ -147,7 +148,7 @@ class MistralAttention(nn.Module):
|
|
| 147 |
attention_mask: Optional[torch.Tensor],
|
| 148 |
past_key_values: Optional[Cache] = None,
|
| 149 |
cache_position: Optional[torch.LongTensor] = None,
|
| 150 |
-
|
| 151 |
**kwargs: Unpack[FlashAttentionKwargs],
|
| 152 |
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
|
| 153 |
input_shape = hidden_states.shape[:-1]
|
|
@@ -163,20 +164,10 @@ class MistralAttention(nn.Module):
|
|
| 163 |
if past_key_values is not None:
|
| 164 |
# sin and cos are specific to RoPE models; cache_position needed for the static cache
|
| 165 |
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
| 166 |
-
|
| 167 |
-
#
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
#print(f"DEBUG: Attributes of cache object: {dir(past_key_values)}")
|
| 171 |
-
# --- END DEBUGGING CODE ---
|
| 172 |
-
|
| 173 |
-
if update_cache:
|
| 174 |
-
key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
|
| 175 |
-
else:
|
| 176 |
-
k_cache, v_cache = past_key_values[self.layer_idx]
|
| 177 |
-
if k_cache is not None:
|
| 178 |
-
key_states = torch.cat([k_cache, key_states], dim=2)
|
| 179 |
-
value_states = torch.cat([v_cache, value_states], dim=2)
|
| 180 |
|
| 181 |
attention_interface: Callable = eager_attention_forward
|
| 182 |
if self.config._attn_implementation != "eager":
|
|
@@ -239,7 +230,7 @@ class MistralDecoderLayer(GradientCheckpointingLayer):
|
|
| 239 |
use_cache: Optional[bool] = False,
|
| 240 |
cache_position: Optional[torch.LongTensor] = None,
|
| 241 |
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
| 242 |
-
|
| 243 |
**kwargs: Unpack[TransformersKwargs],
|
| 244 |
) -> torch.Tensor:
|
| 245 |
residual = hidden_states
|
|
@@ -253,7 +244,7 @@ class MistralDecoderLayer(GradientCheckpointingLayer):
|
|
| 253 |
use_cache=use_cache,
|
| 254 |
cache_position=cache_position,
|
| 255 |
position_embeddings=position_embeddings,
|
| 256 |
-
|
| 257 |
**kwargs,
|
| 258 |
)
|
| 259 |
hidden_states = residual + hidden_states
|
|
@@ -321,6 +312,29 @@ class MistralRotaryEmbedding(nn.Module):
|
|
| 321 |
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
| 322 |
|
| 323 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
@auto_docstring
|
| 325 |
class LoopstralModel(MistralPreTrainedModel):
|
| 326 |
def __init__(self, config: LoopstralConfig):
|
|
@@ -336,6 +350,11 @@ class LoopstralModel(MistralPreTrainedModel):
|
|
| 336 |
self.rotary_emb = MistralRotaryEmbedding(config=config)
|
| 337 |
self.gradient_checkpointing = False
|
| 338 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
# Initialize weights and apply final processing
|
| 340 |
self.post_init()
|
| 341 |
|
|
@@ -358,8 +377,18 @@ class LoopstralModel(MistralPreTrainedModel):
|
|
| 358 |
if inputs_embeds is None:
|
| 359 |
inputs_embeds = self.embed_tokens(input_ids)
|
| 360 |
|
| 361 |
-
if use_cache
|
| 362 |
-
past_key_values
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
|
| 364 |
if cache_position is None:
|
| 365 |
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
|
@@ -382,34 +411,12 @@ class LoopstralModel(MistralPreTrainedModel):
|
|
| 382 |
|
| 383 |
hidden_states = inputs_embeds
|
| 384 |
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
| 385 |
-
|
| 386 |
-
#
|
| 387 |
-
#
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
if isinstance(item, int):
|
| 392 |
-
# Single layer index: 5 -> [5]
|
| 393 |
-
l_seq.append(item)
|
| 394 |
-
elif isinstance(item, list):
|
| 395 |
-
if len(item) == 2:
|
| 396 |
-
# Range without repeat: [4, 20] -> range(4, 20)
|
| 397 |
-
start, end = item
|
| 398 |
-
l_seq += list(range(start, min(end, self.config.num_hidden_layers)))
|
| 399 |
-
elif len(item) == 3:
|
| 400 |
-
# Range with repeat: [4, 20, 2] -> range(4, 20) repeated 2 times
|
| 401 |
-
start, end, repeats = item
|
| 402 |
-
l_seq += list(range(start, min(end, self.config.num_hidden_layers))) * repeats
|
| 403 |
-
else:
|
| 404 |
-
raise ValueError(f"Invalid layer_sequence item: {item}. Expected int, [start, end], or [start, end, repeats]")
|
| 405 |
-
else:
|
| 406 |
-
raise ValueError(f"Invalid layer_sequence item type: {type(item)}. Expected int or list.")
|
| 407 |
-
#print(f"DEBUG: Layer sequence {l_seq}")
|
| 408 |
-
|
| 409 |
-
last_visit_map = {layer_idx: i for i, layer_idx in enumerate(l_seq)}
|
| 410 |
-
for i, layer in enumerate(l_seq):
|
| 411 |
-
should_update_cache = use_cache and (last_visit_map[layer] == i)
|
| 412 |
-
decoder_layer = self.layers[layer]
|
| 413 |
hidden_states = decoder_layer(
|
| 414 |
hidden_states,
|
| 415 |
attention_mask=causal_mask,
|
|
@@ -418,7 +425,7 @@ class LoopstralModel(MistralPreTrainedModel):
|
|
| 418 |
use_cache=use_cache,
|
| 419 |
cache_position=cache_position,
|
| 420 |
position_embeddings=position_embeddings,
|
| 421 |
-
|
| 422 |
**kwargs,
|
| 423 |
)
|
| 424 |
hidden_states = self.norm(hidden_states)
|
|
|
|
| 4 |
# the file from the modular. If any change should be done, please apply the change to the
|
| 5 |
# modular_mistral.py file directly. One of our CI enforces this.
|
| 6 |
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
| 7 |
+
import copy
|
| 8 |
from typing import Callable, Optional, Union
|
| 9 |
|
| 10 |
import torch
|
|
|
|
| 14 |
#from transformers.modeling_utils import check_model_inputs
|
| 15 |
|
| 16 |
from transformers.activations import ACT2FN
|
| 17 |
+
from transformers.cache_utils import Cache, DynamicCache, DynamicLayer
|
| 18 |
from transformers.generation import GenerationMixin
|
| 19 |
from transformers.integrations import use_kernel_forward_from_hub
|
| 20 |
from transformers.masking_utils import create_causal_mask, create_sliding_window_causal_mask
|
|
|
|
| 148 |
attention_mask: Optional[torch.Tensor],
|
| 149 |
past_key_values: Optional[Cache] = None,
|
| 150 |
cache_position: Optional[torch.LongTensor] = None,
|
| 151 |
+
cache_slot_idx: Optional[int] = None,
|
| 152 |
**kwargs: Unpack[FlashAttentionKwargs],
|
| 153 |
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
|
| 154 |
input_shape = hidden_states.shape[:-1]
|
|
|
|
| 164 |
if past_key_values is not None:
|
| 165 |
# sin and cos are specific to RoPE models; cache_position needed for the static cache
|
| 166 |
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
| 167 |
+
# Use cache_slot_idx (position in layer sequence) instead of layer_idx
|
| 168 |
+
# This allows each visit to a repeated layer to have its own cache slot
|
| 169 |
+
slot_idx = cache_slot_idx if cache_slot_idx is not None else self.layer_idx
|
| 170 |
+
key_states, value_states = past_key_values.update(key_states, value_states, slot_idx, cache_kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
attention_interface: Callable = eager_attention_forward
|
| 173 |
if self.config._attn_implementation != "eager":
|
|
|
|
| 230 |
use_cache: Optional[bool] = False,
|
| 231 |
cache_position: Optional[torch.LongTensor] = None,
|
| 232 |
position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC
|
| 233 |
+
cache_slot_idx: Optional[int] = None,
|
| 234 |
**kwargs: Unpack[TransformersKwargs],
|
| 235 |
) -> torch.Tensor:
|
| 236 |
residual = hidden_states
|
|
|
|
| 244 |
use_cache=use_cache,
|
| 245 |
cache_position=cache_position,
|
| 246 |
position_embeddings=position_embeddings,
|
| 247 |
+
cache_slot_idx=cache_slot_idx,
|
| 248 |
**kwargs,
|
| 249 |
)
|
| 250 |
hidden_states = residual + hidden_states
|
|
|
|
| 312 |
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
| 313 |
|
| 314 |
|
| 315 |
+
def _expand_layer_sequence(layer_sequence, num_hidden_layers):
|
| 316 |
+
"""Expand layer_sequence config into a flat list of layer indices."""
|
| 317 |
+
l_seq = []
|
| 318 |
+
for item in layer_sequence:
|
| 319 |
+
if isinstance(item, int):
|
| 320 |
+
# Single layer index: 5 -> [5]
|
| 321 |
+
l_seq.append(item)
|
| 322 |
+
elif isinstance(item, list):
|
| 323 |
+
if len(item) == 2:
|
| 324 |
+
# Range without repeat: [4, 20] -> range(4, 20)
|
| 325 |
+
start, end = item
|
| 326 |
+
l_seq += list(range(start, min(end, num_hidden_layers)))
|
| 327 |
+
elif len(item) == 3:
|
| 328 |
+
# Range with repeat: [4, 20, 2] -> range(4, 20) repeated 2 times
|
| 329 |
+
start, end, repeats = item
|
| 330 |
+
l_seq += list(range(start, min(end, num_hidden_layers))) * repeats
|
| 331 |
+
else:
|
| 332 |
+
raise ValueError(f"Invalid layer_sequence item: {item}. Expected int, [start, end], or [start, end, repeats]")
|
| 333 |
+
else:
|
| 334 |
+
raise ValueError(f"Invalid layer_sequence item type: {type(item)}. Expected int or list.")
|
| 335 |
+
return l_seq
|
| 336 |
+
|
| 337 |
+
|
| 338 |
@auto_docstring
|
| 339 |
class LoopstralModel(MistralPreTrainedModel):
|
| 340 |
def __init__(self, config: LoopstralConfig):
|
|
|
|
| 350 |
self.rotary_emb = MistralRotaryEmbedding(config=config)
|
| 351 |
self.gradient_checkpointing = False
|
| 352 |
|
| 353 |
+
# Pre-compute the expanded layer sequence for the looping mechanism
|
| 354 |
+
self._layer_sequence = _expand_layer_sequence(config.layer_sequence, config.num_hidden_layers)
|
| 355 |
+
# Number of cache slots needed (one per position in layer sequence)
|
| 356 |
+
self._num_cache_slots = len(self._layer_sequence)
|
| 357 |
+
|
| 358 |
# Initialize weights and apply final processing
|
| 359 |
self.post_init()
|
| 360 |
|
|
|
|
| 377 |
if inputs_embeds is None:
|
| 378 |
inputs_embeds = self.embed_tokens(input_ids)
|
| 379 |
|
| 380 |
+
if use_cache:
|
| 381 |
+
if past_key_values is None:
|
| 382 |
+
# Create cache with enough slots for the full layer sequence
|
| 383 |
+
# (more than num_hidden_layers if layers are repeated)
|
| 384 |
+
cache_config = copy.copy(self.config)
|
| 385 |
+
cache_config.num_hidden_layers = self._num_cache_slots
|
| 386 |
+
past_key_values = DynamicCache(config=cache_config)
|
| 387 |
+
elif isinstance(past_key_values, DynamicCache) and len(past_key_values.layers) < self._num_cache_slots:
|
| 388 |
+
# Cache was created externally (e.g., by generate()) with fewer slots
|
| 389 |
+
# Extend it to have enough slots for our layer sequence
|
| 390 |
+
while len(past_key_values.layers) < self._num_cache_slots:
|
| 391 |
+
past_key_values.layers.append(DynamicLayer())
|
| 392 |
|
| 393 |
if cache_position is None:
|
| 394 |
past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
|
|
|
|
| 411 |
|
| 412 |
hidden_states = inputs_embeds
|
| 413 |
position_embeddings = self.rotary_emb(hidden_states, position_ids)
|
| 414 |
+
|
| 415 |
+
# Execute layers in the configured sequence
|
| 416 |
+
# Each position in the sequence gets its own cache slot, allowing
|
| 417 |
+
# repeated layers to maintain separate KV caches for each visit
|
| 418 |
+
for cache_slot_idx, layer_idx in enumerate(self._layer_sequence):
|
| 419 |
+
decoder_layer = self.layers[layer_idx]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
hidden_states = decoder_layer(
|
| 421 |
hidden_states,
|
| 422 |
attention_mask=causal_mask,
|
|
|
|
| 425 |
use_cache=use_cache,
|
| 426 |
cache_position=cache_position,
|
| 427 |
position_embeddings=position_embeddings,
|
| 428 |
+
cache_slot_idx=cache_slot_idx,
|
| 429 |
**kwargs,
|
| 430 |
)
|
| 431 |
hidden_states = self.norm(hidden_states)
|