yhirokawa committed on
Commit
07d01cb
·
verified ·
1 Parent(s): 922a883

Support transformers>=4.56 (#4)

Browse files

- Support transformers>=4.56 (a2dc46a3514d46d18a36702357e78d9d1f6f431d)

Files changed (2) hide show
  1. README.md +3 -3
  2. modeling_plamo.py +21 -5
README.md CHANGED
@@ -177,9 +177,9 @@ Please check the PLaMo community license and contact us via the following form t
177
  ```
178
  numpy>=1.26.4
179
  numba>=0.60.0
180
- torch>=2.4.1
181
- transformers>=4.44.2
182
- mamba_ssm>=2.2.2
183
  causal_conv1d>=1.4.0
184
  ```
185
 
 
177
  ```
178
  numpy>=1.26.4
179
  numba>=0.60.0
180
+ torch<=2.5.1
181
+ transformers>=4.44.2,<=4.57.1
182
+ mamba_ssm>=2.2.2,<=2.2.4
183
  causal_conv1d>=1.4.0
184
  ```
185
 
modeling_plamo.py CHANGED
@@ -19,6 +19,7 @@ import torch
19
  from torch import nn
20
  from torch.nn import functional as F
21
  from transformers import PretrainedConfig, PreTrainedModel
 
22
  from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
23
 
24
 
@@ -327,7 +328,8 @@ class Plamo2Cache(torch.nn.Module):
327
  if sequence_length is not None
328
  else layer_cache.key.shape[2]
329
  )
330
- assert sequence_length is not None
 
331
  return sequence_length
332
 
333
  def get_max_length(self) -> int | None:
@@ -1387,7 +1389,7 @@ class Plamo2Model(Plamo2PreTrainedModel):
1387
  input_ids: Optional[torch.LongTensor] = None,
1388
  attention_mask: Optional[torch.Tensor] = None,
1389
  position_ids: Optional[torch.Tensor] = None,
1390
- past_key_values: Optional[Plamo2Cache] = None,
1391
  inputs_embeds: Optional[torch.Tensor] = None,
1392
  image_features: Optional[torch.Tensor] = None,
1393
  use_cache: Optional[bool] = None,
@@ -1419,6 +1421,16 @@ class Plamo2Model(Plamo2PreTrainedModel):
1419
  seq_length_with_past = seq_length
1420
  past_key_values_length = 0
1421
  if past_key_values is not None:
 
 
 
 
 
 
 
 
 
 
1422
  past_key_values_length = past_key_values.get_seq_length()
1423
  seq_length_with_past = seq_length_with_past + past_key_values_length
1424
  assert cache_position is None, "cache_position is not supported yet"
@@ -1623,7 +1635,11 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel):
1623
  image_features: Optional[torch.Tensor] = None,
1624
  **kwargs: Any,
1625
  ) -> Dict[str, Any]:
1626
- if past_key_values:
 
 
 
 
1627
  input_ids = input_ids[:, -1:]
1628
  if image_features is not None:
1629
  image_features = image_features[:, -1:, :]
@@ -1633,7 +1649,7 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel):
1633
  # create position_ids on the fly for batch generation
1634
  position_ids = attention_mask.long().cumsum(-1) - 1
1635
  position_ids.masked_fill_(attention_mask == 0, 1)
1636
- if past_key_values:
1637
  position_ids = position_ids[:, -1].unsqueeze(-1)
1638
 
1639
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
@@ -1704,4 +1720,4 @@ class Bias(nn.Module):
1704
  self,
1705
  x: torch.Tensor,
1706
  ) -> torch.Tensor:
1707
- return x + self._bias
 
19
  from torch import nn
20
  from torch.nn import functional as F
21
  from transformers import PretrainedConfig, PreTrainedModel
22
+ from transformers.cache_utils import DynamicCache
23
  from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
24
 
25
 
 
328
  if sequence_length is not None
329
  else layer_cache.key.shape[2]
330
  )
331
+ if sequence_length is None:
332
+ return 0
333
  return sequence_length
334
 
335
  def get_max_length(self) -> int | None:
 
1389
  input_ids: Optional[torch.LongTensor] = None,
1390
  attention_mask: Optional[torch.Tensor] = None,
1391
  position_ids: Optional[torch.Tensor] = None,
1392
+ past_key_values: Optional[Plamo2Cache | DynamicCache] = None,
1393
  inputs_embeds: Optional[torch.Tensor] = None,
1394
  image_features: Optional[torch.Tensor] = None,
1395
  use_cache: Optional[bool] = None,
 
1421
  seq_length_with_past = seq_length
1422
  past_key_values_length = 0
1423
  if past_key_values is not None:
1424
+ # In some `transformers` versions, `past_key_values` may be a `DynamicCache` object.
1425
+ if not isinstance(past_key_values, Plamo2Cache):
1426
+ past_key_values_prev = past_key_values
1427
+ past_key_values = Plamo2Cache(self.config)
1428
+
1429
+ # If `past_key_values` is a `DynamicCache` object, it must be empty or all layer caches have zero sequence length.
1430
+ assert len(past_key_values_prev) == 0 or not any(
1431
+ layer_cache.get_seq_length() for layer_cache in past_key_values_prev.layers
1432
+ )
1433
+ assert isinstance(past_key_values, Plamo2Cache)
1434
  past_key_values_length = past_key_values.get_seq_length()
1435
  seq_length_with_past = seq_length_with_past + past_key_values_length
1436
  assert cache_position is None, "cache_position is not supported yet"
 
1635
  image_features: Optional[torch.Tensor] = None,
1636
  **kwargs: Any,
1637
  ) -> Dict[str, Any]:
1638
+ # Starting from transformers v4.54, `DynamicCache` is passed to `past_key_values` during the prefill stage,
1639
+ # and its length becomes non-zero from v4.56 onward.
1640
+ # `Plamo2Model.forward` converts it into a `Plamo2Cache` on the first call,
1641
+ so we use the type of `past_key_values` to distinguish between the prefill and decode stages.
1642
+ if isinstance(past_key_values, Plamo2Cache):
1643
  input_ids = input_ids[:, -1:]
1644
  if image_features is not None:
1645
  image_features = image_features[:, -1:, :]
 
1649
  # create position_ids on the fly for batch generation
1650
  position_ids = attention_mask.long().cumsum(-1) - 1
1651
  position_ids.masked_fill_(attention_mask == 0, 1)
1652
+ if isinstance(past_key_values, Plamo2Cache):
1653
  position_ids = position_ids[:, -1].unsqueeze(-1)
1654
 
1655
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
 
1720
  self,
1721
  x: torch.Tensor,
1722
  ) -> torch.Tensor:
1723
+ return x + self._bias