Files changed (2)
  1. README.md +3 -3
  2. modeling_plamo.py +5 -14
README.md CHANGED
@@ -27,9 +27,9 @@ PLaMo 2 1B is released under Apache License version 2.0.
 ```
 numpy>=1.26.4
 numba>=0.60.0
-torch<=2.5.1
-transformers>=4.44.2,<=4.57.1
-mamba_ssm>=2.2.2,<=2.2.4
+torch>=2.4.1
+transformers>=4.44.2
+mamba_ssm>=2.2.2
 causal_conv1d>=1.4.0
 ```
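For reference, the snippet below is a quick local check that an environment satisfies the loosened minimum versions listed above. It is a sketch rather than part of the repository, assumes the packages are installed under Python 3.10+ with `packaging` available, and uses the PyPI distribution names (`mamba-ssm`, `causal-conv1d`) rather than the import names.

```
from importlib.metadata import version

from packaging.version import Version

# Minimum versions taken from the updated requirements block above.
MINIMUMS = {
    "numpy": "1.26.4",
    "numba": "0.60.0",
    "torch": "2.4.1",
    "transformers": "4.44.2",
    "mamba-ssm": "2.2.2",
    "causal-conv1d": "1.4.0",
}

for package, floor in MINIMUMS.items():
    installed = Version(version(package))
    assert installed >= Version(floor), f"{package} {installed} is older than {floor}"
```
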
modeling_plamo.py CHANGED
@@ -1426,10 +1426,8 @@ class Plamo2Model(Plamo2PreTrainedModel):
             past_key_values_prev = past_key_values
             past_key_values = Plamo2Cache(self.config)
 
-            # If `past_key_values` is a `DynamicCache` object, it must be empty or all layer caches have zero sequence length.
-            assert len(past_key_values_prev) == 0 or not any(
-                layer_cache.get_seq_length() for layer_cache in past_key_values_prev.layers
-            )
+            # If `past_key_values` is a `DynamicCache` object, it must be empty
+            assert len(past_key_values_prev) == 0
         assert isinstance(past_key_values, Plamo2Cache)
         past_key_values_length = past_key_values.get_seq_length()
         seq_length_with_past = seq_length_with_past + past_key_values_length
@@ -1635,11 +1633,7 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel):
         image_features: Optional[torch.Tensor] = None,
         **kwargs: Any,
     ) -> Dict[str, Any]:
-        # Starting from transformers v4.54, `DynamicCache` is passed to `past_key_values` during the prefill stage,
-        # and its length becomes non-zero from v4.56 onward.
-        # `Plamo2Model.forward` converts it into a `Plamo2Cache` on the first call,
-        # se we use the type of `past_key_values` to distinguish between the prefill and decode stages.
-        if isinstance(past_key_values, Plamo2Cache):
+        if past_key_values:
             input_ids = input_ids[:, -1:]
             if image_features is not None:
                 image_features = image_features[:, -1:, :]
@@ -1649,7 +1643,7 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel):
             # create position_ids on the fly for batch generation
             position_ids = attention_mask.long().cumsum(-1) - 1
             position_ids.masked_fill_(attention_mask == 0, 1)
-            if isinstance(past_key_values, Plamo2Cache):
+            if past_key_values:
                 position_ids = position_ids[:, -1].unsqueeze(-1)
 
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
@@ -1663,9 +1657,6 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel):
                 "position_ids": position_ids,
                 "past_key_values": past_key_values,
                 "use_cache": kwargs.get("use_cache"),
-                "output_attentions": kwargs.get("output_attentions"),
-                "output_hidden_states": kwargs.get("output_hidden_states"),
-                "logits_to_keep": kwargs.get("logits_to_keep"),
                 "attention_mask": attention_mask,
                 "image_features": image_features,
             }
@@ -1723,4 +1714,4 @@ class Bias(nn.Module):
         self,
         x: torch.Tensor,
     ) -> torch.Tensor:
-        return x + self._bias
+        return x + self._bias
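
As background for the `if past_key_values:` checks introduced above: an empty cache (or `None`) is falsy, while a cache that already holds tokens is truthy, so plain truthiness can separate the prefill call from later decode calls without testing for a concrete cache class. The sketch below is illustrative only and not the repository's code; `ToyCache` and `prepare_inputs` are hypothetical stand-ins for the real cache classes and `prepare_inputs_for_generation`, and the actual `transformers` caches may define length and truthiness differently.

```
from typing import Any, Dict, Optional

import torch


class ToyCache:
    """Hypothetical cache: falsy while empty, truthy once tokens have been written."""

    def __init__(self) -> None:
        self.cached_tokens = 0

    def __len__(self) -> int:
        return self.cached_tokens


def prepare_inputs(
    input_ids: torch.Tensor,
    past_key_values: Optional[ToyCache] = None,
) -> Dict[str, Any]:
    # Prefill: no cache yet (None or empty) -> pass the full prompt.
    # Decode: the cache already holds the prompt -> pass only the newest token.
    if past_key_values:
        input_ids = input_ids[:, -1:]
    return {"input_ids": input_ids, "past_key_values": past_key_values}


prompt = torch.tensor([[1, 2, 3, 4]])
cache = ToyCache()
assert prepare_inputs(prompt, cache)["input_ids"].shape[1] == 4  # prefill step
cache.cached_tokens = 4
assert prepare_inputs(prompt, cache)["input_ids"].shape[1] == 1  # decode step
```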