pfnet
/

plamo-2-1b

@@ -240,6 +240,8 @@ class PlamoCache(torch.nn.Module):
     def append_kv(self, key: torch.Tensor, value: torch.Tensor, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
         c = self.cache[layer_idx]
         assert isinstance(c, PlamoAttentionCache)
         def _validate(cache: torch.Tensor, new_tensor: torch.Tensor) -> None:
@@ -257,11 +259,17 @@ class PlamoCache(torch.nn.Module):
     def update_attention(
         self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx: int
     ) -> PlamoAttentionCache:
         if self.cache[layer_idx] is None:
-            self.cache[layer_idx] = PlamoAttentionCache(key_states, value_states)
         else:
-            full_attn = layer_idx in self.config.full_attention_idx
-            window_size = self.config.attention_window_size
             c = self.cache[layer_idx]
             assert isinstance(c, PlamoAttentionCache)
             k, v = self.append_kv(key_states, value_states, layer_idx)
@@ -968,15 +976,6 @@ class Attention(torch.nn.Module):
         query_states = _rms_norm(query_states, None, 1e-6) * self.q_weight[None, :, None]
         key_states = _rms_norm(key_states, None, 1e-6) * self.k_weight[None, :, None]
-        if past_states is not None and past_states[self.layer_idx] is None:
-            bsz, nhead_k, _, c_k = key_states.shape
-            _, nhead_v, _, c_v = value_states.shape
-            past_states.update_attention(
-                torch.zeros((bsz, nhead_k, 0, c_k), dtype=key_states.dtype, device=key_states.device),
-                torch.zeros((bsz, nhead_v, 0, c_v), dtype=value_states.dtype, device=value_states.device),
-                self.layer_idx,
-            )
         if past_states is not None:
             # reuse k, v, self_attention
             key_states_new = key_states
@@ -1154,6 +1153,7 @@ class PlamoDecoder(torch.nn.Module):
                 for i in range(config.num_hidden_layers)
             ]
         )
     def forward(self, x: DecoderInput) -> DecoderOutput:
         all_hidden_states: Optional[Tuple[torch.Tensor, ...]] = () if x.output_hidden_states else None
@@ -1166,19 +1166,12 @@ class PlamoDecoder(torch.nn.Module):
                 all_hidden_states += (hidden_states,)
             if self.training and x.gradient_checkpointing:
-                def create_custom_forward(module):  # type: ignore
-                    def custom_forward(*inputs):  # type: ignore
-                        # None for past_key_value
-                        return module(*inputs, x.output_attentions, None)
-                    return custom_forward
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(decoder_layer),  # type: ignore
                     hidden_states,
                     x.attention_mask,
-                    None,
                 )
             else:
                 layer_outputs = decoder_layer(
@@ -1217,9 +1210,6 @@ class PlamoPreTrainedModel(PreTrainedModel):  # type: ignore
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
-    def _set_gradient_checkpointing(self, module: torch.nn.Module, value: bool = False) -> None:
-        module.gradient_checkpointing = value  # type: ignore
 class PlamoModel(PlamoPreTrainedModel):
     def __init__(self, config: PlamoConfig):
@@ -1613,4 +1603,4 @@ class Bias(nn.Module):
         self,
         x: torch.Tensor,
     ) -> torch.Tensor:
-        return x + self._bias

     def append_kv(self, key: torch.Tensor, value: torch.Tensor, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
         c = self.cache[layer_idx]
+        if c is None:
+            return key, value
         assert isinstance(c, PlamoAttentionCache)
         def _validate(cache: torch.Tensor, new_tensor: torch.Tensor) -> None:
     def update_attention(
         self, key_states: torch.Tensor, value_states: torch.Tensor, layer_idx: int
     ) -> PlamoAttentionCache:
+        full_attn = layer_idx in self.config.full_attention_idx
+        window_size = self.config.attention_window_size
         if self.cache[layer_idx] is None:
+            if full_attn:
+                self.cache[layer_idx] = PlamoAttentionCache(key_states, value_states)
+            else:
+                self.cache[layer_idx] = PlamoAttentionCache(
+                    key_states[:, :, -window_size:, :], value_states[:, :, -window_size:, :]
+                )
         else:
             c = self.cache[layer_idx]
             assert isinstance(c, PlamoAttentionCache)
             k, v = self.append_kv(key_states, value_states, layer_idx)
         query_states = _rms_norm(query_states, None, 1e-6) * self.q_weight[None, :, None]
         key_states = _rms_norm(key_states, None, 1e-6) * self.k_weight[None, :, None]
         if past_states is not None:
             # reuse k, v, self_attention
             key_states_new = key_states
                 for i in range(config.num_hidden_layers)
             ]
         )
+        self.gradient_checkpointing = False
     def forward(self, x: DecoderInput) -> DecoderOutput:
         all_hidden_states: Optional[Tuple[torch.Tensor, ...]] = () if x.output_hidden_states else None
                 all_hidden_states += (hidden_states,)
             if self.training and x.gradient_checkpointing:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
                     hidden_states,
                     x.attention_mask,
+                    x.past_states,
+                    x.output_attentions,
                 )
             else:
                 layer_outputs = decoder_layer(
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
 class PlamoModel(PlamoPreTrainedModel):
     def __init__(self, config: PlamoConfig):
         self,
         x: torch.Tensor,
     ) -> torch.Tensor:
+        return x + self._bias