Fix compression to handle all new tokens (chunked prefill support)
Browse files — modeling_lean_llama.py (+73 −41)
modeling_lean_llama.py
CHANGED
|
@@ -137,6 +137,8 @@ class LeanLlamaForCausalLM(LlamaForCausalLM):
|
|
| 137 |
kv_granularity: dict[str, str] = dict(getattr(config, "leanllm_kv_granularity", {}))
|
| 138 |
|
| 139 |
self._kv_granularity_map: dict[int, str] = {}
|
|
|
|
|
|
|
| 140 |
|
| 141 |
for layer_idx in self._kv_layers:
|
| 142 |
key = str(layer_idx)
|
|
@@ -148,6 +150,7 @@ class LeanLlamaForCausalLM(LlamaForCausalLM):
|
|
| 148 |
if f_alpha is not None:
|
| 149 |
f_alpha = float(f_alpha)
|
| 150 |
self._kv_granularity_map[layer_idx] = kv_granularity.get(key, "per_token")
|
|
|
|
| 151 |
|
| 152 |
# Attach value compressor at the same path used by convert_to_hf_model.py
|
| 153 |
# so that from_pretrained auto-loads the saved weights.
|
|
@@ -162,7 +165,6 @@ class LeanLlamaForCausalLM(LlamaForCausalLM):
|
|
| 162 |
)
|
| 163 |
self.model.layers[layer_idx].self_attn.leanllm_v_compressor = v_mod
|
| 164 |
|
| 165 |
-
|
| 166 |
# Key compressor (if not values-only)
|
| 167 |
if not self._values_only and key in kv_key_dims:
|
| 168 |
k_dim = int(kv_key_dims[key])
|
|
@@ -180,6 +182,48 @@ class LeanLlamaForCausalLM(LlamaForCausalLM):
|
|
| 180 |
# KV cache compression (applied after each forward pass)
|
| 181 |
# ------------------------------------------------------------------
|
| 182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
def _compress_past(self, past_key_values: Any) -> Any:
|
| 184 |
if past_key_values is None or not self._kv_layers:
|
| 185 |
return past_key_values
|
|
@@ -189,36 +233,19 @@ class LeanLlamaForCausalLM(LlamaForCausalLM):
|
|
| 189 |
for layer_idx in self._kv_layers:
|
| 190 |
layer_cache = past_key_values.layers[layer_idx]
|
| 191 |
v = layer_cache.values # [B, H, T, D]
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
compressor = self.model.layers[layer_idx].self_attn.leanllm_v_compressor
|
| 197 |
-
comp_dtype = compressor.encoder.weight.dtype
|
| 198 |
-
v_vec = _kv_to_vec(v_last, gran).to(comp_dtype)
|
| 199 |
-
v_rec = compressor.decode(compressor.encode(v_vec), x_orig=None)
|
| 200 |
-
v_new_last = _vec_to_kv(v_rec, v_last).to(dtype=v_dtype)
|
| 201 |
-
|
| 202 |
-
if v.shape[2] == 1:
|
| 203 |
-
layer_cache.values = v_new_last
|
| 204 |
-
else:
|
| 205 |
-
layer_cache.values = torch.cat([v[:, :, :-1, :], v_new_last], dim=2)
|
| 206 |
|
| 207 |
if not self._values_only and hasattr(
|
| 208 |
self.model.layers[layer_idx].self_attn, "leanllm_k_compressor"
|
| 209 |
):
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
k_vec = _kv_to_vec(k_last, gran).to(k_comp_dtype)
|
| 216 |
-
k_rec = k_comp.decode(k_comp.encode(k_vec), x_orig=None)
|
| 217 |
-
k_new_last = _vec_to_kv(k_rec, k_last).to(dtype=k_dtype)
|
| 218 |
-
if k.shape[2] == 1:
|
| 219 |
-
layer_cache.keys = k_new_last
|
| 220 |
-
else:
|
| 221 |
-
layer_cache.keys = torch.cat([k[:, :, :-1, :], k_new_last], dim=2)
|
| 222 |
|
| 223 |
return past_key_values
|
| 224 |
|
|
@@ -227,26 +254,31 @@ class LeanLlamaForCausalLM(LlamaForCausalLM):
|
|
| 227 |
past_list = list(past_key_values)
|
| 228 |
for layer_idx in self._kv_layers:
|
| 229 |
k, v = past_list[layer_idx]
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
v_new_last = _vec_to_kv(v_rec, v_last).to(dtype=v_dtype)
|
| 239 |
-
|
| 240 |
-
if v.shape[2] == 1:
|
| 241 |
-
past_list[layer_idx] = (k, v_new_last)
|
| 242 |
else:
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
| 245 |
return tuple(past_list)
|
| 246 |
|
| 247 |
return past_key_values
|
| 248 |
|
| 249 |
def forward(self, *args: Any, **kwargs: Any) -> CausalLMOutputWithPast:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
outputs = super().forward(*args, **kwargs)
|
| 251 |
if hasattr(outputs, "past_key_values"):
|
| 252 |
outputs.past_key_values = self._compress_past(outputs.past_key_values)
|
|
|
|
| 137 |
kv_granularity: dict[str, str] = dict(getattr(config, "leanllm_kv_granularity", {}))
|
| 138 |
|
| 139 |
self._kv_granularity_map: dict[int, str] = {}
|
| 140 |
+
# Track how many tokens have been compressed per layer to avoid re-compressing
|
| 141 |
+
self._compressed_up_to: dict[int, int] = {}
|
| 142 |
|
| 143 |
for layer_idx in self._kv_layers:
|
| 144 |
key = str(layer_idx)
|
|
|
|
| 150 |
if f_alpha is not None:
|
| 151 |
f_alpha = float(f_alpha)
|
| 152 |
self._kv_granularity_map[layer_idx] = kv_granularity.get(key, "per_token")
|
| 153 |
+
self._compressed_up_to[layer_idx] = 0
|
| 154 |
|
| 155 |
# Attach value compressor at the same path used by convert_to_hf_model.py
|
| 156 |
# so that from_pretrained auto-loads the saved weights.
|
|
|
|
| 165 |
)
|
| 166 |
self.model.layers[layer_idx].self_attn.leanllm_v_compressor = v_mod
|
| 167 |
|
|
|
|
| 168 |
# Key compressor (if not values-only)
|
| 169 |
if not self._values_only and key in kv_key_dims:
|
| 170 |
k_dim = int(kv_key_dims[key])
|
|
|
|
| 182 |
# KV cache compression (applied after each forward pass)
|
| 183 |
# ------------------------------------------------------------------
|
| 184 |
|
| 185 |
+
def _compress_values(
|
| 186 |
+
self,
|
| 187 |
+
v: torch.Tensor,
|
| 188 |
+
layer_idx: int,
|
| 189 |
+
start: int,
|
| 190 |
+
) -> torch.Tensor:
|
| 191 |
+
"""Compress values from position `start` onward, return full tensor."""
|
| 192 |
+
if start >= v.shape[2]:
|
| 193 |
+
return v
|
| 194 |
+
v_new = v[:, :, start:, :]
|
| 195 |
+
gran = self._kv_granularity_map[layer_idx]
|
| 196 |
+
v_dtype = v.dtype
|
| 197 |
+
compressor = self.model.layers[layer_idx].self_attn.leanllm_v_compressor
|
| 198 |
+
comp_dtype = compressor.encoder.weight.dtype
|
| 199 |
+
v_vec = _kv_to_vec(v_new, gran).to(comp_dtype)
|
| 200 |
+
v_rec = compressor.decode(compressor.encode(v_vec), x_orig=None)
|
| 201 |
+
v_compressed = _vec_to_kv(v_rec, v_new).to(dtype=v_dtype)
|
| 202 |
+
if start == 0:
|
| 203 |
+
return v_compressed
|
| 204 |
+
return torch.cat([v[:, :, :start, :], v_compressed], dim=2)
|
| 205 |
+
|
| 206 |
+
def _compress_keys(
    self,
    k: torch.Tensor,
    layer_idx: int,
    start: int,
) -> torch.Tensor:
    """Compress the key cache from position `start` onward.

    Mirrors `_compress_values` for keys: entries at sequence index
    `start` and beyond are passed through the layer's key compressor
    (encode then decode) and spliced back into the full [B, H, T, D]
    tensor in the cache's original dtype. If `start` is at or past the
    end of the sequence axis, `k` is returned as-is.
    """
    if start >= k.shape[2]:
        return k

    codec = self.model.layers[layer_idx].self_attn.leanllm_k_compressor
    gran = self._kv_granularity_map[layer_idx]
    cache_dtype = k.dtype

    segment = k[:, :, start:, :]
    # Cast into the compressor's parameter dtype for the round trip.
    encoded = codec.encode(_kv_to_vec(segment, gran).to(codec.encoder.weight.dtype))
    restored = _vec_to_kv(codec.decode(encoded, x_orig=None), segment).to(dtype=cache_dtype)

    if start == 0:
        return restored
    return torch.cat([k[:, :, :start, :], restored], dim=2)
|
| 226 |
+
|
| 227 |
def _compress_past(self, past_key_values: Any) -> Any:
|
| 228 |
if past_key_values is None or not self._kv_layers:
|
| 229 |
return past_key_values
|
|
|
|
| 233 |
for layer_idx in self._kv_layers:
|
| 234 |
layer_cache = past_key_values.layers[layer_idx]
|
| 235 |
v = layer_cache.values # [B, H, T, D]
|
| 236 |
+
total_tokens = v.shape[2]
|
| 237 |
+
start = self._compressed_up_to[layer_idx]
|
| 238 |
+
|
| 239 |
+
layer_cache.values = self._compress_values(v, layer_idx, start)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
if not self._values_only and hasattr(
|
| 242 |
self.model.layers[layer_idx].self_attn, "leanllm_k_compressor"
|
| 243 |
):
|
| 244 |
+
layer_cache.keys = self._compress_keys(
|
| 245 |
+
layer_cache.keys, layer_idx, start
|
| 246 |
+
)
|
| 247 |
+
|
| 248 |
+
self._compressed_up_to[layer_idx] = total_tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
return past_key_values
|
| 251 |
|
|
|
|
| 254 |
past_list = list(past_key_values)
|
| 255 |
for layer_idx in self._kv_layers:
|
| 256 |
k, v = past_list[layer_idx]
|
| 257 |
+
total_tokens = v.shape[2]
|
| 258 |
+
start = self._compressed_up_to[layer_idx]
|
| 259 |
+
|
| 260 |
+
v_new = self._compress_values(v, layer_idx, start)
|
| 261 |
+
if not self._values_only and hasattr(
|
| 262 |
+
self.model.layers[layer_idx].self_attn, "leanllm_k_compressor"
|
| 263 |
+
):
|
| 264 |
+
k_new = self._compress_keys(k, layer_idx, start)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
else:
|
| 266 |
+
k_new = k
|
| 267 |
+
past_list[layer_idx] = (k_new, v_new)
|
| 268 |
+
|
| 269 |
+
self._compressed_up_to[layer_idx] = total_tokens
|
| 270 |
return tuple(past_list)
|
| 271 |
|
| 272 |
return past_key_values
|
| 273 |
|
| 274 |
def forward(self, *args: Any, **kwargs: Any) -> CausalLMOutputWithPast:
|
| 275 |
+
# Reset compression tracking when there's no cache (new sequence)
|
| 276 |
+
past = kwargs.get("past_key_values", None)
|
| 277 |
+
if past is None and len(args) < 5:
|
| 278 |
+
# No KV cache passed — starting fresh
|
| 279 |
+
for layer_idx in self._kv_layers:
|
| 280 |
+
self._compressed_up_to[layer_idx] = 0
|
| 281 |
+
|
| 282 |
outputs = super().forward(*args, **kwargs)
|
| 283 |
if hasattr(outputs, "past_key_values"):
|
| 284 |
outputs.past_key_values = self._compress_past(outputs.past_key_values)
|