Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

README.md +17 -11
pyproject.toml +1 -1
sparsevlm/__init__.py +4 -2
sparsevlm/generate.py +104 -0

README.md CHANGED Viewed

@@ -37,7 +37,7 @@ pip install sparsevlm
 ```python
 import torch
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from sparsevlm import apply_sparsevlm, reset_n_vis, remove_hooks
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2.5-VL-7B-Instruct",
@@ -47,16 +47,22 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 )
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
-# Enable SparseVLM — no retraining needed
-state = apply_sparsevlm(model, n_vis=256)
-# Reset before each new image forward pass
-reset_n_vis(state, n_vis=256)
-inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")
-output = model.generate(**inputs, max_new_tokens=256)
-# Remove hooks when done
-remove_hooks(state)
 ```
 ---

 ```python
 import torch
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from sparsevlm import sparsevlm_generate
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2.5-VL-7B-Instruct",
 )
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+# Prepare inputs normally
+messages = [{"role": "user", "content": [
+    {"type": "image", "image": image},
+    {"type": "text",  "text": "Describe this image."}
+]}]
+text   = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+inputs = processor(text=[text], images=[image], return_tensors="pt").to("cuda")
+# Run SparseVLM — keeps top-64 visual tokens out of 256 (25%)
+output = sparsevlm_generate(
+    model, processor, inputs,
+    n_vis=256,          # visual tokens in your sequence
+    keep_n_vis=64,      # keep 25% — tune this
+    max_new_tokens=256,
+)
+# output[0][0] is the last prompt token that sparsevlm_generate feeds back in, so skip it
+print(processor.decode(output[0][1:], skip_special_tokens=True))
 ```
 ---

pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sparsevlm"
-version = "0.1.1"
 description = "Training-free visual token sparsification for vision-language models (ICML 2025)"
 readme = "README.md"
 license = { text = "Apache-2.0" }

 [project]
 name = "sparsevlm"
+version = "0.1.2"
 description = "Training-free visual token sparsification for vision-language models (ICML 2025)"
 readme = "README.md"
 license = { text = "Apache-2.0" }

sparsevlm/__init__.py CHANGED Viewed

@@ -9,6 +9,7 @@ Quick start:
 """
 from .patch import patch_qwen2vl, reset_n_vis, unpatch_qwen2vl, remove_hooks
 def apply_sparsevlm(
@@ -43,5 +44,6 @@ def apply_sparsevlm(
     )
-__all__ = ["apply_sparsevlm", "reset_n_vis", "unpatch_qwen2vl", "remove_hooks"]
-__version__ = "0.1.1"

 """
 from .patch import patch_qwen2vl, reset_n_vis, unpatch_qwen2vl, remove_hooks
+from .generate import sparsevlm_generate
 def apply_sparsevlm(
     )
+__all__ = ["apply_sparsevlm", "reset_n_vis", "unpatch_qwen2vl",
+           "remove_hooks", "sparsevlm_generate"]
+__version__ = "0.1.2"

sparsevlm/generate.py ADDED Viewed

	@@ -0,0 +1,104 @@

+"""
+generate.py — KV cache pruning for SparseVLM.
+Usage:
+    from sparsevlm import sparsevlm_generate
+    output = sparsevlm_generate(
+        model, processor, inputs,
+        n_vis=256,        # total visual tokens in the sequence
+        keep_n_vis=64,    # how many to keep (25%)
+        max_new_tokens=256,
+    )
+"""
+import torch
+def _prune_kv_cache(cache, kept_indices):
+    """
+    Drop every cached position that is not listed in kept_indices.
+    For each entry of cache.layers, the .keys and .values tensors are indexed
+    along dim 2 with kept_indices (first moved to the device of that entry's
+    keys) and written back as contiguous tensors. The cache is modified in
+    place and the same object is returned.
+    Written against the transformers 5.x DynamicCache layout
+    (cache.layers[i].keys / .values).
+    """
+    for entry in cache.layers:
+        cached_keys = entry.keys
+        cached_values = entry.values
+        # Indices must live on the same device as the tensors they index.
+        positions = kept_indices.to(cached_keys.device)
+        entry.keys = cached_keys[:, :, positions].contiguous()
+        entry.values = cached_values[:, :, positions].contiguous()
+    return cache
+def sparsevlm_generate(
+    model,
+    processor,
+    inputs,
+    n_vis: int,
+    keep_n_vis: int,
+    max_new_tokens: int = 256,
+    target_layer: int = 2,
+    device: str = "cuda",
+):
+    """
+    SparseVLM generation via KV cache pruning.
+    Runs prefill once with output_attentions=True, scores the first n_vis
+    positions (treated as the visual tokens) by the attention they receive
+    from all later positions at target_layer, keeps the top keep_n_vis of
+    them plus every later position, and decodes with the pruned KV cache.
+    Args:
+        model:          Qwen2_5_VLForConditionalGeneration (loaded with
+                        attn_implementation="eager")
+        processor:      AutoProcessor (not used inside this function)
+        inputs:         dict from processor(..., return_tensors="pt")
+        n_vis:          number of visual tokens in the sequence
+                        (inputs["input_ids"].shape[1] - n_text); positions
+                        [0, n_vis) are scored, positions [n_vis, end) are
+                        always kept
+        keep_n_vis:     how many visual tokens to keep
+                        (0 <= keep_n_vis <= n_vis)
+        max_new_tokens: generation length
+        target_layer:   which layer's attention to use for scoring (default 2)
+        device:         device of the decode attention mask (default "cuda")
+    Returns:
+        token ids as returned by model.generate() for the single re-fed
+        prompt token: that token first, then the newly generated ids
+        (so up to [B, 1 + max_new_tokens], not [B, max_new_tokens])
+    Raises:
+        ValueError:   if n_vis or keep_n_vis is out of range for the sequence
+        RuntimeError: if the prefill returned no attention weights
+    """
+    n_total = inputs["input_ids"].shape[1]
+    # Fail fast on bad sizes, before paying for a prefill with attentions.
+    if not 0 <= n_vis <= n_total:
+        raise ValueError(
+            f"n_vis must be between 0 and the sequence length {n_total}, got {n_vis}"
+        )
+    if not 0 <= keep_n_vis <= n_vis:
+        raise ValueError(
+            f"keep_n_vis must be between 0 and n_vis={n_vis}, got {keep_n_vis}"
+        )
+    # ── 1. Prefill — get KV cache + attention weights ─────────────────────────
+    with torch.no_grad():
+        prefill = model(**inputs, use_cache=True, output_attentions=True)
+    attentions = prefill.attentions
+    attn = attentions[target_layer] if attentions is not None else None
+    if attn is None:
+        raise RuntimeError(
+            "prefill returned no attention weights; load the model with "
+            "attn_implementation='eager'"
+        )
+    # ── 2. Score all n_vis visual tokens ──────────────────────────────────────
+    # text→visual attention submatrix: [B, H, N_text, N_vis] averaged over heads
+    A_tv   = attn[:, :, n_vis:, :n_vis].mean(dim=1)  # [B, N_text, N_vis]
+    # NOTE(review): only batch element 0 is scored and the decode mask below
+    # has batch size 1, so this path effectively expects a single sample.
+    scores = A_tv.sum(dim=1)[0]                        # [N_vis]
+    # ── 3. Keep top-keep_n_vis visual tokens by attention score ───────────────
+    kept_vis = scores.topk(keep_n_vis).indices
+    text_idx = torch.arange(n_vis, n_total, device=kept_vis.device)
+    kept_all = torch.cat([kept_vis, text_idx])
+    cache  = _prune_kv_cache(prefill.past_key_values, kept_all)
+    n_kept = cache.get_seq_length()
+    # ── 4. Fix rope_deltas so decode positions are correct ────────────────────
+    # generate() computes: next_pos = cache.get_seq_length() + rope_deltas
+    # After pruning get_seq_length() = n_kept < N_TOTAL, so we compensate:
+    n_pruned     = n_total - n_kept
+    orig_deltas  = model.model.rope_deltas.clone()
+    model.model.rope_deltas = orig_deltas + n_pruned
+    # ── 5. Decode with pruned cache ────────────────────────────────────────────
+    # NOTE(review): the last prompt token is already in the cache and is fed
+    # again here as input_ids; the mask therefore covers n_kept + 1 positions.
+    attn_mask = torch.ones(1, n_kept + 1, device=device, dtype=torch.long)
+    try:
+        with torch.no_grad():
+            output = model.generate(
+                input_ids=inputs["input_ids"][:, -1:],
+                attention_mask=attn_mask,
+                past_key_values=cache,
+                max_new_tokens=max_new_tokens,
+                use_cache=True,
+            )
+    finally:
+        # ── 6. Restore rope_deltas ─────────────────────────────────────────────
+        # Runs even if generate() raises, so the model is never left with the
+        # shifted deltas from step 4.
+        model.model.rope_deltas = orig_deltas
+    return output