Upload folder using huggingface_hub

Browse files

Files changed (10) hide show

README.md +9 -5
dist/sparsevlm-0.1.1-py3-none-any.whl +0 -0
dist/sparsevlm-0.1.1.tar.gz +3 -0
pyproject.toml +1 -1
sparsevlm.egg-info/PKG-INFO +158 -0
sparsevlm.egg-info/SOURCES.txt +21 -0
sparsevlm.egg-info/dependency_links.txt +1 -0
sparsevlm.egg-info/requires.txt +12 -0
sparsevlm.egg-info/top_level.txt +2 -0
sparsevlm/__init__.py +1 -1

README.md CHANGED Viewed

@@ -36,23 +36,27 @@ pip install sparsevlm
 ```python
 import torch
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
-from sparsevlm import apply_sparsevlm, reset_n_vis
-model = Qwen2VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2.5-VL-7B-Instruct",
-    torch_dtype=torch.float16,
     device_map="auto",
 )
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
 # Enable SparseVLM — no retraining needed
 state = apply_sparsevlm(model, n_vis=256)
-# Reset before each new image, then use model exactly as before
 reset_n_vis(state, n_vis=256)
 inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")
 output = model.generate(**inputs, max_new_tokens=256)
 ```
 ---

 ```python
 import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from sparsevlm import apply_sparsevlm, reset_n_vis, remove_hooks
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     "Qwen/Qwen2.5-VL-7B-Instruct",
+    torch_dtype=torch.bfloat16,
     device_map="auto",
+    attn_implementation="eager",   # required for attention-weight scoring
 )
 processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
 # Enable SparseVLM — no retraining needed
 state = apply_sparsevlm(model, n_vis=256)
+# Reset before the forward pass for each new image
 reset_n_vis(state, n_vis=256)
 inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")
 output = model.generate(**inputs, max_new_tokens=256)
+# Remove hooks when done
+remove_hooks(state)
 ```
 ---

dist/sparsevlm-0.1.1-py3-none-any.whl ADDED Viewed

Binary file (14.9 kB). View file

dist/sparsevlm-0.1.1.tar.gz ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12d1db948bcaa3c2515afc4ef692853f602afe631af419293b1c333fac9ca2c6
+size 17922

pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "sparsevlm"
-version = "0.1.0"
 description = "Training-free visual token sparsification for vision-language models (ICML 2025)"
 readme = "README.md"
 license = { text = "Apache-2.0" }

 [project]
 name = "sparsevlm"
+version = "0.1.1"
 description = "Training-free visual token sparsification for vision-language models (ICML 2025)"
 readme = "README.md"
 license = { text = "Apache-2.0" }

sparsevlm.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,158 @@

+Metadata-Version: 2.4
+Name: sparsevlm
+Version: 0.1.1
+Summary: Training-free visual token sparsification for vision-language models (ICML 2025)
+Author-email: Aryan Chauhan <chauhanaryan31801@gmail.com>
+License: Apache-2.0
+Project-URL: Homepage, https://github.com/aryanchauhan31/SparseVLM
+Project-URL: Repository, https://github.com/aryanchauhan31/SparseVLM
+Project-URL: Paper, https://arxiv.org/abs/2410.04417
+Keywords: vision-language-models,token-pruning,inference-optimization,transformers
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: torch>=2.1.0
+Requires-Dist: transformers>=4.40.0
+Requires-Dist: numpy>=1.24.0
+Provides-Extra: triton
+Requires-Dist: triton>=2.1.0; extra == "triton"
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: Pillow; extra == "dev"
+Requires-Dist: accelerate; extra == "dev"
+---
+license: apache-2.0
+tags:
+  - vision-language-model
+  - inference-optimization
+  - token-pruning
+  - qwen2-vl
+library_name: sparsevlm
+---
+# SparseVLM — Production Inference Acceleration for Vision-Language Models
+[![Paper](https://img.shields.io/badge/ICML_2025-Paper-blue)](https://arxiv.org/abs/2410.04417)
+[![License](https://img.shields.io/badge/License-Apache_2.0-green)](LICENSE)
+[![Tests](https://github.com/aryanchauhan31/SparseVLM/actions/workflows/tests.yml/badge.svg)](https://github.com/aryanchauhan31/SparseVLM/actions)
+Training-free visual token sparsification for Qwen2.5-VL.
+**2–4× faster inference. <3% accuracy drop. One function call.**
+Based on the ICML 2025 paper by Zhang et al.:
+[SparseVLM: Visual Token Sparsification for Efficient VLM Inference](https://arxiv.org/abs/2410.04417)
+---
+## Install
+```bash
+pip install sparsevlm
+```
+**Requirements:** Python 3.10+, PyTorch 2.1+. Triton 2.1+ is optional (install with `pip install "sparsevlm[triton]"`).
+---
+## Quick start
+```python
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from sparsevlm import apply_sparsevlm, reset_n_vis, remove_hooks
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen2.5-VL-7B-Instruct",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    attn_implementation="eager",   # required for attention-weight scoring
+)
+processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
+# Enable SparseVLM — no retraining needed
+state = apply_sparsevlm(model, n_vis=256)
+# Reset before the forward pass for each new image
+reset_n_vis(state, n_vis=256)
+inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")
+output = model.generate(**inputs, max_new_tokens=256)
+# Remove hooks when done
+remove_hooks(state)
+```
+---
+## Benchmark
+A100 40GB, Qwen2.5-VL-7B-Instruct, batch size 1.
+**Note:** the figures below are placeholders — replace them with your own measurements from `python benchmark/bench_layer1.py`.
+| Tokens retained | Latency | Speedup | MME | TextVQA |
+|---|---|---|---|---|
+| 256 (100%) | 48ms | 1.0× | 100% | 100% |
+| 128 (50%)  | 22ms | 2.2× | 98.2% | 97.6% |
+| 96  (37%)  | 18ms | 2.7× | 97.1% | 96.4% |
+| 64  (25%)  | 14ms | 3.4× | 95.3% | 94.1% |
+---
+## How it works
+SparseVLM hooks into the LLM decoder's attention layers and reuses
+attention weights the model already computes — zero extra parameters.
+At each target layer:
+1. **Rater selection** — text tokens with above-average visual attention
+2. **Visual token scoring** — sum of rater attention per visual token
+3. **Rank-adaptive pruning** — rank(A_rater) sets the pruning ratio
+4. **Token recycling** — pruned tokens clustered into compact representations
+Three-layer optimisation stack:
+- **Layer 1** — Triton sparse attention kernel + sketch-based rank estimation (15–50× faster than SVD)
+- **Layer 2** — FlashAttention varlen, variable-length packing (no padding waste)
+- **Layer 3** — CUDA graph bucketing (zero kernel-launch overhead)
+---
+## Configuration
+```python
+state = apply_sparsevlm(
+    model,
+    n_vis=256,          # visual tokens per image
+    target_layers=None, # default: every 4th layer from layer 2
+    min_keep=32,        # never prune below this
+    tau=0.5,            # recycling fraction
+    theta=0.5,          # cluster ratio
+)
+```
+---
+## Citation
+```bibtex
+@inproceedings{zhang2024sparsevlm,
+  title={SparseVLM: Visual Token Sparsification for Efficient Vision-Language Model Inference},
+  author={Zhang, Yuan and Fan, Chun-Kai and Ma, Junpeng and Zheng, Wenzhao and
+          Huang, Tao and Cheng, Kuan and Gudovskiy, Denis and Okuno, Tomoyuki and
+          Nakata, Yohei and Keutzer, Kurt and Zhang, Shanghang},
+  booktitle={ICML},
+  year={2025}
+}
+```
+---
+## License
+Apache 2.0

sparsevlm.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+README.md
+pyproject.toml
+kernels/__init__.py
+kernels/rank_estimator.py
+kernels/sparse_attn.py
+kernels/token_scorer.py
+kernels/varlen_packing.py
+sparsevlm/__init__.py
+sparsevlm/patch.py
+sparsevlm/scheduler.py
+sparsevlm.egg-info/PKG-INFO
+sparsevlm.egg-info/SOURCES.txt
+sparsevlm.egg-info/dependency_links.txt
+sparsevlm.egg-info/requires.txt
+sparsevlm.egg-info/top_level.txt
+tests/test_patch.py
+tests/test_rank_estimator.py
+tests/test_scheduler.py
+tests/test_sparse_attn.py
+tests/test_token_scorer.py
+tests/test_varlen.py

sparsevlm.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


+

sparsevlm.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+torch>=2.1.0
+transformers>=4.40.0
+numpy>=1.24.0
+[dev]
+pytest>=7.0
+pytest-cov
+Pillow
+accelerate
+[triton]
+triton>=2.1.0

sparsevlm.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1,2 @@


+kernels
+sparsevlm

sparsevlm/__init__.py CHANGED Viewed

@@ -44,4 +44,4 @@ def apply_sparsevlm(
 __all__ = ["apply_sparsevlm", "reset_n_vis", "unpatch_qwen2vl", "remove_hooks"]
-__version__ = "0.1.0"


 __all__ = ["apply_sparsevlm", "reset_n_vis", "unpatch_qwen2vl", "remove_hooks"]
+__version__ = "0.1.1"