Aryan3108 committed on
Commit
a31ee88
·
verified ·
1 Parent(s): 176b11a

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -36,23 +36,27 @@ pip install sparsevlm
36
 
37
  ```python
38
  import torch
39
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
40
- from sparsevlm import apply_sparsevlm, reset_n_vis
41
 
42
- model = Qwen2VLForConditionalGeneration.from_pretrained(
43
  "Qwen/Qwen2.5-VL-7B-Instruct",
44
- torch_dtype=torch.float16,
45
  device_map="auto",
 
46
  )
47
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
48
 
49
  # Enable SparseVLM — no retraining needed
50
  state = apply_sparsevlm(model, n_vis=256)
51
 
52
- # Reset before each new image, then use model exactly as before
53
  reset_n_vis(state, n_vis=256)
54
  inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")
55
  output = model.generate(**inputs, max_new_tokens=256)
 
 
 
56
  ```
57
 
58
  ---
 
36
 
37
  ```python
38
  import torch
39
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
40
+ from sparsevlm import apply_sparsevlm, reset_n_vis, remove_hooks
41
 
42
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
43
  "Qwen/Qwen2.5-VL-7B-Instruct",
44
+ torch_dtype=torch.bfloat16,
45
  device_map="auto",
46
+ attn_implementation="eager", # required for attention-weight scoring
47
  )
48
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
49
 
50
  # Enable SparseVLM — no retraining needed
51
  state = apply_sparsevlm(model, n_vis=256)
52
 
53
+ # Reset before each new image forward pass
54
  reset_n_vis(state, n_vis=256)
55
  inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")
56
  output = model.generate(**inputs, max_new_tokens=256)
57
+
58
+ # Remove hooks when done
59
+ remove_hooks(state)
60
  ```
61
 
62
  ---
dist/sparsevlm-0.1.1-py3-none-any.whl ADDED
Binary file (14.9 kB). View file
 
dist/sparsevlm-0.1.1.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12d1db948bcaa3c2515afc4ef692853f602afe631af419293b1c333fac9ca2c6
3
+ size 17922
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "sparsevlm"
7
- version = "0.1.0"
8
  description = "Training-free visual token sparsification for vision-language models (ICML 2025)"
9
  readme = "README.md"
10
  license = { text = "Apache-2.0" }
 
4
 
5
  [project]
6
  name = "sparsevlm"
7
+ version = "0.1.1"
8
  description = "Training-free visual token sparsification for vision-language models (ICML 2025)"
9
  readme = "README.md"
10
  license = { text = "Apache-2.0" }
sparsevlm.egg-info/PKG-INFO ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: sparsevlm
3
+ Version: 0.1.1
4
+ Summary: Training-free visual token sparsification for vision-language models (ICML 2025)
5
+ Author-email: Aryan Chauhan <chauhanaryan31801@gmail.com>
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/aryanchauhan31/SparseVLM
8
+ Project-URL: Repository, https://github.com/aryanchauhan31/SparseVLM
9
+ Project-URL: Paper, https://arxiv.org/abs/2410.04417
10
+ Keywords: vision-language-models,token-pruning,inference-optimization,transformers
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: Apache Software License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: torch>=2.1.0
21
+ Requires-Dist: transformers>=4.40.0
22
+ Requires-Dist: numpy>=1.24.0
23
+ Provides-Extra: triton
24
+ Requires-Dist: triton>=2.1.0; extra == "triton"
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=7.0; extra == "dev"
27
+ Requires-Dist: pytest-cov; extra == "dev"
28
+ Requires-Dist: Pillow; extra == "dev"
29
+ Requires-Dist: accelerate; extra == "dev"
30
+
31
+ ---
32
+ license: apache-2.0
33
+ tags:
34
+ - vision-language-model
35
+ - inference-optimization
36
+ - token-pruning
37
+ - qwen2-vl
38
+ library_name: sparsevlm
39
+ ---
40
+
41
+ # SparseVLM — Production Inference Acceleration for Vision-Language Models
42
+
43
+ [![Paper](https://img.shields.io/badge/ICML_2025-Paper-blue)](https://arxiv.org/abs/2410.04417)
44
+ [![License](https://img.shields.io/badge/License-Apache_2.0-green)](LICENSE)
45
+ [![Tests](https://github.com/aryanchauhan31/SparseVLM/actions/workflows/tests.yml/badge.svg)](https://github.com/aryanchauhan31/SparseVLM/actions)
46
+
47
+ Training-free visual token sparsification for Qwen2.5-VL.
48
+ **2–4× faster inference. <3% accuracy drop. One function call.**
49
+
50
+ Based on the ICML 2025 paper by Zhang et al.:
51
+ [SparseVLM: Visual Token Sparsification for Efficient VLM Inference](https://arxiv.org/abs/2410.04417)
52
+
53
+ ---
54
+
55
+ ## Install
56
+
57
+ ```bash
58
+ pip install sparsevlm
59
+ ```
60
+
61
+ **Requirements:** Python 3.10+, PyTorch 2.1+, Transformers 4.40+. Triton 2.1+ is optional (`pip install "sparsevlm[triton]"`).
62
+
63
+ ---
64
+
65
+ ## Quick start
66
+
67
+ ```python
68
+ import torch
69
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
70
+ from sparsevlm import apply_sparsevlm, reset_n_vis, remove_hooks
71
+
72
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
73
+ "Qwen/Qwen2.5-VL-7B-Instruct",
74
+ torch_dtype=torch.bfloat16,
75
+ device_map="auto",
76
+ attn_implementation="eager", # required for attention-weight scoring
77
+ )
78
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
79
+
80
+ # Enable SparseVLM — no retraining needed
81
+ state = apply_sparsevlm(model, n_vis=256)
82
+
83
+ # Reset before each new image forward pass
84
+ reset_n_vis(state, n_vis=256)
85
+ inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")
86
+ output = model.generate(**inputs, max_new_tokens=256)
87
+
88
+ # Remove hooks when done
89
+ remove_hooks(state)
90
+ ```
91
+
92
+ ---
93
+
94
+ ## Benchmark
95
+
96
+ A100 40GB, Qwen2.5-VL-7B-Instruct, batch size 1.
97
+ **Replace these with your numbers from `python benchmark/bench_layer1.py`.**
98
+
99
+ | Tokens retained | Latency | Speedup | MME | TextVQA |
100
+ |---|---|---|---|---|
101
+ | 256 (100%) | 48ms | 1.0× | 100% | 100% |
102
+ | 128 (50%) | 22ms | 2.2× | 98.2% | 97.6% |
103
+ | 96 (37%) | 18ms | 2.7× | 97.1% | 96.4% |
104
+ | 64 (25%) | 14ms | 3.4× | 95.3% | 94.1% |
105
+
106
+ ---
107
+
108
+ ## How it works
109
+
110
+ SparseVLM hooks into the LLM decoder's attention layers and reuses
111
+ attention weights the model already computes — zero extra parameters.
112
+
113
+ At each target layer:
114
+ 1. **Rater selection** — text tokens with above-average visual attention
115
+ 2. **Visual token scoring** — sum of rater attention per visual token
116
+ 3. **Rank-adaptive pruning** — rank(A_rater) sets the pruning ratio
117
+ 4. **Token recycling** — pruned tokens clustered into compact representations
118
+
119
+ Three-layer optimisation stack:
120
+ - **Layer 1** — Triton sparse attention kernel + sketch rank (15-50× faster than SVD)
121
+ - **Layer 2** — FlashAttention varlen, variable-length packing (no padding waste)
122
+ - **Layer 3** — CUDA graph bucketing (zero kernel-launch overhead)
123
+
124
+ ---
125
+
126
+ ## Configuration
127
+
128
+ ```python
129
+ state = apply_sparsevlm(
130
+ model,
131
+ n_vis=256, # visual tokens per image
132
+ target_layers=None, # default: every 4th layer from layer 2
133
+ min_keep=32, # never prune below this
134
+ tau=0.5, # recycling fraction
135
+ theta=0.5, # cluster ratio
136
+ )
137
+ ```
138
+
139
+ ---
140
+
141
+ ## Citation
142
+
143
+ ```bibtex
144
+ @inproceedings{zhang2024sparsevlm,
145
+ title={SparseVLM: Visual Token Sparsification for Efficient Vision-Language Model Inference},
146
+ author={Zhang, Yuan and Fan, Chun-Kai and Ma, Junpeng and Zheng, Wenzhao and
147
+ Huang, Tao and Cheng, Kuan and Gudovskiy, Denis and Okuno, Tomoyuki and
148
+ Nakata, Yohei and Keutzer, Kurt and Zhang, Shanghang},
149
+ booktitle={ICML},
150
+ year={2025}
151
+ }
152
+ ```
153
+
154
+ ---
155
+
156
+ ## License
157
+
158
+ Apache 2.0
sparsevlm.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README.md
2
+ pyproject.toml
3
+ kernels/__init__.py
4
+ kernels/rank_estimator.py
5
+ kernels/sparse_attn.py
6
+ kernels/token_scorer.py
7
+ kernels/varlen_packing.py
8
+ sparsevlm/__init__.py
9
+ sparsevlm/patch.py
10
+ sparsevlm/scheduler.py
11
+ sparsevlm.egg-info/PKG-INFO
12
+ sparsevlm.egg-info/SOURCES.txt
13
+ sparsevlm.egg-info/dependency_links.txt
14
+ sparsevlm.egg-info/requires.txt
15
+ sparsevlm.egg-info/top_level.txt
16
+ tests/test_patch.py
17
+ tests/test_rank_estimator.py
18
+ tests/test_scheduler.py
19
+ tests/test_sparse_attn.py
20
+ tests/test_token_scorer.py
21
+ tests/test_varlen.py
sparsevlm.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
sparsevlm.egg-info/requires.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.1.0
2
+ transformers>=4.40.0
3
+ numpy>=1.24.0
4
+
5
+ [dev]
6
+ pytest>=7.0
7
+ pytest-cov
8
+ Pillow
9
+ accelerate
10
+
11
+ [triton]
12
+ triton>=2.1.0
sparsevlm.egg-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ kernels
2
+ sparsevlm
sparsevlm/__init__.py CHANGED
@@ -44,4 +44,4 @@ def apply_sparsevlm(
44
 
45
 
46
  __all__ = ["apply_sparsevlm", "reset_n_vis", "unpatch_qwen2vl", "remove_hooks"]
47
- __version__ = "0.1.0"
 
44
 
45
 
46
  __all__ = ["apply_sparsevlm", "reset_n_vis", "unpatch_qwen2vl", "remove_hooks"]
47
+ __version__ = "0.1.1"