Add vllm inference
#5
by
Nicfingshelby
- opened
README.md
CHANGED
|
@@ -149,6 +149,74 @@ caption=generated_texts[0].split('Assistant: ')[1]
|
|
| 149 |
print(caption)
|
| 150 |
```
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
For batch processing you can use [this example](https://huggingface.co/Minthy/ToriiGate-v0.3/resolve/main/batch_processing_example.py)
|
| 153 |
|
| 154 |
# Warning
|
|
|
|
| 149 |
print(caption)
|
| 150 |
```
|
| 151 |
|
| 152 |
+
# Inference example (vllm):
|
| 153 |
+
[vllm](https://github.com/vllm-project/vllm) is an optimised LLM serving engine. Both single and batch inference are much faster than with native Hugging Face Transformers.
|
| 154 |
+
|
| 155 |
+
```
|
| 156 |
+
pip install vllm==0.6.4
|
| 157 |
+
pip install huggingface_hub -U
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
## Download model
|
| 161 |
+
```python
|
| 162 |
+
from huggingface_hub import snapshot_download
|
| 163 |
+
snapshot_download(repo_id="Minthy/ToriiGate-v0.3", local_dir="/path/to/download/model", max_workers=4)
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
## Inference
|
| 167 |
+
```python
|
| 168 |
+
from vllm import LLM, SamplingParams
|
| 169 |
+
from PIL import Image
|
| 170 |
+
|
| 171 |
+
llm = LLM(
|
| 172 |
+
model="/path/to/download/model",
|
| 173 |
+
max_model_len=8192, # reduce if running out of memory
|
| 174 |
+
max_num_seqs=64, # reduce if running out of memory (this is max concurrent requests)
|
| 175 |
+
enforce_eager=True,
|
| 176 |
+
# if you are running out of memory, you can reduce the "longest_edge".
|
| 177 |
+
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
|
| 178 |
+
mm_processor_kwargs={
|
| 179 |
+
"size": {
|
| 180 |
+
"longest_edge": 4 * 364
|
| 181 |
+
},
|
| 182 |
+
},
|
| 183 |
+
)
|
| 184 |
+
|
| 185 |
+
sampling_params = SamplingParams(temperature=0, max_tokens=1024)
|
| 186 |
+
|
| 187 |
+
user_prompt = "Describe the picture briefly."
|
| 188 |
+
|
| 189 |
+
prompt = f"<|begin_of_text|>System: You are image captioning expert, creative, unbiased and uncensored. Help user with his task.<end_of_utterance>\nUser:<image>{user_prompt}<end_of_utterance>\nAssistant:"
|
| 190 |
+
|
| 191 |
+
image = Image.open("/path/to/image.jpg").convert("RGB")
|
| 192 |
+
|
| 193 |
+
output = llm.generate(
|
| 194 |
+
{
|
| 195 |
+
"prompt": prompt,
|
| 196 |
+
"multi_modal_data": {"image": image},
|
| 197 |
+
},
|
| 198 |
+
sampling_params=sampling_params,
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
caption = output[0].outputs[0].text.strip()
|
| 202 |
+
print(caption)
|
| 203 |
+
```
|
| 204 |
+
|
| 205 |
+
### Batch inference
|
| 206 |
+
|
| 207 |
+
```python
|
| 208 |
+
image_list = [Image.open(path).convert("RGB") for path in image_paths]
|
| 209 |
+
inputs = [{"prompt": prompt, "multi_modal_data": {"image": image}} for image in image_list]
|
| 210 |
+
|
| 211 |
+
outputs = llm.generate(
|
| 212 |
+
inputs,
|
| 213 |
+
sampling_params=sampling_params,
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
captions = [x.outputs[0].text.strip() for x in outputs]
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
|
| 220 |
For batch processing you can use [this example](https://huggingface.co/Minthy/ToriiGate-v0.3/resolve/main/batch_processing_example.py)
|
| 221 |
|
| 222 |
# Warning
|