shuai bai
committed on
Update README.md
Browse files
README.md
CHANGED
|
@@ -129,7 +129,7 @@ output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=
|
|
| 129 |
print(output_text)
|
| 130 |
```
|
| 131 |
<details>
|
| 132 |
-
<summary>
|
| 133 |
|
| 134 |
```python
|
| 135 |
|
|
@@ -179,6 +179,73 @@ print(output_text)
|
|
| 179 |
```
|
| 180 |
</details>
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
### More Usage Tips
|
| 183 |
|
| 184 |
For input images, we support local files, base64, and URLs. For videos, we currently only support local files.
|
|
|
|
| 129 |
print(output_text)
|
| 130 |
```
|
| 131 |
<details>
|
| 132 |
+
<summary>Without qwen_vl_utils</summary>
|
| 133 |
|
| 134 |
```python
|
| 135 |
|
|
|
|
| 179 |
```
|
| 180 |
</details>
|
| 181 |
|
| 182 |
+
<details>
|
| 183 |
+
<summary>Multi image inference</summary>
|
| 184 |
+
|
| 185 |
+
```python
|
| 186 |
+
# Messages containing multiple images and a text query
|
| 187 |
+
messages = [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/image1.jpg"}, {"type": "image", "image": "file:///path/to/image2.jpg"}, {"type": "text", "text": "Identify the similarities between these images."}]}]
|
| 188 |
+
|
| 189 |
+
# Preparation for inference
|
| 190 |
+
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 191 |
+
image_inputs, video_inputs = process_vision_info(messages)
|
| 192 |
+
inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
|
| 193 |
+
|
| 194 |
+
# Inference
|
| 195 |
+
generated_ids = model.generate(**inputs, max_new_tokens=128)
|
| 196 |
+
generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
|
| 197 |
+
output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
| 198 |
+
print(output_text)
|
| 199 |
+
```
|
| 200 |
+
</details>
|
| 201 |
+
|
| 202 |
+
<details>
|
| 203 |
+
<summary>Video inference</summary>
|
| 204 |
+
|
| 205 |
+
```python
|
| 206 |
+
|
| 207 |
+
# Messages containing a list of images as a video and a text query
|
| 208 |
+
messages = [{"role": "user", "content": [{"type": "video", "video": ["file:///path/to/frame1.jpg", "file:///path/to/frame2.jpg", "file:///path/to/frame3.jpg", "file:///path/to/frame4.jpg"], 'fps': 1.0}, {"type": "text", "text": "Describe this video."}]}]
|
| 209 |
+
# Messages containing a video and a text query
|
| 210 |
+
messages = [{"role": "user", "content": [{"type": "video", "video": "file:///path/to/video1.mp4", 'max_pixels': 360*420, 'fps': 1.0}, {"type": "text", "text": "Describe this video."}]}]
|
| 211 |
+
|
| 212 |
+
# Preparation for inference
|
| 213 |
+
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
| 214 |
+
image_inputs, video_inputs = process_vision_info(messages)
|
| 215 |
+
inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
|
| 216 |
+
|
| 217 |
+
# Inference
|
| 218 |
+
generated_ids = model.generate(**inputs, max_new_tokens=128)
|
| 219 |
+
generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
|
| 220 |
+
output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
| 221 |
+
print(output_text)
|
| 222 |
+
```
|
| 223 |
+
</details>
|
| 224 |
+
|
| 225 |
+
<details>
|
| 226 |
+
<summary>Batch inference</summary>
|
| 227 |
+
|
| 228 |
+
```python
|
| 229 |
+
|
| 230 |
+
# Sample messages for batch inference
|
| 231 |
+
messages1 = [{"role": "user", "content": [{"type": "image", "image": "file:///path/to/image1.jpg"}, {"type": "image", "image": "file:///path/to/image2.jpg"}, {"type": "text", "text": "What are the common elements in these pictures?"}]}]
|
| 232 |
+
messages2 = [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Who are you?"}]
|
| 233 |
+
# Combine messages for batch processing
|
| 234 |
+
messages = [messages1, messages2]
|
| 235 |
+
|
| 236 |
+
# Preparation for batch inference
|
| 237 |
+
texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
|
| 238 |
+
image_inputs, video_inputs = process_vision_info(messages)
|
| 239 |
+
inputs = processor(text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
|
| 240 |
+
|
| 241 |
+
# Batch Inference
|
| 242 |
+
generated_ids = model.generate(**inputs, max_new_tokens=128)
|
| 243 |
+
generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
|
| 244 |
+
output_texts = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
| 245 |
+
print(output_texts)
|
| 246 |
+
```
|
| 247 |
+
</details>
|
| 248 |
+
|
| 249 |
### More Usage Tips
|
| 250 |
|
| 251 |
For input images, we support local files, base64, and URLs. For videos, we currently only support local files.
|