Image-Text-to-Text
sentence-transformers
Safetensors
Transformers
qwen2_vl
Qwen2-VL
conversational
text-generation-inference
Instructions to use llamaindex/vdr-2b-multi-v1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- sentence-transformers
How to use llamaindex/vdr-2b-multi-v1 with sentence-transformers:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("llamaindex/vdr-2b-multi-v1")

sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium."
]
embeddings = model.encode(sentences)

similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
- Transformers
How to use llamaindex/vdr-2b-multi-v1 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="llamaindex/vdr-2b-multi-v1")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("llamaindex/vdr-2b-multi-v1")
model = AutoModelForImageTextToText.from_pretrained("llamaindex/vdr-2b-multi-v1")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use llamaindex/vdr-2b-multi-v1 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "llamaindex/vdr-2b-multi-v1"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "llamaindex/vdr-2b-multi-v1",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
Use Docker
docker model run hf.co/llamaindex/vdr-2b-multi-v1
- SGLang
How to use llamaindex/vdr-2b-multi-v1 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "llamaindex/vdr-2b-multi-v1" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "llamaindex/vdr-2b-multi-v1",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "llamaindex/vdr-2b-multi-v1" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "llamaindex/vdr-2b-multi-v1",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
- Docker Model Runner
How to use llamaindex/vdr-2b-multi-v1 with Docker Model Runner:
docker model run hf.co/llamaindex/vdr-2b-multi-v1
Integrate with Sentence Transformers v5.4
#7
by tomaarsen HF Staff - opened
- config_sentence_transformers.json +1 -4
- custom_st.py +12 -7
config_sentence_transformers.json
CHANGED
|
@@ -4,10 +4,7 @@
|
|
| 4 |
"transformers": "4.46.2",
|
| 5 |
"pytorch": "2.2.2"
|
| 6 |
},
|
| 7 |
-
"prompts":{
|
| 8 |
-
"image": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is shown in this image?<|im_end|>\n<|endoftext|>",
|
| 9 |
-
"query": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Query: %s<|im_end|>\n<|endoftext|>"
|
| 10 |
-
},
|
| 11 |
"default_prompt_name": null,
|
| 12 |
"similarity_fn_name": "cosine"
|
| 13 |
}
|
|
|
|
| 4 |
"transformers": "4.46.2",
|
| 5 |
"pytorch": "2.2.2"
|
| 6 |
},
|
| 7 |
+
"prompts": {},
|
|
|
|
|
|
|
|
|
|
| 8 |
"default_prompt_name": null,
|
| 9 |
"similarity_fn_name": "cosine"
|
| 10 |
}
|
custom_st.py
CHANGED
|
@@ -42,22 +42,23 @@ class Transformer(nn.Module):
|
|
| 42 |
self.max_pixels = max_pixels
|
| 43 |
self.min_pixels = min_pixels
|
| 44 |
self.max_seq_length = max_seq_length
|
| 45 |
-
|
| 46 |
# Handle args
|
| 47 |
model_kwargs = model_args or {}
|
| 48 |
-
model_kwargs.update(kwargs)
|
| 49 |
-
|
| 50 |
processor_kwargs = processor_args or {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
processor_kwargs.update({
|
| 52 |
'min_pixels': min_pixels,
|
| 53 |
'max_pixels': max_pixels,
|
| 54 |
-
'cache_dir': cache_dir
|
| 55 |
})
|
| 56 |
|
| 57 |
# Initialize model
|
| 58 |
self.model = Qwen2VLForConditionalGeneration.from_pretrained(
|
| 59 |
model_name_or_path,
|
| 60 |
-
cache_dir=cache_dir,
|
| 61 |
**model_kwargs
|
| 62 |
).eval()
|
| 63 |
|
|
@@ -271,7 +272,7 @@ class Transformer(nn.Module):
|
|
| 271 |
)
|
| 272 |
return features
|
| 273 |
|
| 274 |
-
def tokenize(self, texts: List[Union[str, Image.Image, bytes]], padding: str = 'longest') -> Dict[str, torch.Tensor]:
|
| 275 |
processed_texts, processed_images = self._process_input(texts)
|
| 276 |
|
| 277 |
return self.processor(
|
|
@@ -311,4 +312,8 @@ class Transformer(nn.Module):
|
|
| 311 |
else:
|
| 312 |
config = {'model_name_or_path': input_path}
|
| 313 |
|
| 314 |
-
return Transformer(**config)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
self.max_pixels = max_pixels
|
| 43 |
self.min_pixels = min_pixels
|
| 44 |
self.max_seq_length = max_seq_length
|
| 45 |
+
|
| 46 |
# Handle args
|
| 47 |
model_kwargs = model_args or {}
|
|
|
|
|
|
|
| 48 |
processor_kwargs = processor_args or {}
|
| 49 |
+
|
| 50 |
+
if cache_dir is not None:
|
| 51 |
+
model_kwargs['cache_dir'] = cache_dir
|
| 52 |
+
processor_kwargs['cache_dir'] = cache_dir
|
| 53 |
+
|
| 54 |
processor_kwargs.update({
|
| 55 |
'min_pixels': min_pixels,
|
| 56 |
'max_pixels': max_pixels,
|
|
|
|
| 57 |
})
|
| 58 |
|
| 59 |
# Initialize model
|
| 60 |
self.model = Qwen2VLForConditionalGeneration.from_pretrained(
|
| 61 |
model_name_or_path,
|
|
|
|
| 62 |
**model_kwargs
|
| 63 |
).eval()
|
| 64 |
|
|
|
|
| 272 |
)
|
| 273 |
return features
|
| 274 |
|
| 275 |
+
def tokenize(self, texts: List[Union[str, Image.Image, bytes]], padding: str = 'longest', **kwargs) -> Dict[str, torch.Tensor]:
|
| 276 |
processed_texts, processed_images = self._process_input(texts)
|
| 277 |
|
| 278 |
return self.processor(
|
|
|
|
| 312 |
else:
|
| 313 |
config = {'model_name_or_path': input_path}
|
| 314 |
|
| 315 |
+
return Transformer(**config)
|
| 316 |
+
|
| 317 |
+
@property
|
| 318 |
+
def modalities(self) -> List[str]:
|
| 319 |
+
return ['text', 'image']
|