Instructions to use apple/FastVLM-7B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use apple/FastVLM-7B with Transformers:

# Use a pipeline as a high-level helper
# NOTE: trust_remote_code=True allows Transformers to execute Python code shipped in the
# model repository (this repo's config.json maps its Auto* classes to a custom `llava_qwen`
# module), so only enable it for repositories you trust.
from transformers import pipeline

pipe = pipeline("text-generation", model="apple/FastVLM-7B", trust_remote_code=True)
# A text-only chat turn; no image is passed in this example.
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
# Alternative to the pipeline above: instantiate the model class yourself.
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("apple/FastVLM-7B", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use apple/FastVLM-7B with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "apple/FastVLM-7B"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "apple/FastVLM-7B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/apple/FastVLM-7B

SGLang

How to use apple/FastVLM-7B with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "apple/FastVLM-7B" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "apple/FastVLM-7B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "apple/FastVLM-7B" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "apple/FastVLM-7B",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use apple/FastVLM-7B with Docker Model Runner:
```
docker model run hf.co/apple/FastVLM-7B
```

Create custom processor for easier inference

#12

by JettChenT - opened Jan 23

base: refs/heads/main

←

from: refs/pr/12

Discussion Files changed

+143

-3

This PR is in draft mode

Files changed (4) hide show

config.json +22 -3
preprocessor_config.json +30 -0
processing_fastvlm.py +88 -0
processor_config.json +3 -0

config.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
-  "_name_or_path": "./llava-v1.5-13b",
   "architectures": [
     "LlavaQwen2ForCausalLM"
   ],
   "auto_map": {
     "AutoConfig": "llava_qwen.LlavaConfig",
-    "AutoModelForCausalLM": "llava_qwen.LlavaQwen2ForCausalLM"
-  },
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
   "eos_token_id": 151645,
@@ -45,5 +45,24 @@
   "use_cache": true,
   "use_mm_proj": true,
   "use_sliding_window": false,
   "vocab_size": 152064
 }

 {
   "architectures": [
     "LlavaQwen2ForCausalLM"
   ],
   "auto_map": {
     "AutoConfig": "llava_qwen.LlavaConfig",
+    "AutoModelForCausalLM": "llava_qwen.LlavaQwen2ForCausalLM",
+    "AutoProcessor": "processing_fastvlm.FastVLMProcessor"
+  },
   "attention_dropout": 0.0,
   "bos_token_id": 151643,
   "eos_token_id": 151645,
   "use_cache": true,
   "use_mm_proj": true,
   "use_sliding_window": false,
+  "vision_config": {
+    "cls_ratio": 2.0,
+    "down_patch_size": 7,
+    "down_stride": 2,
+    "downsamples": [true, true, true, true, true],
+    "embed_dims": [96, 192, 384, 768, 1536],
+    "hidden_size": 1024,
+    "image_size": 1024,
+    "intermediate_size": 3072,
+    "layer_scale_init_value": 1e-5,
+    "layers": [2, 12, 24, 4, 2],
+    "mlp_ratios": [4, 4, 4, 4, 4],
+    "num_classes": 1000,
+    "patch_size": 64,
+    "pos_embs_shapes": [null, null, null, [7, 7], [7, 7]],
+    "projection_dim": 768,
+    "repmixer_kernel_size": 3,
+    "token_mixers": ["repmixer", "repmixer", "repmixer", "attention", "attention"]
+  },
   "vocab_size": 152064
 }

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+    "auto_map": {
+        "AutoImageProcessor": "processing_fastvlm.FastVLMImageProcessor"
+    },
+    "image_processor_type": "FastVLMImageProcessor",
+    "crop_size": {
+        "height": 1024,
+        "width": 1024
+    },
+    "do_center_crop": true,
+    "do_convert_rgb": true,
+    "do_normalize": true,
+    "do_rescale": true,
+    "do_resize": true,
+    "image_mean": [
+        0.0,
+        0.0,
+        0.0
+    ],
+    "image_std": [
+        1.0,
+        1.0,
+        1.0
+    ],
+    "resample": 3,
+    "rescale_factor": 0.00392156862745098,
+    "size": {
+        "shortest_edge": 1024
+    }
+}

processing_fastvlm.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import re
+import torch
+from transformers import ProcessorMixin, BatchFeature, CLIPImageProcessorFast
+from transformers.image_processing_utils import BaseImageProcessor
+from transformers.image_utils import ImageInput
+from typing import Any, Dict, List, Optional, Union
+from PIL import Image
+from .llava_qwen import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+# Adapted from transformers.models.llava_next.image_processing_llava_next.expand_to_square
+def expand_to_square(image: torch.Tensor, background_color=0) -> torch.Tensor:
+    """
+    Pad a channels-first image to a square canvas, keeping the image centred.
+
+    Args:
+        image: Tensor of shape ``(channels, height, width)``.
+        background_color: Scalar fill value used for the padded area.
+
+    Returns:
+        ``image`` itself when it is already square; otherwise a new tensor of shape
+        ``(channels, side, side)`` with ``side = max(height, width)``. The new tensor
+        has the same dtype and lives on the same device as ``image``.
+    """
+    c, height, width = image.shape
+    if width == height:
+        return image
+    side = max(height, width)
+    # new_full inherits dtype *and* device from `image`, so GPU inputs are not
+    # silently moved to the CPU and integer images are not promoted to float.
+    result = image.new_full((c, side, side), background_color)
+    # Only one of the two offsets is non-zero: the shorter dimension is centred.
+    top = (side - height) // 2
+    left = (side - width) // 2
+    result[:, top : top + height, left : left + width] = image
+    return result
+class FastVLMImageProcessor(CLIPImageProcessorFast):
+    """CLIP fast image processor that pads every image to a square before preprocessing."""
+    def _preprocess(self, images, **kwargs):
+        """
+        Pad each image to a square, run the parent preprocessing, and batch the result.
+
+        Returns a ``BatchFeature`` with the stacked ``pixel_values`` and ``image_sizes``,
+        the last two dimensions of every input image in reversed order, captured
+        before padding (presumably ``(width, height)`` for channels-first inputs).
+        """
+        original_sizes = []
+        squared = []
+        for image in images:
+            original_sizes.append(image.shape[-2:][::-1])
+            squared.append(expand_to_square(image))
+        processed = super()._preprocess(squared, **kwargs)
+        batch = torch.stack(processed.pixel_values, dim=0)
+        return BatchFeature(data={"pixel_values": batch, "image_sizes": original_sizes})
+class FastVLMProcessor(ProcessorMixin):
+    """
+    Processor bundling a tokenizer and an image processor for FastVLM.
+
+    Prompts may contain at most one ``DEFAULT_IMAGE_TOKEN`` marker. The marker is
+    replaced by the ``IMAGE_TOKEN_INDEX`` placeholder id in ``input_ids``.
+    """
+    attributes = ["tokenizer", "image_processor"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+    def __init__(
+        self,
+        tokenizer,
+        image_processor,
+        chat_template=None,
+        **kwargs
+    ):
+        super().__init__(tokenizer, image_processor, chat_template=chat_template, **kwargs)
+    def _encode_prompt(self, prompt: str, has_images: bool) -> List[int]:
+        """Tokenize one prompt, splicing in the image placeholder id where required."""
+        marker_count = prompt.count(DEFAULT_IMAGE_TOKEN)
+        if marker_count > 1:
+            raise ValueError(
+                f"Expected up to 1 image tokens per prompt, got {marker_count} instead."
+            )
+        if marker_count == 0 and not has_images:
+            # Text-only prompt: nothing to splice in.
+            return list(self.tokenizer(prompt, add_special_tokens=False)["input_ids"])
+        # IMAGE_TOKEN_INDEX is a placeholder id outside the tokenizer vocabulary, so the
+        # full string cannot be tokenized in one go: tokenize the text on either side of
+        # the marker and insert the id manually. When images are given but the prompt has
+        # no marker, `post` is empty and the placeholder lands at the end of the prompt.
+        pre, _, post = prompt.partition(DEFAULT_IMAGE_TOKEN)
+        pre_ids = self.tokenizer(pre, add_special_tokens=False)["input_ids"]
+        post_ids = self.tokenizer(post, add_special_tokens=False)["input_ids"]
+        return [*pre_ids, IMAGE_TOKEN_INDEX, *post_ids]
+    def _collate(self, sequences: List[List[int]]):
+        """Stack token id lists into ``(input_ids, attention_mask)``, padding ragged batches."""
+        if not sequences:
+            empty = torch.tensor([], dtype=torch.int64)
+            return empty, empty.clone()
+        max_len = max(len(seq) for seq in sequences)
+        pad_id = 0
+        if any(len(seq) != max_len for seq in sequences):
+            pad_id = self.tokenizer.pad_token_id
+            if pad_id is None:
+                raise ValueError(
+                    "Prompts have different lengths but the tokenizer defines no pad token."
+                )
+        # Honour the tokenizer's configured padding side (left padding is the usual
+        # choice for batched generation); fall back to right padding if unset.
+        pad_left = getattr(self.tokenizer, "padding_side", "right") == "left"
+        input_ids = torch.full((len(sequences), max_len), pad_id, dtype=torch.int64)
+        attention_mask = torch.zeros_like(input_ids)
+        for row, seq in enumerate(sequences):
+            if not seq:
+                continue
+            start = max_len - len(seq) if pad_left else 0
+            input_ids[row, start : start + len(seq)] = torch.tensor(seq, dtype=torch.int64)
+            attention_mask[row, start : start + len(seq)] = 1
+        return input_ids, attention_mask
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Optional[Union[str, List[str]]] = None,
+        return_tensors: Optional[str] = "pt",
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Build model inputs from prompts and, optionally, images.
+
+        Args:
+            images: Image(s) forwarded to the image processor; ``None`` for text-only input.
+            text: A prompt or a list of prompts, each with at most one image marker.
+            return_tensors: Tensor type passed to the returned ``BatchFeature``.
+            **kwargs: Accepted for interface compatibility; currently unused.
+
+        Returns:
+            ``BatchFeature`` with ``input_ids``, ``attention_mask`` and, when images are
+            given, the image processor's outputs.
+
+        Raises:
+            TypeError: If ``text`` is neither a string nor a list of strings.
+            ValueError: If a prompt contains more than one image marker, or prompts of
+                different lengths must be padded but the tokenizer has no pad token.
+        """
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) or not all(isinstance(item, str) for item in text):
+            raise TypeError("Invalid input text. Please provide a string, or a list of strings")
+        image_inputs = {}
+        if images is not None:
+            image_inputs = self.image_processor(images=images)
+        # Tokenize regardless of whether images were supplied so text-only calls work too.
+        has_images = images is not None
+        sequences = [self._encode_prompt(prompt, has_images) for prompt in text]
+        input_ids, attention_mask = self._collate(sequences)
+        return BatchFeature(data={"input_ids": input_ids, "attention_mask": attention_mask, **image_inputs}, tensor_type=return_tensors)

processor_config.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+    "chat_template": "{%- if messages is string -%}\n    {{- messages -}}\n{%- else -%}\n    {%- for message in messages -%}\n        {%- if loop.first and messages[0]['role'] != 'system' -%}\n            {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' -}}\n        {%- endif -%}\n        {{- '<|im_start|>' + message['role'] + '\\n' -}}\n        {%- if message['content'] is string -%}\n            {{- message['content'] -}}\n        {%- elif message['content'] is iterable -%}\n            {%- for item in message['content'] -%}\n                {%- if item['type'] == 'image' -%}\n                    {{- '<image>\\n' -}}\n                {%- elif item['type'] == 'text' -%}\n                    {{- item['text'] -}}\n                {%- endif -%}\n            {%- endfor -%}\n        {%- else -%}\n            {{- raise_exception(\"Invalid content type\") -}}\n        {%- endif -%}\n        {{- '<|im_end|>' + '\\n' -}}\n    {%- endfor -%}\n    {%- if add_generation_prompt -%}\n        {{- '<|im_start|>assistant\\n' -}}\n    {%- endif -%}\n{%- endif -%}\n"
+}