Instructions to use amitha/molmo-clip-b16-1b-olmo3 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use amitha/molmo-clip-b16-1b-olmo3 with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# Build an "image-text-to-text" pipeline for this checkpoint.
# NOTE(review): trust_remote_code=True presumably allows Transformers to run the
# custom Python code shipped in the model repo — only enable it for repos you trust.
pipe = pipeline("image-text-to-text", model="amitha/molmo-clip-b16-1b-olmo3", trust_remote_code=True)
# Chat-style input: a single user turn whose content mixes an image (referenced
# by URL) with a text question about that image.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
# Run the pipeline on the messages. The return value is not captured here, so
# assign or print it if you are running this as a script rather than in a notebook.
pipe(text=messages)

# Load model directly
from transformers import AutoModelForImageTextToText
# Loads only the model weights/architecture; no processor or tokenizer is created here.
# NOTE(review): dtype="auto" presumably picks the dtype stored with the checkpoint —
# confirm against the Transformers `from_pretrained` documentation.
model = AutoModelForImageTextToText.from_pretrained("amitha/molmo-clip-b16-1b-olmo3", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps Settings

vLLM

How to use amitha/molmo-clip-b16-1b-olmo3 with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "amitha/molmo-clip-b16-1b-olmo3"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "amitha/molmo-clip-b16-1b-olmo3",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

docker model run hf.co/amitha/molmo-clip-b16-1b-olmo3

SGLang

How to use amitha/molmo-clip-b16-1b-olmo3 with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "amitha/molmo-clip-b16-1b-olmo3" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "amitha/molmo-clip-b16-1b-olmo3",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "amitha/molmo-clip-b16-1b-olmo3" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "amitha/molmo-clip-b16-1b-olmo3",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use amitha/molmo-clip-b16-1b-olmo3 with Docker Model Runner:
```
docker model run hf.co/amitha/molmo-clip-b16-1b-olmo3
```

molmo-clip-b16-1b-olmo3

File size: 9,171 Bytes

76a07b6

# coding=utf-8
"""Processor for the Molmo-v1 (CLIP vision) VLM.

Reproduces the Molmo preprocessor token layout exactly for this VLM's config
(crop_mode=resize, max_crops=1, image_pooling_2d=none, include_cls_token=true):

  per image block (213 tokens; 197 <im_patch>):
    [<im_start>] [<im_patch>(CLS)] then 14x([<im_patch>*14][<im_col>]) [<im_end>]

  full sequence: [BOS] + <pre-image text> + image_block + <post-image text>
  image_input_idx: the 197 <im_patch> positions (CLS first, then 196 row-major),
                   each +1 for the prepended BOS.
"""

from typing import List, Optional, Union

import numpy as np
import torch

from transformers.processing_utils import ProcessorMixin
from transformers.feature_extraction_utils import BatchFeature


class MolmoOlmo3Processor(ProcessorMixin):
    """Turn a text prompt (and optional images) into model inputs.

    The processor tokenizes the prompt, splices a fixed block of image
    placeholder tokens into the token stream for every image, and records the
    sequence positions of the ``<im_patch>`` placeholders so image features
    can be written into them (``image_input_idx``).

    Only a single prompt per call is supported (batch size 1).
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    # token-id constants (dolma2 base 100278; specials appended at 100278..100282)
    IMAGE_PROMPT_TOKEN_ID = 100282   # <|image|>
    IMAGE_START_TOKEN_ID = 100278    # <im_start>
    IMAGE_END_TOKEN_ID = 100279      # <im_end>
    IMAGE_PATCH_TOKEN_ID = 100280    # <im_patch>
    IMAGE_COL_TOKEN_ID = 100281      # <im_col>
    BOS_TOKEN_ID = 100257

    # The only styles these models were trained on (system_prompt_kind='demo_or_style').
    # long_caption/user_qa/synthetic_qa saw the "{style}:" prefix only ~10% of the time
    # (no prefix the other ~90%); transcript was always prefixed.
    KNOWN_STYLES = ("long_caption", "transcript", "user_qa", "synthetic_qa")

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        image_token_length_w: int = 14,
        image_token_length_h: int = 14,
        include_cls_token: bool = True,
        use_col_tokens: bool = True,
        always_start_with_space: bool = True,
        **kwargs,
    ):
        """Store the image-block layout options and register the sub-processors.

        Args:
            image_processor: Object called on the images to produce ``pixel_values``.
            tokenizer: Tokenizer used to encode/decode text.
            image_token_length_w: Number of ``<im_patch>`` tokens per row of the image block.
            image_token_length_h: Number of rows in the image block.
            include_cls_token: If true, one extra ``<im_patch>`` (the CLS slot) is
                placed right after ``<im_start>``.
            use_col_tokens: If true, every row is terminated by an ``<im_col>`` token.
            always_start_with_space: If true, ``format_prompt`` prepends one space.
            **kwargs: Forwarded to ``ProcessorMixin.__init__``.
        """
        self.image_token_length_w = image_token_length_w
        self.image_token_length_h = image_token_length_h
        self.include_cls_token = include_cls_token
        self.use_col_tokens = use_col_tokens
        self.always_start_with_space = always_start_with_space
        super().__init__(image_processor, tokenizer, **kwargs)

    def format_prompt(self, question: str, style=None) -> str:
        """Reproduce Molmo's DataFormatter (system_prompt='demo_or_style', message_format='none').

        Usage:
          - VQA / instruction (most common): `text="your question"`, `style=None`
            -> " your question". This matches ~90% of training (no prefix), so leaving
            style unset is usually best.
          - Captioning: `text=""`, `style=None` -> a bare " " prompt; or
            `text="", style="long_caption"` / `style="transcript"` to request that mode
            explicitly. (Training produced captions/transcripts from an empty user turn.)
          - Steer output mode: pass `style` in {long_caption, transcript, user_qa,
            synthetic_qa} -> "{style}: ...". Note long_caption/user_qa/synthetic_qa only
            saw the prefix ~10% of the time in training; transcript was always prefixed.

        always_start_with_space -> a single leading space is always prepended.

        Args:
            question: The user text. ``None`` is treated like the empty string.
            style: Optional style name used as a ``"{style}:"`` prefix. ``None`` or
                an empty string means "no prefix".

        Returns:
            The formatted prompt string.

        Warns:
            UserWarning: if a non-empty ``style`` is not one of ``KNOWN_STYLES``.
        """
        import warnings

        if question is None:
            # An absent question is an empty user turn, not an error.
            question = ""
        # Only a non-empty style produces a prefix, so only that case can be "unknown".
        if style and style not in self.KNOWN_STYLES:
            warnings.warn(
                f"style={style!r} was not used to train these models; the model may ignore "
                f"or mishandle it. Known styles: {self.KNOWN_STYLES}. Use style=None for the "
                f"default (no-prefix) behavior the model saw ~90% of the time.",
                stacklevel=2,
            )
        prefix = f"{style}:" if style else ""
        if prefix and question:
            text = prefix + " " + question
        elif prefix:
            text = prefix
        else:
            text = question
        if self.always_start_with_space:
            text = " " + text
        return text

    def _image_block(self) -> np.ndarray:
        """Return the placeholder token ids for one image as a 1-D int32 array.

        Layout: ``<im_start>``, optional CLS ``<im_patch>``, then
        ``image_token_length_h`` rows of ``image_token_length_w`` ``<im_patch>``
        tokens (each row followed by ``<im_col>`` when ``use_col_tokens``), and
        finally ``<im_end>``. With the defaults this is 213 tokens.
        """
        row = np.full((self.image_token_length_w,), self.IMAGE_PATCH_TOKEN_ID, dtype=np.int32)
        if self.use_col_tokens:
            row = np.concatenate([row, [self.IMAGE_COL_TOKEN_ID]], 0)
        grid = np.tile(row, [self.image_token_length_h])

        pieces = [[self.IMAGE_START_TOKEN_ID]]
        if self.include_cls_token:
            pieces.append([self.IMAGE_PATCH_TOKEN_ID])
        pieces.extend([grid, [self.IMAGE_END_TOKEN_ID]])
        return np.concatenate(pieces, 0).astype(np.int32)

    def _image_input_idx(self, image_block: np.ndarray) -> np.ndarray:
        """Positions of <im_patch> within the block, shaped (1, features_per_image)."""
        tokens_per_image = self.image_token_length_w * self.image_token_length_h
        features_per_image = tokens_per_image + (1 if self.include_cls_token else 0)
        idx = np.nonzero(image_block == self.IMAGE_PATCH_TOKEN_ID)[0].astype(np.int32)
        return idx.reshape(1, features_per_image)

    def __call__(
        self,
        text: Union[str, List[str]],
        images=None,
        style=None,
        apply_prompt_format: bool = True,
        return_tensors: Optional[str] = "pt",
        **kwargs,
    ) -> BatchFeature:
        """Tokenize text + splice image features.

        By default (apply_prompt_format=True) the text is wrapped with the training-time
        formatting (leading space + optional "{style}: " prefix) and the image is placed
        first (Molmo inserts the image at the start when no <|image|> marker is present).
        Pass apply_prompt_format=False to feed pre-formatted text, or include an explicit
        <|image|> marker to control image placement.

        Args:
            text: The prompt, or a list/tuple holding exactly one prompt. ``None``
                is treated like the empty string.
            images: A single image or a list/tuple of images, or ``None`` for text only.
            style: Optional style prefix, see ``format_prompt``. Not applied when the
                text already contains an ``<|image|>`` marker.
            apply_prompt_format: Whether to run ``format_prompt`` on marker-free text.
            return_tensors: ``"pt"`` converts the outputs to torch tensors; any other
                value leaves them as NumPy arrays.
            **kwargs: Accepted for call compatibility; currently ignored.

        Returns:
            BatchFeature with ``input_ids`` and ``attention_mask`` and, when images
            are given, ``pixel_values`` and ``image_input_idx``.

        Raises:
            NotImplementedError: if more than one prompt is passed.
            ValueError: if the text contains ``<|image|>`` markers whose count
                differs from the number of images.
        """
        if isinstance(text, (list, tuple)):
            if len(text) != 1:
                raise NotImplementedError("MolmoOlmo3Processor supports a single prompt at a time.")
            text = text[0]
        if text is None:
            text = ""
        if images is not None and not isinstance(images, (list, tuple)):
            images = [images]

        # Encode once; re-encode only if the prompt formatting actually changes the text.
        token_ids = self.tokenizer.encode(text, add_special_tokens=False)
        if apply_prompt_format and self.IMAGE_PROMPT_TOKEN_ID not in token_ids:
            text = self.format_prompt(text, style=style)
            token_ids = self.tokenizer.encode(text, add_special_tokens=False)
        tokens = np.array(token_ids, dtype=np.int32)

        if not images:
            input_ids = np.pad(tokens, [[1, 0]], constant_values=self.BOS_TOKEN_ID)
            return self._finalize({"input_tokens": input_ids}, None, None, return_tensors)

        marker_pos = np.argwhere(tokens == self.IMAGE_PROMPT_TOKEN_ID)
        # No marker -> image first (token_ix=-1, matching Molmo's no-marker behavior).
        image_idx = marker_pos[:, 0] if len(marker_pos) else np.array([-1] * len(images))
        if len(image_idx) != len(images):
            # Explicit raise instead of `assert`, which disappears under `python -O`.
            raise ValueError("number of <|image|> markers must match images")

        block = self._image_block()
        patch_idx = self._image_input_idx(block)
        all_pixel = self.image_processor(images, return_tensors=None)["pixel_values"]  # (n,3,H,W)

        out_tokens, all_image_idx = [], []
        cursor = 0    # length of the output sequence assembled so far
        consumed = 0  # index into `tokens` just past the previous marker
        for marker in image_idx:
            if marker == -1:
                # Image goes in front of the text: nothing precedes it.
                text_before = tokens[0:0]
            else:
                text_before = tokens[consumed:marker]
                consumed = marker + 1  # drop the <|image|> marker itself
            out_tokens.append(text_before)
            cursor += len(text_before)
            # Patch positions are relative to the block; shift them to where the
            # block actually lands in the output, which accounts for the tokens
            # added by any earlier image blocks.
            all_image_idx.append(patch_idx + cursor)
            out_tokens.append(block)
            cursor += len(block)
        out_tokens.append(tokens[consumed:])

        input_ids = np.concatenate(out_tokens, 0)
        image_input_idx = np.concatenate(all_image_idx, 0)

        # prepend BOS; shift image_input_idx by +1 (matches Molmo inference path)
        input_ids = np.pad(input_ids, [[1, 0]], constant_values=self.BOS_TOKEN_ID)
        image_input_idx = np.where(image_input_idx < 0, image_input_idx, image_input_idx + 1)

        return self._finalize(
            {"input_tokens": input_ids, "image_input_idx": image_input_idx[None]},
            all_pixel, image_input_idx[None], return_tensors,
        )

    def _finalize(self, out, pixel_values, image_input_idx, return_tensors):
        """Add the batch dimension, build the attention mask and wrap in a BatchFeature.

        Args:
            out: Dict whose ``"input_tokens"`` entry is the 1-D token id array.
            pixel_values: Image processor output (array-like, or a list/tuple of
                per-image arrays), or ``None`` for text-only input.
            image_input_idx: Patch positions to expose when ``pixel_values`` is given.
            return_tensors: ``"pt"`` converts every entry to a torch tensor.
        """
        input_ids = out["input_tokens"].astype(np.int64)[None]  # (1, seq)
        attention_mask = np.ones_like(input_ids)
        data = {"input_ids": input_ids, "attention_mask": attention_mask}
        if pixel_values is not None:
            if isinstance(pixel_values, (list, tuple)):
                # With return_tensors=None an image processor may hand back a Python
                # list of per-image arrays, which cannot be indexed with [None].
                pixel_values = np.stack([np.asarray(p) for p in pixel_values], 0)
            data["pixel_values"] = pixel_values[None]  # (1, n_images, 3, H, W)
            data["image_input_idx"] = image_input_idx  # (1, n_images, features_per_image)
        if return_tensors == "pt":
            data = {k: torch.as_tensor(v) for k, v in data.items()}
            if "pixel_values" in data:
                data["pixel_values"] = data["pixel_values"].to(torch.float32)
        return BatchFeature(data=data, tensor_type=None)

    def batch_decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward all arguments to the tokenizer's ``decode``."""
        return self.tokenizer.decode(*args, **kwargs)


# Public API of this module: only the processor class is exported.
__all__ = ["MolmoOlmo3Processor"]