Instructions to use Salesforce/GTA1-7B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use Salesforce/GTA1-7B with Transformers:

# High-level route: a pipeline wraps preprocessing, generation and decoding.
from transformers import pipeline

# One user turn made of an image part followed by a text part.
image_part = {
    "type": "image",
    "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
}
text_part = {"type": "text", "text": "What animal is on the candy?"}
messages = [{"role": "user", "content": [image_part, text_part]}]

pipe = pipeline("image-text-to-text", model="Salesforce/GTA1-7B")
pipe(text=messages)

# Lower-level route: load the processor and the model explicitly.
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("Salesforce/GTA1-7B")
model = AutoModelForImageTextToText.from_pretrained("Salesforce/GTA1-7B")

# One user turn made of an image part followed by a text part.
image_part = {
    "type": "image",
    "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
}
text_part = {"type": "text", "text": "What animal is on the candy?"}
messages = [{"role": "user", "content": [image_part, text_part]}]

# Render the chat template straight to tensors, then move them to the model's device.
template_options = {
    "add_generation_prompt": True,
    "tokenize": True,
    "return_dict": True,
    "return_tensors": "pt",
}
inputs = processor.apply_chat_template(messages, **template_options).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)

# Skip the prompt tokens so only the newly generated continuation is printed.
prompt_length = inputs["input_ids"].shape[-1]
print(processor.decode(outputs[0][prompt_length:]))

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use Salesforce/GTA1-7B with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "Salesforce/GTA1-7B"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Salesforce/GTA1-7B",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

docker model run hf.co/Salesforce/GTA1-7B

SGLang

How to use Salesforce/GTA1-7B with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "Salesforce/GTA1-7B" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Salesforce/GTA1-7B",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Salesforce/GTA1-7B" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Salesforce/GTA1-7B",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use Salesforce/GTA1-7B with Docker Model Runner:
```
docker model run hf.co/Salesforce/GTA1-7B
```

GTA1-7B / processing_opencua.py

HelloKKMe

Upload folder using huggingface_hub

d43fbcb verified 8 months ago

raw

history blame

4.54 kB

	# processing_opencua.py
	import torch
	from typing import List, Dict, Any, Union
	from PIL import Image
	from transformers.processing_utils import ProcessorMixin, BatchFeature
	from transformers import AutoTokenizer, AutoImageProcessor

	# Literal marker that stands for one image slot in the rendered chat text.
	# The backslashes previously embedded in this literal were markdown-escape
	# residue ("\|"); they are not part of the token and prevented any match.
	PLACEHOLDER = "<|media_placeholder|>"

	class OpenCUAProcessor(ProcessorMixin):
	    """Minimal processor that pairs an image processor with a tokenizer.

	    The class exposes just enough surface (construction, chat templating,
	    a stub ``__call__``) for the object to be instantiated, plus a helper
	    that expands image placeholders for hand-written vLLM scripts.
	    """

	    attributes = ["image_processor", "tokenizer", "image_token_id", "merge_size"]

	    def __init__(self, image_processor, tokenizer, image_token_id: int = 151664, merge_size: int = 2, **kwargs):
	        """Store the sub-components and placeholder-related settings.

	        Args:
	            image_processor: Object used to preprocess images.
	            tokenizer: Tokenizer that also provides ``apply_chat_template``.
	            image_token_id: Token id stored on the processor as-is.
	            merge_size: Fallback merge factor, used only when
	                ``image_processor`` has no ``merge_size`` attribute.
	            **kwargs: Accepted and ignored so ``from_pretrained`` can forward
	                its keyword arguments unchanged.
	        """
	        # NOTE(review): ProcessorMixin.__init__ is not invoked here; presumably
	        # because image_token_id / merge_size are plain values rather than
	        # sub-processors -- confirm before adding a super().__init__() call.
	        self.image_processor = image_processor
	        self.tokenizer = tokenizer
	        self.image_token_id = image_token_id
	        # The image processor's own merge_size wins over the argument.
	        self.merge_size = getattr(image_processor, "merge_size", merge_size)

	    @classmethod
	    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
	        """Build a processor from a model directory or hub id.

	        Args:
	            pretrained_model_name_or_path: Location passed to the tokenizer
	                and image-processor loaders.
	            **kwargs: Forwarded to ``__init__``; ``trust_remote_code`` is also
	                read from here.

	        Returns:
	            A new ``OpenCUAProcessor``.
	        """
	        # NOTE(review): trust_remote_code defaults to True here, which allows
	        # repository code to run; callers loading untrusted checkpoints should
	        # pass trust_remote_code=False explicitly.
	        trust = kwargs.get("trust_remote_code", True)
	        # Prefer the repository's TikTokenV3; on any failure fall back to
	        # AutoTokenizer (only needed for initialization / as a stand-in).
	        try:
	            from tokenization_opencua import TikTokenV3
	            tok = TikTokenV3.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
	        except Exception:
	            # Deliberately broad: this is a best-effort fallback path.
	            tok = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
	        imgproc = AutoImageProcessor.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
	        return cls(imgproc, tok, **kwargs)

	    def apply_chat_template(self, messages: List[Dict[str, Any]], **kwargs) -> Union[str, List[int]]:
	        """Delegate chat templating to the wrapped tokenizer."""
	        return self.tokenizer.apply_chat_template(messages, **kwargs)

	    # The methods below serve the HF code path; vLLM initialization only
	    # needs the object to be constructible.
	    def __call__(self, *args, **kwargs) -> BatchFeature:
	        """Return a minimal dummy batch regardless of the arguments.

	        The signature accepts arbitrary positional and keyword arguments so
	        that processor-style calls (e.g. ``processor(text=..., images=...)``)
	        do not raise; the arguments themselves are ignored.
	        """
	        # Minimal structure so an actual call does not crash.
	        data = {"input_ids": torch.zeros(1, 1, dtype=torch.long)}
	        return BatchFeature(data=data)

	    # Optional helper intended for user scripts.
	    def prepare_vllm_inputs(self, messages, images, add_generation_prompt=True):
	        """Render the prompt and expand each image placeholder.

	        The chat template is rendered to text, the images are run through the
	        image processor, and the i-th ``PLACEHOLDER`` occurrence in the text
	        is repeated ``t * h * w // merge_size**2`` times, where ``(t, h, w)``
	        is the i-th row of the image processor's ``image_grid_thw`` output.

	        Args:
	            messages: Chat messages passed to ``apply_chat_template``.
	            images: Images passed to the image processor and returned as-is.
	            add_generation_prompt: Forwarded to ``apply_chat_template``.

	        Returns:
	            A ``(text, images)`` tuple with the expanded prompt text.
	        """
	        text = self.apply_chat_template(messages, tokenize=False, add_generation_prompt=add_generation_prompt)
	        proc = self.image_processor(images=images, return_tensors="pt")
	        grid = torch.as_tensor(proc["image_grid_thw"])
	        merge = getattr(self, "merge_size", 2)
	        counts = [int((thw[0] * thw[1] * thw[2]) // (merge ** 2)) for thw in grid]

	        # Rebuild the text around the ORIGINAL placeholder positions. Calling
	        # text.replace(PLACEHOLDER, ..., 1) repeatedly would, from the second
	        # image on, hit the first copy of the already expanded run instead of
	        # the next image's placeholder.
	        segments = text.split(PLACEHOLDER)
	        pieces = [segments[0]]
	        for index, tail in enumerate(segments[1:]):
	            # Placeholders without a matching grid row stay a single token.
	            repeat = counts[index] if index < len(counts) else 1
	            pieces.append(PLACEHOLDER * repeat)
	            pieces.append(tail)
	        return "".join(pieces), images



	# # processing_opencua.py
	# from transformers import Qwen2_5_VLProcessor, AutoTokenizer, AutoImageProcessor

	# class OpenCUAProcessor(Qwen2_5_VLProcessor):
	# # 用字符串就行，但我们会在 from_pretrained 里手动加载，避免字符串反射
	# tokenizer_class = "TikTokenV3"

	# @classmethod
	# def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
	# # 确保 remote code 可用
	# trust_remote_code = kwargs.get("trust_remote_code", False)

	# # 1) 手动加载 tokenizer（会按模型目录里的 tokenizer_config.json -> TikTokenV3 + tokenization_opencua.py）
	# tokenizer = AutoTokenizer.from_pretrained(
	# pretrained_model_name_or_path,
	# trust_remote_code=trust_remote_code,
	# )

	# # 2) 手动加载图像处理器（保持 Qwen2VLImageProcessor）
	# image_processor = AutoImageProcessor.from_pretrained(
	# pretrained_model_name_or_path,
	# trust_remote_code=trust_remote_code,
	# )

	# # 3) 获取chat_template，如果tokenizer有的话
	# chat_template = getattr(tokenizer, 'chat_template', None)

	# # 4) 构造并返回 Qwen2.5-VL 的 Processor 实例，传递chat_template
	# processor = cls(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template)

	# # 5) 添加vLLM需要的属性
	# # 这些token ID需要与tokenizer_config.json中的定义一致
	# processor.image_token = "<|media_placeholder|>" # use OpenCUA's media placeholder
	# processor.video_token = "<|media_placeholder|>" # videos use the same placeholder

	# # 添加token ID（从tokenizer_config.json中获取）
	# vocab = tokenizer.get_vocab()
	# processor.image_token_id = vocab.get("<|media_placeholder|>", 151664) # defaults to 151664
	# processor.video_token_id = vocab.get("<|media_placeholder|>", 151664) # videos use the same ID

	# return processor