Image-Text-to-Text
Transformers
Safetensors
English
qwen2_5_vl
VLM
Computer-Use-Agent
OS-Agent
GUI
Grounding
conversational
Eval Results
text-generation-inference
Instructions to use Salesforce/GTA1-7B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Salesforce/GTA1-7B with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="Salesforce/GTA1-7B")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)
```
```python
# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("Salesforce/GTA1-7B")
model = AutoModelForImageTextToText.from_pretrained("Salesforce/GTA1-7B")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Salesforce/GTA1-7B with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Salesforce/GTA1-7B"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Salesforce/GTA1-7B",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```
Use Docker
docker model run hf.co/Salesforce/GTA1-7B
- SGLang
How to use Salesforce/GTA1-7B with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "Salesforce/GTA1-7B" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Salesforce/GTA1-7B",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Salesforce/GTA1-7B" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "Salesforce/GTA1-7B",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this image in one sentence."
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                        }
                    }
                ]
            }
        ]
    }'
```
- Docker Model Runner
How to use Salesforce/GTA1-7B with Docker Model Runner:
docker model run hf.co/Salesforce/GTA1-7B
| # processing_opencua.py | |
| import torch | |
| from typing import List, Dict, Any, Union | |
| from PIL import Image | |
| from transformers.processing_utils import ProcessorMixin, BatchFeature | |
| from transformers import AutoTokenizer, AutoImageProcessor | |
# Literal media token that appears in the rendered chat template; it is
# repeated once per merged image patch in `prepare_vllm_inputs`.
PLACEHOLDER = "<|media_placeholder|>"


class OpenCUAProcessor(ProcessorMixin):
    """Thin processor that pairs an image processor with a tokenizer.

    The instance only stores its components and forwards chat templating to
    the tokenizer. `__call__` is a stub that returns a dummy `BatchFeature`;
    real multimodal preprocessing is exposed through `prepare_vllm_inputs`.
    """

    # NOTE(review): "image_token_id" and "merge_size" are plain values rather
    # than loadable components; kept as-is for compatibility — confirm that
    # nothing relies on ProcessorMixin's attribute-driven save/load here.
    attributes = ["image_processor", "tokenizer", "image_token_id", "merge_size"]

    def __init__(self, image_processor, tokenizer, image_token_id: int = 151664, merge_size: int = 2, **kwargs):
        """Store the components directly.

        Args:
            image_processor: Object called as
                `image_processor(images=..., return_tensors="pt")`.
            tokenizer: Object providing `apply_chat_template`.
            image_token_id: Token id recorded for the media placeholder.
            merge_size: Fallback spatial merge factor, used only when
                `image_processor` has no `merge_size` attribute.
            **kwargs: Accepted and ignored so that loader keyword arguments
                (e.g. `trust_remote_code`) can be passed straight through.
        """
        # The base-class initializer is not invoked; attributes are assigned
        # by hand, exactly as the original implementation did.
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.image_token_id = image_token_id
        self.merge_size = getattr(image_processor, "merge_size", merge_size)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Build a processor from a model name or local directory.

        Args:
            pretrained_model_name_or_path: Identifier forwarded to the
                tokenizer and image-processor loaders.
            **kwargs: `trust_remote_code` (default ``True``) is read for the
                loaders; all kwargs are also forwarded to the constructor.

        Returns:
            A new instance of `cls`.
        """
        trust = kwargs.get("trust_remote_code", True)
        # Prefer the repository's own TikTokenV3; if it cannot be imported or
        # loaded, fall back to AutoTokenizer (deliberate best-effort: the
        # tokenizer is only needed for initialization / as a stand-in).
        try:
            from tokenization_opencua import TikTokenV3
            tok = TikTokenV3.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
        except Exception:
            tok = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
        imgproc = AutoImageProcessor.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust)
        return cls(imgproc, tok, **kwargs)

    def apply_chat_template(self, messages: List[Dict[str, Any]], **kwargs) -> Union[str, List[int]]:
        """Delegate chat templating to the tokenizer, forwarding all kwargs."""
        return self.tokenizer.apply_chat_template(messages, **kwargs)

    # The methods below serve the Hugging Face code path; vLLM initialization
    # only needs the processor to be constructible.
    def __call__(self, *args, **kwargs) -> BatchFeature:
        """Return a minimal dummy batch so an accidental call does not crash.

        All arguments are ignored; the result holds a single zero token id.
        """
        data = {"input_ids": torch.zeros(1, 1, dtype=torch.long)}
        return BatchFeature(data=data)

    @staticmethod
    def _expand_placeholders(text: str, counts: List[int]) -> str:
        """Repeat the i-th PLACEHOLDER in `text` `counts[i]` times.

        Placeholders without a matching count are left as a single token.
        The text is rebuilt in one left-to-right pass so that an already
        expanded run is never expanded again.
        """
        segments = text.split(PLACEHOLDER)
        rebuilt = [segments[0]]
        for index, segment in enumerate(segments[1:]):
            repeat = counts[index] if index < len(counts) else 1
            rebuilt.append(PLACEHOLDER * repeat)
            rebuilt.append(segment)
        return "".join(rebuilt)

    # Optional helper intended for standalone scripts.
    def prepare_vllm_inputs(self, messages, images, add_generation_prompt=True):
        """Render the prompt and expand each image placeholder.

        Args:
            messages: Chat messages passed to `apply_chat_template`.
            images: Images passed to the image processor; returned unchanged.
            add_generation_prompt: Forwarded to `apply_chat_template`.

        Returns:
            `(text, images)` where the n-th placeholder in `text` is repeated
            `t * h * w // merge_size**2` times for the n-th row of the image
            processor's `image_grid_thw` output.
        """
        text = self.apply_chat_template(messages, tokenize=False, add_generation_prompt=add_generation_prompt)
        proc = self.image_processor(images=images, return_tensors="pt")
        grid = torch.as_tensor(proc["image_grid_thw"])
        merge = getattr(self, "merge_size", 2)
        counts = [int((thw[0] * thw[1] * thw[2]) // (merge ** 2)) for thw in grid]
        # A repeated `str.replace(..., 1)` would keep hitting the first
        # image's (already expanded) run, so expand positionally instead.
        text = self._expand_placeholders(text, counts)
        return text, images
| # # processing_opencua.py | |
| # from transformers import Qwen2_5_VLProcessor, AutoTokenizer, AutoImageProcessor | |
| # class OpenCUAProcessor(Qwen2_5_VLProcessor): | |
| # # A string is enough here, but we load manually in from_pretrained to avoid string-based reflection | |
| # tokenizer_class = "TikTokenV3" | |
| # @classmethod | |
| # def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): | |
| # # Make sure remote code is available | |
| # trust_remote_code = kwargs.get("trust_remote_code", False) | |
| # # 1) Manually load the tokenizer (resolved via the model directory's tokenizer_config.json -> TikTokenV3 + tokenization_opencua.py) | |
| # tokenizer = AutoTokenizer.from_pretrained( | |
| # pretrained_model_name_or_path, | |
| # trust_remote_code=trust_remote_code, | |
| # ) | |
| # # 2) Manually load the image processor (keep Qwen2VLImageProcessor) | |
| # image_processor = AutoImageProcessor.from_pretrained( | |
| # pretrained_model_name_or_path, | |
| # trust_remote_code=trust_remote_code, | |
| # ) | |
| # # 3) Get the chat_template, if the tokenizer has one | |
| # chat_template = getattr(tokenizer, 'chat_template', None) | |
| # # 4) Build and return the Qwen2.5-VL Processor instance, passing chat_template | |
| # processor = cls(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template) | |
| # # 5) Add the attributes vLLM needs | |
| # # These token IDs must match the definitions in tokenizer_config.json | |
| # processor.image_token = "<|media_placeholder|>" # use OpenCUA's media placeholder | |
| # processor.video_token = "<|media_placeholder|>" # video uses the same placeholder | |
| # # Add token IDs (taken from tokenizer_config.json) | |
| # vocab = tokenizer.get_vocab() | |
| # processor.image_token_id = vocab.get("<|media_placeholder|>", 151664) # default 151664 | |
| # processor.video_token_id = vocab.get("<|media_placeholder|>", 151664) # video uses the same ID | |
| # return processor | |