from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration import torch from PIL import Image import requests processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf") model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True) model.to("cuda:0") # prepare image and text prompt, using the appropriate prompt template url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" image = Image.open(requests.get(url, stream=True).raw) prompt = "[INST] \nWhat is shown in this image? [/INST]" inputs = processor(prompt, image, return_tensors="pt").to("cuda:0") # autoregressively complete prompt output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True))