from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from PIL import Image
import requests

processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

# Load the model in half precision and move it to the first GPU.
model = LlavaNextForConditionalGeneration.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.to("cuda:0")
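
# Optional lower-memory variant (a sketch, assuming the bitsandbytes package is
# installed): load the weights in 4-bit via a BitsAndBytesConfig instead of the
# float16 load above. The quantized model is placed on the GPU while loading, so
# the explicit .to("cuda:0") call should be skipped.
#
# from transformers import BitsAndBytesConfig
# quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
# model = LlavaNextForConditionalGeneration.from_pretrained(
#     "llava-hf/llava-v1.6-mistral-7b-hf",
#     quantization_config=quantization_config,
#     low_cpu_mem_usage=True,
# )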

# Download the example image and build the prompt; "<image>" marks where the image
# features are inserted, wrapped in the Mistral [INST] ... [/INST] format.
url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
image = Image.open(requests.get(url, stream=True).raw)
prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"

# Use keyword arguments so the text/image order is unambiguous.
inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda:0")

# Autoregressively generate up to 100 new tokens and print the decoded response
# (the prompt is included in the decoded sequence).
output = model.generate(**inputs, max_new_tokens=100)
print(processor.decode(output[0], skip_special_tokens=True))
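
# To stream tokens to stdout as they are generated (a sketch using transformers'
# TextStreamer, wrapping the processor's underlying tokenizer):
#
# from transformers import TextStreamer
# streamer = TextStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
# output = model.generate(**inputs, max_new_tokens=100, streamer=streamer)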