import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
import os
# Silence transformers/hub chatter and generic warnings for a clean demo run.
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings("ignore")

# Pick the device automatically so the script also runs on CPU-only machines
# (the original hard-coded "cuda" and would crash without a GPU).
device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 halves memory on GPU; CPU kernels generally expect float32.
dtype = torch.float16 if device == "cuda" else torch.float32

# Load model + tokenizer. trust_remote_code is required: the checkpoint ships
# the custom text_process/image_process helpers used below.
model = AutoModelForCausalLM.from_pretrained(
    "Zero-Vision/Llama-3-MixSense",
    torch_dtype=dtype,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "Zero-Vision/Llama-3-MixSense",
    trust_remote_code=True,
)

# NOTE(review): prompt kept verbatim ("detailly") — rewording changes model output.
qs = "describe the image detailly."
input_ids = model.text_process(qs, tokenizer).to(device)

# Close the image file handle once the tensor is built (PIL keeps it open otherwise).
with Image.open("example.jpg") as image:
    image_tensor = model.image_process([image]).to(dtype=model.dtype, device=device)

# Generate the description. Llama-3 chat checkpoints terminate turns with
# <|eot_id|> in addition to the tokenizer's default EOS token, so stop on both.
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=image_tensor,
        max_new_tokens=2048,
        use_cache=True,
        eos_token_id=[
            tokenizer.eos_token_id,
            # convert_tokens_to_ids accepts a single token string directly;
            # no need to wrap in a list and index.
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ],
    )
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip())