import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings

# silence library warnings and progress bars
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings("ignore")

# set device
device = "cuda"  # or "cpu"

# load the model and tokenizer (trust_remote_code pulls in the
# model's custom text/image processing helpers)
model = AutoModelForCausalLM.from_pretrained(
    "Zero-Vision/Llama-3-MixSense",
    torch_dtype=torch.float16,  # use torch.float32 for CPU
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "Zero-Vision/Llama-3-MixSense",
    trust_remote_code=True,
)

# build the text and image inputs with the model's remote-code helpers
qs = "Describe the image in detail."
input_ids = model.text_process(qs, tokenizer).to(device)

image = Image.open("example.jpg")
image_tensor = model.image_process([image]).to(dtype=model.dtype, device=device)

# generate, stopping on either the tokenizer's EOS token or Llama 3's <|eot_id|>
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=image_tensor,
        max_new_tokens=2048,
        use_cache=True,
        eos_token_id=[
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ],
    )
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip())
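
# A CPU-only variant, sketched from the comments above ("or cpu",
# "float32 for cpu"): load the weights in float32 and keep the model
# on the CPU instead of using device_map="auto". Same model ID and
# remote-code helpers; this assumes the rest of the script is unchanged.
#
# device = "cpu"
# model = AutoModelForCausalLM.from_pretrained(
#     "Zero-Vision/Llama-3-MixSense",
#     torch_dtype=torch.float32,  # float16 is poorly supported on CPU
#     trust_remote_code=True,
# ).to(device)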