"""Image-description demo for the ``Zero-Vision/Llama-3-MixSense`` checkpoint.

The script loads the model and tokenizer with ``from_pretrained``, encodes a
text prompt together with ``example.jpg`` using the model's own
``text_process`` / ``image_process`` helpers, runs ``generate`` and prints the
decoded answer.
"""
import os
import warnings

import torch
import transformers
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Silence transformers logging / progress bars and Python warnings so that only
# the generated text reaches stdout.
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings("ignore")

# Single source of truth for the checkpoint used by both model and tokenizer.
MODEL_ID = "Zero-Vision/Llama-3-MixSense"

# Select the device automatically instead of requiring a manual edit:
# CUDA with float16 when a GPU is available, otherwise CPU with float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# Create the model and tokenizer. ``trust_remote_code=True`` executes Python
# code shipped with the checkpoint -- only use it with repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)

# Text prompt (kept verbatim; changing the wording changes the model input).
qs = "describe the image detailly."
input_ids = model.text_process(qs, tokenizer).to(device)

# Open the image in a context manager so the file handle is released as soon
# as preprocessing has produced the tensor.
with Image.open("example.jpg") as image:
    image_tensor = model.image_process([image]).to(dtype=model.dtype, device=device)

# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>"
# token.
stop_token_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids(["<|eot_id|>"])[0],
]

# generate
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=image_tensor,
        max_new_tokens=2048,
        use_cache=True,
        eos_token_id=stop_token_ids,
    )

print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip())