| import torch |
| import transformers |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
| from PIL import Image |
| import warnings |
| import os |
|
|
# Route Hugging Face Hub downloads through a mirror endpoint.
# setdefault (rather than a hard assignment) respects a mirror the user
# has already configured in their environment instead of clobbering it.
# NOTE(review): some huggingface_hub versions read HF_ENDPOINT at import
# time — setting it *after* `import transformers` may be too late; verify,
# or move this above the imports.
os.environ.setdefault("HF_ENDPOINT", "https://hf-mirror.com")

# Silence transformers' informational logging, its download progress
# bars, and Python warnings so only the generated text reaches stdout.
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings("ignore")
|
|
| |
# --- Device selection -------------------------------------------------
# Fall back to CPU when CUDA is unavailable so the script still runs
# (slowly) instead of crashing on GPU-less machines.  `device` is only
# used for the *input* tensors; the model weights themselves are placed
# by ``device_map="auto"`` below.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Single source of truth for the checkpoint id (was duplicated below).
MODEL_ID = "Zero-Vision/Llama-3-MixSenseV1_1"

# --- Model & tokenizer ------------------------------------------------
# trust_remote_code=True is required: ``text_process`` / ``image_process``
# are custom methods shipped in the model repository, not part of
# transformers itself.  NOTE(review): remote code executes arbitrary
# Python from the Hub — pin a revision for anything beyond a demo.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)

# --- Inputs -----------------------------------------------------------
# Prompt grammar fixed ("detailly" is not an English word).
qs = "describe the image in detail."
input_ids = model.text_process(qs, tokenizer).to(device)

# .convert("RGB") guards against palette, greyscale, CMYK, or RGBA
# files that would otherwise feed a wrongly-shaped array to the vision
# preprocessor; for plain RGB JPEGs it is a no-op.
image = Image.open("example.jpg").convert("RGB")
image_tensor = model.image_process([image]).to(dtype=model.dtype, device=device)

# --- Generation -------------------------------------------------------
with torch.inference_mode():
    output_ids = model.generate(
        input_ids,
        images=image_tensor,
        max_new_tokens=2048,
        use_cache=True,
        # Llama-3 chat models end a turn with <|eot_id|>, which differs
        # from the tokenizer's default EOS token — stop on either one.
        # convert_tokens_to_ids with a single string returns the id
        # directly (no list indexing needed).
        eos_token_id=[
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        ],
    )
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip())