import os
import time
from pathlib import Path


# Pin this process to GPU 7. This MUST be set before torch / unsloth are
# imported: CUDA device visibility is fixed when the CUDA runtime first
# initializes, which is why the import block is deliberately split in two.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"


import torch
from PIL import Image
from unsloth import FastVisionModel
from transformers import AutoProcessor
|
|
# --- Configuration -----------------------------------------------------------
# Fine-tuned LoRA checkpoint to run, and the base model whose processor
# (tokenizer + image preprocessor) is reused for it.
CHECKPOINT_PATH = "outputs/mimic_qwen3vl_lora_8bit_5/checkpoint-17454"
BASE_MODEL_NAME = "unsloth/Qwen3-VL-8B-Thinking"
# Prompt text is kept in a sibling file so it can be edited without code changes.
SYSTEM_PROMPT_PATH = Path(__file__).with_name("new_system_prompt.txt")
# Two images from the same study directory (s50000230).
# NOTE(review): presumably two views of one chest X-ray study — confirm.
IMAGE_PATH_1 = Path(
    "/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/7e962a95-d661c0db-4769286c-e150a106-fb9586c6.jpg"
)
IMAGE_PATH_2 = Path(
    "/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/f605b192-2e612578-c5c95dc3-b9d6d13b-e0eee500.jpg"
)


# Eagerly load the prompt and both images at import time; .convert("RGB")
# normalizes grayscale/paletted JPEGs to the 3-channel input the model expects.
SYSTEM_PROMPT = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip()
INPUT_IMAGE_1 = Image.open(IMAGE_PATH_1).convert("RGB")
INPUT_IMAGE_2 = Image.open(IMAGE_PATH_2).convert("RGB")
|
|
# Load the fine-tuned checkpoint in 8-bit; the tokenizer returned by unsloth is
# discarded because the base model's AutoProcessor is used for templating below.
model, _ = FastVisionModel.from_pretrained(
    model_name=CHECKPOINT_PATH,
    load_in_4bit=False,
    load_in_8bit=True,
)
processor = AutoProcessor.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
# Switch unsloth's patched model into inference mode before generating.
FastVisionModel.for_inference(model)

# Single-turn conversation: both images first, then the instruction text.
# NOTE(review): the prompt is sent as *user* content rather than a system
# message — confirm this matches how the checkpoint was fine-tuned.
prompt_content = [
    {"type": "image", "image": img} for img in (INPUT_IMAGE_1, INPUT_IMAGE_2)
]
prompt_content.append({"type": "text", "text": SYSTEM_PROMPT})
messages = [{"role": "user", "content": prompt_content}]

# Render the chat template and tokenize straight to PyTorch tensors.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
|
|
# Move the batch to wherever the (possibly sharded/quantized) model lives.
device = next(model.parameters()).device
inputs = inputs.to(device)


# Time only the generate() call. CUDA launches are asynchronous, so the GPU is
# synchronized on both sides of the timer to get an honest wall-clock number.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start_time = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=4096)
if torch.cuda.is_available():
    torch.cuda.synchronize()
gen_time_seconds = time.perf_counter() - start_time


# Number of *real* (non-padded) prompt tokens — for reporting only.
if "attention_mask" in inputs:
    input_tokens = int(inputs["attention_mask"][0].sum().item())
else:
    input_tokens = int(inputs["input_ids"].shape[-1])


# generate() returns prompt + continuation, so the continuation starts at the
# full prompt sequence length. Slicing by the attention-mask sum (as before)
# would start inside the prompt if padding were ever present; with the current
# single-sequence, unpadded batch the two are identical.
prompt_length = int(inputs["input_ids"].shape[-1])
total_tokens = int(outputs.shape[-1])
output_tokens = total_tokens - prompt_length


# Decode only the newly generated tokens; special tokens are kept on purpose
# ("Raw Output") so the model's thinking markers remain visible.
generated_text = processor.decode(outputs[0][prompt_length:])


print(f"Input tokens: {input_tokens}")
print(f"Output tokens: {output_tokens}")
print(f"Total tokens: {total_tokens}")
print(f"Generation time (s): {gen_time_seconds:.3f}")
print("\n--- Raw Output ---")
print(generated_text)