"""Single-example inference with a LoRA-finetuned Qwen3-VL checkpoint.

Loads an 8-bit-quantized vision-language model from a local checkpoint,
feeds it two chest X-ray images plus a text prompt, generates a report,
and prints token counts, timing, and the raw decoded output.
"""

import os
import time
from pathlib import Path

# Pin to a single GPU *before* torch is imported so CUDA only sees device 7.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

import torch
from PIL import Image
from unsloth import FastVisionModel
from transformers import AutoProcessor

CHECKPOINT_PATH = "outputs/mimic_qwen3vl_lora_8bit_5/checkpoint-17454"
BASE_MODEL_NAME = "unsloth/Qwen3-VL-8B-Thinking"
SYSTEM_PROMPT_PATH = Path(__file__).with_name("new_system_prompt.txt")
IMAGE_PATH_1 = Path(
    "/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/7e962a95-d661c0db-4769286c-e150a106-fb9586c6.jpg"
)
IMAGE_PATH_2 = Path(
    "/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/f605b192-2e612578-c5c95dc3-b9d6d13b-e0eee500.jpg"
)

SYSTEM_PROMPT = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip()
INPUT_IMAGE_1 = Image.open(IMAGE_PATH_1).convert("RGB")
INPUT_IMAGE_2 = Image.open(IMAGE_PATH_2).convert("RGB")

# Model weights come from the fine-tuned checkpoint, loaded in 8-bit.
model, _ = FastVisionModel.from_pretrained(
    model_name=CHECKPOINT_PATH,
    load_in_4bit=False,
    load_in_8bit=True,
)
# Processor is loaded from the base model name, not the checkpoint dir —
# presumably the checkpoint does not ship processor files (TODO confirm).
processor = AutoProcessor.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
FastVisionModel.for_inference(model)

# The prompt is sent inside the user turn (after both images) rather than
# as a separate system message.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": INPUT_IMAGE_1},
            {"type": "image", "image": INPUT_IMAGE_2},
            {"type": "text", "text": SYSTEM_PROMPT},
        ],
    },
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
device = next(model.parameters()).device
inputs = inputs.to(device)

# Synchronize around generate() so the wall-clock delta measures GPU work,
# not just kernel-launch time.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start_time = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=4096)
if torch.cuda.is_available():
    torch.cuda.synchronize()
gen_time_seconds = time.perf_counter() - start_time

# Count non-padding prompt tokens via the attention mask; fall back to the
# full input length if the processor returned no mask.
if "attention_mask" in inputs:
    input_tokens = int(inputs["attention_mask"][0].sum().item())
else:
    input_tokens = int(inputs["input_ids"].shape[-1])
total_tokens = int(outputs.shape[-1])
output_tokens = total_tokens - input_tokens

# Decode only the newly generated suffix. Special tokens are deliberately
# kept — the output is labeled "Raw" below.
generated_text = processor.decode(outputs[0][input_tokens:])

print(f"Input tokens: {input_tokens}")
print(f"Output tokens: {output_tokens}")
print(f"Total tokens: {total_tokens}")
print(f"Generation time (s): {gen_time_seconds:.3f}")
print("\n--- Raw Output ---")
print(generated_text)