"""Single-example inference with a LoRA-finetuned Qwen3-VL checkpoint.

Loads an 8-bit-quantized vision-language model from a local checkpoint,
feeds it two chest X-ray images plus a text prompt, generates a report,
and prints token counts, timing, and the raw decoded output.
"""

import os
import time
from pathlib import Path

# Pin to a single GPU *before* torch is imported so CUDA only sees device 7.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

import torch
from PIL import Image
from unsloth import FastVisionModel
from transformers import AutoProcessor

CHECKPOINT_PATH = "outputs/mimic_qwen3vl_lora_8bit_5/checkpoint-17454"
BASE_MODEL_NAME = "unsloth/Qwen3-VL-8B-Thinking"
SYSTEM_PROMPT_PATH = Path(__file__).with_name("new_system_prompt.txt")
IMAGE_PATH_1 = Path(
    "/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/7e962a95-d661c0db-4769286c-e150a106-fb9586c6.jpg"
)
IMAGE_PATH_2 = Path(
    "/home/dgxuser16/NTL/mccarthy/ahmad/cap/dataset/images_1/s50000230/f605b192-2e612578-c5c95dc3-b9d6d13b-e0eee500.jpg"
)

SYSTEM_PROMPT = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8").strip()
INPUT_IMAGE_1 = Image.open(IMAGE_PATH_1).convert("RGB")
INPUT_IMAGE_2 = Image.open(IMAGE_PATH_2).convert("RGB")

# Model weights come from the fine-tuned checkpoint, loaded in 8-bit.
model, _ = FastVisionModel.from_pretrained(
    model_name=CHECKPOINT_PATH,
    load_in_4bit=False,
    load_in_8bit=True,
)
# Processor is loaded from the base model name, not the checkpoint dir —
# presumably the checkpoint does not ship processor files (TODO confirm).
processor = AutoProcessor.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
FastVisionModel.for_inference(model)

# The prompt is sent inside the user turn (after both images) rather than
# as a separate system message.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": INPUT_IMAGE_1},
            {"type": "image", "image": INPUT_IMAGE_2},
            {"type": "text", "text": SYSTEM_PROMPT},
        ],
    },
]

inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
device = next(model.parameters()).device
inputs = inputs.to(device)

# Synchronize around generate() so the wall-clock delta measures GPU work,
# not just kernel-launch time.
if torch.cuda.is_available():
    torch.cuda.synchronize()
start_time = time.perf_counter()
outputs = model.generate(**inputs, max_new_tokens=4096)
if torch.cuda.is_available():
    torch.cuda.synchronize()
gen_time_seconds = time.perf_counter() - start_time

# Count non-padding prompt tokens via the attention mask; fall back to the
# full input length if the processor returned no mask.
if "attention_mask" in inputs:
    input_tokens = int(inputs["attention_mask"][0].sum().item())
else:
    input_tokens = int(inputs["input_ids"].shape[-1])
total_tokens = int(outputs.shape[-1])
output_tokens = total_tokens - input_tokens

# Decode only the newly generated suffix. Special tokens are deliberately
# kept — the output is labeled "Raw" below.
generated_text = processor.decode(outputs[0][input_tokens:])

print(f"Input tokens: {input_tokens}")
print(f"Output tokens: {output_tokens}")
print(f"Total tokens: {total_tokens}")
print(f"Generation time (s): {gen_time_seconds:.3f}")
print("\n--- Raw Output ---")
print(generated_text)