"""Gradio demo: visual question answering with Qwen2.5-VL-3B-Instruct.

Loads the model once at startup, then serves a simple Blocks UI where the
user uploads an image, asks a question, and reads the model's answer.
"""

import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

print(f"🚀 Loading model on {device} ...")
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
    trust_remote_code=True,
)
# BUG FIX: with device_map="auto", accelerate has already dispatched the
# weights, and calling .to(device) on a dispatched model raises a
# RuntimeError. Only move the model manually in the CPU path (no device_map).
if device == "cpu":
    model.to(device)
model.eval()
print("✅ Model loaded successfully!")


def ask_about_image(image, prompt):
    """Answer *prompt* about *image* with the globally loaded Qwen2.5-VL model.

    Args:
        image: PIL image from the Gradio Image component (None when missing).
        prompt: user question; may be None, empty, or whitespace-only.

    Returns:
        The assistant's answer as a plain string, or a usage hint when either
        input is missing.
    """
    if image is None or not prompt or not prompt.strip():
        return "Please upload an image and enter a question."

    # Build a multimodal turn via the official chat template.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt.strip()},
            ],
        }
    ]

    # Render the chat template as text (injects the correct image
    # placeholder tokens for this checkpoint).
    templated = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,  # adds the assistant prefix
    )

    # Important: pass lists for batched API consistency.
    inputs = processor(
        text=[templated],
        images=[image],
        return_tensors="pt",
    ).to(device)

    # Safety pads: some checkpoints ship without pad/eos in generation_config.
    if model.generation_config.pad_token_id is None and processor.tokenizer.pad_token_id is not None:
        model.generation_config.pad_token_id = processor.tokenizer.pad_token_id
    if model.generation_config.eos_token_id is None and processor.tokenizer.eos_token_id is not None:
        model.generation_config.eos_token_id = processor.tokenizer.eos_token_id

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,  # deterministic; toggle to True if you want more creative outputs
        )

    # BUG FIX: generate() returns prompt + completion. Trim the prompt tokens
    # before decoding, otherwise the chat template and the user's question are
    # echoed back inside the "answer".
    trimmed_ids = [
        out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)
    ]
    answer = processor.batch_decode(trimmed_ids, skip_special_tokens=True)[0]
    return answer.strip()


with gr.Blocks(css="""
#resp_box textarea { min-height: 96px; max-height: 180px; overflow: auto; resize: none; line-height: 1.2; white-space: pre-wrap; }
#resp_box label { margin-bottom: 4px; }
""") as demo:
    gr.Markdown("## 🧠 Qwen2.5-VL-3B — Visual Reasoning Assistant")
    with gr.Row():
        image = gr.Image(type="pil", label="Upload an Image")
        with gr.Column():
            question = gr.Textbox(
                label="Ask about this image",
                placeholder="e.g. What type of document is this? Is there a stamp or signature?",
            )
            ask_button = gr.Button("Ask")
    # Compact area showing only the assistant's answer.
    answer = gr.Textbox(
        label="Response",
        lines=4,
        interactive=False,  # user cannot edit
        elem_id="resp_box",
    )

    # The model's pure output only.
    ask_button.click(fn=ask_about_image, inputs=[image, question], outputs=[answer])

demo.launch()