from transformers import AutoProcessor, AutoModelForVision2Seq
from qwen_vl_utils import process_vision_info
import gradio as gr
from PIL import Image
import torch

# Load 72B AWQ model
model2 = AutoModelForVision2Seq.from_pretrained(
    "Qwen/Qwen2.5-VL-32B-Instruct",
    dtype=torch.float16,
    device_map="auto"
)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-32B-Instruct")

# Game rules in German
GAME_RULES = """In diesem Bild sehen Sie drei Farbraster. In der folgenden Äußerung beschreibt der Sprecher genau eines der Gitter.
Bitte geben Sie mir an, ob er sich auf das
linke, mittlere oder rechte Farbraster bezieht.
"""

# Load seven images 
IMAGE_OPTIONS = {
    "Bild 1": "example1.jpg",
    "Bild 2": "example2.jpg",
    "Bild 3": "example3.jpg",
    "Bild 4": "example4.jpg",
    "Bild 5": "example5.jpg",
    "Bild 6": "example6.jpg",
    "Bild 7": "example7.jpg",
    "Bild 8": "example8.jpg",
    "Bild 9": "example9.jpg"
}

# Function to run model
def play_game(selected_image_label, user_prompt):
    selected_image_path = IMAGE_OPTIONS[selected_image_label]
    selected_image = Image.open(selected_image_path)

    # Build messages
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": selected_image},
                {"type": "text", "text": GAME_RULES + "\n" + (user_prompt or "")},
            ],
        }
    ]

    # Prepare input using Qwen's utility function
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)  # Use Qwen utility!

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model2.device)

    # Run generation
    with torch.inference_mode():
        generated_ids = model2.generate(**inputs, max_new_tokens=512)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    return output_text

# Gradio App
with gr.Blocks() as demo:
    with gr.Column():
        image_selector = gr.Dropdown(
            choices=list(IMAGE_OPTIONS.keys()),
            value="Bild 2",  
            label="Wählen Sie ein Bild"
        )
        image_display = gr.Image(
            value=Image.open(IMAGE_OPTIONS["Bild 2"]),
            label="Bild",
            interactive=False,
            type="pil"
        )
        prompt_input = gr.Textbox(
            value="Beschreibung",
            label="Ihre Beschreibung"
        )
        output_text = gr.Textbox(label="Antwort des Modells")
        play_button = gr.Button("Spiel starten")

    def update_image(selected_label):
        selected_path = IMAGE_OPTIONS[selected_label]
        return Image.open(selected_path)

    # When user changes selection, update image
    image_selector.change(
        fn=update_image,
        inputs=[image_selector],
        outputs=image_display
    )

    # When user clicks play, send inputs to model
    play_button.click(
        fn=play_game,
        inputs=[image_selector, prompt_input],
        outputs=output_text
    )

demo.launch()