import re

import gradio as gr
import torch
from peft import PeftModel
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

# -------------------------
# CONFIG
# -------------------------
# Hub identifiers: the base checkpoint supplies the processor and base
# weights; the fine-tuned id is loaded on top of it as a PEFT adapter.
BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"

# Run on the GPU in bfloat16 when CUDA is available, otherwise on the CPU in
# float32.
DEVICE, DTYPE = (
    ("cuda", torch.bfloat16) if torch.cuda.is_available() else ("cpu", torch.float32)
)

# Question used when the user leaves the textbox empty.
DEFAULT_QUESTION = "What sign language letter is this image?"

# The 26 uppercase ASCII letters, in alphabetical order.
ALLOWED_LETTERS = [chr(code) for code in range(ord("A"), ord("Z") + 1)]

# Lazily populated by load_model() on its first call.
processor = None
model = None

def load_model():
    """Return the cached ``(processor, model)`` pair, loading it on first use.

    The processor and base weights come from ``BASE_MODEL_ID``; the PEFT
    adapter from ``FINETUNED_MODEL_ID`` is applied on top. The resulting model
    is moved to ``DEVICE``, switched to eval mode and has ``use_cache``
    enabled on its config.

    Returns:
        A ``(processor, model)`` tuple. The same objects are returned on every
        subsequent call.
    """
    global processor, model
    if processor is not None and model is not None:
        return processor, model

    loaded_processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)

    # No ``device_map`` here: the model is placed explicitly with
    # ``.to(DEVICE)`` below. Combining ``device_map="auto"`` with a later
    # ``.to()`` can conflict with accelerate's dispatch hooks when the model
    # has been split across several devices.
    base = AutoModelForImageTextToText.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=DTYPE,
    )

    model_peft = PeftModel.from_pretrained(
        base,
        FINETUNED_MODEL_ID,
        torch_dtype=DTYPE,
    )
    model_peft.to(DEVICE)
    model_peft.eval()
    model_peft.config.use_cache = True

    # Publish both globals together, only after everything loaded, so a
    # failure above never leaves a half-initialised cache behind.
    processor, model = loaded_processor, model_peft
    return processor, model

# Runs of ASCII letters and apostrophes (straight or curly), i.e. "words".
_WORD_RE = re.compile(r"[A-Za-z'\u2019]+")


def extract_letter(raw_text: str) -> str:
    """Pick the predicted A-Z letter out of the model's free-form answer.

    Candidates are tried in this order:

    1. the first stand-alone uppercase letter ("The letter is B." -> "B");
    2. the first stand-alone lowercase letter, uppercased ("b" -> "B");
    3. the first uppercase ASCII letter anywhere in the text ("Hello" -> "H").

    A letter is "stand-alone" when it forms a whole word by itself once
    surrounding apostrophes/quotes are removed, so the capital that merely
    starts a word such as "The" or "Letter" is no longer mistaken for the
    answer.

    Args:
        raw_text: Decoded model output.

    Returns:
        A single uppercase letter ``"A"``-``"Z"``, or ``"?"`` when the text
        contains no usable letter.
    """
    lowercase_candidate = None
    for word in _WORD_RE.findall(raw_text):
        # Drop quoting apostrophes so "'B'" counts as the letter B, while
        # contractions such as "it's" stay multi-character and are skipped.
        core = word.strip("'\u2019")
        if len(core) != 1:
            continue
        if core.isupper():
            return core
        if lowercase_candidate is None:
            lowercase_candidate = core.upper()

    if lowercase_candidate is not None:
        return lowercase_candidate

    # Last resort: the first capital letter anywhere in the text.
    for ch in raw_text:
        if "A" <= ch <= "Z":
            return ch
    return "?"

@torch.inference_mode()
def guardio_predict(image, question: str):
    """Answer "which ASL letter is this?" for one image.

    Args:
        image: A PIL image, an array accepted by ``Image.fromarray``, or
            ``None`` when nothing was uploaded.
        question: Prompt sent to the model alongside the image. An empty or
            whitespace-only value falls back to ``DEFAULT_QUESTION``.

    Returns:
        A Markdown string: the predicted letter plus the raw model output, a
        notice that no letter could be extracted, or a request to upload an
        image when ``image`` is ``None``.
    """
    if image is None:
        return "⚠️ Please upload an image of an ASL handshape."

    if not question or not question.strip():
        question = DEFAULT_QUESTION

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    if image.mode != "RGB":
        image = image.convert("RGB")

    processor, model = load_model()

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image"},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )

    inputs = processor(
        text=[text],
        images=[[image]],
        padding=True,
        return_tensors="pt",
    )
    # Only floating-point tensors (e.g. pixel values) take the model dtype.
    # Token ids and masks must keep their integer/bool dtype, otherwise the
    # embedding lookup fails.
    inputs = {
        k: v.to(DEVICE, dtype=DTYPE) if torch.is_floating_point(v) else v.to(DEVICE)
        for k, v in inputs.items()
    }

    # Deterministic beam search; no ``temperature`` because it is ignored
    # (and warned about) when ``do_sample=False``.
    output_ids = model.generate(
        **inputs,
        max_new_tokens=8,
        do_sample=False,
        num_beams=4,
        pad_token_id=processor.tokenizer.eos_token_id,
    )

    # ``generate`` returns the prompt followed by the continuation. Decode
    # only the newly generated tokens so the chat-template text
    # ("User: ... Assistant:") never reaches the letter extraction.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = output_ids[:, prompt_length:]

    raw_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )[0].strip()

    letter = extract_letter(raw_text)

    if letter == "?":
        return (
            "❓ I couldn’t confidently map this to a single A–Z letter.\n\n"
            f"Raw model output: `{raw_text}`"
        )

    return f"🔤 **Predicted letter: {letter}**\n\n`Raw output: {raw_text}`"

# Gradio UI: a header, an input column (image, question, button) and an
# output column that shows the Markdown answer from guardio_predict.
with gr.Blocks(title="Guardio – ASL Letter Demo") as demo:
    # Page header. The two trailing spaces after "handshape**" are a Markdown
    # hard line break and must be kept.
    gr.Markdown(
        """
        # 🧤 Guardio – ASL Letter Demo

        Upload an image of a **single ASL alphabet handshape**  
        and ask: *"What sign language letter is this image?"*
        """
    )

    with gr.Row():
        # Left column: user inputs.
        with gr.Column():
            img = gr.Image(type="pil", label="ASL handshape image", height=320)
            q = gr.Textbox(value=DEFAULT_QUESTION, label="Question", lines=2)
            btn = gr.Button("Ask Guardio", variant="primary")

        # Right column: model answer.
        with gr.Column():
            out = gr.Markdown(
                value="Upload an image and click **Ask Guardio**.",
                label="Model answer",
            )

    # Clicking the button sends (image, question) to guardio_predict and
    # renders the returned Markdown in the answer panel.
    btn.click(guardio_predict, inputs=[img, q], outputs=[out])

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()