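"""Guardio – ASL letter demo (Hugging Face Space).

Predicts a single A–Z letter from an image of an ASL alphabet handshape,
using the SmolVLM2 base model with a fine-tuned PEFT adapter.
"""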
import torch
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
from peft import PeftModel
import traceback, textwrap

BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
DEFAULT_QUESTION = "What sign language letter is this image?"
ALLOWED_LETTERS = [chr(ord("A") + i) for i in range(26)]

processor = None
model = None

def load_model():
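    """Lazily load the processor and base model + PEFT adapter; cache them in module globals."""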
    global processor, model
    if processor is not None and model is not None:
        return processor, model

    print("๐Ÿ”„ Loading processor from", BASE_MODEL_ID)
    processor = AutoProcessor.from_pretrained(
        BASE_MODEL_ID,
        trust_remote_code=True
    )

    print("๐Ÿ”„ Loading base model from", BASE_MODEL_ID)
    base = AutoModelForImageTextToText.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=DTYPE,
        device_map="auto" if torch.cuda.is_available() else None,
        trust_remote_code=True,
    )

    print("๐Ÿ”„ Attaching PEFT adapter from", FINETUNED_MODEL_ID)
    model_peft = PeftModel.from_pretrained(
        base,
        FINETUNED_MODEL_ID,
        torch_dtype=DTYPE,
    )
    if not torch.cuda.is_available():
        # device_map="auto" has already placed the model on GPU; only move it manually on CPU.
        model_peft.to(DEVICE)
    model_peft.eval()
    model_peft.config.use_cache = True

    model = model_peft
    print("โœ… Guardio model loaded on", DEVICE)
    return processor, model

def extract_letter(raw_text: str) -> str:
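    """Return the first A–Z letter found in the model's output, or "?" if none."""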
    # Uppercase first so lowercase answers like "a" still match.
    for ch in raw_text.upper():
        if ch in ALLOWED_LETTERS:
            return ch
    return "?"

@torch.inference_mode()
def guardio_predict(image, question: str):
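    """Answer a question about one ASL handshape image; returns a Markdown string."""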
    try:
        if image is None:
            return "Please upload an image of an ASL handshape."

        if not question or not question.strip():
            question = DEFAULT_QUESTION

        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        if image.mode != "RGB":
            image = image.convert("RGB")

        proc, mdl = load_model()

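        # SmolVLM2 chat format: the {"type": "image"} placeholder is matched
        # positionally with the image passed to the processor below.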
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image"},
                ],
            }
        ]

        text = proc.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )

        inputs = proc(
            text=[text],
            images=[image],
            padding=True,
            return_tensors="pt",
        )
        # Only cast floating-point tensors (pixel_values) to DTYPE;
        # input_ids and attention_mask must stay integer tensors.
        inputs = {
            k: v.to(DEVICE, dtype=DTYPE) if v.is_floating_point() else v.to(DEVICE)
            for k, v in inputs.items()
        }

        output_ids = mdl.generate(
            **inputs,
            max_new_tokens=8,
            do_sample=False,  # deterministic decoding; temperature is ignored without sampling
            num_beams=2,
            pad_token_id=proc.tokenizer.eos_token_id,
        )

        # Decode only the newly generated tokens; decoding the full sequence
        # would echo the prompt, whose capital "W" in "What ..." would be
        # picked up by extract_letter.
        generated_ids = output_ids[:, inputs["input_ids"].shape[1]:]
        raw_text = proc.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0].strip()

        letter = extract_letter(raw_text)

        if letter == "?":
            return (
                "โ“ I couldnโ€™t confidently map this to a single Aโ€“Z letter.\n\n"
                f"Raw model output: `{raw_text}`"
            )

        return f"\n\nPredicted letter: {letter}"

    except Exception as e:
        traceback.print_exc()
        msg = textwrap.dedent(f"""
        🚨 **Internal error while running the model**

        **Type:** `{type(e).__name__}`
        **Message:** `{e}`

        """).strip()
        return msg

def build_demo():
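    """Assemble the Gradio Blocks UI: image + question inputs, Markdown output."""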
    with gr.Blocks(title="Guardio – ASL Letter Demo (HF Space)") as demo:
        gr.Markdown(
            """
            # Guardio – ASL Letter Demo

            - Upload an image of a **single ASL alphabet handshape**
            - Ask: *"What sign language letter is this image?"*
            - The model predicts a single A–Z letter.
            """
        )

        with gr.Row():
            with gr.Column():
                img = gr.Image(label="ASL handshape image", type="pil", height=320)
                q = gr.Textbox(label="Question", value=DEFAULT_QUESTION, lines=2)
                btn = gr.Button("Ask Guardio", variant="primary")

            with gr.Column():
                out = gr.Markdown(
                    label="Model answer",
                    value="Upload an image and click **Ask Guardio**.",
                )

        btn.click(fn=guardio_predict, inputs=[img, q], outputs=[out])

    return demo

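# Hugging Face Spaces serves the module-level `demo`; launch() runs only for direct execution.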
demo = build_demo()

if __name__ == "__main__":
    demo.launch()