"""Gradio demo that asks a fine-tuned SmolVLM2 model which ASL letter an image shows."""

import re

import gradio as gr
import torch
from PIL import Image
from peft import PeftModel
from transformers import AutoModelForImageTextToText, AutoProcessor

# -------------------------
# CONFIG
# -------------------------
BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

DEFAULT_QUESTION = "What sign language letter is this image?"
ALLOWED_LETTERS = [chr(ord("A") + i) for i in range(26)]

# An uppercase A-Z letter that is not part of a longer alphabetic word.
_STANDALONE_LETTER = re.compile(r"(?<![A-Za-z])[A-Z](?![A-Za-z])")
# Punctuation / markdown characters that may wrap a bare one-letter answer.
_ANSWER_WRAPPERS = " \t\r\n.,:;!?\"'`*()[]"

# Lazily populated by load_model() so importing this module stays cheap.
processor = None
model = None


def load_model():
    """Load the processor and the PEFT-adapted model once and cache them.

    The first call downloads/loads the base model and the fine-tuned adapter,
    moves the result to ``DEVICE``, switches it to eval mode and stores both
    objects in the module-level ``processor`` / ``model`` globals. Later calls
    return the cached pair.

    Returns:
        A ``(processor, model)`` tuple.
    """
    global processor, model

    if processor is not None and model is not None:
        return processor, model

    processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)

    base = AutoModelForImageTextToText.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=DTYPE,
        device_map="auto" if torch.cuda.is_available() else None,
    )

    model_peft = PeftModel.from_pretrained(
        base,
        FINETUNED_MODEL_ID,
        torch_dtype=DTYPE,
    )
    model_peft.to(DEVICE)
    model_peft.eval()
    model_peft.config.use_cache = True

    model = model_peft
    return processor, model


def extract_letter(raw_text: str) -> str:
    """Pick the predicted A-Z letter out of the model's text answer.

    Resolution order:
      1. The whole answer is a single letter (any case), optionally wrapped in
         punctuation -> that letter, upper-cased.
      2. The first standalone uppercase letter (e.g. the ``B`` in
         ``"The letter is B"``).
      3. The first uppercase A-Z character anywhere in the text.

    Args:
        raw_text: Decoded model output.

    Returns:
        A single uppercase letter, or ``"?"`` when no letter can be found.
    """
    answer = raw_text.strip()

    bare = answer.strip(_ANSWER_WRAPPERS)
    if len(bare) == 1 and bare.upper() in ALLOWED_LETTERS:
        return bare.upper()

    standalone = _STANDALONE_LETTER.search(answer)
    if standalone:
        return standalone.group(0)

    # Last resort: first capital letter, even if it is inside a word.
    for ch in answer:
        if ch in ALLOWED_LETTERS:
            return ch
    return "?"


@torch.inference_mode()
def guardio_predict(image, question: str):
    """Run the model on one image/question pair and format the answer.

    Args:
        image: A PIL image, an array accepted by ``Image.fromarray``, or
            ``None`` when nothing was uploaded.
        question: Prompt text; blank or missing falls back to
            ``DEFAULT_QUESTION``.

    Returns:
        A Markdown string: the predicted letter plus the raw model output, a
        notice that no letter could be extracted, or a request to upload an
        image.
    """
    if image is None:
        return "⚠️ Please upload an image of an ASL handshape."

    if not question or not question.strip():
        question = DEFAULT_QUESTION

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    if image.mode != "RGB":
        image = image.convert("RGB")

    processor, model = load_model()

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image"},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )

    inputs = processor(
        text=[text],
        images=[[image]],
        padding=True,
        return_tensors="pt",
    )
    # Only floating-point tensors (pixel values) take the model dtype; token
    # ids and masks must keep their integer dtype or the embedding lookup
    # fails.
    inputs = {
        name: (
            tensor.to(DEVICE, dtype=DTYPE)
            if torch.is_floating_point(tensor)
            else tensor.to(DEVICE)
        )
        for name, tensor in inputs.items()
    }
    prompt_length = inputs["input_ids"].shape[1]

    # Deterministic beam search; no sampling parameters, since they are
    # ignored when do_sample=False and only produce a warning.
    output_ids = model.generate(
        **inputs,
        max_new_tokens=8,
        do_sample=False,
        num_beams=4,
        pad_token_id=processor.tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; drop the prompt so letters in
    # the question/chat template are not mistaken for the answer.
    answer_ids = output_ids[:, prompt_length:]
    raw_text = processor.batch_decode(
        answer_ids,
        skip_special_tokens=True,
    )[0].strip()

    letter = extract_letter(raw_text)

    if letter == "?":
        return (
            "❓ I couldn’t confidently map this to a single A–Z letter.\n\n"
            f"Raw model output: `{raw_text}`"
        )

    return f"🔤 **Predicted letter: {letter}**\n\n`Raw output: {raw_text}`"


with gr.Blocks(title="Guardio – ASL Letter Demo") as demo:
    gr.Markdown(
        """
        # 🧤 Guardio – ASL Letter Demo
        Upload an image of a **single ASL alphabet handshape** and ask:
        *"What sign language letter is this image?"*
        """
    )

    with gr.Row():
        with gr.Column():
            img = gr.Image(
                label="ASL handshape image",
                type="pil",
                height=320,
            )
            q = gr.Textbox(
                label="Question",
                value=DEFAULT_QUESTION,
                lines=2,
            )
            btn = gr.Button("Ask Guardio", variant="primary")

        with gr.Column():
            out = gr.Markdown(
                label="Model answer",
                value="Upload an image and click **Ask Guardio**.",
            )

    btn.click(fn=guardio_predict, inputs=[img, q], outputs=[out])


if __name__ == "__main__":
    demo.launch()