"""Gradio demo for Guardio: ASL alphabet letter recognition.

Loads the SmolVLM2 base model, attaches a fine-tuned PEFT adapter, and serves
a small Gradio UI that answers "which letter is this handshape?" for an
uploaded image.
"""

import re
import textwrap
import traceback

import gradio as gr
import torch
from peft import PeftModel
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor

BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

DEFAULT_QUESTION = (
    "What sign language letter is this image?"
)

ALLOWED_LETTERS = [chr(ord("A") + i) for i in range(26)]

# A capital A-Z that is not part of a longer word, e.g. the "B" in
# "The letter is B." (but not the "T" of "The").
_STANDALONE_LETTER_RE = re.compile(r"(?<![A-Za-z])[A-Z](?![A-Za-z])")
_ASCII_LETTER_RE = re.compile(r"[A-Za-z]")

# Dedent the template *before* interpolating: a multi-line exception message
# would otherwise defeat textwrap.dedent and garble the rendered Markdown.
_ERROR_TEMPLATE = textwrap.dedent(
    """
    🚨 **Internal error while running the model**

    **Type:** `{error_type}`

    **Message:** `{error_message}`
    """
).strip()

# Lazily-initialised singletons, populated by load_model().
processor = None
model = None


def load_model():
    """Load (once) and return the ``(processor, model)`` pair.

    The first call downloads/loads the processor and base model, attaches the
    PEFT adapter, moves the model to ``DEVICE`` and switches it to eval mode.
    Later calls return the cached objects.

    Returns:
        A ``(processor, model)`` tuple.
    """
    global processor, model

    if processor is not None and model is not None:
        return processor, model

    print("🔄 Loading processor from", BASE_MODEL_ID)
    loaded_processor = AutoProcessor.from_pretrained(
        BASE_MODEL_ID,
        trust_remote_code=True,
    )

    print("🔄 Loading base model from", BASE_MODEL_ID)
    base = AutoModelForImageTextToText.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=DTYPE,
        device_map="auto" if torch.cuda.is_available() else None,
        trust_remote_code=True,
    )

    print("🔄 Attaching PEFT adapter from", FINETUNED_MODEL_ID)
    model_peft = PeftModel.from_pretrained(
        base,
        FINETUNED_MODEL_ID,
        torch_dtype=DTYPE,
    )
    # Inputs are moved to DEVICE in guardio_predict, so keep the model there too.
    model_peft.to(DEVICE)
    model_peft.eval()
    model_peft.config.use_cache = True

    # Publish the globals only after everything succeeded, so a failed load
    # never leaves a half-initialised cache behind.
    processor, model = loaded_processor, model_peft
    print("✅ Guardio model loaded on", DEVICE)
    return processor, model


def extract_letter(raw_text: str) -> str:
    """Pick the predicted A-Z letter out of the model's textual answer.

    Resolution order:
      1. the first capital letter standing on its own ("The letter is B" -> "B");
      2. if the text contains exactly one ASCII letter, that letter upper-cased
         ("a." -> "A");
      3. the first capital letter anywhere in the text.

    Note that the pronoun "I" counts as a standalone capital, so a reply such
    as "I think it is B" yields "I".

    Args:
        raw_text: Decoded model output (may be empty or ``None``).

    Returns:
        A single upper-case letter, or ``"?"`` when none can be found.
    """
    if not raw_text:
        return "?"

    standalone = _STANDALONE_LETTER_RE.search(raw_text)
    if standalone:
        return standalone.group(0)

    letters = _ASCII_LETTER_RE.findall(raw_text)
    if len(letters) == 1:
        return letters[0].upper()

    for ch in raw_text:
        if ch in ALLOWED_LETTERS:
            return ch
    return "?"


@torch.inference_mode()
def guardio_predict(image, question: str):
    """Run the model on one image and return a Markdown answer.

    Args:
        image: A PIL image or an array accepted by ``Image.fromarray``;
            ``None`` when the user has not uploaded anything.
        question: The prompt text; blank input falls back to
            ``DEFAULT_QUESTION``.

    Returns:
        A Markdown string: the predicted letter, a "could not map" notice
        with the raw model output, or a formatted error message. Exceptions
        are caught and reported rather than raised.
    """
    try:
        if image is None:
            return "Please upload an image of an ASL handshape."

        if not question or not question.strip():
            question = DEFAULT_QUESTION

        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        if image.mode != "RGB":
            image = image.convert("RGB")

        proc, mdl = load_model()

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image"},
                ],
            }
        ]
        text = proc.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )
        inputs = proc(
            text=[text],
            images=[image],
            padding=True,
            return_tensors="pt",
        )
        # Only floating-point tensors (e.g. pixel values) may be cast to DTYPE;
        # token ids and attention masks must stay integer-typed.
        inputs = {
            key: (
                value.to(DEVICE, dtype=DTYPE)
                if torch.is_floating_point(value)
                else value.to(DEVICE)
            )
            for key, value in inputs.items()
        }
        prompt_length = inputs["input_ids"].shape[-1]

        # Deterministic beam search; no sampling parameters are passed because
        # they are ignored (and warned about) when do_sample=False.
        output_ids = mdl.generate(
            **inputs,
            max_new_tokens=8,
            do_sample=False,
            num_beams=2,
            pad_token_id=proc.tokenizer.eos_token_id,
        )

        # generate() returns prompt + continuation; decode only the newly
        # generated tokens so capitals in the echoed prompt are never
        # mistaken for the answer.
        new_token_ids = output_ids[:, prompt_length:]
        raw_text = proc.batch_decode(
            new_token_ids,
            skip_special_tokens=True,
        )[0].strip()

        letter = extract_letter(raw_text)
        if letter == "?":
            return (
                "❓ I couldn’t confidently map this to a single A–Z letter.\n\n"
                f"Raw model output: `{raw_text}`"
            )
        return f"\n\nPredicted letter: {letter}"

    except Exception as e:
        # Top-level UI boundary: log the traceback and show a readable error
        # instead of crashing the Gradio handler.
        traceback.print_exc()
        return _ERROR_TEMPLATE.format(
            error_type=type(e).__name__,
            error_message=e,
        )


def build_demo():
    """Build and return the Gradio Blocks UI (without launching it)."""
    with gr.Blocks(title="Guardio – ASL Letter Demo (HF Space)") as demo:
        gr.Markdown(
            """
            Guardio – ASL Letter Demo

            - Upload an image of a **single ASL alphabet handshape**
            - Ask: *"Which ASL alphabet letter is this image?"*
            - The model predicts a single A–Z letter.
            """
        )
        with gr.Row():
            with gr.Column():
                img = gr.Image(label="ASL handshape image", type="pil", height=320)
                q = gr.Textbox(label="Question", value=DEFAULT_QUESTION, lines=2)
                btn = gr.Button("Ask Guardio", variant="primary")
            with gr.Column():
                out = gr.Markdown(
                    label="Model answer",
                    value="Upload an image and click **Ask Guardio**.",
                )
        btn.click(fn=guardio_predict, inputs=[img, q], outputs=[out])
    return demo


demo = build_demo()

if __name__ == "__main__":
    demo.launch()