Spaces:

Chaste20
/

SmolVLM_Handshape_Letter

Runtime error

App Files Files Community

Chaste20 committed on Dec 11, 2025

Commit

cb45a42

1 Parent(s): 74ca19a

Add Guardio ASL demo

Browse files

Files changed (2) hide show

app.py +163 -0
requirements.txt +8 -0

app.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import torch
+import gradio as gr
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForImageTextToText
+from peft import PeftModel
+import traceback, textwrap, re
+# Hub repository that supplies both the processor and the base model weights.
+BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
+# Hub repository of the PEFT adapter that load_model() attaches to the base model.
+FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"
+# Run on the GPU when CUDA is available, otherwise on the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# bfloat16 when CUDA is available, float32 otherwise.
+DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
+# Prompt substituted by guardio_predict() when the user leaves the question blank.
+DEFAULT_QUESTION = (
+    "Which ASL alphabet letter is shown in this image? "
+    "Answer with exactly one capital letter A–Z and nothing else."
+)
+# Characters that extract_letter() accepts as a valid prediction.
+ALLOWED_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+# Lazily populated by load_model(); both stay None until the first call.
+processor = None
+model = None
+def load_model():
+    global processor, model
+    if processor is not None and model is not None:
+        return processor, model
+    print(" Loading processor from", BASE_MODEL_ID)
+    processor = AutoProcessor.from_pretrained(
+        BASE_MODEL_ID,
+        trust_remote_code=True
+    )
+    print(" Loading base model from", BASE_MODEL_ID)
+    base = AutoModelForImageTextToText.from_pretrained(
+        BASE_MODEL_ID,
+        torch_dtype=DTYPE,
+        device_map="auto" if torch.cuda.is_available() else None,
+        trust_remote_code=True,
+    )
+    print(" Attaching PEFT adapter from", FINETUNED_MODEL_ID)
+    model_peft = PeftModel.from_pretrained(
+        base,
+        FINETUNED_MODEL_ID,
+        torch_dtype=DTYPE,
+    )
+    model_peft.to(DEVICE)
+    model_peft.eval()
+    model_peft.config.use_cache = True
+    model = model_peft
+    print(" Guardio model loaded on", DEVICE)
+    return processor, model
+def extract_letter(raw_text: str) -> str:
+    m = re.search(r"\b([A-Z])\b", raw_text.strip())
+    if m and m.group(1) in ALLOWED_LETTERS:
+        return m.group(1)
+    caps = [c for c in raw_text if c in ALLOWED_LETTERS]
+    return caps[-1] if caps else "?"
+@torch.inference_mode()
+def guardio_predict(image, question: str):
+    try:
+        if image is None:
+            return " Please upload an image of an ASL handshape."
+        if not question or not question.strip():
+            question = DEFAULT_QUESTION
+        if not isinstance(image, Image.Image):
+            image = Image.fromarray(image)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        proc, mdl = load_model()
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": question},
+                    {"type": "image"},
+                ],
+            }
+        ]
+        text = proc.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        inputs = proc(
+            text=[text],
+            images=[image],
+            padding=True,
+            return_tensors="pt",
+        ).to(DEVICE)
+        output_ids = mdl.generate(
+            **inputs,
+            max_new_tokens=8,
+            do_sample=False,
+            num_beams=1,
+            temperature=0.1,
+            pad_token_id=proc.tokenizer.eos_token_id,
+        )
+        raw_text = proc.batch_decode(
+            output_ids,
+            skip_special_tokens=True,
+        )[0].strip()
+        letter = extract_letter(raw_text)
+        if letter == "?":
+            return (
+                " I couldn’t confidently map this to a single A–Z letter.\n\n"
+                f"Raw model output: `{raw_text}`"
+            )
+        return f" **Predicted letter: {letter}**\n\nRaw model output: `{raw_text}`"
+    except Exception as e:
+        traceback.print_exc()
+        msg = textwrap.dedent(f"""
+         **Internal error while running the model**
+        **Type:** `{type(e).__name__}`
+        **Message:** `{e}`
+        """).strip()
+        return msg
+def build_demo():
+    with gr.Blocks(title="Guardio – ASL Letter Demo (HF Space)") as demo:
+        gr.Markdown(
+            """
+            ASL Letter Demo
+            - Upload an image of a **single ASL alphabet handshape**
+            - Ask: *"Which ASL alphabet letter is this image?"*
+            - The model predicts a single A–Z letter.
+            """
+        )
+        with gr.Row():
+            with gr.Column():
+                img = gr.Image(label="ASL handshape image", type="pil", height=320)
+                q = gr.Textbox(label="Question", value=DEFAULT_QUESTION, lines=2)
+                btn = gr.Button("Ask Guardio", variant="primary")
+            with gr.Column():
+                out = gr.Markdown("Upload an image and click **Ask Guardio**.")
+        btn.click(fn=guardio_predict, inputs=[img, q], outputs=[out])
+    return demo
+# Build the UI at import time so `demo` exists as a module-level attribute.
+demo = build_demo()
+# Start the Gradio server only when this file is executed as a script.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+transformers>=4.46.0
+peft>=0.14.0
+accelerate>=1.0.0
+bitsandbytes
+num2words
+torch
+gradio
+Pillow