Spaces:

Chaste20
/

SmolVLM_Handshape_Letter

Runtime error

App Files Community

Chaste20 committed on Dec 11, 2025

Commit

77a3ed6

verified ·

1 Parent(s): cb45a42

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -29

app.py CHANGED Viewed

@@ -1,19 +1,14 @@
-import torch
 import gradio as gr
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from peft import PeftModel
-import traceback, textwrap, re
-BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
-FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
-DEFAULT_QUESTION = (
-    "Which ASL alphabet letter is shown in this image? "
-    "Answer with exactly one capital letter A–Z and nothing else."
-)
-ALLOWED_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 processor = None
 model = None
@@ -23,21 +18,21 @@ def load_model():
     if processor is not None and model is not None:
         return processor, model
-    print(" Loading processor from", BASE_MODEL_ID)
     processor = AutoProcessor.from_pretrained(
         BASE_MODEL_ID,
         trust_remote_code=True
     )
-    print(" Loading base model from", BASE_MODEL_ID)
     base = AutoModelForImageTextToText.from_pretrained(
         BASE_MODEL_ID,
         torch_dtype=DTYPE,
         device_map="auto" if torch.cuda.is_available() else None,
-        trust_remote_code=True,
     )
-    print(" Attaching PEFT adapter from", FINETUNED_MODEL_ID)
     model_peft = PeftModel.from_pretrained(
         base,
         FINETUNED_MODEL_ID,
@@ -48,25 +43,27 @@ def load_model():
     model_peft.config.use_cache = True
     model = model_peft
-    print(" Guardio model loaded on", DEVICE)
     return processor, model
 def extract_letter(raw_text: str) -> str:
-    m = re.search(r"\b([A-Z])\b", raw_text.strip())
-    if m and m.group(1) in ALLOWED_LETTERS:
-        return m.group(1)
-    caps = [c for c in raw_text if c in ALLOWED_LETTERS]
-    return caps[-1] if caps else "?"
 @torch.inference_mode()
 def guardio_predict(image, question: str):
     try:
         if image is None:
-            return " Please upload an image of an ASL handshape."
         if not question or not question.strip():
             question = DEFAULT_QUESTION
         if not isinstance(image, Image.Image):
             image = Image.fromarray(image)
         if image.mode != "RGB":
@@ -84,6 +81,7 @@ def guardio_predict(image, question: str):
             }
         ]
         text = proc.apply_chat_template(
             messages,
             add_generation_prompt=True,
@@ -95,14 +93,16 @@ def guardio_predict(image, question: str):
             images=[image],
             padding=True,
             return_tensors="pt",
-        ).to(DEVICE)
         output_ids = mdl.generate(
             **inputs,
             max_new_tokens=8,
             do_sample=False,
-            num_beams=1,
             temperature=0.1,
             pad_token_id=proc.tokenizer.eos_token_id,
         )
@@ -115,23 +115,28 @@ def guardio_predict(image, question: str):
         if letter == "?":
             return (
-                " I couldn’t confidently map this to a single A–Z letter.\n\n"
                 f"Raw model output: `{raw_text}`"
             )
-        return f" **Predicted letter: {letter}**\n\nRaw model output: `{raw_text}`"
     except Exception as e:
-        traceback.print_exc()
         msg = textwrap.dedent(f"""
-         **Internal error while running the model**
         **Type:** `{type(e).__name__}`
         **Message:** `{e}`
         """).strip()
         return msg
 def build_demo():
     with gr.Blocks(title="Guardio – ASL Letter Demo (HF Space)") as demo:
         gr.Markdown(

+import traceback
+import textwrap
 import gradio as gr
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForImageTextToText
+from transformers import AutoProcessor, AutoModelForVision2Seq
 from peft import PeftModel
+import num2words
 processor = None
 model = None
     if processor is not None and model is not None:
         return processor, model
+    print("🔄 Loading processor from", BASE_MODEL_ID)
     processor = AutoProcessor.from_pretrained(
         BASE_MODEL_ID,
         trust_remote_code=True
     )
+    print("🔄 Loading base model from", BASE_MODEL_ID)
     base = AutoModelForImageTextToText.from_pretrained(
         BASE_MODEL_ID,
         torch_dtype=DTYPE,
         device_map="auto" if torch.cuda.is_available() else None,
+        trust_remote_code=True
     )
+    print("🔄 Attaching PEFT adapter from", FINETUNED_MODEL_ID)
     model_peft = PeftModel.from_pretrained(
         base,
         FINETUNED_MODEL_ID,
     model_peft.config.use_cache = True
     model = model_peft
+    print("✅ Guardio model loaded on", DEVICE)
     return processor, model
 def extract_letter(raw_text: str) -> str:
+    for ch in raw_text:
+        if ch in ALLOWED_LETTERS:
+            return ch
+    return "?"
 @torch.inference_mode()
 def guardio_predict(image, question: str):
     try:
         if image is None:
+            return "⚠️ Please upload an image of an ASL handshape."
         if not question or not question.strip():
             question = DEFAULT_QUESTION
+        # Ensure PIL image
         if not isinstance(image, Image.Image):
             image = Image.fromarray(image)
         if image.mode != "RGB":
             }
         ]
+        # chat template with <image> token
         text = proc.apply_chat_template(
             messages,
             add_generation_prompt=True,
             images=[image],
             padding=True,
             return_tensors="pt",
+        )
+        inputs = {k: v.to(DEVICE, dtype=DTYPE) for k, v in inputs.items()}
         output_ids = mdl.generate(
             **inputs,
             max_new_tokens=8,
             do_sample=False,
+            num_beams=2,
             temperature=0.1,
             pad_token_id=proc.tokenizer.eos_token_id,
         )
         if letter == "?":
             return (
+                "❓ I couldn’t confidently map this to a single A–Z letter.\n\n"
                 f"Raw model output: `{raw_text}`"
             )
+        #return f"🔤 **Predicted letter: {letter}**\n\n`Raw output: {raw_text}`"
+        return f"**\n`Raw output: {raw_text} ** "
     except Exception as e:
+        traceback.print_exc()  # show full error in Colab logs
         msg = textwrap.dedent(f"""
+        🚨 **Internal error while running the model**
         **Type:** `{type(e).__name__}`
         **Message:** `{e}`
+        Check the Colab cell output for the full traceback.
         """).strip()
         return msg
 def build_demo():
     with gr.Blocks(title="Guardio – ASL Letter Demo (HF Space)") as demo:
         gr.Markdown(