# Hugging Face Space: Guardio – ASL letter demo.
# NOTE(review): the Space page showed "Build error" — see fixes in guardio_predict.
import torch
from PIL import Image

import gradio as gr
from peft import PeftModel
from transformers import AutoProcessor, AutoModelForImageTextToText
# -------------------------
# CONFIG
# -------------------------
BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"

# Prefer GPU + bfloat16 when available; fall back to CPU + float32.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32

DEFAULT_QUESTION = "What sign language letter is this image?"
# The closed label set: the 26 uppercase ASL alphabet letters "A".."Z".
ALLOWED_LETTERS = [chr(ord("A") + i) for i in range(26)]

# Lazily-initialized singletons, populated on first call to load_model().
processor = None
model = None
def load_model():
    """Lazily load the processor and the LoRA-adapted model (cached in globals).

    Returns:
        (processor, model): the `AutoProcessor` and the `PeftModel` wrapping the
        base SmolVLM2 checkpoint, in eval mode with KV-cache enabled. Repeated
        calls return the cached pair without reloading.
    """
    global processor, model
    if processor is not None and model is not None:
        return processor, model

    use_cuda = torch.cuda.is_available()
    processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)
    base = AutoModelForImageTextToText.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=DTYPE,
        device_map="auto" if use_cuda else None,
    )
    model_peft = PeftModel.from_pretrained(
        base,
        FINETUNED_MODEL_ID,
        torch_dtype=DTYPE,
    )
    # With device_map="auto", accelerate has already placed the weights;
    # calling .to() again can conflict with dispatched/offloaded modules,
    # so only move the model explicitly on the CPU path.
    if not use_cuda:
        model_peft.to(DEVICE)
    model_peft.eval()
    model_peft.config.use_cache = True  # speed up autoregressive decoding
    model = model_peft
    return processor, model
def extract_letter(raw_text: str) -> str:
    """Return the first uppercase A-Z character in *raw_text*, or "?" if none."""
    return next((ch for ch in raw_text if "A" <= ch <= "Z"), "?")
def guardio_predict(image, question: str):
    """Predict which ASL alphabet letter *image* shows.

    Args:
        image: PIL image or numpy array from the Gradio Image component;
            ``None`` when the user has not uploaded anything.
        question: free-text question; falls back to DEFAULT_QUESTION when blank.

    Returns:
        A markdown string with the predicted letter, or a warning / fallback
        message when no image is given or no single letter can be extracted.
    """
    if image is None:
        return "⚠️ Please upload an image of an ASL handshape."
    if not question or not question.strip():
        question = DEFAULT_QUESTION

    # Normalize the input to an RGB PIL image.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    if image.mode != "RGB":
        image = image.convert("RGB")

    processor, model = load_model()

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image"},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = processor(
        text=[text],
        images=[[image]],
        padding=True,
        return_tensors="pt",
    )
    # BUG FIX: cast only floating-point tensors (pixel_values) to DTYPE.
    # The previous blanket `.to(DEVICE, dtype=DTYPE)` also cast input_ids /
    # attention_mask to bfloat16, corrupting the token ids and breaking
    # generate().
    inputs = {
        k: v.to(DEVICE, dtype=DTYPE) if v.is_floating_point() else v.to(DEVICE)
        for k, v in inputs.items()
    }

    # Greedy beam search; temperature is meaningless with do_sample=False,
    # so it is omitted (transformers warns about unused sampling flags).
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=8,
            do_sample=False,
            num_beams=4,
            pad_token_id=processor.tokenizer.eos_token_id,
        )

    # BUG FIX: generate() returns prompt + completion. Decoding the full
    # sequence made extract_letter() pick a capital letter out of the prompt
    # (e.g. the "W" in "What ..."), so decode only the newly generated tokens.
    prompt_len = inputs["input_ids"].shape[1]
    raw_text = processor.batch_decode(
        output_ids[:, prompt_len:],
        skip_special_tokens=True,
    )[0].strip()

    letter = extract_letter(raw_text)
    if letter == "?":
        return (
            "❓ I couldn’t confidently map this to a single A–Z letter.\n\n"
            f"Raw model output: `{raw_text}`"
        )
    return f"🔤 **Predicted letter: {letter}**\n\n`Raw output: {raw_text}`"
# --- Gradio UI: image + question on the left, model answer on the right. ---
with gr.Blocks(title="Guardio – ASL Letter Demo") as demo:
    # Markdown literal kept unindented: leading spaces would render as a
    # markdown code block.
    gr.Markdown(
        """
# 🧤 Guardio – ASL Letter Demo
Upload an image of a **single ASL alphabet handshape**
and ask: *"What sign language letter is this image?"*
"""
    )
    with gr.Row():
        with gr.Column():
            img = gr.Image(
                label="ASL handshape image",
                type="pil",
                height=320,
            )
            q = gr.Textbox(
                label="Question",
                value=DEFAULT_QUESTION,
                lines=2,
            )
            btn = gr.Button("Ask Guardio", variant="primary")
        with gr.Column():
            out = gr.Markdown(
                label="Model answer",
                value="Upload an image and click **Ask Guardio**.",
            )
    btn.click(fn=guardio_predict, inputs=[img, q], outputs=[out])

if __name__ == "__main__":
    demo.launch()