import torch
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
from peft import PeftModel
# -------------------------
# CONFIG
# -------------------------
# Base vision-language model and the LoRA adapter fine-tuned for ASL letters.
BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"
# Prefer GPU + bfloat16 when CUDA is available; fall back to CPU float32.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
# Prompt used when the user leaves the question box empty.
DEFAULT_QUESTION = "What sign language letter is this image?"
# Uppercase ASCII alphabet "A".."Z" — the only accepted predictions.
ALLOWED_LETTERS = [chr(ord("A") + i) for i in range(26)]
# Lazy-loaded singletons populated by load_model() on first request.
processor = None
model = None
def load_model():
    """Lazily load the processor and the LoRA-finetuned SmolVLM2 model.

    The pair is cached in the module-level ``processor``/``model`` globals
    so repeated Gradio requests reuse the already-loaded weights.

    Returns:
        tuple: ``(processor, model)`` ready for inference on ``DEVICE``.
    """
    global processor, model
    # Fast path: already loaded on a previous request.
    if processor is not None and model is not None:
        return processor, model

    processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)

    use_device_map = torch.cuda.is_available()
    base = AutoModelForImageTextToText.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=DTYPE,
        device_map="auto" if use_device_map else None,
    )
    model_peft = PeftModel.from_pretrained(
        base,
        FINETUNED_MODEL_ID,
        torch_dtype=DTYPE,
    )
    # BUG FIX: with device_map="auto", accelerate has already dispatched the
    # weights; calling .to() on a dispatched model can raise or misplace
    # weights. Only move the model explicitly in the CPU (no device_map) case.
    if not use_device_map:
        model_peft.to(DEVICE)
    model_peft.eval()
    # Enable the KV cache for faster autoregressive generation.
    model_peft.config.use_cache = True

    model = model_peft
    return processor, model
def extract_letter(raw_text: str) -> str:
    """Return the first uppercase ASCII letter (A-Z) found in *raw_text*.

    Args:
        raw_text: Decoded model output (may contain extra words/punctuation).

    Returns:
        The first character in ``"A".."Z"``, or ``"?"`` when none is present.
    """
    # Direct range comparison instead of scanning the 26-element
    # ALLOWED_LETTERS list per character. Behavior is identical because
    # ALLOWED_LETTERS is exactly the uppercase ASCII alphabet, and this
    # removes the coupling to the module-level global.
    for ch in raw_text:
        if "A" <= ch <= "Z":
            return ch
    return "?"
@torch.inference_mode()
def guardio_predict(image, question: str):
    """Answer *question* about an uploaded ASL handshape image.

    Args:
        image: PIL image or numpy array from the Gradio Image component
            (``None`` when nothing was uploaded).
        question: Free-text question; falls back to DEFAULT_QUESTION when
            empty or whitespace-only.

    Returns:
        A Markdown string with the predicted A-Z letter, or an error /
        fallback message.
    """
    if image is None:
        return "⚠️ Please upload an image of an ASL handshape."
    if not question or not question.strip():
        question = DEFAULT_QUESTION

    # Normalize input to an RGB PIL image regardless of what Gradio hands us.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    if image.mode != "RGB":
        image = image.convert("RGB")

    processor, model = load_model()

    # Chat-template message: the {"type": "image"} placeholder is filled by
    # the image passed to the processor below.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image"},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = processor(
        text=[text],
        images=[[image]],
        padding=True,
        return_tensors="pt",
    )
    # BUG FIX: the original cast *every* tensor — including input_ids and
    # attention_mask — to DTYPE, producing bfloat16 token ids that break
    # generation. Cast only floating-point tensors (pixel_values) to DTYPE;
    # integer tensors just move to DEVICE.
    inputs = {
        k: v.to(DEVICE, dtype=DTYPE) if v.is_floating_point() else v.to(DEVICE)
        for k, v in inputs.items()
    }

    output_ids = model.generate(
        **inputs,
        max_new_tokens=8,
        # Deterministic beam search; temperature is ignored when
        # do_sample=False, so it is intentionally omitted.
        do_sample=False,
        num_beams=4,
        pad_token_id=processor.tokenizer.eos_token_id,
    )
    raw_text = processor.batch_decode(
        output_ids,
        skip_special_tokens=True,
    )[0].strip()

    letter = extract_letter(raw_text)
    if letter == "?":
        return (
            "❓ I couldn’t confidently map this to a single A–Z letter.\n\n"
            f"Raw model output: `{raw_text}`"
        )
    return f"🔤 **Predicted letter: {letter}**\n\n`Raw output: {raw_text}`"
# UI wiring: statement order inside the Blocks context determines layout,
# and HF Spaces looks up the module-level `demo` object by name.
with gr.Blocks(title="Guardio – ASL Letter Demo") as demo:
    gr.Markdown(
        """
        # 🧤 Guardio – ASL Letter Demo
        Upload an image of a **single ASL alphabet handshape**
        and ask: *"What sign language letter is this image?"*
        """
    )
    with gr.Row():
        # Left column: inputs (image, question, submit button).
        with gr.Column():
            img = gr.Image(
                label="ASL handshape image",
                type="pil",
                height=320,
            )
            q = gr.Textbox(
                label="Question",
                value=DEFAULT_QUESTION,
                lines=2,
            )
            btn = gr.Button("Ask Guardio", variant="primary")
        # Right column: Markdown output for the prediction.
        with gr.Column():
            out = gr.Markdown(
                label="Model answer",
                value="Upload an image and click **Ask Guardio**.",
            )
    btn.click(fn=guardio_predict, inputs=[img, q], outputs=[out])
if __name__ == "__main__":
    demo.launch()