# Hugging Face Space file: Home_project / app.py
# Last commit: "Update app.py" by Chaste20 (3b08fc5, verified)
import torch
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
from peft import PeftModel
# -------------------------
# CONFIG
# -------------------------
# Base vision-language model the adapter is applied on top of.
BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
# Fine-tuned checkpoint loaded as a PEFT adapter (see load_model()).
FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"
# Prefer GPU with bfloat16 when CUDA is available; otherwise CPU/float32.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32
# Question used when the user leaves the textbox empty.
DEFAULT_QUESTION = "What sign language letter is this image?"
# The 26 uppercase letters "A".."Z" the demo maps answers onto.
ALLOWED_LETTERS = [chr(ord("A") + i) for i in range(26)]
# Lazily initialized singletons, populated on first call to load_model().
processor = None
model = None
def load_model():
    """Return the (processor, model) pair, building them once and caching in globals.

    Downloads the base SmolVLM2 checkpoint, applies the fine-tuned PEFT
    adapter, and prepares the model for inference (eval mode, KV cache on).
    Subsequent calls return the cached pair without reloading.
    """
    global processor, model

    # Fast path: both singletons already built.
    if processor is not None and model is not None:
        return processor, model

    processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)

    on_gpu = torch.cuda.is_available()
    base_model = AutoModelForImageTextToText.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=DTYPE,
        # Let HF shard/place the model automatically on GPU; plain CPU otherwise.
        device_map="auto" if on_gpu else None,
    )

    adapted = PeftModel.from_pretrained(
        base_model,
        FINETUNED_MODEL_ID,
        torch_dtype=DTYPE,
    )
    adapted.to(DEVICE)
    adapted.eval()
    # Enable the KV cache for faster autoregressive generation.
    adapted.config.use_cache = True

    model = adapted
    return processor, model
def extract_letter(raw_text: str) -> str:
    """Return the first ASCII letter in *raw_text*, uppercased, or "?" if none.

    Generalization over the original: matching is now case-insensitive, so a
    lowercase model answer such as "a" maps to "A" instead of falling through
    to "?". Behavior for uppercase input is unchanged.

    The check is restricted to ASCII alphabetic characters so that non-ASCII
    letters (e.g. "ß", whose .upper() is the two-character "SS") are skipped,
    matching the original A-Z whitelist semantics.
    """
    for ch in raw_text:
        if ch.isascii() and ch.isalpha():
            # ASCII alpha always uppercases to a single A-Z character.
            return ch.upper()
    return "?"
@torch.inference_mode()
def guardio_predict(image, question: str):
    """Answer *question* about *image* and map the model's reply to an ASL letter.

    Parameters
    ----------
    image : PIL.Image.Image | numpy.ndarray | None
        Uploaded handshape image (Gradio normally passes a PIL image; raw
        arrays are converted).
    question : str
        User question; blank input falls back to DEFAULT_QUESTION.

    Returns
    -------
    str
        Markdown-formatted prediction, or a warning/fallback message.
    """
    if image is None:
        return "⚠️ Please upload an image of an ASL handshape."
    if not question or not question.strip():
        question = DEFAULT_QUESTION

    # Normalize whatever Gradio handed us into an RGB PIL image.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    if image.mode != "RGB":
        image = image.convert("RGB")

    processor, model = load_model()

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image"},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    inputs = processor(
        text=[text],
        images=[[image]],
        padding=True,
        return_tensors="pt",
    )
    # BUG FIX: only floating-point tensors (e.g. pixel_values) may be cast to
    # DTYPE. The original cast every tensor, which silently corrupted the
    # integer input_ids / attention_mask by converting them to bfloat16.
    inputs = {
        k: v.to(DEVICE, dtype=DTYPE) if v.is_floating_point() else v.to(DEVICE)
        for k, v in inputs.items()
    }

    output_ids = model.generate(
        **inputs,
        max_new_tokens=8,
        do_sample=False,
        num_beams=4,
        # temperature removed: it is ignored (and triggers a warning) when
        # do_sample=False.
        pad_token_id=processor.tokenizer.eos_token_id,
    )

    # BUG FIX: decode only the newly generated tokens. generate() returns the
    # prompt followed by the answer, so decoding the full sequence made
    # extract_letter() pick up a letter from the question text itself
    # (e.g. "W" from "What sign language letter ..."), not the prediction.
    prompt_len = inputs["input_ids"].shape[1]
    raw_text = processor.batch_decode(
        output_ids[:, prompt_len:],
        skip_special_tokens=True,
    )[0].strip()

    letter = extract_letter(raw_text)
    if letter == "?":
        return (
            "❓ I couldn’t confidently map this to a single A–Z letter.\n\n"
            f"Raw model output: `{raw_text}`"
        )
    return f"🔤 **Predicted letter: {letter}**\n\n`Raw output: {raw_text}`"
# --- Gradio UI wiring ----------------------------------------------------
with gr.Blocks(title="Guardio – ASL Letter Demo") as demo:
    # Page header / usage instructions.
    gr.Markdown(
        """
# 🧤 Guardio – ASL Letter Demo
Upload an image of a **single ASL alphabet handshape**
and ask: *"What sign language letter is this image?"*
"""
    )
    with gr.Row():
        with gr.Column():
            # Left column: image upload, question textbox, submit button.
            img = gr.Image(
                label="ASL handshape image",
                type="pil",
                height=320,
            )
            q = gr.Textbox(
                label="Question",
                value=DEFAULT_QUESTION,
                lines=2,
            )
            btn = gr.Button("Ask Guardio", variant="primary")
        with gr.Column():
            # Right column: Markdown area showing the model's answer.
            out = gr.Markdown(
                label="Model answer",
                value="Upload an image and click **Ask Guardio**.",
            )
    # Route button clicks to the prediction function.
    btn.click(fn=guardio_predict, inputs=[img, q], outputs=[out])

if __name__ == "__main__":
    demo.launch()