File size: 3,906 Bytes
8b22f1d
 
20ef925
 
 
 
 
 
 
 
3b08fc5
20ef925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b22f1d
20ef925
 
 
 
 
 
 
 
8b22f1d
20ef925
 
8b22f1d
20ef925
 
 
 
 
004f850
20ef925
 
8b22f1d
20ef925
8b22f1d
20ef925
 
8b22f1d
20ef925
 
8b22f1d
 
 
20ef925
 
8b22f1d
 
 
 
 
 
 
20ef925
8b22f1d
 
 
 
 
 
 
 
 
 
 
 
20ef925
 
 
 
 
 
 
 
 
 
 
8b22f1d
 
20ef925
 
 
 
8b22f1d
20ef925
004f850
20ef925
 
 
 
8b22f1d
 
20ef925
 
 
 
 
 
8b22f1d
20ef925
 
 
 
8b22f1d
20ef925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
004f850
8b22f1d
004f850
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import torch
import gradio as gr
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
from peft import PeftModel

# -------------------------
# CONFIG
# -------------------------
# Identifier of the base vision-language checkpoint and of the PEFT adapter
# that load_model() applies on top of it.
BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"

# Use the GPU with bfloat16 when CUDA is available, else the CPU with float32.
_HAS_CUDA = torch.cuda.is_available()
DEVICE = "cuda" if _HAS_CUDA else "cpu"
DTYPE = torch.bfloat16 if _HAS_CUDA else torch.float32

# Prompt substituted when the user submits an empty question.
DEFAULT_QUESTION = "What sign language letter is this image?"
# The 26 upper-case letters "A" through "Z".
ALLOWED_LETTERS = [chr(code) for code in range(ord("A"), ord("Z") + 1)]

# Filled in on first use by load_model(); None means "not loaded yet".
processor = model = None

def load_model():
    """Return the ``(processor, model)`` pair, loading it on first use.

    The pair is cached in the module-level ``processor`` and ``model``
    globals. When both are already set they are returned as-is; otherwise
    the processor and base model are loaded from ``BASE_MODEL_ID``, the
    adapter from ``FINETUNED_MODEL_ID`` is applied, and the result is moved
    to ``DEVICE`` and switched to eval mode before being cached.
    """
    global processor, model

    needs_loading = processor is None or model is None
    if needs_loading:
        processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)

        # Let the library place the weights only when a GPU is present.
        placement = "auto" if torch.cuda.is_available() else None
        backbone = AutoModelForImageTextToText.from_pretrained(
            BASE_MODEL_ID, torch_dtype=DTYPE, device_map=placement
        )

        adapted = PeftModel.from_pretrained(
            backbone, FINETUNED_MODEL_ID, torch_dtype=DTYPE
        )
        adapted.to(DEVICE)
        adapted.eval()
        adapted.config.use_cache = True

        # Publish the model last, once it is fully prepared.
        model = adapted

    return processor, model

def extract_letter(raw_text: str) -> str:
    """Pull a single A-Z letter out of the model's free-form reply.

    The reply is inspected in three passes, from most to least specific:

    1. If the whole reply, ignoring surrounding punctuation and whitespace,
       is one letter, that letter is returned upper-cased (so ``"b"`` and
       ``"'B'."`` both give ``"B"``).
    2. Otherwise the first whitespace-separated token that is a lone
       capital letter is returned (so ``"The letter is B"`` gives ``"B"``
       rather than the ``"T"`` of ``"The"``).
    3. Otherwise the first character found in ``ALLOWED_LETTERS`` anywhere
       in the text is returned.

    Args:
        raw_text: Decoded model output; may be empty.

    Returns:
        One character from ``ALLOWED_LETTERS``, or ``"?"`` when the text
        contains no usable letter.
    """
    if not raw_text:
        return "?"

    # Characters that commonly wrap a one-letter answer: "B.", "'B'", "**B**".
    wrappers = " \t\r\n.,:;!?'\"`*()[]{}<>-"

    # Pass 1: the entire reply is a single letter, in either case.
    whole = raw_text.strip(wrappers)
    if len(whole) == 1 and whole.upper() in ALLOWED_LETTERS:
        return whole.upper()

    # Pass 2: a standalone capital letter inside a longer reply. Lower-case
    # single letters are skipped here because "a" is an ordinary word.
    for token in raw_text.split():
        word = token.strip(wrappers)
        if len(word) == 1 and word in ALLOWED_LETTERS:
            return word

    # Pass 3: fall back to the first capital letter anywhere in the text.
    return next((ch for ch in raw_text if ch in ALLOWED_LETTERS), "?")

@torch.inference_mode()
def guardio_predict(image, question: str):
    """Predict the ASL letter shown in *image* and format it as Markdown.

    Args:
        image: The uploaded picture, either a ``PIL.Image.Image`` or an
            array accepted by ``Image.fromarray``; ``None`` when nothing
            was uploaded.
        question: Prompt sent to the model together with the image. Blank
            or missing text falls back to ``DEFAULT_QUESTION``.

    Returns:
        A Markdown string: a warning when no image was given, the predicted
        letter plus the raw model text on success, or a fallback message
        with the raw model text when no A-Z letter could be extracted.
    """
    if image is None:
        return "⚠️ Please upload an image of an ASL handshape."

    if not question or not question.strip():
        question = DEFAULT_QUESTION

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    if image.mode != "RGB":
        image = image.convert("RGB")

    processor, model = load_model()

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image"},
            ],
        }
    ]

    text = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )

    inputs = processor(
        text=[text],
        images=[[image]],
        padding=True,
        return_tensors="pt",
    )
    # Move every tensor to the target device, but cast only floating-point
    # tensors to DTYPE. Token ids and masks must keep their integer/boolean
    # dtype; casting them to a float type breaks the embedding lookup.
    inputs = {
        name: (
            tensor.to(DEVICE, dtype=DTYPE)
            if torch.is_floating_point(tensor)
            else tensor.to(DEVICE)
        )
        for name, tensor in inputs.items()
    }
    prompt_length = inputs["input_ids"].shape[1]

    # Deterministic beam search; no sampling parameters are passed because
    # they are unused when do_sample is False.
    output_ids = model.generate(
        **inputs,
        max_new_tokens=8,
        do_sample=False,
        num_beams=4,
        pad_token_id=processor.tokenizer.eos_token_id,
    )

    # generate() returns the prompt tokens followed by the continuation.
    # Decode only the continuation so the echoed prompt cannot be mistaken
    # for the model's answer.
    generated_ids = output_ids[:, prompt_length:]
    raw_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )[0].strip()

    letter = extract_letter(raw_text)

    if letter == "?":
        return (
            "❓ I couldn’t confidently map this to a single A–Z letter.\n\n"
            f"Raw model output: `{raw_text}`"
        )

    return f"🔤 **Predicted letter: {letter}**\n\n`Raw output: {raw_text}`"

# Page layout: a header, then one row with two columns (inputs, then answer).
with gr.Blocks(title="Guardio – ASL Letter Demo") as demo:
    gr.Markdown(
        """
        # 🧤 Guardio – ASL Letter Demo

        Upload an image of a **single ASL alphabet handshape**  
        and ask: *"What sign language letter is this image?"*
        """
    )

    with gr.Row():
        # First column: image upload, editable question, submit button.
        with gr.Column():
            image_input = gr.Image(label="ASL handshape image", type="pil", height=320)
            question_box = gr.Textbox(label="Question", value=DEFAULT_QUESTION, lines=2)
            ask_button = gr.Button("Ask Guardio", variant="primary")

        # Second column: Markdown panel that shows the prediction.
        with gr.Column():
            answer_panel = gr.Markdown(
                label="Model answer",
                value="Upload an image and click **Ask Guardio**.",
            )

    # Clicking the button runs the prediction and renders its Markdown result.
    ask_button.click(
        fn=guardio_predict,
        inputs=[image_input, question_box],
        outputs=[answer_panel],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()