Chaste20 committed on
Commit
cb45a42
·
1 Parent(s): 74ca19a

Add Guardio ASL demo

Browse files
Files changed (2) hide show
  1. app.py +163 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ from PIL import Image
4
+ from transformers import AutoProcessor, AutoModelForImageTextToText
5
+ from peft import PeftModel
6
+ import traceback, textwrap, re
7
+
8
# Hub identifiers: the base vision-language model and the adapter attached on top of it.
BASE_MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
FINETUNED_MODEL_ID = "Chaste20/smolvlm2-asl-ql-2"

# Run on the GPU in bfloat16 when CUDA is available, otherwise on the CPU in float32.
DEVICE, DTYPE = (
    ("cuda", torch.bfloat16)
    if torch.cuda.is_available()
    else ("cpu", torch.float32)
)

# Prompt used whenever the user leaves the question box empty.
DEFAULT_QUESTION = " ".join(
    (
        "Which ASL alphabet letter is shown in this image?",
        "Answer with exactly one capital letter A–Z and nothing else.",
    )
)

# The 26 uppercase ASCII letters, in alphabetical order.
ALLOWED_LETTERS = "".join(chr(code) for code in range(ord("A"), ord("Z") + 1))

# Filled in by load_model() on first use.
processor = model = None
20
+
21
def load_model():
    """Return the shared ``(processor, model)`` pair, loading it on first use.

    The first call loads the processor and base model from ``BASE_MODEL_ID``,
    attaches the PEFT adapter from ``FINETUNED_MODEL_ID``, moves the result to
    ``DEVICE``, switches it to eval mode and enables the KV cache. Both objects
    are stored in the module-level ``processor`` / ``model`` globals, so later
    calls return immediately without reloading.
    """
    global processor, model

    if processor is None or model is None:
        print(" Loading processor from", BASE_MODEL_ID)
        processor = AutoProcessor.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)

        print(" Loading base model from", BASE_MODEL_ID)
        placement = "auto" if torch.cuda.is_available() else None
        backbone = AutoModelForImageTextToText.from_pretrained(
            BASE_MODEL_ID,
            torch_dtype=DTYPE,
            device_map=placement,
            trust_remote_code=True,
        )

        print(" Attaching PEFT adapter from", FINETUNED_MODEL_ID)
        adapted = PeftModel.from_pretrained(backbone, FINETUNED_MODEL_ID, torch_dtype=DTYPE)
        adapted.to(DEVICE)
        adapted.eval()
        adapted.config.use_cache = True

        # Publish the model only once it is fully configured.
        model = adapted
        print(" Guardio model loaded on", DEVICE)

    return processor, model
53
+
54
def extract_letter(raw_text: str) -> str:
    """Pull a single A-Z answer letter out of decoded model text.

    The LAST standalone capital letter wins, because the model's answer comes
    after any preamble. Picking the first one would latch onto the "A" of
    "A–Z" in an echoed prompt, or onto the pronoun "I". If no capital letter
    stands alone, the last capital letter anywhere in the text is used.

    Args:
        raw_text: Decoded model output; may be empty or ``None``.

    Returns:
        One uppercase ASCII letter, or ``"?"`` when the text contains none.
    """
    if not raw_text:
        return "?"

    text = raw_text.strip()

    # A capital letter that forms a whole word on its own, e.g. "B" in "Assistant: B".
    standalone = re.findall(r"\b[A-Z]\b", text)
    if standalone:
        return standalone[-1]

    # Fallback: any uppercase ASCII letter, again preferring the last one.
    capitals = re.findall(r"[A-Z]", text)
    return capitals[-1] if capitals else "?"
60
+
61
@torch.inference_mode()
def guardio_predict(image, question: str):
    """Answer an ASL-letter question about ``image`` and format it as Markdown.

    Args:
        image: A PIL image, or an array accepted by ``Image.fromarray``;
            ``None`` when nothing was uploaded.
        question: Prompt sent to the model. Blank or whitespace-only input is
            replaced by ``DEFAULT_QUESTION``.

    Returns:
        A Markdown string: the predicted letter plus the raw model output, a
        notice when no letter could be extracted, a reminder to upload an
        image, or an error report if anything raised.
    """
    try:
        if image is None:
            return " Please upload an image of an ASL handshape."

        if not question or not question.strip():
            question = DEFAULT_QUESTION

        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        if image.mode != "RGB":
            image = image.convert("RGB")

        proc, mdl = load_model()

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image"},
                ],
            }
        ]

        text = proc.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
        )

        inputs = proc(
            text=[text],
            images=[image],
            padding=True,
            return_tensors="pt",
        ).to(DEVICE)

        # Greedy decoding; sampling-only options such as `temperature` are
        # omitted because they have no effect when do_sample=False.
        output_ids = mdl.generate(
            **inputs,
            max_new_tokens=8,
            do_sample=False,
            num_beams=1,
            pad_token_id=proc.tokenizer.eos_token_id,
        )

        # generate() returns prompt tokens followed by the new tokens. Decode
        # only the continuation so the question text (which itself contains
        # "A–Z") cannot be mistaken for the answer.
        prompt_len = inputs["input_ids"].shape[1]
        new_token_ids = output_ids[:, prompt_len:]

        raw_text = proc.batch_decode(
            new_token_ids,
            skip_special_tokens=True,
        )[0].strip()

        letter = extract_letter(raw_text)

        if letter == "?":
            return (
                " I couldn’t confidently map this to a single A–Z letter.\n\n"
                f"Raw model output: `{raw_text}`"
            )

        return f" **Predicted letter: {letter}**\n\nRaw model output: `{raw_text}`"

    except Exception as e:
        # UI boundary: log the full traceback server-side and show a short
        # report instead of letting the request fail.
        traceback.print_exc()
        # Built line by line so a multi-line exception message cannot disturb
        # the layout (dedenting an already-interpolated f-string could).
        return "\n".join(
            (
                "**Internal error while running the model**",
                "",
                f"**Type:** `{type(e).__name__}`",
                f"**Message:** `{e}`",
            )
        )
134
+
135
def build_demo():
    """Assemble and return the Gradio Blocks interface for the ASL letter demo.

    The layout is an introductory Markdown panel, then a row with two columns:
    image upload, question box and submit button on the left, and the Markdown
    result on the right. Clicking the button runs ``guardio_predict`` on the
    image and question and writes its return value into the result panel.
    """
    intro_md = """
        ASL Letter Demo

        - Upload an image of a **single ASL alphabet handshape**
        - Ask: *"Which ASL alphabet letter is this image?"*
        - The model predicts a single A–Z letter.
        """

    with gr.Blocks(title="Guardio – ASL Letter Demo (HF Space)") as demo:
        gr.Markdown(intro_md)

        with gr.Row():
            # Left column: inputs and the trigger button.
            with gr.Column():
                image_input = gr.Image(label="ASL handshape image", type="pil", height=320)
                question_box = gr.Textbox(label="Question", value=DEFAULT_QUESTION, lines=2)
                ask_button = gr.Button("Ask Guardio", variant="primary")

            # Right column: where the prediction is rendered.
            with gr.Column():
                result_panel = gr.Markdown("Upload an image and click **Ask Guardio**.")

        ask_button.click(
            fn=guardio_predict,
            inputs=[image_input, question_box],
            outputs=[result_panel],
        )

    return demo
159
+
160
# Build the interface at import time so `demo` exists as a module-level name.
demo = build_demo()

if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script.
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers>=4.46.0
2
+ peft>=0.14.0
3
+ accelerate>=1.0.0
4
+ bitsandbytes
5
+ num2words
6
+ torch
7
+ gradio
8
+ Pillow