KarthiEz committed
Commit d39e74e · verified · 1 Parent(s): aa797e4

Update app.py

Files changed (1):
  1. app.py +68 -39
app.py CHANGED
@@ -1,4 +1,4 @@
-# app.py — CPU-only Gradio for vikhyatk/moondream2 with resilient fallbacks
+# app.py — CPU-only Gradio for vikhyatk/moondream2 with resilient fallbacks + selectable SmolVLM
 
 from packaging import version
 import transformers
@@ -16,15 +16,17 @@ if version.parse(transformers.__version__) < version.parse(MIN_TF):
         f" pip install -U 'transformers>={MIN_TF},<5'"
     )
 
-MODEL_ID = "vikhyatk/moondream2"
+# --- Models ---
+MOONDREAM_MODEL_ID = "vikhyatk/moondream2"
 # Pin to a stable snapshot to avoid “new version downloaded” surprises.
-# If you want latest, set revision="main".
 PINNED_REV = "6b714b26eea5cbd9f31e4edb2541c170afa935ba"
 
+SMOL_MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"
+
 DEVICE = "cpu"
 DTYPE = torch.float32
 
-# ---- Bootstrap strategy ------------------------------------------------------
+# ---- Moondream bootstrap strategy -------------------------------------------
 # 1) Try image-text-to-text pipeline (preferred for Q&A)
 # 2) If it rejects the custom config, try visual-question-answering pipeline
 # 3) If that fails, load the model with trust_remote_code and call its remote methods
@@ -35,12 +37,11 @@ MODEL = None
 TOKENIZER = None
 INIT_ERR = None
 
-
 def _try_itt():
     global PIPE, MODE
     PIPE = pipeline(
         "image-text-to-text",
-        model=MODEL_ID,
+        model=MOONDREAM_MODEL_ID,
         revision=PINNED_REV,
         device=DEVICE,
         dtype=DTYPE,
@@ -49,44 +50,38 @@ def _try_itt():
     )
     MODE = "itt"
 
-
 def _try_vqa():
     global PIPE, MODE
     PIPE = pipeline(
         "visual-question-answering",
-        model=MODEL_ID,
+        model=MOONDREAM_MODEL_ID,
         revision=PINNED_REV,
         device=DEVICE,
         trust_remote_code=True,
     )
     MODE = "vqa"
 
-
 def _try_remote():
     # Some Moondream2 snapshots expose custom methods via remote code.
     global MODEL, TOKENIZER, MODE
     TOKENIZER = AutoTokenizer.from_pretrained(
-        MODEL_ID, revision=PINNED_REV, trust_remote_code=True
+        MOONDREAM_MODEL_ID, revision=PINNED_REV, trust_remote_code=True
     )
     MODEL = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID,
+        MOONDREAM_MODEL_ID,
         revision=PINNED_REV,
         trust_remote_code=True,
        torch_dtype=DTYPE,
         device_map=None,
     ).to(DEVICE)
-
-    # Heuristic: prefer dedicated helpers if present
     MODE = "remote"
 
-
 def _boot():
     global INIT_ERR
     try:
         _try_itt()
         return
     except Exception as e_itt:
-        # Fall through
         try:
             _try_vqa()
             return
@@ -104,7 +99,22 @@ def _boot():
 
 _boot()
 
+# ---- SmolVLM (CPU) pipeline --------------------------------------------------
+SMOL_PIPE = None
+SMOL_INIT_ERR = None
+try:
+    SMOL_PIPE = pipeline(
+        "image-text-to-text",
+        model=SMOL_MODEL_ID,
+        device=DEVICE,
+        dtype=DTYPE,
+        use_fast=True,
+        trust_remote_code=True,  # harmless if not needed
+    )
+except Exception as e:
+    SMOL_INIT_ERR = f"SmolVLM init failed: {e}"
 
+# ---- Shared helpers ----------------------------------------------------------
 def _normalize(out):
     """Normalize pipeline outputs to a plain string (assistant text only)."""
     if out is None:
@@ -112,25 +122,21 @@
     if isinstance(out, str):
         return out
 
-    # ITT often returns dict or list-of-dicts with 'generated_text'
     if isinstance(out, dict):
         gen = out.get("generated_text")
         if isinstance(gen, str):
             return gen
         if isinstance(gen, (list, tuple)) and gen:
-            # Look for assistant role if chat-style
             for turn in reversed(gen):
                 if isinstance(turn, dict) and turn.get("role") == "assistant":
                     c = turn.get("content")
                     return " ".join(map(str, c)) if isinstance(c, list) else str(c or "")
-            # fallback: first item
             return _normalize(gen[0])
         if isinstance(out.get("text"), str):
             return out["text"]
         return str(out)
 
     if isinstance(out, (list, tuple)) and out:
-        # VQA often returns a list of dicts with 'generated_text'/'answer'
         first = out[0]
         if isinstance(first, dict):
             if "generated_text" in first and isinstance(first["generated_text"], str):
@@ -141,25 +147,17 @@ def _normalize(out):
 
     return str(out)
 
-
 def _infer_remote(image: Image.Image, question: str) -> str:
-    """
-    Last-resort path: call remote-code helpers if present.
-    Many Moondream2 builds expose custom methods on the model; we check them dynamically.
-    """
+    """Moondream2 last-resort path via remote-code helpers."""
     if hasattr(MODEL, "encode_image") and hasattr(MODEL, "answer_question"):
-        # Preferred remote API (if exposed by repo)
         with torch.no_grad():
             img_emb = MODEL.encode_image(image.convert("RGB"))
             ans = MODEL.answer_question(img_emb, question)
         return str(ans).strip()
 
-    # Generic generate fallback using tokenizer + special tokens
-    # We try a minimal prompt that many Moondream-style repos accept.
     prompt = f"<image>\n\nQuestion: {question}\n\nAnswer:"
     with torch.no_grad():
         inputs = TOKENIZER(prompt, return_tensors="pt").to(DEVICE)
-        # Some repos require image embeds concatenated; if unsupported, we still produce text-only best effort.
         out_ids = MODEL.generate(
             **inputs,
             max_new_tokens=128,
@@ -168,8 +166,32 @@ def _infer_remote(image: Image.Image, question: str) -> str:
     out_text = TOKENIZER.batch_decode(out_ids, skip_special_tokens=True)[0]
     return out_text.strip()
 
-
-def infer(image: Image.Image, question: str) -> str:
+# ---- Inference (now with model selection) ------------------------------------
+def infer(image: Image.Image, question: str, model_choice: str) -> str:
+    if model_choice == "HuggingFaceTB/SmolVLM-500M-Instruct":
+        if SMOL_INIT_ERR:
+            return f"⚠️ {SMOL_INIT_ERR}"
+        if image is None:
+            return "Please upload an image."
+        q = (question or "").strip()
+        if not q:
+            return "Please enter a question."
+        try:
+            out = SMOL_PIPE(
+                text=[{
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image},
+                        {"type": "text", "text": q},
+                    ],
+                }],
+                max_new_tokens=128,
+            )
+        except Exception:
+            out = SMOL_PIPE({"images": [image], "text": q}, max_new_tokens=128)
+        return _normalize(out).strip() or "(empty response)"
+
+    # Default path: Moondream2 (unchanged logic)
     if INIT_ERR:
         return f"⚠️ Init error:\n{INIT_ERR}"
     if image is None:
@@ -180,7 +202,6 @@ def infer(image: Image.Image, question: str) -> str:
 
     try:
         if MODE == "itt":
-            # ITT prefers chat-format; falls back to dict if needed
             try:
                 out = PIPE(
                     text=[{
@@ -197,7 +218,6 @@ def infer(image: Image.Image, question: str) -> str:
             return _normalize(out).strip() or "(empty response)"
 
         if MODE == "vqa":
-            # Standard VQA signature
             out = PIPE(image=image, question=q)
             return _normalize(out).strip() or "(empty response)"
 
@@ -208,24 +228,33 @@ def infer(image: Image.Image, question: str) -> str:
     except Exception as e:
         return f"⚠️ Inference error: {e}"
 
-
 # ---- Gradio UI ---------------------------------------------------------------
-with gr.Blocks(title="Moondream2 — CPU Vision Q&A") as demo:
-    gr.Markdown("## 🌙 Moondream2 — CPU Vision Q&A\n"
-                "Upload an image, ask a question. App auto-negotiates the best working path.")
+with gr.Blocks(title="CPU Vision Q&A") as demo:
+    gr.Markdown("## 🌙 Moondream2 & 🐣 SmolVLM — CPU Vision Q&A\n"
+                "Upload an image, ask a question, and pick your model.")
 
+    # Show Moondream init status (kept from your original app)
     if INIT_ERR:
-        gr.Markdown(f"**Startup status:** `{INIT_ERR}`")
+        gr.Markdown(f"**Moondream startup status:** `{INIT_ERR}`")
+    if SMOL_INIT_ERR:
+        gr.Markdown(f"**SmolVLM startup status:** `{SMOL_INIT_ERR}`")
 
     with gr.Row():
         img = gr.Image(type="pil", label="Upload an image")
         with gr.Column():
+            # NEW: model selector (default = Moondream2) — minimal surface change
+            model_choice = gr.Dropdown(
+                choices=[MOONDREAM_MODEL_ID, SMOL_MODEL_ID],
+                value=MOONDREAM_MODEL_ID,
+                label="Model",
+            )
             prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?")
             btn = gr.Button("Ask")
            ans = gr.TextArea(label="Answer", lines=6)
 
-    btn.click(infer, [img, prompt], ans)
-    prompt.submit(infer, [img, prompt], ans)
+    # Wire the new dropdown into the call; everything else is unchanged
+    btn.click(infer, [img, prompt, model_choice], ans)
+    prompt.submit(infer, [img, prompt, model_choice], ans)
 
 if __name__ == "__main__":
     demo.queue().launch(debug=True)
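
Reviewer note: to exercise the new three-argument infer() without launching the Gradio UI, a minimal smoke test could look like the sketch below. It assumes app.py is importable from the working directory and that both checkpoints can be downloaded; the file name smoke_test.py and the blank placeholder image are illustrative, not part of this commit.

    # smoke_test.py — hypothetical local check of the model selector (not part of the commit)
    from PIL import Image

    import app  # importing runs _boot() and the SmolVLM init; launch() stays behind __main__

    img = Image.new("RGB", (224, 224), "white")  # placeholder image
    question = "What color is this image?"

    # The third argument selects the branch inside infer(); both IDs are module constants.
    for choice in (app.MOONDREAM_MODEL_ID, app.SMOL_MODEL_ID):
        print(choice, "->", app.infer(img, question, choice))

Either branch returns a plain string (an answer, a validation prompt, or a prefixed init/inference error), so the loop stays safe even when one model fails to initialize.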