KarthiEz committed on
Commit aa797e4 · verified · 1 Parent(s): 5087f0e

Update app.py

Files changed (1): app.py +191 -68
app.py CHANGED
@@ -1,108 +1,231 @@
-# app.py — CPU-only image→text QA via Transformers pipeline + Gradio

 from packaging import version
 import transformers
-from transformers import pipeline
 import torch
 import gradio as gr
 from PIL import Image

-# ---- Governance: ensure pipeline task is supported ----
-MIN_TF = "4.46.0"
 if version.parse(transformers.__version__) < version.parse(MIN_TF):
     raise RuntimeError(
-        f"Transformers >= {MIN_TF} required for 'image-text-to-text'. "
         f"Found {transformers.__version__}. Upgrade:\n"
         f"  pip install -U 'transformers>={MIN_TF},<5'"
     )

-# -------- Choose a CPU-friendly model here --------
-# MODEL_ID = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
 MODEL_ID = "vikhyatk/moondream2"
-# MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"  # example tiny option

-# ---- Force CPU posture ----
 DEVICE = "cpu"
-DTYPE = torch.float32  # CPU-safe
-
-# ---- Optional: torchvision is used by some processors (e.g., OneVision) ----
-try:
-    import torchvision  # noqa: F401
-except Exception:
-    pass  # If your chosen model needs it, install torchvision
-
-# ---- Bootstrap pipeline (CPU only) ----
-pipe = pipeline(
-    "image-text-to-text",
-    model="vikhyatk/moondream2",
-    trust_remote_code=True,
-    revision="6b714b26eea5cbd9f31e4edb2541c170afa935ba",  # pin to a known commit
-    device="cpu",
-    dtype=torch.float32,
-    use_fast=True,
-)
-
-def _extract_text(obj):
-    """Normalize pipeline outputs to plain text (handles chat-style payloads)."""
-    if obj is None:
         return ""
-    if isinstance(obj, str):
-        return obj
-    if isinstance(obj, dict):
-        gen = obj.get("generated_text")
         if isinstance(gen, str):
             return gen
         if isinstance(gen, (list, tuple)) and gen:
-            # Prefer assistant turns if present
             for turn in reversed(gen):
                 if isinstance(turn, dict) and turn.get("role") == "assistant":
-                    content = turn.get("content")
-                    return " ".join(map(str, content)) if isinstance(content, list) else str(content or "")
-            return _extract_text(gen[0])
-        if "text" in obj and isinstance(obj["text"], str):
-            return obj["text"]
-        return str(obj)
-    if isinstance(obj, (list, tuple)) and obj:
-        return _extract_text(obj[0])
-    return str(obj)

 def infer(image: Image.Image, question: str) -> str:
     if image is None:
         return "Please upload an image."
     q = (question or "").strip()
     if not q:
         return "Please enter a question."

-    # Preferred: chat-style messages (auto-injects image tokens correctly)
     try:
-        out = pipe(
-            text=[{
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": q},
-                ],
-            }],
-            max_new_tokens=96,
-        )
-    except Exception:
-        # Fallback: dict API — ensure a LIST for images
-        out = pipe({"images": [image], "text": q}, max_new_tokens=96)

-    return _extract_text(out).strip() or "(empty response)"

-# ---- Gradio UI ----
-with gr.Blocks(title="CPU-only Vision QA") as demo:
-    gr.Markdown("## 🧠🖼️ CPU-only Vision Q&A\nDrop an image, ask a question. Runs entirely on CPU.")
     with gr.Row():
         img = gr.Image(type="pil", label="Upload an image")
         with gr.Column():
-            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?", lines=2)
-            submit = gr.Button("Ask")
-    out = gr.TextArea(label="Answer", lines=6)

-    submit.click(infer, [img, prompt], out)
-    prompt.submit(infer, [img, prompt], out)

 if __name__ == "__main__":
     demo.queue().launch(debug=True)
 
+# app.py — CPU-only Gradio for vikhyatk/moondream2 with resilient fallbacks

 from packaging import version
 import transformers
 import torch
 import gradio as gr
 from PIL import Image

+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+
+MIN_TF = "4.51.0"  # newer Transformers versions are friendlier to custom multimodal configs
 if version.parse(transformers.__version__) < version.parse(MIN_TF):
     raise RuntimeError(
+        f"Transformers >= {MIN_TF} required for Moondream2. "
         f"Found {transformers.__version__}. Upgrade:\n"
         f"  pip install -U 'transformers>={MIN_TF},<5'"
     )

 MODEL_ID = "vikhyatk/moondream2"
+# Pin to a stable snapshot to avoid "new version downloaded" surprises.
+# If you want latest, set revision="main".
+PINNED_REV = "6b714b26eea5cbd9f31e4edb2541c170afa935ba"

 DEVICE = "cpu"
+DTYPE = torch.float32
+
+# ---- Bootstrap strategy ------------------------------------------------------
+# 1) Try the image-text-to-text pipeline (preferred for Q&A)
+# 2) If it rejects the custom config, try the visual-question-answering pipeline
+# 3) If that fails, load the model with trust_remote_code and call its remote methods
+
+PIPE = None
+MODE = None  # "itt" | "vqa" | "remote"
+MODEL = None
+TOKENIZER = None
+INIT_ERR = None
+
+
+def _try_itt():
+    global PIPE, MODE
+    PIPE = pipeline(
+        "image-text-to-text",
+        model=MODEL_ID,
+        revision=PINNED_REV,
+        device=DEVICE,
+        torch_dtype=DTYPE,
+        trust_remote_code=True,
+        use_fast=True,
+    )
+    MODE = "itt"
+
+
+def _try_vqa():
+    global PIPE, MODE
+    PIPE = pipeline(
+        "visual-question-answering",
+        model=MODEL_ID,
+        revision=PINNED_REV,
+        device=DEVICE,
+        trust_remote_code=True,
+    )
+    MODE = "vqa"
+
+
+def _try_remote():
+    # Some Moondream2 snapshots expose custom methods via remote code.
+    global MODEL, TOKENIZER, MODE
+    TOKENIZER = AutoTokenizer.from_pretrained(
+        MODEL_ID, revision=PINNED_REV, trust_remote_code=True
+    )
+    MODEL = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        revision=PINNED_REV,
+        trust_remote_code=True,
+        torch_dtype=DTYPE,
+        device_map=None,
+    ).to(DEVICE)
+
+    # Heuristic: prefer dedicated helpers if present
+    MODE = "remote"
+
+
+def _boot():
+    global INIT_ERR
+    try:
+        _try_itt()
+        return
+    except Exception as e_itt:
+        # Fall through
+        try:
+            _try_vqa()
+            return
+        except Exception as e_vqa:
+            try:
+                _try_remote()
+                return
+            except Exception as e_remote:
+                INIT_ERR = (
+                    "Moondream2 initialization failed.\n\n"
+                    f"ITT error: {e_itt}\n\n"
+                    f"VQA error: {e_vqa}\n\n"
+                    f"Remote error: {e_remote}"
+                )
+
+
+_boot()
+
+
+def _normalize(out):
+    """Normalize pipeline outputs to a plain string (assistant text only)."""
+    if out is None:
         return ""
+    if isinstance(out, str):
+        return out
+
+    # ITT often returns dict or list-of-dicts with 'generated_text'
+    if isinstance(out, dict):
+        gen = out.get("generated_text")
         if isinstance(gen, str):
             return gen
         if isinstance(gen, (list, tuple)) and gen:
+            # Look for assistant role if chat-style
             for turn in reversed(gen):
                 if isinstance(turn, dict) and turn.get("role") == "assistant":
+                    c = turn.get("content")
+                    return " ".join(map(str, c)) if isinstance(c, list) else str(c or "")
+            # fallback: first item
+            return _normalize(gen[0])
+        if isinstance(out.get("text"), str):
+            return out["text"]
+        return str(out)
+
+    if isinstance(out, (list, tuple)) and out:
+        # VQA often returns a list of dicts with 'generated_text'/'answer'
+        first = out[0]
+        if isinstance(first, dict):
+            if "generated_text" in first and isinstance(first["generated_text"], str):
+                return first["generated_text"]
+            if "answer" in first and isinstance(first["answer"], str):
+                return first["answer"]
+        return _normalize(first)
+
+    return str(out)
+
+
+def _infer_remote(image: Image.Image, question: str) -> str:
+    """
+    Last-resort path: call remote-code helpers if present.
+    Many Moondream2 builds expose custom methods on the model; we check them dynamically.
+    """
+    if hasattr(MODEL, "encode_image") and hasattr(MODEL, "answer_question"):
+        # Preferred remote API (if exposed by repo)
+        with torch.no_grad():
+            img_emb = MODEL.encode_image(image.convert("RGB"))
+            ans = MODEL.answer_question(img_emb, question, TOKENIZER)
+        return str(ans).strip()
+
+    # Generic generate fallback using tokenizer + special tokens
+    # We try a minimal prompt that many Moondream-style repos accept.
+    prompt = f"<image>\n\nQuestion: {question}\n\nAnswer:"
+    with torch.no_grad():
+        inputs = TOKENIZER(prompt, return_tensors="pt").to(DEVICE)
+        # Some repos require image embeds concatenated; if unsupported, we still produce text-only best effort.
+        out_ids = MODEL.generate(
+            **inputs,
+            max_new_tokens=128,
+            pad_token_id=TOKENIZER.eos_token_id,
+        )
+    out_text = TOKENIZER.batch_decode(out_ids, skip_special_tokens=True)[0]
+    return out_text.strip()
+

 def infer(image: Image.Image, question: str) -> str:
+    if INIT_ERR:
+        return f"⚠️ Init error:\n{INIT_ERR}"
     if image is None:
         return "Please upload an image."
     q = (question or "").strip()
     if not q:
         return "Please enter a question."

     try:
+        if MODE == "itt":
+            # ITT prefers chat-format; falls back to dict if needed
+            try:
+                out = PIPE(
+                    text=[{
+                        "role": "user",
+                        "content": [
+                            {"type": "image", "image": image},
+                            {"type": "text", "text": q},
+                        ],
+                    }],
+                    max_new_tokens=128,
+                )
+            except Exception:
+                out = PIPE({"images": [image], "text": q}, max_new_tokens=128)
+            return _normalize(out).strip() or "(empty response)"
+
+        if MODE == "vqa":
+            # Standard VQA signature
+            out = PIPE(image=image, question=q)
+            return _normalize(out).strip() or "(empty response)"
+
+        if MODE == "remote":
+            return _infer_remote(image, q) or "(empty response)"
+
+        return "Unknown mode."
+    except Exception as e:
+        return f"⚠️ Inference error: {e}"
+
+
+# ---- Gradio UI ---------------------------------------------------------------
+with gr.Blocks(title="Moondream2 — CPU Vision Q&A") as demo:
+    gr.Markdown("## 🌙 Moondream2 — CPU Vision Q&A\n"
+                "Upload an image, ask a question. App auto-negotiates the best working path.")

+    if INIT_ERR:
+        gr.Markdown(f"**Startup status:** `{INIT_ERR}`")

     with gr.Row():
         img = gr.Image(type="pil", label="Upload an image")
         with gr.Column():
+            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?")
+            btn = gr.Button("Ask")
+            ans = gr.TextArea(label="Answer", lines=6)

+    btn.click(infer, [img, prompt], ans)
+    prompt.submit(infer, [img, prompt], ans)

 if __name__ == "__main__":
     demo.queue().launch(debug=True)
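
To sanity-check the pinned snapshot outside the Space, here is a minimal sketch of the remote-code path this commit falls back to. It assumes the pinned revision still exposes the repo-defined encode_image / answer_question helpers (custom remote-code methods, not stable Transformers API); "sample.jpg" is a hypothetical local test image:

# smoke_test.py: minimal sketch, assuming the pinned moondream2 snapshot
# exposes the repo-defined encode_image / answer_question helpers.
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "vikhyatk/moondream2"
PINNED_REV = "6b714b26eea5cbd9f31e4edb2541c170afa935ba"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID, revision=PINNED_REV, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, revision=PINNED_REV, trust_remote_code=True, torch_dtype=torch.float32
)

image = Image.open("sample.jpg").convert("RGB")  # hypothetical test image
with torch.no_grad():
    emb = model.encode_image(image)  # repo-defined helper, checked via hasattr in app.py
    print(model.answer_question(emb, "What is in this image?", tokenizer))

If this answers correctly on CPU, any remaining failure in the Space points at the pipeline negotiation (ITT/VQA paths) rather than the model snapshot itself.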