KarthiEz committed
Commit c7547a1 · verified · 1 Parent(s): 9d90cb2

Update app.py

Files changed (1):
  1. app.py +37 -76
app.py CHANGED
@@ -1,5 +1,4 @@
-# app_gradio_gemma4b_it_bnb4bit.py
-# Gradio UX for unsloth/gemma-3-4b-it-unsloth-bnb-4bit (image-text-to-text)
+# app.py — CPU-only image→text QA via Transformers pipeline + Gradio
 
 from packaging import version
 import transformers
@@ -8,7 +7,7 @@ import torch
 import gradio as gr
 from PIL import Image
 
-# ---------- Governance: ensure pipeline task support ----------
+# ---- Governance: ensure pipeline task is supported ----
 MIN_TF = "4.46.0"
 if version.parse(transformers.__version__) < version.parse(MIN_TF):
     raise RuntimeError(
@@ -17,56 +16,33 @@ if version.parse(transformers.__version__) < version.parse(MIN_TF):
         f" pip install -U 'transformers>={MIN_TF},<5'"
     )
 
-# ---------- Optional dependency gate: torchvision (AutoVideoProcessor) ----------
-HAS_TV = True
-try:
-    import torchvision  # noqa: F401
-except Exception:
-    HAS_TV = False
+# -------- Choose a CPU-friendly model here --------
+# MODEL_ID = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+MODEL_ID = "vikhyatk/moondream2"
+# MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"  # example tiny option
 
-MODEL_ID = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
+# ---- Force CPU posture ----
+DEVICE = "cpu"
+DTYPE = torch.float32  # CPU-safe
 
-# ---------- Capability checks ----------
-HAS_CUDA = torch.cuda.is_available()
-# Bitsandbytes is required for 4-bit GPU loading; fail-soft if missing.
-HAS_BNB = True
+# ---- Optional: torchvision is used by some processors (e.g., OneVision) ----
 try:
-    import bitsandbytes as bnb  # noqa: F401
+    import torchvision  # noqa: F401
 except Exception:
-    HAS_BNB = False
-
-PIPE = None
-INIT_ERR = None
-
-def _build_pipe():
-    global PIPE, INIT_ERR
-    if not HAS_TV:
-        INIT_ERR = "torchvision not found; required by the processor stack."
-        return
-    if not HAS_CUDA or not HAS_BNB:
-        INIT_ERR = (
-            "This 4-bit model requires a CUDA GPU + bitsandbytes to run. "
-            "Please switch to a GPU runtime or use a CPU-compatible model."
-        )
-        return
-    try:
-        PIPE = pipeline(
-            task="image-text-to-text",
-            model=MODEL_ID,
-            device_map="auto",
-            dtype=torch.float16,  # GPU path
-            trust_remote_code=True,
-            use_fast=True,
-            # Explicit 4-bit hint (bnb). Many UnsLoTH repos infer this automatically.
-            model_kwargs={"load_in_4bit": True}
-        )
-    except Exception as e:
-        INIT_ERR = f"Pipeline initialization failed: {e}"
-
-_build_pipe()
+    pass  # If your chosen model needs it, install torchvision
+
+# ---- Bootstrap pipeline (CPU only) ----
+pipe = pipeline(
+    task="image-text-to-text",
+    model=MODEL_ID,
+    device=DEVICE,           # <- forces CPU
+    dtype=DTYPE,             # <- CPU dtype
+    trust_remote_code=True,
+    use_fast=True,           # if supported by the model’s processor
+)
 
 def _extract_text(obj):
-    """Normalize pipeline outputs to just the assistant text."""
+    """Normalize pipeline outputs to plain text (handles chat-style payloads)."""
     if obj is None:
         return ""
     if isinstance(obj, str):
@@ -76,13 +52,11 @@ def _extract_text(obj):
         if isinstance(gen, str):
             return gen
         if isinstance(gen, (list, tuple)) and gen:
-            # Look for assistant turn
+            # Prefer assistant turns if present
            for turn in reversed(gen):
                 if isinstance(turn, dict) and turn.get("role") == "assistant":
                     content = turn.get("content")
-                    if isinstance(content, list):
-                        return " ".join(map(str, content))
-                    return str(content) if content is not None else ""
+                    return " ".join(map(str, content)) if isinstance(content, list) else str(content or "")
             return _extract_text(gen[0])
     if "text" in obj and isinstance(obj["text"], str):
         return obj["text"]
@@ -92,18 +66,15 @@ def _extract_text(obj):
     return str(obj)
 
 def infer(image: Image.Image, question: str) -> str:
-    # Fail-soft guards to avoid exceptions surfacing to UI
-    if INIT_ERR:
-        return f"⚠️ {INIT_ERR}"
     if image is None:
         return "Please upload an image."
     q = (question or "").strip()
     if not q:
         return "Please enter a question."
 
-    # Preferred: chat-style messages (auto-injects image tokens)
+    # Preferred: chat-style messages (auto-injects image tokens correctly)
     try:
-        out = PIPE(
+        out = pipe(
             text=[{
                 "role": "user",
                 "content": [
@@ -111,36 +82,26 @@ def infer(image: Image.Image, question: str) -> str:
                     {"type": "text", "text": q},
                 ],
             }],
-            max_new_tokens=128,
+            max_new_tokens=96,
         )
     except Exception:
-        # Fallback contract (ensure images is a LIST)
-        out = PIPE({"images": [image], "text": q}, max_new_tokens=128)
+        # Fallback: dict API (ensure images is a LIST)
+        out = pipe({"images": [image], "text": q}, max_new_tokens=96)
 
     return _extract_text(out).strip() or "(empty response)"
 
-# ---------- Gradio UX ----------
-with gr.Blocks(title="Gemma 3 4B IT (UnsLoTH 4-bit) Image Q&A") as demo:
-    gr.Markdown("## 🖼️💬 Gemma-3-4B-IT (UnsLoTH 4-bit) — Image Q&A\n"
-                "- Upload an image, ask a question.\n"
-                "- This Space expects a **CUDA GPU + bitsandbytes** for this 4-bit model.\n")
-
-    if INIT_ERR:
-        gr.Markdown(f"**Startup status:** `{INIT_ERR}`")
-
+# ---- Gradio UI ----
+with gr.Blocks(title="CPU-only Vision QA") as demo:
+    gr.Markdown("## 🧠🖼️ CPU-only Vision Q&A\nDrop an image, ask a question. Runs entirely on CPU.")
     with gr.Row():
         img = gr.Image(type="pil", label="Upload an image")
         with gr.Column():
-            prompt = gr.Textbox(
-                label="Question",
-                placeholder='e.g., What animal is on the candy?',
-                lines=2,
-            )
+            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?", lines=2)
             submit = gr.Button("Ask")
-    output = gr.TextArea(label="Answer", lines=6)
+    out = gr.TextArea(label="Answer", lines=6)
 
-    submit.click(infer, [img, prompt], output)
-    prompt.submit(infer, [img, prompt], output)
+    submit.click(infer, [img, prompt], out)
+    prompt.submit(infer, [img, prompt], out)
 
 if __name__ == "__main__":
     demo.queue().launch(debug=True)
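
For a quick check of the new CPU path outside Gradio, the pipeline call from app.py can be exercised directly. The sketch below mirrors the chat-style call in infer(); the file name "sample.jpg", the question, and the exact shape of the image content entry are illustrative assumptions (the image line of the message sits between hunks and is not shown in the diff), and whether the default moondream2 checkpoint accepts the image-text-to-text pipeline at all is exactly what the app's try/except fallback hedges against:

# smoke_test.py: minimal CPU-only sketch of the pipeline used in app.py
# NOTE: "sample.jpg" and the question are placeholders, not from the commit.
from transformers import pipeline
from PIL import Image

pipe = pipeline(
    task="image-text-to-text",
    model="vikhyatk/moondream2",  # same CPU-friendly default as app.py
    device="cpu",
    trust_remote_code=True,
)

image = Image.open("sample.jpg")  # any local test image
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image},  # assumed content shape (Transformers chat format)
        {"type": "text", "text": "What is in this picture?"},
    ],
}]

out = pipe(text=messages, max_new_tokens=96)  # same token budget as app.py
print(out)

If the chat-style call raises, the dict fallback from infer(), pipe({"images": [image], "text": q}, max_new_tokens=96), is the next thing to try, just as the app does.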