KarthiEz committed on
Commit
d35bac6
·
verified ·
1 Parent(s): 480e233

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -0
app.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app_gradio_gemma4b_it_bnb4bit.py
2
+ # Gradio UX for unsloth/gemma-3-4b-it-unsloth-bnb-4bit (image-text-to-text)
3
+
4
+ from packaging import version
5
+ import transformers
6
+ from transformers import pipeline
7
+ import torch
8
+ import gradio as gr
9
+ from PIL import Image
10
+
11
# ---------- Governance: ensure pipeline task + model architecture support ----------
# Two things must be available in the installed Transformers: the
# 'image-text-to-text' pipeline task and the Gemma 3 architecture used by the
# checkpoint. Gemma 3 is the stricter requirement (supported from 4.50.0), so
# that is the floor enforced here; a lower floor would let the import succeed
# only for the model load to fail later.
MIN_TF = "4.50.0"
if version.parse(transformers.__version__) < version.parse(MIN_TF):
    # Fail fast at import time with an actionable upgrade hint.
    raise RuntimeError(
        f"Transformers >= {MIN_TF} required for 'image-text-to-text' with Gemma 3. "
        f"Found {transformers.__version__}. Upgrade:\n"
        f" pip install -U 'transformers>={MIN_TF},<5'"
    )
19
+
20
# ---------- Optional dependency gate: torchvision (AutoVideoProcessor) ----------
# Any import-time failure (not only ImportError) counts as "unavailable".
try:
    import torchvision  # noqa: F401
except Exception:
    HAS_TV = False
else:
    HAS_TV = True

MODEL_ID = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"

# ---------- Capability checks ----------
HAS_CUDA = torch.cuda.is_available()

# Bitsandbytes is required for 4-bit GPU loading; fail-soft if missing.
try:
    import bitsandbytes as bnb  # noqa: F401
except Exception:
    HAS_BNB = False
else:
    HAS_BNB = True

# Filled in by _build_pipe(): the pipeline on success, or a message on failure.
PIPE = None
INIT_ERR = None
40
+
41
def _build_pipe():
    """Create the global image-text-to-text pipeline, or record why it failed.

    On success the module-level ``PIPE`` holds the pipeline. On any failure
    ``PIPE`` is left as it was and ``INIT_ERR`` receives a human-readable
    explanation. Nothing is raised to the caller.
    """
    global PIPE, INIT_ERR

    # The processor stack needs torchvision; stop before touching the model.
    if not HAS_TV:
        INIT_ERR = "torchvision not found; required by the processor stack."
        return

    # The 4-bit checkpoint needs both a CUDA device and bitsandbytes.
    if not (HAS_CUDA and HAS_BNB):
        INIT_ERR = (
            "This 4-bit model requires a CUDA GPU + bitsandbytes to run. "
            "Please switch to a GPU runtime or use a CPU-compatible model."
        )
        return

    # NOTE(review): older Transformers releases spell the dtype keyword
    # `torch_dtype` -- confirm `dtype` against the version actually installed.
    load_options = {
        "task": "image-text-to-text",
        "model": MODEL_ID,
        "device_map": "auto",
        "dtype": torch.float16,  # GPU path
        "trust_remote_code": True,
        "use_fast": True,
        # Explicit 4-bit hint (bnb). Many UnsLoTH repos infer this automatically.
        "model_kwargs": {"load_in_4bit": True},
    }
    try:
        PIPE = pipeline(**load_options)
    except Exception as exc:
        # Keep the app importable; the UI reports this message instead.
        INIT_ERR = f"Pipeline initialization failed: {exc}"


# Build once at import time so the UI can show the startup status.
_build_pipe()
67
+
68
+ def _extract_text(obj):
69
+ """Normalize pipeline outputs to just the assistant text."""
70
+ if obj is None:
71
+ return ""
72
+ if isinstance(obj, str):
73
+ return obj
74
+ if isinstance(obj, dict):
75
+ gen = obj.get("generated_text")
76
+ if isinstance(gen, str):
77
+ return gen
78
+ if isinstance(gen, (list, tuple)) and gen:
79
+ # Look for assistant turn
80
+ for turn in reversed(gen):
81
+ if isinstance(turn, dict) and turn.get("role") == "assistant":
82
+ content = turn.get("content")
83
+ if isinstance(content, list):
84
+ return " ".join(map(str, content))
85
+ return str(content) if content is not None else ""
86
+ return _extract_text(gen[0])
87
+ if "text" in obj and isinstance(obj["text"], str):
88
+ return obj["text"]
89
+ return str(obj)
90
+ if isinstance(obj, (list, tuple)) and obj:
91
+ return _extract_text(obj[0])
92
+ return str(obj)
93
+
94
def infer(image: Image.Image, question: str) -> str:
    """Answer ``question`` about ``image`` using the global pipeline.

    Every failure mode is reported as a returned message rather than a raised
    exception, so the UI always has something to display.

    Args:
        image: The uploaded picture, or ``None`` if nothing was uploaded.
        question: The user's question; surrounding whitespace is ignored.

    Returns:
        The model's answer, ``"(empty response)"`` if it produced no text, or
        a short message describing what is missing or what went wrong.
    """
    # Fail-soft guards to avoid exceptions surfacing to UI
    if INIT_ERR:
        return f"⚠️ {INIT_ERR}"
    if PIPE is None:
        # Defensive: never call a pipeline that was not built.
        return "⚠️ Model pipeline is not initialized."
    if image is None:
        return "Please upload an image."
    q = (question or "").strip()
    if not q:
        return "Please enter a question."

    # Preferred: chat-style messages (auto-injects image tokens)
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": q},
        ],
    }]
    try:
        out = PIPE(text=messages, max_new_tokens=128)
    except Exception as chat_err:
        # Fallback contract (ensure images is a LIST). Passed by keyword so the
        # values bind to the pipeline's `images` and `text` parameters; a single
        # positional dict would be taken as `images` with no text at all.
        try:
            out = PIPE(images=[image], text=q, max_new_tokens=128)
        except Exception as fallback_err:
            # Report both failures instead of letting the second one escape.
            return (
                f"⚠️ Inference failed: {chat_err} "
                f"(fallback also failed: {fallback_err})"
            )

    return _extract_text(out).strip() or "(empty response)"
121
+
122
# ---------- Gradio UX ----------
# NOTE(review): component nesting below is reconstructed from a flattened
# listing -- confirm the placement of `output` against the intended layout.
with gr.Blocks(title="Gemma 3 4B IT (UnsLoTH 4-bit) — Image Q&A") as demo:
    # Static header: what the app does and what hardware it expects.
    gr.Markdown(
        value=(
            "## 🖼️💬 Gemma-3-4B-IT (UnsLoTH 4-bit) — Image Q&A\n"
            "- Upload an image, ask a question.\n"
            "- This Space expects a **CUDA GPU + bitsandbytes** for this 4-bit model.\n"
        )
    )

    # Show any startup failure directly on the page.
    if INIT_ERR:
        gr.Markdown(value=f"**Startup status:** `{INIT_ERR}`")

    with gr.Row():
        img = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            prompt = gr.Textbox(
                label="Question",
                placeholder='e.g., What animal is on the candy?',
                lines=2,
            )
            submit = gr.Button(value="Ask")
            output = gr.TextArea(label="Answer", lines=6)

    # Both the button and pressing Enter in the textbox run the same handler.
    submit.click(fn=infer, inputs=[img, prompt], outputs=output)
    prompt.submit(fn=infer, inputs=[img, prompt], outputs=output)

if __name__ == "__main__":
    demo.queue().launch(debug=True)