Update app.py
app.py CHANGED
@@ -1,5 +1,4 @@
-#
-# Gradio UX for unsloth/gemma-3-4b-it-unsloth-bnb-4bit (image-text-to-text)
+# app.py — CPU-only image→text QA via Transformers pipeline + Gradio
 
 from packaging import version
 import transformers
@@ -8,7 +7,7 @@ import torch
 import gradio as gr
 from PIL import Image
 
-#
+# ---- Governance: ensure pipeline task is supported ----
 MIN_TF = "4.46.0"
 if version.parse(transformers.__version__) < version.parse(MIN_TF):
     raise RuntimeError(
@@ -17,56 +16,33 @@ if version.parse(transformers.__version__) < version.parse(MIN_TF):
         f" pip install -U 'transformers>={MIN_TF},<5'"
     )
 
-#
-
-
-
-except Exception:
-    HAS_TV = False
+# -------- Choose a CPU-friendly model here --------
+# MODEL_ID = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
+MODEL_ID = "vikhyatk/moondream2"
+# MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"  # example tiny option
 
-
+# ---- Force CPU posture ----
+DEVICE = "cpu"
+DTYPE = torch.float32  # CPU-safe
 
-#
-HAS_CUDA = torch.cuda.is_available()
-# Bitsandbytes is required for 4-bit GPU loading; fail-soft if missing.
-HAS_BNB = True
+# ---- Optional: torchvision is used by some processors (e.g., OneVision) ----
 try:
-    import
+    import torchvision  # noqa: F401
 except Exception:
-
-
-
-
-
-
-
-
-
-
-
-        INIT_ERR = (
-            "This 4-bit model requires a CUDA GPU + bitsandbytes to run. "
-            "Please switch to a GPU runtime or use a CPU-compatible model."
-        )
-        return
-    try:
-        PIPE = pipeline(
-            task="image-text-to-text",
-            model=MODEL_ID,
-            device_map="auto",
-            dtype=torch.float16,  # GPU path
-            trust_remote_code=True,
-            use_fast=True,
-            # Explicit 4-bit hint (bnb). Many UnsLoTH repos infer this automatically.
-            model_kwargs={"load_in_4bit": True}
-        )
-    except Exception as e:
-        INIT_ERR = f"Pipeline initialization failed: {e}"
-
-_build_pipe()
+    pass  # If your chosen model needs it, install torchvision
+
+# ---- Bootstrap pipeline (CPU only) ----
+pipe = pipeline(
+    task="image-text-to-text",
+    model=MODEL_ID,
+    device=DEVICE,            # <- forces CPU
+    dtype=DTYPE,              # <- CPU dtype
+    trust_remote_code=True,
+    use_fast=True,            # if supported by the model’s processor
+)
 
 def _extract_text(obj):
-    """Normalize pipeline outputs to
+    """Normalize pipeline outputs to plain text (handles chat-style payloads)."""
     if obj is None:
         return ""
     if isinstance(obj, str):
@@ -76,13 +52,11 @@ def _extract_text(obj):
     if isinstance(gen, str):
         return gen
     if isinstance(gen, (list, tuple)) and gen:
-        #
+        # Prefer assistant turns if present
         for turn in reversed(gen):
             if isinstance(turn, dict) and turn.get("role") == "assistant":
                 content = turn.get("content")
-                if isinstance(content, list):
-                    return " ".join(map(str, content))
-                return str(content) if content is not None else ""
+                return " ".join(map(str, content)) if isinstance(content, list) else str(content or "")
         return _extract_text(gen[0])
     if "text" in obj and isinstance(obj["text"], str):
         return obj["text"]
@@ -92,18 +66,15 @@ def _extract_text(obj):
     return str(obj)
 
 def infer(image: Image.Image, question: str) -> str:
-    # Fail-soft guards to avoid exceptions surfacing to UI
-    if INIT_ERR:
-        return f"⚠️ {INIT_ERR}"
     if image is None:
         return "Please upload an image."
     q = (question or "").strip()
     if not q:
         return "Please enter a question."
 
-    # Preferred: chat-style messages (auto-injects image tokens)
+    # Preferred: chat-style messages (auto-injects image tokens correctly)
     try:
-        out =
+        out = pipe(
             text=[{
                 "role": "user",
                 "content": [
@@ -111,36 +82,26 @@ def infer(image: Image.Image, question: str) -> str:
                     {"type": "text", "text": q},
                 ],
             }],
-            max_new_tokens=
+            max_new_tokens=96,
         )
     except Exception:
-        # Fallback
-        out =
+        # Fallback: dict API — ensure a LIST for images
+        out = pipe({"images": [image], "text": q}, max_new_tokens=96)
 
     return _extract_text(out).strip() or "(empty response)"
 
-#
-with gr.Blocks(title="
-    gr.Markdown("##
-                "- Upload an image, ask a question.\n"
-                "- This Space expects a **CUDA GPU + bitsandbytes** for this 4-bit model.\n")
-
-    if INIT_ERR:
-        gr.Markdown(f"**Startup status:** `{INIT_ERR}`")
-
+# ---- Gradio UI ----
+with gr.Blocks(title="CPU-only Vision QA") as demo:
+    gr.Markdown("## 🧠🖼️ CPU-only Vision Q&A\nDrop an image, ask a question. Runs entirely on CPU.")
     with gr.Row():
         img = gr.Image(type="pil", label="Upload an image")
         with gr.Column():
-            prompt = gr.Textbox(
-                label="Question",
-                placeholder='e.g., What animal is on the candy?',
-                lines=2,
-            )
+            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?", lines=2)
             submit = gr.Button("Ask")
-
+    out = gr.TextArea(label="Answer", lines=6)
 
-    submit.click(infer, [img, prompt],
-    prompt.submit(infer, [img, prompt],
+    submit.click(infer, [img, prompt], out)
+    prompt.submit(infer, [img, prompt], out)
 
 if __name__ == "__main__":
     demo.queue().launch(debug=True)
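
For reference, the chat-style call in infer() sends one user message whose content mixes an image part and a text part. The image entry itself sits on an unchanged context line that the diff elides (new line 81), so its exact key names are an assumption here, not something this commit shows. A minimal sketch of the assumed full payload:

# Hypothetical reconstruction of the chat payload built in infer(); the
# {"type": "image", ...} entry corresponds to the context line elided between
# the last two hunks, so its key names are assumed, not confirmed by the diff.
from PIL import Image

image = Image.new("RGB", (64, 64))  # stand-in for the uploaded image
q = "Is there a stamp or signature?"

messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image},  # assumed shape of the elided line
        {"type": "text", "text": q},
    ],
}]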
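The _extract_text helper exists because the pipeline's return shape varies by model: some return a flat generated string, others return the whole chat transcript. An illustrative pair of payloads it is written to normalize (assumed examples, not produced by this commit; this relies on the elided top of the helper recursing into the first element of a list, as the rest of the code implies):

# Two assumed output shapes that should normalize to the same string.
chat_shaped = [{"generated_text": [
    {"role": "user", "content": [{"type": "text", "text": "What is shown?"}]},
    {"role": "assistant", "content": "A red stamp."},  # assistant turn wins
]}]
flat_shaped = [{"generated_text": "A red stamp."}]
# Expected: _extract_text(chat_shaped) == _extract_text(flat_shaped) == "A red stamp."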
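Because the Gradio launch sits under the __main__ guard, the module can be imported for a quick CPU smoke test without starting the UI. A minimal sketch, assuming this file is saved as smoke_test.py next to app.py (both the filename and the blank test image are assumptions):

# smoke_test.py — hypothetical helper, not part of this commit. Importing app
# builds the pipeline at import time (downloads weights on first run) but does
# NOT launch Gradio, thanks to the __main__ guard in app.py.
from PIL import Image

from app import infer  # assumes app.py is importable from this directory

img = Image.new("RGB", (224, 224), "white")  # synthetic blank test image
print(infer(img, "What color is this image?"))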