Update app.py
app.py CHANGED
@@ -1,4 +1,4 @@
-# app.py — CPU-only Gradio for vikhyatk/moondream2 with resilient fallbacks
+# app.py — CPU-only Gradio for vikhyatk/moondream2 with resilient fallbacks + selectable SmolVLM
 
 from packaging import version
 import transformers
@@ -16,15 +16,17 @@ if version.parse(transformers.__version__) < version.parse(MIN_TF):
         f" pip install -U 'transformers>={MIN_TF},<5'"
     )
 
-
+# --- Models ---
+MOONDREAM_MODEL_ID = "vikhyatk/moondream2"
 # Pin to a stable snapshot to avoid “new version downloaded” surprises.
-# If you want latest, set revision="main".
 PINNED_REV = "6b714b26eea5cbd9f31e4edb2541c170afa935ba"
 
+SMOL_MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"
+
 DEVICE = "cpu"
 DTYPE = torch.float32
 
-# ----
+# ---- Moondream bootstrap strategy -------------------------------------------
 # 1) Try image-text-to-text pipeline (preferred for Q&A)
 # 2) If it rejects the custom config, try visual-question-answering pipeline
 # 3) If that fails, load the model with trust_remote_code and call its remote methods
@@ -35,12 +37,11 @@ MODEL = None
 TOKENIZER = None
 INIT_ERR = None
 
-
 def _try_itt():
     global PIPE, MODE
     PIPE = pipeline(
         "image-text-to-text",
-        model=
+        model=MOONDREAM_MODEL_ID,
         revision=PINNED_REV,
         device=DEVICE,
         dtype=DTYPE,
@@ -49,44 +50,38 @@ def _try_itt():
     )
     MODE = "itt"
 
-
 def _try_vqa():
     global PIPE, MODE
     PIPE = pipeline(
         "visual-question-answering",
-        model=
+        model=MOONDREAM_MODEL_ID,
         revision=PINNED_REV,
         device=DEVICE,
         trust_remote_code=True,
     )
     MODE = "vqa"
 
-
 def _try_remote():
     # Some Moondream2 snapshots expose custom methods via remote code.
     global MODEL, TOKENIZER, MODE
     TOKENIZER = AutoTokenizer.from_pretrained(
-
+        MOONDREAM_MODEL_ID, revision=PINNED_REV, trust_remote_code=True
     )
     MODEL = AutoModelForCausalLM.from_pretrained(
-
+        MOONDREAM_MODEL_ID,
         revision=PINNED_REV,
         trust_remote_code=True,
        torch_dtype=DTYPE,
         device_map=None,
     ).to(DEVICE)
-
-    # Heuristic: prefer dedicated helpers if present
     MODE = "remote"
 
-
 def _boot():
     global INIT_ERR
     try:
         _try_itt()
         return
     except Exception as e_itt:
-        # Fall through
         try:
             _try_vqa()
             return
@@ -104,7 +99,22 @@ def _boot():
 
 _boot()
 
+# ---- SmolVLM (CPU) pipeline --------------------------------------------------
+SMOL_PIPE = None
+SMOL_INIT_ERR = None
+try:
+    SMOL_PIPE = pipeline(
+        "image-text-to-text",
+        model=SMOL_MODEL_ID,
+        device=DEVICE,
+        dtype=DTYPE,
+        use_fast=True,
+        trust_remote_code=True,  # harmless if not needed
+    )
+except Exception as e:
+    SMOL_INIT_ERR = f"SmolVLM init failed: {e}"
 
+# ---- Shared helpers ----------------------------------------------------------
 def _normalize(out):
     """Normalize pipeline outputs to a plain string (assistant text only)."""
     if out is None:
@@ -112,25 +122,21 @@ def _normalize(out):
     if isinstance(out, str):
         return out
 
-    # ITT often returns dict or list-of-dicts with 'generated_text'
     if isinstance(out, dict):
         gen = out.get("generated_text")
         if isinstance(gen, str):
             return gen
         if isinstance(gen, (list, tuple)) and gen:
-            # Look for assistant role if chat-style
             for turn in reversed(gen):
                 if isinstance(turn, dict) and turn.get("role") == "assistant":
                     c = turn.get("content")
                     return " ".join(map(str, c)) if isinstance(c, list) else str(c or "")
-            # fallback: first item
             return _normalize(gen[0])
         if isinstance(out.get("text"), str):
             return out["text"]
         return str(out)
 
     if isinstance(out, (list, tuple)) and out:
-        # VQA often returns a list of dicts with 'generated_text'/'answer'
         first = out[0]
         if isinstance(first, dict):
             if "generated_text" in first and isinstance(first["generated_text"], str):
@@ -141,25 +147,17 @@ def _normalize(out):
 
     return str(out)
 
-
 def _infer_remote(image: Image.Image, question: str) -> str:
-    """
-    Last-resort path: call remote-code helpers if present.
-    Many Moondream2 builds expose custom methods on the model; we check them dynamically.
-    """
+    """Moondream2 last-resort path via remote-code helpers."""
     if hasattr(MODEL, "encode_image") and hasattr(MODEL, "answer_question"):
-        # Preferred remote API (if exposed by repo)
         with torch.no_grad():
             img_emb = MODEL.encode_image(image.convert("RGB"))
             ans = MODEL.answer_question(img_emb, question)
         return str(ans).strip()
 
-    # Generic generate fallback using tokenizer + special tokens
-    # We try a minimal prompt that many Moondream-style repos accept.
     prompt = f"<image>\n\nQuestion: {question}\n\nAnswer:"
     with torch.no_grad():
         inputs = TOKENIZER(prompt, return_tensors="pt").to(DEVICE)
-        # Some repos require image embeds concatenated; if unsupported, we still produce text-only best effort.
         out_ids = MODEL.generate(
             **inputs,
             max_new_tokens=128,
@@ -168,8 +166,32 @@ def _infer_remote(image: Image.Image, question: str) -> str:
     out_text = TOKENIZER.batch_decode(out_ids, skip_special_tokens=True)[0]
     return out_text.strip()
 
-
-def infer(image: Image.Image, question: str) -> str:
+# ---- Inference (now with model selection) ------------------------------------
+def infer(image: Image.Image, question: str, model_choice: str) -> str:
+    if model_choice == "HuggingFaceTB/SmolVLM-500M-Instruct":
+        if SMOL_INIT_ERR:
+            return f"⚠️ {SMOL_INIT_ERR}"
+        if image is None:
+            return "Please upload an image."
+        q = (question or "").strip()
+        if not q:
+            return "Please enter a question."
+        try:
+            out = SMOL_PIPE(
+                text=[{
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": image},
+                        {"type": "text", "text": q},
+                    ],
+                }],
+                max_new_tokens=128,
+            )
+        except Exception:
+            out = SMOL_PIPE({"images": [image], "text": q}, max_new_tokens=128)
+        return _normalize(out).strip() or "(empty response)"
+
+    # Default path: Moondream2 (unchanged logic)
     if INIT_ERR:
         return f"⚠️ Init error:\n{INIT_ERR}"
     if image is None:
@@ -180,7 +202,6 @@ def infer(image: Image.Image, question: str) -> str:
 
     try:
         if MODE == "itt":
-            # ITT prefers chat-format; falls back to dict if needed
             try:
                 out = PIPE(
                     text=[{
@@ -197,7 +218,6 @@ def infer(image: Image.Image, question: str) -> str:
             return _normalize(out).strip() or "(empty response)"
 
         if MODE == "vqa":
-            # Standard VQA signature
             out = PIPE(image=image, question=q)
             return _normalize(out).strip() or "(empty response)"
 
@@ -208,24 +228,33 @@ def infer(image: Image.Image, question: str) -> str:
     except Exception as e:
         return f"⚠️ Inference error: {e}"
 
-
 # ---- Gradio UI ---------------------------------------------------------------
-with gr.Blocks(title="
-    gr.Markdown("## 🌙 Moondream2 — CPU Vision Q&A\n"
-                "Upload an image, ask a question
+with gr.Blocks(title="CPU Vision Q&A") as demo:
+    gr.Markdown("## 🌙 Moondream2 & 🐣 SmolVLM — CPU Vision Q&A\n"
+                "Upload an image, ask a question, and pick your model.")
 
+    # Show Moondream init status (kept from your original app)
     if INIT_ERR:
-        gr.Markdown(f"**
+        gr.Markdown(f"**Moondream startup status:** `{INIT_ERR}`")
+    if SMOL_INIT_ERR:
+        gr.Markdown(f"**SmolVLM startup status:** `{SMOL_INIT_ERR}`")
 
     with gr.Row():
         img = gr.Image(type="pil", label="Upload an image")
         with gr.Column():
+            # NEW: model selector (default = Moondream2) — minimal surface change
+            model_choice = gr.Dropdown(
+                choices=[MOONDREAM_MODEL_ID, SMOL_MODEL_ID],
+                value=MOONDREAM_MODEL_ID,
+                label="Model",
+            )
            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?")
            btn = gr.Button("Ask")
     ans = gr.TextArea(label="Answer", lines=6)
 
-
-
+    # Wire the new dropdown into the call; everything else is unchanged
+    btn.click(infer, [img, prompt, model_choice], ans)
+    prompt.submit(infer, [img, prompt, model_choice], ans)
 
 if __name__ == "__main__":
     demo.queue().launch(debug=True)
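
A note on `_normalize`: it exists because the two pipeline tasks return differently shaped outputs. The shapes it flattens look roughly like this (a sketch with illustrative values, not part of the commit; exact keys can vary across transformers versions):

# Illustrative pipeline outputs that _normalize() reduces to a plain string.

# image-text-to-text, plain form: "generated_text" is already a string.
itt_plain = {"generated_text": "A red stamp in the lower-left corner."}

# image-text-to-text, chat form: "generated_text" is the message list,
# so _normalize walks it in reverse to find the assistant turn.
itt_chat = {"generated_text": [
    {"role": "user", "content": "Is there a stamp or signature?"},
    {"role": "assistant", "content": "Yes, a red stamp."},
]}

# visual-question-answering: a list of scored answers.
vqa_out = [{"score": 0.91, "answer": "yes"}]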
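
A quick way to exercise the new three-argument `infer` without launching the UI is a console smoke test like the sketch below (my suggestion, not part of the commit; it assumes app.py's dependencies are installed, downloads both models on first run, and `sample.jpg` is a placeholder for any local test image):

# smoke_test.py — minimal check of both branches of the new model dropdown.
from PIL import Image

from app import MOONDREAM_MODEL_ID, SMOL_MODEL_ID, infer

image = Image.open("sample.jpg")  # hypothetical local test image
question = "Is there a stamp or signature?"

# infer() routes on the model id string, so passing each id in turn
# exercises the SmolVLM path and the original Moondream2 path.
for choice in (MOONDREAM_MODEL_ID, SMOL_MODEL_ID):
    print(f"--- {choice} ---")
    print(infer(image, question, choice))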