KarthiEz committed on
Commit aa797e4 · verified · 1 Parent(s): 5087f0e

Update app.py

Files changed (1): app.py +191 -68
app.py CHANGED
@@ -1,108 +1,231 @@
-# app.py — CPU-only image→text QA via Transformers pipeline + Gradio

 from packaging import version
 import transformers
-from transformers import pipeline
 import torch
 import gradio as gr
 from PIL import Image

-# ---- Governance: ensure pipeline task is supported ----
-MIN_TF = "4.46.0"
 if version.parse(transformers.__version__) < version.parse(MIN_TF):
     raise RuntimeError(
-        f"Transformers >= {MIN_TF} required for 'image-text-to-text'. "
         f"Found {transformers.__version__}. Upgrade:\n"
         f"  pip install -U 'transformers>={MIN_TF},<5'"
     )

-# -------- Choose a CPU-friendly model here --------
-# MODEL_ID = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
 MODEL_ID = "vikhyatk/moondream2"
-# MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"  # example tiny option

-# ---- Force CPU posture ----
 DEVICE = "cpu"
-DTYPE = torch.float32  # CPU-safe
-
-# ---- Optional: torchvision is used by some processors (e.g., OneVision) ----
-try:
-    import torchvision  # noqa: F401
-except Exception:
-    pass  # If your chosen model needs it, install torchvision
-
-# ---- Bootstrap pipeline (CPU only) ----
-pipe = pipeline(
-    "image-text-to-text",
-    model="vikhyatk/moondream2",
-    trust_remote_code=True,
-    revision="6b714b26eea5cbd9f31e4edb2541c170afa935ba",  # pin to a known commit
-    device="cpu",
-    dtype=torch.float32,
-    use_fast=True,
-)
-
-def _extract_text(obj):
-    """Normalize pipeline outputs to plain text (handles chat-style payloads)."""
-    if obj is None:
         return ""
-    if isinstance(obj, str):
-        return obj
-    if isinstance(obj, dict):
-        gen = obj.get("generated_text")
         if isinstance(gen, str):
             return gen
         if isinstance(gen, (list, tuple)) and gen:
-            # Prefer assistant turns if present
             for turn in reversed(gen):
                 if isinstance(turn, dict) and turn.get("role") == "assistant":
-                    content = turn.get("content")
-                    return " ".join(map(str, content)) if isinstance(content, list) else str(content or "")
-            return _extract_text(gen[0])
-        if "text" in obj and isinstance(obj["text"], str):
-            return obj["text"]
-        return str(obj)
-    if isinstance(obj, (list, tuple)) and obj:
-        return _extract_text(obj[0])
-    return str(obj)

 def infer(image: Image.Image, question: str) -> str:
     if image is None:
         return "Please upload an image."
     q = (question or "").strip()
     if not q:
         return "Please enter a question."

-    # Preferred: chat-style messages (auto-injects image tokens correctly)
     try:
-        out = pipe(
-            text=[{
-                "role": "user",
-                "content": [
-                    {"type": "image", "image": image},
-                    {"type": "text", "text": q},
-                ],
-            }],
-            max_new_tokens=96,
-        )
-    except Exception:
-        # Fallback: dict API — ensure a LIST for images
-        out = pipe({"images": [image], "text": q}, max_new_tokens=96)

-    return _extract_text(out).strip() or "(empty response)"

-# ---- Gradio UI ----
-with gr.Blocks(title="CPU-only Vision QA") as demo:
-    gr.Markdown("## 🧠🖼️ CPU-only Vision Q&A\nDrop an image, ask a question. Runs entirely on CPU.")
     with gr.Row():
         img = gr.Image(type="pil", label="Upload an image")
         with gr.Column():
-            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?", lines=2)
-            submit = gr.Button("Ask")
-    out = gr.TextArea(label="Answer", lines=6)

-    submit.click(infer, [img, prompt], out)
-    prompt.submit(infer, [img, prompt], out)

 if __name__ == "__main__":
     demo.queue().launch(debug=True)
 
+# app.py — CPU-only Gradio for vikhyatk/moondream2 with resilient fallbacks

 from packaging import version
 import transformers
 import torch
 import gradio as gr
 from PIL import Image

+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+
+MIN_TF = "4.51.0"  # newer Transformers versions are friendlier to custom multimodal configs
 if version.parse(transformers.__version__) < version.parse(MIN_TF):
     raise RuntimeError(
+        f"Transformers >= {MIN_TF} required for Moondream2. "
         f"Found {transformers.__version__}. Upgrade:\n"
         f"  pip install -U 'transformers>={MIN_TF},<5'"
     )

 MODEL_ID = "vikhyatk/moondream2"
+# Pin to a stable snapshot to avoid "new version downloaded" surprises.
+# If you want latest, set revision="main".
+PINNED_REV = "6b714b26eea5cbd9f31e4edb2541c170afa935ba"

 DEVICE = "cpu"
+DTYPE = torch.float32
+
+# ---- Bootstrap strategy ------------------------------------------------------
+# 1) Try the image-text-to-text pipeline (preferred for Q&A)
+# 2) If it rejects the custom config, try the visual-question-answering pipeline
+# 3) If that fails, load the model with trust_remote_code and call its remote methods
+
+PIPE = None
+MODE = None  # "itt" | "vqa" | "remote"
+MODEL = None
+TOKENIZER = None
+INIT_ERR = None
+
+
+def _try_itt():
+    global PIPE, MODE
+    PIPE = pipeline(
+        "image-text-to-text",
+        model=MODEL_ID,
+        revision=PINNED_REV,
+        device=DEVICE,
+        torch_dtype=DTYPE,
+        trust_remote_code=True,
+        use_fast=True,
+    )
+    MODE = "itt"
+
+
+def _try_vqa():
+    global PIPE, MODE
+    PIPE = pipeline(
+        "visual-question-answering",
+        model=MODEL_ID,
+        revision=PINNED_REV,
+        device=DEVICE,
+        trust_remote_code=True,
+    )
+    MODE = "vqa"
+
+
+def _try_remote():
+    # Some Moondream2 snapshots expose custom methods via remote code.
+    global MODEL, TOKENIZER, MODE
+    TOKENIZER = AutoTokenizer.from_pretrained(
+        MODEL_ID, revision=PINNED_REV, trust_remote_code=True
+    )
+    MODEL = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        revision=PINNED_REV,
+        trust_remote_code=True,
+        torch_dtype=DTYPE,
+        device_map=None,
+    ).to(DEVICE)
+
+    # Heuristic: prefer dedicated helpers if present
+    MODE = "remote"
+
+
+def _boot():
+    global INIT_ERR
+    try:
+        _try_itt()
+        return
+    except Exception as e_itt:
+        # Fall through
+        try:
+            _try_vqa()
+            return
+        except Exception as e_vqa:
+            try:
+                _try_remote()
+                return
+            except Exception as e_remote:
+                INIT_ERR = (
+                    "Moondream2 initialization failed.\n\n"
+                    f"ITT error: {e_itt}\n\n"
+                    f"VQA error: {e_vqa}\n\n"
+                    f"Remote error: {e_remote}"
+                )
+
+
+_boot()
+
+
+def _normalize(out):
+    """Normalize pipeline outputs to a plain string (assistant text only)."""
+    if out is None:
         return ""
+    if isinstance(out, str):
+        return out
+
+    # ITT often returns dict or list-of-dicts with 'generated_text'
+    if isinstance(out, dict):
+        gen = out.get("generated_text")
         if isinstance(gen, str):
             return gen
         if isinstance(gen, (list, tuple)) and gen:
+            # Look for assistant role if chat-style
             for turn in reversed(gen):
                 if isinstance(turn, dict) and turn.get("role") == "assistant":
+                    c = turn.get("content")
+                    return " ".join(map(str, c)) if isinstance(c, list) else str(c or "")
+            # fallback: first item
+            return _normalize(gen[0])
+        if isinstance(out.get("text"), str):
+            return out["text"]
+        return str(out)
+
+    if isinstance(out, (list, tuple)) and out:
+        # VQA often returns a list of dicts with 'generated_text'/'answer'
+        first = out[0]
+        if isinstance(first, dict):
+            if "generated_text" in first and isinstance(first["generated_text"], str):
+                return first["generated_text"]
+            if "answer" in first and isinstance(first["answer"], str):
+                return first["answer"]
+        return _normalize(first)
+
+    return str(out)
+
+
+def _infer_remote(image: Image.Image, question: str) -> str:
+    """
+    Last-resort path: call remote-code helpers if present.
+    Many Moondream2 builds expose custom methods on the model; we check them dynamically.
+    """
+    if hasattr(MODEL, "encode_image") and hasattr(MODEL, "answer_question"):
+        # Preferred remote API (if exposed by repo)
+        with torch.no_grad():
+            img_emb = MODEL.encode_image(image.convert("RGB"))
+            ans = MODEL.answer_question(img_emb, question, TOKENIZER)
+        return str(ans).strip()
+
+    # Generic generate fallback using tokenizer + special tokens
+    # We try a minimal prompt that many Moondream-style repos accept.
+    prompt = f"<image>\n\nQuestion: {question}\n\nAnswer:"
+    with torch.no_grad():
+        inputs = TOKENIZER(prompt, return_tensors="pt").to(DEVICE)
+        # Some repos require image embeds concatenated; if unsupported, we still produce text-only best effort.
+        out_ids = MODEL.generate(
+            **inputs,
+            max_new_tokens=128,
+            pad_token_id=TOKENIZER.eos_token_id,
+        )
+    out_text = TOKENIZER.batch_decode(out_ids, skip_special_tokens=True)[0]
+    return out_text.strip()
+

 def infer(image: Image.Image, question: str) -> str:
+    if INIT_ERR:
+        return f"⚠️ Init error:\n{INIT_ERR}"
     if image is None:
         return "Please upload an image."
     q = (question or "").strip()
     if not q:
         return "Please enter a question."

     try:
+        if MODE == "itt":
+            # ITT prefers chat-format; falls back to dict if needed
+            try:
+                out = PIPE(
+                    text=[{
+                        "role": "user",
+                        "content": [
+                            {"type": "image", "image": image},
+                            {"type": "text", "text": q},
+                        ],
+                    }],
+                    max_new_tokens=128,
+                )
+            except Exception:
+                out = PIPE({"images": [image], "text": q}, max_new_tokens=128)
+            return _normalize(out).strip() or "(empty response)"
+
+        if MODE == "vqa":
+            # Standard VQA signature
+            out = PIPE(image=image, question=q)
+            return _normalize(out).strip() or "(empty response)"
+
+        if MODE == "remote":
+            return _infer_remote(image, q) or "(empty response)"
+
+        return "Unknown mode."
+    except Exception as e:
+        return f"⚠️ Inference error: {e}"
+
+
+# ---- Gradio UI ---------------------------------------------------------------
+with gr.Blocks(title="Moondream2 — CPU Vision Q&A") as demo:
+    gr.Markdown("## 🌙 Moondream2 — CPU Vision Q&A\n"
+                "Upload an image, ask a question. App auto-negotiates the best working path.")

+    if INIT_ERR:
+        gr.Markdown(f"**Startup status:** `{INIT_ERR}`")

     with gr.Row():
         img = gr.Image(type="pil", label="Upload an image")
         with gr.Column():
+            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?")
+            btn = gr.Button("Ask")
+            ans = gr.TextArea(label="Answer", lines=6)

+    btn.click(infer, [img, prompt], ans)
+    prompt.submit(infer, [img, prompt], ans)

 if __name__ == "__main__":
     demo.queue().launch(debug=True)
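
To sanity-check the pinned snapshot outside the Space, here is a minimal sketch of the remote-code path this commit falls back to. It assumes the pinned revision still exposes the repo-defined encode_image / answer_question helpers (custom remote-code methods, not stable Transformers API); "sample.jpg" is a hypothetical local test image:

# smoke_test.py: minimal sketch, assuming the pinned moondream2 snapshot
# exposes the repo-defined encode_image / answer_question helpers.
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "vikhyatk/moondream2"
PINNED_REV = "6b714b26eea5cbd9f31e4edb2541c170afa935ba"

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID, revision=PINNED_REV, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, revision=PINNED_REV, trust_remote_code=True, torch_dtype=torch.float32
)

image = Image.open("sample.jpg").convert("RGB")  # hypothetical test image
with torch.no_grad():
    emb = model.encode_image(image)  # repo-defined helper, checked via hasattr in app.py
    print(model.answer_question(emb, "What is in this image?", tokenizer))

If this answers correctly on CPU, any remaining failure in the Space points at the pipeline negotiation (ITT/VQA paths) rather than the model snapshot itself.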