# app.py — CPU-only Gradio for vikhyatk/moondream2 with resilient fallbacks + selectable SmolVLM
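#
# Assumed environment (not pinned by this file): pip install -U "transformers>=4.51,<5" torch gradio pillow packaging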

from packaging import version
import transformers
import torch
import gradio as gr
from PIL import Image

from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

MIN_TF = "4.51.0"  # newer TFs are friendlier to custom multimodal configs
if version.parse(transformers.__version__) < version.parse(MIN_TF):
    raise RuntimeError(
        f"Transformers >= {MIN_TF} required for Moondream2. "
        f"Found {transformers.__version__}. Upgrade:\n"
        f"  pip install -U 'transformers>={MIN_TF},<5'"
    )

# --- Models ---
MOONDREAM_MODEL_ID = "vikhyatk/moondream2"
# Pin to a stable snapshot to avoid “new version downloaded” surprises.
PINNED_REV = "6b714b26eea5cbd9f31e4edb2541c170afa935ba"

SMOL_MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"

DEVICE = "cpu"
DTYPE = torch.float32
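# float32 on purpose: PyTorch's CPU kernels for float16 are limited, so half precision buys little here.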

# ---- Moondream bootstrap strategy -------------------------------------------
# 1) Try image-text-to-text pipeline (preferred for Q&A)
# 2) If it rejects the custom config, try visual-question-answering pipeline
# 3) If that fails, load the model with trust_remote_code and call its remote methods

PIPE = None
MODE = None     # "itt" | "vqa" | "remote"
MODEL = None
TOKENIZER = None
INIT_ERR = None

def _try_itt():
    global PIPE, MODE
    PIPE = pipeline(
        "image-text-to-text",
        model=MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        device=DEVICE,
        torch_dtype=DTYPE,  # torch_dtype is accepted from 4.51 on; the bare `dtype` alias is newer
        trust_remote_code=True,
        use_fast=True,
    )
    MODE = "itt"

def _try_vqa():
    global PIPE, MODE
    PIPE = pipeline(
        "visual-question-answering",
        model=MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        device=DEVICE,
        trust_remote_code=True,
    )
    MODE = "vqa"

def _try_remote():
    # Some Moondream2 snapshots expose custom methods via remote code.
    global MODEL, TOKENIZER, MODE
    TOKENIZER = AutoTokenizer.from_pretrained(
        MOONDREAM_MODEL_ID, revision=PINNED_REV, trust_remote_code=True
    )
    MODEL = AutoModelForCausalLM.from_pretrained(
        MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        trust_remote_code=True,
        torch_dtype=DTYPE,
        device_map=None,
    ).to(DEVICE)
    MODE = "remote"

def _boot():
    global INIT_ERR
    try:
        _try_itt()
        return
    except Exception as e_itt:
        try:
            _try_vqa()
            return
        except Exception as e_vqa:
            try:
                _try_remote()
                return
            except Exception as e_remote:
                INIT_ERR = (
                    "Moondream2 initialization failed.\n\n"
                    f"ITT error: {e_itt}\n\n"
                    f"VQA error: {e_vqa}\n\n"
                    f"Remote error: {e_remote}"
                )

_boot()

# ---- SmolVLM (CPU) pipeline --------------------------------------------------
SMOL_PIPE = None
SMOL_INIT_ERR = None
try:
    SMOL_PIPE = pipeline(
        "image-text-to-text",
        model=SMOL_MODEL_ID,
        device=DEVICE,
        torch_dtype=DTYPE,
        use_fast=True,
        trust_remote_code=True,  # harmless if not needed
    )
except Exception as e:
    SMOL_INIT_ERR = f"SmolVLM init failed: {e}"

# ---- Shared helpers ----------------------------------------------------------
def _normalize(out):
    """Normalize pipeline outputs to a plain string (assistant text only)."""
    if out is None:
        return ""
    if isinstance(out, str):
        return out

    if isinstance(out, dict):
        gen = out.get("generated_text")
        if isinstance(gen, str):
            return gen
        if isinstance(gen, (list, tuple)) and gen:
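            # Chat-style pipelines return generated_text as a message list
            # (user turn, assistant turn, ...); keep only the assistant's text.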
            for turn in reversed(gen):
                if isinstance(turn, dict) and turn.get("role") == "assistant":
                    c = turn.get("content")
                    return " ".join(map(str, c)) if isinstance(c, list) else str(c or "")
            return _normalize(gen[0])
        if isinstance(out.get("text"), str):
            return out["text"]
        return str(out)

    if isinstance(out, (list, tuple)) and out:
        first = out[0]
        if isinstance(first, dict):
            if "generated_text" in first and isinstance(first["generated_text"], str):
                return first["generated_text"]
            if "answer" in first and isinstance(first["answer"], str):
                return first["answer"]
        return _normalize(first)

    return str(out)

def _infer_remote(image: Image.Image, question: str) -> str:
    """Moondream2 last-resort path via remote-code helpers."""
    if hasattr(MODEL, "encode_image") and hasattr(MODEL, "answer_question"):
        with torch.no_grad():
            img_emb = MODEL.encode_image(image.convert("RGB"))
            # Classic Moondream2 snapshots take the tokenizer as the third argument.
            ans = MODEL.answer_question(img_emb, question, TOKENIZER)
        return str(ans).strip()

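    # Text-only fallback: without the remote helpers there is no way to feed the
    # pixels in, so "<image>" below is a literal string and the model answers
    # from the question text alone.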
    prompt = f"<image>\n\nQuestion: {question}\n\nAnswer:"
    with torch.no_grad():
        inputs = TOKENIZER(prompt, return_tensors="pt").to(DEVICE)
        out_ids = MODEL.generate(
            **inputs,
            max_new_tokens=128,
            pad_token_id=TOKENIZER.eos_token_id,
        )
    out_text = TOKENIZER.batch_decode(out_ids, skip_special_tokens=True)[0]
    return out_text.strip()

# ---- Inference (now with model selection) ------------------------------------
def infer(image: Image.Image, question: str, model_choice: str) -> str:
    if model_choice == SMOL_MODEL_ID:
        if SMOL_INIT_ERR:
            return f"⚠️ {SMOL_INIT_ERR}"
        if image is None:
            return "Please upload an image."
        q = (question or "").strip()
        if not q:
            return "Please enter a question."
        try:
            out = SMOL_PIPE(
                text=[{
                    "role": "user",
                    "content": [
                        {"type": "image", "image": image},
                        {"type": "text", "text": q},
                    ],
                }],
                max_new_tokens=128,
            )
        except Exception:
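            # Some pipeline versions reject the chat-message format above;
            # retry with a plain dict payload.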
            out = SMOL_PIPE({"images": [image], "text": q}, max_new_tokens=128)
        return _normalize(out).strip() or "(empty response)"

    # Default path: Moondream2
    if INIT_ERR:
        return f"⚠️ Init error:\n{INIT_ERR}"
    if image is None:
        return "Please upload an image."
    q = (question or "").strip()
    if not q:
        return "Please enter a question."

    try:
        if MODE == "itt":
            try:
                out = PIPE(
                    text=[{
                        "role": "user",
                        "content": [
                            {"type": "image", "image": image},
                            {"type": "text", "text": q},
                        ],
                    }],
                    max_new_tokens=128,
                )
            except Exception:
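                # Chat-format call rejected; retry with a plain dict payload.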
                out = PIPE({"images": [image], "text": q}, max_new_tokens=128)
            return _normalize(out).strip() or "(empty response)"

        if MODE == "vqa":
            out = PIPE(image=image, question=q)
            return _normalize(out).strip() or "(empty response)"

        if MODE == "remote":
            return _infer_remote(image, q) or "(empty response)"

        return "Unknown mode."
    except Exception as e:
        return f"⚠️ Inference error: {e}"

# ---- Gradio UI ---------------------------------------------------------------
with gr.Blocks(title="CPU Vision Q&A") as demo:
    gr.Markdown("## 🌙 Moondream2 & 🐣 SmolVLM — CPU Vision Q&A\n"
                "Upload an image, ask a question, and pick your model.")

    # Surface any startup failures directly in the UI
    if INIT_ERR:
        gr.Markdown(f"**Moondream startup status:** `{INIT_ERR}`")
    if SMOL_INIT_ERR:
        gr.Markdown(f"**SmolVLM startup status:** `{SMOL_INIT_ERR}`")

    with gr.Row():
        img = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            # Model selector (defaults to Moondream2)
            model_choice = gr.Dropdown(
                choices=[MOONDREAM_MODEL_ID, SMOL_MODEL_ID],
                value=MOONDREAM_MODEL_ID,
                label="Model",
            )
            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?")
            btn = gr.Button("Ask")
            ans = gr.TextArea(label="Answer", lines=6)

    # Both the button and pressing Enter in the textbox run inference
    btn.click(infer, [img, prompt, model_choice], ans)
    prompt.submit(infer, [img, prompt, model_choice], ans)

if __name__ == "__main__":
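    # queue() lines up concurrent requests instead of running them all at once on one CPU.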
    demo.queue().launch(debug=True)