# app.py — CPU-only Gradio for vikhyatk/moondream2 with resilient fallbacks + selectable SmolVLM

from packaging import version
import transformers
import torch
import gradio as gr
from PIL import Image
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

MIN_TF = "4.51.0"  # newer transformers versions are friendlier to custom multimodal configs
if version.parse(transformers.__version__) < version.parse(MIN_TF):
    raise RuntimeError(
        f"Transformers >= {MIN_TF} required for Moondream2. "
        f"Found {transformers.__version__}. Upgrade:\n"
        f"  pip install -U 'transformers>={MIN_TF},<5'"
    )

# --- Models ---
MOONDREAM_MODEL_ID = "vikhyatk/moondream2"
# Pin to a stable snapshot to avoid “new version downloaded” surprises.
PINNED_REV = "6b714b26eea5cbd9f31e4edb2541c170afa935ba"
SMOL_MODEL_ID = "HuggingFaceTB/SmolVLM-500M-Instruct"

DEVICE = "cpu"
DTYPE = torch.float32

# ---- Moondream bootstrap strategy -------------------------------------------
# 1) Try the image-text-to-text pipeline (preferred for Q&A).
# 2) If it rejects the custom config, try the visual-question-answering pipeline.
# 3) If that also fails, load the model with trust_remote_code and call its remote methods.
PIPE = None
MODE = None  # "itt" | "vqa" | "remote"
MODEL = None
TOKENIZER = None
INIT_ERR = None


def _try_itt():
    global PIPE, MODE
    PIPE = pipeline(
        "image-text-to-text",
        model=MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        device=DEVICE,
        torch_dtype=DTYPE,  # "torch_dtype" is the spelling supported across the pinned 4.x range
        trust_remote_code=True,
        use_fast=True,
    )
    MODE = "itt"


def _try_vqa():
    global PIPE, MODE
    PIPE = pipeline(
        "visual-question-answering",
        model=MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        device=DEVICE,
        trust_remote_code=True,
    )
    MODE = "vqa"


def _try_remote():
    # Some Moondream2 snapshots expose custom helper methods via remote code.
    global MODEL, TOKENIZER, MODE
    TOKENIZER = AutoTokenizer.from_pretrained(
        MOONDREAM_MODEL_ID, revision=PINNED_REV, trust_remote_code=True
    )
    MODEL = AutoModelForCausalLM.from_pretrained(
        MOONDREAM_MODEL_ID,
        revision=PINNED_REV,
        trust_remote_code=True,
        torch_dtype=DTYPE,
        device_map=None,
    ).to(DEVICE)
    MODE = "remote"


def _boot():
    global INIT_ERR
    try:
        _try_itt()
        return
    except Exception as e_itt:
        try:
            _try_vqa()
            return
        except Exception as e_vqa:
            try:
                _try_remote()
                return
            except Exception as e_remote:
                INIT_ERR = (
                    "Moondream2 initialization failed.\n\n"
                    f"ITT error: {e_itt}\n\n"
                    f"VQA error: {e_vqa}\n\n"
                    f"Remote error: {e_remote}"
                )


_boot()

# ---- SmolVLM (CPU) pipeline --------------------------------------------------
SMOL_PIPE = None
SMOL_INIT_ERR = None
try:
    SMOL_PIPE = pipeline(
        "image-text-to-text",
        model=SMOL_MODEL_ID,
        device=DEVICE,
        torch_dtype=DTYPE,  # same 4.x-compatible spelling as above
        use_fast=True,
        trust_remote_code=True,  # harmless if not needed
    )
except Exception as e:
    SMOL_INIT_ERR = f"SmolVLM init failed: {e}"


# ---- Shared helpers ----------------------------------------------------------
def _normalize(out):
    """Normalize pipeline outputs to a plain string (assistant text only)."""
    if out is None:
        return ""
    if isinstance(out, str):
        return out
    if isinstance(out, dict):
        gen = out.get("generated_text")
        if isinstance(gen, str):
            return gen
        if isinstance(gen, (list, tuple)) and gen:
            # Chat-style output: walk backwards to the last assistant turn.
            for turn in reversed(gen):
                if isinstance(turn, dict) and turn.get("role") == "assistant":
                    c = turn.get("content")
                    return " ".join(map(str, c)) if isinstance(c, list) else str(c or "")
            return _normalize(gen[0])
        if isinstance(out.get("text"), str):
            return out["text"]
        return str(out)
    if isinstance(out, (list, tuple)) and out:
        first = out[0]
        if isinstance(first, dict):
            if "generated_text" in first and isinstance(first["generated_text"], str):
                return first["generated_text"]
            if "answer" in first and isinstance(first["answer"], str):
                return first["answer"]
        return _normalize(first)
    return str(out)

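# A quick illustration of the output shapes _normalize handles; the payloads
# below are hypothetical examples, not captured pipeline output:
#   _normalize([{"generated_text": "a cat on a mat"}])                        -> "a cat on a mat"
#   _normalize({"generated_text": [{"role": "assistant", "content": "hi"}]})  -> "hi"
#   _normalize([{"answer": "yes"}])                                           -> "yes"
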
first["generated_text"] if "answer" in first and isinstance(first["answer"], str): return first["answer"] return _normalize(first) return str(out) def _infer_remote(image: Image.Image, question: str) -> str: """Moondream2 last-resort path via remote-code helpers.""" if hasattr(MODEL, "encode_image") and hasattr(MODEL, "answer_question"): with torch.no_grad(): img_emb = MODEL.encode_image(image.convert("RGB")) ans = MODEL.answer_question(img_emb, question) return str(ans).strip() prompt = f"\n\nQuestion: {question}\n\nAnswer:" with torch.no_grad(): inputs = TOKENIZER(prompt, return_tensors="pt").to(DEVICE) out_ids = MODEL.generate( **inputs, max_new_tokens=128, pad_token_id=TOKENIZER.eos_token_id, ) out_text = TOKENIZER.batch_decode(out_ids, skip_special_tokens=True)[0] return out_text.strip() # ---- Inference (now with model selection) ------------------------------------ def infer(image: Image.Image, question: str, model_choice: str) -> str: if model_choice == "HuggingFaceTB/SmolVLM-500M-Instruct": if SMOL_INIT_ERR: return f"⚠️ {SMOL_INIT_ERR}" if image is None: return "Please upload an image." q = (question or "").strip() if not q: return "Please enter a question." try: out = SMOL_PIPE( text=[{ "role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": q}, ], }], max_new_tokens=128, ) except Exception: out = SMOL_PIPE({"images": [image], "text": q}, max_new_tokens=128) return _normalize(out).strip() or "(empty response)" # Default path: Moondream2 (unchanged logic) if INIT_ERR: return f"⚠️ Init error:\n{INIT_ERR}" if image is None: return "Please upload an image." q = (question or "").strip() if not q: return "Please enter a question." try: if MODE == "itt": try: out = PIPE( text=[{ "role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": q}, ], }], max_new_tokens=128, ) except Exception: out = PIPE({"images": [image], "text": q}, max_new_tokens=128) return _normalize(out).strip() or "(empty response)" if MODE == "vqa": out = PIPE(image=image, question=q) return _normalize(out).strip() or "(empty response)" if MODE == "remote": return _infer_remote(image, q) or "(empty response)" return "Unknown mode." except Exception as e: return f"⚠️ Inference error: {e}" # ---- Gradio UI --------------------------------------------------------------- with gr.Blocks(title="CPU Vision Q&A") as demo: gr.Markdown("## 🌙 Moondream2 & 🐣 SmolVLM — CPU Vision Q&A\n" "Upload an image, ask a question, and pick your model.") # Show Moondream init status (kept from your original app) if INIT_ERR: gr.Markdown(f"**Moondream startup status:** `{INIT_ERR}`") if SMOL_INIT_ERR: gr.Markdown(f"**SmolVLM startup status:** `{SMOL_INIT_ERR}`") with gr.Row(): img = gr.Image(type="pil", label="Upload an image") with gr.Column(): # NEW: model selector (default = Moondream2) — minimal surface change model_choice = gr.Dropdown( choices=[MOONDREAM_MODEL_ID, SMOL_MODEL_ID], value=MOONDREAM_MODEL_ID, label="Model", ) prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?") btn = gr.Button("Ask") ans = gr.TextArea(label="Answer", lines=6) # Wire the new dropdown into the call; everything else is unchanged btn.click(infer, [img, prompt, model_choice], ans) prompt.submit(infer, [img, prompt, model_choice], ans) if __name__ == "__main__": demo.queue().launch(debug=True)