|
|
|
|
|
|
|
|
from packaging import version |
|
|
import transformers |
|
|
from transformers import pipeline |
|
|
import torch |
|
|
import gradio as gr |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
# Minimum Transformers release that ships the "image-text-to-text" task.
MIN_TF = "4.46.0"

# Fail fast at import time with an actionable message instead of letting the
# pipeline() call below die with a cryptic unknown-task error.
_installed = version.parse(transformers.__version__)
_required = version.parse(MIN_TF)
if _installed < _required:
    raise RuntimeError(
        f"Transformers >= {MIN_TF} required for 'image-text-to-text'. "
        f"Found {transformers.__version__}. Upgrade:\n"
        f"    pip install -U 'transformers>={MIN_TF},<5'"
    )
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face Hub id of the vision-language model served by this app.
MODEL_ID = "vikhyatk/moondream2"


# CPU-only deployment: float32 is the safe dtype on CPU (half-precision CPU
# kernels are not generally available).
DEVICE = "cpu"


DTYPE = torch.float32


# Best-effort probe for torchvision: some image processors need it at runtime.
# Failures are deliberately swallowed here — if torchvision is truly required,
# the error will surface later with more context when an image is processed.
try:
    import torchvision
except Exception:
    pass


# Build the multimodal pipeline once at import time (downloads/loads the model,
# so startup is slow by design rather than the first request).
pipe = pipeline(
    task="image-text-to-text",
    model=MODEL_ID,
    device=DEVICE,
    dtype=DTYPE,
    # SECURITY NOTE(review): trust_remote_code executes Python from the model
    # repository. Acceptable only because MODEL_ID is pinned to a known
    # publisher — do not make MODEL_ID user-configurable without revisiting.
    trust_remote_code=True,
    # Prefer the fast (Rust-backed) tokenizer/processor when available.
    use_fast=True,
)
|
|
|
|
|
def _extract_text(obj): |
|
|
"""Normalize pipeline outputs to plain text (handles chat-style payloads).""" |
|
|
if obj is None: |
|
|
return "" |
|
|
if isinstance(obj, str): |
|
|
return obj |
|
|
if isinstance(obj, dict): |
|
|
gen = obj.get("generated_text") |
|
|
if isinstance(gen, str): |
|
|
return gen |
|
|
if isinstance(gen, (list, tuple)) and gen: |
|
|
|
|
|
for turn in reversed(gen): |
|
|
if isinstance(turn, dict) and turn.get("role") == "assistant": |
|
|
content = turn.get("content") |
|
|
return " ".join(map(str, content)) if isinstance(content, list) else str(content or "") |
|
|
return _extract_text(gen[0]) |
|
|
if "text" in obj and isinstance(obj["text"], str): |
|
|
return obj["text"] |
|
|
return str(obj) |
|
|
if isinstance(obj, (list, tuple)) and obj: |
|
|
return _extract_text(obj[0]) |
|
|
return str(obj) |
|
|
|
|
|
def infer(image: Image.Image, question: str) -> str:
    """Answer *question* about *image* via the module-level pipeline.

    Always returns a user-facing string (including for validation failures),
    so it can be wired directly to a Gradio output component.
    """
    if image is None:
        return "Please upload an image."
    query = (question or "").strip()
    if not query:
        return "Please enter a question."

    # Preferred path: the chat-message format expected by modern
    # image-text-to-text pipelines.
    message = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": query},
        ],
    }
    try:
        raw = pipe(text=[message], max_new_tokens=96)
    except Exception:
        # Best-effort fallback for pipelines/models that only accept the
        # legacy {"images": ..., "text": ...} call convention.
        raw = pipe({"images": [image], "text": query}, max_new_tokens=96)

    return _extract_text(raw).strip() or "(empty response)"
|
|
|
|
|
|
|
|
# --- Gradio UI: image + question in, answer text out ---------------------
with gr.Blocks(title="CPU-only Vision QA") as demo:
    gr.Markdown("## 🧠🖼️ CPU-only Vision Q&A\nDrop an image, ask a question. Runs entirely on CPU.")
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            question_box = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?", lines=2)
            ask_button = gr.Button("Ask")
            answer_box = gr.TextArea(label="Answer", lines=6)

    # Both clicking the button and pressing Enter in the textbox run inference.
    ask_button.click(infer, [image_input, question_box], answer_box)
    question_box.submit(infer, [image_input, question_box], answer_box)


if __name__ == "__main__":
    # queue() serializes requests — important on CPU where inference is slow.
    demo.queue().launch(debug=True)
|
|
|