# app.py — CPU-only image→text QA via Transformers pipeline + Gradio
from packaging import version
import transformers
from transformers import pipeline
import torch
import gradio as gr
from PIL import Image
# ---- Governance: ensure pipeline task is supported ----
MIN_TF = "4.46.0"
if version.parse(transformers.__version__) < version.parse(MIN_TF):
    raise RuntimeError(
        f"Transformers >= {MIN_TF} required for 'image-text-to-text'. "
        f"Found {transformers.__version__}. Upgrade:\n"
        f"  pip install -U 'transformers>={MIN_TF},<5'"
    )
# -------- Choose a CPU-friendly model here --------
# MODEL_ID = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
MODEL_ID = "vikhyatk/moondream2"
# MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct" # example tiny option
# ---- Force CPU posture ----
DEVICE = "cpu"
DTYPE = torch.float32 # CPU-safe
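# (float16 is often unsupported or slow for CPU inference, and bfloat16 support
#  varies by CPU generation — float32 is the safe default here.)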
# ---- Optional: torchvision is used by some processors (e.g., OneVision) ----
try:
    import torchvision  # noqa: F401
except Exception:
    pass  # If your chosen model needs it, install torchvision
# ---- Bootstrap pipeline (CPU only) ----
pipe = pipeline(
    task="image-text-to-text",
    model=MODEL_ID,
    device=DEVICE,       # <- forces CPU
    torch_dtype=DTYPE,   # <- CPU-safe dtype (pipeline() expects torch_dtype, not dtype)
    trust_remote_code=True,
    use_fast=True,       # use the fast tokenizer/processor when the model provides one
)
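# Quick smoke test — a minimal sketch to confirm the pipeline loads and answers
# before wiring up the UI. Commented out so it doesn't run at import time; the
# image and prompt below are purely illustrative:
#   from PIL import Image as _Image
#   _img = _Image.new("RGB", (64, 64), "white")
#   _msgs = [{"role": "user", "content": [
#       {"type": "image", "image": _img},
#       {"type": "text", "text": "What color is this image?"},
#   ]}]
#   print(pipe(text=_msgs, max_new_tokens=16))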
def _extract_text(obj):
    """Normalize pipeline outputs to plain text (handles chat-style payloads)."""
    if obj is None:
        return ""
    if isinstance(obj, str):
        return obj
    if isinstance(obj, dict):
        gen = obj.get("generated_text")
        if isinstance(gen, str):
            return gen
        if isinstance(gen, (list, tuple)) and gen:
            # Prefer the most recent assistant turn if present
            for turn in reversed(gen):
                if isinstance(turn, dict) and turn.get("role") == "assistant":
                    content = turn.get("content")
                    return " ".join(map(str, content)) if isinstance(content, list) else str(content or "")
            return _extract_text(gen[0])
        if "text" in obj and isinstance(obj["text"], str):
            return obj["text"]
        return str(obj)
    if isinstance(obj, (list, tuple)) and obj:
        return _extract_text(obj[0])
    return str(obj)
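# Example payloads _extract_text() normalizes — shapes are illustrative of
# common pipeline outputs, not an exhaustive contract:
#   [{"generated_text": "a red stamp"}]                       -> "a red stamp"
#   [{"generated_text": [{"role": "user", "content": [...]},
#                        {"role": "assistant",
#                         "content": "Yes, bottom right."}]}] -> "Yes, bottom right."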
def infer(image: Image.Image, question: str) -> str:
    if image is None:
        return "Please upload an image."
    q = (question or "").strip()
    if not q:
        return "Please enter a question."
    # Preferred: chat-style messages (the pipeline injects image tokens correctly)
    try:
        out = pipe(
            text=[{
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": q},
                ],
            }],
            max_new_tokens=96,
        )
    except Exception:
        # Fallback: explicit keyword arguments — pass images as a LIST
        out = pipe(images=[image], text=q, max_new_tokens=96)
    return _extract_text(out).strip() or "(empty response)"
# ---- Gradio UI ----
with gr.Blocks(title="CPU-only Vision QA") as demo:
    gr.Markdown("## 🧠🖼️ CPU-only Vision Q&A\nDrop an image, ask a question. Runs entirely on CPU.")
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?", lines=2)
            submit = gr.Button("Ask")
    out = gr.TextArea(label="Answer", lines=6)
    submit.click(infer, [img, prompt], out)
    prompt.submit(infer, [img, prompt], out)

if __name__ == "__main__":
    demo.queue().launch(debug=True)
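# To run locally (the dependency list is an assumption based on the imports above):
#   pip install -U "transformers>=4.46,<5" torch gradio pillow packaging
#   python app.py
# Gradio prints a local URL (default http://127.0.0.1:7860) once the model loads.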