# app.py — CPU-only image→text QA via Transformers pipeline + Gradio
from packaging import version
import transformers
from transformers import pipeline
import torch
import gradio as gr
from PIL import Image
# ---- Governance: ensure pipeline task is supported ----
MIN_TF = "4.46.0"
if version.parse(transformers.__version__) < version.parse(MIN_TF):
    raise RuntimeError(
        f"Transformers >= {MIN_TF} required for 'image-text-to-text'. "
        f"Found {transformers.__version__}. Upgrade:\n"
        f"  pip install -U 'transformers>={MIN_TF},<5'"
    )
# -------- Choose a CPU-friendly model here --------
# MODEL_ID = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
MODEL_ID = "vikhyatk/moondream2"
# MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct" # example tiny option
# ---- Force CPU posture ----
DEVICE = "cpu"
DTYPE = torch.float32 # CPU-safe
# ---- Optional: torchvision is used by some processors (e.g., OneVision) ----
try:
    import torchvision  # noqa: F401
except Exception:
    pass  # If your chosen model needs it, install torchvision
# ---- Bootstrap pipeline (CPU only) ----
pipe = pipeline(
    task="image-text-to-text",
    model=MODEL_ID,
    device=DEVICE,          # <- forces CPU
    torch_dtype=DTYPE,      # <- CPU-safe dtype (kwarg name valid across the pinned >=4.46,<5 range)
    trust_remote_code=True,
    use_fast=True,          # use the fast processor if the model provides one
)
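# Optional startup smoke test (a minimal sketch; assumes the chosen checkpoint
# accepts chat-style inputs via this pipeline). Uncomment to verify generation
# works before wiring up the UI:
# _probe = Image.new("RGB", (64, 64), "white")
# print(pipe(
#     text=[{"role": "user", "content": [
#         {"type": "image", "image": _probe},
#         {"type": "text", "text": "Describe this image."},
#     ]}],
#     max_new_tokens=16,
# ))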
def _extract_text(obj):
    """Normalize pipeline outputs to plain text (handles chat-style payloads)."""
    if obj is None:
        return ""
    if isinstance(obj, str):
        return obj
    if isinstance(obj, dict):
        gen = obj.get("generated_text")
        if isinstance(gen, str):
            return gen
        if isinstance(gen, (list, tuple)) and gen:
            # Prefer assistant turns if present
            for turn in reversed(gen):
                if isinstance(turn, dict) and turn.get("role") == "assistant":
                    content = turn.get("content")
                    return " ".join(map(str, content)) if isinstance(content, list) else str(content or "")
            return _extract_text(gen[0])
        if "text" in obj and isinstance(obj["text"], str):
            return obj["text"]
        return str(obj)
    if isinstance(obj, (list, tuple)) and obj:
        return _extract_text(obj[0])
    return str(obj)
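# Illustrative payload shapes _extract_text handles (hypothetical values, not
# captured model output):
#   _extract_text([{"generated_text": "A dog."}])        -> "A dog."
#   _extract_text([{"generated_text": [
#       {"role": "user", "content": "What is this?"},
#       {"role": "assistant", "content": "A dog."},
#   ]}])                                                 -> "A dog."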
def infer(image: Image.Image, question: str) -> str:
    if image is None:
        return "Please upload an image."
    q = (question or "").strip()
    if not q:
        return "Please enter a question."
    # Preferred: chat-style messages (auto-injects image tokens correctly)
    try:
        out = pipe(
            text=[{
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": q},
                ],
            }],
            max_new_tokens=96,
        )
    except Exception:
        # Fallback: dict API; make sure "images" is a LIST
        out = pipe({"images": [image], "text": q}, max_new_tokens=96)
    return _extract_text(out).strip() or "(empty response)"
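# Headless usage sketch ("sample.jpg" is a hypothetical local file):
#   print(infer(Image.open("sample.jpg"), "Is there a stamp or signature?"))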
# ---- Gradio UI ----
with gr.Blocks(title="CPU-only Vision QA") as demo:
    gr.Markdown("## 🧠🖼️ CPU-only Vision Q&A\nDrop an image, ask a question. Runs entirely on CPU.")
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            prompt = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?", lines=2)
            submit = gr.Button("Ask")
    out = gr.TextArea(label="Answer", lines=6)
    submit.click(infer, [img, prompt], out)
    prompt.submit(infer, [img, prompt], out)
if __name__ == "__main__":
    demo.queue().launch(debug=True)