|
|
|
|
|
|
|
|
from packaging import version |
|
|
import transformers |
|
|
from transformers import pipeline |
|
|
import torch |
|
|
import gradio as gr |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
# Minimum Transformers release that ships the "image-text-to-text" task.
MIN_TF = "4.46.0"

# Fail fast at import time with an actionable message instead of letting the
# pipeline() call below die with a cryptic unknown-task error.
_installed = version.parse(transformers.__version__)
_required = version.parse(MIN_TF)
if _installed < _required:
    raise RuntimeError(
        f"Transformers >= {MIN_TF} required for 'image-text-to-text'. "
        f"Found {transformers.__version__}. Upgrade:\n"
        f"    pip install -U 'transformers>={MIN_TF},<5'"
    )
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face Hub id of the vision-language model served by this app.
MODEL_ID = "vikhyatk/moondream2"


# CPU-only deployment: float32 is the safe dtype on CPU (half-precision CPU
# kernels are not generally available).
DEVICE = "cpu"


DTYPE = torch.float32


# Best-effort probe for torchvision: some image processors need it at runtime.
# Failures are deliberately swallowed here — if torchvision is truly required,
# the error will surface later with more context when an image is processed.
try:
    import torchvision
except Exception:
    pass


# Build the multimodal pipeline once at import time (downloads/loads the model,
# so startup is slow by design rather than the first request).
pipe = pipeline(
    task="image-text-to-text",
    model=MODEL_ID,
    device=DEVICE,
    dtype=DTYPE,
    # SECURITY NOTE(review): trust_remote_code executes Python from the model
    # repository. Acceptable only because MODEL_ID is pinned to a known
    # publisher — do not make MODEL_ID user-configurable without revisiting.
    trust_remote_code=True,
    # Prefer the fast (Rust-backed) tokenizer/processor when available.
    use_fast=True,
)
|
|
|
|
|
def _extract_text(obj): |
|
|
"""Normalize pipeline outputs to plain text (handles chat-style payloads).""" |
|
|
if obj is None: |
|
|
return "" |
|
|
if isinstance(obj, str): |
|
|
return obj |
|
|
if isinstance(obj, dict): |
|
|
gen = obj.get("generated_text") |
|
|
if isinstance(gen, str): |
|
|
return gen |
|
|
if isinstance(gen, (list, tuple)) and gen: |
|
|
|
|
|
for turn in reversed(gen): |
|
|
if isinstance(turn, dict) and turn.get("role") == "assistant": |
|
|
content = turn.get("content") |
|
|
return " ".join(map(str, content)) if isinstance(content, list) else str(content or "") |
|
|
return _extract_text(gen[0]) |
|
|
if "text" in obj and isinstance(obj["text"], str): |
|
|
return obj["text"] |
|
|
return str(obj) |
|
|
if isinstance(obj, (list, tuple)) and obj: |
|
|
return _extract_text(obj[0]) |
|
|
return str(obj) |
|
|
|
|
|
def infer(image: Image.Image, question: str) -> str:
    """Answer *question* about *image* via the module-level pipeline.

    Always returns a user-facing string (including for validation failures),
    so it can be wired directly to a Gradio output component.
    """
    if image is None:
        return "Please upload an image."
    query = (question or "").strip()
    if not query:
        return "Please enter a question."

    # Preferred path: the chat-message format expected by modern
    # image-text-to-text pipelines.
    message = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": query},
        ],
    }
    try:
        raw = pipe(text=[message], max_new_tokens=96)
    except Exception:
        # Best-effort fallback for pipelines/models that only accept the
        # legacy {"images": ..., "text": ...} call convention.
        raw = pipe({"images": [image], "text": query}, max_new_tokens=96)

    return _extract_text(raw).strip() or "(empty response)"
|
|
|
|
|
|
|
|
# --- Gradio UI: image + question in, answer text out ---------------------
with gr.Blocks(title="CPU-only Vision QA") as demo:
    gr.Markdown("## 🧠🖼️ CPU-only Vision Q&A\nDrop an image, ask a question. Runs entirely on CPU.")
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            question_box = gr.Textbox(label="Question", placeholder="e.g., Is there a stamp or signature?", lines=2)
            ask_button = gr.Button("Ask")
            answer_box = gr.TextArea(label="Answer", lines=6)

    # Both clicking the button and pressing Enter in the textbox run inference.
    ask_button.click(infer, [image_input, question_box], answer_box)
    question_box.submit(infer, [image_input, question_box], answer_box)


if __name__ == "__main__":
    # queue() serializes requests — important on CPU where inference is slow.
    demo.queue().launch(debug=True)
|
|
|