# DocQwen2VL / app.py
# Hugging Face Space by KarthiEz (commit 2b450ba, verified)
import gradio as gr
from transformers import pipeline
from PIL import Image
import io
# ---------- optional: PDF -> PIL first page ----------
def pdf_first_page_to_pil(file_bytes: bytes) -> Image.Image:
    """Render page 1 of a PDF (given as raw bytes) to an RGB PIL image."""
    import fitz  # PyMuPDF — imported lazily so image-only usage works without it
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        first_page = doc[0]
        # 200 dpi keeps text legible for the VLM without huge bitmaps
        png_bytes = first_page.get_pixmap(dpi=200).tobytes("png")
    return Image.open(io.BytesIO(png_bytes)).convert("RGB")
# ---------- init model ----------
# Loaded once at module import; the multimodal "image-text-to-text" pipeline
# downloads/loads Qwen2-VL-2B-Instruct weights on first run.
pipe = pipeline("image-text-to-text", model="Qwen/Qwen2-VL-2B-Instruct")
# ---------- robust extractor: returns ONLY the model text ----------
def _only_model_text(out) -> str:
# Case 1: pipelines often return {"generated_text": "..."}
if isinstance(out, dict) and "generated_text" in out:
return out["generated_text"]
# Case 2: list of dicts (mixed roles)
if isinstance(out, list):
# Prefer any dict with generated_text first
for item in out:
if isinstance(item, dict) and "generated_text" in item:
return item["generated_text"]
# Otherwise find assistant role
for item in out:
if isinstance(item, dict) and item.get("role") == "assistant":
content = item.get("content")
if isinstance(content, str):
return content
if isinstance(content, list):
# collect text pieces within the assistant content
chunks = []
for c in content:
if isinstance(c, dict) and c.get("type") == "text" and isinstance(c.get("text"), str):
chunks.append(c["text"])
if chunks:
return "\n".join(chunks)
# Fallback
return str(out)
def infer(file_obj, prompt):
    """Answer `prompt` about an uploaded image (or the first page of a PDF).

    Parameters
    ----------
    file_obj : value from ``gr.File`` — a plain filepath ``str`` on modern
        gradio (``type="filepath"`` default), or a tempfile-like object
        exposing ``.name`` on older versions. May be ``None``.
    prompt : str
        The user's question for the model.

    Returns
    -------
    str — the assistant's text, or a human-readable error message.
    """
    if file_obj is None:
        return "Please upload an image or PDF."
    if not prompt or not prompt.strip():
        return "Please enter a prompt."
    # Fix: gr.File hands back a str path on gradio >= 4; the old code's
    # unconditional `file_obj.name` raised AttributeError there.
    path = file_obj if isinstance(file_obj, str) else (getattr(file_obj, "name", "") or "")
    # read file — inside try so an unreadable path yields the error string
    # instead of an unhandled traceback in the UI
    try:
        with open(path, "rb") as f:
            raw = f.read()
    except OSError as e:
        return f"Failed to read the file: {e}"
    # load PIL: detect PDFs by extension or %PDF magic bytes
    try:
        if path.lower().endswith(".pdf") or raw[:4] == b"%PDF":
            pil_img = pdf_first_page_to_pil(raw)
        else:
            pil_img = Image.open(io.BytesIO(raw)).convert("RGB")
    except Exception as e:
        return f"Failed to read the file: {e}"
    # build messages in Qwen2-VL chat format (image part + text part)
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": pil_img},
            {"type": "text", "text": prompt.strip()}
        ]
    }]
    # run model
    out = pipe(text=messages, max_new_tokens=256)
    # return ONLY the assistant text
    return _only_model_text(out)
# ---------- Gradio UI ----------
with gr.Blocks(
    title="Qwen2-VL-2B — File + Prompt",
    css="""
    /* make the output box grow nicely */
    #resp_out textarea {min-height: 220px;}
    """
) as demo:
    gr.Markdown("### Qwen2-VL-2B — Upload an image (or PDF first page) and ask a question.")
    # file picker and prompt side by side
    with gr.Row():
        file_in = gr.File(label="Upload image or PDF", file_types=["image", ".pdf"])
        prompt_in = gr.Textbox(label="Prompt", placeholder="Ask anything…", lines=3)
    run_btn = gr.Button("Run")
    # output textbox that expands (via CSS above)
    resp_out = gr.Textbox(
        label="Model Response",
        lines=8,
        show_copy_button=True,
        elem_id="resp_out"  # hooked by the min-height CSS rule
    )
    # wire the button: infer(file, prompt) -> response text
    run_btn.click(fn=infer, inputs=[file_in, prompt_in], outputs=[resp_out])
if __name__ == "__main__":
    demo.launch()