# DocQwen2VL / app.py
# Hugging Face Space by KarthiEz (commit 2b450ba, verified)
import gradio as gr
from transformers import pipeline
from PIL import Image
import io
# ---------- optional: PDF -> PIL first page ----------
def pdf_first_page_to_pil(file_bytes: bytes) -> Image.Image:
    """Render page 1 of a PDF (given as raw bytes) to an RGB PIL image."""
    import fitz  # PyMuPDF — imported lazily so image-only usage works without it
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        first_page = doc[0]
        # 200 dpi keeps text legible for the VLM without huge bitmaps
        png_bytes = first_page.get_pixmap(dpi=200).tobytes("png")
    return Image.open(io.BytesIO(png_bytes)).convert("RGB")
# ---------- init model ----------
# Loaded once at module import; the multimodal "image-text-to-text" pipeline
# downloads/loads Qwen2-VL-2B-Instruct weights on first run.
pipe = pipeline("image-text-to-text", model="Qwen/Qwen2-VL-2B-Instruct")
# ---------- robust extractor: returns ONLY the model text ----------
def _only_model_text(out) -> str:
# Case 1: pipelines often return {"generated_text": "..."}
if isinstance(out, dict) and "generated_text" in out:
return out["generated_text"]
# Case 2: list of dicts (mixed roles)
if isinstance(out, list):
# Prefer any dict with generated_text first
for item in out:
if isinstance(item, dict) and "generated_text" in item:
return item["generated_text"]
# Otherwise find assistant role
for item in out:
if isinstance(item, dict) and item.get("role") == "assistant":
content = item.get("content")
if isinstance(content, str):
return content
if isinstance(content, list):
# collect text pieces within the assistant content
chunks = []
for c in content:
if isinstance(c, dict) and c.get("type") == "text" and isinstance(c.get("text"), str):
chunks.append(c["text"])
if chunks:
return "\n".join(chunks)
# Fallback
return str(out)
def infer(file_obj, prompt):
    """Answer `prompt` about an uploaded image (or the first page of a PDF).

    Parameters
    ----------
    file_obj : value from ``gr.File`` — a plain filepath ``str`` on modern
        gradio (``type="filepath"`` default), or a tempfile-like object
        exposing ``.name`` on older versions. May be ``None``.
    prompt : str
        The user's question for the model.

    Returns
    -------
    str — the assistant's text, or a human-readable error message.
    """
    if file_obj is None:
        return "Please upload an image or PDF."
    if not prompt or not prompt.strip():
        return "Please enter a prompt."
    # Fix: gr.File hands back a str path on gradio >= 4; the old code's
    # unconditional `file_obj.name` raised AttributeError there.
    path = file_obj if isinstance(file_obj, str) else (getattr(file_obj, "name", "") or "")
    # read file — inside try so an unreadable path yields the error string
    # instead of an unhandled traceback in the UI
    try:
        with open(path, "rb") as f:
            raw = f.read()
    except OSError as e:
        return f"Failed to read the file: {e}"
    # load PIL: detect PDFs by extension or %PDF magic bytes
    try:
        if path.lower().endswith(".pdf") or raw[:4] == b"%PDF":
            pil_img = pdf_first_page_to_pil(raw)
        else:
            pil_img = Image.open(io.BytesIO(raw)).convert("RGB")
    except Exception as e:
        return f"Failed to read the file: {e}"
    # build messages in Qwen2-VL chat format (image part + text part)
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": pil_img},
            {"type": "text", "text": prompt.strip()}
        ]
    }]
    # run model
    out = pipe(text=messages, max_new_tokens=256)
    # return ONLY the assistant text
    return _only_model_text(out)
# ---------- Gradio UI ----------
with gr.Blocks(
    title="Qwen2-VL-2B — File + Prompt",
    css="""
    /* make the output box grow nicely */
    #resp_out textarea {min-height: 220px;}
    """
) as demo:
    gr.Markdown("### Qwen2-VL-2B — Upload an image (or PDF first page) and ask a question.")
    # file picker and prompt side by side
    with gr.Row():
        file_in = gr.File(label="Upload image or PDF", file_types=["image", ".pdf"])
        prompt_in = gr.Textbox(label="Prompt", placeholder="Ask anything…", lines=3)
    run_btn = gr.Button("Run")
    # output textbox that expands (via CSS above)
    resp_out = gr.Textbox(
        label="Model Response",
        lines=8,
        show_copy_button=True,
        elem_id="resp_out"  # hooked by the min-height CSS rule
    )
    # wire the button: infer(file, prompt) -> response text
    run_btn.click(fn=infer, inputs=[file_in, prompt_in], outputs=[resp_out])
if __name__ == "__main__":
    demo.launch()