# Hugging Face Space: Qwen2-VL-2B demo — upload an image (or PDF first page) and ask a question.
import gradio as gr
from transformers import pipeline
from PIL import Image
import io
# ---------- optional: PDF -> PIL first page ----------
def pdf_first_page_to_pil(file_bytes: bytes) -> Image.Image:
    """Render the first page of a PDF (raw bytes) into an RGB PIL image at 200 dpi."""
    import fitz  # PyMuPDF — imported lazily so plain-image uploads work without it
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        first_page = doc[0]
        pixmap = first_page.get_pixmap(dpi=200)
        png_buffer = io.BytesIO(pixmap.tobytes("png"))
        return Image.open(png_buffer).convert("RGB")
# ---------- init model ----------
# Qwen2-VL-2B-Instruct via the "image-text-to-text" pipeline; loaded once at
# module import (model download/weights load happens here, before the UI starts).
pipe = pipeline("image-text-to-text", model="Qwen/Qwen2-VL-2B-Instruct")
# ---------- robust extractor: returns ONLY the model text ----------
def _only_model_text(out) -> str:
# Case 1: pipelines often return {"generated_text": "..."}
if isinstance(out, dict) and "generated_text" in out:
return out["generated_text"]
# Case 2: list of dicts (mixed roles)
if isinstance(out, list):
# Prefer any dict with generated_text first
for item in out:
if isinstance(item, dict) and "generated_text" in item:
return item["generated_text"]
# Otherwise find assistant role
for item in out:
if isinstance(item, dict) and item.get("role") == "assistant":
content = item.get("content")
if isinstance(content, str):
return content
if isinstance(content, list):
# collect text pieces within the assistant content
chunks = []
for c in content:
if isinstance(c, dict) and c.get("type") == "text" and isinstance(c.get("text"), str):
chunks.append(c["text"])
if chunks:
return "\n".join(chunks)
# Fallback
return str(out)
def infer(file_obj, prompt):
    """Run the VLM on an uploaded image/PDF plus a prompt; return only the reply text.

    Returns a user-facing message string when the file or prompt is missing,
    or when the upload cannot be decoded.
    """
    # guard clauses: both inputs are required
    if file_obj is None:
        return "Please upload an image or PDF."
    if not prompt or not prompt.strip():
        return "Please enter a prompt."
    # read the uploaded file's raw bytes
    with open(file_obj.name, "rb") as fh:
        payload = fh.read()
    # decode into a PIL image; PDFs (by extension or magic bytes) use page 1
    lowered_name = (file_obj.name or "").lower()
    looks_like_pdf = lowered_name.endswith(".pdf") or payload[:4] == b"%PDF"
    try:
        if looks_like_pdf:
            pil_img = pdf_first_page_to_pil(payload)
        else:
            pil_img = Image.open(io.BytesIO(payload)).convert("RGB")
    except Exception as e:
        return f"Failed to read the file: {e}"
    # single user turn in Qwen2-VL chat format: image first, then the question
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": pil_img},
            {"type": "text", "text": prompt.strip()}
        ]
    }]
    # run the model and strip everything but the assistant text
    result = pipe(text=messages, max_new_tokens=256)
    return _only_model_text(result)
# ---------- Gradio UI ----------
# Blocks layout: file + prompt inputs in one row, a Run button, and a
# copyable output textbox that the CSS below lets grow vertically.
with gr.Blocks(
    title="Qwen2-VL-2B — File + Prompt",
    css="""
    /* make the output box grow nicely */
    #resp_out textarea {min-height: 220px;}
    """
) as demo:
    gr.Markdown("### Qwen2-VL-2B — Upload an image (or PDF first page) and ask a question.")
    with gr.Row():
        # accepts any image type plus .pdf (only the first PDF page is used)
        file_in = gr.File(label="Upload image or PDF", file_types=["image", ".pdf"])
        prompt_in = gr.Textbox(label="Prompt", placeholder="Ask anything…", lines=3)
    run_btn = gr.Button("Run")
    # output textbox that expands (via CSS above)
    resp_out = gr.Textbox(
        label="Model Response",
        lines=8,
        show_copy_button=True,
        elem_id="resp_out"  # matched by the css= rule passed to gr.Blocks
    )
    # wire the button to infer(file, prompt) -> response text
    run_btn.click(fn=infer, inputs=[file_in, prompt_in], outputs=[resp_out])
if __name__ == "__main__":
    demo.launch()