# Hugging Face Spaces page header (scrape residue): "Spaces: Sleeping" — not part of the app code.
import io

import gradio as gr
from PIL import Image
from transformers import pipeline
# ---------- optional: PDF -> PIL first page ----------
def pdf_first_page_to_pil(file_bytes: bytes) -> Image.Image:
    """Render the first page of a PDF (raw bytes) to an RGB PIL image.

    Args:
        file_bytes: The complete PDF file contents.

    Returns:
        The first page rasterized at 200 dpi as an RGB ``PIL.Image``.

    Raises:
        ValueError: If the PDF contains no pages.
    """
    import fitz  # PyMuPDF — imported lazily so image-only use works without it

    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        # Guard: doc[0] on an empty document raises an opaque IndexError.
        if doc.page_count == 0:
            raise ValueError("PDF has no pages.")
        # 200 dpi keeps text legible for the VLM without huge memory use.
        pix = doc[0].get_pixmap(dpi=200)
        return Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
# ---------- init model ----------
# Loaded once at import time so every request reuses the same weights.
pipe = pipeline("image-text-to-text", model="Qwen/Qwen2-VL-2B-Instruct")
| # ---------- robust extractor: returns ONLY the model text ---------- | |
| def _only_model_text(out) -> str: | |
| # Case 1: pipelines often return {"generated_text": "..."} | |
| if isinstance(out, dict) and "generated_text" in out: | |
| return out["generated_text"] | |
| # Case 2: list of dicts (mixed roles) | |
| if isinstance(out, list): | |
| # Prefer any dict with generated_text first | |
| for item in out: | |
| if isinstance(item, dict) and "generated_text" in item: | |
| return item["generated_text"] | |
| # Otherwise find assistant role | |
| for item in out: | |
| if isinstance(item, dict) and item.get("role") == "assistant": | |
| content = item.get("content") | |
| if isinstance(content, str): | |
| return content | |
| if isinstance(content, list): | |
| # collect text pieces within the assistant content | |
| chunks = [] | |
| for c in content: | |
| if isinstance(c, dict) and c.get("type") == "text" and isinstance(c.get("text"), str): | |
| chunks.append(c["text"]) | |
| if chunks: | |
| return "\n".join(chunks) | |
| # Fallback | |
| return str(out) | |
def infer(file_obj, prompt):
    """Run Qwen2-VL on an uploaded image (or first PDF page) with a prompt.

    Args:
        file_obj: Gradio upload — either a plain ``str`` filepath (newer
            Gradio / ``type="filepath"``) or a tempfile-like object with a
            ``.name`` attribute. The original code assumed ``.name`` and
            crashed with AttributeError on plain strings.
        prompt: The user's question about the file.

    Returns:
        The model's reply as a string; user-facing error messages are also
        returned as strings so the output textbox always has content.
    """
    if file_obj is None:
        return "Please upload an image or PDF."
    if not prompt or not prompt.strip():
        return "Please enter a prompt."

    # Accept both str filepaths and file-like objects from Gradio.
    path = file_obj if isinstance(file_obj, str) else (file_obj.name or "")

    # read file
    with open(path, "rb") as f:
        raw = f.read()

    # load PIL — decide PDF vs image by extension, falling back to magic bytes
    try:
        if path.lower().endswith(".pdf") or raw[:4] == b"%PDF":
            pil_img = pdf_first_page_to_pil(raw)
        else:
            pil_img = Image.open(io.BytesIO(raw)).convert("RGB")
    except Exception as e:  # surface the failure in the UI instead of crashing
        return f"Failed to read the file: {e}"

    # build messages in Qwen2-VL chat format: one user turn with image + text
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": pil_img},
            {"type": "text", "text": prompt.strip()},
        ],
    }]

    # run model
    out = pipe(text=messages, max_new_tokens=256)
    # return ONLY the assistant text
    return _only_model_text(out)
# ---------- Gradio UI ----------
with gr.Blocks(
    title="Qwen2-VL-2B — File + Prompt",
    css="""
/* make the output box grow nicely */
#resp_out textarea {min-height: 220px;}
""",
) as demo:
    gr.Markdown("### Qwen2-VL-2B — Upload an image (or PDF first page) and ask a question.")
    with gr.Row():
        file_in = gr.File(label="Upload image or PDF", file_types=["image", ".pdf"])
        prompt_in = gr.Textbox(label="Prompt", placeholder="Ask anything…", lines=3)
    run_btn = gr.Button("Run")
    # output textbox that expands (via the CSS above, anchored on elem_id)
    resp_out = gr.Textbox(
        label="Model Response",
        lines=8,
        show_copy_button=True,
        elem_id="resp_out",
    )
    run_btn.click(fn=infer, inputs=[file_in, prompt_in], outputs=[resp_out])

if __name__ == "__main__":
    demo.launch()