KarthiEz committed
Commit 195b7ab · verified · 1 Parent(s): 5e4ed27

Update app.py

Files changed (1):
  app.py: +68 -35
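The updated app.py pulls in gradio, transformers, and Pillow at the top and lazily imports PyMuPDF (fitz) for PDF handling, so the Space needs all of these installed. A plausible requirements.txt for running this file (an assumption, not part of this commit):

gradio
transformers
torch  # assumed backend for the transformers pipeline
pillow
pymupdf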
app.py CHANGED
@@ -1,71 +1,104 @@
 import gradio as gr
 from transformers import pipeline
 from PIL import Image
-import io, os
+import io

-# Optional: PDF -> PIL (first page). Imported lazily to avoid extra cost if unused.
+# ---------- optional: PDF -> PIL first page ----------
 def pdf_first_page_to_pil(file_bytes: bytes) -> Image.Image:
     import fitz  # PyMuPDF
     with fitz.open(stream=file_bytes, filetype="pdf") as doc:
-        if doc.page_count == 0:
-            raise ValueError("Empty PDF uploaded.")
         page = doc[0]
         pix = page.get_pixmap(dpi=200)
-        img_bytes = pix.tobytes("png")
-        return Image.open(io.BytesIO(img_bytes)).convert("RGB")
+        return Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")

-# Initialize the multimodal pipeline once
-# Task: "image-text-to-text" for Qwen2-VL
+# ---------- init model ----------
 pipe = pipeline("image-text-to-text", model="Qwen/Qwen2-VL-2B-Instruct")

+# ---------- robust extractor: returns ONLY the model text ----------
+def _only_model_text(out) -> str:
+    # Case 1: pipelines often return {"generated_text": "..."}
+    if isinstance(out, dict) and "generated_text" in out:
+        return out["generated_text"]
+
+    # Case 2: list of dicts (mixed roles)
+    if isinstance(out, list):
+        # Prefer any dict with generated_text first
+        for item in out:
+            if isinstance(item, dict) and "generated_text" in item:
+                return item["generated_text"]
+        # Otherwise find assistant role
+        for item in out:
+            if isinstance(item, dict) and item.get("role") == "assistant":
+                content = item.get("content")
+                if isinstance(content, str):
+                    return content
+                if isinstance(content, list):
+                    # collect text pieces within the assistant content
+                    chunks = []
+                    for c in content:
+                        if isinstance(c, dict) and c.get("type") == "text" and isinstance(c.get("text"), str):
+                            chunks.append(c["text"])
+                    if chunks:
+                        return "\n".join(chunks)
+    # Fallback
+    return str(out)
+
 def infer(file_obj, prompt):
     if file_obj is None:
         return "Please upload an image or PDF."
     if not prompt or not prompt.strip():
         return "Please enter a prompt."

-    # Read uploaded file
-    file_path = file_obj.name if hasattr(file_obj, "name") else None
-    with open(file_path, "rb") as f:
+    # read file
+    with open(file_obj.name, "rb") as f:
         raw = f.read()

-    # Decide how to load (image vs pdf)
-    name_lower = (file_path or "").lower()
+    # load PIL
+    name = (file_obj.name or "").lower()
     try:
-        if name_lower.endswith(".pdf") or raw[:4] == b"%PDF":
+        if name.endswith(".pdf") or raw[:4] == b"%PDF":
             pil_img = pdf_first_page_to_pil(raw)
         else:
             pil_img = Image.open(io.BytesIO(raw)).convert("RGB")
     except Exception as e:
         return f"Failed to read the file: {e}"

-    # Build messages in Qwen2-VL format
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": pil_img},
-                {"type": "text", "text": prompt.strip()}
-            ]
-        }
-    ]
+    # build messages in Qwen2-VL format
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "image", "image": pil_img},
+            {"type": "text", "text": prompt.strip()}
+        ]
+    }]

-    # Run inference and return only the model response
+    # run model
     out = pipe(text=messages, max_new_tokens=256)
-    # pipeline may return a dict or list of dicts depending on version
-    if isinstance(out, list) and len(out) > 0 and isinstance(out[0], dict):
-        out = out[0]
-    if isinstance(out, dict) and "generated_text" in out:
-        return out["generated_text"]
-    return str(out)

-with gr.Blocks(title="Qwen2-VL-2B-Instruct") as demo:
-    gr.Markdown("# Qwen2-VL-2B — File + Prompt\nUpload an image (or PDF) and ask a question.")
+    # return ONLY the assistant text
+    return _only_model_text(out)
+
+# ---------- Gradio UI ----------
+with gr.Blocks(
+    title="Qwen2-VL-2B — File + Prompt",
+    css="""
+    /* make the output box grow nicely */
+    #resp_out textarea {min-height: 220px;}
+    """
+) as demo:
+    gr.Markdown("### Qwen2-VL-2B — Upload an image (or PDF first page) and ask a question.")
     with gr.Row():
         file_in = gr.File(label="Upload image or PDF", file_types=["image", ".pdf"])
-        prompt_in = gr.Textbox(label="Prompt", placeholder="Ask anything about the document/image…", lines=3)
+        prompt_in = gr.Textbox(label="Prompt", placeholder="Ask anything…", lines=3)
     run_btn = gr.Button("Run")
-    resp_out = gr.Textbox(label="Model Response", show_label=True)
+
+    # output textbox that expands (via CSS above)
+    resp_out = gr.Textbox(
+        label="Model Response",
+        lines=8,
+        show_copy_button=True,
+        elem_id="resp_out"
+    )

     run_btn.click(fn=infer, inputs=[file_in, prompt_in], outputs=[resp_out])
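For reference, a minimal smoke test of the new _only_model_text extractor. The payloads below are hypothetical stand-ins for the output shapes transformers pipelines can return, and importing app also builds the pipeline (downloading the model), so this is illustrative rather than a cheap unit test:

# Hypothetical check of the three shapes the extractor handles.
from app import _only_model_text  # note: importing app instantiates the pipeline

# Shape 1: a plain dict carrying "generated_text"
assert _only_model_text({"generated_text": "A cat."}) == "A cat."

# Shape 2: a chat-style list where the assistant content is a string
chat = [
    {"role": "user", "content": [{"type": "text", "text": "Describe this."}]},
    {"role": "assistant", "content": "A cat sitting on a mat."},
]
assert _only_model_text(chat) == "A cat sitting on a mat."

# Shape 3: assistant content as a list of typed text chunks
chunked = [{"role": "assistant", "content": [{"type": "text", "text": "Hello."}]}]
assert _only_model_text(chunked) == "Hello."

# Anything unrecognized falls back to str(out)
assert _only_model_text(42) == "42"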