KarthiEz committed
Commit 195b7ab · verified · 1 Parent(s): 5e4ed27

Update app.py

Files changed (1):
  app.py: +68 -35
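The updated app.py pulls in gradio, transformers, and Pillow at the top and lazily imports PyMuPDF (fitz) for PDF handling, so the Space needs all of these installed. A plausible requirements.txt for running this file (an assumption, not part of this commit):

gradio
transformers
torch  # assumed backend for the transformers pipeline
pillow
pymupdf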
app.py CHANGED
@@ -1,71 +1,104 @@
 import gradio as gr
 from transformers import pipeline
 from PIL import Image
-import io, os
+import io

-# Optional: PDF -> PIL (first page). Imported lazily to avoid extra cost if unused.
+# ---------- optional: PDF -> PIL first page ----------
 def pdf_first_page_to_pil(file_bytes: bytes) -> Image.Image:
     import fitz  # PyMuPDF
     with fitz.open(stream=file_bytes, filetype="pdf") as doc:
-        if doc.page_count == 0:
-            raise ValueError("Empty PDF uploaded.")
         page = doc[0]
         pix = page.get_pixmap(dpi=200)
-        img_bytes = pix.tobytes("png")
-        return Image.open(io.BytesIO(img_bytes)).convert("RGB")
+        return Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")

-# Initialize the multimodal pipeline once
-# Task: "image-text-to-text" for Qwen2-VL
+# ---------- init model ----------
 pipe = pipeline("image-text-to-text", model="Qwen/Qwen2-VL-2B-Instruct")

+# ---------- robust extractor: returns ONLY the model text ----------
+def _only_model_text(out) -> str:
+    # Case 1: pipelines often return {"generated_text": "..."}
+    if isinstance(out, dict) and "generated_text" in out:
+        return out["generated_text"]
+
+    # Case 2: list of dicts (mixed roles)
+    if isinstance(out, list):
+        # Prefer any dict with generated_text first
+        for item in out:
+            if isinstance(item, dict) and "generated_text" in item:
+                return item["generated_text"]
+        # Otherwise find assistant role
+        for item in out:
+            if isinstance(item, dict) and item.get("role") == "assistant":
+                content = item.get("content")
+                if isinstance(content, str):
+                    return content
+                if isinstance(content, list):
+                    # collect text pieces within the assistant content
+                    chunks = []
+                    for c in content:
+                        if isinstance(c, dict) and c.get("type") == "text" and isinstance(c.get("text"), str):
+                            chunks.append(c["text"])
+                    if chunks:
+                        return "\n".join(chunks)
+    # Fallback
+    return str(out)
+
 def infer(file_obj, prompt):
     if file_obj is None:
         return "Please upload an image or PDF."
     if not prompt or not prompt.strip():
         return "Please enter a prompt."

-    # Read uploaded file
-    file_path = file_obj.name if hasattr(file_obj, "name") else None
-    with open(file_path, "rb") as f:
+    # read file
+    with open(file_obj.name, "rb") as f:
         raw = f.read()

-    # Decide how to load (image vs pdf)
-    name_lower = (file_path or "").lower()
+    # load PIL
+    name = (file_obj.name or "").lower()
     try:
-        if name_lower.endswith(".pdf") or raw[:4] == b"%PDF":
+        if name.endswith(".pdf") or raw[:4] == b"%PDF":
             pil_img = pdf_first_page_to_pil(raw)
         else:
             pil_img = Image.open(io.BytesIO(raw)).convert("RGB")
     except Exception as e:
         return f"Failed to read the file: {e}"

-    # Build messages in Qwen2-VL format
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": pil_img},
-                {"type": "text", "text": prompt.strip()}
-            ]
-        }
-    ]
+    # build messages in Qwen2-VL format
+    messages = [{
+        "role": "user",
+        "content": [
+            {"type": "image", "image": pil_img},
+            {"type": "text", "text": prompt.strip()}
+        ]
+    }]

-    # Run inference and return only the model response
+    # run model
     out = pipe(text=messages, max_new_tokens=256)
-    # pipeline may return a dict or list of dicts depending on version
-    if isinstance(out, list) and len(out) > 0 and isinstance(out[0], dict):
-        out = out[0]
-    if isinstance(out, dict) and "generated_text" in out:
-        return out["generated_text"]
-    return str(out)

-with gr.Blocks(title="Qwen2-VL-2B-Instruct") as demo:
-    gr.Markdown("# Qwen2-VL-2B — File + Prompt\nUpload an image (or PDF) and ask a question.")
+    # return ONLY the assistant text
+    return _only_model_text(out)
+
+# ---------- Gradio UI ----------
+with gr.Blocks(
+    title="Qwen2-VL-2B — File + Prompt",
+    css="""
+    /* make the output box grow nicely */
+    #resp_out textarea {min-height: 220px;}
+    """
+) as demo:
+    gr.Markdown("### Qwen2-VL-2B — Upload an image (or PDF first page) and ask a question.")
     with gr.Row():
         file_in = gr.File(label="Upload image or PDF", file_types=["image", ".pdf"])
-        prompt_in = gr.Textbox(label="Prompt", placeholder="Ask anything about the document/image…", lines=3)
+        prompt_in = gr.Textbox(label="Prompt", placeholder="Ask anything…", lines=3)
     run_btn = gr.Button("Run")
-    resp_out = gr.Textbox(label="Model Response", show_label=True)
+
+    # output textbox that expands (via CSS above)
+    resp_out = gr.Textbox(
+        label="Model Response",
+        lines=8,
+        show_copy_button=True,
+        elem_id="resp_out"
+    )

     run_btn.click(fn=infer, inputs=[file_in, prompt_in], outputs=[resp_out])
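For reference, a minimal smoke test of the new _only_model_text extractor. The payloads below are hypothetical stand-ins for the output shapes transformers pipelines can return, and importing app also builds the pipeline (downloading the model), so this is illustrative rather than a cheap unit test:

# Hypothetical check of the three shapes the extractor handles.
from app import _only_model_text  # note: importing app instantiates the pipeline

# Shape 1: a plain dict carrying "generated_text"
assert _only_model_text({"generated_text": "A cat."}) == "A cat."

# Shape 2: a chat-style list where the assistant content is a string
chat = [
    {"role": "user", "content": [{"type": "text", "text": "Describe this."}]},
    {"role": "assistant", "content": "A cat sitting on a mat."},
]
assert _only_model_text(chat) == "A cat sitting on a mat."

# Shape 3: assistant content as a list of typed text chunks
chunked = [{"role": "assistant", "content": [{"type": "text", "text": "Hello."}]}]
assert _only_model_text(chunked) == "Hello."

# Anything unrecognized falls back to str(out)
assert _only_model_text(42) == "42"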