prithivMLmods committed on
Commit
76468c1
·
verified ·
1 Parent(s): 0c8310d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -146
app.py CHANGED
@@ -5,8 +5,6 @@ import json
5
  import time
6
  import asyncio
7
  from threading import Thread
8
- from pathlib import Path
9
- from io import BytesIO
10
 
11
  import gradio as gr
12
  import spaces
@@ -15,9 +13,6 @@ import numpy as np
15
  from PIL import Image
16
  import cv2
17
  import requests
18
- import fitz # PyMuPDF
19
- import html2text
20
- import markdown
21
 
22
  from transformers import (
23
  Qwen3VLMoeForConditionalGeneration,
@@ -81,35 +76,6 @@ def downsample_video(video_path):
81
  vidcap.release()
82
  return frames
83
 
84
- def convert_file_to_images(file_path: str, dpi: int = 200):
85
- """
86
- Converts a PDF or image file into a list of PIL Images.
87
- """
88
- images = []
89
- file_ext = Path(file_path).suffix.lower()
90
-
91
- image_suffixes = [".png", ".jpeg", ".jpg"]
92
- pdf_suffixes = [".pdf"]
93
-
94
- if file_ext in image_suffixes:
95
- images.append(Image.open(file_path).convert("RGB"))
96
- return images
97
-
98
- if file_ext not in pdf_suffixes:
99
- raise ValueError(f"Unsupported file type: {file_ext}")
100
-
101
- pdf_document = fitz.open(file_path)
102
- zoom = dpi / 72.0
103
- mat = fitz.Matrix(zoom, zoom)
104
- for page_num in range(len(pdf_document)):
105
- page = pdf_document.load_page(page_num)
106
- pix = page.get_pixmap(matrix=mat)
107
- img_data = pix.tobytes("png")
108
- images.append(Image.open(BytesIO(img_data)))
109
- pdf_document.close()
110
- return images
111
-
112
-
113
  @spaces.GPU
114
  def generate_image(text: str, image: Image.Image,
115
  max_new_tokens: int = 1024,
@@ -119,15 +85,15 @@ def generate_image(text: str, image: Image.Image,
119
  repetition_penalty: float = 1.2):
120
  """
121
  Generates responses using the Qwen3-VL model for image input.
122
- Yields outputs for the new tabbed layout.
123
  """
124
  if image is None:
125
- yield "Please upload an image.", "", "", "Please upload an image."
126
  return
127
 
128
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
129
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
130
 
 
131
  inputs = processor_q3vl(
132
  text=[prompt_full], images=[image], return_tensors="pt", padding=True
133
  ).to(device)
@@ -140,7 +106,7 @@ def generate_image(text: str, image: Image.Image,
140
  for new_text in streamer:
141
  buffer += new_text
142
  time.sleep(0.01)
143
- yield buffer, "", "", buffer
144
 
145
  @spaces.GPU
146
  def generate_video(text: str, video_path: str,
@@ -151,25 +117,26 @@ def generate_video(text: str, video_path: str,
151
  repetition_penalty: float = 1.2):
152
  """
153
  Generates responses using the Qwen3-VL model for video input.
154
- Yields outputs for the new tabbed layout.
155
  """
156
  if video_path is None:
157
- yield "Please upload a video.", "", "", "Please upload a video."
158
  return
159
 
160
  frames_with_ts = downsample_video(video_path)
161
  if not frames_with_ts:
162
- yield "Could not process video.", "", "", "Could not process video."
163
  return
164
 
165
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
166
  images_for_processor = []
 
167
  for frame, timestamp in frames_with_ts:
168
- messages[0]["content"].insert(0, {"type": "image"})
169
  images_for_processor.append(frame)
170
 
171
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
172
 
 
173
  inputs = processor_q3vl(
174
  text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True
175
  ).to(device)
@@ -187,72 +154,17 @@ def generate_video(text: str, video_path: str,
187
  buffer += new_text
188
  buffer = buffer.replace("<|im_end|>", "")
189
  time.sleep(0.01)
190
- yield buffer, "", "", buffer
191
-
192
- @spaces.GPU
193
- def generate_document(
194
- file_path: str,
195
- max_new_tokens: int = 2048,
196
- temperature: float = 0.1,
197
- top_p: float = 0.9,
198
- top_k: int = 50,
199
- repetition_penalty: float = 1.05,
200
- ):
201
- """
202
- Processes a document (PDF/image) page by page, generating structured HTML and Markdown.
203
- """
204
- if not file_path:
205
- yield "Please upload a document.", "", "", "Please upload a document."
206
- return
207
-
208
- try:
209
- page_images = convert_file_to_images(file_path)
210
- if not page_images:
211
- yield "Could not process the document.", "", "", "Could not process the document."
212
- return
213
- except Exception as e:
214
- error_msg = f"Error reading file: {e}"
215
- yield error_msg, "", "", error_msg
216
- return
217
-
218
- full_html_content = ""
219
- raw_stream_buffer = ""
220
 
221
- for i, image in enumerate(page_images):
222
- page_start_message = f"--- Processing Page {i+1}/{len(page_images)} ---\n"
223
- raw_stream_buffer += page_start_message
224
- yield markdown.markdown(raw_stream_buffer), "", "", raw_stream_buffer
225
 
226
- messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), and figures (<figure>). Filter out irrelevant elements like headers and footers."}]}]
227
- prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
228
- inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
229
-
230
- with torch.no_grad():
231
- generated_ids = model_q3vl.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty)
232
-
233
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
234
- page_html = processor_q3vl.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]
235
-
236
- full_html_content += f'\n\n<!-- Page {i+1} -->\n{page_html}'
237
- raw_stream_buffer += f"{page_html}\n"
238
-
239
- full_markdown_source = html2text.html2text(full_html_content)
240
- rendered_markdown = markdown.markdown(full_markdown_source, extensions=['fenced_code', 'tables'])
241
-
242
- yield rendered_markdown, full_markdown_source, full_html_content, raw_stream_buffer
243
-
244
- final_message = "\n--- Document processing complete. ---"
245
- raw_stream_buffer += final_message
246
- full_markdown_source = html2text.html2text(full_html_content)
247
- rendered_markdown = markdown.markdown(full_markdown_source, extensions=['fenced_code', 'tables'])
248
- yield rendered_markdown, full_markdown_source, full_html_content, raw_stream_buffer
249
-
250
-
251
- # --- Gradio Interface ---
252
  image_examples = [
253
  ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
254
  ["Convert this page to doc [markdown] precisely.", "images/3.png"],
 
255
  ["Explain the creativity in the image.", "images/6.jpg"],
 
 
256
  ]
257
 
258
  video_examples = [
@@ -260,39 +172,29 @@ video_examples = [
260
  ["Explain the ad in detail.", "videos/1.mp4"]
261
  ]
262
 
263
- doc_examples = [
264
- ["examples/sample-doc.pdf"],
265
- ["examples/sample-page.png"],
266
- ]
267
-
268
  css = """
269
  .submit-btn { background-color: #2980b9 !important; color: white !important; }
270
  .submit-btn:hover { background-color: #3498db !important; }
271
  .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
272
  """
273
 
 
274
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
275
  gr.Markdown("# **[Multimodal VLM Thinking with Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**")
276
  with gr.Row():
277
- with gr.Column(scale=1):
278
  with gr.Tabs():
279
  with gr.TabItem("Image Inference"):
280
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
281
  image_upload = gr.Image(type="pil", label="Image", height=290)
282
  image_submit = gr.Button("Submit", elem_classes="submit-btn")
283
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
284
-
285
  with gr.TabItem("Video Inference"):
286
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
287
  video_upload = gr.Video(label="Video", height=290)
288
  video_submit = gr.Button("Submit", elem_classes="submit-btn")
289
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
290
 
291
- with gr.TabItem("Document Parsing"):
292
- doc_upload = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"])
293
- doc_submit = gr.Button("Process Document", elem_classes="submit-btn")
294
- gr.Examples(examples=doc_examples, inputs=[doc_upload])
295
-
296
  with gr.Accordion("Advanced options", open=False):
297
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
298
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -300,51 +202,26 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
300
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
301
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
302
 
303
- with gr.Column(scale=2):
304
  with gr.Column(elem_classes="canvas-output"):
305
  gr.Markdown("## Output")
306
- with gr.Tabs():
307
- with gr.Tab("Rendered Output"):
308
- rendered_output = gr.Markdown(label="Rendered Result")
309
- with gr.Tab("Markdown Source"):
310
- markdown_source_output = gr.TextArea(label="Markdown Source Code", interactive=False, lines=15, show_copy_button=True)
311
- with gr.Tab("Generated HTML"):
312
- html_output = gr.TextArea(label="Generated HTML Source", interactive=False, lines=15, show_copy_button=True)
313
- with gr.Tab("Raw Stream"):
314
- raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=15, show_copy_button=True)
315
-
316
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
317
  gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
318
- gr.Markdown("> ⚠️ Note: Video and document inference performance can vary depending on the complexity and length of the input.")
319
-
320
- # Define the output components list
321
- output_components = [rendered_output, markdown_source_output, html_output, raw_output]
322
-
323
- # Link buttons to functions
324
  image_submit.click(
325
  fn=generate_image,
326
  inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
327
- outputs=output_components
328
  )
329
  video_submit.click(
330
  fn=generate_video,
331
  inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
332
- outputs=output_components
333
- )
334
- doc_submit.click(
335
- fn=generate_document,
336
- inputs=[doc_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
337
- outputs=output_components
338
  )
339
 
340
  if __name__ == "__main__":
341
- # Create dummy example files if they don't exist
342
- if not os.path.exists("images"):
343
- os.makedirs("images")
344
- if not os.path.exists("videos"):
345
- os.makedirs("videos")
346
- if not os.path.exists("examples"):
347
- os.makedirs("examples")
348
- # You may need to add placeholder files to these directories for the examples to load without errors.
349
-
350
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
 
5
  import time
6
  import asyncio
7
  from threading import Thread
 
 
8
 
9
  import gradio as gr
10
  import spaces
 
13
  from PIL import Image
14
  import cv2
15
  import requests
 
 
 
16
 
17
  from transformers import (
18
  Qwen3VLMoeForConditionalGeneration,
 
76
  vidcap.release()
77
  return frames
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  @spaces.GPU
80
  def generate_image(text: str, image: Image.Image,
81
  max_new_tokens: int = 1024,
 
85
  repetition_penalty: float = 1.2):
86
  """
87
  Generates responses using the Qwen3-VL model for image input.
 
88
  """
89
  if image is None:
90
+ yield "Please upload an image.", "Please upload an image."
91
  return
92
 
93
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
94
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
95
 
96
+ # FIX: Removed truncation=True and max_length to prevent the ValueError
97
  inputs = processor_q3vl(
98
  text=[prompt_full], images=[image], return_tensors="pt", padding=True
99
  ).to(device)
 
106
  for new_text in streamer:
107
  buffer += new_text
108
  time.sleep(0.01)
109
+ yield buffer, buffer
110
 
111
  @spaces.GPU
112
  def generate_video(text: str, video_path: str,
 
117
  repetition_penalty: float = 1.2):
118
  """
119
  Generates responses using the Qwen3-VL model for video input.
 
120
  """
121
  if video_path is None:
122
+ yield "Please upload a video.", "Please upload a video."
123
  return
124
 
125
  frames_with_ts = downsample_video(video_path)
126
  if not frames_with_ts:
127
+ yield "Could not process video.", "Could not process video."
128
  return
129
 
130
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
131
  images_for_processor = []
132
+ # Add an <|image|> placeholder for each frame in the message
133
  for frame, timestamp in frames_with_ts:
134
+ messages[0]["content"].insert(0, {"type": "image"}) # Insert at beginning to match common patterns
135
  images_for_processor.append(frame)
136
 
137
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
138
 
139
+ # FIX: Removed truncation=True and max_length to prevent the ValueError
140
  inputs = processor_q3vl(
141
  text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True
142
  ).to(device)
 
154
  buffer += new_text
155
  buffer = buffer.replace("<|im_end|>", "")
156
  time.sleep(0.01)
157
+ yield buffer, buffer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
 
 
 
 
159
 
160
+ # Define examples for image and video inference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  image_examples = [
162
  ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
163
  ["Convert this page to doc [markdown] precisely.", "images/3.png"],
164
+ ["Convert this page to doc [markdown] precisely.", "images/4.png"],
165
  ["Explain the creativity in the image.", "images/6.jpg"],
166
+ ["Convert this page to doc [markdown] precisely.", "images/1.png"],
167
+ ["Convert chart to OTSL.", "images/2.png"]
168
  ]
169
 
170
  video_examples = [
 
172
  ["Explain the ad in detail.", "videos/1.mp4"]
173
  ]
174
 
 
 
 
 
 
175
  css = """
176
  .submit-btn { background-color: #2980b9 !important; color: white !important; }
177
  .submit-btn:hover { background-color: #3498db !important; }
178
  .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
179
  """
180
 
181
+ # Create the Gradio Interface
182
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
183
  gr.Markdown("# **[Multimodal VLM Thinking with Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**")
184
  with gr.Row():
185
+ with gr.Column():
186
  with gr.Tabs():
187
  with gr.TabItem("Image Inference"):
188
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
189
  image_upload = gr.Image(type="pil", label="Image", height=290)
190
  image_submit = gr.Button("Submit", elem_classes="submit-btn")
191
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
 
192
  with gr.TabItem("Video Inference"):
193
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
194
  video_upload = gr.Video(label="Video", height=290)
195
  video_submit = gr.Button("Submit", elem_classes="submit-btn")
196
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
197
 
 
 
 
 
 
198
  with gr.Accordion("Advanced options", open=False):
199
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
200
  temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
 
202
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
203
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
204
 
205
+ with gr.Column():
206
  with gr.Column(elem_classes="canvas-output"):
207
  gr.Markdown("## Output")
208
+ output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5, show_copy_button=True)
209
+ with gr.Accordion("(Result.md)", open=False):
210
+ markdown_output = gr.Markdown(label="(Result.Md)")
 
 
 
 
 
 
 
211
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
212
  gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
213
+ gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
214
+
 
 
 
 
215
  image_submit.click(
216
  fn=generate_image,
217
  inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
218
+ outputs=[output, markdown_output]
219
  )
220
  video_submit.click(
221
  fn=generate_video,
222
  inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
223
+ outputs=[output, markdown_output]
 
 
 
 
 
224
  )
225
 
226
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
227
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)