prithivMLmods committed on
Commit
0c8310d
·
verified ·
1 Parent(s): f180cec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -78
app.py CHANGED
@@ -5,6 +5,8 @@ import json
5
  import time
6
  import asyncio
7
  from threading import Thread
 
 
8
 
9
  import gradio as gr
10
  import spaces
@@ -13,6 +15,7 @@ import numpy as np
13
  from PIL import Image
14
  import cv2
15
  import requests
 
16
  import html2text
17
  import markdown
18
 
@@ -78,6 +81,35 @@ def downsample_video(video_path):
78
  vidcap.release()
79
  return frames
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  @spaces.GPU
82
  def generate_image(text: str, image: Image.Image,
83
  max_new_tokens: int = 1024,
@@ -87,10 +119,10 @@ def generate_image(text: str, image: Image.Image,
87
  repetition_penalty: float = 1.2):
88
  """
89
  Generates responses using the Qwen3-VL model for image input.
90
- Yields three identical outputs to fit the new tabbed output structure.
91
  """
92
  if image is None:
93
- yield "Please upload an image.", "Please upload an image.", "Please upload an image."
94
  return
95
 
96
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
@@ -108,8 +140,7 @@ def generate_image(text: str, image: Image.Image,
108
  for new_text in streamer:
109
  buffer += new_text
110
  time.sleep(0.01)
111
- # Yield to all three output tabs: Rendered, Source, and Raw
112
- yield buffer, buffer, buffer
113
 
114
  @spaces.GPU
115
  def generate_video(text: str, video_path: str,
@@ -120,15 +151,15 @@ def generate_video(text: str, video_path: str,
120
  repetition_penalty: float = 1.2):
121
  """
122
  Generates responses using the Qwen3-VL model for video input.
123
- Yields three identical outputs to fit the new tabbed output structure.
124
  """
125
  if video_path is None:
126
- yield "Please upload a video.", "Please upload a video.", "Please upload a video."
127
  return
128
 
129
  frames_with_ts = downsample_video(video_path)
130
  if not frames_with_ts:
131
- yield "Could not process video.", "Could not process video.", "Could not process video."
132
  return
133
 
134
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
@@ -156,59 +187,72 @@ def generate_video(text: str, video_path: str,
156
  buffer += new_text
157
  buffer = buffer.replace("<|im_end|>", "")
158
  time.sleep(0.01)
159
- # Yield to all three output tabs: Rendered, Source, and Raw
160
- yield buffer, buffer, buffer
161
 
162
  @spaces.GPU
163
- def generate_html(text: str, image: Image.Image,
164
- max_new_tokens: int = 2048,
165
- temperature: float = 0.6,
166
- top_p: float = 0.9,
167
- top_k: int = 50,
168
- repetition_penalty: float = 1.2):
 
 
169
  """
170
- Generates a structured HTML representation from an image.
171
  """
172
- if image is None:
173
- yield "<h3>Please upload an image.</h3>", "Please upload an image.", "Please upload an image."
174
  return
175
 
176
- # Use a specific, detailed prompt for HTML generation if the user provides none.
177
- prompt = text if text else "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), and figures (<figure>). Filter out irrelevant elements like headers and footers."
 
 
 
 
 
 
 
178
 
179
- messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
180
- prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
181
-
182
- inputs = processor_q3vl(
183
- text=[prompt_full], images=[image], return_tensors="pt", padding=True
184
- ).to(device)
185
-
186
- streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
187
- generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
188
- thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
189
- thread.start()
190
-
191
- buffer = ""
192
- for new_text in streamer:
193
- buffer += new_text
194
- buffer = buffer.replace("<|im_end|>", "")
195
 
196
- # Convert the generated HTML to Markdown for the other views
197
- md_source = html2text.html2text(buffer)
198
- md_render = markdown.markdown(md_source, extensions=['fenced_code', 'tables'])
199
 
200
- time.sleep(0.01)
201
- yield md_render, md_source, buffer
 
 
 
202
 
 
203
 
204
- # --- UI Definition ---
 
 
 
 
205
 
206
- # Define examples for each tab
 
207
  image_examples = [
208
  ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
209
  ["Convert this page to doc [markdown] precisely.", "images/3.png"],
210
  ["Explain the creativity in the image.", "images/6.jpg"],
211
- ["Convert chart to OTSL.", "images/2.png"]
212
  ]
213
 
214
  video_examples = [
@@ -216,10 +260,9 @@ video_examples = [
216
  ["Explain the ad in detail.", "videos/1.mp4"]
217
  ]
218
 
219
- html_examples = [
220
- ["Convert this page to a structured HTML document.", "images/1.png"],
221
- ["Parse the content of this image into clean HTML.", "images/3.png"],
222
- ["Generate an HTML representation of this chart, including a table.", "images/4.png"]
223
  ]
224
 
225
  css = """
@@ -228,29 +271,27 @@ css = """
228
  .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
229
  """
230
 
231
- # Create the Gradio Interface
232
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
233
  gr.Markdown("# **[Multimodal VLM Thinking with Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**")
234
  with gr.Row():
235
- with gr.Column():
236
  with gr.Tabs():
237
  with gr.TabItem("Image Inference"):
238
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
239
  image_upload = gr.Image(type="pil", label="Image", height=290)
240
  image_submit = gr.Button("Submit", elem_classes="submit-btn")
241
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
242
-
243
  with gr.TabItem("Video Inference"):
244
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
245
  video_upload = gr.Video(label="Video", height=290)
246
  video_submit = gr.Button("Submit", elem_classes="submit-btn")
247
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
248
-
249
- with gr.TabItem("Generate HTML"):
250
- html_query = gr.Textbox(label="Query Input", placeholder="Describe the desired HTML, or leave blank for a default prompt.")
251
- html_upload = gr.Image(type="pil", label="Image to Parse", height=290)
252
- html_submit = gr.Button("Submit", elem_classes="submit-btn")
253
- gr.Examples(examples=html_examples, inputs=[html_query, html_upload])
254
 
255
  with gr.Accordion("Advanced options", open=False):
256
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -259,41 +300,51 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
259
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
260
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
261
 
262
- with gr.Column():
263
  with gr.Column(elem_classes="canvas-output"):
264
  gr.Markdown("## Output")
265
  with gr.Tabs():
266
  with gr.Tab("Rendered Output"):
267
- markdown_output = gr.Markdown(label="Result")
268
  with gr.Tab("Markdown Source"):
269
- markdown_source_output = gr.TextArea(label="Markdown Source", interactive=False, lines=12, show_copy_button=True)
270
- with gr.Tab("Raw Output"):
271
- raw_output = gr.TextArea(label="Raw Output Stream", interactive=False, lines=12, show_copy_button=True)
272
-
 
 
273
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
274
  gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
275
- gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
276
-
277
- # Link buttons to their respective functions
278
- shared_inputs = [max_new_tokens, temperature, top_p, top_k, repetition_penalty]
279
- shared_outputs = [markdown_output, markdown_source_output, raw_output]
280
-
281
  image_submit.click(
282
  fn=generate_image,
283
- inputs=[image_query, image_upload] + shared_inputs,
284
- outputs=shared_outputs
285
  )
286
  video_submit.click(
287
  fn=generate_video,
288
- inputs=[video_query, video_upload] + shared_inputs,
289
- outputs=shared_outputs
290
  )
291
- html_submit.click(
292
- fn=generate_html,
293
- inputs=[html_query, html_upload] + shared_inputs,
294
- outputs=shared_outputs
295
  )
296
 
297
-
298
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
299
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
 
5
  import time
6
  import asyncio
7
  from threading import Thread
8
+ from pathlib import Path
9
+ from io import BytesIO
10
 
11
  import gradio as gr
12
  import spaces
 
15
  from PIL import Image
16
  import cv2
17
  import requests
18
+ import fitz # PyMuPDF
19
  import html2text
20
  import markdown
21
 
 
81
  vidcap.release()
82
  return frames
83
 
84
+ def convert_file_to_images(file_path: str, dpi: int = 200):
85
+ """
86
+ Converts a PDF or image file into a list of PIL Images.
87
+ """
88
+ images = []
89
+ file_ext = Path(file_path).suffix.lower()
90
+
91
+ image_suffixes = [".png", ".jpeg", ".jpg"]
92
+ pdf_suffixes = [".pdf"]
93
+
94
+ if file_ext in image_suffixes:
95
+ images.append(Image.open(file_path).convert("RGB"))
96
+ return images
97
+
98
+ if file_ext not in pdf_suffixes:
99
+ raise ValueError(f"Unsupported file type: {file_ext}")
100
+
101
+ pdf_document = fitz.open(file_path)
102
+ zoom = dpi / 72.0
103
+ mat = fitz.Matrix(zoom, zoom)
104
+ for page_num in range(len(pdf_document)):
105
+ page = pdf_document.load_page(page_num)
106
+ pix = page.get_pixmap(matrix=mat)
107
+ img_data = pix.tobytes("png")
108
+ images.append(Image.open(BytesIO(img_data)))
109
+ pdf_document.close()
110
+ return images
111
+
112
+
113
  @spaces.GPU
114
  def generate_image(text: str, image: Image.Image,
115
  max_new_tokens: int = 1024,
 
119
  repetition_penalty: float = 1.2):
120
  """
121
  Generates responses using the Qwen3-VL model for image input.
122
+ Yields outputs for the new tabbed layout.
123
  """
124
  if image is None:
125
+ yield "Please upload an image.", "", "", "Please upload an image."
126
  return
127
 
128
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
 
140
  for new_text in streamer:
141
  buffer += new_text
142
  time.sleep(0.01)
143
+ yield buffer, "", "", buffer
 
144
 
145
  @spaces.GPU
146
  def generate_video(text: str, video_path: str,
 
151
  repetition_penalty: float = 1.2):
152
  """
153
  Generates responses using the Qwen3-VL model for video input.
154
+ Yields outputs for the new tabbed layout.
155
  """
156
  if video_path is None:
157
+ yield "Please upload a video.", "", "", "Please upload a video."
158
  return
159
 
160
  frames_with_ts = downsample_video(video_path)
161
  if not frames_with_ts:
162
+ yield "Could not process video.", "", "", "Could not process video."
163
  return
164
 
165
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
 
187
  buffer += new_text
188
  buffer = buffer.replace("<|im_end|>", "")
189
  time.sleep(0.01)
190
+ yield buffer, "", "", buffer
 
191
 
192
  @spaces.GPU
193
+ def generate_document(
194
+ file_path: str,
195
+ max_new_tokens: int = 2048,
196
+ temperature: float = 0.1,
197
+ top_p: float = 0.9,
198
+ top_k: int = 50,
199
+ repetition_penalty: float = 1.05,
200
+ ):
201
  """
202
+ Processes a document (PDF/image) page by page, generating structured HTML and Markdown.
203
  """
204
+ if not file_path:
205
+ yield "Please upload a document.", "", "", "Please upload a document."
206
  return
207
 
208
+ try:
209
+ page_images = convert_file_to_images(file_path)
210
+ if not page_images:
211
+ yield "Could not process the document.", "", "", "Could not process the document."
212
+ return
213
+ except Exception as e:
214
+ error_msg = f"Error reading file: {e}"
215
+ yield error_msg, "", "", error_msg
216
+ return
217
 
218
+ full_html_content = ""
219
+ raw_stream_buffer = ""
220
+
221
+ for i, image in enumerate(page_images):
222
+ page_start_message = f"--- Processing Page {i+1}/{len(page_images)} ---\n"
223
+ raw_stream_buffer += page_start_message
224
+ yield markdown.markdown(raw_stream_buffer), "", "", raw_stream_buffer
225
+
226
+ messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), and figures (<figure>). Filter out irrelevant elements like headers and footers."}]}]
227
+ prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
228
+ inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
229
+
230
+ with torch.no_grad():
231
+ generated_ids = model_q3vl.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty)
 
 
232
 
233
+ generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
234
+ page_html = processor_q3vl.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]
 
235
 
236
+ full_html_content += f'\n\n<!-- Page {i+1} -->\n{page_html}'
237
+ raw_stream_buffer += f"{page_html}\n"
238
+
239
+ full_markdown_source = html2text.html2text(full_html_content)
240
+ rendered_markdown = markdown.markdown(full_markdown_source, extensions=['fenced_code', 'tables'])
241
 
242
+ yield rendered_markdown, full_markdown_source, full_html_content, raw_stream_buffer
243
 
244
+ final_message = "\n--- Document processing complete. ---"
245
+ raw_stream_buffer += final_message
246
+ full_markdown_source = html2text.html2text(full_html_content)
247
+ rendered_markdown = markdown.markdown(full_markdown_source, extensions=['fenced_code', 'tables'])
248
+ yield rendered_markdown, full_markdown_source, full_html_content, raw_stream_buffer
249
 
250
+
251
+ # --- Gradio Interface ---
252
  image_examples = [
253
  ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
254
  ["Convert this page to doc [markdown] precisely.", "images/3.png"],
255
  ["Explain the creativity in the image.", "images/6.jpg"],
 
256
  ]
257
 
258
  video_examples = [
 
260
  ["Explain the ad in detail.", "videos/1.mp4"]
261
  ]
262
 
263
+ doc_examples = [
264
+ ["examples/sample-doc.pdf"],
265
+ ["examples/sample-page.png"],
 
266
  ]
267
 
268
  css = """
 
271
  .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
272
  """
273
 
 
274
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
275
  gr.Markdown("# **[Multimodal VLM Thinking with Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**")
276
  with gr.Row():
277
+ with gr.Column(scale=1):
278
  with gr.Tabs():
279
  with gr.TabItem("Image Inference"):
280
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
281
  image_upload = gr.Image(type="pil", label="Image", height=290)
282
  image_submit = gr.Button("Submit", elem_classes="submit-btn")
283
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
284
+
285
  with gr.TabItem("Video Inference"):
286
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
287
  video_upload = gr.Video(label="Video", height=290)
288
  video_submit = gr.Button("Submit", elem_classes="submit-btn")
289
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
290
+
291
+ with gr.TabItem("Document Parsing"):
292
+ doc_upload = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"])
293
+ doc_submit = gr.Button("Process Document", elem_classes="submit-btn")
294
+ gr.Examples(examples=doc_examples, inputs=[doc_upload])
 
295
 
296
  with gr.Accordion("Advanced options", open=False):
297
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
 
300
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
301
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
302
 
303
+ with gr.Column(scale=2):
304
  with gr.Column(elem_classes="canvas-output"):
305
  gr.Markdown("## Output")
306
  with gr.Tabs():
307
  with gr.Tab("Rendered Output"):
308
+ rendered_output = gr.Markdown(label="Rendered Result")
309
  with gr.Tab("Markdown Source"):
310
+ markdown_source_output = gr.TextArea(label="Markdown Source Code", interactive=False, lines=15, show_copy_button=True)
311
+ with gr.Tab("Generated HTML"):
312
+ html_output = gr.TextArea(label="Generated HTML Source", interactive=False, lines=15, show_copy_button=True)
313
+ with gr.Tab("Raw Stream"):
314
+ raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=15, show_copy_button=True)
315
+
316
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
317
  gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
318
+ gr.Markdown("> ⚠️ Note: Video and document inference performance can vary depending on the complexity and length of the input.")
319
+
320
+ # Define the output components list
321
+ output_components = [rendered_output, markdown_source_output, html_output, raw_output]
322
+
323
+ # Link buttons to functions
324
  image_submit.click(
325
  fn=generate_image,
326
+ inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
327
+ outputs=output_components
328
  )
329
  video_submit.click(
330
  fn=generate_video,
331
+ inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
332
+ outputs=output_components
333
  )
334
+ doc_submit.click(
335
+ fn=generate_document,
336
+ inputs=[doc_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
337
+ outputs=output_components
338
  )
339
 
 
340
  if __name__ == "__main__":
341
+ # Create dummy example files if they don't exist
342
+ if not os.path.exists("images"):
343
+ os.makedirs("images")
344
+ if not os.path.exists("videos"):
345
+ os.makedirs("videos")
346
+ if not os.path.exists("examples"):
347
+ os.makedirs("examples")
348
+ # You may need to add placeholder files to these directories for the examples to load without errors.
349
+
350
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)