prithivMLmods committed
Commit f180cec · verified · 1 Parent(s): a138236

Update app.py

Files changed (1):
  1. app.py (+88 -121)

app.py CHANGED
@@ -3,6 +3,7 @@ import random
 import uuid
 import json
 import time
+import asyncio
 from threading import Thread
 
 import gradio as gr
@@ -12,7 +13,8 @@ import numpy as np
 from PIL import Image
 import cv2
 import requests
-import supervision as sv  # Added for object detection visualization
+import html2text
+import markdown
 
 from transformers import (
     Qwen3VLMoeForConditionalGeneration,
@@ -50,7 +52,7 @@ processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=
 model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
     MODEL_ID_Q3VL,
     trust_remote_code=True,
-    torch_dtype=torch.float16
+    dtype=torch.float16
 ).to(device).eval()
 
 
@@ -85,9 +87,10 @@ def generate_image(text: str, image: Image.Image,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the Qwen3-VL model for image input.
+    Yields three identical outputs to fit the new tabbed output structure.
     """
     if image is None:
-        yield "Please upload an image.", "Please upload an image."
+        yield "Please upload an image.", "Please upload an image.", "Please upload an image."
         return
 
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
@@ -105,7 +108,8 @@ def generate_image(text: str, image: Image.Image,
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
-        yield buffer, buffer
+        # Yield to all three output tabs: Rendered, Source, and Raw
+        yield buffer, buffer, buffer
 
 @spaces.GPU
 def generate_video(text: str, video_path: str,
@@ -116,20 +120,21 @@ def generate_video(text: str, video_path: str,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the Qwen3-VL model for video input.
+    Yields three identical outputs to fit the new tabbed output structure.
     """
     if video_path is None:
-        yield "Please upload a video.", "Please upload a video."
+        yield "Please upload a video.", "Please upload a video.", "Please upload a video."
         return
 
     frames_with_ts = downsample_video(video_path)
     if not frames_with_ts:
-        yield "Could not process video.", "Could not process video."
+        yield "Could not process video.", "Could not process video.", "Could not process video."
         return
 
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
     images_for_processor = []
     for frame, timestamp in frames_with_ts:
-        messages[0]["content"].insert(0, {"type": "image"})
+        messages[0]["content"].insert(0, {"type": "image"})
         images_for_processor.append(frame)
 
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
@@ -151,108 +156,70 @@ def generate_video(text: str, video_path: str,
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
-        yield buffer, buffer
-
-# --- Object Detection Functions ---
-
-def create_annotated_image(image: Image.Image, json_data_string: str):
-    """Parses JSON from model and draws bounding boxes on the image."""
-    try:
-        # Clean up the string to get pure JSON from markdown code blocks
-        if "```json" in json_data_string:
-            json_str = json_data_string.split("```json")[1].split("```")[0].strip()
-        else:
-            json_str = json_data_string
-
-        bbox_data = json.loads(json_str)
-        if not isinstance(bbox_data, list):
-            bbox_data = [bbox_data]
-
-    except (json.JSONDecodeError, IndexError):
-        # If parsing fails, return the original image and an error message
-        return image, f"Failed to parse JSON from model output:\n{json_data_string}"
-
-    annotated_image = np.array(image.convert("RGB"))
-    boxes = []
-    labels = []
-
-    for item in bbox_data:
-        if "box_2d" in item and "label" in item:
-            boxes.append(item["box_2d"])
-            labels.append(str(item["label"]))
-
-    if not boxes:
-        return image, "No bounding boxes with labels found in the model's output."
-
-    # Create supervision Detections object from the parsed data
-    detections = sv.Detections(xyxy=np.array(boxes))
-
-    # Create annotators
-    bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
-    label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
-
-    # Annotate the image
-    annotated_image = bounding_box_annotator.annotate(
-        scene=annotated_image, detections=detections
-    )
-    annotated_image = label_annotator.annotate(
-        scene=annotated_image, detections=detections, labels=labels
-    )
-
-    return Image.fromarray(annotated_image), json.dumps(bbox_data, indent=2)
+        # Yield to all three output tabs: Rendered, Source, and Raw
+        yield buffer, buffer, buffer
 
 @spaces.GPU
-def generate_detection(image: Image.Image, prompt: str):
+def generate_html(text: str, image: Image.Image,
+                  max_new_tokens: int = 2048,
+                  temperature: float = 0.6,
+                  top_p: float = 0.9,
+                  top_k: int = 50,
+                  repetition_penalty: float = 1.2):
     """
-    Generates object detections using the Qwen3-VL model.
+    Generates a structured HTML representation from an image.
     """
     if image is None:
-        return None, "Please upload an image first."
-
-    # A detailed prompt to guide the model for object detection
-    detection_prompt = f"""
-    This is an object detection task. Analyze the image to identify all instances of '{prompt}'.
-    Respond ONLY with a JSON array where each object is a dictionary with two keys:
-    1. "label": The name of the object found (e.g., "{prompt}").
-    2. "box_2d": The bounding box coordinates as a list of four numbers [x_min, y_min, x_max, y_max].
-    Do not include any other text or explanations outside of the final JSON code block.
-    """
+        yield "<h3>Please upload an image.</h3>", "Please upload an image.", "Please upload an image."
+        return
 
-    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": detection_prompt}]}]
-    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Use a specific, detailed prompt for HTML generation if the user provides none.
+    prompt = text if text else "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), and figures (<figure>). Filter out irrelevant elements like headers and footers."
+
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
+    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
     inputs = processor_q3vl(
         text=[prompt_full], images=[image], return_tensors="pt", padding=True
     ).to(device)
+
+    streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
+    thread.start()
 
-    # Generate a static response (no streaming) for easier JSON parsing
-    generated_ids = model_q3vl.generate(**inputs, max_new_tokens=2048)
-    generated_ids_trimmed = generated_ids[:, inputs.input_ids.shape[1]:]
-    response_text = processor_q3vl.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]
-
-    # Create annotated image from the model's response
-    annotated_image, formatted_json = create_annotated_image(image, response_text)
-
-    return annotated_image, formatted_json
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+
+        # Convert the generated HTML to Markdown for the other views
+        md_source = html2text.html2text(buffer)
+        md_render = markdown.markdown(md_source, extensions=['fenced_code', 'tables'])
+
+        time.sleep(0.01)
+        yield md_render, md_source, buffer
 
-# --- Gradio UI ---
+# --- UI Definition ---
 
-# Define examples for image and video inference
+# Define examples for each tab
 image_examples = [
-    ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "examples/5.jpg"],
-    ["Convert this page to doc [markdown] precisely.", "examples/3.png"],
-    ["Explain the creativity in the image.", "examples/6.jpg"],
+    ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
+    ["Convert this page to doc [markdown] precisely.", "images/3.png"],
+    ["Explain the creativity in the image.", "images/6.jpg"],
+    ["Convert chart to OTSL.", "images/2.png"]
 ]
 
 video_examples = [
-    ["Explain the video in detail.", "examples/2.mp4"],
-    ["Explain the ad in detail.", "examples/1.mp4"]
+    ["Explain the video in detail.", "videos/2.mp4"],
+    ["Explain the ad in detail.", "videos/1.mp4"]
 ]
 
-detection_examples = [
-    ["examples/detection_1.jpg", "person"],
-    ["examples/detection_2.jpg", "car"],
-    ["examples/detection_3.jpg", "cat"],
+html_examples = [
+    ["Convert this page to a structured HTML document.", "images/1.png"],
+    ["Parse the content of this image into clean HTML.", "images/3.png"],
+    ["Generate an HTML representation of this chart, including a table.", "images/4.png"]
 ]
 
 css = """
@@ -267,27 +234,23 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
-                # Tab 1: Image Inference
                 with gr.TabItem("Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     image_upload = gr.Image(type="pil", label="Image", height=290)
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
-
-                # Tab 2: Video Inference
+
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video", height=290)
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
-
-                # Tab 3: Object Detection
-                with gr.TabItem("Object Detection & Pointing"):
-                    detection_image_upload = gr.Image(type="pil", label="Image to Analyze", height=290)
-                    detection_query = gr.Textbox(label="Object to Detect", placeholder="e.g., car, person, cat...")
-                    detection_submit = gr.Button("Detect Objects", elem_classes="submit-btn")
-                    gr.Examples(examples=detection_examples, inputs=[detection_image_upload, detection_query])
-
+
+                with gr.TabItem("Generate HTML"):
+                    html_query = gr.Textbox(label="Query Input", placeholder="Describe the desired HTML, or leave blank for a default prompt.")
+                    html_upload = gr.Image(type="pil", label="Image to Parse", height=290)
+                    html_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    gr.Examples(examples=html_examples, inputs=[html_query, html_upload])
 
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -299,34 +262,38 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
-                # Outputs for Image/Video Inference
-                output_stream = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5, show_copy_button=True)
-                markdown_output = gr.Markdown(label="Formatted Output (Result.md)")
-
-                # Outputs for Object Detection
-                annotated_image = gr.Image(type="pil", label="Annotated Image")
-                json_output = gr.JSON(label="Detection JSON Output")
-
+                with gr.Tabs():
+                    with gr.Tab("Rendered Output"):
+                        markdown_output = gr.Markdown(label="Result")
+                    with gr.Tab("Markdown Source"):
+                        markdown_source_output = gr.TextArea(label="Markdown Source", interactive=False, lines=12, show_copy_button=True)
+                    with gr.Tab("Raw Output"):
+                        raw_output = gr.TextArea(label="Raw Output Stream", interactive=False, lines=12, show_copy_button=True)
+
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
-            gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks like visual question answering, video analysis, and object detection.")
-            gr.Markdown("> ⚠️ Note: Performance can vary depending on the complexity of the input.")
+            gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
+            gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
+
+    # Link buttons to their respective functions
+    shared_inputs = [max_new_tokens, temperature, top_p, top_k, repetition_penalty]
+    shared_outputs = [markdown_output, markdown_source_output, raw_output]
 
-    # Wire up the events
     image_submit.click(
         fn=generate_image,
-        inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[output_stream, markdown_output]
+        inputs=[image_query, image_upload] + shared_inputs,
+        outputs=shared_outputs
     )
     video_submit.click(
        fn=generate_video,
-        inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[output_stream, markdown_output]
+        inputs=[video_query, video_upload] + shared_inputs,
+        outputs=shared_outputs
     )
-    detection_submit.click(
-        fn=generate_detection,
-        inputs=[detection_image_upload, detection_query],
-        outputs=[annotated_image, json_output]
+    html_submit.click(
+        fn=generate_html,
+        inputs=[html_query, html_upload] + shared_inputs,
+        outputs=shared_outputs
     )
 
+
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, ssr_mode=False, show_error=True)
+    demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
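The hunk at @@ -50,7 +52,7 swaps torch_dtype= for dtype= in from_pretrained; recent transformers releases accept dtype as the newer spelling of the load-precision argument. A minimal sketch of the same call, assuming an arbitrary tiny stand-in model (prajjwal1/bert-tiny) so it runs quickly; the model name is illustrative only:

import torch
from transformers import AutoModel

# dtype= selects the precision the weights are loaded in (float16 here),
# the same role torch_dtype= played in older transformers releases.
model = AutoModel.from_pretrained(
    "prajjwal1/bert-tiny",  # illustrative stand-in; the Space loads Qwen3-VL-30B-A3B-Instruct
    dtype=torch.float16,
)
print(model.dtype)  # torch.float16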
 
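All three generators (generate_image, generate_video, and the new generate_html) share one streaming pattern: model.generate runs on a worker thread while the Gradio callback iterates a TextIteratorStreamer and yields the growing buffer. A self-contained sketch of that pattern, assuming a tiny text-only model (sshleifer/tiny-gpt2) as a stand-in for the Qwen3-VL pipeline; the prompt and model choice are illustrative only:

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

inputs = tok("Streaming demo:", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until finished, so it runs on a worker thread while the
# main thread consumes decoded text from the streamer as tokens arrive.
Thread(target=model.generate,
       kwargs={**inputs, "streamer": streamer, "max_new_tokens": 32}).start()

buffer = ""
for new_text in streamer:
    buffer += new_text
    # In the Space, each iteration yields (rendered, source, raw) to the three output tabs.
print(buffer)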
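generate_html streams raw HTML from the model and, on every chunk, converts the partial buffer to Markdown with html2text, then re-renders that Markdown to HTML for the Rendered Output tab. A minimal sketch of that round-trip on a fixed fragment (the sample HTML is illustrative only); running it per chunk, as the commit does, means intermediate views can look rough until the stream completes:

import html2text  # pip install html2text
import markdown   # pip install markdown

sample_html = "<h1>Report</h1><p>A <b>bold</b> claim.</p><table><tr><td>x</td><td>1</td></tr></table>"

# HTML fragment -> Markdown source (what the "Markdown Source" tab shows)
md_source = html2text.html2text(sample_html)

# Markdown source -> rendered HTML, with the same extensions the commit enables
md_render = markdown.markdown(md_source, extensions=["fenced_code", "tables"])

print(md_source)
print(md_render)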