prithivMLmods committed
Commit f180cec · verified · 1 Parent(s): a138236

Update app.py

Files changed (1):
  1. app.py (+88 -121)

app.py CHANGED
@@ -3,6 +3,7 @@ import random
 import uuid
 import json
 import time
+import asyncio
 from threading import Thread
 
 import gradio as gr
@@ -12,7 +13,8 @@ import numpy as np
 from PIL import Image
 import cv2
 import requests
-import supervision as sv  # Added for object detection visualization
+import html2text
+import markdown
 
 from transformers import (
     Qwen3VLMoeForConditionalGeneration,
@@ -50,7 +52,7 @@ processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=
 model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
     MODEL_ID_Q3VL,
     trust_remote_code=True,
-    torch_dtype=torch.float16
+    dtype=torch.float16
 ).to(device).eval()
 
 
@@ -85,9 +87,10 @@ def generate_image(text: str, image: Image.Image,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the Qwen3-VL model for image input.
+    Yields three identical outputs to fit the new tabbed output structure.
     """
     if image is None:
-        yield "Please upload an image.", "Please upload an image."
+        yield "Please upload an image.", "Please upload an image.", "Please upload an image."
         return
 
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
@@ -105,7 +108,8 @@ def generate_image(text: str, image: Image.Image,
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
-        yield buffer, buffer
+        # Yield to all three output tabs: Rendered, Source, and Raw
+        yield buffer, buffer, buffer
 
 @spaces.GPU
 def generate_video(text: str, video_path: str,
@@ -116,20 +120,21 @@ def generate_video(text: str, video_path: str,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the Qwen3-VL model for video input.
+    Yields three identical outputs to fit the new tabbed output structure.
     """
     if video_path is None:
-        yield "Please upload a video.", "Please upload a video."
+        yield "Please upload a video.", "Please upload a video.", "Please upload a video."
         return
 
     frames_with_ts = downsample_video(video_path)
     if not frames_with_ts:
-        yield "Could not process video.", "Could not process video."
+        yield "Could not process video.", "Could not process video.", "Could not process video."
         return
 
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
     images_for_processor = []
     for frame, timestamp in frames_with_ts:
-        messages[0]["content"].insert(0, {"type": "image"})
+        messages[0]["content"].insert(0, {"type": "image"})
         images_for_processor.append(frame)
 
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
@@ -151,108 +156,70 @@ def generate_video(text: str, video_path: str,
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
-        yield buffer, buffer
-
-# --- Object Detection Functions ---
-
-def create_annotated_image(image: Image.Image, json_data_string: str):
-    """Parses JSON from model and draws bounding boxes on the image."""
-    try:
-        # Clean up the string to get pure JSON from markdown code blocks
-        if "```json" in json_data_string:
-            json_str = json_data_string.split("```json")[1].split("```")[0].strip()
-        else:
-            json_str = json_data_string
-
-        bbox_data = json.loads(json_str)
-        if not isinstance(bbox_data, list):
-            bbox_data = [bbox_data]
-
-    except (json.JSONDecodeError, IndexError):
-        # If parsing fails, return the original image and an error message
-        return image, f"Failed to parse JSON from model output:\n{json_data_string}"
-
-    annotated_image = np.array(image.convert("RGB"))
-    boxes = []
-    labels = []
-
-    for item in bbox_data:
-        if "box_2d" in item and "label" in item:
-            boxes.append(item["box_2d"])
-            labels.append(str(item["label"]))
-
-    if not boxes:
-        return image, "No bounding boxes with labels found in the model's output."
-
-    # Create supervision Detections object from the parsed data
-    detections = sv.Detections(xyxy=np.array(boxes))
-
-    # Create annotators
-    bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
-    label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
-
-    # Annotate the image
-    annotated_image = bounding_box_annotator.annotate(
-        scene=annotated_image, detections=detections
-    )
-    annotated_image = label_annotator.annotate(
-        scene=annotated_image, detections=detections, labels=labels
-    )
-
-    return Image.fromarray(annotated_image), json.dumps(bbox_data, indent=2)
+        # Yield to all three output tabs: Rendered, Source, and Raw
+        yield buffer, buffer, buffer
 
 @spaces.GPU
-def generate_detection(image: Image.Image, prompt: str):
+def generate_html(text: str, image: Image.Image,
+                  max_new_tokens: int = 2048,
+                  temperature: float = 0.6,
+                  top_p: float = 0.9,
+                  top_k: int = 50,
+                  repetition_penalty: float = 1.2):
     """
-    Generates object detections using the Qwen3-VL model.
+    Generates a structured HTML representation from an image.
     """
     if image is None:
-        return None, "Please upload an image first."
-
-    # A detailed prompt to guide the model for object detection
-    detection_prompt = f"""
-    This is an object detection task. Analyze the image to identify all instances of '{prompt}'.
-    Respond ONLY with a JSON array where each object is a dictionary with two keys:
-    1. "label": The name of the object found (e.g., "{prompt}").
-    2. "box_2d": The bounding box coordinates as a list of four numbers [x_min, y_min, x_max, y_max].
-    Do not include any other text or explanations outside of the final JSON code block.
-    """
+        yield "<h3>Please upload an image.</h3>", "Please upload an image.", "Please upload an image."
+        return
 
-    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": detection_prompt}]}]
-    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Use a specific, detailed prompt for HTML generation if the user provides none.
+    prompt = text if text else "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), and figures (<figure>). Filter out irrelevant elements like headers and footers."
+
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
+    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
     inputs = processor_q3vl(
         text=[prompt_full], images=[image], return_tensors="pt", padding=True
     ).to(device)
+
+    streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+    thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
+    thread.start()
 
-    # Generate a static response (no streaming) for easier JSON parsing
-    generated_ids = model_q3vl.generate(**inputs, max_new_tokens=2048)
-    generated_ids_trimmed = generated_ids[:, inputs.input_ids.shape[1]:]
-    response_text = processor_q3vl.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]
-
-    # Create annotated image from the model's response
-    annotated_image, formatted_json = create_annotated_image(image, response_text)
-
-    return annotated_image, formatted_json
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        buffer = buffer.replace("<|im_end|>", "")
+
+        # Convert the generated HTML to Markdown for the other views
+        md_source = html2text.html2text(buffer)
+        md_render = markdown.markdown(md_source, extensions=['fenced_code', 'tables'])
+
+        time.sleep(0.01)
+        yield md_render, md_source, buffer
 
-# --- Gradio UI ---
+# --- UI Definition ---
 
-# Define examples for image and video inference
+# Define examples for each tab
 image_examples = [
-    ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "examples/5.jpg"],
-    ["Convert this page to doc [markdown] precisely.", "examples/3.png"],
-    ["Explain the creativity in the image.", "examples/6.jpg"],
+    ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
+    ["Convert this page to doc [markdown] precisely.", "images/3.png"],
+    ["Explain the creativity in the image.", "images/6.jpg"],
+    ["Convert chart to OTSL.", "images/2.png"]
 ]
 
 video_examples = [
-    ["Explain the video in detail.", "examples/2.mp4"],
-    ["Explain the ad in detail.", "examples/1.mp4"]
+    ["Explain the video in detail.", "videos/2.mp4"],
+    ["Explain the ad in detail.", "videos/1.mp4"]
 ]
 
-detection_examples = [
-    ["examples/detection_1.jpg", "person"],
-    ["examples/detection_2.jpg", "car"],
-    ["examples/detection_3.jpg", "cat"],
+html_examples = [
+    ["Convert this page to a structured HTML document.", "images/1.png"],
+    ["Parse the content of this image into clean HTML.", "images/3.png"],
+    ["Generate an HTML representation of this chart, including a table.", "images/4.png"]
 ]
 
 css = """
@@ -267,27 +234,23 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
-                # Tab 1: Image Inference
                 with gr.TabItem("Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     image_upload = gr.Image(type="pil", label="Image", height=290)
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
-
-                # Tab 2: Video Inference
+
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video", height=290)
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
-
-                # Tab 3: Object Detection
-                with gr.TabItem("Object Detection & Pointing"):
-                    detection_image_upload = gr.Image(type="pil", label="Image to Analyze", height=290)
-                    detection_query = gr.Textbox(label="Object to Detect", placeholder="e.g., car, person, cat...")
-                    detection_submit = gr.Button("Detect Objects", elem_classes="submit-btn")
-                    gr.Examples(examples=detection_examples, inputs=[detection_image_upload, detection_query])
-
+
+                with gr.TabItem("Generate HTML"):
+                    html_query = gr.Textbox(label="Query Input", placeholder="Describe the desired HTML, or leave blank for a default prompt.")
+                    html_upload = gr.Image(type="pil", label="Image to Parse", height=290)
+                    html_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    gr.Examples(examples=html_examples, inputs=[html_query, html_upload])
 
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -299,34 +262,38 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
-                # Outputs for Image/Video Inference
-                output_stream = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5, show_copy_button=True)
-                markdown_output = gr.Markdown(label="Formatted Output (Result.md)")
-
-                # Outputs for Object Detection
-                annotated_image = gr.Image(type="pil", label="Annotated Image")
-                json_output = gr.JSON(label="Detection JSON Output")
-
+                with gr.Tabs():
+                    with gr.Tab("Rendered Output"):
+                        markdown_output = gr.Markdown(label="Result")
+                    with gr.Tab("Markdown Source"):
+                        markdown_source_output = gr.TextArea(label="Markdown Source", interactive=False, lines=12, show_copy_button=True)
+                    with gr.Tab("Raw Output"):
+                        raw_output = gr.TextArea(label="Raw Output Stream", interactive=False, lines=12, show_copy_button=True)
+
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
-            gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks like visual question answering, video analysis, and object detection.")
-            gr.Markdown("> ⚠️ Note: Performance can vary depending on the complexity of the input.")
+            gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
+            gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
+
+    # Link buttons to their respective functions
+    shared_inputs = [max_new_tokens, temperature, top_p, top_k, repetition_penalty]
+    shared_outputs = [markdown_output, markdown_source_output, raw_output]
 
-    # Wire up the events
     image_submit.click(
         fn=generate_image,
-        inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[output_stream, markdown_output]
+        inputs=[image_query, image_upload] + shared_inputs,
+        outputs=shared_outputs
     )
     video_submit.click(
        fn=generate_video,
-        inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=[output_stream, markdown_output]
+        inputs=[video_query, video_upload] + shared_inputs,
+        outputs=shared_outputs
     )
-    detection_submit.click(
-        fn=generate_detection,
-        inputs=[detection_image_upload, detection_query],
-        outputs=[annotated_image, json_output]
+    html_submit.click(
+        fn=generate_html,
+        inputs=[html_query, html_upload] + shared_inputs,
+        outputs=shared_outputs
     )
 
+
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, ssr_mode=False, show_error=True)
+    demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
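The hunk at @@ -50,7 +52,7 swaps torch_dtype= for dtype= in from_pretrained; recent transformers releases accept dtype as the newer spelling of the load-precision argument. A minimal sketch of the same call, assuming an arbitrary tiny stand-in model (prajjwal1/bert-tiny) so it runs quickly; the model name is illustrative only:

import torch
from transformers import AutoModel

# dtype= selects the precision the weights are loaded in (float16 here),
# the same role torch_dtype= played in older transformers releases.
model = AutoModel.from_pretrained(
    "prajjwal1/bert-tiny",  # illustrative stand-in; the Space loads Qwen3-VL-30B-A3B-Instruct
    dtype=torch.float16,
)
print(model.dtype)  # torch.float16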
 
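All three generators (generate_image, generate_video, and the new generate_html) share one streaming pattern: model.generate runs on a worker thread while the Gradio callback iterates a TextIteratorStreamer and yields the growing buffer. A self-contained sketch of that pattern, assuming a tiny text-only model (sshleifer/tiny-gpt2) as a stand-in for the Qwen3-VL pipeline; the prompt and model choice are illustrative only:

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

inputs = tok("Streaming demo:", return_tensors="pt")
streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until finished, so it runs on a worker thread while the
# main thread consumes decoded text from the streamer as tokens arrive.
Thread(target=model.generate,
       kwargs={**inputs, "streamer": streamer, "max_new_tokens": 32}).start()

buffer = ""
for new_text in streamer:
    buffer += new_text
    # In the Space, each iteration yields (rendered, source, raw) to the three output tabs.
print(buffer)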
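generate_html streams raw HTML from the model and, on every chunk, converts the partial buffer to Markdown with html2text, then re-renders that Markdown to HTML for the Rendered Output tab. A minimal sketch of that round-trip on a fixed fragment (the sample HTML is illustrative only); running it per chunk, as the commit does, means intermediate views can look rough until the stream completes:

import html2text  # pip install html2text
import markdown   # pip install markdown

sample_html = "<h1>Report</h1><p>A <b>bold</b> claim.</p><table><tr><td>x</td><td>1</td></tr></table>"

# HTML fragment -> Markdown source (what the "Markdown Source" tab shows)
md_source = html2text.html2text(sample_html)

# Markdown source -> rendered HTML, with the same extensions the commit enables
md_render = markdown.markdown(md_source, extensions=["fenced_code", "tables"])

print(md_source)
print(md_render)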