prithivMLmods committed on
Commit
0c8310d
·
verified ·
1 Parent(s): f180cec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -78
app.py CHANGED
@@ -5,6 +5,8 @@ import json
5
  import time
6
  import asyncio
7
  from threading import Thread
 
 
8
 
9
  import gradio as gr
10
  import spaces
@@ -13,6 +15,7 @@ import numpy as np
13
  from PIL import Image
14
  import cv2
15
  import requests
 
16
  import html2text
17
  import markdown
18
 
@@ -78,6 +81,35 @@ def downsample_video(video_path):
78
  vidcap.release()
79
  return frames
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  @spaces.GPU
82
  def generate_image(text: str, image: Image.Image,
83
  max_new_tokens: int = 1024,
@@ -87,10 +119,10 @@ def generate_image(text: str, image: Image.Image,
87
  repetition_penalty: float = 1.2):
88
  """
89
  Generates responses using the Qwen3-VL model for image input.
90
- Yields three identical outputs to fit the new tabbed output structure.
91
  """
92
  if image is None:
93
- yield "Please upload an image.", "Please upload an image.", "Please upload an image."
94
  return
95
 
96
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
@@ -108,8 +140,7 @@ def generate_image(text: str, image: Image.Image,
108
  for new_text in streamer:
109
  buffer += new_text
110
  time.sleep(0.01)
111
- # Yield to all three output tabs: Rendered, Source, and Raw
112
- yield buffer, buffer, buffer
113
 
114
  @spaces.GPU
115
  def generate_video(text: str, video_path: str,
@@ -120,15 +151,15 @@ def generate_video(text: str, video_path: str,
120
  repetition_penalty: float = 1.2):
121
  """
122
  Generates responses using the Qwen3-VL model for video input.
123
- Yields three identical outputs to fit the new tabbed output structure.
124
  """
125
  if video_path is None:
126
- yield "Please upload a video.", "Please upload a video.", "Please upload a video."
127
  return
128
 
129
  frames_with_ts = downsample_video(video_path)
130
  if not frames_with_ts:
131
- yield "Could not process video.", "Could not process video.", "Could not process video."
132
  return
133
 
134
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
@@ -156,59 +187,72 @@ def generate_video(text: str, video_path: str,
156
  buffer += new_text
157
  buffer = buffer.replace("<|im_end|>", "")
158
  time.sleep(0.01)
159
- # Yield to all three output tabs: Rendered, Source, and Raw
160
- yield buffer, buffer, buffer
161
 
162
  @spaces.GPU
163
- def generate_html(text: str, image: Image.Image,
164
- max_new_tokens: int = 2048,
165
- temperature: float = 0.6,
166
- top_p: float = 0.9,
167
- top_k: int = 50,
168
- repetition_penalty: float = 1.2):
 
 
169
  """
170
- Generates a structured HTML representation from an image.
171
  """
172
- if image is None:
173
- yield "<h3>Please upload an image.</h3>", "Please upload an image.", "Please upload an image."
174
  return
175
 
176
- # Use a specific, detailed prompt for HTML generation if the user provides none.
177
- prompt = text if text else "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), and figures (<figure>). Filter out irrelevant elements like headers and footers."
 
 
 
 
 
 
 
178
 
179
- messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
180
- prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
181
-
182
- inputs = processor_q3vl(
183
- text=[prompt_full], images=[image], return_tensors="pt", padding=True
184
- ).to(device)
185
-
186
- streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
187
- generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
188
- thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
189
- thread.start()
190
-
191
- buffer = ""
192
- for new_text in streamer:
193
- buffer += new_text
194
- buffer = buffer.replace("<|im_end|>", "")
195
 
196
- # Convert the generated HTML to Markdown for the other views
197
- md_source = html2text.html2text(buffer)
198
- md_render = markdown.markdown(md_source, extensions=['fenced_code', 'tables'])
199
 
200
- time.sleep(0.01)
201
- yield md_render, md_source, buffer
 
 
 
202
 
 
203
 
204
- # --- UI Definition ---
 
 
 
 
205
 
206
- # Define examples for each tab
 
207
  image_examples = [
208
  ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
209
  ["Convert this page to doc [markdown] precisely.", "images/3.png"],
210
  ["Explain the creativity in the image.", "images/6.jpg"],
211
- ["Convert chart to OTSL.", "images/2.png"]
212
  ]
213
 
214
  video_examples = [
@@ -216,10 +260,9 @@ video_examples = [
216
  ["Explain the ad in detail.", "videos/1.mp4"]
217
  ]
218
 
219
- html_examples = [
220
- ["Convert this page to a structured HTML document.", "images/1.png"],
221
- ["Parse the content of this image into clean HTML.", "images/3.png"],
222
- ["Generate an HTML representation of this chart, including a table.", "images/4.png"]
223
  ]
224
 
225
  css = """
@@ -228,29 +271,27 @@ css = """
228
  .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
229
  """
230
 
231
- # Create the Gradio Interface
232
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
233
  gr.Markdown("# **[Multimodal VLM Thinking with Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**")
234
  with gr.Row():
235
- with gr.Column():
236
  with gr.Tabs():
237
  with gr.TabItem("Image Inference"):
238
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
239
  image_upload = gr.Image(type="pil", label="Image", height=290)
240
  image_submit = gr.Button("Submit", elem_classes="submit-btn")
241
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
242
-
243
  with gr.TabItem("Video Inference"):
244
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
245
  video_upload = gr.Video(label="Video", height=290)
246
  video_submit = gr.Button("Submit", elem_classes="submit-btn")
247
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
248
-
249
- with gr.TabItem("Generate HTML"):
250
- html_query = gr.Textbox(label="Query Input", placeholder="Describe the desired HTML, or leave blank for a default prompt.")
251
- html_upload = gr.Image(type="pil", label="Image to Parse", height=290)
252
- html_submit = gr.Button("Submit", elem_classes="submit-btn")
253
- gr.Examples(examples=html_examples, inputs=[html_query, html_upload])
254
 
255
  with gr.Accordion("Advanced options", open=False):
256
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -259,41 +300,51 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
259
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
260
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
261
 
262
- with gr.Column():
263
  with gr.Column(elem_classes="canvas-output"):
264
  gr.Markdown("## Output")
265
  with gr.Tabs():
266
  with gr.Tab("Rendered Output"):
267
- markdown_output = gr.Markdown(label="Result")
268
  with gr.Tab("Markdown Source"):
269
- markdown_source_output = gr.TextArea(label="Markdown Source", interactive=False, lines=12, show_copy_button=True)
270
- with gr.Tab("Raw Output"):
271
- raw_output = gr.TextArea(label="Raw Output Stream", interactive=False, lines=12, show_copy_button=True)
272
-
 
 
273
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
274
  gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
275
- gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
276
-
277
- # Link buttons to their respective functions
278
- shared_inputs = [max_new_tokens, temperature, top_p, top_k, repetition_penalty]
279
- shared_outputs = [markdown_output, markdown_source_output, raw_output]
280
-
281
  image_submit.click(
282
  fn=generate_image,
283
- inputs=[image_query, image_upload] + shared_inputs,
284
- outputs=shared_outputs
285
  )
286
  video_submit.click(
287
  fn=generate_video,
288
- inputs=[video_query, video_upload] + shared_inputs,
289
- outputs=shared_outputs
290
  )
291
- html_submit.click(
292
- fn=generate_html,
293
- inputs=[html_query, html_upload] + shared_inputs,
294
- outputs=shared_outputs
295
  )
296
 
297
-
298
  if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
299
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)
 
5
  import time
6
  import asyncio
7
  from threading import Thread
8
+ from pathlib import Path
9
+ from io import BytesIO
10
 
11
  import gradio as gr
12
  import spaces
 
15
  from PIL import Image
16
  import cv2
17
  import requests
18
+ import fitz # PyMuPDF
19
  import html2text
20
  import markdown
21
 
 
81
  vidcap.release()
82
  return frames
83
 
84
+ def convert_file_to_images(file_path: str, dpi: int = 200):
85
+ """
86
+ Converts a PDF or image file into a list of PIL Images.
87
+ """
88
+ images = []
89
+ file_ext = Path(file_path).suffix.lower()
90
+
91
+ image_suffixes = [".png", ".jpeg", ".jpg"]
92
+ pdf_suffixes = [".pdf"]
93
+
94
+ if file_ext in image_suffixes:
95
+ images.append(Image.open(file_path).convert("RGB"))
96
+ return images
97
+
98
+ if file_ext not in pdf_suffixes:
99
+ raise ValueError(f"Unsupported file type: {file_ext}")
100
+
101
+ pdf_document = fitz.open(file_path)
102
+ zoom = dpi / 72.0
103
+ mat = fitz.Matrix(zoom, zoom)
104
+ for page_num in range(len(pdf_document)):
105
+ page = pdf_document.load_page(page_num)
106
+ pix = page.get_pixmap(matrix=mat)
107
+ img_data = pix.tobytes("png")
108
+ images.append(Image.open(BytesIO(img_data)))
109
+ pdf_document.close()
110
+ return images
111
+
112
+
113
  @spaces.GPU
114
  def generate_image(text: str, image: Image.Image,
115
  max_new_tokens: int = 1024,
 
119
  repetition_penalty: float = 1.2):
120
  """
121
  Generates responses using the Qwen3-VL model for image input.
122
+ Yields outputs for the new tabbed layout.
123
  """
124
  if image is None:
125
+ yield "Please upload an image.", "", "", "Please upload an image."
126
  return
127
 
128
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
 
140
  for new_text in streamer:
141
  buffer += new_text
142
  time.sleep(0.01)
143
+ yield buffer, "", "", buffer
 
144
 
145
  @spaces.GPU
146
  def generate_video(text: str, video_path: str,
 
151
  repetition_penalty: float = 1.2):
152
  """
153
  Generates responses using the Qwen3-VL model for video input.
154
+ Yields outputs for the new tabbed layout.
155
  """
156
  if video_path is None:
157
+ yield "Please upload a video.", "", "", "Please upload a video."
158
  return
159
 
160
  frames_with_ts = downsample_video(video_path)
161
  if not frames_with_ts:
162
+ yield "Could not process video.", "", "", "Could not process video."
163
  return
164
 
165
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
 
187
  buffer += new_text
188
  buffer = buffer.replace("<|im_end|>", "")
189
  time.sleep(0.01)
190
+ yield buffer, "", "", buffer
 
191
 
192
  @spaces.GPU
193
+ def generate_document(
194
+ file_path: str,
195
+ max_new_tokens: int = 2048,
196
+ temperature: float = 0.1,
197
+ top_p: float = 0.9,
198
+ top_k: int = 50,
199
+ repetition_penalty: float = 1.05,
200
+ ):
201
  """
202
+ Processes a document (PDF/image) page by page, generating structured HTML and Markdown.
203
  """
204
+ if not file_path:
205
+ yield "Please upload a document.", "", "", "Please upload a document."
206
  return
207
 
208
+ try:
209
+ page_images = convert_file_to_images(file_path)
210
+ if not page_images:
211
+ yield "Could not process the document.", "", "", "Could not process the document."
212
+ return
213
+ except Exception as e:
214
+ error_msg = f"Error reading file: {e}"
215
+ yield error_msg, "", "", error_msg
216
+ return
217
 
218
+ full_html_content = ""
219
+ raw_stream_buffer = ""
220
+
221
+ for i, image in enumerate(page_images):
222
+ page_start_message = f"--- Processing Page {i+1}/{len(page_images)} ---\n"
223
+ raw_stream_buffer += page_start_message
224
+ yield markdown.markdown(raw_stream_buffer), "", "", raw_stream_buffer
225
+
226
+ messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), and figures (<figure>). Filter out irrelevant elements like headers and footers."}]}]
227
+ prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
228
+ inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
229
+
230
+ with torch.no_grad():
231
+ generated_ids = model_q3vl.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty)
 
 
232
 
233
+ generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
234
+ page_html = processor_q3vl.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]
 
235
 
236
+ full_html_content += f'\n\n<!-- Page {i+1} -->\n{page_html}'
237
+ raw_stream_buffer += f"{page_html}\n"
238
+
239
+ full_markdown_source = html2text.html2text(full_html_content)
240
+ rendered_markdown = markdown.markdown(full_markdown_source, extensions=['fenced_code', 'tables'])
241
 
242
+ yield rendered_markdown, full_markdown_source, full_html_content, raw_stream_buffer
243
 
244
+ final_message = "\n--- Document processing complete. ---"
245
+ raw_stream_buffer += final_message
246
+ full_markdown_source = html2text.html2text(full_html_content)
247
+ rendered_markdown = markdown.markdown(full_markdown_source, extensions=['fenced_code', 'tables'])
248
+ yield rendered_markdown, full_markdown_source, full_html_content, raw_stream_buffer
249
 
250
+
251
+ # --- Gradio Interface ---
252
  image_examples = [
253
  ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
254
  ["Convert this page to doc [markdown] precisely.", "images/3.png"],
255
  ["Explain the creativity in the image.", "images/6.jpg"],
 
256
  ]
257
 
258
  video_examples = [
 
260
  ["Explain the ad in detail.", "videos/1.mp4"]
261
  ]
262
 
263
+ doc_examples = [
264
+ ["examples/sample-doc.pdf"],
265
+ ["examples/sample-page.png"],
 
266
  ]
267
 
268
  css = """
 
271
  .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
272
  """
273
 
 
274
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
275
  gr.Markdown("# **[Multimodal VLM Thinking with Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**")
276
  with gr.Row():
277
+ with gr.Column(scale=1):
278
  with gr.Tabs():
279
  with gr.TabItem("Image Inference"):
280
  image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
281
  image_upload = gr.Image(type="pil", label="Image", height=290)
282
  image_submit = gr.Button("Submit", elem_classes="submit-btn")
283
  gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
284
+
285
  with gr.TabItem("Video Inference"):
286
  video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
287
  video_upload = gr.Video(label="Video", height=290)
288
  video_submit = gr.Button("Submit", elem_classes="submit-btn")
289
  gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
290
+
291
+ with gr.TabItem("Document Parsing"):
292
+ doc_upload = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"])
293
+ doc_submit = gr.Button("Process Document", elem_classes="submit-btn")
294
+ gr.Examples(examples=doc_examples, inputs=[doc_upload])
 
295
 
296
  with gr.Accordion("Advanced options", open=False):
297
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
 
300
  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
301
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
302
 
303
+ with gr.Column(scale=2):
304
  with gr.Column(elem_classes="canvas-output"):
305
  gr.Markdown("## Output")
306
  with gr.Tabs():
307
  with gr.Tab("Rendered Output"):
308
+ rendered_output = gr.Markdown(label="Rendered Result")
309
  with gr.Tab("Markdown Source"):
310
+ markdown_source_output = gr.TextArea(label="Markdown Source Code", interactive=False, lines=15, show_copy_button=True)
311
+ with gr.Tab("Generated HTML"):
312
+ html_output = gr.TextArea(label="Generated HTML Source", interactive=False, lines=15, show_copy_button=True)
313
+ with gr.Tab("Raw Stream"):
314
+ raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=15, show_copy_button=True)
315
+
316
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
317
  gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
318
+ gr.Markdown("> ⚠️ Note: Video and document inference performance can vary depending on the complexity and length of the input.")
319
+
320
+ # Define the output components list
321
+ output_components = [rendered_output, markdown_source_output, html_output, raw_output]
322
+
323
+ # Link buttons to functions
324
  image_submit.click(
325
  fn=generate_image,
326
+ inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
327
+ outputs=output_components
328
  )
329
  video_submit.click(
330
  fn=generate_video,
331
+ inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
332
+ outputs=output_components
333
  )
334
+ doc_submit.click(
335
+ fn=generate_document,
336
+ inputs=[doc_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
337
+ outputs=output_components
338
  )
339
 
 
340
  if __name__ == "__main__":
341
+ # Create dummy example files if they don't exist
342
+ if not os.path.exists("images"):
343
+ os.makedirs("images")
344
+ if not os.path.exists("videos"):
345
+ os.makedirs("videos")
346
+ if not os.path.exists("examples"):
347
+ os.makedirs("examples")
348
+ # You may need to add placeholder files to these directories for the examples to load without errors.
349
+
350
  demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)