Spaces:

prithivMLmods
/

Multimodal-VLM-Thinking

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 11

Commit

76468c1

verified ·

1 Parent(s): 0c8310d

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -146

app.py CHANGED Viewed

@@ -5,8 +5,6 @@ import json
 import time
 import asyncio
 from threading import Thread
-from pathlib import Path
-from io import BytesIO
 import gradio as gr
 import spaces
@@ -15,9 +13,6 @@ import numpy as np
 from PIL import Image
 import cv2
 import requests
-import fitz  # PyMuPDF
-import html2text
-import markdown
 from transformers import (
     Qwen3VLMoeForConditionalGeneration,
@@ -81,35 +76,6 @@ def downsample_video(video_path):
     vidcap.release()
     return frames
-def convert_file_to_images(file_path: str, dpi: int = 200):
-    """
-    Converts a PDF or image file into a list of PIL Images.
-    """
-    images = []
-    file_ext = Path(file_path).suffix.lower()
-    image_suffixes = [".png", ".jpeg", ".jpg"]
-    pdf_suffixes = [".pdf"]
-    if file_ext in image_suffixes:
-        images.append(Image.open(file_path).convert("RGB"))
-        return images
-    if file_ext not in pdf_suffixes:
-        raise ValueError(f"Unsupported file type: {file_ext}")
-    pdf_document = fitz.open(file_path)
-    zoom = dpi / 72.0
-    mat = fitz.Matrix(zoom, zoom)
-    for page_num in range(len(pdf_document)):
-        page = pdf_document.load_page(page_num)
-        pix = page.get_pixmap(matrix=mat)
-        img_data = pix.tobytes("png")
-        images.append(Image.open(BytesIO(img_data)))
-    pdf_document.close()
-    return images
 @spaces.GPU
 def generate_image(text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
@@ -119,15 +85,15 @@ def generate_image(text: str, image: Image.Image,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the Qwen3-VL model for image input.
-    Yields outputs for the new tabbed layout.
     """
     if image is None:
-        yield "Please upload an image.", "", "", "Please upload an image."
         return
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor_q3vl(
         text=[prompt_full], images=[image], return_tensors="pt", padding=True
     ).to(device)
@@ -140,7 +106,7 @@ def generate_image(text: str, image: Image.Image,
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
-        yield buffer, "", "", buffer
 @spaces.GPU
 def generate_video(text: str, video_path: str,
@@ -151,25 +117,26 @@ def generate_video(text: str, video_path: str,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the Qwen3-VL model for video input.
-    Yields outputs for the new tabbed layout.
     """
     if video_path is None:
-        yield "Please upload a video.", "", "", "Please upload a video."
         return
     frames_with_ts = downsample_video(video_path)
     if not frames_with_ts:
-        yield "Could not process video.", "", "", "Could not process video."
         return
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
     images_for_processor = []
     for frame, timestamp in frames_with_ts:
-        messages[0]["content"].insert(0, {"type": "image"})
         images_for_processor.append(frame)
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor_q3vl(
         text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True
     ).to(device)
@@ -187,72 +154,17 @@ def generate_video(text: str, video_path: str,
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
-        yield buffer, "", "", buffer
-@spaces.GPU
-def generate_document(
-    file_path: str,
-    max_new_tokens: int = 2048,
-    temperature: float = 0.1,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.05,
-):
-    """
-    Processes a document (PDF/image) page by page, generating structured HTML and Markdown.
-    """
-    if not file_path:
-        yield "Please upload a document.", "", "", "Please upload a document."
-        return
-    try:
-        page_images = convert_file_to_images(file_path)
-        if not page_images:
-            yield "Could not process the document.", "", "", "Could not process the document."
-            return
-    except Exception as e:
-        error_msg = f"Error reading file: {e}"
-        yield error_msg, "", "", error_msg
-        return
-    full_html_content = ""
-    raw_stream_buffer = ""
-    for i, image in enumerate(page_images):
-        page_start_message = f"--- Processing Page {i+1}/{len(page_images)} ---\n"
-        raw_stream_buffer += page_start_message
-        yield markdown.markdown(raw_stream_buffer), "", "", raw_stream_buffer
-        messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Parse this document page into a clean, structured HTML representation. Preserve the logical structure with appropriate tags for content blocks such as paragraphs (<p>), headings (<h1>-<h6>), tables (<table>), and figures (<figure>). Filter out irrelevant elements like headers and footers."}]}]
-        prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
-        with torch.no_grad():
-            generated_ids = model_q3vl.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty)
-        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
-        page_html = processor_q3vl.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0]
-        full_html_content += f'\n\n<!-- Page {i+1} -->\n{page_html}'
-        raw_stream_buffer += f"{page_html}\n"
-        full_markdown_source = html2text.html2text(full_html_content)
-        rendered_markdown = markdown.markdown(full_markdown_source, extensions=['fenced_code', 'tables'])
-        yield rendered_markdown, full_markdown_source, full_html_content, raw_stream_buffer
-    final_message = "\n--- Document processing complete. ---"
-    raw_stream_buffer += final_message
-    full_markdown_source = html2text.html2text(full_html_content)
-    rendered_markdown = markdown.markdown(full_markdown_source, extensions=['fenced_code', 'tables'])
-    yield rendered_markdown, full_markdown_source, full_html_content, raw_stream_buffer
-# --- Gradio Interface ---
 image_examples = [
     ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
     ["Convert this page to doc [markdown] precisely.", "images/3.png"],
     ["Explain the creativity in the image.", "images/6.jpg"],
 ]
 video_examples = [
@@ -260,39 +172,29 @@ video_examples = [
     ["Explain the ad in detail.", "videos/1.mp4"]
 ]
-doc_examples = [
-    ["examples/sample-doc.pdf"],
-    ["examples/sample-page.png"],
-]
 css = """
 .submit-btn { background-color: #2980b9 !important; color: white !important; }
 .submit-btn:hover { background-color: #3498db !important; }
 .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
 """
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **[Multimodal VLM Thinking with Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**")
     with gr.Row():
-        with gr.Column(scale=1):
             with gr.Tabs():
                 with gr.TabItem("Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     image_upload = gr.Image(type="pil", label="Image", height=290)
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video", height=290)
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
-                with gr.TabItem("Document Parsing"):
-                    doc_upload = gr.File(label="Upload PDF or Image", file_types=[".pdf", ".jpg", ".jpeg", ".png"])
-                    doc_submit = gr.Button("Process Document", elem_classes="submit-btn")
-                    gr.Examples(examples=doc_examples, inputs=[doc_upload])
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -300,51 +202,26 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-        with gr.Column(scale=2):
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
-                with gr.Tabs():
-                    with gr.Tab("Rendered Output"):
-                        rendered_output = gr.Markdown(label="Rendered Result")
-                    with gr.Tab("Markdown Source"):
-                        markdown_source_output = gr.TextArea(label="Markdown Source Code", interactive=False, lines=15, show_copy_button=True)
-                    with gr.Tab("Generated HTML"):
-                        html_output = gr.TextArea(label="Generated HTML Source", interactive=False, lines=15, show_copy_button=True)
-                    with gr.Tab("Raw Stream"):
-                        raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=15, show_copy_button=True)
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
             gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
-            gr.Markdown("> ⚠️ Note: Video and document inference performance can vary depending on the complexity and length of the input.")
-    # Define the output components list
-    output_components = [rendered_output, markdown_source_output, html_output, raw_output]
-    # Link buttons to functions
     image_submit.click(
         fn=generate_image,
         inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=output_components
     )
     video_submit.click(
         fn=generate_video,
         inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=output_components
-    )
-    doc_submit.click(
-        fn=generate_document,
-        inputs=[doc_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-        outputs=output_components
     )
 if __name__ == "__main__":
-    # Create dummy example files if they don't exist
-    if not os.path.exists("images"):
-        os.makedirs("images")
-    if not os.path.exists("videos"):
-        os.makedirs("videos")
-    if not os.path.exists("examples"):
-        os.makedirs("examples")
-    # You may need to add placeholder files to these directories for the examples to load without errors.
     demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)

 import time
 import asyncio
 from threading import Thread
 import gradio as gr
 import spaces
 from PIL import Image
 import cv2
 import requests
 from transformers import (
     Qwen3VLMoeForConditionalGeneration,
     vidcap.release()
     return frames
 @spaces.GPU
 def generate_image(text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the Qwen3-VL model for image input.
     """
     if image is None:
+        yield "Please upload an image.", "Please upload an image."
         return
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # FIX: Removed truncation=True and max_length to prevent the ValueError
     inputs = processor_q3vl(
         text=[prompt_full], images=[image], return_tensors="pt", padding=True
     ).to(device)
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
+        yield buffer, buffer
 @spaces.GPU
 def generate_video(text: str, video_path: str,
                    repetition_penalty: float = 1.2):
     """
     Generates responses using the Qwen3-VL model for video input.
     """
     if video_path is None:
+        yield "Please upload a video.", "Please upload a video."
         return
     frames_with_ts = downsample_video(video_path)
     if not frames_with_ts:
+        yield "Could not process video.", "Could not process video."
         return
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
     images_for_processor = []
+    # Add an <|image|> placeholder for each frame in the message
     for frame, timestamp in frames_with_ts:
+        messages[0]["content"].insert(0, {"type": "image"}) # Insert at beginning to match common patterns
         images_for_processor.append(frame)
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # FIX: Removed truncation=True and max_length to prevent the ValueError
     inputs = processor_q3vl(
         text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True
     ).to(device)
         buffer += new_text
         buffer = buffer.replace("<|im_end|>", "")
         time.sleep(0.01)
+        yield buffer, buffer
+# Define examples for image and video inference
 image_examples = [
     ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
     ["Convert this page to doc [markdown] precisely.", "images/3.png"],
+    ["Convert this page to doc [markdown] precisely.", "images/4.png"],
     ["Explain the creativity in the image.", "images/6.jpg"],
+    ["Convert this page to doc [markdown] precisely.", "images/1.png"],
+    ["Convert chart to OTSL.", "images/2.png"]
 ]
 video_examples = [
     ["Explain the ad in detail.", "videos/1.mp4"]
 ]
 css = """
 .submit-btn { background-color: #2980b9 !important; color: white !important; }
 .submit-btn:hover { background-color: #3498db !important; }
 .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
 """
+# Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("# **[Multimodal VLM Thinking with Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**")
     with gr.Row():
+        with gr.Column():
             with gr.Tabs():
                 with gr.TabItem("Image Inference"):
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     image_upload = gr.Image(type="pil", label="Image", height=290)
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video", height=290)
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+        with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
+                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=5, show_copy_button=True)
+                with gr.Accordion("(Result.md)", open=False):
+                    markdown_output = gr.Markdown(label="(Result.Md)")
             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
             gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
+            gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
     image_submit.click(
         fn=generate_image,
         inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[output, markdown_output]
     )
     video_submit.click(
         fn=generate_video,
         inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[output, markdown_output]
     )
 if __name__ == "__main__":
     demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)