Spaces:

prithivMLmods
/

Multimodal-VLM-Thinking

Running on Zero

App Files Files Community

prithivMLmods committed on Oct 11

Commit

b560a0e

verified ·

1 Parent(s): 6d94394

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -32

app.py CHANGED Viewed

@@ -5,6 +5,8 @@ import json
 import time
 import asyncio
 from threading import Thread
 import gradio as gr
 import spaces
@@ -13,6 +15,7 @@ import numpy as np
 from PIL import Image
 import cv2
 import requests
 from transformers import (
     Qwen3VLMoeForConditionalGeneration,
@@ -26,7 +29,6 @@ MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
 # Let the environment (e.g., Hugging Face Spaces) determine the device.
-# This avoids conflicts with the CUDA environment setup by the platform.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
@@ -41,9 +43,6 @@ if torch.cuda.is_available():
 print("Using device:", device)
 # --- Model Loading ---
-# To address the warnings, we add `use_fast=False` to ensure we use the
-# processor version the model was originally saved with.
 # Load Qwen3VL
 MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
 processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
@@ -57,13 +56,11 @@ model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
-    Each frame is returned as a PIL image along with its timestamp.
     """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-    # Use a maximum of 10 frames to avoid excessive memory usage
     frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
@@ -71,11 +68,29 @@ def downsample_video(video_path):
         if success:
             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
             pil_image = Image.fromarray(image)
-            timestamp = round(i / fps, 2)
-            frames.append((pil_image, timestamp))
     vidcap.release()
     return frames
 @spaces.GPU
 def generate_image(text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
@@ -84,7 +99,7 @@ def generate_image(text: str, image: Image.Image,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
-    Generates responses using the Qwen3-VL model for image input.
     """
     if image is None:
         yield "Please upload an image.", "Please upload an image."
@@ -93,10 +108,7 @@ def generate_image(text: str, image: Image.Image,
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    # FIX: Removed truncation=True and max_length to prevent the ValueError
-    inputs = processor_q3vl(
-        text=[prompt_full], images=[image], return_tensors="pt", padding=True
-    ).to(device)
     streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
@@ -116,30 +128,23 @@ def generate_video(text: str, video_path: str,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
-    Generates responses using the Qwen3-VL model for video input.
     """
     if video_path is None:
         yield "Please upload a video.", "Please upload a video."
         return
-    frames_with_ts = downsample_video(video_path)
-    if not frames_with_ts:
         yield "Could not process video.", "Could not process video."
         return
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
-    images_for_processor = []
-    # Add an <|image|> placeholder for each frame in the message
-    for frame, timestamp in frames_with_ts:
-        messages[0]["content"].insert(0, {"type": "image"}) # Insert at beginning to match common patterns
-        images_for_processor.append(frame)
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    # FIX: Removed truncation=True and max_length to prevent the ValueError
-    inputs = processor_q3vl(
-        text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True
-    ).to(device)
     streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
@@ -156,15 +161,57 @@ def generate_video(text: str, video_path: str,
         time.sleep(0.01)
         yield buffer, buffer
-# Define examples for image and video inference
 image_examples = [
     ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
     ["Convert this page to doc [markdown] precisely.", "images/3.png"],
-    ["Convert this page to doc [markdown] precisely.", "images/4.png"],
     ["Explain the creativity in the image.", "images/6.jpg"],
-    ["Convert this page to doc [markdown] precisely.", "images/1.png"],
-    ["Convert chart to OTSL.", "images/2.png"]
 ]
 video_examples = [
@@ -172,13 +219,17 @@ video_examples = [
     ["Explain the ad in detail.", "videos/1.mp4"]
 ]
 css = """
 .submit-btn { background-color: #2980b9 !important; color: white !important; }
 .submit-btn:hover { background-color: #3498db !important; }
 .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
 """
-# Create the Gradio Interface
 with gr.Blocks(css=css) as demo:
     gr.Markdown("# **Qwen3-VL-Processor**")
     with gr.Row():
@@ -189,12 +240,19 @@ with gr.Blocks(css=css) as demo:
                     image_upload = gr.Image(type="pil", label="Image", height=290)
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video", height=290)
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
@@ -205,9 +263,11 @@ with gr.Blocks(css=css) as demo:
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
-                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=9, show_copy_button=True)
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.Md)")
     image_submit.click(
         fn=generate_image,
         inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
@@ -218,6 +278,11 @@ with gr.Blocks(css=css) as demo:
         inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[output, markdown_output]
     )
 if __name__ == "__main__":
     demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)

 import time
 import asyncio
 from threading import Thread
+from pathlib import Path
+from io import BytesIO
 import gradio as gr
 import spaces
 from PIL import Image
 import cv2
 import requests
+import fitz  # PyMuPDF
 from transformers import (
     Qwen3VLMoeForConditionalGeneration,
 DEFAULT_MAX_NEW_TOKENS = 2048
 # Let the environment (e.g., Hugging Face Spaces) determine the device.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
 print("Using device:", device)
 # --- Model Loading ---
 # Load Qwen3VL
 MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
 processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
     """
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
     frame_indices = np.linspace(0, total_frames - 1, min(total_frames, 10), dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
         if success:
             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
             pil_image = Image.fromarray(image)
+            frames.append(pil_image)
     vidcap.release()
     return frames
+def convert_pdf_to_images(file_path: str, dpi: int = 200):
+    """
+    Converts a PDF file into a list of PIL Images, one per page.
+
+    Args:
+        file_path: Path of the PDF to render. A falsy value (None or "")
+            returns an empty list without opening anything.
+        dpi: Render resolution; each page is scaled by dpi / 72.
+
+    Returns:
+        A list of PIL Images in page order.
+    """
+    if not file_path:
+        return []
+    zoom = dpi / 72.0
+    mat = fitz.Matrix(zoom, zoom)
+    images = []
+    # The context manager closes the document even when rendering a page
+    # raises, so a failure no longer leaks the open PDF handle.
+    with fitz.open(file_path) as pdf_document:
+        for page_num in range(len(pdf_document)):
+            page = pdf_document.load_page(page_num)
+            pix = page.get_pixmap(matrix=mat)
+            img_data = pix.tobytes("png")
+            images.append(Image.open(BytesIO(img_data)))
+    return images
 @spaces.GPU
 def generate_image(text: str, image: Image.Image,
                    max_new_tokens: int = 1024,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
+    Generates responses for a single image input.
     """
     if image is None:
         yield "Please upload an image.", "Please upload an image."
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
     streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """
+    Generates responses for a video input by processing downsampled frames.
     """
     if video_path is None:
         yield "Please upload a video.", "Please upload a video."
         return
+    frames = downsample_video(video_path)
+    if not frames:
         yield "Could not process video.", "Could not process video."
         return
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
+    for frame in frames:
+        messages[0]["content"].insert(0, {"type": "image"})
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    inputs = processor_q3vl(text=[prompt_full], images=frames, return_tensors="pt", padding=True).to(device)
     streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         time.sleep(0.01)
         yield buffer, buffer
+@spaces.GPU
+def generate_pdf(text: str, pdf_path: str,
+                 max_new_tokens: int = 2048,
+                 temperature: float = 0.6,
+                 top_p: float = 0.9,
+                 top_k: int = 50,
+                 repetition_penalty: float = 1.2):
+    """
+    Processes a PDF file page by page and streams a combined textual output.
+
+    Every page is rendered to an image and sent to the model together with
+    the same query. After each streamed chunk the text accumulated so far
+    (all finished pages plus the current one) is yielded as a pair, once for
+    each of the two outputs the click handler binds.
+
+    Args:
+        text: The query applied to every page.
+        pdf_path: Path of the uploaded PDF; a falsy value yields a prompt to
+            upload a file.
+        max_new_tokens: Generation budget per page.
+
+    NOTE(review): temperature, top_p, top_k and repetition_penalty are
+    accepted but not forwarded to generate() -- confirm whether that is
+    intended.
+    """
+    if not pdf_path:
+        yield "Please upload a PDF file.", "Please upload a PDF file."
+        return
+    try:
+        page_images = convert_pdf_to_images(pdf_path)
+    except Exception as e:
+        yield f"Error processing PDF: {e}", f"Error processing PDF: {e}"
+        return
+    if not page_images:
+        yield "Could not extract pages from the PDF.", "Could not extract pages from the PDF."
+        return
+    # The prompt does not depend on the page, so build it once.
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
+    prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    total_pages = len(page_images)
+    full_response = ""
+    for page_number, image in enumerate(page_images, start=1):
+        page_header = f"--- Page {page_number}/{total_pages} ---\n"
+        # Prefix the header with the finished pages so they stay visible
+        # while the current page is still waiting for its first token.
+        page_prefix = full_response + page_header
+        yield page_prefix, page_prefix
+        inputs = processor_q3vl(text=[prompt_full], images=[image], return_tensors="pt", padding=True).to(device)
+        streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+        thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
+        thread.start()
+        page_buffer = ""
+        for new_text in streamer:
+            page_buffer += new_text
+            current_output = page_prefix + page_buffer
+            yield current_output, current_output
+            time.sleep(0.01)
+        # The streamer is exhausted once generation ends; reap the worker
+        # before starting the next page.
+        thread.join()
+        full_response = page_prefix + page_buffer + "\n"
+# --- Gradio Interface ---
 image_examples = [
     ["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"],
     ["Convert this page to doc [markdown] precisely.", "images/3.png"],
     ["Explain the creativity in the image.", "images/6.jpg"],
 ]
 video_examples = [
     ["Explain the ad in detail.", "videos/1.mp4"]
 ]
+#pdf_examples = [
+#    ["Summarize the key findings from this document.", "examples/sample-doc.pdf"],
+#    ["Extract the main points from each section.", "examples/research-paper.pdf"],
+#]
 css = """
 .submit-btn { background-color: #2980b9 !important; color: white !important; }
 .submit-btn:hover { background-color: #3498db !important; }
 .canvas-output { border: 2px solid #4682B4; border-radius: 10px; padding: 20px; }
 """
 with gr.Blocks(css=css) as demo:
     gr.Markdown("# **Qwen3-VL-Processor**")
     with gr.Row():
                     image_upload = gr.Image(type="pil", label="Image", height=290)
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=image_examples, inputs=[image_query, image_upload])
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video", height=290)
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
                     gr.Examples(examples=video_examples, inputs=[video_query, video_upload])
+                with gr.TabItem("PDF Inference"):
+                    pdf_query = gr.Textbox(label="Query Input", placeholder="e.g., 'Summarize this document'")
+                    pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
+                    pdf_submit = gr.Button("Submit", elem_classes="submit-btn")
+                    #gr.Examples(examples=pdf_examples, inputs=[pdf_query, pdf_upload])
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
         with gr.Column():
             with gr.Column(elem_classes="canvas-output"):
                 gr.Markdown("## Output")
+                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=10, show_copy_button=True)
                 with gr.Accordion("(Result.md)", open=False):
                     markdown_output = gr.Markdown(label="(Result.Md)")
+    # Event handlers
     image_submit.click(
         fn=generate_image,
         inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[output, markdown_output]
     )
+    pdf_submit.click(
+        fn=generate_pdf,
+        inputs=[pdf_query, pdf_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+        outputs=[output, markdown_output]
+    )
 if __name__ == "__main__":
     demo.queue(max_size=50).launch(mcp_server=True, ssr_mode=False, show_error=True)