prithivMLmods committed on
Commit
30d47a8
·
verified ·
1 Parent(s): 432c2cb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -20
app.py CHANGED
@@ -16,7 +16,7 @@ import numpy as np
16
  from PIL import Image
17
  import cv2
18
  import requests
19
- import fitz # PyMuPDF
20
 
21
  from transformers import (
22
  Qwen3VLMoeForConditionalGeneration,
@@ -25,7 +25,6 @@ from transformers import (
25
  )
26
  from transformers.image_utils import load_image
27
 
28
- # --- Theme Definition ---
29
  from gradio.themes import Soft
30
  from gradio.themes.utils import colors, fonts, sizes
31
 
@@ -50,7 +49,6 @@ class ThistleTheme(Soft):
50
  primary_hue: colors.Color | str = colors.gray,
51
  secondary_hue: colors.Color | str = colors.thistle,
52
  neutral_hue: colors.Color | str = colors.slate,
53
- # Update: Increased base text size from md to lg
54
  text_size: sizes.Size | str = sizes.text_lg,
55
  font: fonts.Font | str | Iterable[fonts.Font | str] = (
56
  fonts.GoogleFont("Inconsolata"), "Arial", "sans-serif",
@@ -105,14 +103,12 @@ class ThistleTheme(Soft):
105
 
106
  thistle_theme = ThistleTheme()
107
 
108
- # --- Custom CSS ---
109
  css = """
110
- /* Update: Added styles to increase the size of the main titles */
111
  #main-title h1 {
112
- font-size: 2.8em !important;
113
  }
114
  #output-title h2 {
115
- font-size: 2.2em !important;
116
  }
117
  :root {
118
  --color-grey-50: #f9fafb;
@@ -160,7 +156,6 @@ div.no-padding { padding: 0 !important; }
160
  @media (max-height: 1280px) { div.block.chatbot { max-height: 800px !important; } }
161
  """
162
 
163
- # --- App Constants & Setup ---
164
  MAX_MAX_NEW_TOKENS = 4096
165
  DEFAULT_MAX_NEW_TOKENS = 2048
166
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -175,7 +170,6 @@ if torch.cuda.is_available():
175
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
176
  print("Using device:", device)
177
 
178
- # --- Model Loading ---
179
  MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
180
  processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
181
  model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
@@ -184,8 +178,6 @@ model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
184
  dtype=torch.float16
185
  ).to(device).eval()
186
 
187
- # --- Backend Functions ---
188
-
189
  def extract_gif_frames(gif_path: str):
190
  if not gif_path:
191
  return []
@@ -380,16 +372,22 @@ def generate_gif(text: str, gif_path: str, max_new_tokens: int = 1024, temperatu
380
  buffer = buffer.replace("<|im_end|>", "")
381
  time.sleep(0.01)
382
  yield buffer, buffer
383
-
384
- # --- Gradio Interface ---
385
- image_examples = [["Describe the safety measures in the image. Conclude (Safe / Unsafe)..", "images/5.jpg"], ["Convert this page to doc [markdown] precisely.", "images/3.png"]]
386
- video_examples = [["Explain the video in detail.", "videos/2.mp4"]]
387
- pdf_examples = [["examples/sample-doc.pdf"]]
 
 
 
 
 
 
 
388
 
389
  with gr.Blocks(theme=thistle_theme, css=css) as demo:
390
  pdf_state = gr.State(value=get_initial_pdf_state())
391
- # Update: Added elem_id for CSS targeting
392
- gr.Markdown("## **Qwen3-VL-Demo**", elem_id="main-title")
393
  with gr.Row():
394
  with gr.Column(scale=2):
395
  with gr.Tabs():
@@ -422,10 +420,12 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
422
  gif_query = gr.Textbox(label="Query Input", placeholder="e.g., 'What is happening in this gif?'")
423
  gif_upload = gr.Image(type="filepath", label="Upload GIF", height=290)
424
  gif_submit = gr.Button("Submit", variant="primary")
 
425
 
426
  with gr.TabItem("Caption"):
427
  caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
428
  caption_submit = gr.Button("Generate Caption", variant="primary")
 
429
 
430
  with gr.Accordion("Advanced options", open=False):
431
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
@@ -435,13 +435,11 @@ with gr.Blocks(theme=thistle_theme, css=css) as demo:
435
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
436
 
437
  with gr.Column(scale=3):
438
- # Update: Added elem_id for CSS targeting
439
  gr.Markdown("## Output", elem_id="output-title")
440
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
441
  with gr.Accordion("(Result.md)", open=False):
442
  markdown_output = gr.Markdown(label="(Result.Md)")
443
 
444
- # Event handlers
445
  image_submit.click(fn=generate_image, inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
446
  video_submit.click(fn=generate_video, inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
447
  pdf_submit.click(fn=generate_pdf, inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
 
16
  from PIL import Image
17
  import cv2
18
  import requests
19
+ import fitz
20
 
21
  from transformers import (
22
  Qwen3VLMoeForConditionalGeneration,
 
25
  )
26
  from transformers.image_utils import load_image
27
 
 
28
  from gradio.themes import Soft
29
  from gradio.themes.utils import colors, fonts, sizes
30
 
 
49
  primary_hue: colors.Color | str = colors.gray,
50
  secondary_hue: colors.Color | str = colors.thistle,
51
  neutral_hue: colors.Color | str = colors.slate,
 
52
  text_size: sizes.Size | str = sizes.text_lg,
53
  font: fonts.Font | str | Iterable[fonts.Font | str] = (
54
  fonts.GoogleFont("Inconsolata"), "Arial", "sans-serif",
 
103
 
104
  thistle_theme = ThistleTheme()
105
 
 
106
  css = """
 
107
  #main-title h1 {
108
+ font-size: 2.3em !important;
109
  }
110
  #output-title h2 {
111
+ font-size: 2.1em !important;
112
  }
113
  :root {
114
  --color-grey-50: #f9fafb;
 
156
  @media (max-height: 1280px) { div.block.chatbot { max-height: 800px !important; } }
157
  """
158
 
 
159
  MAX_MAX_NEW_TOKENS = 4096
160
  DEFAULT_MAX_NEW_TOKENS = 2048
161
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
170
  print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
171
  print("Using device:", device)
172
 
 
173
  MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
174
  processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
175
  model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
 
178
  dtype=torch.float16
179
  ).to(device).eval()
180
 
 
 
181
  def extract_gif_frames(gif_path: str):
182
  if not gif_path:
183
  return []
 
372
  buffer = buffer.replace("<|im_end|>", "")
373
  time.sleep(0.01)
374
  yield buffer, buffer
375
+
376
+ image_examples = [["Perform OCR on the image precisely and reconstruct it correctly...", "examples/images/1.jpg"],
377
+ ["Caption the image. Describe the safety measures shown in the image. Conclude whether the situation is (safe or unsafe)...", "examples/images/2.jpg"],
378
+ ["Solve the problem...", "examples/images/3.png"]]
379
+ video_examples = [["Explain the Ad video in detail.", "examples/videos/1.mp4"],
380
+ ["Explain the video in detail.", "examples/videos/2.mp4"]]
381
+ pdf_examples = [["Extract the content precisely.", "examples/pdfs/doc1.pdf"],
382
+ ["Analyze and provide a short report.", "examples/pdfs/doc2.pdf"]]
383
+ gif_examples = [["Describe this GIF.", "examples/gifs/1.gif"],
384
+ ["Describe this GIF.", "examples/gifs/2.gif"]]
385
+ caption_examples = [["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/thailand.jpg"],
386
+ ["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/candy.JPG"]]
387
 
388
  with gr.Blocks(theme=thistle_theme, css=css) as demo:
389
  pdf_state = gr.State(value=get_initial_pdf_state())
390
+ gr.Markdown("# **Qwen3VL HF Demo**", elem_id="main-title")
 
391
  with gr.Row():
392
  with gr.Column(scale=2):
393
  with gr.Tabs():
 
420
  gif_query = gr.Textbox(label="Query Input", placeholder="e.g., 'What is happening in this gif?'")
421
  gif_upload = gr.Image(type="filepath", label="Upload GIF", height=290)
422
  gif_submit = gr.Button("Submit", variant="primary")
423
+ gr.Examples(examples=gif_examples, inputs=[gif_query, gif_upload])
424
 
425
  with gr.TabItem("Caption"):
426
  caption_image_upload = gr.Image(type="pil", label="Image to Caption", height=290)
427
  caption_submit = gr.Button("Generate Caption", variant="primary")
428
+ gr.Examples(examples=caption_examples, inputs=[caption_image_upload])
429
 
430
  with gr.Accordion("Advanced options", open=False):
431
  max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
 
435
  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
436
 
437
  with gr.Column(scale=3):
 
438
  gr.Markdown("## Output", elem_id="output-title")
439
  output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=14, show_copy_button=True)
440
  with gr.Accordion("(Result.md)", open=False):
441
  markdown_output = gr.Markdown(label="(Result.Md)")
442
 
 
443
  image_submit.click(fn=generate_image, inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
444
  video_submit.click(fn=generate_video, inputs=[video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])
445
  pdf_submit.click(fn=generate_pdf, inputs=[pdf_query, pdf_state, max_new_tokens, temperature, top_p, top_k, repetition_penalty], outputs=[output, markdown_output])