prithivMLmods committed on
Commit
416d62f
·
verified ·
1 Parent(s): 7d7a5da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -9
app.py CHANGED
@@ -24,7 +24,6 @@ from transformers.image_utils import load_image
24
  # Constants for text generation
25
  MAX_MAX_NEW_TOKENS = 4096
26
  DEFAULT_MAX_NEW_TOKENS = 2048
27
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
28
 
29
  # Let the environment (e.g., Hugging Face Spaces) determine the device.
30
  # This avoids conflicts with the CUDA environment setup by the platform.
@@ -46,7 +45,7 @@ print("Using device:", device)
46
  # processor version the model was originally saved with.
47
 
48
  # Load Qwen3VL
49
- MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
50
  processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
51
  model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
52
  MODEL_ID_Q3VL,
@@ -93,10 +92,12 @@ def generate_image(text: str, image: Image.Image,
93
 
94
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
95
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
96
  inputs = processor_q3vl(
97
- text=[prompt_full], images=[image], return_tensors="pt", padding=True,
98
- truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
99
  ).to(device)
 
100
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
101
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
102
  thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
@@ -128,15 +129,18 @@ def generate_video(text: str, video_path: str,
128
 
129
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
130
  images_for_processor = []
 
131
  for frame, timestamp in frames_with_ts:
132
- messages[0]["content"].append({"type": "image"})
133
  images_for_processor.append(frame)
134
 
135
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
136
  inputs = processor_q3vl(
137
- text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True,
138
- truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
139
  ).to(device)
 
140
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
141
  generation_kwargs = {
142
  **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
@@ -176,7 +180,7 @@ css = """
176
 
177
  # Create the Gradio Interface
178
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
179
- gr.Markdown("# **[Multimodal VLM Thinking](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
180
  with gr.Row():
181
  with gr.Column():
182
  with gr.Tabs():
@@ -205,7 +209,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
205
  with gr.Accordion("(Result.md)", open=False):
206
  markdown_output = gr.Markdown(label="(Result.Md)")
207
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
208
- gr.Markdown("> [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct) is a powerful, versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
209
  gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
210
 
211
  image_submit.click(
 
24
  # Constants for text generation
25
  MAX_MAX_NEW_TOKENS = 4096
26
  DEFAULT_MAX_NEW_TOKENS = 2048
 
27
 
28
  # Let the environment (e.g., Hugging Face Spaces) determine the device.
29
  # This avoids conflicts with the CUDA environment setup by the platform.
 
45
  # processor version the model was originally saved with.
46
 
47
  # Load Qwen3VL
48
+ MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct-FP8" # fp8 quantized version of the Qwen3-VL-30B-A3B-Instruct model.
49
  processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
50
  model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
51
  MODEL_ID_Q3VL,
 
92
 
93
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
94
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
95
+
96
+ # FIX: Removed truncation=True and max_length to prevent the ValueError
97
  inputs = processor_q3vl(
98
+ text=[prompt_full], images=[image], return_tensors="pt", padding=True
 
99
  ).to(device)
100
+
101
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
102
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
103
  thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
 
129
 
130
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
131
  images_for_processor = []
132
+ # Add an <|image|> placeholder for each frame in the message
133
  for frame, timestamp in frames_with_ts:
134
+ messages[0]["content"].insert(0, {"type": "image"}) # Insert at beginning to match common patterns
135
  images_for_processor.append(frame)
136
 
137
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
138
+
139
+ # FIX: Removed truncation=True and max_length to prevent the ValueError
140
  inputs = processor_q3vl(
141
+ text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True
 
142
  ).to(device)
143
+
144
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
145
  generation_kwargs = {
146
  **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
 
180
 
181
  # Create the Gradio Interface
182
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
183
+ gr.Markdown("# **[Multimodal VLM Thinking with Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**")
184
  with gr.Row():
185
  with gr.Column():
186
  with gr.Tabs():
 
209
  with gr.Accordion("(Result.md)", open=False):
210
  markdown_output = gr.Markdown(label="(Result.Md)")
211
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
212
+ gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
213
  gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
214
 
215
  image_submit.click(