prithivMLmods committed on
Commit
416d62f
·
verified ·
1 Parent(s): 7d7a5da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -9
app.py CHANGED
@@ -24,7 +24,6 @@ from transformers.image_utils import load_image
24
  # Constants for text generation
25
  MAX_MAX_NEW_TOKENS = 4096
26
  DEFAULT_MAX_NEW_TOKENS = 2048
27
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
28
 
29
  # Let the environment (e.g., Hugging Face Spaces) determine the device.
30
  # This avoids conflicts with the CUDA environment setup by the platform.
@@ -46,7 +45,7 @@ print("Using device:", device)
46
  # processor version the model was originally saved with.
47
 
48
  # Load Qwen3VL
49
- MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
50
  processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
51
  model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
52
  MODEL_ID_Q3VL,
@@ -93,10 +92,12 @@ def generate_image(text: str, image: Image.Image,
93
 
94
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
95
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
96
  inputs = processor_q3vl(
97
- text=[prompt_full], images=[image], return_tensors="pt", padding=True,
98
- truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
99
  ).to(device)
 
100
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
101
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
102
  thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
@@ -128,15 +129,18 @@ def generate_video(text: str, video_path: str,
128
 
129
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
130
  images_for_processor = []
 
131
  for frame, timestamp in frames_with_ts:
132
- messages[0]["content"].append({"type": "image"})
133
  images_for_processor.append(frame)
134
 
135
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
136
  inputs = processor_q3vl(
137
- text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True,
138
- truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
139
  ).to(device)
 
140
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
141
  generation_kwargs = {
142
  **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
@@ -176,7 +180,7 @@ css = """
176
 
177
  # Create the Gradio Interface
178
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
179
- gr.Markdown("# **[Multimodal VLM Thinking](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
180
  with gr.Row():
181
  with gr.Column():
182
  with gr.Tabs():
@@ -205,7 +209,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
205
  with gr.Accordion("(Result.md)", open=False):
206
  markdown_output = gr.Markdown(label="(Result.Md)")
207
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
208
- gr.Markdown("> [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct) is a powerful, versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
209
  gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
210
 
211
  image_submit.click(
 
24
  # Constants for text generation
25
  MAX_MAX_NEW_TOKENS = 4096
26
  DEFAULT_MAX_NEW_TOKENS = 2048
 
27
 
28
  # Let the environment (e.g., Hugging Face Spaces) determine the device.
29
  # This avoids conflicts with the CUDA environment setup by the platform.
 
45
  # processor version the model was originally saved with.
46
 
47
  # Load Qwen3VL
48
+ MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct-FP8" # fp8 quantized version of the Qwen3-VL-30B-A3B-Instruct model.
49
  processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
50
  model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
51
  MODEL_ID_Q3VL,
 
92
 
93
  messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
94
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
95
+
96
+ # FIX: Removed truncation=True and max_length to prevent the ValueError
97
  inputs = processor_q3vl(
98
+ text=[prompt_full], images=[image], return_tensors="pt", padding=True
 
99
  ).to(device)
100
+
101
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
102
  generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
103
  thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
 
129
 
130
  messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
131
  images_for_processor = []
132
+ # Add an <|image|> placeholder for each frame in the message
133
  for frame, timestamp in frames_with_ts:
134
+ messages[0]["content"].insert(0, {"type": "image"}) # Insert at beginning to match common patterns
135
  images_for_processor.append(frame)
136
 
137
  prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
138
+
139
+ # FIX: Removed truncation=True and max_length to prevent the ValueError
140
  inputs = processor_q3vl(
141
+ text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True
 
142
  ).to(device)
143
+
144
  streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
145
  generation_kwargs = {
146
  **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
 
180
 
181
  # Create the Gradio Interface
182
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
183
+ gr.Markdown("# **[Multimodal VLM Thinking with Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**")
184
  with gr.Row():
185
  with gr.Column():
186
  with gr.Tabs():
 
209
  with gr.Accordion("(Result.md)", open=False):
210
  markdown_output = gr.Markdown(label="(Result.Md)")
211
  gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
212
+ gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
213
  gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
214
 
215
  image_submit.click(