Update app.py

app.py CHANGED
@@ -24,7 +24,6 @@ from transformers.image_utils import load_image
 # Constants for text generation
 MAX_MAX_NEW_TOKENS = 4096
 DEFAULT_MAX_NEW_TOKENS = 2048
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 # Let the environment (e.g., Hugging Face Spaces) determine the device.
 # This avoids conflicts with the CUDA environment setup by the platform.
@@ -46,7 +45,7 @@ print("Using device:", device)
 # processor version the model was originally saved with.
 
 # Load Qwen3VL
-MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct"
+MODEL_ID_Q3VL = "Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"  # FP8-quantized version of Qwen3-VL-30B-A3B-Instruct.
 processor_q3vl = AutoProcessor.from_pretrained(MODEL_ID_Q3VL, trust_remote_code=True, use_fast=False)
 model_q3vl = Qwen3VLMoeForConditionalGeneration.from_pretrained(
     MODEL_ID_Q3VL,
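Note on the model swap: the FP8 checkpoint ships pre-quantized weights, so no extra quantization config is needed at load time. A minimal sketch of loading it standalone follows; the diff only shows the first argument of the `from_pretrained` call, so `torch_dtype="auto"` and `device_map="auto"` here are assumptions, not taken from this commit.

# Sketch: loading the FP8-quantized checkpoint outside the Space.
# Only MODEL_ID is taken from the diff above; the dtype/device_map
# arguments are assumed defaults for a large MoE checkpoint.
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",  # keep weights in the dtype stored in the checkpoint
    device_map="auto",   # let accelerate place the 30B MoE layers
)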
@@ -93,10 +92,12 @@ def generate_image(text: str, image: Image.Image,
 
     messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # FIX: Removed truncation=True and max_length to prevent the ValueError
     inputs = processor_q3vl(
-        text=[prompt_full], images=[image], return_tensors="pt", padding=True,
-        truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
+        text=[prompt_full], images=[image], return_tensors="pt", padding=True
     ).to(device)
+
     streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model_q3vl.generate, kwargs=generation_kwargs)
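Why removing `truncation=True`/`max_length` fixes the error: the chat template inserts vision placeholder tokens into `prompt_full`, and truncating the tokenized text can cut those placeholders off, so the token count no longer matches the image features the processor produces. If prompt length still needs a cap, one safer approach (a sketch, not what this commit does; the `MAX_TEXT_TOKENS` budget is an assumption) is to truncate only the raw user text before building the messages:

# Sketch: bound the user text instead of truncating the templated prompt,
# so the image placeholder tokens inserted by apply_chat_template survive.
MAX_TEXT_TOKENS = 3072  # assumed budget; this commit removes the cap entirely

ids = processor_q3vl.tokenizer(text, truncation=True, max_length=MAX_TEXT_TOKENS)["input_ids"]
safe_text = processor_q3vl.tokenizer.decode(ids, skip_special_tokens=True)
messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": safe_text}]}]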
@@ -128,15 +129,18 @@ def generate_video(text: str, video_path: str,
 
     messages = [{"role": "user", "content": [{"type": "text", "text": text}]}]
     images_for_processor = []
+    # Add an <|image|> placeholder for each frame in the message
     for frame, timestamp in frames_with_ts:
-        messages[0]["content"].
+        messages[0]["content"].insert(0, {"type": "image"})  # Insert at beginning to match common patterns
         images_for_processor.append(frame)
 
     prompt_full = processor_q3vl.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+    # FIX: Removed truncation=True and max_length to prevent the ValueError
     inputs = processor_q3vl(
-        text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True,
-        truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH
+        text=[prompt_full], images=images_for_processor, return_tensors="pt", padding=True
     ).to(device)
+
     streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
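The placeholder loop above maintains the invariant the processor enforces: one `{"type": "image"}` entry per frame, since `apply_chat_template` emits one vision placeholder per entry and the processor raises a `ValueError` on any mismatch. A one-line sanity check, as a sketch:

# Sketch: assert the placeholder/frame invariant before calling the processor.
n_placeholders = sum(1 for part in messages[0]["content"] if part.get("type") == "image")
assert n_placeholders == len(images_for_processor), "placeholder/frame count mismatch"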
@@ -176,7 +180,7 @@ css = """
 
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **[Multimodal VLM Thinking](https://huggingface.co/
+    gr.Markdown("# **[Multimodal VLM Thinking with Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
@@ -205,7 +209,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     with gr.Accordion("(Result.md)", open=False):
         markdown_output = gr.Markdown(label="(Result.Md)")
     gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-Thinking/discussions)")
-    gr.Markdown("> [Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)
+    gr.Markdown("> Using **[Qwen/Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct)**, a powerful and versatile vision-language model. It excels at understanding and processing both text and visual information, making it suitable for a wide range of multimodal tasks. The model demonstrates strong performance in areas like visual question answering, image captioning, and video analysis.")
     gr.Markdown("> ⚠️ Note: Video inference performance can vary depending on the complexity and length of the video.")
 
     image_submit.click(
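Both handlers stream output the same way: `generate` runs on a worker thread while the Gradio callback iterates the streamer. A self-contained sketch of that pattern, reusing the `model_q3vl`, `processor_q3vl`, and `inputs` names from the hunks above (the `max_new_tokens` value here is illustrative):

# Sketch of the streaming pattern used in both generate_image and generate_video.
from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(processor_q3vl, skip_prompt=True, skip_special_tokens=True)
thread = Thread(target=model_q3vl.generate,
                kwargs={**inputs, "streamer": streamer, "max_new_tokens": 2048})
thread.start()
buffer = ""
for chunk in streamer:  # yields decoded text incrementally as tokens arrive
    buffer += chunk
    # in the Space, each partial buffer is yielded back to the Gradio UI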