# Spaces: Running on Zero
# File size: 7,557 Bytes
import os
import gradio as gr
import torch
import spaces
import cv2
import numpy as np
from PIL import Image
from typing import List
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
# ========== Configuration ==========
# Hub repository that both the model weights and the processor are loaded from.
MODEL_ID: str = "WoWolf/Qwen2_5vl-7b-fm-tuned"
# Upper bound on the number of frames sampled from a video for one question.
MAX_FRAMES: int = 48
# Passed to ``model.generate`` as ``max_new_tokens``.
MAX_NEW_TOKENS: int = 128
# Sampling temperature; sampling is switched on only when this is above zero.
TEMPERATURE: float = 1.0
# ========== Video Examples Configuration ==========
# Bundled example clips, keyed by file name. Each entry stores the clip's
# path (identical to its key) and the example questions offered for it.
VIDEO_EXAMPLES = {
    clip: {"path": clip, "questions": prompts}
    for clip, prompts in (
        (
            "1_raw.mp4",
            ["What's happening in this video?", "Which hand holds the pen?"],
        ),
        (
            "4_raw.mp4",
            ["What's happening in this video?", "What is the main action in the video?"],
        ),
        (
            "6_raw.mp4",
            ["What's happening in this video?", "What's the right hand doing?"],
        ),
    )
}
# ========== Load Model & Processor ==========
# Both objects are created once, at module import, and shared by every request.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
)
# The processor comes from the same repository as the weights.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
# ========== Video Frame Extraction ==========
def extract_video_frames(video_path: str, max_frames: int = 8) -> List[Image.Image]:
    """Sample up to ``max_frames`` evenly spaced frames from a video file.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        max_frames: Maximum number of frames to return. Values below 1
            yield an empty list.

    Returns:
        The decoded frames as RGB ``PIL.Image`` objects, in ascending frame
        order. The list is empty when the capture cannot be opened, reports
        a non-positive frame count, or none of the sampled frames decode.
    """
    frames: List[Image.Image] = []
    if max_frames <= 0:
        return frames

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return frames

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # A non-positive count would hand np.linspace a negative sample
        # count (ValueError), so treat it the same as "no frames".
        if total_frames <= 0:
            return frames

        # Evenly spaced indices from the first to the last frame.
        frame_indices = np.linspace(
            0, total_frames - 1, min(max_frames, total_frames), dtype=int
        )
        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
            ret, frame = cap.read()
            if not ret:
                # Skip frames that fail to decode instead of aborting.
                continue
            # OpenCV returns BGR; PIL expects RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(frame_rgb))
    finally:
        # Release the capture even if seeking or decoding raises.
        cap.release()
    return frames
# ========== Message Builder ==========
# Text of the system turn that opens every conversation sent to the model.
SYSTEM_PROMPT = " ".join(
    (
        "You are a helpful assistant that watches a user-provided video and answers",
        "questions about it concisely and accurately.",
    )
)
def build_messages(frames: List[Image.Image], question: str, fps: float = 1.0):
    """Assemble the two-turn chat payload (system turn, then user turn).

    The user turn holds the frame list as a single ``video`` item tagged
    with ``fps``, followed by the question as a ``text`` item.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    video_item = {"type": "video", "video": frames, "fps": fps}
    question_item = {"type": "text", "text": question}
    user_turn = {"role": "user", "content": [video_item, question_item]}
    return [system_turn, user_turn]
# ========== Helper Functions ==========
def update_video_display(video_name):
    """Return the preview path and bulleted example questions for a video.

    With no selection (``None``) both outputs are cleared: ``(None, "")``.
    """
    if video_name is None:
        return None, ""
    entry = VIDEO_EXAMPLES[video_name]
    bullet_lines = (f"• {prompt}" for prompt in entry["questions"])
    return entry["path"], "\n".join(bullet_lines)
def fill_question(video_name, question_idx):
    """Return example question ``question_idx`` for the selected video.

    Gives an empty string when no video is selected or the index falls
    outside the video's list of example questions.
    """
    if video_name is None:
        return ""
    candidates = VIDEO_EXAMPLES[video_name]["questions"]
    in_range = 0 <= question_idx < len(candidates)
    return candidates[question_idx] if in_range else ""
# ========== Inference ==========
@spaces.GPU
@torch.inference_mode()
def answer(video_name, question):
    """Answer a question about one of the bundled example videos.

    Returns the decoded model reply with surrounding whitespace stripped,
    or a short message when no video is selected or no frame could be
    extracted. A missing or blank question is replaced by a generic
    "describe this video" request.
    """
    if video_name is None:
        return "Please select a video first."

    # Substitute the default prompt for an empty / whitespace-only question.
    if not (question and question.strip()):
        question = "Describe this video in detail."

    # Sample frames from the selected clip.
    frames = extract_video_frames(
        VIDEO_EXAMPLES[video_name]["path"], max_frames=MAX_FRAMES
    )
    if not frames:
        return "Error: Unable to extract frames from video."

    # Chat payload -> prompt text + vision inputs -> model-ready tensors.
    messages = build_messages(frames, question, fps=1.0)
    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # Sample only when a positive temperature is configured.
    sampling = TEMPERATURE > 0.0
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=sampling,
        temperature=TEMPERATURE if TEMPERATURE > 0 else None,
        pad_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
    )

    # Cut each output sequence down to the tokens produced after its prompt.
    completions = [
        full_ids[len(prompt_ids):]
        for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0].strip()
# ========== Gradio UI ==========
with gr.Blocks(title="Video Q&A with Qwen2.5-VL-7B") as demo:
    # Page header.
    gr.Markdown(
        "\n"
        "# FoundationMotion: Auto-Labeling and Reasoning about Spatial Movement in Videos\n"
        "Select a video, ask a question, and get an answer!\n"
    )

    with gr.Row():
        # Left column: pick one of the bundled clips and preview it.
        with gr.Column(scale=1):
            video_selector = gr.Dropdown(
                label="Select a Video",
                choices=list(VIDEO_EXAMPLES.keys()),
                value=None,
                interactive=True,
            )
            # Preview only; users cannot upload through this component.
            video_display = gr.Video(
                label="Video Preview", height=400, interactive=False
            )

        # Right column: example prompts, the question box and the answer.
        with gr.Column(scale=1):
            example_questions_display = gr.Textbox(
                label="Example Questions (click buttons below to use)",
                lines=3,
                interactive=False,
            )
            # One button per example question of the selected video.
            with gr.Row():
                q1_btn = gr.Button("Use Question 1", size="sm")
                q2_btn = gr.Button("Use Question 2", size="sm")
            question = gr.Textbox(
                label="Your Question",
                placeholder="Type your question or click an example button above",
                lines=2,
            )
            ask_btn = gr.Button("Ask", variant="primary")
            output = gr.Textbox(label="Answer", lines=10, show_copy_button=True)

    # Selecting a clip refreshes the preview and the example-question list.
    video_selector.change(
        fn=update_video_display,
        inputs=[video_selector],
        outputs=[video_display, example_questions_display],
    )
    # Each shortcut button copies one example question into the question box.
    q1_btn.click(
        fn=lambda v: fill_question(v, 0),
        inputs=[video_selector],
        outputs=[question],
    )
    q2_btn.click(
        fn=lambda v: fill_question(v, 1),
        inputs=[video_selector],
        outputs=[question],
    )
    # "Ask" runs inference on the selected clip with the current question.
    ask_btn.click(
        fn=answer,
        inputs=[video_selector, question],
        outputs=[output],
    )
# ========== Launch ==========
if __name__ == "__main__":
    # Start the Gradio app only when this file is executed as a script.
    demo.launch()