import gradio as gr
import torch
import spaces
import cv2
import numpy as np
from PIL import Image
from typing import List
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
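# qwen_vl_utils ships alongside the Qwen2-VL / Qwen2.5-VL releases
# (pip install qwen-vl-utils); process_vision_info converts chat messages
# into the image and video inputs the processor consumes.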
# ========== Configuration ==========
MODEL_ID = "WoWolf/Qwen2_5vl-7b-fm-tuned"
MAX_FRAMES = 48        # upper bound on frames sampled per video
MAX_NEW_TOKENS = 128   # cap on the length of a generated answer
TEMPERATURE = 1.0      # > 0 enables sampling (see gen_kwargs in answer())
# ========== Video Examples Configuration ==========
VIDEO_EXAMPLES = {
"1_raw.mp4": {
"path": "1_raw.mp4",
"questions": ["What's happening in this video?", "Which hand holds the pen?"]
},
"4_raw.mp4": {
"path": "4_raw.mp4",
"questions": ["What's happening in this video?", "What is the main action in the video?"]
},
"6_raw.mp4": {
"path": "6_raw.mp4",
"questions": ["What's happening in this video?", "What's the right hand doing?"]
},
}
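# To expose another clip in the demo, add an entry mapping a display name to a
# local file path plus suggested questions, e.g. (hypothetical file name):
# VIDEO_EXAMPLES["my_clip.mp4"] = {
#     "path": "my_clip.mp4",
#     "questions": ["What's happening in this video?"],
# }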
# ========== Load Model & Processor ==========
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_ID,
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(
MODEL_ID,
trust_remote_code=True,
)
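# If GPU memory is tight, the Qwen-VL processor also accepts min_pixels /
# max_pixels to bound the number of visual tokens, e.g. (illustrative values
# from the Qwen2.5-VL model card):
# processor = AutoProcessor.from_pretrained(
#     MODEL_ID,
#     trust_remote_code=True,
#     min_pixels=256 * 28 * 28,
#     max_pixels=1280 * 28 * 28,
# )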
# ========== Video Frame Extraction ==========
def extract_video_frames(video_path: str, max_frames: int = 8) -> List[Image.Image]:
"""Extract key frames from video using OpenCV"""
cap = cv2.VideoCapture(video_path)
frames = []
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total_frames <= 0:  # unreadable file or container without a frame count
cap.release()
return frames
# Select frames evenly
frame_indices = np.linspace(0, total_frames - 1, min(max_frames, total_frames), dtype=int)
for frame_idx in frame_indices:
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
ret, frame = cap.read()
if ret:
# Convert BGR to RGB
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
frames.append(Image.fromarray(frame_rgb))
cap.release()
return frames
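# Seeking with CAP_PROP_POS_FRAMES is exact but issues one seek per sampled
# frame, which can be slow on long or heavily compressed videos; reading the
# stream sequentially and keeping every k-th frame is a common faster
# alternative when exact frame indices matter less.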
# ========== Message Builder ==========
SYSTEM_PROMPT = (
"You are a helpful assistant that watches a user-provided video and answers "
"questions about it concisely and accurately."
)
def build_messages(frames: List[Image.Image], question: str, fps: float = 1.0):
"""Build messages in Qwen-VL format"""
messages = [
{
"role": "system",
"content": [{"type": "text", "text": SYSTEM_PROMPT}],
},
{
"role": "user",
"content": [
{
"type": "video",
"video": frames,
"fps": fps,
},
{"type": "text", "text": question},
],
},
]
return messages
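# Sketch of the resulting structure: one system turn, then a user turn whose
# content list pairs {"type": "video", "video": [<PIL frames>], "fps": fps}
# with {"type": "text", "text": question}, the message schema that
# qwen_vl_utils.process_vision_info expects.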
# ========== Helper Functions ==========
def update_video_display(video_name):
"""Update video display and example questions when video is selected"""
if video_name is None:
return None, ""
video_info = VIDEO_EXAMPLES[video_name]
video_path = video_info["path"]
example_questions = "\n".join([f"• {q}" for q in video_info["questions"]])
return video_path, example_questions
def fill_question(video_name, question_idx):
"""Fill the question textbox with selected example question"""
if video_name is None:
return ""
questions = VIDEO_EXAMPLES[video_name]["questions"]
if 0 <= question_idx < len(questions):
return questions[question_idx]
return ""
# ========== Inference ==========
@spaces.GPU
@torch.inference_mode()
def answer(video_name, question):
if video_name is None:
return "Please select a video first."
    if not question or not question.strip():
question = "Describe this video in detail."
video_path = VIDEO_EXAMPLES[video_name]["path"]
# Extract frames from video
frames = extract_video_frames(video_path, max_frames=MAX_FRAMES)
if not frames:
return "Error: Unable to extract frames from video."
# Build messages
messages = build_messages(frames, question, fps=1.0)
# Apply chat template
text = processor.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
# Process vision info
image_inputs, video_inputs = process_vision_info(messages)
# Prepare inputs
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to(model.device)
# Generation settings
gen_kwargs = dict(
max_new_tokens=MAX_NEW_TOKENS,
do_sample=(TEMPERATURE > 0.0),
temperature=TEMPERATURE if TEMPERATURE > 0 else None,
pad_token_id=processor.tokenizer.eos_token_id,
use_cache=True,
)
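    # Setting TEMPERATURE to 0 in the configuration above turns do_sample off,
    # which gives greedy (reproducible) decoding instead of sampling.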
# Generate
generated_ids = model.generate(**inputs, **gen_kwargs)
# Decode output
generated_ids_trimmed = [
out_ids[len(in_ids):]
for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return output_text.strip()
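# Note: frames are re-extracted on every question. If the same clips are
# queried repeatedly, caching the frame lists (e.g. functools.lru_cache keyed
# by video path) would avoid redundant decoding; a possible optimization, not
# required for this demo.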
# ========== Gradio UI ==========
with gr.Blocks(title="Video Q&A with Qwen2.5-VL-7B") as demo:
gr.Markdown(
"""
# FoundationMotion: Auto-Labeling and Reasoning about Spatial Movement in Videos
Select a video, ask a question, and get an answer!
"""
)
with gr.Row():
with gr.Column(scale=1):
# Video selector dropdown
video_selector = gr.Dropdown(
choices=list(VIDEO_EXAMPLES.keys()),
label="Select a Video",
value=None,
interactive=True,
)
# Video display (read-only)
video_display = gr.Video(
label="Video Preview",
height=400,
interactive=False,
)
with gr.Column(scale=1):
# Example questions display
example_questions_display = gr.Textbox(
label="Example Questions (click buttons below to use)",
lines=3,
interactive=False,
)
# Buttons for quick question selection
with gr.Row():
q1_btn = gr.Button("Use Question 1", size="sm")
q2_btn = gr.Button("Use Question 2", size="sm")
question = gr.Textbox(
label="Your Question",
placeholder="Type your question or click an example button above",
lines=2,
)
ask_btn = gr.Button("Ask", variant="primary")
output = gr.Textbox(label="Answer", lines=10, show_copy_button=True)
# Event handlers
video_selector.change(
fn=update_video_display,
inputs=[video_selector],
outputs=[video_display, example_questions_display],
)
q1_btn.click(
fn=lambda v: fill_question(v, 0),
inputs=[video_selector],
outputs=[question],
)
q2_btn.click(
fn=lambda v: fill_question(v, 1),
inputs=[video_selector],
outputs=[question],
)
ask_btn.click(
fn=answer,
inputs=[video_selector, question],
outputs=[output],
)
# ========== Launch ==========
if __name__ == "__main__":
demo.launch()