Spaces:

cweigendev
/

videoanalyzer

Paused

App Files Files Community

videoanalyzer / app.py

cweigendev

Update app.py

89d0079 verified 4 months ago

raw

history blame contribute delete

6.83 kB

	import gradio as gr
	import torch
	from transformers import AutoModelForCausalLM, AutoProcessor

	# Hub repository id of the checkpoint to serve. The base model is used
	# because it ships all of the required config files.
	model_name = "DAMO-NLP-SG/VideoLLaMA3-7B"

	# Module-level handles shared by load_model() and the request handler;
	# both are None until load_model() assigns them.
	model = processor = None

	def load_model():
	    """Load the model and its processor into the module-level globals.

	    The model is first requested with the ``flash_attention_2`` attention
	    implementation; if that attempt raises, it is loaded again without an
	    explicit attention implementation. Failures are printed (with a
	    traceback) instead of being propagated.

	    Returns:
	        bool: True when both the model and the processor were loaded,
	        False if any step raised.
	    """
	    global model, processor

	    try:
	        print("Loading VideoLLaMA3 model...")
	        print("This may take several minutes on first load...")

	        # Keyword arguments shared by both loading attempts.
	        shared_kwargs = {
	            "trust_remote_code": True,
	            "device_map": "auto",
	            "torch_dtype": torch.bfloat16,
	        }

	        try:
	            # Preferred path: flash attention for better performance.
	            model = AutoModelForCausalLM.from_pretrained(
	                model_name,
	                attn_implementation="flash_attention_2",
	                **shared_kwargs,
	            )
	            print("Loaded with flash attention")
	        except Exception as flash_error:
	            # Any failure of the first attempt triggers the plain retry.
	            print(f"Flash attention failed: {flash_error}")
	            print("Falling back to standard attention...")
	            model = AutoModelForCausalLM.from_pretrained(
	                model_name,
	                **shared_kwargs,
	            )
	            print("Loaded with standard attention")

	        # A processor (not a bare tokenizer) is loaded for this model.
	        processor = AutoProcessor.from_pretrained(
	            model_name,
	            trust_remote_code=True,
	        )
	        print("Model and processor loaded successfully!")
	    except Exception as load_error:
	        print(f"Error loading model: {load_error}")
	        import traceback

	        traceback.print_exc()
	        return False

	    return True

	def _extract_assistant_reply(decoded):
	    """Return the text that follows the last ``assistant`` role-header line.

	    Splitting on the bare word "assistant" would truncate any answer that
	    merely mentions the word (e.g. "a shop assistant ..."), so only a line
	    consisting solely of ``assistant`` counts as a turn boundary. Text
	    that contains no such line is returned whole (stripped).
	    """
	    import re

	    # NOTE(review): assumes the chat template renders the role header on
	    # its own line once special tokens are skipped - confirm against the
	    # processor's template.
	    turns = re.split(r"(?m)^assistant[ \t]*$", decoded)
	    return turns[-1].strip()


	def process_video_question(video_file, question):
	    """Answer a free-form question about an uploaded video.

	    Args:
	        video_file: Path of the uploaded video as passed in by the UI, or
	            None when nothing has been uploaded.
	        question: The user's question; None, empty, or whitespace-only
	            values are rejected with a prompt to enter a question.

	    Returns:
	        str: The generated answer, or a human-readable message when the
	        model is not loaded, an input is missing, or processing raised.
	        Exceptions are never propagated to the caller.
	    """
	    # `model` / `processor` are module-level globals set by load_model();
	    # they are only read here, so no `global` statement is needed.
	    if model is None or processor is None:
	        return "Model is not loaded. Please wait for the model to initialize or check the logs for errors."

	    if video_file is None:
	        return "Please upload a video file first."

	    # Guard against None as well as blank text so .strip() cannot raise.
	    if not question or not question.strip():
	        return "Please enter a question about the video."

	    try:
	        print(f"Processing video: {video_file}")
	        print(f"Question: {question}")

	        # Conversation layout passed to the processor: a system turn plus
	        # one user turn holding the video reference and the question.
	        conversation = [
	            {"role": "system", "content": "You are a helpful assistant."},
	            {
	                "role": "user",
	                "content": [
	                    {
	                        "type": "video",
	                        "video": {
	                            "video_path": video_file,
	                            "fps": 1,
	                            "max_frames": 128,
	                        },
	                    },
	                    {"type": "text", "text": question},
	                ],
	            },
	        ]

	        inputs = processor(conversation=conversation, return_tensors="pt")

	        # Send tensors to wherever the model actually lives instead of
	        # calling .cuda() unconditionally, which fails on CPU-only hosts.
	        target_device = getattr(model, "device", None)
	        if target_device is None:
	            target_device = torch.device(
	                "cuda" if torch.cuda.is_available() else "cpu"
	            )
	        inputs = {
	            key: value.to(target_device) if isinstance(value, torch.Tensor) else value
	            for key, value in inputs.items()
	        }

	        # The model is loaded in bfloat16, so the pixel data must match.
	        if "pixel_values" in inputs:
	            inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

	        print("Generating response...")
	        with torch.no_grad():
	            output_ids = model.generate(
	                **inputs,
	                max_new_tokens=512,
	                do_sample=True,
	                temperature=0.7,
	                top_p=0.9,
	                use_cache=True,
	            )

	        decoded = processor.decode(output_ids[0], skip_special_tokens=True)

	        # Drop any echoed conversation history without cutting answers
	        # that merely contain the word "assistant".
	        response = _extract_assistant_reply(decoded)

	        print(f"Generated response: {response}")
	        return response

	    except Exception as e:
	        # UI boundary: report the failure as text rather than crashing.
	        error_msg = f"Error processing video: {str(e)}"
	        print(error_msg)
	        import traceback

	        traceback.print_exc()
	        return error_msg

	# Load the model eagerly at import time; the UI below reports the outcome.
	print(f"Initializing {model_name}...")
	model_loaded = load_model()

	if not model_loaded:
	    print("❌ Failed to load model. Check the logs above for details.")

	# Build the Gradio UI: inputs on the left, model answer on the right.
	with gr.Blocks(title="VideoLLaMA3 Demo", theme=gr.themes.Soft()) as demo:
	    gr.Markdown("# 🎥 VideoRefer-VideoLLaMA3 Interactive Demo")
	    gr.Markdown(f"Model: `{model_name}`")

	    # Status banner reflecting the result of load_model() above.
	    gr.Markdown(
	        "✅ Model Status: Loaded and ready!"
	        if model_loaded
	        else "❌ Model Status: Failed to load. Check logs for details."
	    )

	    gr.Markdown("Upload a video and ask questions about its content!")

	    with gr.Row():
	        # Left column: video upload, question box and the submit button.
	        with gr.Column(scale=1):
	            video_input = gr.Video(label="📹 Upload Video", height=300)
	            question_input = gr.Textbox(
	                label="❓ Ask a question about the video",
	                placeholder="What is happening in this video?",
	                lines=3,
	            )
	            submit_btn = gr.Button(
	                "🚀 Analyze Video",
	                variant="primary",
	                size="lg",
	            )

	        # Right column: the generated answer.
	        with gr.Column(scale=1):
	            output_text = gr.Textbox(
	                label="🤖 AI Response",
	                lines=12,
	                placeholder="The AI response will appear here...",
	                show_copy_button=True,
	            )

	    # Sample prompts shown below the two columns.
	    with gr.Row():
	        gr.Markdown("""
	### 💡 Example Questions:
	- "What objects can you see in this video?"
	- "Describe the main action happening in detail"
	- "What is the setting or location of this video?"
	- "How many people are in the video and what are they doing?"
	- "What emotions or mood does this video convey?"
	- "Describe the sequence of events in chronological order"
	""")

	    # Clicking the button and pressing Enter in the question box both run
	    # the same handler with the same inputs and output.
	    for _register in (submit_btn.click, question_input.submit):
	        _register(
	            fn=process_video_question,
	            inputs=[video_input, question_input],
	            outputs=output_text,
	        )

	# Start the server only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()