Spaces:

anas31
/

sasasa

Runtime error

App Files Files Community

sasasa / app.py

anas31

Create app.py

5752234 verified 9 months ago

raw

history blame contribute delete

2.34 kB

	import gradio as gr
	import torch
	import numpy as np
	import cv2
	from transformers import AutoTokenizer, AutoModel

	# Load the V-JEPA 2 encoder once at import time so every request reuses it.
	# The checkpoint is a video-only model: there is no tokenizer for
	# AutoTokenizer to load, and nothing in this app consumed one, so only the
	# model itself is loaded here.
	# NOTE(review): the matching preprocessor in Transformers is presumably
	# AutoVideoProcessor -- confirm against the model card before adopting it.
	MODEL_ID = "facebook/vjepa2-vitl-fpc64-256"
	model = AutoModel.from_pretrained(MODEL_ID)

	def extract_frames(video_path, num_frames=8):
	    """Sample evenly spaced RGB frames from a video file.

	    Args:
	        video_path: Path of the video to open with OpenCV. A falsy value
	            (``None`` or ``""``) yields an empty list.
	        num_frames: Number of positions to sample between the first and the
	            last frame, both inclusive.

	    Returns:
	        A list of RGB frames (``numpy`` arrays) in sampling order. The list
	        is shorter than ``num_frames`` when some reads fail, and empty when
	        the video cannot be opened or reports no frames.
	    """
	    if not video_path:
	        return []

	    cap = cv2.VideoCapture(video_path)
	    frames = []
	    try:
	        if not cap.isOpened():
	            return frames

	        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
	        if total_frames <= 0:
	            # Without a positive frame count, linspace(0, -1, ...) would
	            # produce a negative seek position.
	            return frames

	        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
	        for idx in frame_indices:
	            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
	            ret, frame = cap.read()
	            if ret:
	                # OpenCV decodes to BGR; callers here work with RGB.
	                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
	    finally:
	        # Release the capture even if seeking or colour conversion raises.
	        cap.release()
	    return frames

	def process_video(video_file):
	    """Extract frames from a video and embed them with the module-level ``model``.

	    Args:
	        video_file: Path of the video to process; passed straight to
	            ``extract_frames``.

	    Returns:
	        A dict with two entries:
	        ``"embeddings"`` -- ``numpy`` array holding the model's
	        ``last_hidden_state`` averaged over dimension 1 and squeezed;
	        ``"frames"`` -- the list of extracted RGB frames, for display.

	    Raises:
	        ValueError: If no video was supplied or no frame could be decoded.
	    """
	    if not video_file:
	        raise ValueError("No video provided; upload a video before processing.")

	    frames = extract_frames(video_file)
	    if not frames:
	        # np.stack on an empty list fails with an unhelpful message, so
	        # report the real cause instead.
	        raise ValueError(f"Could not decode any frames from {video_file!r}.")

	    # Resize every frame to 256x256 and scale pixel values into [0, 1].
	    # NOTE(review): no mean/std normalisation is applied here -- confirm
	    # whether the checkpoint's preprocessing expects it.
	    resized = [cv2.resize(frame, (256, 256)) for frame in frames]
	    clip = np.stack(resized) / 255.0

	    # (num_frames, H, W, C) -> (1, num_frames, C, H, W), as float32.
	    video_tensor = torch.from_numpy(clip).permute(0, 3, 1, 2).unsqueeze(0).float()

	    # Inference only: skip gradient tracking.
	    with torch.no_grad():
	        outputs = model(video_tensor)

	    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

	    return {
	        "embeddings": embeddings,
	        "frames": frames,  # Display the extracted frames
	    }

	# Gradio Interface
	def _process_for_ui(video_file):
	    """Run ``process_video`` and order its results for the click handler.

	    ``process_video`` returns a dict keyed by plain strings, while the
	    button below lists its outputs as ``[frame_gallery, embeddings_output]``,
	    which are filled positionally. This adapter returns the values in that
	    order and converts the embeddings to nested lists for the JSON panel.
	    """
	    result = process_video(video_file)
	    return result["frames"], np.asarray(result["embeddings"]).tolist()


	with gr.Blocks() as demo:
	    gr.Markdown("# V-JEPA Video Embedding Extractor")
	    gr.Markdown("Upload a video to extract embeddings using `facebook/vjepa2-vitl-fpc64-256`.")

	    with gr.Row():
	        video_input = gr.Video(label="Upload Video")
	        submit_btn = gr.Button("Process")

	    with gr.Row():
	        frame_gallery = gr.Gallery(label="Extracted Frames")
	        embeddings_output = gr.JSON(label="Embeddings")

	    # Event listeners must be registered inside the Blocks context.
	    submit_btn.click(
	        fn=_process_for_ui,
	        inputs=video_input,
	        outputs=[frame_gallery, embeddings_output],
	    )

	# Start the server only when run as a script, so importing this module
	# (for example from a test) does not launch the app.
	if __name__ == "__main__":
	    demo.launch()