import warnings

import cv2
import gradio as gr
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
|
|
| |
# Checkpoint shared by the tokenizer and model loaders below.
MODEL_ID = "facebook/vjepa2-vitl-fpc64-256"

# NOTE(review): this checkpoint appears to be a video-only encoder, so it may
# not ship a tokenizer at all (the preprocessor documented for V-JEPA 2 is a
# video processor) -- TODO confirm. `tokenizer` is not used by the code in this
# file, so a failed tokenizer load must not prevent the app from starting; the
# name is kept (as None) so anything that still references it fails explicitly.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
except (OSError, ValueError) as exc:
    warnings.warn(f"No tokenizer could be loaded for {MODEL_ID}: {exc}")
    tokenizer = None

model = AutoModel.from_pretrained(MODEL_ID)
|
|
def extract_frames(video_path, num_frames=8):
    """Sample evenly spaced RGB frames from a video file.

    Args:
        video_path: Path (or other source accepted by ``cv2.VideoCapture``)
            of the video to read. ``None`` or an empty string yields no frames.
        num_frames: Number of sample positions spread between the first and
            last frame of the video.

    Returns:
        A list of frames converted from BGR to RGB. The list can be shorter
        than ``num_frames`` (possibly empty) when the video cannot be opened
        or individual frames fail to decode.
    """
    # A UI can hand over None when nothing was uploaded; there is nothing to
    # open in that case, so report "no frames" instead of failing inside OpenCV.
    if video_path is None or video_path == "":
        return []

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            # Unopened or empty stream: avoid building negative seek indices.
            return frames

        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
        for idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if ret:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture handle even if seeking or decoding raises.
        cap.release()
    return frames
|
|
def process_video(video_file):
    """Compute a pooled model embedding for a video file.

    Frames are sampled with ``extract_frames``, resized to 256x256, scaled by
    1/255 and passed to the module-level ``model`` as a single clip.

    Args:
        video_file: Path of the video to process.

    Returns:
        A dict with two keys: ``"embeddings"``, a NumPy array holding the mean
        of the model's ``last_hidden_state`` over dim 1, and ``"frames"``, the
        list of sampled frames before resizing.

    Raises:
        ValueError: If no frame could be read from ``video_file``.
    """
    frames = extract_frames(video_file)
    if not frames:
        # np.stack([]) would otherwise fail with an unrelated-looking message.
        raise ValueError(f"No frames could be read from video: {video_file!r}")

    # NOTE(review): only resizing and 1/255 scaling are applied here; if the
    # checkpoint expects mean/std normalisation the embeddings will be off --
    # confirm against the model's documented preprocessing.
    resized = [cv2.resize(frame, (256, 256)) for frame in frames]
    clip = np.stack(resized) / 255.0

    # Move the channel axis in front of the spatial axes and add a batch axis.
    video_tensor = torch.from_numpy(clip).permute(0, 3, 1, 2).unsqueeze(0).float()

    with torch.no_grad():
        outputs = model(video_tensor)

    # Average over dim 1 of the hidden states, then drop the singleton axes.
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    return {
        "embeddings": embeddings,
        "frames": frames,
    }
|
|
| |
def _process_for_ui(video_file):
    """Run ``process_video`` and return its results in UI output order.

    The click handler below declares two outputs (gallery, JSON), so the
    handler must return two values in that order rather than a dict keyed by
    strings. The embedding is converted to a plain list so it is
    JSON-serialisable.
    """
    if not video_file:
        # Pressing "Process" without an upload passes None.
        raise gr.Error("Please upload a video before pressing Process.")

    result = process_video(video_file)
    return result["frames"], np.asarray(result["embeddings"]).tolist()


with gr.Blocks() as demo:
    gr.Markdown("# V-JEPA Video Embedding Extractor")
    gr.Markdown("Upload a video to extract embeddings using `facebook/vjepa2-vitl-fpc64-256`.")

    with gr.Row():
        video_input = gr.Video(label="Upload Video")
        submit_btn = gr.Button("Process")

    with gr.Row():
        frame_gallery = gr.Gallery(label="Extracted Frames")
        embeddings_output = gr.JSON(label="Embeddings")

    submit_btn.click(
        fn=_process_for_ui,
        inputs=video_input,
        outputs=[frame_gallery, embeddings_output],
    )


demo.launch()