"""Gradio demo: video classification with VideoMAE fine-tuned on UCF-Crime.

Loads the model once at import time and serves it behind a simple
Gradio interface that maps an uploaded video to a predicted label.
"""

import gradio as gr
import numpy as np
import torch
from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification

# Checkpoint fine-tuned for UCF-Crime video classification.
model_name = "OPear/videomae-large-finetuned-UCF-Crime"

model = VideoMAEForVideoClassification.from_pretrained(model_name)
model.eval()  # inference only: disable dropout / batch-norm updates
processor = VideoMAEFeatureExtractor.from_pretrained(model_name)


def classify_video(video):
    """Classify a video clip and return the predicted class label.

    Parameters
    ----------
    video : np.ndarray
        Frames with shape (frames, H, W, C).
        NOTE(review): depending on the Gradio version, ``gr.Video`` may pass
        a *filepath* string instead of a frame array — confirm against the
        installed Gradio and decode frames first if so.

    Returns
    -------
    str
        Human-readable label of the highest-scoring class.

    Raises
    ------
    ValueError
        If the clip contains no frames.
    """
    frames = np.asarray(video)
    if frames.shape[0] == 0:
        raise ValueError("video contains no frames")

    # VideoMAE expects a fixed clip length (model.config.num_frames,
    # typically 16); sample uniformly so clips of any length work.
    num_frames = getattr(model.config, "num_frames", 16)
    if frames.shape[0] != num_frames:
        idx = np.linspace(0, frames.shape[0] - 1, num=num_frames).astype(int)
        frames = frames[idx]

    # The processor expects a list of frames for a single video.
    inputs = processor(list(frames), return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_idx = logits.argmax(-1).item()
    return model.config.id2label[predicted_class_idx]


iface = gr.Interface(
    fn=classify_video,
    inputs=gr.Video(),
    outputs="text",
    title="Video Classifier using VideoMAE",
)

# Guard launch so importing this module (e.g. for testing) does not
# start a web server as a side effect.
if __name__ == "__main__":
    iface.launch()