""" UCF-50 Action Recognition - Gradio App Deployed on HuggingFace Spaces """ import gradio as gr import torch import torch.nn as nn import torch.nn.functional as F import torchvision.transforms as transforms import torchvision.models as models import cv2 import numpy as np from PIL import Image import tempfile import os class GRUModel(nn.Module): """GRU Model - 97.23% Accuracy""" def __init__(self, input_dim=2048, hidden_dim=512, num_classes=50, dropout=0.3): super(GRUModel, self).__init__() self.hidden_dim = hidden_dim self.gru = nn.GRU( input_size=input_dim, hidden_size=hidden_dim, num_layers=1, batch_first=True, dropout=0 if dropout == 0 else dropout ) self.dropout = nn.Dropout(dropout) self.fc = nn.Linear(hidden_dim, num_classes) def forward(self, x): out, hidden = self.gru(x) out = out[:, -1, :] out = self.dropout(out) out = self.fc(out) return out CLASS_NAMES = [ 'BaseballPitch', 'Basketball', 'BenchPress', 'Biking', 'Billiards', 'BreastStroke', 'CleanAndJerk', 'Diving', 'Drumming', 'Fencing', 'GolfSwing', 'HighJump', 'HorseRace', 'HorseRiding', 'HulaHoop', 'JavelinThrow', 'JugglingBalls', 'JumpRope', 'JumpingJack', 'Kayaking', 'Lunges', 'MilitaryParade', 'Mixing', 'Nunchucks', 'PizzaTossing', 'PlayingGuitar', 'PlayingPiano', 'PlayingTabla', 'PlayingViolin', 'PoleVault', 'PommelHorse', 'PullUps', 'Punch', 'PushUps', 'RockClimbingIndoor', 'RopeClimbing', 'Rowing', 'SalsaSpin', 'SkateBoarding', 'Skiing', 'Skijet', 'SoccerJuggling', 'Swing', 'TaiChi', 'TennisSwing', 'ThrowDiscus', 'TrampolineJumping', 'VolleyballSpiking', 'WalkingWithDog', 'YoYo' ] print("Loading models...") # Load feature extractor (ResNet50) resnet = models.resnet50(pretrained=True) feature_extractor = nn.Sequential(*list(resnet.children())[:-1]) feature_extractor.eval() # Load action recognition model (GRU) model = GRUModel( input_dim=2048, hidden_dim=512, num_classes=50, dropout=0.3 ) # Load trained weights if os.path.exists('best_model.pth'): try: checkpoint = torch.load('best_model.pth', map_location='cpu') model.load_state_dict(checkpoint['model_state_dict']) print("✓ Trained model loaded successfully!") except Exception as e: print(f" Could not load trained weights: {str(e)}") else: print(" No trained model found. Using random initialization.") model.eval() print("Models loaded!") def extract_frames(video_path, num_frames=32): """Extract uniformly sampled frames from video""" cap = cv2.VideoCapture(video_path) total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) if total_frames == 0: cap.release() return None if total_frames < num_frames: frame_indices = list(range(total_frames)) + [total_frames - 1] * (num_frames - total_frames) else: frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) frames = [] for idx in frame_indices: cap.set(cv2.CAP_PROP_POS_FRAMES, idx) ret, frame = cap.read() if ret: frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) frames.append(Image.fromarray(frame_rgb)) cap.release() while len(frames) < num_frames: frames.append(frames[-1] if frames else Image.new('RGB', (224, 224))) return frames[:num_frames] def preprocess_frames(frames): """Preprocess frames for model input""" transform = transforms.Compose([ transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) return torch.stack([transform(frame) for frame in frames]) def convert_video_for_web(video_path): """Convert video to web-compatible format""" if video_path is None: return None try: # Create temp file for converted video temp_output = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name # Open original video cap = cv2.VideoCapture(video_path) # Get video properties fps = int(cap.get(cv2.CAP_PROP_FPS)) width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) # Define codec and create VideoWriter fourcc = cv2.VideoWriter_fourcc(*'mp4v') out = cv2.VideoWriter(temp_output, fourcc, fps, (width, height)) # Read and write all frames while True: ret, frame = cap.read() if not ret: break out.write(frame) cap.release() out.release() return temp_output except Exception as e: print(f"Video conversion failed: {e}") return video_path # Return original if conversion fails def predict_action(video_path): """Main prediction function""" if video_path is None: return ( None, "Please upload a video first.", None, None, None, None, gr.update(visible=False), None # Add this for converted video ) try: # Convert video for web playback web_video = convert_video_for_web(video_path) # Extract frames (still use original path for analysis) frames = extract_frames(video_path, num_frames=32) if frames is None or len(frames) == 0: return ( None, "Error: Could not extract frames from video. Please try another video.", None, None, None, None, gr.update(visible=False), None ) # Preprocess frames_tensor = preprocess_frames(frames) # Extract features with torch.no_grad(): features = feature_extractor(frames_tensor) features = features.view(features.size(0), -1) features = features.unsqueeze(0) # Predict outputs = model(features) probs = F.softmax(outputs, dim=1) top5_probs, top5_indices = torch.topk(probs, 5) # Format results top5_probs = top5_probs[0].numpy() top5_indices = top5_indices[0].numpy() # Create prediction dictionary for Gradio predictions = { CLASS_NAMES[idx]: float(prob) for idx, prob in zip(top5_indices, top5_probs) } # Create result text result_text = f"**Predicted Action:** {CLASS_NAMES[top5_indices[0]]}\n\n" result_text += f"**Confidence:** {top5_probs[0] * 100:.2f}%\n\n" result_text += "**Top 5 Predictions:**\n\n" for i, (idx, prob) in enumerate(zip(top5_indices, top5_probs), 1): result_text += f"{i}. {CLASS_NAMES[idx]}: {prob * 100:.2f}%\n" # Get sample frames for display sample_frames = [frames[i] for i in [0, 10, 20, 31]] return ( predictions, result_text, sample_frames[0], sample_frames[1], sample_frames[2], sample_frames[3], gr.update(visible=True), web_video # Return converted video ) except Exception as e: return ( None, f"Error processing video: {str(e)}", None, None, None, None, gr.update(visible=False), None ) # Custom CSS css = """ .gradio-container { max-width: 1400px !important; margin: auto; } #upload-zone { border: 2px dashed #d1d5db; border-radius: 12px; padding: 2rem; background: #f9fafb; transition: all 0.3s ease; } #upload-zone:hover { border-color: #2563eb; background: #eff6ff; } .primary-button { background: #2563eb !important; border: none !important; font-weight: 600 !important; font-size: 1.1em !important; padding: 0.8rem 2rem !important; } #title-text { font-size: 2.5em; font-weight: 700; color: #111827; margin-bottom: 0.3rem; } #subtitle-text { color: #6b7280; font-size: 1.1em; margin-bottom: 2rem; } .results-container { background: #f9fafb; border-radius: 12px; padding: 1.5rem; border: 1px solid #e5e7eb; } .frame-container img { border-radius: 8px; border: 1px solid #e5e7eb; } """ # Create Gradio interface with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo: # Header gr.Markdown("
Video Action Recognition
") gr.Markdown("
GRU-based sequence model · 97.23% accuracy on UCF-50
") # Model details (collapsed) with gr.Accordion("Model Details", open=False): gr.Markdown(""" **Architecture:** ResNet50 feature extractor + GRU sequence model **Performance:** 97.23% Top-1 accuracy · 99.85% Top-5 accuracy **Dataset:** UCF-50 (50 human action categories) **Parameters:** 3.96M trainable parameters **Supported Actions:** BaseballPitch, Basketball, BenchPress, Biking, Billiards, BreastStroke, CleanAndJerk, Diving, Drumming, Fencing, GolfSwing, HighJump, HorseRace, HorseRiding, HulaHoop, JavelinThrow, JugglingBalls, JumpRope, JumpingJack, Kayaking, Lunges, MilitaryParade, Mixing, Nunchucks, PizzaTossing, PlayingGuitar, PlayingPiano, PlayingTabla, PlayingViolin, PoleVault, PommelHorse, PullUps, Punch, PushUps, RockClimbingIndoor, RopeClimbing, Rowing, SalsaSpin, SkateBoarding, Skiing, Skijet, SoccerJuggling, Swing, TaiChi, TennisSwing, ThrowDiscus, TrampolineJumping, VolleyballSpiking, WalkingWithDog, YoYo """) gr.Markdown("---") # Main interface with gr.Row(): # Left column - Upload with gr.Column(scale=1): gr.Markdown("### Upload Video") with gr.Group(elem_id="upload-zone"): video_input = gr.File( label="Drop video file here or click to upload", file_types=["video"], type="filepath" ) # Add a second video component for playback video_preview = gr.Video( label="Video Preview", visible=False, interactive=False, show_label=False ) predict_button = gr.Button( "Analyze Video", variant="primary", size="lg", elem_classes="primary-button" ) gr.Markdown(""" **Requirements:** - Clear view of human performing action - 3-10 seconds recommended - Formats: MP4, AVI, MOV """) # Right column - Results with gr.Column(scale=1): gr.Markdown("### Results") with gr.Group(elem_classes="results-container"): result_text = gr.Markdown("*Upload a video and click 'Analyze Video' to see predictions*") prediction_chart = gr.Label( label="Confidence Distribution", num_top_classes=5, show_label=True ) # Frames section (hidden initially) with gr.Column(visible=False) as frames_container: gr.Markdown("### Extracted Frames") gr.Markdown("*Sample frames used for analysis*") with gr.Row(): frame1 = gr.Image(label="", show_label=False, elem_classes="frame-container") frame2 = gr.Image(label="", show_label=False, elem_classes="frame-container") frame3 = gr.Image(label="", show_label=False, elem_classes="frame-container") frame4 = gr.Image(label="", show_label=False, elem_classes="frame-container") # Connect prediction function predict_button.click( fn=predict_action, inputs=video_input, outputs=[ prediction_chart, result_text, frame1, frame2, frame3, frame4, frames_container, video_preview ] ) # Footer gr.Markdown("---") gr.Markdown("""
View Source Code · PyTorch · ResNet50 · GRU
""") # Launch if __name__ == "__main__": demo.launch()