File size: 3,274 Bytes
015f0f2
f7c47ba
c5fba2e
af17b79
 
8a40a91
f5ffff9
 
f7c47ba
956cfce
f7c47ba
a55051b
c5fba2e
 
 
 
 
af17b79
 
f5ffff9
c5fba2e
7f28013
f5ffff9
015f0f2
c5fba2e
015f0f2
 
 
956cfce
015f0f2
f5ffff9
c5fba2e
f5ffff9
015f0f2
f5ffff9
7f28013
f5ffff9
015f0f2
c5fba2e
a55051b
015f0f2
7f28013
956cfce
7f28013
 
 
015f0f2
 
 
956cfce
7f28013
 
 
015f0f2
af17b79
956cfce
015f0f2
956cfce
7f28013
 
 
 
015f0f2
c5fba2e
af17b79
c5fba2e
 
af17b79
c5fba2e
7f28013
015f0f2
c5fba2e
015f0f2
 
 
 
 
 
 
7f28013
015f0f2
 
f5ffff9
015f0f2
f5ffff9
015f0f2
af17b79
015f0f2
af17b79
7f28013
 
015f0f2
 
7f28013
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import gradio as gr
import cv2
import mediapipe as mp
import torch
import numpy as np
import tempfile
from transformers import pipeline
from PIL import Image

# MediaPipe pose solution namespace; Pose and POSE_CONNECTIONS are read from it below.
mp_pose = mp.solutions.pose

# Pretrained Hugging Face image classifier used to label the action in each frame.
action_model = pipeline(
    task="image-classification",
    model="rvv-karma/Human-Action-Recognition-VIT-Base-patch16-224",
)

def detect_pose_and_activity(video_file):
    """
    Annotate a video with pose landmarks and classify the dominant activity.

    At most the first 10 seconds of the video are processed. Each processed
    frame is run through MediaPipe Pose and through the image-classification
    pipeline; the label predicted most often across frames is reported.

    Args:
        video_file: Path to the video on disk. ``None`` or an empty string
            (nothing uploaded) yields an error message instead of raising.

    Returns:
        A ``(output_path, message)`` tuple. On success ``output_path`` is the
        path of a temporary ``.mp4`` containing the annotated frames and
        ``message`` is ``"Predicted Action: <label>"``. On failure
        ``output_path`` is ``None`` and ``message`` describes the error.
        Exceptions are never propagated to the caller.
    """
    # Function-scope stdlib imports keep this block self-contained.
    import math
    import os
    from collections import Counter

    if not video_file:
        return None, "Error: No video provided."

    max_seconds = 10
    fallback_fps = 30

    cap = None
    writer = None
    output_file = None
    succeeded = False
    frames_written = 0
    label_counts = Counter()

    try:
        # Read the video in place: copying it to another temp file first only
        # leaked a file handle and an extra file on disk.
        cap = cv2.VideoCapture(video_file)
        if not cap.isOpened():
            return None, "Error: Could not open video."

        fps = cap.get(cv2.CAP_PROP_FPS)
        # Some containers report 0, a negative value or NaN for the frame rate.
        if not (math.isfinite(fps) and fps > 0):
            fps = fallback_fps

        frame_cap = int(max_seconds * fps)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # A non-positive frame count means "unknown", not "empty": fall back
        # to the time limit and let cap.read() signal the real end of stream.
        max_frames = min(total_frames, frame_cap) if total_frames > 0 else frame_cap

        with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
            for _ in range(max_frames):
                ret, frame = cap.read()
                if not ret:
                    break

                image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = pose.process(image_rgb)

                # Classify the clean RGB frame; the skeleton overlay is drawn
                # on `frame` only, so it cannot disturb the classifier input.
                pred = action_model(Image.fromarray(image_rgb))
                label_counts[pred[0]['label']] += 1

                if results.pose_landmarks:
                    mp.solutions.drawing_utils.draw_landmarks(
                        frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS
                    )

                # Create the writer lazily so its size matches the first frame,
                # then stream frames to disk instead of buffering them all.
                if writer is None:
                    height, width = frame.shape[:2]
                    fd, output_file = tempfile.mkstemp(suffix=".mp4")
                    os.close(fd)
                    writer = cv2.VideoWriter(
                        output_file,
                        cv2.VideoWriter_fourcc(*'mp4v'),
                        fps,
                        (width, height),
                    )
                    if not writer.isOpened():
                        return None, "Error: Could not create output video."

                writer.write(frame)
                frames_written += 1

        if frames_written == 0:
            return None, "Error: No frames to process."

        # Most frequent label; ties resolve to the label seen first.
        action_label = label_counts.most_common(1)[0][0]
        succeeded = True
        return output_file, f"Predicted Action: {action_label}"

    except Exception as e:
        # Top-level boundary for the UI: report the failure as text.
        return None, f"Runtime Error: {str(e)}"

    finally:
        # Release native handles on every path; releasing the writer also
        # finalises the output file before the caller reads it.
        if cap is not None:
            cap.release()
        if writer is not None:
            writer.release()
        if not succeeded and output_file is not None:
            # Best-effort removal of a partial output file.
            try:
                os.remove(output_file)
            except OSError:
                pass

# Gradio UI: one uploaded video in; annotated video and action text out.
iface = gr.Interface(
    title="Human Pose & Activity Recognition",
    description="Upload a short video (max 10s). The app detects human poses and predicts the activity (e.g., dancing, cycling, running).",
    fn=detect_pose_and_activity,
    inputs=gr.Video(label="Upload a Video (max 10s)"),
    outputs=[
        gr.Video(label="Pose Detection Output"),
        gr.Textbox(label="Detected Action"),
    ],
)

# Launch the interface.
iface.launch()