vu0018 committed on
Commit 194244e · verified · 1 Parent(s): 5e33f41

Create app.py

Files changed (1)
  1. app.py +94 -0
app.py ADDED
@@ -0,0 +1,94 @@
import gradio as gr
import cv2
import mediapipe as mp
import torch  # backend for the transformers pipeline
import numpy as np
import tempfile
from transformers import pipeline
from PIL import Image

# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose

# Hugging Face pretrained model for action recognition
action_model = pipeline(
    "image-classification",
    model="rvv-karma/Human-Action-Recognition-VIT-Base-patch16-224"
)

def detect_pose_and_activity(video_file):
    """
    Process the uploaded video to detect human poses and classify the activity.
    Only the first 10 seconds are processed. Returns the annotated video path
    and the predicted action.
    """
    try:
        # Copy the upload to a temporary file (Gradio passes a filepath)
        temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        with open(video_file, "rb") as src:
            temp_video.write(src.read())
        temp_video.close()

        cap = cv2.VideoCapture(temp_video.name)
        if not cap.isOpened():
            return None, "Error: Could not open video."

        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps == 0:
            fps = 30  # fallback for files that do not report FPS

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        max_frames = int(min(total_frames / fps, 10) * fps)  # cap at 10 seconds

        output_frames = []
        action_predictions = []

        # Process frames: draw pose landmarks and classify the action per frame
        with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
            for _ in range(max_frames):
                ret, frame = cap.read()
                if not ret:
                    break

                # Pose detection (MediaPipe expects RGB input)
                image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = pose.process(image_rgb)
                if results.pose_landmarks:
                    mp.solutions.drawing_utils.draw_landmarks(
                        frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS
                    )

                output_frames.append(frame)

                # Convert the frame to a PIL image for the Hugging Face model
                pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                pred = action_model(pil_image)
                action_predictions.append(pred[0]["label"])

        cap.release()

        if len(output_frames) == 0:
            return None, "Error: No frames to process."

        # Take the most frequent predicted action across all frames
        action_label = max(set(action_predictions), key=action_predictions.count)

        # Save the annotated video
        output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
        height, width, _ = output_frames[0].shape
        out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
        for f in output_frames:
            out.write(f)
        out.release()

        return output_file, f"Predicted Action: {action_label}"

    except Exception as e:
        return None, f"Runtime Error: {e}"

# Gradio Interface
iface = gr.Interface(
    fn=detect_pose_and_activity,
    inputs=gr.Video(label="Upload a Video (max 10s)"),
    outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
    title="Human Pose & Activity Recognition",
    description="Upload a short video (max 10s). The app detects human poses and predicts the activity (e.g., dancing, cycling, running)."
)

# Guard the launch so the module can be imported without starting the server
if __name__ == "__main__":
    iface.launch()
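A minimal smoke test for local runs might look like the sketch below, assuming a short clip named sample.mp4 sits next to app.py (the filename is hypothetical). Importing app does not start the server, because iface.launch() is guarded by the __main__ check above.

    # smoke_test.py: exercise the processing function without the Gradio UI
    from app import detect_pose_and_activity

    annotated_path, label = detect_pose_and_activity("sample.mp4")  # hypothetical clip
    print(annotated_path)  # path to the annotated .mp4
    print(label)           # e.g. "Predicted Action: dancing"

Note that the app classifies every frame through the ViT pipeline, so even a 10-second clip triggers a few hundred model calls; sampling every Nth frame would be a natural optimization if inference is too slow.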