vu0018 committed on
Commit
c74bf35
·
verified ·
1 Parent(s): c5e8001

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -19
app.py CHANGED
@@ -7,6 +7,11 @@ import tempfile
7
  from transformers import pipeline
8
  from PIL import Image
9
 
 
 
 
 
 
10
  # Initialize MediaPipe Pose
11
  mp_pose = mp.solutions.pose
12
 
@@ -17,10 +22,6 @@ action_model = pipeline(
17
  )
18
 
19
  def detect_pose_and_activity(video_file):
20
- """
21
- Process the uploaded video to detect human poses and classify activity.
22
- Video is limited to 10 seconds. Returns annotated video and predicted action.
23
- """
24
  try:
25
  # Save uploaded video temporarily
26
  temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
@@ -36,30 +37,48 @@ def detect_pose_and_activity(video_file):
36
  fps = 30 # fallback
37
 
38
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
39
- max_frames = int(min(total_frames/fps, 10) * fps) # limit 10s
40
 
41
  output_frames = []
42
  action_predictions = []
43
 
44
- # Process frames
45
  with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
46
  for _ in range(max_frames):
47
  ret, frame = cap.read()
48
  if not ret:
49
  break
50
 
51
- # Pose detection
52
- image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
53
- results = pose.process(image_rgb)
54
- if results.pose_landmarks:
55
- mp.solutions.drawing_utils.draw_landmarks(frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  output_frames.append(frame)
58
 
59
- # Convert frame to PIL image for Hugging Face model
60
- pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
61
- pred = action_model(pil_image)
62
- action_predictions.append(pred[0]['label'])
63
 
64
  cap.release()
65
 
@@ -67,7 +86,7 @@ def detect_pose_and_activity(video_file):
67
  return None, "Error: No frames to process."
68
 
69
  # Take the most frequent predicted action
70
- action_label = max(set(action_predictions), key=action_predictions.count)
71
 
72
  # Save annotated video
73
  output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
@@ -87,8 +106,8 @@ iface = gr.Interface(
87
  fn=detect_pose_and_activity,
88
  inputs=gr.Video(label="Upload a Video (max 10s)"),
89
  outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
90
- title="Human Pose & Activity Recognition",
91
- description="Upload a short video (max 10s). The app detects human poses and predicts the activity (e.g., dancing, cycling, running)."
92
  )
93
 
94
- iface.launch()
 
7
  from transformers import pipeline
8
  from PIL import Image
9
 
10
+ # Load YOLOv5 model from torch hub
11
+ yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
12
+ yolo_model.conf = 0.4 # confidence threshold
13
+ yolo_model.classes = [0] # only detect persons (class 0)
14
+
15
  # Initialize MediaPipe Pose
16
  mp_pose = mp.solutions.pose
17
 
 
22
  )
23
 
24
  def detect_pose_and_activity(video_file):
 
 
 
 
25
  try:
26
  # Save uploaded video temporarily
27
  temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
 
37
  fps = 30 # fallback
38
 
39
  total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
40
+ max_frames = int(min(total_frames / fps, 10) * fps) # limit 10s
41
 
42
  output_frames = []
43
  action_predictions = []
44
 
 
45
  with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
46
  for _ in range(max_frames):
47
  ret, frame = cap.read()
48
  if not ret:
49
  break
50
 
51
+ # Detect people using YOLOv5
52
+ results = yolo_model(frame)
53
+ detections = results.xyxy[0].cpu().numpy()
54
+
55
+ frame_actions = []
56
+
57
+ for det in detections:
58
+ x1, y1, x2, y2, conf, cls = map(int, det[:6])
59
+ person_crop = frame[y1:y2, x1:x2]
60
+
61
+ # Pose estimation on cropped person
62
+ person_rgb = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
63
+ pose_result = pose.process(person_rgb)
64
+
65
+ if pose_result.pose_landmarks:
66
+ mp.solutions.drawing_utils.draw_landmarks(
67
+ person_crop, pose_result.pose_landmarks, mp_pose.POSE_CONNECTIONS
68
+ )
69
+
70
+ # Action recognition
71
+ pil_image = Image.fromarray(cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB))
72
+ pred = action_model(pil_image)
73
+ frame_actions.append(pred[0]['label'])
74
+
75
+ # Draw bounding box
76
+ cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
77
 
78
  output_frames.append(frame)
79
 
80
+ if frame_actions:
81
+ action_predictions.append(max(set(frame_actions), key=frame_actions.count))
 
 
82
 
83
  cap.release()
84
 
 
86
  return None, "Error: No frames to process."
87
 
88
  # Take the most frequent predicted action
89
+ action_label = max(set(action_predictions), key=action_predictions.count) if action_predictions else "Unknown"
90
 
91
  # Save annotated video
92
  output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
 
106
  fn=detect_pose_and_activity,
107
  inputs=gr.Video(label="Upload a Video (max 10s)"),
108
  outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
109
+ title="Multi-Person Pose & Activity Recognition",
110
+ description="Upload a short video (max 10s). The app detects multiple people, estimates their poses, and predicts their actions."
111
  )
112
 
113
+ iface.launch()