import gradio as gr
import cv2
import mediapipe as mp
import torch
import numpy as np
import tempfile
from transformers import pipeline
from PIL import Image
# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose

# Hugging Face pretrained model for action recognition
action_model = pipeline(
    "image-classification",
    model="rvv-karma/Human-Action-Recognition-VIT-Base-patch16-224",
)
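
# Note: the pipeline above runs on CPU by default; on a machine with a CUDA
# build of torch, passing device=0 to pipeline() would move inference to the GPU.
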
def detect_pose_and_activity(video_file):
    """
    Process the uploaded video to detect human poses and classify the activity.
    The video is limited to 10 seconds. Returns the annotated video and the
    predicted action.
    """
    try:
        # Copy the uploaded video to a temporary file
        temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        with open(video_file, "rb") as src:
            temp_video.write(src.read())
        temp_video.close()

        cap = cv2.VideoCapture(temp_video.name)
        if not cap.isOpened():
            return None, "Error: Could not open video."

        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps == 0:
            fps = 30  # fallback for files with missing FPS metadata

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            total_frames = int(10 * fps)  # fallback when the frame count is unknown
        max_frames = int(min(total_frames / fps, 10) * fps)  # limit to 10 s

        output_frames = []
        action_predictions = []

        # Process frames: draw pose landmarks, then classify each frame's action
        with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
            for _ in range(max_frames):
                ret, frame = cap.read()
                if not ret:
                    break

                # Pose detection (MediaPipe expects RGB input)
                image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = pose.process(image_rgb)
                if results.pose_landmarks:
                    mp.solutions.drawing_utils.draw_landmarks(
                        frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS
                    )
                output_frames.append(frame)

                # Convert the frame to a PIL image for the Hugging Face model
                pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                pred = action_model(pil_image)
                action_predictions.append(pred[0]["label"])

        cap.release()

        if len(output_frames) == 0:
            return None, "Error: No frames to process."

        # Take the most frequent predicted action across the sampled frames
        action_label = max(set(action_predictions), key=action_predictions.count)

        # Save the annotated video (close the temp handle before cv2 writes to it)
        output_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
        output_tmp.close()
        output_file = output_tmp.name
        height, width, _ = output_frames[0].shape
        out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
        for f in output_frames:
            out.write(f)
        out.release()

        return output_file, f"Predicted Action: {action_label}"
    except Exception as e:
        return None, f"Runtime Error: {str(e)}"
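
# A minimal sketch for exercising the function locally, outside the Gradio UI.
# "sample.mp4" is a hypothetical path, not a file shipped with this app:
#
#     annotated_path, label = detect_pose_and_activity("sample.mp4")
#     print(label)
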
# Gradio Interface
iface = gr.Interface(
    fn=detect_pose_and_activity,
    inputs=gr.Video(label="Upload a Video (max 10s)"),
    outputs=[
        gr.Video(label="Pose Detection Output"),
        gr.Textbox(label="Detected Action"),
    ],
    title="Human Pose & Activity Recognition",
    description=(
        "Upload a short video (max 10s). The app detects human poses and "
        "predicts the activity (e.g., dancing, cycling, running)."
    ),
)
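
# Per-frame classification is slow on CPU; if requests pile up, chaining
# iface.queue().launch() instead of a bare launch() would buffer them
# (queue() is a standard Gradio method).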
if __name__ == "__main__":
    iface.launch()