import gradio as gr
import cv2
import tempfile
from collections import Counter
from transformers import pipeline
from PIL import Image
import mediapipe as mp

# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose

# Load the Hugging Face action-recognition model (a ViT fine-tuned on the
# Human Action Recognition dataset; its 15 labels are listed below)
action_model = pipeline(
    "image-classification",
    model="rvv-karma/Human-Action-Recognition-VIT-Base-patch16-224",
)

# Action labels predicted by the model (listed for reference; the pipeline
# returns these label strings directly)
action_labels = [
    "calling", "clapping", "cycling", "dancing", "drinking",
    "eating", "fighting", "hugging", "laughing", "listening_to_music",
    "running", "sitting", "sleeping", "texting", "using_laptop",
]


def detect_pose_and_activity(video_file):
    """
    Process the uploaded video to detect human poses and classify the activity.
    The video is trimmed to 10 seconds if longer.
    Returns the annotated video and the predicted activity label.
    """
    try:
        # Gradio's Video component passes a filepath, so OpenCV can read it directly
        cap = cv2.VideoCapture(video_file)
        if not cap.isOpened():
            return None, "Error: Could not open video file. Please upload a valid mp4 video."

        fps = cap.get(cv2.CAP_PROP_FPS)
        if fps == 0:
            fps = 30  # fallback if FPS metadata is missing
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        max_frames = int(min(total_frames / fps, 10) * fps)  # limit to 10 seconds

        output_frames = []       # annotated frames for the output video
        frame_predictions = []   # top-1 action label per sampled frame
        sample_every = max(int(fps), 1)  # classify roughly one frame per second

        with mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5) as pose:
            for i in range(max_frames):
                ret, frame = cap.read()
                if not ret:
                    break
                image_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = pose.process(image_rgb)

                # Draw the detected skeleton onto the frame
                if results.pose_landmarks:
                    mp.solutions.drawing_utils.draw_landmarks(
                        frame, results.pose_landmarks, mp_pose.POSE_CONNECTIONS
                    )

                # Classify a subsample of frames with the ViT action model
                if i % sample_every == 0:
                    preds = action_model(Image.fromarray(image_rgb))
                    frame_predictions.append(preds[0]["label"])

                output_frames.append(frame)

        cap.release()

        if not output_frames or not frame_predictions:
            return None, "Error: No frames could be read from the video."

        # Majority vote over the per-frame predictions
        action_label = Counter(frame_predictions).most_common(1)[0][0]

        # Save the annotated output video
        output_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name
        height, width, _ = output_frames[0].shape
        out = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
        for f in output_frames:
            out.write(f)
        out.release()

        return output_file, f"Predicted Action: {action_label}"
    except Exception as e:
        return None, f"Runtime Error: {str(e)}"


# Gradio Interface
iface = gr.Interface(
    fn=detect_pose_and_activity,
    inputs=gr.Video(label="Upload a Video (max 10s)"),
    outputs=[gr.Video(label="Pose Detection Output"), gr.Textbox(label="Detected Action")],
    title="Human Pose & Activity Recognition",
    description=(
        "Upload a short video (max 10s); the app will overlay detected human "
        "poses and predict the activity (e.g., dancing, cycling, running)."
    ),
)

iface.launch()
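# ---------------------------------------------------------------------------
# Usage note (an assumption, inferred from the imports above rather than
# stated in the script): running this app locally or as a Hugging Face Space
# implies a requirements file along these lines:
#
#   gradio
#   opencv-python
#   transformers
#   torch          # backend required by the transformers pipeline
#   mediapipe
#   Pillow
#
# Example local run: `python app.py` (filename assumed), then open the URL
# Gradio prints (http://127.0.0.1:7860 by default).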