import tempfile

# NOTE(review): cv2, mediapipe (as mp), and gradio (as gr) are used below but
# never imported here — presumably re-exported by micro_gestures' star import.
# TODO: confirm, and prefer explicit imports over `import *`.
from micro_gestures import *
from composite_gestures import detect_composite_gesture

# MediaPipe module handles.
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

# BUG FIX(review): the Hands(...) constructor call was truncated in the
# original source (only the dangling `min_tracking_confidence=0.5)` fragment
# survived). Reconstructed with MediaPipe's standard video-mode defaults.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)


def process_video(video_path, target_width=640):
    """Run MediaPipe hand detection over a video and annotate each frame.

    Every frame is resized to ``target_width`` (aspect ratio preserved),
    hand landmarks are drawn on it, per-frame micro-gestures are classified
    and buffered, and composite gestures are detected from the buffer.

    Args:
        video_path: Path to the input video file.
        target_width: Width in pixels to which frames are scaled before
            processing (smaller = faster).

    Returns:
        Tuple of (path to the processed .mp4 file, last detected composite
        gesture label, or "None" if nothing was detected).
    """
    cap = cv2.VideoCapture(video_path)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    temp_output = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    # BUG FIX: close the handle so cv2.VideoWriter can reopen the path by
    # name (required on Windows; harmless elsewhere). The file persists
    # because delete=False.
    temp_output.close()

    out = None                  # created lazily once the frame size is known
    sequence_buffer = []        # rolling window of recent micro-gesture labels
    last_label = "None"         # last composite gesture detected

    while cap.isOpened():
        ret, frame = cap.read()
        # BUG FIX: original broke out of the loop unconditionally after the
        # first read; only stop when the stream is exhausted.
        if not ret:
            break

        # Resize, preserving aspect ratio.
        h0, w0 = frame.shape[:2]
        scale = target_width / float(w0)
        target_height = int(round(h0 * scale))
        frame_small = cv2.resize(
            frame, (target_width, target_height),
            interpolation=cv2.INTER_AREA,
        )

        # MediaPipe expects RGB; OpenCV delivers BGR.
        rgb_small = cv2.cvtColor(frame_small, cv2.COLOR_BGR2RGB)
        hand_results = hands.process(rgb_small)

        micro_label = ""
        if hand_results.multi_hand_landmarks:
            for hand_landmarks in hand_results.multi_hand_landmarks:
                mp_drawing.draw_landmarks(
                    frame_small, hand_landmarks, mp_hands.HAND_CONNECTIONS
                )
                landmarks = [(lm.x, lm.y, lm.z)
                             for lm in hand_landmarks.landmark]
                if fist_closed(landmarks):
                    micro_label = "fist_closed"
                elif palm_open(landmarks):
                    # NOTE(review): palm_open() maps to "index_up" in the
                    # original — looks like a copy/paste mismatch, but
                    # detect_composite_gesture may depend on this exact
                    # label; confirm before renaming.
                    micro_label = "index_up"
                elif thumb_up(landmarks):
                    micro_label = "thumb_up"

        if micro_label:
            sequence_buffer.append(micro_label)
            if len(sequence_buffer) > 5:
                sequence_buffer.pop(0)  # keep only the 5 most recent labels

        # Detect a composite gesture from the micro-gesture sequence.
        if sequence_buffer:
            detected = detect_composite_gesture(sequence_buffer)
            if detected:
                last_label = detected

        # Lazily initialize the writer once the output frame size is known.
        if out is None:
            fps = cap.get(cv2.CAP_PROP_FPS)
            # BUG FIX(review): the original fps-fallback branch and the
            # VideoWriter construction were truncated; reconstructed with a
            # 30 fps fallback for missing/bogus metadata.
            if fps <= 0 or fps > 120:
                fps = 30.0
            out = cv2.VideoWriter(
                temp_output.name, fourcc, fps,
                (frame_small.shape[1], frame_small.shape[0]),
            )

        out.write(frame_small)

    cap.release()
    if out:
        out.release()

    return temp_output.name, last_label
# Gradio interface: wires process_video to a video upload + width slider,
# and shows the annotated video plus the detected gesture label.
iface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.Video(label="Upload or Record Video"),
        gr.Slider(minimum=160, maximum=1280, value=640, step=20,
                  label="Processing Width"),
    ],
    outputs=[
        gr.Video(label="Processed Video with Landmarks"),
        gr.Textbox(label="Detected Gesture", interactive=False),
    ],
    title="Hand & Body Pose Detection",
    description="Upload or record a video, see MediaPipe detect hand "
                "landmarks. Gesture label is shown below the video.",
)

if __name__ == "__main__":
    # BUG FIX: the original __main__ guard had no body (syntax error / no-op);
    # launch the Gradio app when run as a script.
    iface.launch()