# preprocessing.py
import cv2
import mediapipe as mp
import tensorflow as tf


class VideoPreprocessor:
    """Extract, crop, and normalize the lip region from a video.

    Uses MediaPipe Face Mesh to locate lip landmarks per frame, crops the
    bounding box around the lips, resizes each crop to 85x85, converts it to
    grayscale, and returns the whole clip as a single z-score-normalized
    float32 tensor of shape (num_frames, 85, 85, 1).
    """

    def __init__(self):
        self.mp_face_mesh = mp.solutions.face_mesh
        # MediaPipe Face Mesh landmark indices for the outer lip contour
        # (upper and lower arcs; 291 is the shared mouth corner).
        self.UPPER_LIP_INDICES = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291]
        self.LOWER_LIP_INDICES = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
        self.LIP_INDICES = self.UPPER_LIP_INDICES + self.LOWER_LIP_INDICES

    def preprocess_video(self, video_path):
        """Read *video_path* and return a normalized tensor of lip crops.

        Args:
            video_path: Path (str) to a video file readable by OpenCV.

        Returns:
            A float32 tensor of shape (num_frames, 85, 85, 1), z-score
            normalized over the whole clip, or ``None`` when no lip frames
            could be extracted.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            with self.mp_face_mesh.FaceMesh(
                static_image_mode=False,
                max_num_faces=1,
                refine_landmarks=True,
                min_detection_confidence=0.5,
                min_tracking_confidence=0.5,
            ) as face_mesh:
                while cap.isOpened():
                    ret, frame = cap.read()
                    if not ret:
                        break

                    # OpenCV decodes to BGR; MediaPipe expects RGB.
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    results = face_mesh.process(rgb_frame)

                    if not results.multi_face_landmarks:
                        print("No face landmarks detected in frame.")
                        continue

                    # Only the first (and only, max_num_faces=1) face is used.
                    face_landmarks = results.multi_face_landmarks[0]
                    try:
                        lip_landmarks = [
                            face_landmarks.landmark[i] for i in self.LIP_INDICES
                        ]

                        # Landmarks are normalized [0, 1]; scale to pixels and
                        # clamp the bounding box to the frame.
                        h, w, _ = frame.shape
                        x_coords = [int(lm.x * w) for lm in lip_landmarks]
                        y_coords = [int(lm.y * h) for lm in lip_landmarks]
                        x_min, x_max = max(0, min(x_coords)), min(w, max(x_coords))
                        y_min, y_max = max(0, min(y_coords)), min(h, max(y_coords))

                        if x_max > x_min and y_max > y_min:
                            # BUG FIX: crop from the RGB frame, not the BGR one.
                            # tf.image.rgb_to_grayscale weights channels as
                            # R=0.299/G=0.587/B=0.114; feeding it BGR applied
                            # the red weight to the blue channel (and vice
                            # versa), skewing the grayscale output.
                            lip_frame = rgb_frame[y_min:y_max, x_min:x_max]
                            lip_frame_resized = cv2.resize(lip_frame, (85, 85))
                            lip_frame_gray = tf.image.rgb_to_grayscale(lip_frame_resized)
                            frames.append(lip_frame_gray)
                    except Exception as e:
                        # Best-effort per frame: log and move on so one bad
                        # frame does not abort the whole clip.
                        print(f"Error processing frame: {e}")
                        continue
        finally:
            # BUG FIX: release the capture even if the loop raises.
            cap.release()

        if not frames:
            print("No frames extracted during preprocessing.")
            return None  # Signal failure to the caller.

        # BUG FIX: the original computed the mean on the uint8 stack (integer,
        # truncating mean) and subtracted it in uint8, where values below the
        # mean wrapped modulo 256 before the float cast. Cast once to float32,
        # then normalize.
        frames = tf.cast(tf.stack(frames), tf.float32)
        mean = tf.math.reduce_mean(frames)
        std = tf.math.reduce_std(frames)
        # Floor std to avoid NaN/inf on a constant-intensity clip.
        return (frames - mean) / tf.maximum(std, 1e-6)