File size: 3,305 Bytes
b126da6
8a07d8a
41a19eb
 
3276b67
41a19eb
 
 
 
 
 
 
 
 
8a07d8a
41a19eb
 
8a07d8a
3276b67
41a19eb
 
 
 
 
 
 
 
 
 
 
3263c3e
41a19eb
3263c3e
 
41a19eb
3263c3e
41a19eb
3263c3e
41a19eb
3263c3e
41a19eb
3263c3e
41a19eb
3263c3e
3276b67
41a19eb
3276b67
 
3263c3e
3276b67
 
3263c3e
41a19eb
3263c3e
41a19eb
3263c3e
1d3782e
41a19eb
3263c3e
8a07d8a
 
3263c3e
41a19eb
 
3276b67
3263c3e
41a19eb
3276b67
ec68455
8a07d8a
ec68455
8a07d8a
 
 
ec68455
8a07d8a
 
ec68455
8a07d8a
 
 
 
3263c3e
8a07d8a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# preprocessing.py

import cv2
import mediapipe as mp
import tensorflow as tf

class VideoPreprocessor:
    """Extract, crop, and normalize lip regions from a video for lip-reading.

    Uses MediaPipe FaceMesh to locate facial landmarks in each frame, crops
    the mouth region, resizes it to 85x85, converts it to grayscale, and
    returns a zero-mean/unit-variance float32 tensor of shape (T, 85, 85, 1).
    """

    def __init__(self):
        # FaceMesh solution module; the FaceMesh instance itself is created
        # per video inside preprocess_video so its resources are released.
        self.mp_face_mesh = mp.solutions.face_mesh
        # MediaPipe FaceMesh landmark indices outlining the outer lip contour.
        self.UPPER_LIP_INDICES = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291]
        self.LOWER_LIP_INDICES = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
        self.LIP_INDICES = self.UPPER_LIP_INDICES + self.LOWER_LIP_INDICES

    def preprocess_video(self, video_path):
        """Return a normalized (T, 85, 85, 1) float32 tensor of lip crops.

        Args:
            video_path: Path to a video file readable by OpenCV.

        Returns:
            A tf.float32 tensor of per-frame grayscale lip crops normalized
            to zero mean / unit variance, or None when no usable frames were
            extracted (unreadable video, or no face detected in any frame).
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"Could not open video: {video_path}")
            return None

        frames = []
        try:
            with self.mp_face_mesh.FaceMesh(
                static_image_mode=False,
                max_num_faces=1,
                refine_landmarks=True,
                min_detection_confidence=0.5,
                min_tracking_confidence=0.5
            ) as face_mesh:
                while cap.isOpened():
                    ret, frame = cap.read()
                    if not ret:
                        break
                    # OpenCV decodes frames as BGR; both MediaPipe and
                    # tf.image.rgb_to_grayscale (used below) expect RGB.
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                    # Run the face-landmark model on the frame.
                    results = face_mesh.process(rgb_frame)

                    if not results.multi_face_landmarks:
                        print("No face landmarks detected in frame.")
                        continue

                    # Only the first (and, with max_num_faces=1, only) face.
                    face_landmarks = results.multi_face_landmarks[0]

                    try:
                        lip_landmarks = [face_landmarks.landmark[i] for i in self.LIP_INDICES]

                        # Landmark coordinates are normalized to [0, 1];
                        # scale to pixels and clamp the box to the frame.
                        h, w, _ = frame.shape
                        x_coords = [int(landmark.x * w) for landmark in lip_landmarks]
                        y_coords = [int(landmark.y * h) for landmark in lip_landmarks]

                        x_min, x_max = max(0, min(x_coords)), min(w, max(x_coords))
                        y_min, y_max = max(0, min(y_coords)), min(h, max(y_coords))

                        if x_max > x_min and y_max > y_min:
                            # BUG FIX: crop from the RGB frame, not the BGR
                            # one — rgb_to_grayscale applies ITU-R luma
                            # weights that assume RGB channel order.
                            lip_frame = rgb_frame[y_min:y_max, x_min:x_max]

                            # Resize to the model's expected 85x85 input.
                            lip_frame_resized = cv2.resize(lip_frame, (85, 85))

                            # Grayscale conversion keeps a trailing channel
                            # dim, giving each crop shape (85, 85, 1).
                            lip_frame_gray = tf.image.rgb_to_grayscale(lip_frame_resized)

                            frames.append(lip_frame_gray)
                    except Exception as e:
                        print(f"Error processing frame: {e}")
                        continue  # Skip this frame
        finally:
            # Release the capture even if FaceMesh raises mid-video.
            cap.release()

        if not frames:
            print("No frames extracted during preprocessing.")
            return None  # Return None to indicate failure

        # BUG FIX: cast to float32 BEFORE computing statistics. The original
        # reduced the mean in uint8 (truncating it) and the uint8 subtraction
        # wrapped around for pixels darker than the mean.
        frames = tf.cast(tf.stack(frames), tf.float32)

        # Normalize to zero mean / unit variance over the whole clip.
        mean = tf.math.reduce_mean(frames)
        std = tf.math.reduce_std(frames)
        # Guard against division by zero for a constant-intensity clip.
        std = tf.maximum(std, 1e-6)

        return (frames - mean) / std