# Lip_Reading / preprocessing.py
# (Hugging Face page artifact — "thienphuc12339: Update preprocessing.py, commit db67496 verified")
# preprocessing.py
import cv2
import mediapipe as mp
import tensorflow as tf
class VideoPreprocessor:
    """Extract, crop, and normalize lip regions from a video for lip reading.

    Uses MediaPipe FaceMesh to locate lip landmarks per frame, crops the lip
    bounding box, resizes it to 85x85, converts to grayscale, and returns a
    z-score-normalized float32 tensor of shape (num_frames, 85, 85, 1).
    """

    def __init__(self):
        self.mp_face_mesh = mp.solutions.face_mesh
        # MediaPipe FaceMesh landmark indices outlining the outer lip contour.
        self.UPPER_LIP_INDICES = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291]
        self.LOWER_LIP_INDICES = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
        self.LIP_INDICES = self.UPPER_LIP_INDICES + self.LOWER_LIP_INDICES

    def preprocess_video(self, video_path):
        """Read *video_path* and return normalized lip-crop frames.

        Args:
            video_path: Path to a video file readable by OpenCV.

        Returns:
            A float32 tensor of shape (num_frames, 85, 85, 1), normalized to
            zero mean and unit variance across the whole clip, or ``None`` if
            no lip frames could be extracted.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            with self.mp_face_mesh.FaceMesh(
                static_image_mode=False,
                max_num_faces=1,
                refine_landmarks=True,
                min_detection_confidence=0.5,
                min_tracking_confidence=0.5,
            ) as face_mesh:
                while cap.isOpened():
                    ret, frame = cap.read()
                    if not ret:
                        break
                    # MediaPipe expects RGB; OpenCV decodes as BGR.
                    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    results = face_mesh.process(rgb_frame)
                    if not results.multi_face_landmarks:
                        print("No face landmarks detected in frame.")
                        continue
                    # Only the first (single) detected face is used.
                    face_landmarks = results.multi_face_landmarks[0]
                    try:
                        lip_landmarks = [face_landmarks.landmark[i] for i in self.LIP_INDICES]
                        # Landmark coordinates are normalized; scale to pixels
                        # and clamp the bounding box to the image bounds.
                        h, w, _ = frame.shape
                        x_coords = [int(lm.x * w) for lm in lip_landmarks]
                        y_coords = [int(lm.y * h) for lm in lip_landmarks]
                        x_min, x_max = max(0, min(x_coords)), min(w, max(x_coords))
                        y_min, y_max = max(0, min(y_coords)), min(h, max(y_coords))
                        if x_max > x_min and y_max > y_min:
                            # BUG FIX: crop from the RGB frame. The original
                            # cropped the BGR frame and fed it to
                            # tf.image.rgb_to_grayscale, which applies RGB
                            # luminance weights — red and blue were swapped.
                            lip_frame = rgb_frame[y_min:y_max, x_min:x_max]
                            lip_frame_resized = cv2.resize(lip_frame, (85, 85))
                            lip_frame_gray = tf.image.rgb_to_grayscale(lip_frame_resized)
                            frames.append(lip_frame_gray)
                    except Exception as e:
                        # Best-effort per-frame processing: log and move on.
                        print(f"Error processing frame: {e}")
                        continue
        finally:
            # BUG FIX: release the capture even if FaceMesh raises.
            cap.release()
        if not frames:
            print("No frames extracted during preprocessing.")
            return None  # Signal failure to the caller
        # BUG FIX: cast to float32 BEFORE the statistics. The original computed
        # reduce_mean on a uint8 tensor (integer-truncated mean) and performed
        # (frames - mean) in uint8, which wraps around below zero.
        frames = tf.cast(tf.stack(frames), tf.float32)
        mean = tf.math.reduce_mean(frames)
        std = tf.math.reduce_std(frames)
        # Guard against division by zero for constant (degenerate) clips.
        normalized_frames = (frames - mean) / tf.maximum(std, 1e-6)
        return normalized_frames