Test

Runtime error

File size: 5,497 Bytes

83e35a7

#!/usr/bin/env python3
"""
Modern Face Detection for Accurate Bubble Placement
Uses state-of-the-art models for better face and lip detection
"""

import cv2
import numpy as np
import os
from typing import Tuple, List, Optional

class ModernFaceDetector:
    """Locate lip positions in images, preferring MediaPipe over OpenCV.

    On construction the class tries to import MediaPipe and set up a
    FaceMesh; if MediaPipe is not installed it falls back to OpenCV's
    ``FaceDetectorYN`` (YuNet), downloading the ONNX model when the file is
    missing. Every ``detect_*`` method accepts either an image file path or
    an already-loaded image array and returns a list of ``(x, y)`` pixel
    coordinates, or ``[(-1, -1)]`` when the image cannot be read or no face
    is found.
    """

    # Where the YuNet ONNX model is expected on disk (relative to the CWD).
    MODEL_PATH = "backend/speech_bubble/face_detection_yunet_2023mar.onnx"
    # Source used to fetch the model when MODEL_PATH does not exist.
    MODEL_URL = "https://github.com/opencv/opencv_zoo/raw/main/models/face_detection_yunet/face_detection_yunet_2023mar.onnx"
    # Sentinel position returned when nothing could be detected.
    NO_FACE = (-1, -1)

    def __init__(self):
        """Initialize modern face detection models"""

        # Option 1: MediaPipe (Google's modern face detection)
        try:
            import mediapipe as mp
        except ImportError:
            self.use_mediapipe = False
            print("MediaPipe not available, using OpenCV")
        else:
            self.mp_face_mesh = mp.solutions.face_mesh
            self.face_mesh = self.mp_face_mesh.FaceMesh(
                static_image_mode=True,
                max_num_faces=4,
                refine_landmarks=True,
                min_detection_confidence=0.5
            )
            self.use_mediapipe = True
            print("Using MediaPipe face detection")

        # Option 2: OpenCV DNN face detector (more modern than dlib)
        if not self.use_mediapipe:
            if not os.path.exists(self.MODEL_PATH):
                # Download if not available
                self._download_face_model()

            # Positional arguments after the (empty) config string: initial
            # input size, score threshold, NMS threshold, top_k. The input
            # size is overwritten per image in detect_faces_opencv().
            self.face_detector = cv2.FaceDetectorYN_create(
                self.MODEL_PATH,
                "",
                (320, 320),
                0.9,
                0.3,
                5000
            )

    def _download_face_model(self):
        """Download OpenCV face detection model if not available.

        Creates the destination directory first so the download does not
        fail with ``FileNotFoundError`` on a fresh checkout. Network errors
        raised by ``urlretrieve`` propagate to the caller.
        """
        import urllib.request
        target_dir = os.path.dirname(self.MODEL_PATH)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        print(f"Downloading face detection model from {self.MODEL_URL}")
        urllib.request.urlretrieve(self.MODEL_URL, self.MODEL_PATH)

    @staticmethod
    def _load_image(image):
        """Return the image array for *image*.

        A ``str`` is treated as a file path and read with ``cv2.imread``
        (which yields ``None`` when the file cannot be decoded); anything
        else is returned unchanged.
        """
        if isinstance(image, str):
            return cv2.imread(image)
        return image

    @staticmethod
    def _lip_center(face_landmarks, width: int, height: int) -> Tuple[int, int]:
        """Convert normalized lip landmarks into a pixel-space lip center.

        Averages landmarks 13 (upper lip center) and 14 (lower lip center),
        whose ``x``/``y`` are fractions of the image size, and scales them
        by *width* and *height*.
        """
        upper_lip = face_landmarks.landmark[13]
        lower_lip = face_landmarks.landmark[14]
        lip_x = int((upper_lip.x + lower_lip.x) / 2 * width)
        lip_y = int((upper_lip.y + lower_lip.y) / 2 * height)
        return lip_x, lip_y

    @staticmethod
    def _lip_from_box(face) -> Tuple[int, int]:
        """Estimate the lip position from one detection row.

        The first four values of *face* are taken as the bounding box
        ``x, y, w, h``. The lips are assumed to sit at the horizontal center
        of the box, 70% of the way down. Results are converted to built-in
        ``int`` so they match the annotated return type and serialize
        cleanly.
        """
        x, y, w, h = face[:4].astype(int)
        return int(x + w // 2), int(y + int(h * 0.7))

    def detect_faces_mediapipe(self, image) -> List[Tuple[int, int]]:
        """Detect faces using MediaPipe (most accurate).

        Args:
            image: Path to an image file, or a BGR image array.

        Returns:
            One ``(lip_x, lip_y)`` pixel position per detected face, or
            ``[(-1, -1)]`` if the image is unreadable or has no faces.
        """
        img = self._load_image(image)
        if img is None:
            return [self.NO_FACE]

        # Dimensions must come from the decoded array: `image` may be a path
        # string, which has no `.shape`.
        height, width = img.shape[:2]

        rgb_image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        results = self.face_mesh.process(rgb_image)

        lip_positions = [
            self._lip_center(face_landmarks, width, height)
            for face_landmarks in (results.multi_face_landmarks or [])
        ]
        return lip_positions if lip_positions else [self.NO_FACE]

    def detect_faces_opencv(self, image) -> List[Tuple[int, int]]:
        """Detect faces using OpenCV DNN (fallback).

        Args:
            image: Path to an image file, or a BGR image array.

        Returns:
            One estimated ``(lip_x, lip_y)`` pixel position per detected
            face, or ``[(-1, -1)]`` if the image is unreadable or has no
            faces.
        """
        img = self._load_image(image)
        if img is None:
            return [self.NO_FACE]

        # The detector must be told the actual frame size before each call.
        height, width = img.shape[:2]
        self.face_detector.setInputSize((width, height))

        _, faces = self.face_detector.detect(img)
        if faces is None:
            return [self.NO_FACE]

        lip_positions = [self._lip_from_box(face) for face in faces]
        return lip_positions if lip_positions else [self.NO_FACE]

    def detect_faces(self, image) -> List[Tuple[int, int]]:
        """Main face detection method: dispatch to the active backend."""
        if self.use_mediapipe:
            return self.detect_faces_mediapipe(image)
        return self.detect_faces_opencv(image)

def get_modern_lip_positions(video_path: str, frame_paths: List[str]) -> dict:
    """
    Map each frame to the lip position of its first detected face.

    A single ModernFaceDetector is created and reused for all frames.
    `video_path` is accepted but not used by this function.

    Returns: {frame_index: (lip_x, lip_y)} with 1-based frame indices;
    frames that do not exist on disk map to (-1, -1).
    """
    missing = (-1, -1)
    face_detector = ModernFaceDetector()
    by_frame = {}

    for frame_number, path in enumerate(frame_paths, start=1):
        if not os.path.exists(path):
            by_frame[frame_number] = missing
            continue

        found = face_detector.detect_faces(path)
        # Only the first reported face is kept for each frame.
        by_frame[frame_number] = found[0] if found else missing

    return by_frame

if __name__ == "__main__":
    # Manual smoke test: run the detector on one sample frame, if present.
    sample_frame = "frames/final/frame001.png"
    if not os.path.exists(sample_frame):
        print("Test image not found")
    else:
        found = ModernFaceDetector().detect_faces(sample_frame)
        print(f"Detected lip positions: {found}")