File size: 12,463 Bytes

import os
import argparse
import subprocess
import torch
import numpy as np
from tqdm import tqdm
from omegaconf import OmegaConf
from typing import Tuple, List, Union
import decord
import json
import cv2
from musetalk.utils.face_detection import FaceAlignment,LandmarksType
from mmpose.apis import inference_topdown, init_model
from mmpose.structures import merge_data_samples
import sys

def fast_check_ffmpeg():
    try:
        subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
        return True
    except:
        return False

ffmpeg_path = "./ffmpeg-4.4-amd64-static/"
if not fast_check_ffmpeg():
    print("Adding ffmpeg to PATH")
    # Choose path separator based on operating system
    path_separator = ';' if sys.platform == 'win32' else ':'
    os.environ["PATH"] = f"{args.ffmpeg_path}{path_separator}{os.environ['PATH']}"
    if not fast_check_ffmpeg():
        print("Warning: Unable to find ffmpeg, please ensure ffmpeg is properly installed")

class AnalyzeFace:
    def __init__(self, device: Union[str, torch.device], config_file: str, checkpoint_file: str):
        """
        Initialize the AnalyzeFace class with the given device, config file, and checkpoint file.

        Parameters:
        device (Union[str, torch.device]): The device to run the models on ('cuda' or 'cpu').
        config_file (str): Path to the mmpose model configuration file.
        checkpoint_file (str): Path to the mmpose model checkpoint file.
        """
        self.device = device
        self.dwpose = init_model(config_file, checkpoint_file, device=self.device)
        self.facedet = FaceAlignment(LandmarksType._2D, flip_input=False, device=self.device)

    def __call__(self, im: np.ndarray) -> Tuple[List[np.ndarray], np.ndarray]:
        """
        Detect faces and keypoints in the given image.

        Parameters:
        im (np.ndarray): The input image.
        maxface (bool): Whether to detect the maximum face. Default is True.

        Returns:
        Tuple[List[np.ndarray], np.ndarray]: A tuple containing the bounding boxes and keypoints.
        """
        try:
            # Ensure the input image has the correct shape
            if im.ndim == 3:
                im = np.expand_dims(im, axis=0)
            elif im.ndim != 4 or im.shape[0] != 1:
                raise ValueError("Input image must have shape (1, H, W, C)")
            
            bbox = self.facedet.get_detections_for_batch(np.asarray(im))
            results = inference_topdown(self.dwpose, np.asarray(im)[0])
            results = merge_data_samples(results)
            keypoints = results.pred_instances.keypoints
            face_land_mark= keypoints[0][23:91]
            face_land_mark = face_land_mark.astype(np.int32)

            return face_land_mark, bbox
        
        except Exception as e:
            print(f"Error during face analysis: {e}")
            return np.array([]),[] 

def convert_video(org_path: str, dst_path: str, vid_list: List[str]) -> None:

    """
    Convert video files to a specified format and save them to the destination path.

    Parameters:
    org_path (str): The directory containing the original video files.
    dst_path (str): The directory where the converted video files will be saved.
    vid_list (List[str]): A list of video file names to process.

    Returns:
    None
    """
    for idx, vid in enumerate(vid_list):
        if vid.endswith('.mp4'):
            org_vid_path = os.path.join(org_path, vid)
            dst_vid_path = os.path.join(dst_path, vid)
                
            if org_vid_path != dst_vid_path:
                cmd = [
                    "ffmpeg", "-hide_banner", "-y", "-i", org_vid_path, 
                    "-r", "25", "-crf", "15", "-c:v", "libx264", 
                    "-pix_fmt", "yuv420p", dst_vid_path
                ]
                subprocess.run(cmd, check=True)

            if idx % 1000 == 0:
                print(f"### {idx} videos converted ###")

def segment_video(org_path: str, dst_path: str, vid_list: List[str], segment_duration: int = 30) -> None:
    """
    Segment video files into smaller clips of specified duration.

    Parameters:
    org_path (str): The directory containing the original video files.
    dst_path (str): The directory where the segmented video files will be saved.
    vid_list (List[str]): A list of video file names to process.
    segment_duration (int): The duration of each segment in seconds. Default is 30 seconds.

    Returns:
    None
    """
    for idx, vid in enumerate(vid_list):
        if vid.endswith('.mp4'):
            input_file = os.path.join(org_path, vid)
            original_filename = os.path.basename(input_file)

            command = [
                'ffmpeg', '-i', input_file, '-c', 'copy', '-map', '0',
                '-segment_time', str(segment_duration), '-f', 'segment',
                '-reset_timestamps', '1',
                os.path.join(dst_path, f'clip%03d_{original_filename}')
            ]

            subprocess.run(command, check=True)

def extract_audio(org_path: str, dst_path: str, vid_list: List[str]) -> None:
    """
    Extract audio from video files and save as WAV format.

    Parameters:
    org_path (str): The directory containing the original video files.
    dst_path (str): The directory where the extracted audio files will be saved.
    vid_list (List[str]): A list of video file names to process.

    Returns:
    None
    """
    for idx, vid in enumerate(vid_list):
        if vid.endswith('.mp4'):
            video_path = os.path.join(org_path, vid)
            audio_output_path = os.path.join(dst_path, os.path.splitext(vid)[0] + ".wav")
            try:
                command = [
                    'ffmpeg', '-hide_banner', '-y', '-i', video_path,
                    '-vn', '-acodec', 'pcm_s16le', '-f', 'wav',
                    '-ar', '16000', '-ac', '1', audio_output_path,
                ]
                
                subprocess.run(command, check=True)
                print(f"Audio saved to: {audio_output_path}")
            except subprocess.CalledProcessError as e:
                print(f"Error extracting audio from {vid}: {e}")

def split_data(video_files: List[str], val_list_hdtf: List[str]) -> (List[str], List[str]):
    """
    Split video files into training and validation sets based on val_list_hdtf.

    Parameters:
    video_files (List[str]): A list of video file names.
    val_list_hdtf (List[str]): A list of validation file identifiers.

    Returns:
    (List[str], List[str]): A tuple containing the training and validation file lists.
    """
    val_files = [f for f in video_files if any(val_id in f for val_id in val_list_hdtf)]
    train_files = [f for f in video_files if f not in val_files]
    return train_files, val_files

def save_list_to_file(file_path: str, data_list: List[str]) -> None:
    """
    Save a list of strings to a file, each string on a new line.

    Parameters:
    file_path (str): The path to the file where the list will be saved.
    data_list (List[str]): The list of strings to save.

    Returns:
    None
    """
    with open(file_path, 'w') as file:
        for item in data_list:
            file.write(f"{item}\n")

def generate_train_list(cfg):
    train_file_path = cfg.video_clip_file_list_train
    val_file_path = cfg.video_clip_file_list_val
    val_list_hdtf = cfg.val_list_hdtf

    meta_list = os.listdir(cfg.meta_root)

    sorted_meta_list = sorted(meta_list)
    train_files, val_files = split_data(meta_list, val_list_hdtf)

    save_list_to_file(train_file_path, train_files)
    save_list_to_file(val_file_path, val_files)

    print(val_list_hdtf)    

def analyze_video(org_path: str, dst_path: str, vid_list: List[str]) -> None:
    """
    Convert video files to a specified format and save them to the destination path.

    Parameters:
    org_path (str): The directory containing the original video files.
    dst_path (str): The directory where the meta json will be saved.
    vid_list (List[str]): A list of video file names to process.

    Returns:
    None
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    config_file = './musetalk/utils/dwpose/rtmpose-l_8xb32-270e_coco-ubody-wholebody-384x288.py'
    checkpoint_file = './models/dwpose/dw-ll_ucoco_384.pth'

    analyze_face = AnalyzeFace(device, config_file, checkpoint_file)
    
    for vid in tqdm(vid_list, desc="Processing videos"):
        #vid = "clip005_WDA_BernieSanders_000.mp4"
        #print(vid)
        if vid.endswith('.mp4'):
            vid_path = os.path.join(org_path, vid)
            wav_path = vid_path.replace(".mp4",".wav")
            vid_meta = os.path.join(dst_path, os.path.splitext(vid)[0] + ".json")
            if os.path.exists(vid_meta):
                continue
            print('process video {}'.format(vid))

            total_bbox_list = []
            total_pts_list = []
            isvalid = True

            # process
            try:
                cap = decord.VideoReader(vid_path, fault_tol=1)
            except Exception as e:
                print(e)
                continue

            total_frames = len(cap)
            for frame_idx in range(total_frames):
                frame = cap[frame_idx]
                if frame_idx==0:
                    video_height,video_width,_ = frame.shape
                frame_bgr = cv2.cvtColor(frame.asnumpy(), cv2.COLOR_BGR2RGB)
                pts_list, bbox_list = analyze_face(frame_bgr)

                if len(bbox_list)>0 and None not in bbox_list:
                    bbox = bbox_list[0]
                else:
                    isvalid = False
                    bbox = []
                    print(f"set isvalid to False as broken img in {frame_idx} of {vid}")
                    break

                #print(pts_list)
                if len(pts_list)>0 and pts_list is not None:
                    pts = pts_list.tolist()
                else:
                    isvalid = False
                    pts = []
                    break

                if frame_idx==0:
                    x1,y1,x2,y2 = bbox 
                    face_height, face_width = y2-y1,x2-x1

                total_pts_list.append(pts)
                total_bbox_list.append(bbox)

            meta_data = {
                    "mp4_path": vid_path,
                     "wav_path": wav_path,
                     "video_size": [video_height, video_width],
                     "face_size": [face_height, face_width],
                     "frames": total_frames,
                     "face_list": total_bbox_list,
                     "landmark_list": total_pts_list,
                     "isvalid":isvalid,
            }        
            with open(vid_meta, 'w') as f:
                json.dump(meta_data, f, indent=4)
            


def main(cfg):
    # Ensure all necessary directories exist
    os.makedirs(cfg.video_root_25fps, exist_ok=True)
    os.makedirs(cfg.video_audio_clip_root, exist_ok=True)
    os.makedirs(cfg.meta_root, exist_ok=True)
    os.makedirs(os.path.dirname(cfg.video_file_list), exist_ok=True)
    os.makedirs(os.path.dirname(cfg.video_clip_file_list_train), exist_ok=True)
    os.makedirs(os.path.dirname(cfg.video_clip_file_list_val), exist_ok=True)

    vid_list = os.listdir(cfg.video_root_raw)
    sorted_vid_list = sorted(vid_list)
 
    # Save video file list
    with open(cfg.video_file_list, 'w') as file:
        for vid in sorted_vid_list:
            file.write(vid + '\n')

    # 1. Convert videos to 25 FPS
    convert_video(cfg.video_root_raw, cfg.video_root_25fps, sorted_vid_list)
    
    # 2. Segment videos into 30-second clips
    segment_video(cfg.video_root_25fps, cfg.video_audio_clip_root, vid_list, segment_duration=cfg.clip_len_second)
    
    # 3. Extract audio
    clip_vid_list = os.listdir(cfg.video_audio_clip_root)
    extract_audio(cfg.video_audio_clip_root, cfg.video_audio_clip_root, clip_vid_list)
    
    # 4. Generate video metadata
    analyze_video(cfg.video_audio_clip_root, cfg.meta_root, clip_vid_list)
    
    # 5. Generate training and validation set lists
    generate_train_list(cfg)
    print("done")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="./configs/training/preprocess.yaml")
    args = parser.parse_args()
    config = OmegaConf.load(args.config)

    main(config)