File size: 7,875 Bytes
5c69097
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import os
import cv2
import json
import glob
import pickle
import shutil
import subprocess
from typing import List, Optional
from cog import BasePredictor, BaseModel, Input, Path


class Output(BaseModel):
    # Cog output model. Exactly one field is populated per prediction,
    # selected by the `return_json` input of Predictor.predict:
    #   media_path — annotated .mp4 files (when return_json is False)
    #   json_str   — per-frame face boxes + speaking flags as a JSON string
    #                (when return_json is True)
    media_path: Optional[List[Path]]
    json_str: Optional[str]


class Predictor(BasePredictor):
    """Cog predictor that runs TalkNet active-speaker detection on a video.

    The heavy lifting is delegated to the project's ``demoTalkNet.py``
    script, which is invoked as a subprocess and writes its results
    (face tracks and speaking scores) as pickle files under a ``pywork``
    folder.
    """

    def setup(self):
        # No model is loaded here: demoTalkNet.py loads its own pretrained
        # weights (pretrain_TalkSet.model) on every invocation.
        pass

    def predict(
        self,
        video: Path = Input(description="Path to the video"),
        face_det_scale: float = Input(
            default=0.25,
            description="Scale factor for face detection, the frames will be scaled to 0.25 of the original",
            ge=0,
            le=1,
        ),
        min_track: int = Input(
            default=10, description="Number of min frames for each shot"
        ),
        num_failed_det: int = Input(
            default=10,
            description="Number of missed detections allowed before tracking is stopped",
            ge=1,
        ),
        min_face_size: int = Input(
            default=1, description="Minimum face size in pixels", ge=1
        ),
        crop_scale: float = Input(
            default=0.40, description="Scale bounding box", ge=0, le=1
        ),
        start: int = Input(default=0, description="The start time of the video", ge=0),
        duration: int = Input(
            default=-1,
            description="The duration of the video, when set as -1, will extract the whole video",
        ),
        return_json: bool = Input(
            description="Return results in json format", default=True
        ),
        return_boundingbox_percentages: bool = Input(
            description="Return bounding box coordinates as percentages of the video width and height",
            default=False,
        ),
    ) -> Output:
        """Run active-speaker detection and return JSON data or media files.

        Returns:
            Output with ``json_str`` set (per-frame face boxes and speaking
            flags) when ``return_json`` is True, otherwise ``media_path``
            set to the annotated .mp4 files produced by demoTalkNet.py.
        """
        video_path = str(video)
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        video_folder = "demo"

        # Start from a clean working directory so the "latest pywork" glob
        # below cannot pick up stale results from a previous run.
        shutil.rmtree(video_folder, ignore_errors=True)
        os.makedirs(video_folder, exist_ok=True)

        # demoTalkNet.py expects the input video inside its video folder.
        target_video_path = os.path.join(video_folder, os.path.basename(video_path))
        shutil.copy(video_path, target_video_path)

        # demoTalkNet.py treats duration == 0 as "process the whole video",
        # so the public -1 sentinel is clamped to 0 here on purpose.
        duration = max(0, duration)

        self._run_talknet(
            video_name,
            video_folder,
            face_det_scale,
            min_track,
            num_failed_det,
            min_face_size,
            crop_scale,
            start,
            duration,
        )

        face_tracks, scores = self._load_results(video_folder)

        # Read the frame dimensions of the copied input for optional
        # percentage normalization of the bounding boxes.
        capture = cv2.VideoCapture(target_video_path)
        video_width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
        video_height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
        capture.release()

        output_data = self._build_output_data(
            face_tracks,
            scores,
            video_width,
            video_height,
            return_boundingbox_percentages,
        )
        json_str = json.dumps(output_data)

        if return_json:
            return Output(json_str=json_str)
        return Output(media_path=self._collect_mp4s(video_folder))

    def _run_talknet(
        self,
        video_name,
        video_folder,
        face_det_scale,
        min_track,
        num_failed_det,
        min_face_size,
        crop_scale,
        start,
        duration,
    ):
        """Invoke the demoTalkNet.py pipeline as a subprocess.

        Uses an argv list with shell=False so filenames containing spaces
        or shell metacharacters cannot break (or inject into) a shell
        command line.
        """
        command = [
            "python",
            "demoTalkNet.py",
            "--videoName", video_name,
            "--videoFolder", video_folder,
            "--pretrainModel", "pretrain_TalkSet.model",
            "--nDataLoaderThread", "32",
            "--facedetScale", str(face_det_scale),
            "--minTrack", str(min_track),
            "--numFailedDet", str(num_failed_det),
            "--minFaceSize", str(min_face_size),
            "--cropScale", str(crop_scale),
            "--start", str(start),
            "--duration", str(duration),
        ]
        result = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        print(f"Command output: {result.stdout.decode()}")
        if result.stderr:
            print(f"Command errors: {result.stderr.decode()}")

    def _load_results(self, video_folder):
        """Load face tracks and scores from the newest pywork folder.

        Raises:
            RuntimeError: if no pywork folder exists, i.e. demoTalkNet.py
                did not run to completion.
        """
        pywork_folders = glob.glob(os.path.join(video_folder, "*", "pywork"))
        if not pywork_folders:
            raise RuntimeError(
                "No pywork folder was produced; demoTalkNet.py likely "
                "failed — see the command output above."
            )
        latest_pywork_folder = max(pywork_folders, key=os.path.getctime)

        # Pickle files are written by demoTalkNet.py (trusted local output,
        # not untrusted user input).
        with open(os.path.join(latest_pywork_folder, "tracks.pckl"), "rb") as f:
            face_tracks = pickle.load(f)  # list of track dicts
        with open(os.path.join(latest_pywork_folder, "scores.pckl"), "rb") as f:
            scores = pickle.load(f)  # list of per-track score sequences
        return face_tracks, scores

    def _build_output_data(
        self, face_tracks, scores, video_width, video_height, as_percentages
    ):
        """Convert raw tracks/scores into a list of per-frame dicts.

        Each entry has the shape
        ``{"frame_number": int, "faces": [{"face_id", "x0", "y0", "x1",
        "y1", "speaking"}, ...]}`` and entries appear in first-seen frame
        order (same as the original linear-scan implementation).
        """
        output_data = []
        # Index frame dicts by frame number for O(1) lookup; scanning
        # output_data linearly for every box was O(n^2) in total box count.
        frames_by_number = {}

        for track_idx, track in enumerate(face_tracks):
            frames = track["track"]["frame"]
            boxes = track["proc_track"]
            # If the track index is out of range of the scores, treat every
            # frame of this track as not speaking.
            speaking_scores = scores[track_idx] if track_idx < len(scores) else []

            # proc_track arrays may be shorter than the frame list; stop at
            # the shortest so indexing stays in range.
            n_boxes = min(len(boxes["x"]), len(boxes["y"]), len(boxes["s"]))

            for i, frame in enumerate(frames):
                if i >= n_boxes:
                    break

                # The stored box is (center x, center y, half-size s);
                # convert to corner coordinates.
                x0 = int(boxes["x"][i] - boxes["s"][i])
                y0 = int(boxes["y"][i] - boxes["s"][i])
                x1 = int(boxes["x"][i] + boxes["s"][i])
                y1 = int(boxes["y"][i] + boxes["s"][i])

                if as_percentages:
                    x0 /= video_width
                    y0 /= video_height
                    x1 /= video_width
                    y1 /= video_height

                # A non-negative TalkNet score means "speaking" on this frame.
                speaking = (
                    bool(speaking_scores[i] >= 0)
                    if i < len(speaking_scores)
                    else False
                )

                box = {
                    "face_id": track_idx,
                    "x0": x0,
                    "y0": y0,
                    "x1": x1,
                    "y1": y1,
                    "speaking": speaking,
                }

                frame_number = int(frame)
                frame_data = frames_by_number.get(frame_number)
                if frame_data is None:
                    frame_data = {"frame_number": frame_number, "faces": []}
                    frames_by_number[frame_number] = frame_data
                    output_data.append(frame_data)
                frame_data["faces"].append(box)

        return output_data

    def _collect_mp4s(self, video_folder):
        """Convert the annotated .avi outputs to .mp4 and return their paths.

        Intermediate files (video.avi, video_only.avi) are skipped; a
        conversion that fails is silently dropped from the result, matching
        the original best-effort behavior.
        """
        mp4_files = []
        excluded_files = {"video_only.avi", "video.avi"}
        for avi_file in Path(video_folder).rglob("*.avi"):
            if avi_file.name in excluded_files:
                continue
            mp4_file = avi_file.with_suffix(".mp4")
            # argv list + shell=False: safe against spaces/metacharacters
            # in the generated file names.
            conversion = subprocess.run(
                ["ffmpeg", "-i", str(avi_file), str(mp4_file)],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if conversion.returncode == 0:
                mp4_files.append(Path(mp4_file))
        return mp4_files