Spaces:

AIDetect-benchmarked
/

Deepfake-Detector

Sleeping

File size: 8,421 Bytes

d670799

# ------------------------------------------------------------------------------
# Adapted from https://github.com/activitynet/ActivityNet/
# Original licence: Copyright (c) Microsoft, under the MIT License.
# ------------------------------------------------------------------------------
import argparse
import glob
import json
import os
import shutil
import ssl
import subprocess
import uuid
from collections import OrderedDict

import pandas as pd
from joblib import Parallel, delayed

# Replace the ssl module's default HTTPS context factory with the unverified
# one, so HTTPS connections that rely on the default context skip certificate
# verification for the whole process.
# NOTE(review): this is a process-wide side effect at import time and weakens
# transport security -- confirm it is still required.
ssl._create_default_https_context = ssl._create_unverified_context


def create_video_folders(dataset, output_dir, tmp_dir):
    """Create the output, temporary and per-label directories.

    arguments:
    ---------
    dataset: DataFrame
        Annotations. When it has a 'label-name' column, one sub-directory of
        ``output_dir`` is created per unique label.
    output_dir: str
        Root directory where the clips will be stored.
    tmp_dir: str
        Scratch directory used while downloading.

    returns:
    -------
    str or dict
        For datasets without a 'label-name' column, the path of the single
        ``<output_dir>/test`` directory. Otherwise a dict mapping each label
        name to its directory.
    """
    # Both directories are needed whatever the dataset layout is; creating
    # them up front means the unlabeled (test) case also gets its tmp_dir.
    # exist_ok avoids the check-then-create race between parallel runs.
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    if 'label-name' not in dataset.columns:
        # Unlabeled data: everything goes into one 'test' folder, returned as
        # a plain path rather than a mapping.
        this_dir = os.path.join(output_dir, 'test')
        os.makedirs(this_dir, exist_ok=True)
        return this_dir

    label_to_dir = {}
    for label_name in dataset['label-name'].unique():
        this_dir = os.path.join(output_dir, label_name)
        os.makedirs(this_dir, exist_ok=True)
        label_to_dir[label_name] = this_dir
    return label_to_dir


def construct_video_filename(row, label_to_dir, trim_format='%06d'):
    """Build the output path of the trimmed clip described by ``row``.

    The file is named ``<video-id>_<start>_<end>.mp4`` where both times are
    rendered with ``trim_format``. ``label_to_dir`` is either a dict mapping
    the row's 'label-name' to a directory, or a single directory path used
    for every row.
    """
    video_id = row['video-id']
    start_tag = trim_format % row['start-time']
    end_tag = trim_format % row['end-time']
    clip_name = '_'.join([str(video_id), start_tag, end_tag]) + '.mp4'

    if isinstance(label_to_dir, dict):
        target_dir = label_to_dir[row['label-name']]
    else:
        # A bare path means all clips share one directory.
        target_dir = label_to_dir
    return os.path.join(target_dir, clip_name)


def _output_to_text(output):
    """Turn captured subprocess output into text usable in the JSON report."""
    if isinstance(output, bytes):
        return output.decode('utf-8', errors='replace')
    return '' if output is None else str(output)


def download_clip(video_identifier,
                  output_filename,
                  start_time,
                  end_time,
                  tmp_dir='/tmp/kinetics/.tmp_dir',
                  num_attempts=5,
                  url_base='https://www.youtube.com/watch?v='):
    """Download a video from youtube if exists and is not blocked.

    The full video is fetched with ``youtube-dl`` into ``tmp_dir`` and then
    trimmed and re-encoded with ``ffmpeg`` into ``output_filename``. The
    temporary download is always removed before returning.

    arguments:
    ---------
    video_identifier: str
        Unique YouTube video identifier (11 characters)
    output_filename: str
        File path where the video will be stored.
    start_time: float
        Indicates the beginning time in seconds from where the video
        will be trimmed.
    end_time: float
        Indicates the ending time in seconds of the trimmed video.
    tmp_dir: str
        Directory that receives the untrimmed download.
    num_attempts: int
        Number of failed youtube-dl runs after which the download is
        abandoned.
    url_base: str
        Prefix that is joined with ``video_identifier`` to form the URL.

    returns:
    -------
    (status, log): tuple(bool, str)
        ``status`` tells whether ``output_filename`` exists afterwards.
        ``log`` is 'Downloaded' on the success path, otherwise the text
        output of the failing tool.
    """
    # Defensive argument checking.
    assert isinstance(video_identifier, str), 'video_identifier must be string'
    assert isinstance(output_filename, str), 'output_filename must be string'
    assert len(video_identifier) == 11, 'video_identifier must have length 11'

    if os.path.exists(output_filename):
        # Nothing was downloaded, so there is no temporary file to clean up.
        return True, 'Downloaded'

    # Keep the extension-less stem separately: youtube-dl expands the
    # '%(ext)s' placeholder itself and the stem is what we glob for later.
    # (Splitting the full path on '.' would break on dotted directories
    # such as the default '.tmp_dir'.)
    tmp_stem = os.path.join(tmp_dir, str(uuid.uuid4()))
    tmp_pattern = glob.escape(tmp_stem) + '.*'

    # Argument lists (no shell) so identifiers and paths coming from the
    # annotation file cannot be interpreted by a shell.
    download_cmd = [
        'youtube-dl', '--quiet', '--no-warnings',
        '--no-check-certificate', '-f', 'mp4', '-o',
        tmp_stem + '.%(ext)s',
        url_base + video_identifier,
    ]
    print(' '.join(download_cmd))

    try:
        attempts = 0
        while True:
            try:
                subprocess.check_output(download_cmd,
                                        stderr=subprocess.STDOUT)
            except subprocess.CalledProcessError as err:
                attempts += 1
                # '>=' so a non-positive num_attempts cannot loop forever.
                if attempts >= num_attempts:
                    return False, _output_to_text(err.output)
            except OSError as err:
                # The executable is missing or not runnable; retrying
                # cannot help.
                return False, str(err)
            else:
                break

        candidates = sorted(glob.glob(tmp_pattern))
        if not candidates:
            return False, 'youtube-dl did not produce an output file'

        # Construct command to trim the videos (ffmpeg required).
        trim_cmd = [
            'ffmpeg', '-i', candidates[0],
            '-ss', str(start_time),
            '-t', str(end_time - start_time),
            '-c:v', 'libx264', '-c:a', 'copy',
            '-threads', '1', '-loglevel', 'panic',
            output_filename,
        ]
        try:
            subprocess.check_output(trim_cmd, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as err:
            return False, _output_to_text(err.output)
        except OSError as err:
            return False, str(err)
    finally:
        # Remove whatever the download left behind, on success and failure.
        for leftover in glob.glob(tmp_pattern):
            try:
                os.remove(leftover)
            except OSError:
                pass

    # Check if the video was successfully saved.
    return os.path.exists(output_filename), 'Downloaded'


def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir):
    """Download the clip of one annotation row and report the outcome.

    Returns a ``(clip_id, status, log)`` tuple, where ``clip_id`` is the
    output file name without its '.mp4' suffix. Clips whose output file is
    already present are reported as ``(clip_id, True, 'Exists')`` without
    calling ``download_clip``.
    """
    target_path = construct_video_filename(row, label_to_dir, trim_format)
    clip_id = os.path.basename(target_path).split('.mp4')[0]

    if not os.path.exists(target_path):
        succeeded, message = download_clip(
            row['video-id'],
            target_path,
            row['start-time'],
            row['end-time'],
            tmp_dir=tmp_dir)
        return (clip_id, succeeded, message)

    return (clip_id, True, 'Exists')


def parse_kinetics_annotations(input_csv, ignore_is_cc=False):
    """Read an annotation CSV and normalise its column names.

    arguments:
    ---------
    input_csv: str
        Path to the CSV file. When it has a 'youtube_id' column, the columns
        'youtube_id', 'time_start', 'time_end' and 'label' are renamed;
        otherwise the file is returned exactly as read.
    ignore_is_cc: bool
        When set (and the file has a 'youtube_id' column), the last column
        is dropped -- presumably the 'is_cc' flag; verify against the CSV.

    returns:
    -------
    dataset: DataFrame
        Pandas with the renamed columns:
            'video-id', 'start-time', 'end-time', 'label-name'
        plus any other columns present in the file.
    """
    annotations = pd.read_csv(input_csv)
    if 'youtube_id' not in annotations.columns:
        return annotations

    renamed = annotations.rename(columns={
        'youtube_id': 'video-id',
        'time_start': 'start-time',
        'time_end': 'end-time',
        'label': 'label-name',
    })
    if not ignore_is_cc:
        return renamed

    kept_columns = renamed.columns.tolist()[:-1]
    return renamed.loc[:, kept_columns]


def _report_default(value):
    """JSON fallback for the download report: decode raw tool output."""
    if isinstance(value, bytes):
        return value.decode('utf-8', errors='replace')
    raise TypeError(
        'Object of type %s is not JSON serializable' % type(value).__name__)


def main(input_csv,
         output_dir,
         trim_format='%06d',
         num_jobs=24,
         tmp_dir='/tmp/kinetics'):
    """Download and trim every clip listed in ``input_csv``.

    arguments:
    ---------
    input_csv: str
        Annotation CSV, read with ``parse_kinetics_annotations``.
    output_dir: str
        Directory where the trimmed clips are stored.
    trim_format: str
        %-format applied to the start and end times in the clip file names.
    num_jobs: int
        Number of joblib workers; 1 runs the downloads in a plain loop.
    tmp_dir: str
        Parent of the '.tmp_dir' scratch directory, which is deleted once
        the downloads have finished.

    Writes the per-clip ``(clip_id, status, log)`` results to
    'download_report.json' in the current working directory.
    """
    tmp_dir = os.path.join(tmp_dir, '.tmp_dir')

    # Reading and parsing Kinetics.
    dataset = parse_kinetics_annotations(input_csv)

    # Creates folders where videos will be saved later.
    label_to_dir = create_video_folders(dataset, output_dir, tmp_dir)

    # Download all clips.
    try:
        if num_jobs == 1:
            status_list = [
                download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir)
                for _, row in dataset.iterrows()
            ]
        else:
            status_list = Parallel(n_jobs=num_jobs)(
                delayed(download_clip_wrapper)(
                    row, label_to_dir, trim_format, tmp_dir)
                for _, row in dataset.iterrows())
    finally:
        # Clean tmp dir even when a worker raised; a missing directory is
        # not an error.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # Save download report. Failure logs may be raw bytes captured from the
    # external tools, which json cannot encode without the fallback.
    with open('download_report.json', 'w') as fobj:
        json.dump(status_list, fobj, default=_report_default)


def build_arg_parser():
    """Return the command line parser of the download script.

    The destinations of the parsed options match the keyword arguments of
    ``main`` so the namespace can be forwarded with ``main(**vars(args))``.
    """
    description = 'Helper script for downloading and trimming kinetics videos.'
    p = argparse.ArgumentParser(description=description)
    p.add_argument(
        'input_csv',
        type=str,
        help=('CSV file containing the following format: '
              'YouTube Identifier,Start time,End time,Class label'))
    p.add_argument(
        'output_dir',
        type=str,
        help='Output directory where videos will be saved.')
    # argparse %-formats help strings, so literal percent signs must be
    # doubled; otherwise rendering the help fails.
    p.add_argument(
        '-f',
        '--trim-format',
        type=str,
        default='%06d',
        help=('This will be the format for the '
              'filename of trimmed videos: '
              'videoid_%%0xd(start_time)_%%0xd(end_time).mp4'))
    p.add_argument(
        '-n',
        '--num-jobs',
        type=int,
        default=24,
        help='Number of parallel download jobs.')
    p.add_argument(
        '-t',
        '--tmp-dir',
        type=str,
        default='/tmp/kinetics',
        help='Directory under which temporary downloads are stored.')
    return p


if __name__ == '__main__':
    main(**vars(build_arg_parser().parse_args()))