|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
import glob
|
|
|
import json
|
|
|
import os
|
|
|
import shutil
|
|
|
import ssl
|
|
|
import subprocess
|
|
|
import uuid
|
|
|
from collections import OrderedDict
|
|
|
|
|
|
import pandas as pd
|
|
|
from joblib import Parallel, delayed
|
|
|
|
|
|
# Swap the ssl module's default HTTPS context factory for the unverified one.
# NOTE(review): this is a process-wide monkeypatch of a private ssl hook and
# presumably disables certificate verification for HTTPS requests made from
# this process. The clips themselves are fetched by youtube-dl in a
# subprocess, so confirm whether this override is still required.
ssl._create_default_https_context = ssl._create_unverified_context
|
|
|
|
|
|
|
|
|
def create_video_folders(dataset, output_dir, tmp_dir):
    """Create the output and scratch directories needed for a download run.

    arguments:
    ---------
    dataset: DataFrame
        Parsed annotations. When it has a 'label-name' column, one
        sub-directory per unique label is created inside ``output_dir``.
    output_dir: str
        Root directory where the trimmed clips will be stored.
    tmp_dir: str
        Scratch directory used for the untrimmed downloads.

    returns:
    -------
    label_to_dir: dict or str
        A dict mapping each label name to its directory, or, when the
        dataset has no 'label-name' column, the path of the single
        ``<output_dir>/test`` directory.
    """
    # The scratch directory is needed for labelled and unlabelled data alike;
    # creating it up front keeps later cleanup of tmp_dir from failing.
    os.makedirs(tmp_dir, exist_ok=True)

    if 'label-name' not in dataset.columns:
        # Unlabelled annotations: every clip goes to one 'test' folder.
        test_dir = os.path.join(output_dir, 'test')
        os.makedirs(test_dir, exist_ok=True)
        return test_dir

    os.makedirs(output_dir, exist_ok=True)

    label_to_dir = {}
    for label_name in dataset['label-name'].unique():
        label_dir = os.path.join(output_dir, label_name)
        # exist_ok avoids the check-then-create race between parallel runs.
        os.makedirs(label_dir, exist_ok=True)
        label_to_dir[label_name] = label_dir
    return label_to_dir
|
|
|
|
|
|
|
|
|
def construct_video_filename(row, label_to_dir, trim_format='%06d'):
    """Build the output path of the trimmed clip described by ``row``.

    The file is named ``<video-id>_<start>_<end>.mp4``, with both times
    rendered through ``trim_format``. ``label_to_dir`` is either a dict that
    maps the row's 'label-name' to a directory, or a single directory used
    for every clip.
    """
    video_id = row['video-id']
    start_tag = trim_format % row['start-time']
    end_tag = trim_format % row['end-time']
    clip_name = '%s_%s_%s.mp4' % (video_id, start_tag, end_tag)

    if isinstance(label_to_dir, dict):
        target_dir = label_to_dir[row['label-name']]
    else:
        target_dir = label_to_dir
    return os.path.join(target_dir, clip_name)
|
|
|
|
|
|
|
|
|
def _decode_output(output):
    """Return captured subprocess output as text so it can be serialized."""
    if isinstance(output, bytes):
        return output.decode('utf-8', 'replace')
    return output


def _run_command(command):
    """Run ``command`` (an argument list, no shell); return (ok, output)."""
    try:
        subprocess.check_output(command, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
        return False, _decode_output(err.output)
    except OSError as err:
        # Raised when the executable is missing or cannot be started.
        return False, str(err)
    return True, ''


def download_clip(video_identifier,
                  output_filename,
                  start_time,
                  end_time,
                  tmp_dir='/tmp/kinetics/.tmp_dir',
                  num_attempts=5,
                  url_base='https://www.youtube.com/watch?v='):
    """Download a video from youtube if exists and is not blocked.

    The full video is fetched with ``youtube-dl`` into ``tmp_dir`` and then
    trimmed with ``ffmpeg`` into ``output_filename``. Both tools are invoked
    with argument lists (no shell), so identifiers and paths are never
    interpreted by a shell.

    arguments:
    ---------
    video_identifier: str
        Unique YouTube video identifier (11 characters)
    output_filename: str
        File path where the video will be stored.
    start_time: float
        Indicates the beginning time in seconds from where the video
        will be trimmed.
    end_time: float
        Indicates the ending time in seconds of the trimmed video.
    tmp_dir: str
        Directory where the untrimmed download is stored temporarily.
    num_attempts: int
        Maximum number of download attempts (at least one is always made).
    url_base: str
        URL prefix to which ``video_identifier`` is appended.

    returns:
    -------
    status: tuple
        ``(success, log)`` where ``success`` is a bool and ``log`` is
        'Exists' when the clip was already present, 'Downloaded' after the
        download and trim both ran, or the text of the failure otherwise.
    """
    assert isinstance(video_identifier, str), 'video_identifier must be string'
    assert isinstance(output_filename, str), 'output_filename must be string'
    assert len(video_identifier) == 11, 'video_identifier must have length 11'

    # Nothing to do (and no temporary file to clean up) if the clip is there.
    if os.path.exists(output_filename):
        return True, 'Exists'

    # Keep the extension-less stem separately: youtube-dl fills in the real
    # extension, and the stem is what locates the downloaded file afterwards.
    # Splitting the full path on '.' would break for dotted directories.
    tmp_stem = os.path.join(tmp_dir, str(uuid.uuid4()))
    command = [
        'youtube-dl', '--quiet', '--no-warnings',
        '--no-check-certificate', '-f', 'mp4', '-o',
        tmp_stem + '.%(ext)s',
        url_base + video_identifier
    ]
    print(' '.join(command))

    # for/else: the else branch runs only when no attempt succeeded.
    for _ in range(max(num_attempts, 1)):
        downloaded, log = _run_command(command)
        if downloaded:
            break
    else:
        return False, log

    candidates = glob.glob('%s.*' % tmp_stem)
    if not candidates:
        return False, 'Downloaded file not found in %s' % tmp_dir
    tmp_filename = candidates[0]

    command = [
        'ffmpeg', '-i',
        tmp_filename, '-ss',
        str(start_time), '-t',
        str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy',
        '-threads', '1', '-loglevel', 'panic',
        output_filename
    ]
    trimmed, log = _run_command(command)

    # Best-effort cleanup of the untrimmed download, whether or not the trim
    # succeeded; a missing file here is not worth failing the clip for.
    try:
        os.remove(tmp_filename)
    except OSError:
        pass

    if not trimmed:
        return False, log
    return os.path.exists(output_filename), 'Downloaded'
|
|
|
|
|
|
|
|
|
def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir):
    """Download one annotated clip and report its status.

    Returns a ``(clip_id, success, log)`` tuple, where ``clip_id`` is the
    output file name without its '.mp4' suffix. Clips whose output file is
    already present are reported as ``(clip_id, True, 'Exists')`` without
    downloading anything.
    """
    target_path = construct_video_filename(row, label_to_dir, trim_format)
    clip_id = os.path.basename(target_path).split('.mp4')[0]

    if not os.path.exists(target_path):
        succeeded, message = download_clip(
            row['video-id'],
            target_path,
            row['start-time'],
            row['end-time'],
            tmp_dir=tmp_dir)
        return (clip_id, succeeded, message)

    return (clip_id, True, 'Exists')
|
|
|
|
|
|
|
|
|
def parse_kinetics_annotations(input_csv, ignore_is_cc=False):
    """Returns a parsed DataFrame.

    arguments:
    ---------
    input_csv: str
        Path to CSV file. When it has a 'youtube_id' column, the columns
        'youtube_id', 'time_start', 'time_end' and 'label' are renamed to
        the names listed below; otherwise the columns are kept as they are.
    ignore_is_cc: bool
        When True, the 'is_cc' column is removed if the file has one.

    returns:
    -------
    dataset: DataFrame
        Pandas with the following columns:
            'video-id', 'start-time', 'end-time', 'label-name'
        plus any other column present in the file.
    """
    df = pd.read_csv(input_csv)
    if 'youtube_id' in df.columns:
        renamed = {
            'youtube_id': 'video-id',
            'time_start': 'start-time',
            'time_end': 'end-time',
            'label': 'label-name',
        }
        df = df.rename(columns=renamed)
    if ignore_is_cc and 'is_cc' in df.columns:
        # Drop the column by name: dropping "the last column" would discard
        # real data for files that do not end with 'is_cc'.
        df = df.loc[:, [col for col in df.columns if col != 'is_cc']]
    return df
|
|
|
|
|
|
|
|
|
def _jsonable(value):
    """Decode bytes (e.g. captured subprocess output) so json can store it."""
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    return value


def main(input_csv,
         output_dir,
         trim_format='%06d',
         num_jobs=24,
         tmp_dir='/tmp/kinetics'):
    """Download and trim every clip listed in ``input_csv``.

    arguments:
    ---------
    input_csv: str
        Path to the annotation CSV file.
    output_dir: str
        Directory where the trimmed clips are saved.
    trim_format: str
        %-format applied to the start and end times in the clip file names.
    num_jobs: int
        Number of parallel jobs; 1 runs the downloads sequentially in this
        process.
    tmp_dir: str
        Parent of the '.tmp_dir' scratch directory, which is removed once
        the run ends.

    The per-clip status list is written to 'download_report.json' in the
    current working directory.
    """
    tmp_dir = os.path.join(tmp_dir, '.tmp_dir')

    dataset = parse_kinetics_annotations(input_csv)

    label_to_dir = create_video_folders(dataset, output_dir, tmp_dir)

    try:
        if num_jobs == 1:
            status_list = [
                download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir)
                for _, row in dataset.iterrows()
            ]
        else:
            status_list = Parallel(
                n_jobs=num_jobs)(delayed(download_clip_wrapper)(
                    row, label_to_dir, trim_format, tmp_dir)
                                 for _, row in dataset.iterrows())
    finally:
        # The scratch directory may never have been created (nothing to
        # download); a failed cleanup must not cost us the report.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # Failure logs can be raw bytes, which json cannot serialize.
    report = [[_jsonable(field) for field in status]
              for status in status_list]
    with open('download_report.json', 'w') as fobj:
        fobj.write(json.dumps(report))
|
|
|
|
|
|
|
|
|
def build_arg_parser():
    """Build the command-line parser for this script.

    The parsed namespace has exactly the keyword arguments of ``main``:
    input_csv, output_dir, trim_format, num_jobs and tmp_dir.
    """
    description = 'Helper script for downloading and trimming kinetics videos.'
    p = argparse.ArgumentParser(description=description)
    p.add_argument(
        'input_csv',
        type=str,
        help=('CSV file containing the following format: '
              'YouTube Identifier,Start time,End time,Class label'))
    p.add_argument(
        'output_dir',
        type=str,
        help='Output directory where videos will be saved.')
    p.add_argument(
        '-f',
        '--trim-format',
        type=str,
        default='%06d',
        # argparse %-formats help strings, so a literal '%' must be written
        # as '%%'; unescaped, rendering the help text raises TypeError.
        help=('This will be the format for the '
              'filename of trimmed videos: '
              'videoid_%%0xd(start_time)_%%0xd(end_time).mp4'))
    p.add_argument('-n', '--num-jobs', type=int, default=24)
    p.add_argument('-t', '--tmp-dir', type=str, default='/tmp/kinetics')
    return p


if __name__ == '__main__':
    main(**vars(build_arg_parser().parse_args()))
|
|
|
|