Spaces:

AIDetect-benchmarked
/

Deepfake-Detector

Sleeping

App Files Files Community

Deepfake-Detector / tools /data /kinetics /download.py

AZIIIIIIIIZ

Upload 1039 files

d670799 verified 3 months ago

raw

history blame contribute delete

8.42 kB

	# ------------------------------------------------------------------------------
	# Adapted from https://github.com/activitynet/ActivityNet/
	# Original licence: Copyright (c) Microsoft, under the MIT License.
	# ------------------------------------------------------------------------------
	import argparse
	import glob
	import json
	import os
	import shutil
	import ssl
	import subprocess
	import uuid
	from collections import OrderedDict

	import pandas as pd
	from joblib import Parallel, delayed

	ssl._create_default_https_context = ssl._create_unverified_context


	def create_video_folders(dataset, output_dir, tmp_dir):
	"""Creates a directory for each label name in the dataset."""
	if 'label-name' not in dataset.columns:
	this_dir = os.path.join(output_dir, 'test')
	if not os.path.exists(this_dir):
	os.makedirs(this_dir)
	# I should return a dict but ...
	return this_dir
	if not os.path.exists(output_dir):
	os.makedirs(output_dir)
	if not os.path.exists(tmp_dir):
	os.makedirs(tmp_dir)

	label_to_dir = {}
	for label_name in dataset['label-name'].unique():
	this_dir = os.path.join(output_dir, label_name)
	if not os.path.exists(this_dir):
	os.makedirs(this_dir)
	label_to_dir[label_name] = this_dir
	return label_to_dir


	def construct_video_filename(row, label_to_dir, trim_format='%06d'):
	"""Given a dataset row, this function constructs the output filename for a
	given video."""
	basename = '%s_%s_%s.mp4' % (row['video-id'],
	trim_format % row['start-time'],
	trim_format % row['end-time'])
	if not isinstance(label_to_dir, dict):
	dirname = label_to_dir
	else:
	dirname = label_to_dir[row['label-name']]
	output_filename = os.path.join(dirname, basename)
	return output_filename


	def download_clip(video_identifier,
	output_filename,
	start_time,
	end_time,
	tmp_dir='/tmp/kinetics/.tmp_dir',
	num_attempts=5,
	url_base='https://www.youtube.com/watch?v='):
	"""Download a video from youtube if exists and is not blocked.
	arguments:
	---------
	video_identifier: str
	Unique YouTube video identifier (11 characters)
	output_filename: str
	File path where the video will be stored.
	start_time: float
	Indicates the beginning time in seconds from where the video
	will be trimmed.
	end_time: float
	Indicates the ending time in seconds of the trimmed video.
	"""
	# Defensive argument checking.
	assert isinstance(video_identifier, str), 'video_identifier must be string'
	assert isinstance(output_filename, str), 'output_filename must be string'
	assert len(video_identifier) == 11, 'video_identifier must have length 11'

	status = False
	# Construct command line for getting the direct video link.
	tmp_filename = os.path.join(tmp_dir, '%s.%%(ext)s' % uuid.uuid4())

	if not os.path.exists(output_filename):
	if not os.path.exists(tmp_filename):
	command = [
	'youtube-dl', '--quiet', '--no-warnings',
	'--no-check-certificate', '-f', 'mp4', '-o',
	'"%s"' % tmp_filename,
	'"%s"' % (url_base + video_identifier)
	]
	command = ' '.join(command)
	print(command)
	attempts = 0
	while True:
	try:
	subprocess.check_output(
	command, shell=True, stderr=subprocess.STDOUT)
	except subprocess.CalledProcessError as err:
	attempts += 1
	if attempts == num_attempts:
	return status, err.output
	else:
	break

	tmp_filename = glob.glob('%s*' % tmp_filename.split('.')[0])[0]
	# Construct command to trim the videos (ffmpeg required).
	command = [
	'ffmpeg', '-i',
	'"%s"' % tmp_filename, '-ss',
	str(start_time), '-t',
	str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy',
	'-threads', '1', '-loglevel', 'panic',
	'"%s"' % output_filename
	]
	command = ' '.join(command)
	try:
	subprocess.check_output(
	command, shell=True, stderr=subprocess.STDOUT)
	except subprocess.CalledProcessError as err:
	return status, err.output

	# Check if the video was successfully saved.
	status = os.path.exists(output_filename)
	os.remove(tmp_filename)
	return status, 'Downloaded'


	def download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir):
	"""Wrapper for parallel processing purposes."""
	output_filename = construct_video_filename(row, label_to_dir, trim_format)
	clip_id = os.path.basename(output_filename).split('.mp4')[0]
	if os.path.exists(output_filename):
	status = tuple([clip_id, True, 'Exists'])
	return status

	downloaded, log = download_clip(
	row['video-id'],
	output_filename,
	row['start-time'],
	row['end-time'],
	tmp_dir=tmp_dir)
	status = tuple([clip_id, downloaded, log])
	return status


	def parse_kinetics_annotations(input_csv, ignore_is_cc=False):
	"""Returns a parsed DataFrame.
	arguments:
	---------
	input_csv: str
	Path to CSV file containing the following columns:
	'YouTube Identifier,Start time,End time,Class label'
	returns:
	-------
	dataset: DataFrame
	Pandas with the following columns:
	'video-id', 'start-time', 'end-time', 'label-name'
	"""
	df = pd.read_csv(input_csv)
	if 'youtube_id' in df.columns:
	columns = OrderedDict([('youtube_id', 'video-id'),
	('time_start', 'start-time'),
	('time_end', 'end-time'),
	('label', 'label-name')])
	df.rename(columns=columns, inplace=True)
	if ignore_is_cc:
	df = df.loc[:, df.columns.tolist()[:-1]]
	return df


	def main(input_csv,
	output_dir,
	trim_format='%06d',
	num_jobs=24,
	tmp_dir='/tmp/kinetics'):
	tmp_dir = os.path.join(tmp_dir, '.tmp_dir')

	# Reading and parsing Kinetics.
	dataset = parse_kinetics_annotations(input_csv)

	# Creates folders where videos will be saved later.
	label_to_dir = create_video_folders(dataset, output_dir, tmp_dir)

	# Download all clips.
	if num_jobs == 1:
	status_list = []
	for _, row in dataset.iterrows():
	status_list.append(
	download_clip_wrapper(row, label_to_dir, trim_format, tmp_dir))
	else:
	status_list = Parallel(
	n_jobs=num_jobs)(delayed(download_clip_wrapper)(
	row, label_to_dir, trim_format, tmp_dir)
	for i, row in dataset.iterrows())

	# Clean tmp dir.
	shutil.rmtree(tmp_dir)

	# Save download report.
	with open('download_report.json', 'w') as fobj:
	fobj.write(json.dumps(status_list))


	if __name__ == '__main__':
	description = 'Helper script for downloading and trimming kinetics videos.'
	p = argparse.ArgumentParser(description=description)
	p.add_argument(
	'input_csv',
	type=str,
	help=('CSV file containing the following format: '
	'YouTube Identifier,Start time,End time,Class label'))
	p.add_argument(
	'output_dir',
	type=str,
	help='Output directory where videos will be saved.')
	p.add_argument(
	'-f',
	'--trim-format',
	type=str,
	default='%06d',
	help=('This will be the format for the '
	'filename of trimmed videos: '
	'videoid_%0xd(start_time)_%0xd(end_time).mp4'))
	p.add_argument('-n', '--num-jobs', type=int, default=24)
	p.add_argument('-t', '--tmp-dir', type=str, default='/tmp/kinetics')
	# help='CSV file of the previous version of Kinetics.')
	main(**vars(p.parse_args()))