# Copyright (c) OpenMMLab. All rights reserved.
# This script is copied from
# https://github.com/activitynet/ActivityNet/blob/master/Crawler/Kinetics/download.py # noqa: E501
# The code is licensed under the MIT licence.
import argparse
import os
import shlex
import ssl
import subprocess

import mmengine
from joblib import Parallel, delayed

# Make the default HTTPS context factory the unverified one, so certificate
# checks are skipped for contexts created through it.
ssl._create_default_https_context = ssl._create_unverified_context
data_file = '../../../data/ActivityNet'
output_dir = f'{data_file}/videos'


def parse_args():
    """Parse the command-line options of the downloader.

    returns:
    -------
    args: argparse.Namespace
        Namespace with a boolean ``bsn`` attribute (``--bsn`` flag).
    """
    parser = argparse.ArgumentParser(description='ActivityNet downloader')
    parser.add_argument(
        '--bsn',
        action='store_true',
        help='download for BSN annotation or official one')
    args = parser.parse_args()
    return args


def download_clip(video_identifier,
                  output_filename,
                  num_attempts=5,
                  url_base='https://www.youtube.com/watch?v='):
    """Download a video from youtube if exists and is not blocked.

    arguments:
    ---------
    video_identifier: str
        Unique YouTube video identifier (11 characters)
    output_filename: str
        File path where the video will be stored.
    num_attempts: int
        Number of failed ``youtube-dl`` invocations tolerated before giving
        up. Values below 1 behave like 1.
    url_base: str
        URL prefix to which ``video_identifier`` is appended.

    returns:
    -------
    (status, log): tuple
        ``(False, 'Fail')`` when every attempt failed; otherwise
        ``(exists, 'Downloaded')`` where ``exists`` tells whether
        ``output_filename`` is present on disk afterwards.
    """
    # Defensive argument checking.
    assert isinstance(video_identifier, str), 'video_identifier must be string'
    assert isinstance(output_filename, str), 'output_filename must be string'
    assert len(video_identifier) == 11, 'video_identifier must have length 11'

    if not os.path.exists(output_filename):
        # Pass the arguments as a list (no shell) so that special characters
        # in the output path or URL are never interpreted by a shell.
        command = [
            'youtube-dl', '--quiet', '--no-warnings',
            '--no-check-certificate', '-f', 'mp4', '-o', output_filename,
            url_base + video_identifier
        ]
        print(' '.join(shlex.quote(part) for part in command))
        attempts = 0
        while True:
            try:
                subprocess.check_output(command, stderr=subprocess.STDOUT)
            except (subprocess.CalledProcessError, OSError):
                # OSError covers a missing/non-executable `youtube-dl`,
                # which the shell used to report as a non-zero exit code.
                attempts += 1
                # `>=` guarantees termination even for num_attempts <= 0.
                if attempts >= num_attempts:
                    return False, 'Fail'
            else:
                break

    # Check if the video was successfully saved.
    status = os.path.exists(output_filename)
    return status, 'Downloaded'


def download_clip_wrapper(youtube_id, output_dir):
    """Wrapper for parallel processing purposes.

    arguments:
    ---------
    youtube_id: str
        YouTube identifier without the ``v_`` prefix.
    output_dir: str
        Directory in which ``v_<youtube_id>.mp4`` is stored.

    returns:
    -------
    status: tuple
        ``('v_' + youtube_id, downloaded, log)`` where ``log`` is
        ``'Exists'`` if the file was already present, otherwise the log
        string returned by ``download_clip``.
    """
    # we do this to align with names in annotations
    video_name = 'v_' + youtube_id
    output_filename = os.path.join(output_dir, video_name + '.mp4')
    if os.path.exists(output_filename):
        return (video_name, True, 'Exists')

    downloaded, log = download_clip(youtube_id, output_filename)
    return (video_name, downloaded, log)


def parse_activitynet_annotations(input_csv, is_bsn_case=False):
    """Returns a list of YoutubeID.

    arguments:
    ---------
    input_csv: str
        In the BSN case, path to CSV file containing the following columns:
          'video,numFrame,seconds,fps,rfps,subset,featureFrame'
        Otherwise, path to an annotation file loadable by ``mmengine.load``
        whose top-level ``'database'`` entry is keyed by YoutubeID.
    is_bsn_case: bool
        Selects which of the two input formats is parsed.

    returns:
    -------
    youtube_ids: list
        List of all YoutubeIDs in ActivityNet.
    """
    if is_bsn_case:
        with open(input_csv) as csv_file:
            # The first line is skipped (treated as the header row).
            lines = csv_file.readlines()[1:]
        # YoutubeIDs do not have prefix `v_`
        youtube_ids = [x.split(',')[0][2:] for x in lines if x.strip()]
    else:
        # Read from the path handed to this function rather than from a
        # module-level global that only exists when run as a script.
        data = mmengine.load(input_csv)['database']
        youtube_ids = list(data.keys())
    return youtube_ids


def main(input_csv, output_dir, anno_file, num_jobs=24, is_bsn_case=False):
    """Download every ActivityNet video and write a download report.

    arguments:
    ---------
    input_csv: str
        File listing the videos; see ``parse_activitynet_annotations``.
    output_dir: str
        Directory where the videos are saved (created if missing).
    anno_file: str
        Annotation file that is filtered and rewritten in the BSN case.
    num_jobs: int
        Number of parallel joblib workers; ``1`` downloads sequentially.
    is_bsn_case: bool
        Whether the BSN annotation layout is used.
    """
    # Reading and parsing ActivityNet.
    youtube_ids = parse_activitynet_annotations(input_csv, is_bsn_case)

    # Creates folders where videos will be saved later.
    os.makedirs(output_dir, exist_ok=True)

    # Download all clips.
    if num_jobs == 1:
        status_list = [
            download_clip_wrapper(index, output_dir) for index in youtube_ids
        ]
    else:
        status_list = Parallel(n_jobs=num_jobs)(
            delayed(download_clip_wrapper)(index, output_dir)
            for index in youtube_ids)

    # Save download report.
    mmengine.dump(status_list, 'download_report.json')

    # Only the BSN annotation is keyed by the `v_`-prefixed names used in
    # the report, and only in that case is the filtered result written back,
    # so the filtering is restricted to it.
    if is_bsn_case:
        annotation = mmengine.load(anno_file)
        downloaded = {status[0]: status[1] for status in status_list}
        # Entries missing from the report are treated as not downloaded
        # instead of aborting with a KeyError; the original file is kept
        # as a `_bak.json` backup below.
        annotation = {
            k: v
            for k, v in annotation.items() if downloaded.get(k, False)
        }
        anno_file_bak = anno_file.replace('.json', '_bak.json')
        os.rename(anno_file, anno_file_bak)
        mmengine.dump(annotation, anno_file)


if __name__ == '__main__':
    args = parse_args()
    is_bsn_case = args.bsn
    if is_bsn_case:
        video_list = f'{data_file}/video_info_new.csv'
        anno_file = f'{data_file}/anet_anno_action.json'
    else:
        video_list = f'{data_file}/activity_net.v1-3.min.json'
        anno_file = video_list
    main(video_list, output_dir, anno_file, 24, is_bsn_case)