| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| import argparse |
| import os |
| import pickle |
| import time |
|
|
| try: |
| import librosa |
| import requests |
| import requests_oauthlib |
| from joblib import Parallel, delayed |
| from oauthlib.oauth2 import TokenExpiredError |
| except (ModuleNotFoundError, ImportError) as e: |
| raise e |
|
|
| try: |
| import freesound |
| except ModuleNotFoundError as e: |
| raise ModuleNotFoundError( |
| "freesound is not installed. Execute `pip install --no-cache-dir git+https://github.com/MTG/freesound-python.git` in terminal" |
| ) |
|
|
|
|
| """ |
| Instructions |
| 1. We will need some requirements including freesound, requests, requests_oauthlib, joblib, librosa and sox. If they are not installed, please run `pip install -r freesound_requirements.txt` |
| 2. Create an API key for freesound.org at https://freesound.org/help/developers/ |
| 3. Create a python file called `freesound_private_apikey.py` and add the lines `api_key = <your Freesound api key>` and `client_id = <your Freesound client id>` |
| 4. Authorize by running `python freesound_download.py --authorize`, visiting the printed website, and pasting the response code |
| 5. Feel free to change any arguments in download_resample_freesound.sh such as max_samples and max_filesize |
| 6. Run `bash download_resample_freesound.sh <number of files you want> <download data directory> <resampled data directory>` |
| """ |
|
|
| |
| try: |
| from freesound_private_apikey import api_key, client_id |
|
|
| print("API Key found !") |
| except ImportError: |
| raise ImportError( |
| "Create a python file called `freesound_private_apikey.py` and add lined `api_key = <your Freesound api key>` and `client_id = <your Freesound client id>`" |
| ) |
|
|
# Freesound OAuth2 endpoints used by the authorization / token-refresh helpers below,
# together with the permission scopes requested for the session.
auth_url = "https://freesound.org/apiv2/oauth2/authorize/"
redirect_url = "https://freesound.org/home/app_permissions/permission_granted/"
token_url = "https://freesound.org/apiv2/oauth2/access_token/"
scope = ["read", "write"]
|
|
# Category names grouped as "background" sounds.
# NOTE(review): the labels look like AudioSet ontology names and are presumably
# passed one at a time via `--category` — confirm against the calling shell script.
BACKGROUND_CLASSES = [
    # Noise, ambience and acoustic environments
    "Air brake",
    "Static",
    "Acoustic environment",
    "Distortion",
    "Tape hiss",
    "Hubbub",
    "Vibration",
    "Cacophony",
    "Throbbing",
    "Reverberation",
    "Inside, public space",
    "Inside, small room",
    "Echo",
    "Outside, rural",
    "Outside, natural",
    "Outside, urban",
    "Outside, manmade",
    # Road vehicles and traffic
    "Car",
    "Bus",
    "Traffic noise",
    "Roadway noise",
    "Truck",
    "Emergency vehicle",
    "Motorcycle",
    # Aircraft
    "Aircraft engine",
    "Aircraft",
    "Helicopter",
    # Small wheeled vehicles
    "Bicycle",
    "Skateboard",
    # Rail
    "Subway, metro, underground",
    "Railroad car",
    "Train wagon",
    "Train",
    # Watercraft
    "Sailboat",
    "Rowboat",
    "Ship",
]
|
|
# Category names grouped as human-voice ("speech") sounds.
# NOTE(review): the labels look like AudioSet ontology names and are presumably
# passed one at a time via `--category` — confirm against the calling shell script.
SPEECH_CLASSES = [
    # Spoken voice
    "Male speech",
    "Female speech",
    "Speech synthesizer",
    "Babbling",
    "Conversation",
    "Child speech",
    "Narration",
    # Non-verbal vocal sounds
    "Laughter",
    "Yawn",
    "Whispering",
    "Whimper",
    "Baby cry",
    "Sigh",
    "Groan",
    "Humming",
    # Singing and shouting
    "Male singing",
    "Female singing",
    "Child singing",
    "Children shouting",
]
|
|
|
|
def initialize_oauth():
    """
    Build an OAuth2 session for Freesound, running the interactive flow if needed.

    When `_token.pkl` already exists, the cached token is loaded and attached to
    a new session. Otherwise the user is asked to open the authorization URL and
    paste back the response code, which is exchanged for a token that is then
    cached in `_token.pkl`.

    Returns:
        A tuple `(oauth, token)`: the OAuth2 session and the token it uses.
    """
    if not os.path.exists('_token.pkl'):
        # No cached token yet: walk the user through the OAuth2 authorization flow.
        oauth = requests_oauthlib.OAuth2Session(client_id, redirect_uri=redirect_url, scope=scope)

        # The returned state value is not needed for the copy/paste flow.
        authorization_url, _state = oauth.authorization_url(auth_url)
        print(f"Visit below website and paste access token below : \n\n{authorization_url}\n")

        response_code = input("Paste authorization response code here :\n")

        token = oauth.fetch_token(
            token_url, authorization_response=response_code, code=response_code, client_secret=api_key,
        )

        # Cache the freshly issued token so later runs can skip the flow.
        pickle_object(token, '_token')
        return oauth, token

    # A cached token exists: reuse it.
    token = unpickle_object('_token')
    oauth = requests_oauthlib.OAuth2Session(client_id, redirect_uri=redirect_url, scope=scope, token=token)
    return oauth, token
|
|
|
|
def instantiate_session():
    """
    Create an OAuth2 session from the token currently cached in `_token.pkl`.

    The token is re-read from disk on every call so that a token refreshed by
    another call is picked up.

    Returns:
        A `requests_oauthlib.OAuth2Session` with a single-connection pool adapter
        mounted for both `http://` and `https://`.

    Raises:
        FileNotFoundError: if the cached token file does not exist.
    """
    # Reload the token to get the latest refreshed token
    token = unpickle_object('_token')
    session = requests_oauthlib.OAuth2Session(client_id, redirect_uri=redirect_url, scope=scope, token=token)

    adapter = requests.adapters.HTTPAdapter(pool_connections=1, pool_maxsize=1)
    # The Freesound endpoints used by this script are all https://, so mounting
    # the adapter only for http:// would leave it unused for real requests.
    for prefix in ('http://', 'https://'):
        session.mount(prefix, adapter)
    return session
|
|
|
|
def refresh_token(session):
    """
    Refresh the OAuth2 token held by `session` and cache the new token on disk.

    Args:
        session: object exposing `refresh_token(token_url, **kwargs)`
            (an OAuth2 session in this script).

    Returns:
        The same `session` object that was passed in.
    """
    print("Refreshing tokens...")
    # The client credentials are sent along with the refresh request.
    new_token = session.refresh_token(token_url, client_id=client_id, client_secret=api_key)
    print("Token refresh performed...")

    # Persist the refreshed token to `_token.pkl`.
    pickle_object(new_token, '_token')
    return session
|
|
|
|
def pickle_object(token, name):
    """
    Serialize `token` with pickle into the file `<name>.pkl`.

    Args:
        token: object to serialize.
        name: path of the target file without the `.pkl` extension.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        handle.write(pickle.dumps(token))
|
|
|
|
def unpickle_object(name):
    """
    Load and return the object pickled in the file `<name>.pkl`.

    Args:
        name: path of the pickle file without the `.pkl` extension.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: if `<name>.pkl` does not exist.
    """
    pickle_path = name + '.pkl'
    if not os.path.exists(pickle_path):
        raise FileNotFoundError('Token not found!')

    # NOTE: unpickling can execute arbitrary code; only load files written by this script.
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)
|
|
|
|
def is_resource_limited(e: 'freesound.FreesoundException'):
    """
    Test if the reason for a freesound exception was either rate limit
    or daily limit.

    If it was for either reason, sleep for an appropriate delay and return
    to try again.

    The check is a substring search on the exception's detail message:
    '2000' is treated as the daily limit and '60' as the rate limit
    (presumably the quota figures quoted in Freesound's throttling message —
    TODO confirm against the API documentation).

    Args:
        e: Freesound Exception object. Its `detail` attribute is expected to be
            a dict with a 'detail' entry; any other shape (missing attribute,
            missing key, plain string, None) is tolerated and searched as text.

    Returns:
        A boolean which describes whether the error was due to some
        api limit issue, or if it was some other reason.

        If false is returned, then the user should carefully check the cause
        and log it.
    """
    # Extract the message defensively: this runs inside an `except` handler, so a
    # KeyError/TypeError raised here would hide the original failure.
    detail = getattr(e, 'detail', None)
    if isinstance(detail, dict):
        detail = detail.get('detail', '')
    detail = '' if detail is None else str(detail)

    if '2000' in detail:
        # Daily limit: back off for a long time before retrying.
        print("Hit daily limit, sleeping for 20 minutes.")
        time.sleep(60 * 20)
        return True

    if '60' in detail:
        # Rate limit: a short pause is enough.
        print("Hit rate limit, sleeping for 1 minute.")
        time.sleep(60)
        return True

    return False
|
|
|
|
def prepare_client(client: freesound.FreesoundClient, token) -> freesound.FreesoundClient:
    """
    Attach the OAuth access token to a Freesound client.

    Args:
        client: Freesound client to configure (modified in place).
        token: mapping containing an 'access_token' entry.

    Returns:
        The same client object that was passed in.
    """
    access_token = token['access_token']
    client.set_token(access_token, auth_type='oauth')
    print("Client ready !")
    return client
|
|
|
|
def get_text_query_with_resource_limit_checks(
    client, query: str, filters: list, fields: str, page_size: int, max_attempts: int = 20
):
    """
    Performs a text query, checks for rate / api limits, and retries.

    On a Freesound exception caused by a rate/daily limit the query is retried
    (after the delay applied by `is_resource_limited`). Any other Freesound
    exception is printed and aborts the query. While retrying, the OAuth token
    is refreshed each time the remaining attempt count is a multiple of five.

    Args:
        client: FreesoundAPI client
        query: query string (either exact or inexact)
        filters: list of string filters
        fields: String of values to recover
        page_size: samples per page returned
        max_attempts: maximum number of times the query is attempted

    Returns:
        A tuple `(client, pages)`. `pages` is the search result, or None when
        the query failed or the attempts were exhausted.
    """
    pages = None
    attempts = max_attempts

    while pages is None:
        try:
            pages = client.text_search(
                query=query, filter=" ".join(filters), fields=fields, page_size=str(page_size),
            )

        except freesound.FreesoundException as e:
            # Most probably a rate limit or a request limit; the helper sleeps
            # for the appropriate amount of time when that is the case.
            if not is_resource_limited(e):
                # Failure was due to some other reason. Log it and give up.
                print(e.with_traceback(None))
                break

        if pages is not None:
            # Success: skip the retry bookkeeping so no token refresh or
            # failure message is triggered by a query that worked.
            break

        attempts -= 1

        if attempts <= 0:
            print(f"Failed to query pages for '{query}' after {max_attempts} attempts, skipping query")
            break

        # Attempt to refresh tokens if it fails multiple times
        if attempts % 5 == 0:
            session = instantiate_session()
            try:
                refresh_token(session)
            finally:
                # Always release the session, even if the refresh itself fails.
                session.close()
            client = prepare_client(client, unpickle_object('_token'))

        print(f"Query attempts remaining = {attempts}")

    return client, pages
|
|
|
|
def get_resource_with_auto_refresh(session, download_url):
    """
    Attempts download of audio with a token refresh if necessary.

    Args:
        session: session object used to issue the GET request.
        download_url: URL of the resource to download.

    Returns:
        The raw response content, or None when the request raised an
        exception other than `TokenExpiredError` (the exception is printed).
    """
    try:
        result = session.get(download_url)

    except TokenExpiredError:
        # Token expired mid-run: refresh it once and retry the request.
        session = refresh_token(session)
        result = session.get(download_url)

    except Exception as e:
        print(f"Skipping file {download_url} due to exception below\n\n")
        print(e)
        # Signal failure explicitly. Dereferencing `.content` on a missing
        # response would raise AttributeError and bypass the caller's retries.
        return None

    return result.content
|
|
|
|
def download_song(basepath, id, name, download_url):
    """
    Download one sound into `basepath`, skipping files that are already present.

    The file name is derived from `name`: non-ASCII characters and a fixed set
    of punctuation characters are replaced by '-', quotes and spaces are
    removed, the last four characters are replaced by '.wav', and the result is
    prefixed with `id_<id>_`. A pre-existing file that cannot be loaded with
    librosa is deleted and downloaded again.

    Args:
        basepath: directory in which the file is stored.
        id: identifier of the sound, used as the file name prefix.
        name: original name of the sound.
        download_url: URL the audio is fetched from.
    """
    # Non-ASCII characters become '?' here and are mapped to '-' just below.
    ascii_name = name.encode('ascii', 'replace').decode()
    cleanup_table = str.maketrans(
        {
            "?": "-",
            ":": "-",
            "(": "-",
            ")": "-",
            "'": None,
            ",": "-",
            "/": "-",
            "\\": "-",
            ".": "-",
            " ": None,
        }
    )
    ascii_name = ascii_name.translate(cleanup_table)

    # Swap the trailing four characters for '.wav' and prefix the sound id.
    filename = f"id_{id}_{ascii_name[:-4]}.wav"
    fp = os.path.join(basepath, filename)

    # Validate a file left over from an earlier run; if it cannot be loaded,
    # remove it so that it is downloaded again below.
    if os.path.exists(fp):
        try:
            librosa.load(path=fp)
        except Exception:
            os.remove(fp)
            print(f"Pre-existing file {fp} was corrupt and was deleted, will be re-downloaded.")

    if os.path.exists(fp):
        print(f"File [{fp}] already exists in dataset, skipping re-download.")
        return

    print("Downloading file :", filename)
    session = instantiate_session()

    payload = None
    remaining = 10

    try:
        while payload is None:
            try:
                # Get the sound data
                payload = get_resource_with_auto_refresh(session, download_url)
            except freesound.FreesoundException as e:
                # Retry only when the failure was a rate / daily limit;
                # anything else is logged and ends the attempts.
                if not is_resource_limited(e):
                    print(e)
                    break

            remaining -= 1

            if remaining <= 0:
                print(f"Failed to download file {fp} after 10 attempts, skipping file")
                break

            if payload is None:
                print(f"Download attempts remaining = {remaining}")
    finally:
        session.close()

    if payload is None:
        print(f"File [{fp}] corrupted or faced some issue when downloading, skipped.")
    else:
        print("Downloaded file :", filename)

        with open(fp, 'wb') as out_file:
            out_file.write(payload)

        # Files of 89 bytes or fewer are treated as corrupt and removed.
        if os.path.getsize(fp) > 89:
            print(f"File written : {fp}")
        else:
            os.remove(fp)
            print(f"File corrupted and has been deleted: {fp}")

    # Pause between downloads to avoid hitting rate limits.
    time.sleep(5)
|
|
|
|
def get_songs_by_category(
    client: freesound.FreesoundClient,
    category: str,
    data_dir: str,
    max_num_samples=100,
    page_size=100,
    min_filesize_in_mb=0,
    max_filesize_in_mb=10,
    n_jobs=None,
):
    """
    Download songs of a category with restrictions

    The category is first searched as an exact (quoted) phrase; if that yields
    no results, the unquoted category is searched instead. Only wav/flac sounds
    with an "Attribution" or "Creative Commons 0" license inside the requested
    file size range are considered. The id and license of every selected sound
    are written to `<data_dir>/<category>/licenses.txt` before the downloads run.

    Args:
        client: FreesoundAPI client
        category: category to be downloaded
        data_dir: directory of downloaded songs
        max_num_samples: maximum number of samples of this category
            (a negative value means up to one million)
        page_size: samples per page returned (capped at 150)
        min_filesize_in_mb: minimum filesize of the song in MB
        max_filesize_in_mb: maximum filesize of the song in MB
        n_jobs: number of jobs for parallel processing; when None, one job
            per selected sound is used

    Returns:
        None
    """
    # Exact-phrase search first.
    query = f'"{category}"'
    print(f"Query : {query}")

    page_size = min(page_size, 150)

    # The filesize filter is built from byte counts; a minimum of 0 MB becomes 1 byte.
    max_filesize = int(max_filesize_in_mb * (2 ** 20))
    if min_filesize_in_mb == 0:
        min_filesize = 1
    else:
        min_filesize = int(min_filesize_in_mb * (2 ** 20))

    if max_num_samples < 0:
        max_num_samples = int(1e6)

    filters = [
        'type:(wav OR flac)',
        'license:("Attribution" OR "Creative Commons 0")',
        f'filesize:[{min_filesize} TO {max_filesize}]',
    ]

    fields = "id,name,download,license"

    client, pages = get_text_query_with_resource_limit_checks(
        client, query=query, filters=filters, fields=fields, page_size=page_size
    )

    if pages is None:
        print(f"Number of attempts exceeded limit, skipping query {query}")
        return

    # `count` is the number of matching sounds, not the number of pages.
    num_results = pages.count

    # Fall back to the unquoted category when the exact phrase matched nothing.
    if num_results == 0:
        print(f"Found 0 samples of results for query '{query}'")
        print(f"Trying less restricted query : {category}")

        # Track the query actually issued so the messages below report it.
        query = category
        client, pages = get_text_query_with_resource_limit_checks(
            client, query=query, filters=filters, fields=fields, page_size=page_size
        )

        if pages is None:
            print(f"Number of attempts exceeded limit, skipping query {query}")
            return

        num_results = pages.count

    print(f"Found {num_results} samples of results for query '{query}'")

    category = category.replace(' ', '_')
    basepath = os.path.join(data_dir, category)
    os.makedirs(basepath, exist_ok=True)

    sounds = []
    sample_count = 0

    # Record the license of every selected sound.
    with open(os.path.join(basepath, 'licenses.txt'), 'w') as f:
        f.write("ID,LICENSE\n")
        f.flush()

        while True:
            for sound in pages:
                if sample_count >= max_num_samples:
                    print(
                        f"Collected {sample_count} samples, which is >= max number of samples requested "
                        f"{max_num_samples}. Stopping for this category : {category}"
                    )
                    break

                sounds.append(sound)
                sample_count += 1

                f.write(f"{sound.id},{sound.license}\n")
                f.flush()

            if sample_count >= max_num_samples:
                break

            try:
                pages = pages.next_page()
            except ValueError:
                # ValueError is treated as "no further page".
                break

    if n_jobs is None:
        n_jobs = max(1, len(sounds))

    # Download the selected sounds in parallel.
    with Parallel(n_jobs=n_jobs, verbose=10) as parallel:
        _ = parallel(delayed(download_song)(basepath, sound.id, sound.name, sound.download) for sound in sounds)
|
|
|
|
if __name__ == '__main__':
    # Command line entry point: either run the one-off OAuth2 authorization
    # (`--authorize`) or download one category of sounds into `--data_dir`.
    parser = argparse.ArgumentParser(description="Freesound download script")

    parser.add_argument(
        '--authorize', action='store_true', dest='auth', help='Flag to only perform OAuth2 authorization step'
    )

    parser.add_argument('-c', '--category', default='', type=str, help='Category required to download')

    parser.add_argument('-d', '--data_dir', default='', type=str, help='Destination folder to store data')

    parser.add_argument('--page_size', default=100, type=int, help='Number of sounds per page')

    parser.add_argument('--max_samples', default=100, type=int, help='Maximum number of sound samples')

    parser.add_argument('--min_filesize', default=0, type=int, help='Minimum filesize allowed (in MB)')

    parser.add_argument('--max_filesize', default=20, type=int, help='Maximum filesize allowed (in MB)')

    parser.set_defaults(auth=False)

    args = parser.parse_args()

    if args.auth:
        # Initialize oauth token to be used by all
        oauth, token = initialize_oauth()
        oauth.close()

        print("Authentication succeeded ! Token stored in `_token.pkl`")
        # `exit` is injected by the `site` module and may be absent; SystemExit always works.
        raise SystemExit(0)

    if not os.path.exists('_token.pkl'):
        raise FileNotFoundError(
            "Please authorize the application first using " "`python freesound_download.py --authorize`"
        )
    if args.data_dir == '':
        raise ValueError("Data dir must be passed as an argument using `--data_dir`")

    # Validate the category before any client setup so bad invocations fail fast.
    category = args.category
    if category == '':
        raise ValueError("Cannot pass empty string as it will select all of FreeSound data !")

    data_dir = args.data_dir

    page_size = args.page_size
    max_num_samples = args.max_samples
    min_filesize_in_mb = args.min_filesize
    max_filesize_in_mb = args.max_filesize

    # Initialize and authenticate client
    token = unpickle_object('_token')
    freesound_client = freesound.FreesoundClient()
    client = prepare_client(freesound_client, token)

    print(f"Downloading category : {category}")
    get_songs_by_category(
        client,
        category,
        data_dir=data_dir,
        max_num_samples=max_num_samples,
        page_size=page_size,
        min_filesize_in_mb=min_filesize_in_mb,
        max_filesize_in_mb=max_filesize_in_mb,
        n_jobs=30,
    )
|
|