| """ | |
| A script to download the InfoRE dataset and textgrid files. | |
| """ | |
| import shutil | |
| from pathlib import Path | |
| import pooch | |
| from pooch import Unzip | |
| from tqdm.cli import tqdm | |
| def download_infore_data(): | |
| """download infore wav files""" | |
| files = pooch.retrieve( | |
| url="https://huggingface.co/datasets/ntt123/infore/resolve/main/infore_16k_denoised.zip", | |
| known_hash="2445527b345fb0b1816ce3c8f09bae419d6bbe251f16d6c74d8dd95ef9fb0737", | |
| processor=Unzip(), | |
| progressbar=True, | |
| ) | |
| data_dir = Path(sorted(files)[0]).parent | |
| return data_dir | |
| def download_textgrid(): | |
| """download textgrid files""" | |
| files = pooch.retrieve( | |
| url="https://huggingface.co/datasets/ntt123/infore/resolve/main/infore_tg.zip", | |
| known_hash="26e4f53025220097ea95dc266657de8d65104b0a17a6ffba778fc016c8dd36d7", | |
| processor=Unzip(), | |
| progressbar=True, | |
| ) | |
| data_dir = Path(sorted(files)[0]).parent | |
| return data_dir | |
| DATA_ROOT = Path("./train_data") | |
| DATA_ROOT.mkdir(parents=True, exist_ok=True) | |
| wav_dir = download_infore_data() | |
| tg_dir = download_textgrid() | |
| for path in tqdm(tg_dir.glob("*.TextGrid")): | |
| wav_name = path.with_suffix(".wav").name | |
| wav_src = wav_dir / wav_name | |
| shutil.copy(path, DATA_ROOT) | |
| shutil.copy(wav_src, DATA_ROOT) | |