Spaces:
Runtime error
Runtime error
| # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # downloads the training/eval set for VoxConverse. | |
| import argparse | |
| import logging | |
| import os | |
| import zipfile | |
| from pathlib import Path | |
| import wget | |
| from nemo.collections.asr.parts.utils.manifest_utils import create_manifest | |
| dev_url = "https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_dev_wav.zip" | |
| test_url = "https://www.robots.ox.ac.uk/~vgg/data/voxconverse/data/voxconverse_test_wav.zip" | |
| rttm_annotations_url = "https://github.com/joonson/voxconverse/archive/refs/heads/master.zip" | |
| def extract_file(filepath: Path, data_dir: Path): | |
| try: | |
| with zipfile.ZipFile(str(filepath), 'r') as zip_ref: | |
| zip_ref.extractall(str(data_dir)) | |
| except Exception: | |
| logging.info("Not extracting. Maybe already there?") | |
| def _generate_manifest(data_root: Path, audio_path: Path, rttm_path: Path, manifest_output_path: Path): | |
| audio_list = str(data_root / 'audio_file.txt') | |
| rttm_list = str(data_root / 'rttm_file.txt') | |
| with open(audio_list, 'w') as f: | |
| f.write('\n'.join([str(os.path.join(rttm_path, x)) for x in os.listdir(audio_path)])) | |
| with open(rttm_list, 'w') as f: | |
| f.write('\n'.join([str(os.path.join(rttm_path, x)) for x in os.listdir(rttm_path)])) | |
| create_manifest( | |
| audio_list, str(manifest_output_path), rttm_path=rttm_list, | |
| ) | |
| def main(): | |
| parser = argparse.ArgumentParser(description="VoxConverse Data download") | |
| parser.add_argument("--data_root", default='./', type=str) | |
| args = parser.parse_args() | |
| data_root = Path(args.data_root) | |
| data_root.mkdir(exist_ok=True, parents=True) | |
| test_path = data_root / os.path.basename(test_url) | |
| dev_path = data_root / os.path.basename(dev_url) | |
| rttm_path = data_root / os.path.basename(rttm_annotations_url) | |
| if not os.path.exists(test_path): | |
| test_path = wget.download(test_url, str(data_root)) | |
| if not os.path.exists(dev_path): | |
| dev_path = wget.download(dev_url, str(data_root)) | |
| if not os.path.exists(rttm_path): | |
| rttm_path = wget.download(rttm_annotations_url, str(data_root)) | |
| extract_file(test_path, data_root / 'test/') | |
| extract_file(dev_path, data_root / 'dev/') | |
| extract_file(rttm_path, data_root) | |
| _generate_manifest( | |
| data_root=data_root, | |
| audio_path=os.path.abspath(data_root / 'test/voxconverse_test_wav/'), | |
| rttm_path=os.path.abspath(data_root / 'voxconverse-master/test/'), | |
| manifest_output_path=data_root / 'test_manifest.json', | |
| ) | |
| _generate_manifest( | |
| data_root=data_root, | |
| audio_path=os.path.abspath(data_root / 'dev/audio/'), | |
| rttm_path=os.path.abspath(data_root / 'voxconverse-master/dev/'), | |
| manifest_output_path=data_root / 'dev_manifest.json', | |
| ) | |
| if __name__ == "__main__": | |
| main() | |