# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ # Task 1: Speech Command Recognition ## Preparing the dataset Use the `process_speech_commands_data.py` script under /scripts/dataset_processing in order to prepare the dataset. ```sh python /scripts/dataset_processing/process_speech_commands_data.py \ --data_root= \ --data_version= \ --class_split= \ --rebalance \ --log ``` ## Train to convergence ```sh python speech_to_label.py \ # (Optional: --config-path= --config-name=) \ model.train_ds.manifest_filepath="" \ model.validation_ds.manifest_filepath=["",""] \ trainer.devices=2 \ trainer.accelerator="gpu" \ strategy="ddp" \ trainer.max_epochs=200 \ exp_manager.create_wandb_logger=True \ exp_manager.wandb_logger_kwargs.name="MatchboxNet-3x1x64-v1" \ exp_manager.wandb_logger_kwargs.project="MatchboxNet-v1" \ +trainer.precision=16 \ +trainer.amp_level=O1 # needed if using PyTorch < 1.6 ``` # Task 2: Voice Activity Detection ## Preparing the dataset Use the `process_vad_data.py` script under /scripts/dataset_processing in order to prepare the dataset. ```sh python process_vad_data.py \ --out_dir= \ --speech_data_root= \ --background_data_root= \ --rebalance_method=<'under' or 'over' of 'fixed'> \ --log (Optional --demo (for demonstration in tutorial). If you want to use your own background noise data, make sure to delete --demo) ``` ## Train to convergence ```sh python speech_to_label.py \ --config-path= --config-name= \ model.train_ds.manifest_filepath="" \ model.validation_ds.manifest_filepath=["",""] \ trainer.devices=2 \ trainer.accelerator="gpu" \ strategy="ddp" \ trainer.max_epochs=200 \ exp_manager.create_wandb_logger=True \ exp_manager.wandb_logger_kwargs.name="MatchboxNet-3x1x64-vad" \ exp_manager.wandb_logger_kwargs.project="MatchboxNet-vad" \ +trainer.precision=16 \ +trainer.amp_level=O1 # needed if using PyTorch < 1.6 ``` # Task 3: Language Identification ## Preparing the dataset Use the `filelist_to_manifest.py` script under /scripts/speaker_tasks in order to prepare the dataset. ``` ## Train to convergence ```sh python speech_to_label.py \ --config-path= --config-name= \ model.train_ds.manifest_filepath="" \ model.validation_ds.manifest_filepath="" \ model.train_ds.augmentor.noise.manifest_path="" \ model.train_ds.augmentor.impulse.manifest_path="" \ model.decoder.num_classes= \ trainer.devices=2 \ trainer.max_epochs=40 \ exp_manager.create_wandb_logger=True \ exp_manager.wandb_logger_kwargs.name="titanet" \ exp_manager.wandb_logger_kwargs.project="langid" \ +exp_manager.checkpoint_callback_params.monitor="val_acc_macro" \ +exp_manager.checkpoint_callback_params.mode="max" \ +trainer.precision=16 \ ``` # Optional: Use tarred dataset to speed up data loading. Apply to both tasks. ## Prepare tarred dataset. Prepare ONE manifest that contains all training data you would like to include. Validation should use non-tarred dataset. Note that it's possible that tarred datasets impacts validation scores because it drop values in order to have same amount of files per tarfile; Scores might be off since some data is missing. Use the `convert_to_tarred_audio_dataset.py` script under /scripts/speech_recognition in order to prepare tarred audio dataset. For details, please see TarredAudioToClassificationLabelDataset in /nemo/collections/asr/data/audio_to_label.py python speech_to_label.py \ --config-path= --config-name= \ model.train_ds.manifest_filepath= \ model.train_ds.is_tarred=True \ model.train_ds.tarred_audio_filepaths= \ +model.train_ds.num_worker= \ model.validation_ds.manifest_filepath=\ trainer.devices=2 \ trainer.accelerator="gpu" \ strategy="ddp" \ \ trainer.max_epochs=200 \ exp_manager.create_wandb_logger=True \ exp_manager.wandb_logger_kwargs.name="MatchboxNet-3x1x64-vad" \ exp_manager.wandb_logger_kwargs.project="MatchboxNet-vad" \ +trainer.precision=16 \ +trainer.amp_level=O1 # needed if using PyTorch < 1.6 # Fine-tune a model For documentation on fine-tuning this model, please visit - https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#fine-tuning-configurations # Pretrained Models For documentation on existing pretrained models, please visit - https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/speech_classification/results.html# """ import pytorch_lightning as pl import torch from omegaconf import OmegaConf from nemo.collections.asr.models import EncDecClassificationModel, EncDecSpeakerLabelModel from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @hydra_runner(config_path="../conf/matchboxnet", config_name="matchboxnet_3x1x64_v1") def main(cfg): logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}') trainer = pl.Trainer(**cfg.trainer) exp_manager(trainer, cfg.get("exp_manager", None)) if 'titanet' in cfg.name.lower(): model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=trainer) else: model = EncDecClassificationModel(cfg=cfg.model, trainer=trainer) # Initialize the weights of the model from another model, if provided via config model.maybe_init_from_pretrained_checkpoint(cfg) trainer.fit(model) torch.distributed.destroy_process_group() if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None: if trainer.is_global_zero: trainer = pl.Trainer(devices=1, accelerator=cfg.trainer.accelerator, strategy=cfg.trainer.strategy) if model.prepare_test(trainer): trainer.test(model) if __name__ == '__main__': main() # noqa pylint: disable=no-value-for-parameter