| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
"""
Usage:

export CUDA_VISIBLE_DEVICES="0,1,2"

./transducer_stateless_modified/train.py \
  --world-size 3 \
  --num-epochs 65 \
  --start-epoch 0 \
  --exp-dir transducer_stateless_modified/exp \
  --max-duration 250 \
  --lr-factor 2.0 \
  --context-size 2 \
  --modified-transducer-prob 0.25
"""
|
|
|
|
import argparse
import logging
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, Optional, Tuple

import k2
import torch
import torch.multiprocessing as mp
import torch.nn as nn
from asr_datamodule import AishellAsrDataModule
from conformer import Conformer
from decoder import Decoder
from joiner import Joiner
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed
from model import Transducer
from torch import Tensor
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.utils import clip_grad_norm_
from torch.utils.tensorboard import SummaryWriter
from transformer import Noam

from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler
from icefall.checkpoint import load_checkpoint
from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
|
|
|
|
def get_parser():
    """Create the argument parser for the options defined in this script.

    Only the options listed below are registered here; :func:`main` lets
    ``AishellAsrDataModule`` add its own arguments to the returned parser.

    Returns:
      An ``argparse.ArgumentParser`` whose help output also shows the
      default value of every option.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--world-size",
        type=int,
        default=1,
        help="Number of GPUs for DDP training.",
    )

    parser.add_argument(
        "--master-port",
        type=int,
        default=12354,
        help="Master port to use for DDP training.",
    )

    parser.add_argument(
        "--tensorboard",
        type=str2bool,
        default=True,
        help="Should various information be logged in tensorboard.",
    )

    parser.add_argument(
        "--num-epochs",
        type=int,
        default=30,
        help="Number of epochs to train.",
    )

    # The checkpoint is looked up below --exp-dir (see
    # load_checkpoint_if_available), so the help text refers to that option
    # instead of a hard-coded directory.
    parser.add_argument(
        "--start-epoch",
        type=int,
        default=0,
        help="""Resume training from this epoch.
        If it is positive, it will load checkpoint from
        <exp-dir>/epoch-{start_epoch-1}.pt
        """,
    )

    parser.add_argument(
        "--exp-dir",
        type=str,
        default="transducer_stateless_modified/exp",
        help="""The experiment dir.
        It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )

    parser.add_argument(
        "--lang-dir",
        type=str,
        default="data/lang_char",
        help="""The lang dir
        It contains language related input files such as
        "lexicon.txt"
        """,
    )

    parser.add_argument(
        "--lr-factor",
        type=float,
        default=5.0,
        help="The lr_factor for Noam optimizer",
    )

    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; "
        "2 means tri-gram",
    )

    parser.add_argument(
        "--modified-transducer-prob",
        type=float,
        default=0.25,
        help="""The probability to use modified transducer loss.
        In modified transducer, it limits the maximum number of symbols
        per frame to 1. See also the option --max-sym-per-frame in
        transducer_stateless/decode.py
        """,
    )

    return parser
|
|
|
|
def get_params() -> AttributeDict:
    """Build the training settings that are not command-line options.

    :func:`run` merges the parsed command-line options into the returned
    object, so both kinds of settings are reachable through `params`.

    Entries created here:

        - best_train_loss / best_train_epoch: Lowest training loss seen so
                            far and the epoch it belongs to. Updated at the
                            end of :func:`train_one_epoch`.

        - best_valid_loss / best_valid_epoch: Same, for the validation
                            loss. Updated in
                            :func:`compute_validation_loss`.

        - batch_idx_train:  Number of training batches processed so far
                            across epochs; used as the step when writing
                            to tensorboard.

        - log_interval:     Log the training loss when
                            batch_idx % log_interval is 0.

        - reset_interval:   The running training statistics are scaled by
                            (1 - 1 / reset_interval) before every update.

        - valid_interval:   Run validation when batch_idx is positive and
                            batch_idx % valid_interval is 0.

        - feature_dim:      Passed to the Conformer as `num_features`.

        - encoder_out_dim:  Conformer output dim; also the decoder's
                            embedding dim and the joiner's input dim.

        - subsampling_factor: Passed to the Conformer; also used to
                            convert input frame counts when accumulating
                            the "frames" statistic.

        - attention_dim:    Conformer `d_model` and Noam `model_size`.

        - nhead, dim_feedforward, num_encoder_layers, vgg_frontend:
                            Forwarded unchanged to the Conformer.

        - warm_step:        The warm_step for Noam optimizer.

        - env_info:         Whatever `get_env_info()` returns.
    """
    bookkeeping = {
        "best_train_loss": float("inf"),
        "best_valid_loss": float("inf"),
        "best_train_epoch": -1,
        "best_valid_epoch": -1,
        "batch_idx_train": 0,
    }
    intervals = {
        "log_interval": 50,
        "reset_interval": 200,
        "valid_interval": 800,
    }
    encoder_settings = {
        "feature_dim": 80,
        "encoder_out_dim": 512,
        "subsampling_factor": 4,
        "attention_dim": 512,
        "nhead": 8,
        "dim_feedforward": 2048,
        "num_encoder_layers": 12,
        "vgg_frontend": False,
    }
    optimizer_and_env = {
        "warm_step": 80000,
        "env_info": get_env_info(),
    }

    # Merging keeps the keys in the order they are listed above.
    return AttributeDict(
        {
            **bookkeeping,
            **intervals,
            **encoder_settings,
            **optimizer_and_env,
        }
    )
|
|
|
|
def get_encoder_model(params: AttributeDict) -> nn.Module:
    """Create the Conformer encoder described by `params`.

    Args:
      params:
        Training settings; see :func:`get_params` for the entries read here.
    Returns:
      Return the newly constructed `Conformer`.
    """
    conformer_kwargs = dict(
        num_features=params.feature_dim,
        output_dim=params.encoder_out_dim,
        subsampling_factor=params.subsampling_factor,
        d_model=params.attention_dim,
        nhead=params.nhead,
        dim_feedforward=params.dim_feedforward,
        num_encoder_layers=params.num_encoder_layers,
        vgg_frontend=params.vgg_frontend,
    )
    return Conformer(**conformer_kwargs)
|
|
|
|
def get_decoder_model(params: AttributeDict) -> nn.Module:
    """Create the `Decoder` described by `params`.

    Args:
      params:
        Must provide `vocab_size`, `encoder_out_dim` (used as the embedding
        dim), `blank_id` and `context_size`.
    Returns:
      Return the newly constructed `Decoder`.
    """
    return Decoder(
        vocab_size=params.vocab_size,
        embedding_dim=params.encoder_out_dim,
        blank_id=params.blank_id,
        context_size=params.context_size,
    )
|
|
|
|
def get_joiner_model(params: AttributeDict) -> nn.Module:
    """Create the `Joiner` described by `params`.

    Args:
      params:
        Must provide `encoder_out_dim` (the joiner's input dim) and
        `vocab_size` (its output dim).
    Returns:
      Return the newly constructed `Joiner`.
    """
    return Joiner(
        input_dim=params.encoder_out_dim,
        output_dim=params.vocab_size,
    )
|
|
|
|
def get_transducer_model(params: AttributeDict) -> nn.Module:
    """Assemble the `Transducer` from its encoder, decoder and joiner.

    Args:
      params:
        Training settings forwarded to :func:`get_encoder_model`,
        :func:`get_decoder_model` and :func:`get_joiner_model`.
    Returns:
      Return the newly constructed `Transducer`.
    """
    # Keyword arguments are evaluated left to right, so the sub-modules are
    # still created in the order encoder, decoder, joiner.
    return Transducer(
        encoder=get_encoder_model(params),
        decoder=get_decoder_model(params),
        joiner=get_joiner_model(params),
    )
|
|
|
|
def load_checkpoint_if_available(
    params: AttributeDict,
    model: nn.Module,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
) -> Optional[Dict[str, Any]]:
    """Load checkpoint from file.

    If params.start_epoch is positive, it will load the checkpoint from
    `params.exp_dir / epoch-{params.start_epoch - 1}.pt`. Otherwise, this
    function does nothing.

    Apart from loading state dict for `model`, `optimizer` and `scheduler`,
    it also updates `best_train_epoch`, `best_valid_epoch`,
    `batch_idx_train`, `best_train_loss` and `best_valid_loss` in `params`.

    Args:
      params:
        The return value of :func:`get_params`.
      model:
        The training model.
      optimizer:
        The optimizer that we are using.
      scheduler:
        The learning rate scheduler we are using.
    Returns:
      Return None if `params.start_epoch` is not positive. Otherwise return
      the object produced by `load_checkpoint`; :func:`run` looks up its
      "optimizer" entry to restore the optimizer state.
    """
    if params.start_epoch <= 0:
        return None

    filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
    saved_params = load_checkpoint(
        filename,
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
    )

    # Carry the training statistics over so that "best so far" tracking and
    # the global batch counter continue from where the checkpoint stopped.
    keys = [
        "best_train_epoch",
        "best_valid_epoch",
        "batch_idx_train",
        "best_train_loss",
        "best_valid_loss",
    ]
    for k in keys:
        params[k] = saved_params[k]

    return saved_params
|
|
|
|
def save_checkpoint(
    params: AttributeDict,
    model: nn.Module,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
    rank: int = 0,
) -> None:
    """Write the checkpoint of the current epoch; only rank 0 does any work.

    The checkpoint goes to `params.exp_dir / epoch-{params.cur_epoch}.pt`.
    It is additionally copied to "best-train-loss.pt" and/or
    "best-valid-loss.pt" in the same directory when the current epoch is
    the best one recorded in `params`.

    Args:
      params:
        It is returned by :func:`get_params`.
      model:
        The training model.
      optimizer:
        The optimizer whose state is saved along with the model.
      scheduler:
        The learning rate scheduler whose state is saved, if any.
      rank:
        Rank of the calling process. Any value other than 0 makes this
        function return immediately.
    """
    if rank != 0:
        return

    exp_dir = params.exp_dir
    epoch_file = exp_dir / f"epoch-{params.cur_epoch}.pt"
    save_checkpoint_impl(
        filename=epoch_file,
        model=model,
        params=params,
        optimizer=optimizer,
        scheduler=scheduler,
        rank=rank,
    )

    best_copies = (
        (params.best_train_epoch, "best-train-loss.pt"),
        (params.best_valid_epoch, "best-valid-loss.pt"),
    )
    for best_epoch, best_name in best_copies:
        if best_epoch == params.cur_epoch:
            copyfile(src=epoch_file, dst=exp_dir / best_name)
|
|
|
|
def compute_loss(
    params: AttributeDict,
    model: nn.Module,
    graph_compiler: CharCtcTrainingGraphCompiler,
    batch: dict,
    is_training: bool,
) -> Tuple[Tensor, MetricsTracker]:
    """
    Compute the loss returned by `model` for one batch.

    Args:
      params:
        Parameters for training. See :func:`get_params`. This function
        reads `modified_transducer_prob` and `subsampling_factor`.
      model:
        The model for training. It must expose a `device` attribute and is
        called with `x`, `x_lens`, `y` and `modified_transducer_prob`.
      graph_compiler:
        Its `texts_to_ids` method converts the transcripts into token ids.
      batch:
        A batch of data. This function reads `batch["inputs"]` and, from
        `batch["supervisions"]`, the entries "num_frames" and "text".
      is_training:
        True for training. False for validation. When it is True, this
        function enables autograd during computation; when it is False, it
        disables autograd.
    Returns:
      Return a tuple with the loss tensor and a `MetricsTracker` holding
      "frames" (sum of `num_frames // subsampling_factor`) and "loss" (the
      loss as a Python number).
    """
    device = model.device

    features = batch["inputs"]
    assert features.ndim == 3
    features = features.to(device)

    supervisions = batch["supervisions"]
    num_frames = supervisions["num_frames"].to(device)

    token_ids = graph_compiler.texts_to_ids(supervisions["text"])
    token_ids = k2.RaggedTensor(token_ids).to(device)

    with torch.set_grad_enabled(is_training):
        loss = model(
            x=features,
            x_lens=num_frames,
            y=token_ids,
            modified_transducer_prob=params.modified_transducer_prob,
        )

    assert loss.requires_grad == is_training

    stats = MetricsTracker()
    # Presumably silences a warning raised by `//` on tensors in some
    # PyTorch versions -- TODO confirm.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        subsampled = num_frames // params.subsampling_factor
        stats["frames"] = subsampled.sum().item()

    # NOTE(review): callers divide the accumulated "loss" by "frames", which
    # suggests the model returns a summed (not averaged) loss -- confirm.
    stats["loss"] = loss.detach().cpu().item()

    return loss, stats
|
|
|
|
def compute_validation_loss(
    params: AttributeDict,
    model: nn.Module,
    graph_compiler: CharCtcTrainingGraphCompiler,
    valid_dl: torch.utils.data.DataLoader,
    world_size: int = 1,
) -> MetricsTracker:
    """Run the validation process.

    The model is switched to eval mode and is NOT switched back; the caller
    is responsible for calling `model.train()` afterwards.

    Args:
      params:
        Training settings. `best_valid_loss` and `best_valid_epoch` are
        updated when the per-frame validation loss improves.
      model:
        The model to evaluate.
      graph_compiler:
        Forwarded to :func:`compute_loss`.
      valid_dl:
        Dataloader for the validation dataset.
      world_size:
        Number of processes in DDP training. If it is greater than 1, the
        accumulated statistics are reduced across processes.
    Returns:
      Return the statistics accumulated over all validation batches.
    """
    model.eval()

    accumulated = MetricsTracker()

    for batch in valid_dl:
        loss, batch_info = compute_loss(
            params=params,
            model=model,
            graph_compiler=graph_compiler,
            batch=batch,
            is_training=False,
        )
        assert loss.requires_grad is False
        accumulated = accumulated + batch_info

    if world_size > 1:
        # `loss` is the tensor from the last validation batch; only its
        # device is needed here.
        accumulated.reduce(loss.device)

    per_frame_loss = accumulated["loss"] / accumulated["frames"]
    if per_frame_loss < params.best_valid_loss:
        params.best_valid_epoch = params.cur_epoch
        params.best_valid_loss = per_frame_loss

    return accumulated
|
|
|
|
def train_one_epoch(
    params: AttributeDict,
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    graph_compiler: CharCtcTrainingGraphCompiler,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
) -> None:
    """Train the model for one epoch.

    The per-frame training loss taken from the running statistics is saved
    in `params.train_loss` when the epoch ends, and `best_train_loss` /
    `best_train_epoch` are updated if it improved. Validation runs every
    `params.valid_interval` batches (never on batch 0).

    Args:
      params:
        It is returned by :func:`get_params`.
      model:
        The model for training.
      optimizer:
        The optimizer we are using.
      graph_compiler:
        Forwarded to :func:`compute_loss`.
      train_dl:
        Dataloader for the training dataset.
      valid_dl:
        Dataloader for the validation dataset.
      tb_writer:
        Writer to write log messages to tensorboard.
      world_size:
        Number of nodes in DDP training. If it is 1, DDP is disabled.
    """
    model.train()

    running = MetricsTracker()

    for batch_idx, batch in enumerate(train_dl):
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        loss, current = compute_loss(
            params=params,
            model=model,
            graph_compiler=graph_compiler,
            batch=batch,
            is_training=True,
        )
        # Scale the previous statistics down before adding the new batch.
        running = (running * (1 - 1 / params.reset_interval)) + current

        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 5.0, 2.0)
        optimizer.step()

        if batch_idx % params.log_interval == 0:
            logging.info(
                f"Epoch {params.cur_epoch}, "
                f"batch {batch_idx}, loss[{current}], "
                f"tot_loss[{running}], batch size: {batch_size}"
            )
            if tb_writer is not None:
                step = params.batch_idx_train
                current.write_summary(tb_writer, "train/current_", step)
                running.write_summary(tb_writer, "train/tot_", step)

        if batch_idx > 0 and batch_idx % params.valid_interval == 0:
            logging.info("Computing validation loss")
            valid_info = compute_validation_loss(
                params=params,
                model=model,
                graph_compiler=graph_compiler,
                valid_dl=valid_dl,
                world_size=world_size,
            )
            # Validation leaves the model in eval mode.
            model.train()
            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
            if tb_writer is not None:
                valid_info.write_summary(
                    tb_writer, "train/valid_", params.batch_idx_train
                )

    params.train_loss = running["loss"] / running["frames"]
    if params.train_loss < params.best_train_loss:
        params.best_train_epoch = params.cur_epoch
        params.best_train_loss = params.train_loss
|
|
|
|
def run(rank, world_size, args):
    """Run the complete training job in the calling process.

    Args:
      rank:
        It is a value between 0 and `world_size-1`, which is
        passed automatically by `mp.spawn()` in :func:`main`.
        The node with rank 0 is responsible for saving checkpoint.
      world_size:
        Number of GPUs for DDP training.
      args:
        The return value of get_parser().parse_args()
    """
    params = get_params()
    params.update(vars(args))

    fix_random_seed(42)
    if world_size > 1:
        setup_dist(rank, world_size, params.master_port)

    setup_logger(f"{params.exp_dir}/log/log-train")
    logging.info("Training started")

    # Only rank 0 writes tensorboard events, and only when requested.
    tb_writer = (
        SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
        if args.tensorboard and rank == 0
        else None
    )

    if torch.cuda.is_available():
        device = torch.device("cuda", rank)
    else:
        device = torch.device("cpu")
    logging.info(f"Device: {device}")

    lexicon = Lexicon(params.lang_dir)
    graph_compiler = CharCtcTrainingGraphCompiler(
        lexicon=lexicon,
        device=device,
        oov="<unk>",
    )

    # These two entries are read by get_decoder_model / get_joiner_model.
    params.blank_id = 0
    params.vocab_size = max(lexicon.tokens) + 1

    logging.info(params)

    logging.info("About to create model")
    model = get_transducer_model(params)

    num_param = sum(p.numel() for p in model.parameters())
    logging.info(f"Number of model parameters: {num_param}")

    # The model weights are restored here; the optimizer state is restored
    # further down, once the optimizer exists.
    checkpoints = load_checkpoint_if_available(params=params, model=model)

    model.to(device)
    if world_size > 1:
        logging.info("Using DDP")
        model = DDP(model, device_ids=[rank])
    # compute_loss reads the target device from this attribute.
    model.device = device

    optimizer = Noam(
        model.parameters(),
        model_size=params.attention_dim,
        factor=params.lr_factor,
        warm_step=params.warm_step,
    )

    if checkpoints and "optimizer" in checkpoints:
        logging.info("Loading optimizer state dict")
        optimizer.load_state_dict(checkpoints["optimizer"])

    aishell = AishellAsrDataModule(args)
    train_cuts = aishell.train_cuts()

    def remove_short_and_long_utt(c: Cut):
        # Keep only cuts whose duration lies in [1.0, 12.0]
        # (presumably seconds -- TODO confirm against lhotse).
        return 1.0 <= c.duration <= 12.0

    train_cuts = train_cuts.filter(remove_short_and_long_utt)

    train_dl = aishell.train_dataloaders(train_cuts)
    valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())

    scan_pessimistic_batches_for_oom(
        model=model,
        train_dl=train_dl,
        optimizer=optimizer,
        graph_compiler=graph_compiler,
        params=params,
    )

    for epoch in range(params.start_epoch, params.num_epochs):
        train_dl.sampler.set_epoch(epoch)

        cur_lr = optimizer._rate
        if tb_writer is not None:
            step = params.batch_idx_train
            tb_writer.add_scalar("train/learning_rate", cur_lr, step)
            tb_writer.add_scalar("train/epoch", epoch, step)

        if rank == 0:
            logging.info(f"epoch {epoch}, learning rate {cur_lr}")

        params.cur_epoch = epoch

        train_one_epoch(
            params=params,
            model=model,
            optimizer=optimizer,
            graph_compiler=graph_compiler,
            train_dl=train_dl,
            valid_dl=valid_dl,
            tb_writer=tb_writer,
            world_size=world_size,
        )

        save_checkpoint(
            params=params,
            model=model,
            optimizer=optimizer,
            rank=rank,
        )

    logging.info("Done!")

    if world_size > 1:
        torch.distributed.barrier()
        cleanup_dist()
|
|
|
|
def scan_pessimistic_batches_for_oom(
    model: nn.Module,
    train_dl: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    graph_compiler: CharCtcTrainingGraphCompiler,
    params: AttributeDict,
):
    """Run one full training step on each batch lhotse flags as pessimistic.

    Each batch returned by `find_pessimistic_batches` goes through forward,
    backward, gradient clipping and an optimizer step. Any `RuntimeError`
    is re-raised; if its message contains "CUDA out of memory", a hint
    about `max_duration` is logged first.

    Args:
      model:
        The model for training.
      train_dl:
        Dataloader for the training dataset; its `sampler` and `dataset`
        are used directly.
      optimizer:
        The optimizer we are using. Note that it is really stepped here.
      graph_compiler:
        Forwarded to :func:`compute_loss`.
      params:
        Parameters for training. See :func:`get_params`.
    """
    from lhotse.dataset import find_pessimistic_batches

    logging.info(
        "Sanity check -- see if any of the batches in epoch 0 would cause OOM."
    )
    worst_batches, crit_values = find_pessimistic_batches(train_dl.sampler)
    for criterion, cuts in worst_batches.items():
        batch = train_dl.dataset[cuts]
        try:
            optimizer.zero_grad()
            loss, _ = compute_loss(
                params=params,
                model=model,
                graph_compiler=graph_compiler,
                batch=batch,
                is_training=True,
            )
            loss.backward()
            clip_grad_norm_(model.parameters(), 5.0, 2.0)
            optimizer.step()
        except RuntimeError as err:
            is_oom = "CUDA out of memory" in str(err)
            if is_oom:
                logging.error(
                    "Your GPU ran out of memory with the current "
                    "max_duration setting. We recommend decreasing "
                    "max_duration and trying again.\n"
                    f"Failing criterion: {criterion} "
                    f"(={crit_values[criterion]}) ..."
                )
            # The error is propagated whether or not it was an OOM.
            raise
|
|
|
|
def main():
    """Parse the command line and start training.

    With `--world-size 1` training runs in the current process; otherwise
    one process per rank is spawned and each of them executes :func:`run`.
    """
    parser = get_parser()
    AishellAsrDataModule.add_arguments(parser)
    args = parser.parse_args()

    # The rest of the script joins these with the `/` operator.
    args.exp_dir = Path(args.exp_dir)
    args.lang_dir = Path(args.lang_dir)

    world_size = args.world_size
    assert world_size >= 1

    if world_size == 1:
        run(rank=0, world_size=1, args=args)
    else:
        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
|
|
|
|
# Executed at import time, i.e. also in every process that imports this
# module: limit PyTorch to one intra-op and one inter-op thread.
torch.set_num_threads(1)
torch.set_num_interop_threads(1)

# Start training only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
|