| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| """ |
| Usage: |
| ./rnn_lm/compute_perplexity.py \ |
| --epoch 4 \ |
| --avg 2 \ |
| --lm-data ./data/lm_training_bpe_500/sorted_lm_data-test.pt |
| |
| """ |
|
|
| import argparse |
| import logging |
| import math |
| from pathlib import Path |
|
|
| import torch |
| from dataset import get_dataloader |
| from model import RnnLmModel |
|
|
| from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint |
| from icefall.utils import AttributeDict, setup_logger, str2bool |
|
|
|
|
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the perplexity computation script.

    The options fall into three groups:

    * checkpoint selection: ``--epoch``, ``--avg``, ``--iter``, ``--exp-dir``;
    * model configuration: ``--vocab-size``, ``--embedding-dim``,
      ``--hidden-dim``, ``--num-layers``, ``--tie-weights``;
    * data loading: ``--lm-data``, ``--batch-size``, ``--max-sent-len``,
      ``--sos-id``, ``--eos-id``, ``--blank-id``.

    Returns:
      An ``argparse.ArgumentParser`` whose help output also shows the
      default value of every option.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--epoch",
        type=int,
        default=49,
        help="It specifies the checkpoint to use for decoding. "
        "Note: Epoch counts from 0.",
    )

    parser.add_argument(
        "--avg",
        type=int,
        default=20,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch'. ",
    )

    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )

    parser.add_argument(
        "--exp-dir",
        type=str,
        default="rnn_lm/exp",
        help="The experiment dir",
    )

    parser.add_argument(
        "--lm-data",
        type=str,
        help="Path to the LM test data for computing perplexity",
    )

    parser.add_argument(
        "--vocab-size",
        type=int,
        default=500,
        help="Vocabulary size of the model",
    )

    parser.add_argument(
        "--embedding-dim",
        type=int,
        default=2048,
        help="Embedding dim of the model",
    )

    parser.add_argument(
        "--hidden-dim",
        type=int,
        default=2048,
        help="Hidden dim of the model",
    )

    parser.add_argument(
        "--num-layers",
        type=int,
        default=3,
        help="Number of RNN layers in the model",
    )

    parser.add_argument(
        "--tie-weights",
        type=str2bool,
        default=False,
        help="""True to share the weights between the input embedding layer and the
        last output linear layer
        """,
    )

    # The three options below previously reused the --num-layers help text.
    parser.add_argument(
        "--batch-size",
        type=int,
        default=50,
        help="Batch size used when loading the LM test data",
    )

    parser.add_argument(
        "--max-sent-len",
        type=int,
        default=100,
        help="Maximum sentence length used when loading the LM test data",
    )

    parser.add_argument(
        "--sos-id",
        type=int,
        default=1,
        help="SOS ID",
    )

    parser.add_argument(
        "--eos-id",
        type=int,
        default=1,
        help="EOS ID",
    )

    parser.add_argument(
        "--blank-id",
        type=int,
        default=0,
        help="Blank ID",
    )
    return parser
|
|
|
|
@torch.no_grad()
def main() -> None:
    """Load an RNN LM from checkpoint(s) and log its perplexity on test data.

    Steps:
      1. Parse the command line and set up logging under
         ``<exp_dir>/log-ppl``.
      2. Build the model and load its weights: either averaged iteration
         checkpoints (``--iter > 0``), a single epoch checkpoint
         (``--avg 1``), or averaged epoch checkpoints.
      3. Sum the negative log-likelihood returned by the model over all
         batches and report ``exp(total_nll / num_tokens)``.

    Raises:
      SystemExit: If ``--lm-data`` is not given (reported via
        ``parser.error``).
      ValueError: If no (or not enough) checkpoints are available for
        averaging, or if the test data contains no tokens.
    """
    parser = get_parser()
    args = parser.parse_args()
    # --lm-data has no default; fail with a usage message instead of the
    # opaque TypeError that Path(None) would raise.
    if args.lm_data is None:
        parser.error("--lm-data is required")
    args.exp_dir = Path(args.exp_dir)
    args.lm_data = Path(args.lm_data)

    params = AttributeDict(vars(args))

    if params.iter > 0:
        setup_logger(
            f"{params.exp_dir}/log-ppl/log-ppl-iter-{params.iter}-avg-{params.avg}"
        )
    else:
        setup_logger(
            f"{params.exp_dir}/log-ppl/log-ppl-epoch-{params.epoch}-avg-{params.avg}"
        )
    logging.info("Computing perplexity started")
    logging.info(params)

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    logging.info(f"Device: {device}")

    logging.info("About to create model")
    model = RnnLmModel(
        vocab_size=params.vocab_size,
        embedding_dim=params.embedding_dim,
        hidden_dim=params.hidden_dim,
        num_layers=params.num_layers,
        tie_weights=params.tie_weights,
    )

    if params.iter > 0:
        # Iteration-based checkpoints: keep the first `avg` entries returned
        # by find_checkpoints for the requested iteration.
        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
            : params.avg
        ]
        if len(filenames) == 0:
            raise ValueError(
                f"No checkpoints found for --iter {params.iter}, --avg {params.avg}"
            )
        elif len(filenames) < params.avg:
            raise ValueError(
                f"Not enough checkpoints ({len(filenames)}) found for"
                f" --iter {params.iter}, --avg {params.avg}"
            )
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(
            average_checkpoints(filenames, device=device), strict=False
        )
    elif params.avg == 1:
        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    else:
        # Epoch-based averaging over epochs [epoch - avg + 1, epoch];
        # negative epoch indices are skipped.
        start = params.epoch - params.avg + 1
        filenames = [
            f"{params.exp_dir}/epoch-{i}.pt"
            for i in range(start, params.epoch + 1)
            if i >= 0
        ]
        # Mirror the --iter branch: do not hand an empty list to
        # average_checkpoints.
        if not filenames:
            raise ValueError(
                f"No checkpoints selected for --epoch {params.epoch}, "
                f"--avg {params.avg}"
            )
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(
            average_checkpoints(filenames, device=device), strict=False
        )

    model.to(device)
    model.eval()
    num_param = sum(p.numel() for p in model.parameters())
    num_param_requires_grad = sum(
        p.numel() for p in model.parameters() if p.requires_grad
    )
    # Share of trainable parameters relative to ALL parameters (the previous
    # code divided the trainable count by itself and always printed 100%).
    percent_requires_grad = (
        num_param_requires_grad / num_param * 100 if num_param else 0.0
    )

    logging.info(f"Number of model parameters: {num_param}")
    logging.info(
        f"Number of model parameters (requires_grad): "
        f"{num_param_requires_grad} "
        f"({percent_requires_grad}%)"
    )

    logging.info(f"Loading LM test data from {params.lm_data}")
    test_dl = get_dataloader(
        filename=params.lm_data,
        is_distributed=False,
        params=params,
    )

    tot_loss = 0.0
    num_tokens = 0
    num_sentences = 0
    for x, y, sentence_lengths in test_dl:
        x = x.to(device)
        y = y.to(device)
        sentence_lengths = sentence_lengths.to(device)

        # The model output is treated as negative log-likelihood values
        # that are summed over the whole batch.
        nll = model(x, y, sentence_lengths)
        tot_loss += nll.sum().cpu().item()
        num_tokens += sentence_lengths.sum().cpu().item()
        num_sentences += x.size(0)

    if num_tokens == 0:
        raise ValueError(
            f"No tokens found in {params.lm_data}; cannot compute perplexity"
        )

    # Perplexity is the exponential of the average per-token NLL.
    ppl = math.exp(tot_loss / num_tokens)
    logging.info(
        f"total nll: {tot_loss}, num tokens: {num_tokens}, "
        f"num sentences: {num_sentences}, ppl: {ppl:.3f}"
    )
|
|
|
|
# Restrict PyTorch to one intra-op thread and one inter-op thread. This runs
# at import time, before main() performs any tensor work.
# NOTE(review): presumably done to avoid CPU thread oversubscription -- confirm.
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


if __name__ == "__main__":
    main()
|
|