| |
| |
| |
| |
|
|
| from fairseq import utils |
| from fairseq.models import ( |
| FairseqLanguageModel, |
| register_model, |
| register_model_architecture, |
| ) |
| from fairseq.models.fconv import FConvDecoder |
| from fairseq.utils import safe_hasattr |
|
|
|
|
| @register_model("fconv_lm") |
| class FConvLanguageModel(FairseqLanguageModel): |
| def __init__(self, decoder): |
| super().__init__(decoder) |
|
|
| @staticmethod |
| def add_args(parser): |
| """Add model-specific arguments to the parser.""" |
| parser.add_argument( |
| "--dropout", type=float, metavar="D", help="dropout probability" |
| ) |
| parser.add_argument( |
| "--decoder-embed-dim", |
| type=int, |
| metavar="N", |
| help="decoder embedding dimension", |
| ) |
| parser.add_argument( |
| "--decoder-layers", |
| type=str, |
| metavar="EXPR", |
| help="decoder layers [(dim, kernel_size), ...]", |
| ) |
| parser.add_argument( |
| "--decoder-out-embed-dim", |
| type=int, |
| metavar="N", |
| help="decoder output embedding dimension", |
| ) |
| parser.add_argument( |
| "--adaptive-softmax-cutoff", |
| metavar="EXPR", |
| help="comma separated list of adaptive softmax cutoff points. " |
| "Must be used with adaptive_loss criterion", |
| ) |
| parser.add_argument( |
| "--adaptive-softmax-dropout", |
| type=float, |
| metavar="D", |
| help="sets adaptive softmax dropout for the tail projections", |
| ) |
| parser.add_argument( |
| "--decoder-attention", |
| type=str, |
| metavar="EXPR", |
| help="decoder attention [True, ...]", |
| ) |
|
|
| @classmethod |
| def build_model(cls, args, task): |
| """Build a new model instance.""" |
| |
| base_lm_architecture(args) |
|
|
| if safe_hasattr(args, "max_target_positions") and not safe_hasattr( |
| args, "tokens_per_sample" |
| ): |
| args.tokens_per_sample = args.max_target_positions |
|
|
| decoder = FConvDecoder( |
| dictionary=task.target_dictionary, |
| embed_dim=args.decoder_embed_dim, |
| convolutions=eval(args.decoder_layers), |
| out_embed_dim=args.decoder_embed_dim, |
| attention=eval(args.decoder_attention), |
| dropout=args.dropout, |
| max_positions=args.tokens_per_sample, |
| share_embed=False, |
| positional_embeddings=False, |
| adaptive_softmax_cutoff=( |
| utils.eval_str_list(args.adaptive_softmax_cutoff, type=int) |
| if args.criterion == "adaptive_loss" |
| else None |
| ), |
| adaptive_softmax_dropout=args.adaptive_softmax_dropout, |
| ) |
| return FConvLanguageModel(decoder) |
|
|
|
|
| @register_model_architecture("fconv_lm", "fconv_lm") |
| def base_lm_architecture(args): |
| args.dropout = getattr(args, "dropout", 0.1) |
| args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 128) |
| args.decoder_layers = getattr(args, "decoder_layers", "[(1268, 4)] * 13") |
| args.decoder_attention = getattr(args, "decoder_attention", "False") |
| args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) |
| args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) |
|
|
|
|
| @register_model_architecture("fconv_lm", "fconv_lm_dauphin_wikitext103") |
| def fconv_lm_dauphin_wikitext103(args): |
| layers = "[(850, 6)] * 3" |
| layers += " + [(850, 1)] * 1" |
| layers += " + [(850, 5)] * 4" |
| layers += " + [(850, 1)] * 1" |
| layers += " + [(850, 4)] * 3" |
| layers += " + [(1024, 4)] * 1" |
| layers += " + [(2048, 4)] * 1" |
| args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 280) |
| args.decoder_layers = getattr(args, "decoder_layers", layers) |
| args.decoder_attention = getattr(args, "decoder_attention", "False") |
| args.adaptive_softmax_cutoff = getattr( |
| args, "adaptive_softmax_cutoff", "10000,20000,200000" |
| ) |
| base_lm_architecture(args) |
|
|
|
|
| @register_model_architecture("fconv_lm", "fconv_lm_dauphin_gbw") |
| def fconv_lm_dauphin_gbw(args): |
| layers = "[(512, 5)]" |
| layers += " + [(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3" |
| layers += " + [(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3" |
| layers += " + [(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6" |
| layers += " + [(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]" |
| args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 128) |
| args.decoder_layers = getattr(args, "decoder_layers", layers) |
| args.decoder_attention = getattr(args, "decoder_attention", "False") |
| args.adaptive_softmax_cutoff = getattr( |
| args, "adaptive_softmax_cutoff", "10000,50000,200000" |
| ) |
| base_lm_architecture(args) |
|
|