from fairseq import utils
from fairseq.models import (
    FairseqLanguageModel,
    register_model,
    register_model_architecture,
)
from fairseq.models.lightconv import Embedding, LightConvDecoder
from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder


@register_model("lightconv_lm")
class LightConvLanguageModel(FairseqLanguageModel):
    """Decoder-only language model built from LightConv/DynamicConv decoder layers."""

    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument(
            "--dropout",
            default=0.1,
            type=float,
            metavar="D",
            help="dropout probability",
        )
        parser.add_argument(
            "--attention-dropout",
            default=0.0,
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--relu-dropout",
            default=0.0,
            type=float,
            metavar="D",
            help="dropout probability after ReLU in FFN",
        )
        parser.add_argument(
            "--input-dropout",
            type=float,
            metavar="D",
            help="dropout probability of the inputs",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-output-dim",
            type=int,
            metavar="N",
            help="decoder output dimension",
        )
        parser.add_argument(
            "--decoder-input-dim", type=int, metavar="N", help="decoder input dimension"
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
        )
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads or LightConv/DynamicConv heads",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            default=False,
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--adaptive-softmax-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion",
        )
        parser.add_argument(
            "--adaptive-softmax-dropout",
            type=float,
            metavar="D",
            help="sets adaptive softmax dropout for the tail projections",
        )
        parser.add_argument(
            "--adaptive-softmax-factor",
            type=float,
            metavar="N",
            help="adaptive softmax factor",
        )
        parser.add_argument(
            "--no-token-positional-embeddings",
            default=False,
            action="store_true",
            help="if set, disables positional embeddings (outside self attention)",
        )
        parser.add_argument(
            "--share-decoder-input-output-embed",
            default=False,
            action="store_true",
            help="share decoder input and output embeddings",
        )
        parser.add_argument(
            "--character-embeddings",
            default=False,
            action="store_true",
            help="if set, uses character embedding convolutions to produce token embeddings",
        )
        parser.add_argument(
            "--character-filters",
            type=str,
            metavar="LIST",
            default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
            help="list of (kernel width, num filters) pairs for the character CNN",
        )
        parser.add_argument(
            "--character-embedding-dim",
            type=int,
            metavar="N",
            default=4,
            help="size of character embeddings",
        )
        parser.add_argument(
            "--char-embedder-highway-layers",
            type=int,
            metavar="N",
            default=2,
            help="number of highway layers for character token embedder",
        )
        parser.add_argument(
            "--adaptive-input",
            default=False,
            action="store_true",
            help="if set, uses adaptive input",
        )
        parser.add_argument(
            "--adaptive-input-factor",
            type=float,
            metavar="N",
            help="adaptive input factor",
        )
        parser.add_argument(
            "--adaptive-input-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive input cutoff points.",
        )
        parser.add_argument(
            "--tie-adaptive-weights",
            action="store_true",
            help="if set, ties the weights of adaptive softmax and adaptive input",
        )
        parser.add_argument(
            "--tie-adaptive-proj",
            action="store_true",
            help="if set, ties the projection weights of adaptive softmax and adaptive input",
        )
        parser.add_argument(
            "--decoder-learned-pos",
            action="store_true",
            help="use learned positional embeddings in the decoder",
        )

| """LightConv and DynamicConv arguments""" |
| parser.add_argument( |
| "--decoder-kernel-size-list", |
| type=lambda x: utils.eval_str_list(x, int), |
| help='list of kernel size (default: "[3,7,15,31,31,31]")', |
| ) |
| parser.add_argument( |
| "--decoder-glu", type=utils.eval_bool, help="glu after in proj" |
| ) |
        parser.add_argument(
            "--decoder-conv-type",
            default="dynamic",
            type=str,
            choices=["dynamic", "lightweight"],
            help="type of convolution",
        )
        parser.add_argument(
            "--weight-softmax",
            default=True,
            type=utils.eval_bool,
            help="softmax-normalize the convolution weights",
        )
        parser.add_argument(
            "--weight-dropout",
            type=float,
            metavar="D",
            help="dropout probability for conv weights",
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""

        # fill in architecture defaults for any options not explicitly set
        base_lm_architecture(args)

        if getattr(args, "max_source_positions", None) is None:
            args.max_source_positions = args.tokens_per_sample
        if getattr(args, "max_target_positions", None) is None:
            args.max_target_positions = args.tokens_per_sample

        if args.character_embeddings:
            embed_tokens = CharacterTokenEmbedder(
                task.dictionary,
                eval(args.character_filters),
                args.character_embedding_dim,
                args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.dictionary),
                task.dictionary.pad(),
                args.decoder_input_dim,
                args.adaptive_input_factor,
                args.decoder_embed_dim,
                utils.eval_str_list(args.adaptive_input_cutoff, type=int),
            )
        else:
            embed_tokens = Embedding(
                len(task.dictionary), args.decoder_input_dim, task.dictionary.pad()
            )

        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert (
                args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
            ), "{} != {}".format(
                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
            )
            assert args.decoder_input_dim == args.decoder_output_dim

        decoder = LightConvDecoder(
            args,
            task.output_dictionary,
            embed_tokens,
            no_encoder_attn=True,
            final_norm=False,
        )
        return LightConvLanguageModel(decoder)


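# Usage sketch (illustrative only): how a model with this architecture is
# typically built through fairseq's option/task plumbing. The data path, task
# name, and hyperparameter values below are assumptions for the example, not
# part of this module.
#
#   from fairseq import options, tasks
#
#   parser = options.get_training_parser()
#   args = options.parse_args_and_arch(
#       parser,
#       [
#           "data-bin/wikitext-103",
#           "--task", "language_modeling",
#           "--arch", "lightconv_lm",
#           "--tokens-per-sample", "512",
#       ],
#   )
#   task = tasks.setup_task(args)
#   model = task.build_model(args)  # dispatches to LightConvLanguageModel.build_model

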
@register_model_architecture("lightconv_lm", "lightconv_lm")
def base_lm_architecture(args):
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)
    args.decoder_layers = getattr(args, "decoder_layers", 6)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
    args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4)
    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)

    args.character_embeddings = getattr(args, "character_embeddings", False)

    args.decoder_output_dim = getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
    args.decoder_conv_dim = getattr(args, "decoder_conv_dim", args.decoder_embed_dim)

    # this architecture always applies layernorm before each decoder block
    args.decoder_normalize_before = True

    args.adaptive_input = getattr(args, "adaptive_input", False)
    args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4)
    args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None)

    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
    args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False)

    args.decoder_kernel_size_list = getattr(
        args, "decoder_kernel_size_list", [3, 7, 15, 31, 31, 31]
    )
    if len(args.decoder_kernel_size_list) == 1:
        args.decoder_kernel_size_list = (
            args.decoder_kernel_size_list * args.decoder_layers
        )
    assert (
        len(args.decoder_kernel_size_list) == args.decoder_layers
    ), "decoder_kernel_size_list doesn't match decoder_layers"
    args.decoder_glu = getattr(args, "decoder_glu", True)
    args.input_dropout = getattr(args, "input_dropout", 0.1)
    args.weight_dropout = getattr(args, "weight_dropout", args.attention_dropout)


@register_model_architecture("lightconv_lm", "lightconv_lm_gbw")
def lightconv_lm_gbw(args):
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
    args.dropout = getattr(args, "dropout", 0.1)
    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
    base_lm_architecture(args)
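

# Training sketch (illustrative command; the dataset path and settings are
# assumptions, not tuned values). Each name registered with
# @register_model_architecture above is selectable via --arch:
#
#   fairseq-train data-bin/wikitext-103 \
#       --task language_modeling \
#       --arch lightconv_lm_gbw \
#       --decoder-conv-type dynamic \
#       --tokens-per-sample 512 --max-tokens 4096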