diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/backtranslation_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/backtranslation_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..374f2fe814ed1a6b5ea0d95e166698365f7befdf Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/backtranslation_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/base_wrapper_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/base_wrapper_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..124f1fc84142e684b2a058cbcaac2afe7e0e6dc0 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/base_wrapper_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/concat_sentences_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/concat_sentences_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c5f611fa46ae6e41bbae704bb42b69d42436740 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/concat_sentences_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/data_utils.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/data_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9b9fe8d51492ad0cc986efa83bdab86c60491bb Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/data_utils.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/denoising_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/denoising_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..18b158bda76069d3159087a09fe67ada36660cd3 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/denoising_dataset.cpython-310.pyc differ diff --git 
a/fairseq-0.10.2/fairseq/data/__pycache__/fasta_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/fasta_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da84090aae72f5fd43e9a835097898d9fef0b436 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/fasta_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/iterators.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/iterators.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edee796ea33c6950f39853f3e83e6c14ee915f3b Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/iterators.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/list_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/list_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f963b5a84bf06e2c023284535c76de8ea6d48fcc Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/list_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/lm_context_window_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/lm_context_window_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..663c03c0a089ae404074e2ad2cd4aa0cc69f410c Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/lm_context_window_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/lru_cache_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/lru_cache_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87e3e4e13e8a3f77e012682b277d479117765044 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/lru_cache_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/mask_tokens_dataset.cpython-310.pyc 
b/fairseq-0.10.2/fairseq/data/__pycache__/mask_tokens_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2c3346414b2451b8f038f8cf6250c9f8cf68b1c0 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/mask_tokens_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/noising.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/noising.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d51c569fe396065035182810b076d2a556728fa1 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/noising.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/num_samples_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/num_samples_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a3ac9de980188b03ced9fe73cacc616a4180ca8 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/num_samples_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/numel_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/numel_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8f6fea6b3256382f5469575bb9199af4eea7e1ff Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/numel_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/pad_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/pad_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e9938a127b0ee9df4e97d00f6fc7d5668f78ba05 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/pad_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/plasma_utils.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/plasma_utils.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..48cbf21a0b56137c40960b28b43852264f337f61 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/plasma_utils.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/prepend_token_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/prepend_token_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c33aad3b89569a5133fb60f9bbe6866753b60e29 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/prepend_token_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/raw_label_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/raw_label_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f24ea32ba4eff90d8153b21933128b3a44bd11c1 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/raw_label_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/replace_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/replace_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd79410ae06b251b7a7e93eae88e2d7f5cc4ea6e Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/replace_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/shorten_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/shorten_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..536721d5d974842ac17d60192d46576b3cdef154 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/shorten_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/sort_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/sort_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0925ee2f86bae02b3ea629a639aa4e0ecb97f6f Binary files 
/dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/sort_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/strip_token_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/strip_token_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ccfebd99e1e0a0bc5e0d338e488b6b1c4c7da5c3 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/strip_token_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/token_block_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/token_block_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5e0fe8213657cac44ad36b33c2eb72733d59c1c Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/token_block_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/transform_eos_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/transform_eos_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b65cf8104fe8d79eb71139103b6a5bb4c7beda2 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/transform_eos_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/data/__pycache__/transform_eos_lang_pair_dataset.cpython-310.pyc b/fairseq-0.10.2/fairseq/data/__pycache__/transform_eos_lang_pair_dataset.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0949d070f2f69403ada2a5951e53b9a6c098c66 Binary files /dev/null and b/fairseq-0.10.2/fairseq/data/__pycache__/transform_eos_lang_pair_dataset.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/composite_encoder.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/composite_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..265e1ee80ae136e5980d7c8bf3ba8115f947aca0 Binary files /dev/null and 
b/fairseq-0.10.2/fairseq/models/__pycache__/composite_encoder.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/transformer_align.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/transformer_align.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4579274befac4ddd63b03dd6a7cf217e60c1097c Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/transformer_align.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/transformer_from_pretrained_xlm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/transformer_from_pretrained_xlm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbf62510f2c847723ba9d64c2bfafbbb9fd56a24 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/transformer_from_pretrained_xlm.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/bart/__pycache__/__init__.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/bart/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fc9a994fb3bc91fa6600fc1c0af6bd50c13ff803 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/bart/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/bart/__pycache__/hub_interface.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/bart/__pycache__/hub_interface.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbca7301246c1f996e74bf26252a8967012545b1 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/bart/__pycache__/hub_interface.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/bart/__pycache__/model.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/bart/__pycache__/model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7431d33bbf7f379d3f9f2e522d0b7dfd9aa0090 Binary files /dev/null and 
b/fairseq-0.10.2/fairseq/models/bart/__pycache__/model.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/bart/model.py b/fairseq-0.10.2/fairseq/models/bart/model.py new file mode 100644 index 0000000000000000000000000000000000000000..0f22352b68187a8edc79db97beba5a8d9ff9ded6 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/bart/model.py @@ -0,0 +1,368 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +BART: Denoising Sequence-to-Sequence Pre-training for +Natural Language Generation, Translation, and Comprehension +""" + +import logging + +import torch +import torch.nn as nn +from fairseq import utils +from fairseq.models import register_model, register_model_architecture +from fairseq.models.transformer import TransformerModel +from fairseq.modules.transformer_sentence_encoder import init_bert_params + +from .hub_interface import BARTHubInterface + + +logger = logging.getLogger(__name__) + + +@register_model("bart") +class BARTModel(TransformerModel): + @classmethod + def hub_models(cls): + return { + "bart.base": "http://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz", + "bart.large": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz", + "bart.large.mnli": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.mnli.tar.gz", + "bart.large.cnn": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz", + "bart.large.xsum": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.xsum.tar.gz", + } + + def __init__(self, args, encoder, decoder): + super().__init__(args, encoder, decoder) + + # We follow BERT's random weight initialization + self.apply(init_bert_params) + + self.classification_heads = nn.ModuleDict() + + @staticmethod + def add_args(parser): + super(BARTModel, BARTModel).add_args(parser) + parser.add_argument( + "--pooler-dropout", + type=float, + metavar="D", + 
help="dropout probability in the masked_lm pooler layers", + ) + parser.add_argument( + "--pooler-activation-fn", + choices=utils.get_available_activation_fns(), + help="activation function to use for pooler layer", + ) + parser.add_argument( + "--spectral-norm-classification-head", + action="store_true", + help="Apply spectral normalization on the classification head", + ) + + @property + def supported_targets(self): + return {"self"} + + def forward( + self, + src_tokens, + src_lengths, + prev_output_tokens, + features_only=False, + classification_head_name=None, + token_embeddings=None, + **kwargs, + ): + if classification_head_name is not None: + features_only = True + + encoder_out = self.encoder( + src_tokens, + src_lengths=src_lengths, + token_embeddings=token_embeddings, + **kwargs, + ) + x, extra = self.decoder( + prev_output_tokens, + encoder_out=encoder_out, + features_only=features_only, + **kwargs, + ) + + if classification_head_name is not None: + sentence_representation = x[ + src_tokens.eq(self.encoder.dictionary.eos()), : + ].view(x.size(0), -1, x.size(-1))[:, -1, :] + x = self.classification_heads[classification_head_name]( + sentence_representation + ) + return x, extra + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + bpe="gpt2", + **kwargs, + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + bpe=bpe, + load_checkpoint_heads=True, + **kwargs, + ) + return BARTHubInterface(x["args"], x["task"], x["models"][0]) + + def register_classification_head( + self, name, num_classes=None, inner_dim=None, **kwargs + ): + """Register a classification head.""" + logger.info("Registering classification head: {0}".format(name)) + if name in self.classification_heads: + prev_num_classes = self.classification_heads[name].out_proj.out_features + prev_inner_dim = 
self.classification_heads[name].dense.out_features + if num_classes != prev_num_classes or inner_dim != prev_inner_dim: + logger.warning( + 're-registering head "{}" with num_classes {} (prev: {}) ' + "and inner_dim {} (prev: {})".format( + name, num_classes, prev_num_classes, inner_dim, prev_inner_dim + ) + ) + self.classification_heads[name] = BARTClassificationHead( + input_dim=self.args.encoder_embed_dim, + inner_dim=inner_dim or self.args.encoder_embed_dim, + num_classes=num_classes, + activation_fn=self.args.pooler_activation_fn, + pooler_dropout=self.args.pooler_dropout, + do_spectral_norm=self.args.spectral_norm_classification_head, + ) + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + + prefix = name + "." if name != "" else "" + current_head_names = ( + [] + if not hasattr(self, "classification_heads") + else self.classification_heads.keys() + ) + + # Handle new classification heads present in the state dict. + keys_to_delete = [] + for k in state_dict.keys(): + if not k.startswith(prefix + "classification_heads."): + continue + + head_name = k[len(prefix + "classification_heads.") :].split(".")[0] + num_classes = state_dict[ + prefix + "classification_heads." + head_name + ".out_proj.weight" + ].size(0) + inner_dim = state_dict[ + prefix + "classification_heads." 
+ head_name + ".dense.weight" + ].size(0) + + if getattr(self.args, "load_checkpoint_heads", False): + if head_name not in current_head_names: + self.register_classification_head(head_name, num_classes, inner_dim) + else: + if head_name not in current_head_names: + logger.warning( + "deleting classification head ({}) from checkpoint " + "not present in current model: {}".format(head_name, k) + ) + keys_to_delete.append(k) + elif ( + num_classes + != self.classification_heads[head_name].out_proj.out_features + or inner_dim + != self.classification_heads[head_name].dense.out_features + ): + logger.warning( + "deleting classification head ({}) from checkpoint " + "with different dimensions than current model: {}".format( + head_name, k + ) + ) + keys_to_delete.append(k) + for k in keys_to_delete: + del state_dict[k] + + def truncate_emb(key): + if key in state_dict: + state_dict[key] = state_dict[key][:-1, :] + + # When finetuning on translation task, remove last row of + # embedding matrix that corresponds to mask_idx token. + loaded_dict_size = state_dict["encoder.embed_tokens.weight"].size(0) + if ( + loaded_dict_size == len(self.encoder.dictionary) + 1 + and "" not in self.encoder.dictionary + ): + truncate_emb("encoder.embed_tokens.weight") + truncate_emb("decoder.embed_tokens.weight") + truncate_emb("encoder.output_projection.weight") + truncate_emb("decoder.output_projection.weight") + + # When continued pretraining on new set of languages for mbart, + # add extra lang embeddings at the end of embed_tokens. + # Note: newly added languages are assumed to have been added at the end. + if self.args.task == "multilingual_denoising" and loaded_dict_size < len( + self.encoder.dictionary + ): + logger.info( + "Adding extra language embeddings not found in pretrained model for " + "continued pretraining of MBART on new set of languages." 
+ ) + loaded_mask_token_embedding = state_dict["encoder.embed_tokens.weight"][ + -1, : + ] + + num_langids_to_add = len(self.encoder.dictionary) - loaded_dict_size + embed_dim = state_dict["encoder.embed_tokens.weight"].size(1) + + new_lang_embed_to_add = torch.zeros(num_langids_to_add, embed_dim) + nn.init.normal_(new_lang_embed_to_add, mean=0, std=embed_dim ** -0.5) + new_lang_embed_to_add = new_lang_embed_to_add.to( + dtype=state_dict["encoder.embed_tokens.weight"].dtype, + ) + + state_dict["encoder.embed_tokens.weight"] = torch.cat( + [ + state_dict["encoder.embed_tokens.weight"][ + : loaded_dict_size - 1, : + ], + new_lang_embed_to_add, + loaded_mask_token_embedding.unsqueeze(0), + ] + ) + state_dict["decoder.embed_tokens.weight"] = torch.cat( + [ + state_dict["decoder.embed_tokens.weight"][ + : loaded_dict_size - 1, : + ], + new_lang_embed_to_add, + loaded_mask_token_embedding.unsqueeze(0), + ] + ) + + # Copy any newly-added classification heads into the state dict + # with their current weights. + if hasattr(self, "classification_heads"): + cur_state = self.classification_heads.state_dict() + for k, v in cur_state.items(): + if prefix + "classification_heads." + k not in state_dict: + logger.info("Overwriting", prefix + "classification_heads." + k) + state_dict[prefix + "classification_heads." 
+ k] = v + + +class BARTClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim, + inner_dim, + num_classes, + activation_fn, + pooler_dropout, + do_spectral_norm=False, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.activation_fn = utils.get_activation_fn(activation_fn) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + if do_spectral_norm: + self.out_proj = torch.nn.utils.spectral_norm(self.out_proj) + + def forward(self, features, **kwargs): + x = features + x = self.dropout(x) + x = self.dense(x) + x = self.activation_fn(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@register_model_architecture("bart", "bart_large") +def bart_large_architecture(args): + args.encoder_embed_path = getattr(args, "encoder_embed_path", None) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4 * 1024) + args.encoder_layers = getattr(args, "encoder_layers", 12) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", True) + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 12) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", True) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.relu_dropout = getattr(args, 
"relu_dropout", 0.0) + args.dropout = getattr(args, "dropout", 0.1) + args.max_target_positions = getattr(args, "max_target_positions", 1024) + args.max_source_positions = getattr(args, "max_source_positions", 1024) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", True + ) + args.share_all_embeddings = getattr(args, "share_all_embeddings", True) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + args.no_scale_embedding = getattr(args, "no_scale_embedding", True) + args.layernorm_embedding = getattr(args, "layernorm_embedding", True) + + args.activation_fn = getattr(args, "activation_fn", "gelu") + args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") + args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) + + +@register_model_architecture("bart", "bart_base") +def bart_base_architecture(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4 * 768) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 12) + bart_large_architecture(args) + + +@register_model_architecture("bart", "mbart_large") +def mbart_large_architecture(args): + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + bart_large_architecture(args) + + +@register_model_architecture("bart", "mbart_base") +def mbart_base_architecture(args): + args.no_scale_embedding = getattr(args, "no_scale_embedding", False) + 
bart_base_architecture(args) + + +@register_model_architecture("bart", "mbart_base_wmt20") +def mbart_base_wmt20_architecture(args): + args.layernorm_embedding = getattr(args, "layernorm_embedding", False) + mbart_base_architecture(args) diff --git a/fairseq-0.10.2/fairseq/models/nat/__init__.py b/fairseq-0.10.2/fairseq/models/nat/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..05fe822487c3bcde8346648d5826f1669c6bc1ca --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/nat/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +"""isort:skip_file""" + +from .fairseq_nat_model import * +from .nonautoregressive_transformer import * +from .nat_crf_transformer import * +from .iterative_nonautoregressive_transformer import * +from .cmlm_transformer import * +from .levenshtein_transformer import * +from .insertion_transformer import * diff --git a/fairseq-0.10.2/fairseq/models/nat/__pycache__/cmlm_transformer.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/nat/__pycache__/cmlm_transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..883c9b7694fa0a74c06967918c6c9fd7416ace2d Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/nat/__pycache__/cmlm_transformer.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/nat/cmlm_transformer.py b/fairseq-0.10.2/fairseq/models/nat/cmlm_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c876e9453c101c00bd8e93e6e6f1fb48dc26f993 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/nat/cmlm_transformer.py @@ -0,0 +1,162 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +This file implements: +Ghazvininejad, Marjan, et al. 
+"Constant-time machine translation with conditional masked language models." +arXiv preprint arXiv:1904.09324 (2019). +""" + +from fairseq.models import register_model, register_model_architecture +from fairseq.models.nat import NATransformerModel +from fairseq.utils import new_arange + + +def _skeptical_unmasking(output_scores, output_masks, p): + sorted_index = output_scores.sort(-1)[1] + boundary_len = ( + (output_masks.sum(1, keepdim=True).type_as(output_scores) - 2) * p + ).long() + skeptical_mask = new_arange(output_masks) < boundary_len + return skeptical_mask.scatter(1, sorted_index, skeptical_mask) + + +@register_model("cmlm_transformer") +class CMLMNATransformerModel(NATransformerModel): + @staticmethod + def add_args(parser): + NATransformerModel.add_args(parser) + + def forward( + self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs + ): + assert not self.decoder.src_embedding_copy, "do not support embedding copy." + + # encoding + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + # length prediction + length_out = self.decoder.forward_length( + normalize=False, encoder_out=encoder_out + ) + length_tgt = self.decoder.forward_length_prediction( + length_out, encoder_out, tgt_tokens + ) + + # decoding + word_ins_out = self.decoder( + normalize=False, + prev_output_tokens=prev_output_tokens, + encoder_out=encoder_out, + ) + word_ins_mask = prev_output_tokens.eq(self.unk) + + return { + "word_ins": { + "out": word_ins_out, + "tgt": tgt_tokens, + "mask": word_ins_mask, + "ls": self.args.label_smoothing, + "nll_loss": True, + }, + "length": { + "out": length_out, + "tgt": length_tgt, + "factor": self.decoder.length_loss_factor, + }, + } + + def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs): + + step = decoder_out.step + max_step = decoder_out.max_step + + output_tokens = decoder_out.output_tokens + output_scores = decoder_out.output_scores + history = decoder_out.history + + # 
execute the decoder + output_masks = output_tokens.eq(self.unk) + _scores, _tokens = self.decoder( + normalize=True, + prev_output_tokens=output_tokens, + encoder_out=encoder_out, + ).max(-1) + output_tokens.masked_scatter_(output_masks, _tokens[output_masks]) + output_scores.masked_scatter_(output_masks, _scores[output_masks]) + + if history is not None: + history.append(output_tokens.clone()) + + # skeptical decoding (depend on the maximum decoding steps.) + if (step + 1) < max_step: + skeptical_mask = _skeptical_unmasking( + output_scores, output_tokens.ne(self.pad), 1 - (step + 1) / max_step + ) + + output_tokens.masked_fill_(skeptical_mask, self.unk) + output_scores.masked_fill_(skeptical_mask, 0.0) + + if history is not None: + history.append(output_tokens.clone()) + + return decoder_out._replace( + output_tokens=output_tokens, + output_scores=output_scores, + attn=None, + history=history, + ) + + +@register_model_architecture("cmlm_transformer", "cmlm_transformer") +def cmlm_base_architecture(args): + args.encoder_embed_path = getattr(args, "encoder_embed_path", None) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) + args.decoder_embed_path = getattr(args, "decoder_embed_path", None) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) + args.decoder_ffn_embed_dim = getattr( + args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim + ) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", 
False) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.attention_dropout = getattr(args, "attention_dropout", 0.0) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.activation_fn = getattr(args, "activation_fn", "relu") + args.dropout = getattr(args, "dropout", 0.1) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + args.share_all_embeddings = getattr(args, "share_all_embeddings", True) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.adaptive_input = getattr(args, "adaptive_input", False) + args.apply_bert_init = getattr(args, "apply_bert_init", False) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + + # --- special arguments --- + args.sg_length_pred = getattr(args, "sg_length_pred", False) + args.pred_length_offset = getattr(args, "pred_length_offset", False) + args.length_loss_factor = getattr(args, "length_loss_factor", 0.1) + args.ngram_predictor = getattr(args, "ngram_predictor", 1) + args.src_embedding_copy = getattr(args, "src_embedding_copy", False) + + +@register_model_architecture("cmlm_transformer", "cmlm_transformer_wmt_en_de") +def cmlm_wmt_en_de(args): + cmlm_base_architecture(args) diff --git a/fairseq-0.10.2/fairseq/models/nat/fairseq_nat_model.py b/fairseq-0.10.2/fairseq/models/nat/fairseq_nat_model.py new file mode 100644 index 0000000000000000000000000000000000000000..1dbc29d0f49697329f50bbea9ee15bda0010f069 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/nat/fairseq_nat_model.py @@ -0,0 +1,159 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
def ensemble_encoder(func):
    """Decorate an encoder ``forward`` so it can run a whole ensemble.

    When the owning object has no ensemble (``ensemble_models`` is ``None``)
    or an ensemble of exactly one model, the wrapped function is called on the
    object itself.  Otherwise it is called once per ensemble member and the
    ``encoder_out``, ``encoder_embedding`` and ``encoder_states`` fields of the
    results are stacked along a new trailing dimension (a field stays ``None``
    when the first member returned ``None`` for it).  All remaining fields are
    taken from the first member's output.
    """

    def wrapper(self, *args, **kwargs):
        members = self.ensemble_models
        if members is None or len(members) == 1:
            return func(self, *args, **kwargs)

        per_member = []
        for member in members:
            per_member.append(func(member, *args, **kwargs))

        stacked = {}
        for field in ("encoder_out", "encoder_embedding", "encoder_states"):
            values = [getattr(result, field) for result in per_member]
            if values[0] is None:
                stacked[field] = None
            else:
                stacked[field] = torch.stack(values, -1)

        return per_member[0]._replace(**stacked)

    return wrapper
class FairseqNATModel(TransformerModel):
    """
    Abstract class for all nonautoregressive-based models.

    On construction it caches the decoder dictionary and its special-token
    indices (``bos``, ``eos``, ``pad``, ``unk``).  Subclasses are expected to
    override :meth:`forward`, :meth:`forward_decoder` and
    :meth:`initialize_output_tokens`; the versions defined here raise
    ``NotImplementedError``.
    """

    def __init__(self, args, encoder, decoder):
        super().__init__(args, encoder, decoder)
        self.tgt_dict = decoder.dictionary
        self.bos = decoder.dictionary.bos()
        self.eos = decoder.dictionary.eos()
        self.pad = decoder.dictionary.pad()
        self.unk = decoder.dictionary.unk()

        self.ensemble_models = None

    @property
    def allow_length_beam(self):
        """Always ``False`` for the base class; subclasses may override."""
        return False

    @property
    def allow_ensemble(self):
        """Always ``True`` for the base class; subclasses may override."""
        return True

    def enable_ensemble(self, models):
        """Point this model's encoder and decoder at the members of ``models``.

        The encoder receives the list of every member's ``encoder`` and the
        decoder the list of every member's ``decoder``.
        """
        self.encoder.ensemble_models = [m.encoder for m in models]
        self.decoder.ensemble_models = [m.decoder for m in models]

    @staticmethod
    def add_args(parser):
        """Add the ``TransformerModel`` options plus ``--apply-bert-init``."""
        TransformerModel.add_args(parser)
        parser.add_argument(
            "--apply-bert-init",
            action="store_true",
            help="use custom param initialization for BERT",
        )

    @classmethod
    def build_decoder(cls, args, tgt_dict, embed_tokens):
        """Build a ``FairseqNATDecoder``, BERT-initialised when requested."""
        decoder = FairseqNATDecoder(args, tgt_dict, embed_tokens)
        if getattr(args, "apply_bert_init", False):
            decoder.apply(init_bert_params)
        return decoder

    @classmethod
    def build_encoder(cls, args, src_dict, embed_tokens):
        """Build a ``FairseqNATEncoder``, BERT-initialised when requested."""
        encoder = FairseqNATEncoder(args, src_dict, embed_tokens)
        if getattr(args, "apply_bert_init", False):
            encoder.apply(init_bert_params)
        return encoder

    def forward_encoder(self, encoder_inputs):
        """Run the encoder on the unpacked ``encoder_inputs`` sequence."""
        return self.encoder(*encoder_inputs)

    # The three hooks below previously *returned* the NotImplementedError
    # class, so a subclass that forgot to override one silently handed an
    # exception type back to its caller.  They now raise instead.

    def forward_decoder(self, *args, **kwargs):
        """Decoding step; must be provided by the subclass."""
        raise NotImplementedError

    def initialize_output_tokens(self, *args, **kwargs):
        """Initial decoder canvas; must be provided by the subclass."""
        raise NotImplementedError

    def forward(self, *args, **kwargs):
        """Training forward pass; must be provided by the subclass."""
        raise NotImplementedError
class NegativeDistanceScore(object):
    """Soft weights that favour the tokens closest to the centre of a span.

    For a span of ``L`` positions ``0 .. L - 1`` the weight of position ``i``
    is ``softmax_i(-|(L - 1) / 2 - i| / tau)``.  Tables for ``tau`` in
    ``{0.5, 1.0, 2.0}`` are precomputed at construction time; any other
    combination is computed on demand by :meth:`compute_score`.
    """

    def __init__(self):

        # pre-compute some values
        self.scores = {}

        self.scores[0.5] = self.compute_score_full(50, 0.5)
        self.scores[1.0] = self.compute_score_full(50, 1.0)
        self.scores[2.0] = self.compute_score_full(50, 2.0)

    def __call__(self, i, L, tau):
        """Return the weight of position ``i`` inside a span of length ``L``.

        ``tau`` of ``None`` (or anything above 1000) yields the uniform weight
        ``1 / L``.
        """
        if (tau is None) or (tau > 1000):
            return 1 / L

        if tau in self.scores:
            table = self.scores[tau]
            # Row ``L - 1`` of the table holds the weights for a span of
            # length ``L``.
            if L < table.shape[0]:
                return table[L - 1, i]
        return self.compute_score(L, tau)[i]

    def compute_score(self, L, tau):
        """Return the ``L`` weights of a span of length ``L`` as a 1-D array.

        The span is centred at ``(L - 1) / 2`` so the weights are symmetric
        and agree with the rows produced by :meth:`compute_score_full`.  The
        previous ``L / 2`` centre was shifted by half a position, which skewed
        the weights towards the end of the span and made ``__call__`` return
        different values depending on whether the lookup table was hit.
        """
        s = -np.abs((L - 1) / 2 - np.arange(L)) / tau
        s = np.exp(s - s.max())
        return s / s.sum()

    def compute_score_full(self, L, tau):
        """Return a ``(L - 1, L)`` table of weights for every span length.

        Row ``r`` describes a span of ``r + 1`` positions: columns ``0 .. r``
        carry the normalised weights and every later column is exactly 0.
        """
        rows = np.arange(0, L - 1)[:, None]
        cols = np.arange(L)[None, :]
        s = -np.abs(rows / 2 - cols) / tau
        # Positions outside the span get -inf so they vanish after exp().
        s = np.where(cols <= rows, s, -np.inf)
        s = np.exp(s - s.max(1, keepdims=True))
        return s / s.sum(1, keepdims=True)
@register_model("insertion_transformer")
class InsertionTransformerModel(LevenshteinTransformerModel):
    """Model registered as ``insertion_transformer``.

    Training produces a single ``word_ins`` loss term; decoding performs one
    round of word insertion per call to :meth:`forward_decoder`.
    """

    def __init__(self, args, encoder, decoder):
        super().__init__(args, encoder, decoder)

    @staticmethod
    def add_args(parser):
        """Register the ``FairseqNATModel`` options plus ``--label-tau``."""
        FairseqNATModel.add_args(parser)
        parser.add_argument("--label-tau", default=None, type=float)

    @classmethod
    def build_decoder(cls, args, tgt_dict, embed_tokens):
        """Build an ``InsertionTransformerDecoder``, BERT-initialised on request."""
        insertion_decoder = InsertionTransformerDecoder(args, tgt_dict, embed_tokens)
        if getattr(args, "apply_bert_init", False):
            insertion_decoder.apply(init_bert_params)
        return insertion_decoder

    def forward(
        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
    ):
        """Return the ``word_ins`` loss inputs for one training batch."""

        assert tgt_tokens is not None, "forward function only supports training."

        # encoding
        enc = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)

        # un-normalised insertion scores for every slot of the current canvas
        slot_logits = self.decoder.forward_word_ins(
            normalize=False,
            prev_output_tokens=prev_output_tokens,
            encoder_out=enc,
        )

        # generate training labels for insertion
        slot_targets = _get_ins_targets(
            prev_output_tokens,
            tgt_tokens,
            self.pad,
            self.unk,
            len(self.tgt_dict),
            tau=self.decoder.label_tau,
        )

        word_ins = {
            "out": slot_logits,
            "tgt": slot_targets.type_as(slot_logits),
            "mask": prev_output_tokens[:, 1:].ne(self.pad),
            "ls": self.args.label_smoothing,
            "nll_loss": True,
        }
        return {"word_ins": word_ins}

    def forward_decoder(
        self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs
    ):
        """Insert the best-scoring word into every slot of the current output.

        When ``eos_penalty`` is positive it is subtracted from the score of
        the ``pad`` index before the arg-max.  ``max_ratio`` is accepted but
        not used here.
        """

        canvas = decoder_out.output_tokens
        canvas_scores = decoder_out.output_scores
        history = decoder_out.history

        # TODO: decoding for InsertionTransformer
        slot_log_probs = self.decoder.forward_word_ins(
            normalize=True, prev_output_tokens=canvas, encoder_out=encoder_out
        )

        if eos_penalty > 0.0:
            slot_log_probs[:, :, self.pad] -= eos_penalty
        best_scores, best_words = slot_log_probs.max(-1)
        canvas, canvas_scores = _apply_ins_words(
            canvas, canvas_scores, best_words, best_scores, self.pad
        )

        # delete some unnecessary paddings
        longest = canvas.ne(self.pad).sum(1).max()
        canvas = canvas[:, :longest]
        canvas_scores = canvas_scores[:, :longest]

        if history is not None:
            history.append(canvas.clone())

        return decoder_out._replace(
            output_tokens=canvas,
            output_scores=canvas_scores,
            attn=None,
            history=history,
        )
@register_model_architecture("insertion_transformer", "insertion_transformer")
def insertion_base_architecture(args):
    """Fill ``args`` in place with the base ``insertion_transformer`` defaults.

    Options already present on ``args`` are preserved.  Defaults that are
    derived from other options are resolved between the constant tables so
    the original assignment order is kept.
    """
    leading_defaults = (
        ("encoder_embed_path", None),
        ("encoder_embed_dim", 512),
        ("encoder_ffn_embed_dim", 2048),
        ("encoder_layers", 6),
        ("encoder_attention_heads", 8),
        ("encoder_normalize_before", False),
        ("encoder_learned_pos", False),
        ("decoder_embed_path", None),
    )
    for option, fallback in leading_defaults:
        setattr(args, option, getattr(args, option, fallback))

    # Decoder widths default to the encoder widths resolved just above.
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
    args.decoder_ffn_embed_dim = getattr(
        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
    )

    trailing_defaults = (
        ("decoder_layers", 6),
        ("decoder_attention_heads", 8),
        ("decoder_normalize_before", False),
        ("decoder_learned_pos", False),
        ("attention_dropout", 0.0),
        ("activation_dropout", 0.0),
        ("activation_fn", "relu"),
        ("dropout", 0.1),
        ("adaptive_softmax_cutoff", None),
        ("adaptive_softmax_dropout", 0),
        ("share_decoder_input_output_embed", False),
        ("share_all_embeddings", False),
        ("no_token_positional_embeddings", False),
        ("adaptive_input", False),
        ("apply_bert_init", False),
    )
    for option, fallback in trailing_defaults:
        setattr(args, option, getattr(args, option, fallback))

    # Input/output projection sizes follow the resolved decoder width.
    args.decoder_output_dim = getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)

    # special for insertion transformer
    args.label_tau = getattr(args, "label_tau", None)
def _sequential_poisoning(s, V, beta=0.33, bos=2, eos=3, pad=1):
    """Corrupt a batch of token sequences in place and return it.

    Every position that is not ``pad``, ``bos`` or ``eos`` draws one uniform
    number and, with total probability ``beta`` split into three equal parts,
    is either

    * replaced by a random word id drawn from ``[4, V)``,
    * repeated into the following position, or
    * swapped with the following position.

    Positions are processed left to right.  A position whose successor is the
    ``eos`` token is never repeated or swapped, so ``eos`` stays in place.

    Args:
        s: 2-D integer tensor of token ids; it is modified in place.
        V: vocabulary size (exclusive upper bound for random replacements).
        beta: total corruption probability per eligible position.
        bos, eos, pad: ids of the special tokens that must not be corrupted.

    Returns:
        ``s`` itself, after corruption.
    """
    # NOTE(review): low=4 presumably skips the special symbols at the start of
    # the vocabulary -- confirm against the dictionary layout in use.
    rand_words = torch.randint(low=4, high=V, size=s.size(), device=s.device)
    choices = torch.rand(size=s.size(), device=s.device)
    # A draw of 1 is >= beta, i.e. special tokens always land in "safe".
    choices.masked_fill_((s == pad) | (s == bos) | (s == eos), 1)

    replace = choices < beta / 3
    repeat = (choices >= beta / 3) & (choices < beta * 2 / 3)
    swap = (choices >= beta * 2 / 3) & (choices < beta)
    safe = choices >= beta

    for i in range(s.size(1) - 1):
        rand_word = rand_words[:, i]
        next_word = s[:, i + 1]
        self_word = s[:, i]
        # Use the caller-supplied eos id; it used to be hard-coded as 3.
        next_is_eos = next_word == eos

        replace_i = replace[:, i]
        swap_i = swap[:, i] & ~next_is_eos
        repeat_i = repeat[:, i] & ~next_is_eos
        safe_i = safe[:, i] | (next_is_eos & ~replace_i)

        # ``self_word`` and ``next_word`` are views into ``s``.  Both new
        # columns are computed before either is written back; otherwise the
        # second expression would read the freshly overwritten column ``i``
        # and a "swap" would merely duplicate the next word.
        new_self = (
            self_word * (safe_i | repeat_i).long()
            + next_word * swap_i.long()
            + rand_word * replace_i.long()
        )
        new_next = (
            next_word * (safe_i | replace_i).long()
            + self_word * (swap_i | repeat_i).long()
        )
        s[:, i] = new_self
        s[:, i + 1] = new_next
    return s
model.train_step = getattr(args, "train_step", 4) + model.dae_ratio = getattr(args, "dae_ratio", 0.5) + model.stochastic_approx = getattr(args, "stochastic_approx", False) + return model + + def forward( + self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs + ): + + B, T = prev_output_tokens.size() + + # encoding + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + + # length prediction + length_out = self.decoder.forward_length( + normalize=False, encoder_out=encoder_out + ) + length_tgt = self.decoder.forward_length_prediction( + length_out, encoder_out, tgt_tokens + ) + + # decoding + word_ins_outs, word_ins_tgts, word_ins_masks = [], [], [] + for t in range(self.train_step): + word_ins_out = self.decoder( + normalize=False, + prev_output_tokens=prev_output_tokens, + encoder_out=encoder_out, + step=t, + ) + word_ins_tgt = tgt_tokens + word_ins_mask = word_ins_tgt.ne(self.pad) + + word_ins_outs.append(word_ins_out) + word_ins_tgts.append(word_ins_tgt) + word_ins_masks.append(word_ins_mask) + + if t < (self.train_step - 1): + # prediction for next iteration + if self.stochastic_approx: + word_ins_prediction = ( + word_ins_out + gumbel_noise(word_ins_out) + ).max(-1)[1] + else: + word_ins_prediction = word_ins_out.max(-1)[1] + + prev_output_tokens = prev_output_tokens.masked_scatter( + word_ins_mask, word_ins_prediction[word_ins_mask] + ) + + if self.dae_ratio > 0: + # we do not perform denoising for the first iteration + corrputed = ( + torch.rand(size=(B,), device=prev_output_tokens.device) + < self.dae_ratio + ) + corrputed_tokens = _sequential_poisoning( + tgt_tokens[corrputed], + len(self.tgt_dict), + 0.33, + self.bos, + self.eos, + self.pad, + ) + prev_output_tokens[corrputed] = corrputed_tokens + + # concat everything + word_ins_out = torch.cat(word_ins_outs, 0) + word_ins_tgt = torch.cat(word_ins_tgts, 0) + word_ins_mask = torch.cat(word_ins_masks, 0) + + return { + "word_ins": { + "out": word_ins_out, + "tgt": 
@register_model_architecture(
    "iterative_nonautoregressive_transformer", "iterative_nonautoregressive_transformer"
)
def inat_base_architecture(args):
    """Fill ``args`` in place with the base iterative-NAT defaults.

    Only options missing from ``args`` are assigned.  Some defaults are read
    from options resolved earlier in this function, hence the fixed order.
    """

    def default(option, value):
        # Equivalent to ``args.<option> = getattr(args, "<option>", value)``.
        setattr(args, option, getattr(args, option, value))

    default("encoder_embed_path", None)
    default("encoder_embed_dim", 512)
    default("encoder_ffn_embed_dim", 2048)
    default("encoder_layers", 6)
    default("encoder_attention_heads", 8)
    default("encoder_normalize_before", False)
    default("encoder_learned_pos", False)

    default("decoder_embed_path", None)
    # Decoder widths default to the encoder widths resolved above.
    default("decoder_embed_dim", args.encoder_embed_dim)
    default("decoder_ffn_embed_dim", args.encoder_ffn_embed_dim)
    default("decoder_layers", 6)
    default("decoder_attention_heads", 8)
    default("decoder_normalize_before", False)
    default("decoder_learned_pos", False)

    default("attention_dropout", 0.0)
    default("activation_dropout", 0.0)
    default("activation_fn", "relu")
    default("dropout", 0.1)
    default("adaptive_softmax_cutoff", None)
    default("adaptive_softmax_dropout", 0)

    default("share_decoder_input_output_embed", False)
    default("share_all_embeddings", False)
    default("no_token_positional_embeddings", False)
    default("adaptive_input", False)
    default("apply_bert_init", False)

    default("decoder_output_dim", args.decoder_embed_dim)
    default("decoder_input_dim", args.decoder_embed_dim)

    # --- special arguments ---
    default("sg_length_pred", False)
    default("pred_length_offset", False)
    default("length_loss_factor", 0.1)
    default("ngram_predictor", 1)
    default("src_embedding_copy", False)

    # iterative refinement
    default("train_step", 4)
    default("dae_ratio", 0.5)
    default("stochastic_approx", False)
fall back to CPU version") + + try: + from fairseq import libnat + + return libnat, False + + except ImportError as e: + import sys + + sys.stderr.write( + "ERROR: missing libnat_cuda. run `python setup.py build_ext --inplace`\n" + ) + raise e + + +def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx): + libnat, use_cuda = load_libnat() + + def _get_ins_targets_cuda(in_tokens, out_tokens, padding_idx, unk_idx): + in_masks = in_tokens.ne(padding_idx) + out_masks = out_tokens.ne(padding_idx) + mask_ins_targets, masked_tgt_masks = libnat.generate_insertion_labels( + out_tokens.int(), + libnat.levenshtein_distance( + in_tokens.int(), + out_tokens.int(), + in_masks.sum(1).int(), + out_masks.sum(1).int(), + ), + ) + masked_tgt_masks = masked_tgt_masks.bool() & out_masks + mask_ins_targets = mask_ins_targets.type_as(in_tokens)[ + :, 1 : in_masks.size(1) + ].masked_fill_(~in_masks[:, 1:], 0) + masked_tgt_tokens = out_tokens.masked_fill(masked_tgt_masks, unk_idx) + return masked_tgt_masks, masked_tgt_tokens, mask_ins_targets + + def _get_ins_targets_cpu(in_tokens, out_tokens, padding_idx, unk_idx): + in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1) + + in_tokens_list = [ + [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) + ] + out_tokens_list = [ + [t for t in s if t != padding_idx] + for i, s in enumerate(out_tokens.tolist()) + ] + + full_labels = libnat.suggested_ed2_path( + in_tokens_list, out_tokens_list, padding_idx + ) + mask_inputs = [ + [len(c) if c[0] != padding_idx else 0 for c in a[:-1]] for a in full_labels + ] + + # generate labels + masked_tgt_masks = [] + for mask_input in mask_inputs: + mask_label = [] + for beam_size in mask_input[1:-1]: # HACK 1:-1 + mask_label += [0] + [1 for _ in range(beam_size)] + masked_tgt_masks.append( + mask_label + [0 for _ in range(out_seq_len - len(mask_label))] + ) + mask_ins_targets = [ + mask_input[1:-1] + + [0 for _ in range(in_seq_len - 1 - len(mask_input[1:-1]))] + for 
def _get_del_targets(in_tokens, out_tokens, padding_idx):
    """Compute per-token deletion labels that turn ``in_tokens`` into ``out_tokens``.

    Uses the CUDA implementation of ``libnat`` when it can be imported and the
    CPU implementation otherwise.

    Args:
        in_tokens: 2-D tensor of current token ids, padded with ``padding_idx``.
        out_tokens: 2-D tensor of reference token ids, padded the same way.
        padding_idx: id of the padding symbol.

    Returns:
        A tensor with one label per position of ``in_tokens``; padded
        positions are labelled 0.
    """
    libnat, use_cuda = load_libnat()

    def _get_del_targets_cuda(in_tokens, out_tokens, padding_idx):
        in_masks = in_tokens.ne(padding_idx)
        out_masks = out_tokens.ne(padding_idx)

        word_del_targets = libnat.generate_deletion_labels(
            in_tokens.int(),
            libnat.levenshtein_distance(
                in_tokens.int(),
                out_tokens.int(),
                in_masks.sum(1).int(),
                out_masks.sum(1).int(),
            ),
        )
        # Padded input positions never carry a deletion label.
        word_del_targets = word_del_targets.type_as(in_tokens).masked_fill_(
            ~in_masks, 0
        )
        return word_del_targets

    def _get_del_targets_cpu(in_tokens, out_tokens, padding_idx):
        # Labels describe the *input* tokens, so rows are padded to the input
        # width (as in the CUDA path, which masks with ``~in_masks``).  Padding
        # to the output width only worked when both widths were equal.
        in_seq_len = in_tokens.size(1)
        with torch.cuda.device_of(in_tokens):
            in_tokens_list = [
                [t for t in s if t != padding_idx] for s in in_tokens.tolist()
            ]
            out_tokens_list = [
                [t for t in s if t != padding_idx] for s in out_tokens.tolist()
            ]

        full_labels = libnat.suggested_ed2_path(
            in_tokens_list, out_tokens_list, padding_idx
        )
        # The last entry of every path holds the deletion labels.
        word_del_targets = [b[-1] for b in full_labels]
        word_del_targets = [
            labels + [0] * (in_seq_len - len(labels)) for labels in word_del_targets
        ]

        # transform to tensor
        word_del_targets = torch.tensor(word_del_targets, device=out_tokens.device)
        return word_del_targets

    if use_cuda:
        return _get_del_targets_cuda(in_tokens, out_tokens, padding_idx)
    return _get_del_targets_cpu(in_tokens, out_tokens, padding_idx)
def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, unk_idx):
    """Fill every ``unk_idx`` placeholder with the predicted word.

    Positions of ``in_tokens`` equal to ``unk_idx`` take the value found at the
    same position of ``word_ins_pred``; all other positions are kept.  When
    ``in_scores`` is given, the same positions take their value from
    ``word_ins_scores``.  The inputs are not modified.

    Returns:
        ``(out_tokens, out_scores)`` where ``out_scores`` is ``None`` if
        ``in_scores`` is ``None``.
    """
    placeholders = in_tokens.eq(unk_idx)
    out_tokens = in_tokens.masked_scatter(placeholders, word_ins_pred[placeholders])

    if in_scores is None:
        return out_tokens, None

    predicted_scores = word_ins_scores[placeholders]
    return out_tokens, in_scores.masked_scatter(placeholders, predicted_scores)
def _skip(x, mask):
    """
    Select the entries of ``x`` picked by ``mask``.

    Integers and ``None`` are returned unchanged.  A tensor is indexed with
    ``mask`` along dim 0 when its first size matches ``mask``, otherwise along
    dim 1 when its second size matches.  Lists and dicts are processed
    element-wise, recursively.  Anything else, including a tensor that matches
    on neither dimension, raises ``NotImplementedError``.
    """
    if x is None or isinstance(x, int):
        return x

    if isinstance(x, torch.Tensor):
        if x.size(0) == mask.size(0):
            return x[mask]
        if x.size(1) == mask.size(0):
            return x[:, mask]
        # neither leading dimension lines up with the mask
        raise NotImplementedError

    if isinstance(x, dict):
        return {key: _skip(value, mask) for key, value in x.items()}

    if isinstance(x, list):
        return [_skip(item, mask) for item in x]

    raise NotImplementedError
+ """ + if x is None: + return y + assert x.dim() == y.dim() and mask.size(0) == x.size(0) + assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2)) + n_selected = mask.sum() + assert n_selected == y.size(0) + + if n_selected == x.size(0): + return y + + if x.size(1) < y.size(1): + dims = [x.size(0), y.size(1) - x.size(1)] + if x.dim() == 3: + dims.append(x.size(2)) + x = torch.cat([x, x.new_zeros(*dims).fill_(padding_idx)], 1) + x[mask] = y + elif x.size(1) > y.size(1): + x[mask] = padding_idx + if x.dim() == 2: + x[mask, : y.size(1)] = y + else: + x[mask, : y.size(1), :] = y + else: + x[mask] = y + return x diff --git a/fairseq-0.10.2/fairseq/models/nat/nat_crf_transformer.py b/fairseq-0.10.2/fairseq/models/nat/nat_crf_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d4b3cd931ceb077eb30db73df1d5d6cd714a86c2 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/nat/nat_crf_transformer.py @@ -0,0 +1,121 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
@register_model("nacrf_transformer")
class NACRFTransformerModel(NATransformerModel):
    """NAT model whose decoder outputs are scored and decoded by a ``DynamicCRF``.

    Training returns three loss terms (``word_ins``, ``word_crf`` and
    ``length``); decoding runs the CRF layer's ``forward_decoder`` over the
    decoder outputs.
    """

    def __init__(self, args, encoder, decoder):
        super().__init__(args, encoder, decoder)
        vocab_size = len(self.tgt_dict)
        self.crf_layer = DynamicCRF(
            num_embedding=vocab_size,
            low_rank=args.crf_lowrank_approx,
            beam_size=args.crf_beam_approx,
        )

    @property
    def allow_ensemble(self):
        """This model always reports ``False`` here."""
        return False

    @staticmethod
    def add_args(parser):
        """Register the ``NATransformerModel`` options plus the CRF options."""
        NATransformerModel.add_args(parser)
        crf_options = (
            (
                "--crf-lowrank-approx",
                int,
                "the dimension of low-rank approximation of transition",
            ),
            (
                "--crf-beam-approx",
                int,
                "the beam size for apporixmating the normalizing factor",
            ),
            (
                "--word-ins-loss-factor",
                float,
                "weights on NAT loss used to co-training with CRF loss.",
            ),
        )
        for flag, value_type, description in crf_options:
            parser.add_argument(flag, type=value_type, help=description)

    def forward(
        self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
    ):
        """Return the loss inputs (word, CRF and length) for a training batch."""
        # encoding
        enc = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)

        # length prediction
        length_logits = self.decoder.forward_length(normalize=False, encoder_out=enc)
        length_target = self.decoder.forward_length_prediction(
            length_logits, enc, tgt_tokens
        )

        # decoding
        emissions = self.decoder(
            normalize=False,
            prev_output_tokens=prev_output_tokens,
            encoder_out=enc,
        )
        target_mask = tgt_tokens.ne(self.pad)

        # compute the log-likelihood of CRF, normalised by the number of
        # non-pad target tokens of each sentence, then averaged over the batch
        log_likelihood = self.crf_layer(emissions, tgt_tokens, target_mask)
        tokens_per_sentence = target_mask.type_as(log_likelihood).sum(-1)
        crf_nll = (-log_likelihood / tokens_per_sentence).mean()

        word_ins = {
            "out": emissions,
            "tgt": tgt_tokens,
            "mask": target_mask,
            "ls": self.args.label_smoothing,
            "nll_loss": True,
            "factor": self.args.word_ins_loss_factor,
        }
        length = {
            "out": length_logits,
            "tgt": length_target,
            "factor": self.decoder.length_loss_factor,
        }
        return {"word_ins": word_ins, "word_crf": {"loss": crf_nll}, "length": length}

    def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs):
        """Overwrite every non-pad output position with the CRF decoding result.

        ``output_tokens`` and ``output_scores`` of ``decoder_out`` are updated
        in place.  ``decoding_format`` is accepted but not used here.
        """
        tokens = decoder_out.output_tokens
        scores = decoder_out.output_scores
        history = decoder_out.history

        # execute the decoder and get emission scores
        non_pad = tokens.ne(self.pad)
        emissions = self.decoder(
            normalize=False, prev_output_tokens=tokens, encoder_out=encoder_out
        )

        # run viterbi decoding through CRF
        best_scores, best_tokens = self.crf_layer.forward_decoder(emissions, non_pad)
        tokens.masked_scatter_(non_pad, best_tokens[non_pad])
        scores.masked_scatter_(non_pad, best_scores[non_pad])
        if history is not None:
            history.append(tokens.clone())

        return decoder_out._replace(
            output_tokens=tokens,
            output_scores=scores,
            attn=None,
            history=history,
        )
class EnsembleLevT(BasicEnsembleModel):
    """Ensemble wrapper that decodes with several Levenshtein Transformers.

    One refinement step is a pipeline of three sub-steps: word deletion,
    placeholder insertion and word insertion. Because each sub-step consumes
    the output of the previous one, the member models' predictions are
    averaged inside every sub-step rather than once at the end.
    """

    def __init__(self, models):
        super().__init__(models)

    @staticmethod
    def _average_log_probs(log_probs):
        """Average a list of per-model log-probabilities in probability space."""
        stacked = torch.stack(log_probs, dim=0)
        return torch.logsumexp(stacked, dim=0) - math.log(len(log_probs))

    @staticmethod
    def _average_attn(attns):
        """Return the mean of the per-model attention tensors, or None.

        The list is treated as "no attention" when its first entry is None.
        """
        if attns[0] is None:
            return None
        # Reduce over the model axis; stacking alone would leave an extra
        # leading dimension instead of an average.
        return torch.stack(attns, dim=0).mean(dim=0)

    @torch.no_grad()
    def forward_decoder(
        self, decoder_out, encoder_outs, eos_penalty=0.0, max_ratio=None, **kwargs
    ):
        """Run one deletion / placeholder / word-insertion refinement step.

        Args:
            decoder_out: current hypothesis (``output_tokens``,
                ``output_scores``, ``attn``).
            encoder_outs: one encoder output per ensemble member.
            eos_penalty: value subtracted from the score of index 0 of the
                placeholder-insertion distribution when positive.
            max_ratio: if given, the per-sentence length cap is
                ``source_length * max_ratio`` (at least 10); otherwise 255.

        Returns:
            ``decoder_out`` with updated tokens, scores and attention, trimmed
            to the longest non-pad length; ``history`` is reset to None.
        """
        output_tokens = decoder_out.output_tokens
        output_scores = decoder_out.output_scores
        attn = decoder_out.attn

        bsz = output_tokens.size(0)
        if max_ratio is None:
            # One cap per sentence: max_lens is later compared against the
            # per-sentence token counts and indexed with a batch mask.
            max_lens = output_tokens.new_full((bsz,), 255)
        else:
            first_encoder_out = encoder_outs[0]
            if first_encoder_out.encoder_padding_mask is None:
                # encoder_out is laid out T x B x C, so the (unpadded) source
                # length is dimension 0.
                src_lens = first_encoder_out.encoder_out.new_full(
                    (bsz,), first_encoder_out.encoder_out.size(0)
                )
            else:
                src_lens = (~first_encoder_out.encoder_padding_mask).sum(1)
            max_lens = (src_lens * max_ratio).clamp(min=10).long()

        # 1) delete words; sentences with at most two non-pad tokens are
        #    left alone (presumably just the BOS/EOS pair).
        can_del_word = output_tokens.ne(self.pad).sum(1) > 2
        if can_del_word.sum() != 0:  # skip when nothing can be deleted
            output_tokens, output_scores, attn = self.forward_word_del(
                encoder_outs,
                output_tokens,
                output_scores,
                attn,
                can_del_word,
            )

        # 2) insert placeholders into sentences still below their length cap
        can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens
        if can_ins_mask.sum() != 0:
            output_tokens, output_scores = self.forward_mask_ins(
                encoder_outs,
                output_tokens,
                output_scores,
                can_ins_mask,
                eos_penalty,
                max_lens,
            )

        # 3) fill placeholders (unk tokens) with words
        can_ins_word = output_tokens.eq(self.unk).sum(1) > 0
        if can_ins_word.sum() != 0:
            output_tokens, output_scores, attn = self.forward_word_ins(
                encoder_outs,
                output_tokens,
                output_scores,
                attn,
                can_ins_word,
            )

        # drop columns that are padding in every sentence
        cut_off = output_tokens.ne(self.pad).sum(1).max()
        output_tokens = output_tokens[:, :cut_off]
        output_scores = output_scores[:, :cut_off]
        attn = None if attn is None else attn[:, :cut_off, :]
        return decoder_out._replace(
            output_tokens=output_tokens,
            output_scores=output_scores,
            attn=attn,
            history=None,
        )

    def forward_word_del(
        self, encoder_outs, output_tokens, output_scores, attn, can_del_word
    ):
        """Apply the ensemble-averaged deletion decision to selected sentences.

        Only rows where ``can_del_word`` is set are sent through the models;
        the results are written back into the full batch with ``_fill``.
        """
        del_scores = []
        del_attns = []
        for model, encoder_out in zip(self.models, encoder_outs):
            word_del_out, word_del_attn = model.decoder.forward_word_del(
                _skip(output_tokens, can_del_word),
                _skip_encoder_out(model.encoder, encoder_out, can_del_word),
            )
            del_scores.append(F.log_softmax(word_del_out, 2))
            del_attns.append(word_del_attn)

        word_del_score_avg = self._average_log_probs(del_scores)
        word_del_pred = word_del_score_avg.max(-1)[1].bool()
        word_del_attn_avg = self._average_attn(del_attns)

        _tokens, _scores, _attn = _apply_del_words(
            output_tokens[can_del_word],
            output_scores[can_del_word],
            word_del_attn_avg,
            word_del_pred,
            self.pad,
            self.bos,
            self.eos,
        )
        output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad)
        output_scores = _fill(output_scores, can_del_word, _scores, 0)
        attn = _fill(attn, can_del_word, _attn, 0.0)
        return output_tokens, output_scores, attn

    def forward_mask_ins(
        self,
        encoder_outs,
        output_tokens,
        output_scores,
        can_ins_mask,
        eos_penalty,
        max_lens,
    ):
        """Insert placeholders using the ensemble-averaged insertion counts.

        The predicted number of placeholders per slot is capped by the
        sentence's entry in ``max_lens``.
        """
        ins_scores = []
        for model, encoder_out in zip(self.models, encoder_outs):
            mask_ins_out, _ = model.decoder.forward_mask_ins(
                _skip(output_tokens, can_ins_mask),
                _skip_encoder_out(model.encoder, encoder_out, can_ins_mask),
            )
            mask_ins_score = F.log_softmax(mask_ins_out, 2)
            if eos_penalty > 0.0:
                mask_ins_score[:, :, 0] -= eos_penalty
            ins_scores.append(mask_ins_score)

        mask_ins_score_avg = self._average_log_probs(ins_scores)
        mask_ins_pred = mask_ins_score_avg.max(-1)[1]
        mask_ins_pred = torch.min(
            mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred)
        )
        _tokens, _scores = _apply_ins_masks(
            output_tokens[can_ins_mask],
            output_scores[can_ins_mask],
            mask_ins_pred,
            self.pad,
            self.unk,
            self.eos,
        )
        output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad)
        output_scores = _fill(output_scores, can_ins_mask, _scores, 0)
        return output_tokens, output_scores

    def forward_word_ins(
        self, encoder_outs, output_tokens, output_scores, attn, can_ins_word
    ):
        """Fill placeholders with the ensemble-averaged word predictions.

        The attention written back is the average over all members, not the
        attention of whichever model happened to run last.
        """
        ins_scores = []
        ins_attns = []
        for model, encoder_out in zip(self.models, encoder_outs):
            word_ins_out, word_ins_attn = model.decoder.forward_word_ins(
                _skip(output_tokens, can_ins_word),
                _skip_encoder_out(model.encoder, encoder_out, can_ins_word),
            )
            ins_scores.append(F.log_softmax(word_ins_out, 2))
            ins_attns.append(word_ins_attn)

        word_ins_score_avg = self._average_log_probs(ins_scores)
        word_ins_attn_avg = self._average_attn(ins_attns)
        word_ins_score_max, word_ins_pred = word_ins_score_avg.max(-1)

        _tokens, _scores = _apply_ins_words(
            output_tokens[can_ins_word],
            output_scores[can_ins_word],
            word_ins_pred,
            word_ins_score_max,
            self.unk,
        )

        output_tokens = _fill(output_tokens, can_ins_word, _tokens, self.pad)
        output_scores = _fill(output_scores, can_ins_word, _scores, 0)
        attn = _fill(attn, can_ins_word, word_ins_attn_avg, 0.0)
        return output_tokens, output_scores, attn

    def initialize_output_tokens(self, encoder_outs, src_tokens):
        """Delegate to the first member model; LevT does no length prediction."""
        return self.models[0].initialize_output_tokens(encoder_outs[0], src_tokens)
def forward(
    self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs
):
    """Run encoder, length predictor and decoder for a training batch.

    Returns:
        dict with a ``"word_ins"`` entry (decoder logits, target tokens,
        non-pad mask, label smoothing, ``nll_loss`` flag) and a ``"length"``
        entry (length logits, length targets, loss factor).
    """
    encoded = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)

    # Length head: raw scores plus the target derived from tgt_tokens.
    length_logits = self.decoder.forward_length(
        normalize=False, encoder_out=encoded
    )
    length_target = self.decoder.forward_length_prediction(
        length_logits, encoded, tgt_tokens
    )

    # Token head: unnormalised scores for every target position.
    token_logits = self.decoder(
        normalize=False,
        prev_output_tokens=prev_output_tokens,
        encoder_out=encoded,
    )

    word_ins = {
        "out": token_logits,
        "tgt": tgt_tokens,
        "mask": tgt_tokens.ne(self.pad),
        "ls": self.args.label_smoothing,
        "nll_loss": True,
    }
    length = {
        "out": length_logits,
        "tgt": length_target,
        "factor": self.decoder.length_loss_factor,
    }
    return {"word_ins": word_ins, "length": length}
def initialize_output_tokens(self, encoder_out, src_tokens):
    """Build the initial hypothesis from the predicted target lengths.

    Every row is ``bos``, then ``unk`` placeholders, then ``eos`` at the
    predicted length (at least 2), padded with ``pad`` up to the longest
    row. Scores start at zero with the dtype of the encoder output.
    """
    # Predict one target length per sentence and enforce room for bos + eos.
    length_scores = self.decoder.forward_length(
        normalize=True, encoder_out=encoder_out
    )
    length_tgt = self.decoder.forward_length_prediction(
        length_scores, encoder_out=encoder_out
    )
    length_tgt.clamp_(min=2)
    longest = length_tgt.max()
    positions = utils.new_arange(src_tokens, longest)

    # Start from all-pad, mark in-length positions as unk, then set the
    # sentence boundaries.
    batch_size = src_tokens.size(0)
    tokens = src_tokens.new_zeros(batch_size, longest).fill_(self.pad)
    inside_sentence = positions[None, :] < length_tgt[:, None]
    tokens.masked_fill_(inside_sentence, self.unk)
    tokens[:, 0] = self.bos
    tokens.scatter_(1, length_tgt[:, None] - 1, self.eos)

    scores = tokens.new_zeros(*tokens.size()).type_as(encoder_out.encoder_out)

    return DecoderOut(
        output_tokens=tokens,
        output_scores=scores,
        attn=None,
        step=0,
        max_step=0,
        history=None,
    )
class NATransformerDecoder(FairseqNATDecoder):
    """Decoder of the non-autoregressive transformer, including its length head.

    Besides the token decoder it owns ``embed_length``, a 256-row embedding
    whose weight matrix is used as the classifier of the length predictor.
    """

    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        super().__init__(
            args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
        )
        self.dictionary = dictionary
        self.bos = dictionary.bos()
        self.unk = dictionary.unk()
        self.eos = dictionary.eos()

        self.encoder_embed_dim = args.encoder_embed_dim
        # Optional settings: fall back to the default when args lacks them.
        optional_settings = (
            ("sg_length_pred", False),
            ("pred_length_offset", False),
            ("length_loss_factor", 0.1),
            ("src_embedding_copy", False),
        )
        for attr_name, fallback in optional_settings:
            setattr(self, attr_name, getattr(args, attr_name, fallback))
        self.embed_length = Embedding(256, self.encoder_embed_dim, None)

    @ensemble_decoder
    def forward(self, normalize, encoder_out, prev_output_tokens, step=0, **unused):
        """Score every position; return log-probabilities when ``normalize``."""
        # Source embeddings are copied only on the first step, and only when
        # the option is enabled.
        copy_source = (step == 0) & self.src_embedding_copy
        features, _ = self.extract_features(
            prev_output_tokens,
            encoder_out=encoder_out,
            embedding_copy=copy_source,
        )
        logits = self.output_layer(features)
        if normalize:
            return F.log_softmax(logits, -1)
        return logits

    @ensemble_decoder
    def forward_length(self, normalize, encoder_out):
        """Score target lengths from the mean-pooled encoder states."""
        pooled = _mean_pooling(
            encoder_out.encoder_out,  # T x B x C
            encoder_out.encoder_padding_mask,  # B x T or None
        )
        if self.sg_length_pred:
            # keep length-loss gradients out of the encoder
            pooled = pooled.detach()
        length_logits = F.linear(pooled, self.embed_length.weight)
        if normalize:
            return F.log_softmax(length_logits, -1)
        return length_logits

    def extract_features(
        self,
        prev_output_tokens,
        encoder_out=None,
        early_exit=None,
        embedding_copy=False,
        **unused
    ):
        """
        Similar to *forward* but only return features.

        Inputs:
            prev_output_tokens: Tensor(B, T)
            encoder_out: encoder output providing hidden states and masks
            early_exit: if set, stop before the layer with this index
            embedding_copy: build the decoder input from copied source
                embeddings instead of token embeddings

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with the last attention (``"attn"``) and the
                  per-layer states (``"inner_states"``)
        No self-attention mask is passed to the layers; only padding is masked.
        """
        # embedding
        if not embedding_copy:
            x, decoder_padding_mask = self.forward_embedding(prev_output_tokens)
        else:
            src_embd = encoder_out.encoder_embedding
            src_pad = encoder_out.encoder_padding_mask
            if src_pad is not None:
                src_valid = ~src_pad
            else:
                src_valid = prev_output_tokens.new_ones(*src_embd.size()[:2]).bool()
            tgt_valid = prev_output_tokens.ne(self.padding_idx)
            copied = self.forward_copying_source(src_embd, src_valid, tgt_valid)
            x, decoder_padding_mask = self.forward_embedding(
                prev_output_tokens, copied
            )

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        attn = None
        inner_states = [x]

        have_encoder = encoder_out is not None
        enc_states = encoder_out.encoder_out if have_encoder else None
        enc_pad = encoder_out.encoder_padding_mask if have_encoder else None

        # decoder layers
        for layer_idx, layer in enumerate(self.layers):
            # early exit from the decoder.
            if early_exit is not None and layer_idx >= early_exit:
                break
            x, attn, _ = layer(
                x,
                enc_states,
                enc_pad,
                self_attn_mask=None,
                self_attn_padding_mask=decoder_padding_mask,
            )
            inner_states.append(x)

        if self.layer_norm:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        return x, {"attn": attn, "inner_states": inner_states}

    def forward_embedding(self, prev_output_tokens, states=None):
        """Embed the decoder input and return it with the padding mask.

        When ``states`` is given it is used in place of the token embeddings;
        positional embeddings, if any, are added to it in place.
        """
        decoder_padding_mask = prev_output_tokens.eq(self.padding_idx)

        # embed positions
        positions = None
        if self.embed_positions is not None:
            positions = self.embed_positions(prev_output_tokens)

        # embed tokens (or reuse the provided states)
        if states is not None:
            x = states
        else:
            x = self.embed_scale * self.embed_tokens(prev_output_tokens)
            if self.project_in_dim is not None:
                x = self.project_in_dim(x)

        if positions is not None:
            x += positions
        x = self.dropout_module(x)
        return x, decoder_padding_mask

    def forward_copying_source(self, src_embeds, src_masks, tgt_masks):
        """Gather one source embedding for every target position.

        Source indices come from ``_uniform_assignment`` on the per-sentence
        source and target lengths; masked-out target positions use index 0.
        """
        src_lengths = src_masks.sum(1)
        tgt_lengths = tgt_masks.sum(1)
        source_index = _uniform_assignment(src_lengths, tgt_lengths)
        source_index = source_index.masked_fill(~tgt_masks, 0)
        gather_index = source_index.unsqueeze(-1).expand(
            *source_index.size(), src_embeds.size(-1)
        )
        return torch.gather(src_embeds, 1, gather_index)

    def forward_length_prediction(self, length_out, encoder_out, tgt_tokens=None):
        """Return length targets (training) or predicted lengths (inference).

        With ``tgt_tokens`` the result is the reference length clamped to
        ``[0, 255]``; without it, the arg-max of ``length_out``. When
        ``pred_length_offset`` is set, lengths are expressed relative to the
        source length with an offset of 128.
        """
        enc_feats = encoder_out.encoder_out  # T x B x C
        src_masks = encoder_out.encoder_padding_mask  # B x T or None
        use_offset = self.pred_length_offset

        if use_offset:
            if src_masks is not None:
                src_lengs = (~src_masks).transpose(0, 1).type_as(enc_feats).sum(0)
            else:
                # no padding: every sentence spans all T source positions
                src_lengs = enc_feats.new_ones(enc_feats.size(1)).fill_(
                    enc_feats.size(0)
                )
            src_lengs = src_lengs.long()

        if tgt_tokens is None:
            # predict the length target (greedy for now)
            # TODO: implementing length-beam
            pred_lengs = length_out.max(-1)[1]
            if use_offset:
                return pred_lengs - 128 + src_lengs
            return pred_lengs

        # obtain the length target
        tgt_lengs = tgt_tokens.ne(self.padding_idx).sum(1).long()
        length_tgt = tgt_lengs - src_lengs + 128 if use_offset else tgt_lengs
        return length_tgt.clamp(min=0, max=255)
@register_model_architecture(
    "nonautoregressive_transformer", "nonautoregressive_transformer_wmt_en_de"
)
def nonautoregressive_transformer_wmt_en_de(args):
    """Register the WMT En-De architecture name.

    It applies the defaults of ``base_architecture`` to ``args`` unchanged.
    """
    base_architecture(args)
def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]):
    """
    Map every token of another tokenization onto the BPE tokens that cover it.

    Args:
        roberta (RobertaHubInterface): RoBERTa instance
        bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)`;
            the first token must have index 0
        other_tokens (List[str]): other tokens of shape `(T_words)`

    Returns:
        List[List[int]]: for each entry of *other_tokens*, the positions in
        *bpe_tokens* of the BPE tokens that overlap it.

    Raises:
        Exception: if a word and the current BPE token share no prefix.
    """
    assert bpe_tokens.dim() == 1
    assert bpe_tokens[0] == 0

    dictionary = roberta.task.source_dictionary

    def to_text(token_id):
        # NOTE(review): the special-token set below contains only empty
        # strings as it appears here -- confirm the literals are intact.
        symbol = dictionary.string([token_id])
        if symbol not in {"", ""}:
            symbol = roberta.bpe.decode(symbol)
        return symbol.strip()

    # Whitespace is removed on both sides to simplify the alignment; the
    # leading BPE token is dropped.
    pieces = [to_text(token_id) for token_id in bpe_tokens][1:]
    words = [str(token).strip() for token in other_tokens]
    assert "".join(pieces) == "".join(words)

    # Walk both sequences in parallel. Positions start at 1 because the
    # leading token was removed; empty pieces are never aligned.
    remaining = (
        (position, piece)
        for position, piece in enumerate(pieces, start=1)
        if piece != ""
    )
    j, piece = next(remaining)
    alignment = []
    for word in words:
        covered = []
        while True:
            if word.startswith(piece):
                # the whole piece lies inside the word: consume the piece
                covered.append(j)
                word = word[len(piece) :]
                j, piece = next(remaining, (None, None))
            elif piece.startswith(word):
                # the piece extends past this word: consume the word only
                covered.append(j)
                piece = piece[len(word) :]
                word = ""
            else:
                raise Exception('Cannot align "{}" and "{}"'.format(word, piece))
            if word == "":
                break
        assert len(covered) > 0
        alignment.append(covered)
    assert len(alignment) == len(words)

    return alignment
def spacy_tokenizer():
    """Return a tokenizer for the shared spaCy English pipeline, built once.

    The tokenizer is cached on the function object, so later calls return the
    same instance.

    Raises:
        ImportError: if spaCy is not installed.
    """
    if getattr(spacy_tokenizer, "_tokenizer", None) is None:
        try:
            nlp = spacy_nlp()
            # Older spaCy exposes a factory on Language.Defaults; spaCy v3
            # removed it, and the pipeline's own tokenizer is used instead.
            create_tokenizer = getattr(nlp.Defaults, "create_tokenizer", None)
            if create_tokenizer is not None:
                spacy_tokenizer._tokenizer = create_tokenizer(nlp)
            else:
                spacy_tokenizer._tokenizer = nlp.tokenizer
        except ImportError:
            raise ImportError("Please install spacy with: pip install spacy")
    return spacy_tokenizer._tokenizer
+ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq import utils +from fairseq.data import encoders + + +class RobertaHubInterface(nn.Module): + """A simple PyTorch Hub interface to RoBERTa. + + Usage: https://github.com/pytorch/fairseq/tree/master/examples/roberta + """ + + def __init__(self, args, task, model): + super().__init__() + self.args = args + self.task = task + self.model = model + + self.bpe = encoders.build_bpe(args) + + # this is useful for determining the device + self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) + + @property + def device(self): + return self._float_tensor.device + + def encode( + self, sentence: str, *addl_sentences, no_separator=False + ) -> torch.LongTensor: + """ + BPE-encode a sentence (or multiple sentences). + + Every sequence begins with a beginning-of-sentence (``) symbol. + Every sentence ends with an end-of-sentence (``) and we use an + extra end-of-sentence (``) as a separator. + + Example (single sentence): ` a b c ` + Example (sentence pair): ` d e f 1 2 3 ` + + The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE + requires leading spaces. 
For example:: + + >>> roberta.encode('Hello world').tolist() + [0, 31414, 232, 2] + >>> roberta.encode(' world').tolist() + [0, 232, 2] + >>> roberta.encode('world').tolist() + [0, 8331, 2] + """ + bpe_sentence = " " + self.bpe.encode(sentence) + " " + for s in addl_sentences: + bpe_sentence += " " if not no_separator else "" + bpe_sentence += " " + self.bpe.encode(s) + " " + tokens = self.task.source_dictionary.encode_line( + bpe_sentence, append_eos=False, add_if_not_exist=False + ) + return tokens.long() + + def decode(self, tokens: torch.LongTensor): + assert tokens.dim() == 1 + tokens = tokens.numpy() + if tokens[0] == self.task.source_dictionary.bos(): + tokens = tokens[1:] # remove + eos_mask = tokens == self.task.source_dictionary.eos() + doc_mask = eos_mask[1:] & eos_mask[:-1] + sentences = np.split(tokens, doc_mask.nonzero()[0] + 1) + sentences = [ + self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences + ] + if len(sentences) == 1: + return sentences[0] + return sentences + + def extract_features( + self, tokens: torch.LongTensor, return_all_hiddens: bool = False + ) -> torch.Tensor: + if tokens.dim() == 1: + tokens = tokens.unsqueeze(0) + if tokens.size(-1) > self.model.max_positions(): + raise ValueError( + "tokens exceeds maximum length: {} > {}".format( + tokens.size(-1), self.model.max_positions() + ) + ) + features, extra = self.model( + tokens.to(device=self.device), + features_only=True, + return_all_hiddens=return_all_hiddens, + ) + if return_all_hiddens: + # convert from T x B x C -> B x T x C + inner_states = extra["inner_states"] + return [inner_state.transpose(0, 1) for inner_state in inner_states] + else: + return features # just the last layer's features + + def register_classification_head( + self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs + ): + self.model.register_classification_head( + name, num_classes=num_classes, embedding_size=embedding_size, **kwargs + ) + + def predict(self, 
head: str, tokens: torch.LongTensor, return_logits: bool = False): + features = self.extract_features(tokens.to(device=self.device)) + logits = self.model.classification_heads[head](features) + if return_logits: + return logits + return F.log_softmax(logits, dim=-1) + + def extract_features_aligned_to_words( + self, sentence: str, return_all_hiddens: bool = False + ) -> torch.Tensor: + """Extract RoBERTa features, aligned to spaCy's word-level tokenizer.""" + from fairseq.models.roberta import alignment_utils + from spacy.tokens import Doc + + nlp = alignment_utils.spacy_nlp() + tokenizer = alignment_utils.spacy_tokenizer() + + # tokenize both with GPT-2 BPE and spaCy + bpe_toks = self.encode(sentence) + spacy_toks = tokenizer(sentence) + spacy_toks_ws = [t.text_with_ws for t in tokenizer(sentence)] + alignment = alignment_utils.align_bpe_to_words(self, bpe_toks, spacy_toks_ws) + + # extract features and align them + features = self.extract_features( + bpe_toks, return_all_hiddens=return_all_hiddens + ) + features = features.squeeze(0) + aligned_feats = alignment_utils.align_features_to_words( + self, features, alignment + ) + + # wrap in spaCy Doc + doc = Doc( + nlp.vocab, + words=["<s>"] + [x.text for x in spacy_toks] + ["</s>"], + spaces=[True] + + [x.endswith(" ") for x in spacy_toks_ws[:-1]] + + [True, False], + ) + assert len(doc) == aligned_feats.size(0) + doc.user_token_hooks["vector"] = lambda token: aligned_feats[token.i] + return doc + + def fill_mask(self, masked_input: str, topk: int = 5): + masked_token = "<mask>" + assert ( + masked_token in masked_input and masked_input.count(masked_token) == 1 + ), "Please add one {0} token for the input, eg: 'He is a {0} guy'".format( + masked_token + ) + + text_spans = masked_input.split(masked_token) + text_spans_bpe = ( + (" {0} ".format(masked_token)) + .join([self.bpe.encode(text_span.rstrip()) for text_span in text_spans]) + .strip() + ) + tokens = self.task.source_dictionary.encode_line( + "<s> " + text_spans_bpe + " </s>", 
append_eos=False, + add_if_not_exist=False, + ) + + masked_index = (tokens == self.task.mask_idx).nonzero() + if tokens.dim() == 1: + tokens = tokens.unsqueeze(0) + + with utils.model_eval(self.model): + features, extra = self.model( + tokens.long().to(device=self.device), + features_only=False, + return_all_hiddens=False, + ) + logits = features[0, masked_index, :].squeeze() + prob = logits.softmax(dim=0) + values, index = prob.topk(k=topk, dim=0) + topk_predicted_token_bpe = self.task.source_dictionary.string(index) + + topk_filled_outputs = [] + for index, predicted_token_bpe in enumerate( + topk_predicted_token_bpe.split(" ") + ): + predicted_token = self.bpe.decode(predicted_token_bpe) + # Quick hack to fix https://github.com/pytorch/fairseq/issues/1306 + if predicted_token_bpe.startswith("\u2581"): + predicted_token = " " + predicted_token + if " {0}".format(masked_token) in masked_input: + topk_filled_outputs.append( + ( + masked_input.replace( + " {0}".format(masked_token), predicted_token + ), + values[index].item(), + predicted_token, + ) + ) + else: + topk_filled_outputs.append( + ( + masked_input.replace(masked_token, predicted_token), + values[index].item(), + predicted_token, + ) + ) + return topk_filled_outputs + + def disambiguate_pronoun(self, sentence: str) -> bool: + """ + Usage:: + + >>> disambiguate_pronoun('The _trophy_ would not fit in the brown suitcase because [it] was too big.') + True + + >>> disambiguate_pronoun('The trophy would not fit in the brown suitcase because [it] was too big.') + 'The trophy' + """ + assert hasattr( + self.task, "disambiguate_pronoun" + ), "roberta.disambiguate_pronoun() requires a model trained with the WSC task." 
+ with utils.model_eval(self.model): + return self.task.disambiguate_pronoun( + self.model, sentence, use_cuda=self.device.type == "cuda" + ) diff --git a/fairseq-0.10.2/fairseq/models/roberta/model.py b/fairseq-0.10.2/fairseq/models/roberta/model.py new file mode 100644 index 0000000000000000000000000000000000000000..d56496f803d2cd66e102b069358d73166a7e482d --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/roberta/model.py @@ -0,0 +1,524 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +RoBERTa: A Robustly Optimized BERT Pretraining Approach. +""" + +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq import utils +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderModel, + register_model, + register_model_architecture, +) +from fairseq.modules import LayerNorm, TransformerSentenceEncoder +from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ +from fairseq.modules.transformer_sentence_encoder import init_bert_params + +from .hub_interface import RobertaHubInterface + + +logger = logging.getLogger(__name__) + + +@register_model("roberta") +class RobertaModel(FairseqEncoderModel): + @classmethod + def hub_models(cls): + return { + "roberta.base": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz", + "roberta.large": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz", + "roberta.large.mnli": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz", + "roberta.large.wsc": "http://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz", + } + + def __init__(self, args, encoder): + super().__init__(encoder) + self.args = args + + # We follow BERT's random weight initialization + self.apply(init_bert_params) + + self.classification_heads = nn.ModuleDict() + + @staticmethod + def add_args(parser): + 
"""Add model-specific arguments to the parser.""" + parser.add_argument( + "--encoder-layers", type=int, metavar="L", help="num encoder layers" + ) + parser.add_argument( + "--encoder-embed-dim", + type=int, + metavar="H", + help="encoder embedding dimension", + ) + parser.add_argument( + "--encoder-ffn-embed-dim", + type=int, + metavar="F", + help="encoder embedding dimension for FFN", + ) + parser.add_argument( + "--encoder-attention-heads", + type=int, + metavar="A", + help="num encoder attention heads", + ) + parser.add_argument( + "--activation-fn", + choices=utils.get_available_activation_fns(), + help="activation function to use", + ) + parser.add_argument( + "--pooler-activation-fn", + choices=utils.get_available_activation_fns(), + help="activation function to use for pooler layer", + ) + parser.add_argument( + "--encoder-normalize-before", + action="store_true", + help="apply layernorm before each encoder block", + ) + parser.add_argument( + "--dropout", type=float, metavar="D", help="dropout probability" + ) + parser.add_argument( + "--attention-dropout", + type=float, + metavar="D", + help="dropout probability for attention weights", + ) + parser.add_argument( + "--activation-dropout", + type=float, + metavar="D", + help="dropout probability after activation in FFN", + ) + parser.add_argument( + "--pooler-dropout", + type=float, + metavar="D", + help="dropout probability in the masked_lm pooler layers", + ) + parser.add_argument( + "--max-positions", type=int, help="number of positional embeddings to learn" + ) + parser.add_argument( + "--load-checkpoint-heads", + action="store_true", + help="(re-)register and load heads when loading checkpoints", + ) + # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) + parser.add_argument( + "--encoder-layerdrop", + type=float, + metavar="D", + default=0, + help="LayerDrop probability for encoder", + ) + parser.add_argument( + "--encoder-layers-to-keep", + default=None, + 
help="which layers to *keep* when pruning as a comma-separated list", + ) + # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) + parser.add_argument( + "--quant-noise-pq", + type=float, + metavar="D", + default=0, + help="iterative PQ quantization noise at training time", + ) + parser.add_argument( + "--quant-noise-pq-block-size", + type=int, + metavar="D", + default=8, + help="block size of quantization noise at training time", + ) + parser.add_argument( + "--quant-noise-scalar", + type=float, + metavar="D", + default=0, + help="scalar quantization noise and scalar quantization at training time", + ) + parser.add_argument( + "--untie-weights-roberta", + action="store_true", + help="Untie weights between embeddings and classifiers in RoBERTa", + ) + parser.add_argument( + "--spectral-norm-classification-head", + action="store_true", + default=False, + help="Apply spectral normalization on the classification head", + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present + base_architecture(args) + + if not hasattr(args, "max_positions"): + args.max_positions = args.tokens_per_sample + + encoder = RobertaEncoder(args, task.source_dictionary) + return cls(args, encoder) + + def forward( + self, + src_tokens, + features_only=False, + return_all_hiddens=False, + classification_head_name=None, + **kwargs + ): + if classification_head_name is not None: + features_only = True + + x, extra = self.encoder(src_tokens, features_only, return_all_hiddens, **kwargs) + + if classification_head_name is not None: + x = self.classification_heads[classification_head_name](x) + return x, extra + + def get_normalized_probs(self, net_output, log_probs, sample=None): + """Get normalized probabilities (or log probs) from a net's output.""" + logits = net_output[0].float() + if log_probs: + return F.log_softmax(logits, dim=-1) + else: + return F.softmax(logits, 
dim=-1) + + def register_classification_head( + self, name, num_classes=None, inner_dim=None, **kwargs + ): + """Register a classification head.""" + if name in self.classification_heads: + prev_num_classes = self.classification_heads[name].out_proj.out_features + prev_inner_dim = self.classification_heads[name].dense.out_features + if num_classes != prev_num_classes or inner_dim != prev_inner_dim: + logger.warning( + 're-registering head "{}" with num_classes {} (prev: {}) ' + "and inner_dim {} (prev: {})".format( + name, num_classes, prev_num_classes, inner_dim, prev_inner_dim + ) + ) + self.classification_heads[name] = RobertaClassificationHead( + input_dim=self.args.encoder_embed_dim, + inner_dim=inner_dim or self.args.encoder_embed_dim, + num_classes=num_classes, + activation_fn=self.args.pooler_activation_fn, + pooler_dropout=self.args.pooler_dropout, + q_noise=self.args.quant_noise_pq, + qn_block_size=self.args.quant_noise_pq_block_size, + do_spectral_norm=self.args.spectral_norm_classification_head, + ) + + @property + def supported_targets(self): + return {"self"} + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + bpe="gpt2", + **kwargs + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + bpe=bpe, + load_checkpoint_heads=True, + **kwargs, + ) + cls.upgrade_args(x["args"]) + + logger.info(x["args"]) + return RobertaHubInterface(x["args"], x["task"], x["models"][0]) + + def upgrade_state_dict_named(self, state_dict, name): + prefix = name + "." 
if name != "" else "" + + # rename decoder -> encoder before upgrading children modules + for k in list(state_dict.keys()): + if k.startswith(prefix + "decoder"): + new_k = prefix + "encoder" + k[len(prefix + "decoder") :] + state_dict[new_k] = state_dict[k] + del state_dict[k] + + # upgrade children modules + super().upgrade_state_dict_named(state_dict, name) + + # Handle new classification heads present in the state dict. + current_head_names = ( + [] + if not hasattr(self, "classification_heads") + else self.classification_heads.keys() + ) + keys_to_delete = [] + for k in state_dict.keys(): + if not k.startswith(prefix + "classification_heads."): + continue + + head_name = k[len(prefix + "classification_heads.") :].split(".")[0] + num_classes = state_dict[ + prefix + "classification_heads." + head_name + ".out_proj.weight" + ].size(0) + inner_dim = state_dict[ + prefix + "classification_heads." + head_name + ".dense.weight" + ].size(0) + + if getattr(self.args, "load_checkpoint_heads", False): + if head_name not in current_head_names: + self.register_classification_head(head_name, num_classes, inner_dim) + else: + if head_name not in current_head_names: + logger.warning( + "deleting classification head ({}) from checkpoint " + "not present in current model: {}".format(head_name, k) + ) + keys_to_delete.append(k) + elif ( + num_classes + != self.classification_heads[head_name].out_proj.out_features + or inner_dim + != self.classification_heads[head_name].dense.out_features + ): + logger.warning( + "deleting classification head ({}) from checkpoint " + "with different dimensions than current model: {}".format( + head_name, k + ) + ) + keys_to_delete.append(k) + for k in keys_to_delete: + del state_dict[k] + + # Copy any newly-added classification heads into the state dict + # with their current weights. 
+ if hasattr(self, "classification_heads"): + cur_state = self.classification_heads.state_dict() + for k, v in cur_state.items(): + if prefix + "classification_heads." + k not in state_dict: + logger.info("Overwriting " + prefix + "classification_heads." + k) + state_dict[prefix + "classification_heads." + k] = v + + +class RobertaLMHead(nn.Module): + """Head for masked language modeling.""" + + def __init__(self, embed_dim, output_dim, activation_fn, weight=None): + super().__init__() + self.dense = nn.Linear(embed_dim, embed_dim) + self.activation_fn = utils.get_activation_fn(activation_fn) + self.layer_norm = LayerNorm(embed_dim) + + if weight is None: + weight = nn.Linear(embed_dim, output_dim, bias=False).weight + self.weight = weight + self.bias = nn.Parameter(torch.zeros(output_dim)) + + def forward(self, features, masked_tokens=None, **kwargs): + # Only project the masked tokens while training, + # saves both memory and computation + if masked_tokens is not None: + features = features[masked_tokens, :] + + x = self.dense(features) + x = self.activation_fn(x) + x = self.layer_norm(x) + # project back to size of vocabulary with bias + x = F.linear(x, self.weight) + self.bias + return x + + +class RobertaClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim, + inner_dim, + num_classes, + activation_fn, + pooler_dropout, + q_noise=0, + qn_block_size=8, + do_spectral_norm=False, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.activation_fn = utils.get_activation_fn(activation_fn) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = apply_quant_noise_( + nn.Linear(inner_dim, num_classes), q_noise, qn_block_size + ) + if do_spectral_norm: + if q_noise != 0: + raise NotImplementedError( + "Attempting to use Spectral Normalization with Quant Noise. 
This is not officially supported" + ) + self.out_proj = torch.nn.utils.spectral_norm(self.out_proj) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take <s> token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = self.activation_fn(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +class RobertaEncoder(FairseqEncoder): + """RoBERTa encoder.""" + + def __init__(self, args, dictionary): + super().__init__(dictionary) + self.args = args + + if args.encoder_layers_to_keep: + args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) + + self.sentence_encoder = TransformerSentenceEncoder( + padding_idx=dictionary.pad(), + vocab_size=len(dictionary), + num_encoder_layers=args.encoder_layers, + embedding_dim=args.encoder_embed_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=args.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + layerdrop=args.encoder_layerdrop, + max_seq_len=args.max_positions, + num_segments=0, + encoder_normalize_before=True, + apply_bert_init=True, + activation_fn=args.activation_fn, + q_noise=args.quant_noise_pq, + qn_block_size=args.quant_noise_pq_block_size, + ) + args.untie_weights_roberta = getattr(args, "untie_weights_roberta", False) + + self.lm_head = RobertaLMHead( + embed_dim=args.encoder_embed_dim, + output_dim=len(dictionary), + activation_fn=args.activation_fn, + weight=( + self.sentence_encoder.embed_tokens.weight + if not args.untie_weights_roberta + else None + ), + ) + + def forward( + self, + src_tokens, + features_only=False, + return_all_hiddens=False, + masked_tokens=None, + **unused + ): + """ + Args: + src_tokens (LongTensor): input tokens of shape `(batch, src_len)` + features_only (bool, optional): skip LM head and just return + features. If True, the output will be of shape + `(batch, src_len, embed_dim)`. 
+ return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + + Returns: + tuple: + - the LM output of shape `(batch, src_len, vocab)` + - a dictionary of additional data, where 'inner_states' + is a list of hidden states. Note that the hidden + states have shape `(src_len, batch, vocab)`. + """ + x, extra = self.extract_features( + src_tokens, return_all_hiddens=return_all_hiddens + ) + if not features_only: + x = self.output_layer(x, masked_tokens=masked_tokens) + return x, extra + + def extract_features(self, src_tokens, return_all_hiddens=False, **kwargs): + inner_states, _ = self.sentence_encoder( + src_tokens, + last_state_only=not return_all_hiddens, + token_embeddings=kwargs.get("token_embeddings", None), + ) + features = inner_states[-1].transpose(0, 1) # T x B x C -> B x T x C + return features, {"inner_states": inner_states if return_all_hiddens else None} + + def output_layer(self, features, masked_tokens=None, **unused): + return self.lm_head(features, masked_tokens) + + def max_positions(self): + """Maximum output length supported by the encoder.""" + return self.args.max_positions + + +@register_model_architecture("roberta", "roberta") +def base_architecture(args): + args.encoder_layers = getattr(args, "encoder_layers", 12) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) + + args.activation_fn = getattr(args, "activation_fn", "gelu") + args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") + + args.dropout = getattr(args, "dropout", 0.1) + args.attention_dropout = getattr(args, "attention_dropout", 0.1) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) + args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None) + 
args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) + args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) + args.spectral_norm_classification_head = getattr( + args, "spectral_norm_classification_head", False + ) + + +@register_model_architecture("roberta", "roberta_base") +def roberta_base_architecture(args): + base_architecture(args) + + +@register_model_architecture("roberta", "roberta_large") +def roberta_large_architecture(args): + args.encoder_layers = getattr(args, "encoder_layers", 24) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + base_architecture(args) + + +@register_model_architecture("roberta", "xlm") +def xlm_architecture(args): + args.encoder_layers = getattr(args, "encoder_layers", 16) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1280) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1280 * 4) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + base_architecture(args) diff --git a/fairseq-0.10.2/fairseq/models/roberta/model_camembert.py b/fairseq-0.10.2/fairseq/models/roberta/model_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..46447546fafb4a0a887b481022cac07631047c80 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/roberta/model_camembert.py @@ -0,0 +1,50 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+""" +CamemBERT: a Tasty French Language Model +""" + +from fairseq.models import register_model + +from .hub_interface import RobertaHubInterface +from .model import RobertaModel + + +@register_model("camembert") +class CamembertModel(RobertaModel): + @classmethod + def hub_models(cls): + return { + "camembert": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz", + "camembert.v0": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz", + "camembert-base": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz", + "camembert-large": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-large.tar.gz", + "camembert-base-ccnet": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet.tar.gz", + "camembert-base-ccnet-4gb": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet-4gb.tar.gz", + "camembert-base-wikipedia-4gb": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-wikipedia-4gb.tar.gz", + "camembert-base-oscar-4gb": "http://dl.fbaipublicfiles.com/fairseq/models/camembert-base-oscar-4gb.tar.gz", + } + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + bpe="sentencepiece", + **kwargs + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + bpe=bpe, + load_checkpoint_heads=True, + **kwargs, + ) + return RobertaHubInterface(x["args"], x["task"], x["models"][0]) diff --git a/fairseq-0.10.2/fairseq/models/wav2vec/__init__.py b/fairseq-0.10.2/fairseq/models/wav2vec/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..06cec18183ca14cd534d14558e8b44e25f3e69d5 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/wav2vec/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .wav2vec import * # noqa +from .wav2vec2 import * # noqa +from .wav2vec2_asr import * # noqa diff --git a/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/__init__.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..caaf63b00a9728ef071406f1e166c2fa9abb1e9d Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/wav2vec.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/wav2vec.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b16e14f0ac21f27678190dcc668dd38d750e1e43 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/wav2vec.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/wav2vec2.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/wav2vec2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29ebbd719fa87f5d126e19ed5795f8876c207028 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/wav2vec2.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/wav2vec2_asr.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/wav2vec2_asr.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7942e85794e39d0a28b6622daf60317176da0066 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/wav2vec/__pycache__/wav2vec2_asr.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/wav2vec/wav2vec.py b/fairseq-0.10.2/fairseq/models/wav2vec/wav2vec.py new file mode 100644 index 
0000000000000000000000000000000000000000..772995b526fe87d4f53badca09aa5aa3a0662412 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/wav2vec/wav2vec.py @@ -0,0 +1,735 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import math +import sys + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq.models import BaseFairseqModel, register_model, register_model_architecture +from fairseq.modules import ( + Fp32GroupNorm, + Fp32LayerNorm, + GumbelVectorQuantizer, + KmeansVectorQuantizer, + TransposeLast, +) +from fairseq.utils import buffered_arange + + +logger = logging.getLogger(__name__) + + +@register_model("wav2vec") +class Wav2VecModel(BaseFairseqModel): + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + parser.add_argument( + "--prediction-steps", + type=int, + metavar="N", + help="number of steps ahead to predict", + ) + parser.add_argument( + "--sample-distance", + type=int, + metavar="N", + help="sample distance from target. 
does not work properly with cross-sampling", + ) + parser.add_argument( + "--cross-sample-negatives", + type=int, + metavar="N", + help="num of cross sampled negatives", + ) + parser.add_argument( + "--num-negatives", type=int, metavar="N", help="number of negative examples" + ) + parser.add_argument( + "--conv-feature-layers", + type=str, + metavar="EXPR", + help="convolutional feature extraction layers [(dim, kernel_size, stride), ...]", + ) + parser.add_argument( + "--conv-aggregator-layers", + type=str, + metavar="EXPR", + help="convolutional feature extraction layers [(dim, kernel_size, stride), ...]", + ) + parser.add_argument( + "--dropout", + type=float, + metavar="D", + help="dropout to apply within the model", + ) + parser.add_argument( + "--dropout-features", + type=float, + metavar="D", + help="dropout to apply to the features", + ) + parser.add_argument( + "--dropout-agg", + type=float, + metavar="D", + help="dropout to apply after aggregation step", + ) + parser.add_argument( + "--encoder", type=str, choices=["cnn"], help="type of encoder to use" + ) + parser.add_argument( + "--aggregator", + type=str, + choices=["cnn", "gru"], + help="type of aggregator to use", + ) + parser.add_argument( + "--gru-dim", type=int, metavar="N", help="GRU dimensionality" + ) + + parser.add_argument( + "--no-conv-bias", + action="store_true", + help="if set, does not learn bias for conv layers", + ) + parser.add_argument( + "--agg-zero-pad", + action="store_true", + help="if set, zero pads in aggregator instead of repl pad", + ) + + parser.add_argument( + "--skip-connections-feat", + action="store_true", + help="if set, adds skip connections to the feature extractor", + ) + parser.add_argument( + "--skip-connections-agg", + action="store_true", + help="if set, adds skip connections to the aggregator", + ) + parser.add_argument( + "--residual-scale", + type=float, + metavar="D", + help="scales residual by sqrt(value)", + ) + + parser.add_argument( + "--log-compression", + 
action="store_true", + help="if set, adds a log compression to feature extractor", + ) + + parser.add_argument( + "--balanced-classes", + action="store_true", + help="if set, loss is scaled to balance for number of negatives", + ) + + parser.add_argument( + "--project-features", + choices=["none", "same", "new"], + help="if not none, features are projected using the (same or new) aggregator", + ) + + parser.add_argument( + "--non-affine-group-norm", + action="store_true", + help="if set, group norm is not affine", + ) + + parser.add_argument( + "--offset", + help="if set, introduces an offset from target to predictions. " + 'if set to "auto", it is computed automatically from the receptive field', + ) + + parser.add_argument( + "--activation", + type=str, + choices=["relu", "gelu"], + help="which activation function to use", + ) + + parser.add_argument( + "--vq-type", + type=str, + choices=["none", "gumbel", "kmeans"], + help="which type of quantizer to use", + ) + parser.add_argument( + "--vq-vars", + type=int, + metavar="N", + help="if set, project to this many vector quantized variables per group", + ) + parser.add_argument( + "--vq-groups", + type=int, + metavar="N", + help="number of groups of latent variables", + ) + parser.add_argument( + "--vq-dim", + type=int, + metavar="N", + help="uses this dimensionality for quantized vectors", + ) + parser.add_argument( + "--vq-depth", + type=int, + metavar="N", + help="number of layers for vq weight projection", + ) + parser.add_argument( + "--combine-groups", + action="store_true", + help="if set, variables are shared among groups", + ) + parser.add_argument( + "--vq-temp", + type=str, + metavar="TEMP", + help="temperature for latent variable sampling with gumbel softmax. 
should be a tuple of 3 values (start, end, decay)", + ) + parser.add_argument( + "--vq-gamma", + type=float, + metavar="D", + help="gamma parameter for kmeans style vector quantization", + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + base_wav2vec_architecture(args) + + model = Wav2VecModel(args) + logger.info(model) + return model + + def __init__(self, args): + super().__init__() + + self.prediction_steps = args.prediction_steps + offset = args.offset + + if args.activation == "relu": + activation = nn.ReLU() + elif args.activation == "gelu": + activation = nn.GELU() + else: + raise Exception("unknown activation " + args.activation) + + if args.encoder == "cnn": + feature_enc_layers = eval(args.conv_feature_layers) + self.feature_extractor = ConvFeatureExtractionModel( + conv_layers=feature_enc_layers, + dropout=0.0, + log_compression=args.log_compression, + skip_connections=args.skip_connections_feat, + residual_scale=args.residual_scale, + non_affine_group_norm=args.non_affine_group_norm, + activation=activation, + ) + embed = feature_enc_layers[-1][0] + else: + raise Exception("unknown encoder type " + args.encoder) + + self.vector_quantizer = None + if args.vq_type == "gumbel": + self.vector_quantizer = GumbelVectorQuantizer( + dim=embed, + num_vars=args.vq_vars, + temp=eval(args.vq_temp), + groups=args.vq_groups, + combine_groups=args.combine_groups, + vq_dim=args.vq_dim if args.vq_dim > 0 else embed, + time_first=False, + activation=activation, + weight_proj_depth=args.vq_depth, + weight_proj_factor=2, + ) + elif args.vq_type == "kmeans": + self.vector_quantizer = KmeansVectorQuantizer( + dim=embed, + num_vars=args.vq_vars, + groups=args.vq_groups, + combine_groups=args.combine_groups, + vq_dim=args.vq_dim if args.vq_dim > 0 else embed, + time_first=False, + gamma=args.vq_gamma, + ) + else: + assert ( + args.vq_type == "none" or args.vq_type is None + ), 
"Unknown quantizer type" + + if args.offset == "auto": + assert args.encoder == "cnn" + jin = 0 + rin = 0 + for _, k, stride in feature_enc_layers: + if rin == 0: + rin = k + rin = rin + (k - 1) * jin + if jin == 0: + jin = stride + else: + jin *= stride + offset = math.ceil(rin / jin) + + offset = int(offset) + + def make_aggregator(): + if args.aggregator == "cnn": + agg_layers = eval(args.conv_aggregator_layers) + agg_dim = agg_layers[-1][0] + feature_aggregator = ConvAggegator( + conv_layers=agg_layers, + embed=embed, + dropout=args.dropout, + skip_connections=args.skip_connections_agg, + residual_scale=args.residual_scale, + non_affine_group_norm=args.non_affine_group_norm, + conv_bias=not args.no_conv_bias, + zero_pad=args.agg_zero_pad, + activation=activation, + ) + elif args.aggregator == "gru": + agg_dim = args.gru_dim + feature_aggregator = nn.Sequential( + TransposeLast(), + nn.GRU( + input_size=embed, + hidden_size=agg_dim, + num_layers=1, + dropout=args.dropout, + ), + TransposeLast(deconstruct_idx=0), + ) + else: + raise Exception("unknown aggregator type " + args.aggregator) + + return feature_aggregator, agg_dim + + self.feature_aggregator, agg_dim = make_aggregator() + + self.wav2vec_predictions = Wav2VecPredictionsModel( + in_dim=agg_dim, + out_dim=embed, + prediction_steps=args.prediction_steps, + n_negatives=args.num_negatives, + cross_sample_negatives=args.cross_sample_negatives, + sample_distance=args.sample_distance, + dropout=args.dropout, + offset=offset, + balanced_classes=args.balanced_classes, + infonce=args.infonce, + ) + + self.dropout_feats = nn.Dropout(p=args.dropout_features) + self.dropout_agg = nn.Dropout(p=args.dropout_agg) + + if args.project_features == "none": + self.project_features = None + elif args.project_features == "same": + self.project_features = self.feature_aggregator + elif args.project_features == "new": + self.project_features, _ = make_aggregator() + + def forward(self, source): + result = {} + + features = 
self.feature_extractor(source) + if self.vector_quantizer: + q_res = self.vector_quantizer(features) + features = q_res["x"] + for k in q_res.keys(): + if k != "x": + result[k] = q_res[k] + + x = self.dropout_feats(features) + x = self.feature_aggregator(x) + x = self.dropout_agg(x) + + if self.project_features is not None: + features = self.project_features(features) + x, targets = self.wav2vec_predictions(x, features) + result["cpc_logits"] = x + result["cpc_targets"] = targets + + return result + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + + def max_positions(self): + """Maximum length supported by the model.""" + return sys.maxsize + + def get_logits(self, net_output): + logits = net_output["cpc_logits"] + return logits + + def get_targets(self, sample, net_output): + t = net_output["cpc_targets"] + if isinstance(t, tuple): + t = t[0] + return t.contiguous() + + def get_target_weights(self, targets, net_output): + targets = net_output["cpc_targets"] + if isinstance(targets, tuple) and targets[-1] is not None: + return targets[-1] + return None + + def get_extra_losses(self, net_output): + loss = None + if "prob_perplexity" in net_output: + loss = net_output["num_vars"] - net_output["prob_perplexity"] + elif "kmeans_loss" in net_output: + loss = net_output["kmeans_loss"] + + return loss + + +def norm_block(is_layer_norm, dim, affine=True): + if is_layer_norm: + mod = nn.Sequential( + TransposeLast(), + Fp32LayerNorm(dim, elementwise_affine=affine), + TransposeLast(), + ) + else: + mod = Fp32GroupNorm(1, dim, affine=affine) + + return mod + + +class ConvFeatureExtractionModel(nn.Module): + def __init__( + self, + conv_layers, + dropout, + log_compression, + skip_connections, + residual_scale, + non_affine_group_norm, + activation, + ): + super().__init__() + + def block(n_in, n_out, k, stride): + return nn.Sequential( + nn.Conv1d(n_in, n_out, k, stride=stride, bias=False), + nn.Dropout(p=dropout), 
+ norm_block( + is_layer_norm=False, dim=n_out, affine=not non_affine_group_norm + ), + activation, + ) + + in_d = 1 + self.conv_layers = nn.ModuleList() + for dim, k, stride in conv_layers: + self.conv_layers.append(block(in_d, dim, k, stride)) + in_d = dim + + self.log_compression = log_compression + self.skip_connections = skip_connections + self.residual_scale = math.sqrt(residual_scale) + + def forward(self, x): + # BxT -> BxCxT + x = x.unsqueeze(1) + + for conv in self.conv_layers: + residual = x + x = conv(x) + if self.skip_connections and x.size(1) == residual.size(1): + tsz = x.size(2) + r_tsz = residual.size(2) + residual = residual[..., :: r_tsz // tsz][..., :tsz] + x = (x + residual) * self.residual_scale + + if self.log_compression: + x = x.abs() + x = x + 1 + x = x.log() + + return x + + +class ZeroPad1d(nn.Module): + def __init__(self, pad_left, pad_right): + super().__init__() + self.pad_left = pad_left + self.pad_right = pad_right + + def forward(self, x): + return F.pad(x, (self.pad_left, self.pad_right)) + + +class ConvAggegator(nn.Module): + def __init__( + self, + conv_layers, + embed, + dropout, + skip_connections, + residual_scale, + non_affine_group_norm, + conv_bias, + zero_pad, + activation, + ): + super().__init__() + + def block(n_in, n_out, k, stride): + # padding dims only really make sense for stride = 1 + ka = k // 2 + kb = ka - 1 if k % 2 == 0 else ka + + pad = ( + ZeroPad1d(ka + kb, 0) if zero_pad else nn.ReplicationPad1d((ka + kb, 0)) + ) + + return nn.Sequential( + pad, + nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias), + nn.Dropout(p=dropout), + norm_block(False, n_out, affine=not non_affine_group_norm), + activation, + ) + + in_d = embed + self.conv_layers = nn.ModuleList() + self.residual_proj = nn.ModuleList() + for dim, k, stride in conv_layers: + if in_d != dim and skip_connections: + self.residual_proj.append(nn.Conv1d(in_d, dim, 1, bias=False)) + else: + self.residual_proj.append(None) + + 
self.conv_layers.append(block(in_d, dim, k, stride)) + in_d = dim + self.conv_layers = nn.Sequential(*self.conv_layers) + self.skip_connections = skip_connections + self.residual_scale = math.sqrt(residual_scale) + + def forward(self, x): + for rproj, conv in zip(self.residual_proj, self.conv_layers): + residual = x + x = conv(x) + if self.skip_connections: + if rproj is not None: + residual = rproj(residual) + x = (x + residual) * self.residual_scale + return x + + +class Wav2VecPredictionsModel(nn.Module): + def __init__( + self, + in_dim, + out_dim, + prediction_steps, + n_negatives, + cross_sample_negatives, + sample_distance, + dropout, + offset, + balanced_classes, + infonce, + ): + super().__init__() + + self.n_negatives = n_negatives + self.cross_sample_negatives = cross_sample_negatives + self.sample_distance = sample_distance + self.project_to_steps = nn.ConvTranspose2d( + in_dim, out_dim, (1, prediction_steps) + ) + self.dropout = nn.Dropout(p=dropout) + self.offset = offset + self.balanced_classes = balanced_classes + self.infonce = infonce + + def sample_negatives(self, y): + bsz, fsz, tsz = y.shape + + y = y.transpose(0, 1) # BCT -> CBT + y = y.contiguous().view(fsz, -1) # CBT => C(BxT) + + cross_high = tsz * bsz + high = tsz if self.sample_distance is None else min(tsz, self.sample_distance) + assert high > 1 + + neg_idxs = torch.randint(low=0, high=high, size=(bsz, self.n_negatives * tsz)) + + with torch.no_grad(): + if self.n_negatives > 0: + tszs = ( + buffered_arange(tsz) + .unsqueeze(-1) + .expand(-1, self.n_negatives) + .flatten() + ) + + neg_idxs = torch.randint( + low=0, high=high - 1, size=(bsz, self.n_negatives * tsz) + ) + neg_idxs[neg_idxs >= tszs] += 1 + + if self.cross_sample_negatives > 0: + tszs = ( + buffered_arange(tsz) + .unsqueeze(-1) + .expand(-1, self.cross_sample_negatives) + .flatten() + ) + + cross_neg_idxs = torch.randint( + low=0, + high=cross_high - 1, + size=(bsz, self.cross_sample_negatives * tsz), + ) + 
cross_neg_idxs[cross_neg_idxs >= tszs] += 1 + + if self.n_negatives > 0: + for i in range(1, bsz): + neg_idxs[i] += i * high + else: + neg_idxs = cross_neg_idxs + + if self.cross_sample_negatives > 0 and self.n_negatives > 0: + neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1) + + negs = y[..., neg_idxs.view(-1)] + negs = negs.view( + fsz, bsz, self.n_negatives + self.cross_sample_negatives, tsz + ).permute( + 2, 1, 0, 3 + ) # to NxBxCxT + + return negs + + def forward(self, x, y): + + x = x.unsqueeze(-1) + x = self.project_to_steps(x) # BxCxTxS + x = self.dropout(x) + + negatives = self.sample_negatives(y) + y = y.unsqueeze(0) + targets = torch.cat([y, negatives], dim=0) # Copies x B x C x T + + copies = targets.size(0) + bsz, dim, tsz, steps = x.shape + steps = min(steps, tsz - self.offset) + + predictions = x.new( + bsz * copies * (tsz - self.offset + 1) * steps + - ((steps + 1) * steps // 2) * copies * bsz + ) + if self.infonce: + labels = predictions.new_full( + (predictions.shape[0] // copies,), 0, dtype=torch.long + ) + else: + labels = torch.zeros_like(predictions) + weights = ( + torch.full_like(labels, 1 / self.n_negatives) + if self.balanced_classes and not self.infonce + else None + ) + + start = end = 0 + for i in range(steps): + offset = i + self.offset + end = start + (tsz - offset) * bsz * copies + if self.infonce: + predictions[start:end] = torch.einsum( + "bct,nbct->tbn", x[..., :-offset, i], targets[..., offset:] + ).flatten() + else: + pos_num = (end - start) // copies + predictions[start:end] = torch.einsum( + "bct,nbct->nbt", x[..., :-offset, i], targets[..., offset:] + ).flatten() + labels[start : start + pos_num] = 1.0 + if weights is not None: + weights[start : start + pos_num] = 1.0 + start = end + assert end == predictions.numel(), "{} != {}".format(end, predictions.numel()) + + if self.infonce: + predictions = predictions.view(-1, copies) + else: + if weights is not None: + labels = (labels, weights) + + return predictions, labels 
+ + +@register_model_architecture("wav2vec", "wav2vec") +def base_wav2vec_architecture(args): + conv_feature_layers = "[(512, 10, 5)]" + conv_feature_layers += " + [(512, 8, 4)]" + conv_feature_layers += " + [(512, 4, 2)] * 3" + args.conv_feature_layers = getattr(args, "conv_feature_layers", conv_feature_layers) + + args.conv_aggregator_layers = getattr( + args, "conv_aggregator_layers", "[(512, 3, 1)] * 9" + ) + + args.prediction_steps = getattr(args, "prediction_steps", 12) + args.num_negatives = getattr(args, "num_negatives", 1) + args.sample_distance = getattr(args, "sample_distance", None) + args.cross_sample_negatives = getattr(args, "cross_sample_negatives", 0) + + args.dropout = getattr(args, "dropout", 0.0) + args.dropout_features = getattr(args, "dropout_features", 0.0) + args.dropout_agg = getattr(args, "dropout_agg", 0.0) + args.encoder = getattr(args, "encoder", "cnn") + args.aggregator = getattr(args, "aggregator", "cnn") + + args.skip_connections_feat = getattr(args, "skip_connections_feat", False) + args.skip_connections_agg = getattr(args, "skip_connections_agg", False) + args.residual_scale = getattr(args, "residual_scale", 0.5) + + args.gru_dim = getattr(args, "gru_dim", 512) + + args.no_conv_bias = getattr(args, "no_conv_bias", False) + args.agg_zero_pad = getattr(args, "agg_zero_pad", False) + + args.log_compression = getattr(args, "log_compression", False) + + args.balanced_classes = getattr(args, "balanced_classes", False) + args.infonce = getattr(args, "infonce", False) + args.project_features = getattr(args, "project_features", "none") + + args.non_affine_group_norm = getattr(args, "non_affine_group_norm", False) + + args.offset = getattr(args, "offset", "auto") + + args.activation = getattr(args, "activation", "relu") + + args.vq_type = getattr(args, "vq_type", "none") + args.vq_vars = getattr(args, "vq_vars", 320) + args.vq_groups = getattr(args, "vq_groups", 2) + args.vq_dim = getattr(args, "vq_dim", 0) + args.vq_depth = getattr(args, 
"vq_depth", 1) + args.combine_groups = getattr(args, "combine_groups", False) + args.vq_temp = getattr(args, "vq_temp", "(2.0, 0.5, 0.999995)") + args.vq_gamma = getattr(args, "vq_gamma", 0.25) diff --git a/fairseq-0.10.2/fairseq/models/wav2vec/wav2vec2_asr.py b/fairseq-0.10.2/fairseq/models/wav2vec/wav2vec2_asr.py new file mode 100644 index 0000000000000000000000000000000000000000..52ca9a8007b3e6236c7ac23bfa573990e549d15d --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/wav2vec/wav2vec2_asr.py @@ -0,0 +1,675 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import contextlib +import copy +import math + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq import checkpoint_utils, tasks, utils +from fairseq.models import ( + BaseFairseqModel, + FairseqEncoder, + FairseqEncoderDecoderModel, + FairseqIncrementalDecoder, + register_model, + register_model_architecture, +) +from fairseq.modules import LayerNorm, PositionalEmbedding, TransformerDecoderLayer + + +def add_common_args(parser): + parser.add_argument("--w2v-path", help="path to wav2vec 2.0 model") + parser.add_argument( + "--no-pretrained-weights", + action="store_true", + help="if true, does not load pretrained weights", + ) + parser.add_argument( + "--dropout-input", + type=float, + metavar="D", + help="dropout to apply to the input (after feat extr)", + ) + parser.add_argument( + "--final-dropout", + type=float, + metavar="D", + help="dropout after transformer and before final projection", + ) + parser.add_argument( + "--apply-mask", action="store_true", help="apply masking during fine-tuning" + ) + parser.add_argument( + "--dropout", + type=float, + metavar="D", + help="dropout probability inside wav2vec 2.0 model", + ) + parser.add_argument( + "--attention-dropout", + type=float, + metavar="D", + help="dropout 
probability for attention weights inside wav2vec 2.0 model", + ) + parser.add_argument( + "--activation-dropout", + "--relu-dropout", + type=float, + metavar="D", + help="dropout probability after activation in FFN inside wav2vec 2.0 model", + ) + + parser.add_argument( + "--mask-length", type=int, help="repeat the mask indices multiple times" + ) + + parser.add_argument( + "--mask-prob", type=float, help="probability of replacing a token with mask" + ) + + parser.add_argument( + "--mask-selection", + type=str, + choices=["static", "uniform", "normal", "poisson"], + help="how to choose masks", + ) + + parser.add_argument( + "--mask-other", + type=float, + help="stdev of the mask length in case of 'normal' selection strategy", + ) + + parser.add_argument( + "--no-mask-overlap", + action="store_true", + help="whether to allow masks to overlap", + ) + + parser.add_argument( + "--mask-channel-length", type=int, help="repeat the mask indices multiple times" + ) + + parser.add_argument( + "--mask-channel-prob", + type=float, + help="probability of replacing a token with mask", + ) + + parser.add_argument( + "--mask-channel-selection", + type=str, + choices=["static", "uniform", "normal", "poisson"], + help="how to choose masks", + ) + + parser.add_argument( + "--mask-channel-other", + type=float, + help="stdev of the mask length in case of 'normal' selection strategy", + ) + + parser.add_argument( + "--no-mask-channel-overlap", + action="store_true", + help="whether to allow masks to overlap", + ) + + parser.add_argument( + "--freeze-finetune-updates", + default=0, + type=int, + help="dont finetune wav2vec for this many updates", + ) + + parser.add_argument( + "--feature-grad-mult", + default=None, + type=float, + help="reset feature grad mult in wav2vec 2.0 to this", + ) + + parser.add_argument( + "--layerdrop", + default=0.0, + type=float, + help="probability of dropping a layer in wav2vec 2.0", + ) + + +@register_model("wav2vec_ctc") +class 
Wav2VecCtc(BaseFairseqModel): + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + add_common_args(parser) + + def __init__(self, w2v_encoder, args): + super().__init__() + self.w2v_encoder = w2v_encoder + self.args = args + + def upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + return state_dict + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + base_architecture(args) + w2v_encoder = Wav2VecEncoder(args, task.target_dictionary) + return cls(w2v_encoder, args) + + def get_normalized_probs(self, net_output, log_probs): + """Get normalized probabilities (or log probs) from a net's output.""" + + logits = net_output["encoder_out"] + if log_probs: + return utils.log_softmax(logits.float(), dim=-1) + else: + return utils.softmax(logits.float(), dim=-1) + + def forward(self, **kwargs): + x = self.w2v_encoder(**kwargs) + return x + + # def max_positions(self): + # return None + + +@register_model("wav2vec_seq2seq") +class TransformerModel(FairseqEncoderDecoderModel): + def __init__(self, args, encoder, decoder): + super().__init__(encoder, decoder) + + @staticmethod + def add_args(parser): + add_common_args(parser) + + parser.add_argument( + "--decoder-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension", + ) + parser.add_argument( + "--decoder-ffn-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension for FFN", + ) + parser.add_argument( + "--decoder-layers", type=int, metavar="N", help="num decoder layers" + ) + parser.add_argument( + "--decoder-layerdrop", + type=float, + metavar="D", + help="decoder layerdrop chance", + ) + parser.add_argument( + "--decoder-attention-heads", + type=int, + metavar="N", + help="num decoder attention heads", + ) + parser.add_argument( + "--decoder-learned-pos", + action="store_true", + help="use learned positional embeddings in the decoder", + ) + 
parser.add_argument( + "--decoder-normalize-before", + action="store_true", + help="apply layernorm before each decoder block", + ) + parser.add_argument( + "--no-token-positional-embeddings", + default=False, + action="store_true", + help="if set, disables positional embeddings (outside self attention)", + ) + + parser.add_argument( + "--decoder-dropout", + type=float, + metavar="D", + help="dropout probability in the decoder", + ) + parser.add_argument( + "--decoder-attention-dropout", + type=float, + metavar="D", + help="dropout probability for attention weights inside the decoder", + ) + parser.add_argument( + "--decoder-activation-dropout", + type=float, + metavar="D", + help="dropout probability after activation in FFN inside the decoder", + ) + + # fmt: on + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + base_architecture(args) + + if not hasattr(args, "max_source_positions"): + args.max_source_positions = 2048 + if not hasattr(args, "max_target_positions"): + args.max_target_positions = 2048 + + src_dict, tgt_dict = task.source_dictionary, task.target_dictionary + + def build_embedding(dictionary, embed_dim): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + emb = Embedding(num_embeddings, embed_dim, padding_idx) + return emb + + decoder_embed_tokens = build_embedding(tgt_dict, args.decoder_embed_dim) + + encoder = cls.build_encoder(args) + decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens) + return TransformerModel(args, encoder, decoder) + + @classmethod + def build_encoder(cls, args): + return Wav2VecEncoder(args) + + @classmethod + def build_decoder(cls, args, tgt_dict, embed_tokens): + return TransformerDecoder(args, tgt_dict, embed_tokens) + + def forward(self, **kwargs): + encoder_out = self.encoder(tbc=False, **kwargs) + decoder_out = self.decoder(encoder_out=encoder_out, **kwargs) + return decoder_out + + def 
upgrade_state_dict_named(self, state_dict, name): + super().upgrade_state_dict_named(state_dict, name) + return state_dict + + +class Wav2VecEncoder(FairseqEncoder): + def __init__(self, args, tgt_dict=None): + self.apply_mask = args.apply_mask + + arg_overrides = { + "dropout": args.dropout, + "activation_dropout": args.activation_dropout, + "dropout_input": args.dropout_input, + "attention_dropout": args.attention_dropout, + "mask_length": args.mask_length, + "mask_prob": args.mask_prob, + "mask_selection": args.mask_selection, + "mask_other": args.mask_other, + "no_mask_overlap": args.no_mask_overlap, + "mask_channel_length": args.mask_channel_length, + "mask_channel_prob": args.mask_channel_prob, + "mask_channel_selection": args.mask_channel_selection, + "mask_channel_other": args.mask_channel_other, + "no_mask_channel_overlap": args.no_mask_channel_overlap, + "encoder_layerdrop": args.layerdrop, + "feature_grad_mult": args.feature_grad_mult, + } + + if getattr(args, "w2v_args", None) is None: + state = checkpoint_utils.load_checkpoint_to_cpu( + args.w2v_path, arg_overrides + ) + w2v_args = state["args"] + else: + state = None + w2v_args = args.w2v_args + + assert ( + args.normalize == w2v_args.normalize + ), "Fine-tuning works best when data normalization is the same" + + w2v_args.data = args.data + task = tasks.setup_task(w2v_args) + model = task.build_model(w2v_args) + + if state is not None and not args.no_pretrained_weights: + model.load_state_dict(state["model"], strict=True) + + model.remove_pretraining_modules() + + super().__init__(task.source_dictionary) + + d = w2v_args.encoder_embed_dim + + self.w2v_model = model + + self.final_dropout = nn.Dropout(args.final_dropout) + self.freeze_finetune_updates = args.freeze_finetune_updates + self.num_updates = 0 + + if tgt_dict is not None: + self.proj = Linear(d, len(tgt_dict)) + elif getattr(args, "decoder_embed_dim", d) != d: + self.proj = Linear(d, args.decoder_embed_dim) + else: + self.proj = None + + def 
set_num_updates(self, num_updates): + """Set the number of parameter updates.""" + super().set_num_updates(num_updates) + self.num_updates = num_updates + + def forward(self, source, padding_mask, tbc=True, **kwargs): + + w2v_args = { + "source": source, + "padding_mask": padding_mask, + "mask": self.apply_mask and self.training, + } + + ft = self.freeze_finetune_updates <= self.num_updates + + with torch.no_grad() if not ft else contextlib.ExitStack(): + x, padding_mask = self.w2v_model.extract_features(**w2v_args) + + if tbc: + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + x = self.final_dropout(x) + + if self.proj: + x = self.proj(x) + + return { + "encoder_out": x, # T x B x C (B x T x C when tbc=False) + "encoder_padding_mask": padding_mask, # B x T + "padding_mask": padding_mask, + } + + def reorder_encoder_out(self, encoder_out, new_order): + if encoder_out["encoder_out"] is not None: + encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( + 1, new_order + ) + if encoder_out["encoder_padding_mask"] is not None: + encoder_out["encoder_padding_mask"] = encoder_out[ + "encoder_padding_mask" + ].index_select(0, new_order) + return encoder_out + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return None + + def upgrade_state_dict_named(self, state_dict, name): + return state_dict + + +class TransformerDecoder(FairseqIncrementalDecoder): + """ + Transformer decoder consisting of *args.decoder_layers* layers. Each layer + is a :class:`TransformerDecoderLayer`. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False).
+ """ + + def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): + super().__init__(dictionary) + + self.dropout = args.decoder_dropout + self.share_input_output_embed = args.share_decoder_input_output_embed + + input_embed_dim = embed_tokens.embedding_dim + embed_dim = args.decoder_embed_dim + self.output_embed_dim = args.decoder_embed_dim + args.encoder_embed_dim = embed_dim + + self.layerdrop = args.decoder_layerdrop + + padding_idx = embed_tokens.padding_idx + self.max_target_positions = args.max_target_positions + + self.embed_tokens = embed_tokens + self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim + + self.project_in_dim = ( + Linear(input_embed_dim, embed_dim, bias=False) + if embed_dim != input_embed_dim + else None + ) + + self.embed_positions = ( + PositionalEmbedding( + args.max_target_positions, + embed_dim, + padding_idx, + learned=args.decoder_learned_pos, + ) + if not args.no_token_positional_embeddings + else None + ) + + args = copy.deepcopy(args) + args.dropout = args.decoder_dropout + args.attention_dropout = args.decoder_attention_dropout + args.activation_dropout = args.decoder_activation_dropout + + self.layers = nn.ModuleList([]) + self.layers.extend( + [ + TransformerDecoderLayer(args, no_encoder_attn) + for _ in range(args.decoder_layers) + ] + ) + + if not self.share_input_output_embed: + self.embed_out = nn.Parameter( + torch.Tensor(len(dictionary), self.output_embed_dim) + ) + nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) + + if args.decoder_normalize_before and not getattr( + args, "no_decoder_final_norm", False + ): + self.layer_norm = LayerNorm(embed_dim) + else: + self.layer_norm = None + + def forward( + self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (Tensor, optional): output from the 
encoder, used for + encoder-side attention + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + prev_output_tokens = prev_output_tokens.long() + x, extra = self.extract_features( + prev_output_tokens, encoder_out, incremental_state + ) + x = self.output_layer(x) + return x, extra + + def extract_features( + self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused + ): + """ + Similar to *forward* but only return features. + + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + + # embed positions + positions = ( + self.embed_positions( + prev_output_tokens, incremental_state=incremental_state + ) + if self.embed_positions is not None + else None + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + # embed tokens and positions + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + x = F.dropout(x, p=self.dropout, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + attn = None + + inner_states = [x] + + # decoder layers + for layer in self.layers: + dropout_probability = np.random.random() + if not self.training or (dropout_probability > self.layerdrop): + x, attn, _ = layer( + x, + encoder_out["encoder_out"] if encoder_out is not None else None, + encoder_out["encoder_padding_mask"] + if encoder_out is not None + else None, + incremental_state, + self_attn_mask=self.buffered_future_mask(x) + if incremental_state is None + else None, + ) + inner_states.append(x) + + if self.layer_norm: + x = self.layer_norm(x) + + # T x B x C 
-> B x T x C + x = x.transpose(0, 1) + + return x, {"attn": attn, "inner_states": inner_states} + + def output_layer(self, features, **kwargs): + """Project features to the vocabulary size.""" + # project back to size of vocabulary + if self.share_input_output_embed: + return F.linear(features, self.embed_tokens.weight) + else: + return F.linear(features, self.embed_out) + + def max_positions(self): + """Maximum output length supported by the decoder.""" + if self.embed_positions is None: + return self.max_target_positions + return min(self.max_target_positions, self.embed_positions.max_positions) + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + if ( + not hasattr(self, "_future_mask") + or self._future_mask is None + or self._future_mask.device != tensor.device + or self._future_mask.size(0) < dim + ): + self._future_mask = torch.triu( + utils.fill_with_neg_inf(tensor.new(dim, dim)), 1 + ) + return self._future_mask[:dim, :dim] + + def upgrade_state_dict_named(self, state_dict, name): + return state_dict + + +def Embedding(num_embeddings, embedding_dim, padding_idx): + m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) + nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) + nn.init.constant_(m.weight[padding_idx], 0) + return m + + +def Linear(in_features, out_features, bias=True): + m = nn.Linear(in_features, out_features, bias) + nn.init.xavier_uniform_(m.weight) + if bias: + nn.init.constant_(m.bias, 0.0) + return m + + +@register_model_architecture("wav2vec_ctc", "wav2vec_ctc") +def base_architecture(args): + args.no_pretrained_weights = getattr(args, "no_pretrained_weights", False) + args.dropout_input = getattr(args, "dropout_input", 0) + args.final_dropout = getattr(args, "final_dropout", 0) + args.apply_mask = getattr(args, "apply_mask", False) + args.dropout = getattr(args, "dropout", 0) + args.attention_dropout = getattr(args, "attention_dropout", 0) + args.activation_dropout = getattr(args, 
"activation_dropout", 0) + + args.mask_length = getattr(args, "mask_length", 10) + args.mask_prob = getattr(args, "mask_prob", 0.5) + args.mask_selection = getattr(args, "mask_selection", "static") + args.mask_other = getattr(args, "mask_other", 0) + args.no_mask_overlap = getattr(args, "no_mask_overlap", False) + args.mask_channel_length = getattr(args, "mask_channel_length", 10) + args.mask_channel_prob = getattr(args, "mask_channel_prob", 0.5) + args.mask_channel_selection = getattr(args, "mask_channel_selection", "static") + args.mask_channel_other = getattr(args, "mask_channel_other", 0) + args.no_mask_channel_overlap = getattr(args, "no_mask_channel_overlap", False) + + args.freeze_finetune_updates = getattr(args, "freeze_finetune_updates", 0) + args.feature_grad_mult = getattr(args, "feature_grad_mult", 0) + args.layerdrop = getattr(args, "layerdrop", 0.0) + + +@register_model_architecture("wav2vec_seq2seq", "wav2vec_seq2seq") +def seq2seq_architecture(args): + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) + args.decoder_layers = getattr(args, "decoder_layers", 10) + args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.decoder_dropout = getattr(args, "decoder_dropout", 0) + args.decoder_attention_dropout = getattr(args, "decoder_attention_dropout", 0) + args.decoder_activation_dropout = getattr(args, "decoder_activation_dropout", 0) + args.share_decoder_input_output_embed = getattr( + args, "share_decoder_input_output_embed", False + ) + + base_architecture(args) diff --git 
a/fairseq-0.10.2/fairseq/modules/__pycache__/same_pad.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/same_pad.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..266c03705df4ecca87e1e8d5e08f10fa2f1993e0 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/same_pad.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_sentence_encoder.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_sentence_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e23a577a3ed0501ff1939fb346e944c9eb0ef47f Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_sentence_encoder.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/adaptive_input.py b/fairseq-0.10.2/fairseq/modules/adaptive_input.py new file mode 100644 index 0000000000000000000000000000000000000000..446534a9f8b87337a4dd752944ea386ff7cf7965 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/adaptive_input.py @@ -0,0 +1,80 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +from typing import List + +import torch +from fairseq.modules.quant_noise import quant_noise +from torch import nn + + +class AdaptiveInput(nn.Module): + def __init__( + self, + vocab_size: int, + padding_idx: int, + initial_dim: int, + factor: float, + output_dim: int, + cutoff: List[int], + q_noise: float = 0, + qn_block_size: int = 8, + ): + super().__init__() + + if vocab_size > cutoff[-1]: + cutoff = cutoff + [vocab_size] + else: + assert ( + vocab_size == cutoff[-1] + ), "cannot specify cutoff larger than vocab size" + + self.cutoff = cutoff + self.embedding_dim = output_dim + self.padding_idx = padding_idx + + self.embeddings = nn.ModuleList() + for i in range(len(self.cutoff)): + prev = self.cutoff[i - 1] if i > 0 else 0 + size = self.cutoff[i] - prev + dim = int(initial_dim // (factor ** i)) + seq = nn.Sequential( + nn.Embedding(size, dim, self.padding_idx), + quant_noise( + nn.Linear(dim, output_dim, bias=False), q_noise, qn_block_size + ), + ) + + self.embeddings.append(seq) + self.padding_idx = None + self.padding_idx = padding_idx + + def init_weights(m): + if isinstance(m, nn.Embedding): + nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1] ** -0.5) + nn.init.constant_(m.weight[padding_idx], 0) + elif hasattr(m, "weight"): + nn.init.xavier_uniform_(m.weight) + + self.apply(init_weights) + + self.register_buffer("_float_tensor", torch.FloatTensor(1)) + + def weights_for_band(self, band: int): + return self.embeddings[band][0].weight, self.embeddings[band][1].weight + + def forward(self, input: torch.Tensor): + result = self._float_tensor.new(input.shape + (self.embedding_dim,)) + for i in range(len(self.cutoff)): + mask = input.lt(self.cutoff[i]) + if i > 0: + mask.mul_(input.ge(self.cutoff[i - 1])) + chunk_input = input[mask] - self.cutoff[i - 1] + else: + chunk_input = input[mask] + if mask.any(): + result[mask] = self.embeddings[i](chunk_input) + return result diff --git a/fairseq-0.10.2/fairseq/modules/conv_tbc.py 
class ConvTBC(torch.nn.Module):
    """1D convolution for inputs laid out as (time x batch x channel).

    The convolution is delegated to ``torch.conv_tbc``. According to the
    original authors it is gemm-based and faster than cuDNN for small kernels.

    ``weight`` has shape ``(kernel_size, in_channels, out_channels)`` and
    ``bias`` has shape ``(out_channels,)``. Both are allocated uninitialised
    here, so the caller is expected to initialise them.
    """

    def __init__(self, in_channels, out_channels, kernel_size, padding=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        # normalise scalar arguments to 1-tuples
        self.kernel_size = _single(kernel_size)
        self.padding = _single(padding)

        kernel_width = self.kernel_size[0]
        weight_storage = torch.Tensor(kernel_width, in_channels, out_channels)
        self.weight = torch.nn.Parameter(weight_storage)
        self.bias = torch.nn.Parameter(torch.Tensor(out_channels))

    def forward(self, input):
        """Convolve ``input`` along the time axis with the configured padding."""
        pad = self.padding[0]
        return torch.conv_tbc(input.contiguous(), self.weight, self.bias, pad)

    def __repr__(self):
        pieces = [
            "{name}({in_channels}, {out_channels}, kernel_size={kernel_size}",
            ", padding={padding}",
        ]
        if self.bias is None:
            pieces.append(", bias=False")
        pieces.append(")")
        template = "".join(pieces)
        return template.format(name=self.__class__.__name__, **self.__dict__)
+ */ + + +template +constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) { + return (a + b - 1) / b; +} + + +template +__inline__ __device__ +void zeroSharedMem(scalar_t* data) { + /* + Given an array of length FS + SB, zero out the first padding_l and last + (FS - padding_l) values in the array + */ + + int tid = threadIdx.x; + + if (FS < SB) { + + // zero all if we have enough threads in a block to do all of them + if (tid < padding_l || tid > SB - FS + padding_l - 1) { + data[tid] = scalar_t(0.0); + } + } else { + + // otherwise zero out one block at a time + const int numIterations = divUp(FS, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if (tid + offset < padding_l) { + data[tid + offset] = scalar_t(0.0); + } else if (tid + offset < FS) { + data[SB + tid + offset] = scalar_t(0.0); + } + } + } +} + +template +__inline__ __device__ +scalar_t warpReduce(scalar_t data) { + /* + Reduce an array within each warp. After processing all values in warp will + caontain the sum of all original values in that warp. + + data - pointer to data to reduce + */ + data += __shfl_xor_sync(SHFL_MASK, data, 16); + data += __shfl_xor_sync(SHFL_MASK, data, 8); + data += __shfl_xor_sync(SHFL_MASK, data, 4); + data += __shfl_xor_sync(SHFL_MASK, data, 2); + data += __shfl_xor_sync(SHFL_MASK, data, 1); + return data; +} + +template +__inline__ __device__ +scalar_t blockReduce(scalar_t data) { + /* + Reduce an entire array on the block level. After processing, the + first value in the array will contain the reduced sum. 
+ + data - pointer to data to reduce + */ + + static __shared__ scalar_t warpSum[32]; + const int tid = threadIdx.x; + int wid = tid / 32; + int lane = tid % 32; + + __syncthreads(); + + // reduce each warp then write to shared memory + scalar_t sum = warpReduce(data); + if (lane == 0) { + warpSum[wid] = sum; + } + + __syncthreads(); + + scalar_t v; + // perform final sum of partial warp sums + if (tid < blockDim.x / 32) { + v = warpSum[lane]; + } else { + v = scalar_t(0.0); + } + + if (wid == 0) { + v = warpReduce(v); + } + __syncthreads(); + + return v; +} + +void checkCudaStatus(cudaError_t status, int lineNumber = -1) { + + if (status != cudaSuccess) { + std::cout << cudaGetErrorString(status) + << " at line " << lineNumber << std::endl; + std::cout << "Exiting" << std::endl; + exit(1); + } +} + +template +__device__ +void load_input_to_shared(const scalar_t* input, // global memory + int inputOffset, int sequenceLength, + int iteration, int numIterations, + bool no_prev, scalar_t* output /* shared memory */) { + /* + Load a block size of input into shared memory with + right and left overhang of total size FS. If previously + loaded memory, overlap will be shifted over to reduce + global memory access + + input - pointer to start of channel sequence + inputOffset - how far in the sequence to start loading + sequenceLength - total length of sequence + iteration - which block of sequence we are loading + numIterations - total number of blocks to load + no_prev - whether to load the whole block if the previous block + wasn't loaded + output - shared memory to write input to + */ + + const int tid = threadIdx.x; + + // Load the left "overhang" of input + if (iteration > 0) { + if (padding_l < SB) { + + // load all at once + if (tid < padding_l) { + output[tid] = (no_prev) ? 
input[inputOffset - padding_l + tid] : output[tid + SB]; + } + } else { + + // load in chunks of size SB + int numIterations = divUp(padding_l, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if ((tid + offset) < padding_l) { + output[tid + offset] = (no_prev) ? input[inputOffset - padding_l + tid + offset] : output[tid + offset + SB]; + } + } + } + } + + // Load the right "overhang" of input + if (iteration < (numIterations - 1)) { + const int elementsLeft = sequenceLength - (iteration+1) * SB; + + if ((FS - padding_l) < SB) { + + // load all at once + if (tid < (FS - padding_l)) { + output[padding_l + SB + tid] = (tid < elementsLeft) ? input[inputOffset + SB + tid] : scalar_t(0.0); + } + } else { + + // load in chunks of size SB + int numIterations = divUp(FS - padding_l, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if ((tid + offset) < (FS - padding_l)) { + output[padding_l + SB + tid + offset] = ((tid + offset) < elementsLeft) ? input[inputOffset + SB + tid + offset] : scalar_t(0.0); + } + } + } + } + + // We should also clear out the right "overhang" + if (iteration == (numIterations - 1)) { + if ((FS - padding_l) < SB) { + + // clear out all at once + if (tid < (FS - padding_l)) { + output[padding_l + SB + tid] = scalar_t(0.0); + } + } else { + + // clear in chunks of size SB + int numIterations = divUp(FS - padding_l, SB); + for (int i = 0; i < numIterations; i++) { + int offset = i * SB; + if ((tid + offset) < (FS - padding_l)) { + output[padding_l + SB + tid + offset] = scalar_t(0.0); + } + } + } + } + output[tid + padding_l] = ((inputOffset + tid) < sequenceLength) ? 
def DynamicConv(
    input_size,
    kernel_size=1,
    padding_l=None,
    num_heads=1,
    weight_dropout=0.0,
    weight_softmax=False,
    renorm_padding=False,
    bias=False,
    conv_bias=False,
    query_size=None,
    in_proj=False,
):
    """Build a dynamic convolution layer, preferring the CUDA implementation.

    When CUDA is available and ``fairseq.modules.dynamicconv_layer`` can be
    imported, a ``DynamicconvLayer`` is returned. Otherwise (no CUDA, or the
    import fails, in which case the ImportError is printed) the pure PyTorch
    ``DynamicConv1dTBC`` is returned.

    All arguments have the meaning documented on ``DynamicConv1dTBC``.
    """
    if torch.cuda.is_available():
        try:
            from fairseq.modules.dynamicconv_layer import DynamicconvLayer

            # NOTE(review): renorm_padding / conv_bias / query_size / in_proj
            # are not forwarded here. DynamicconvLayer's signature is not
            # visible from this module, so the call is left as it was --
            # confirm which of these it accepts and forward them as well.
            return DynamicconvLayer(
                input_size,
                kernel_size=kernel_size,
                padding_l=padding_l,
                num_heads=num_heads,
                weight_dropout=weight_dropout,
                weight_softmax=weight_softmax,
                bias=bias,
            )
        except ImportError as e:
            # best effort: report the problem and fall back to PyTorch
            print(e)
    # Forward every option: previously renorm_padding, conv_bias, query_size
    # and in_proj were accepted by this factory but silently dropped, so the
    # fallback layer was always built with their defaults.
    return DynamicConv1dTBC(
        input_size,
        kernel_size=kernel_size,
        padding_l=padding_l,
        num_heads=num_heads,
        weight_dropout=weight_dropout,
        weight_softmax=weight_softmax,
        renorm_padding=renorm_padding,
        bias=bias,
        conv_bias=conv_bias,
        query_size=query_size,
        in_proj=in_proj,
    )


def Linear(in_features, out_features, bias=True):
    """Create an ``nn.Linear`` with Xavier-uniform weight and zero bias."""
    m = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(m.weight)
    if bias:
        nn.init.constant_(m.bias, 0.0)
    return m


@with_incremental_state
class DynamicConv1dTBC(nn.Module):
    """Dynamic lightweight convolution taking T x B x C inputs

    The convolution filters are not free parameters: they are predicted per
    time step (and per head) by the linear layer ``weight_linear``.

    Args:
        input_size: # of channels of the input
        kernel_size: width of the convolution kernel
        padding_l: padding to the left when using "same" padding
        num_heads: number of heads used; ``input_size`` must be divisible by it
        weight_dropout: the drop rate of the DropConnect to drop the weight
        weight_softmax: normalize the weight with softmax before the convolution
        renorm_padding: re-normalize the filters to ignore the padded part (only the non-padding parts sum up to 1)
        bias: use a bias in ``weight_linear`` (only honoured when ``in_proj`` is
            False; the ``in_proj`` projection always has a bias)
        conv_bias: add a learnable per-channel bias to the output
        query_size: specified when feeding a different input as the query
        in_proj: project the input and generate the filter together

    Shape:
        Input: TxBxC, i.e. (timesteps, batch_size, input_size)
        Output: TxBxC, i.e. (timesteps, batch_size, input_size)

    Attributes:
        weight_linear: linear layer predicting ``num_heads * kernel_size``
            filter values per position (plus ``input_size`` projected input
            channels when ``in_proj`` is set)
        conv_bias: learnable bias of shape `(input_size)`, or None
    """

    def __init__(
        self,
        input_size,
        kernel_size=1,
        padding_l=None,
        num_heads=1,
        weight_dropout=0.0,
        weight_softmax=False,
        renorm_padding=False,
        bias=False,
        conv_bias=False,
        query_size=None,
        in_proj=False,
    ):
        super().__init__()
        self.input_size = input_size
        self.query_size = input_size if query_size is None else query_size
        self.kernel_size = kernel_size
        self.padding_l = padding_l
        self.num_heads = num_heads
        self.weight_dropout_module = FairseqDropout(
            weight_dropout, module_name=self.__class__.__name__
        )
        self.weight_softmax = weight_softmax
        self.renorm_padding = renorm_padding

        if in_proj:
            # one projection yields both the new input and the filters
            self.weight_linear = Linear(
                self.input_size, self.input_size + num_heads * kernel_size * 1
            )
        else:
            self.weight_linear = Linear(
                self.query_size, num_heads * kernel_size * 1, bias=bias
            )
        if conv_bias:
            self.conv_bias = nn.Parameter(torch.Tensor(input_size))
        else:
            self.conv_bias = None
        self.reset_parameters()

    @property
    def in_proj(self):
        """Whether ``weight_linear`` also projects the input (inferred from its width)."""
        return (
            self.weight_linear.out_features
            == self.input_size + self.num_heads * self.kernel_size
        )

    def reset_parameters(self):
        """Re-initialise ``weight_linear`` (nn.Linear default) and zero ``conv_bias``."""
        self.weight_linear.reset_parameters()
        if self.conv_bias is not None:
            nn.init.constant_(self.conv_bias, 0.0)

    def forward(self, x, incremental_state=None, query=None, unfold=None):
        """Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C
        args:
            x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size)
            incremental_state: A dict to keep the state
            unfold: unfold the input or not. If not, we use the matrix trick instead
            query: use the specified query to predict the conv filters
        """
        unfold = (
            x.size(0) > 512 if unfold is None else unfold
        )  # use unfold mode as default for long sequence to save memory
        # incremental decoding is only implemented by the unfolded path
        unfold = unfold or (incremental_state is not None)
        # a separate query is incompatible with in_proj (filters come from x)
        assert query is None or not self.in_proj

        if query is None:
            query = x
        if unfold:
            output = self._forward_unfolded(x, incremental_state, query)
        else:
            output = self._forward_expanded(x, incremental_state, query)

        if self.conv_bias is not None:
            output = output + self.conv_bias.view(1, 1, -1)
        return output

    def _forward_unfolded(self, x, incremental_state, query):
        """The conventional implementation of convolutions.
        Unfolding the input by having a window shifting to the right."""
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        R = C // H
        assert R * H == C == self.input_size

        if self.in_proj:
            proj = self.weight_linear(x)
            x = proj.narrow(2, 0, self.input_size).contiguous()
            weight = (
                proj.narrow(2, self.input_size, H * K).contiguous().view(T * B * H, -1)
            )
        else:
            weight = self.weight_linear(query).view(T * B * H, -1)

        # renorm_padding is only implemented in _forward_expanded
        assert not self.renorm_padding or incremental_state is not None

        if incremental_state is not None:
            input_buffer = self._get_input_buffer(incremental_state)
            if input_buffer is None:
                input_buffer = x.new()
            # append the current step to the buffered history along dim 3
            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
            if self.kernel_size > 1:
                # keep only the last kernel_size - 1 steps for the next call
                self._set_input_buffer(
                    incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
                )
            x_unfold = x_unfold.view(T * B * H, R, -1)
        else:
            padding_l = self.padding_l
            if K > T and padding_l == K - 1:
                weight = weight.narrow(1, K - T, T)
                K, padding_l = T, T - 1
            # unfold the input: T x B x C --> T' x B x C x K
            x_unfold = unfold1d(x, K, padding_l, 0)
            x_unfold = x_unfold.view(T * B * H, R, K)

        if self.weight_softmax and not self.renorm_padding:
            weight = F.softmax(weight, dim=1)
        weight = weight.narrow(1, 0, K)

        if incremental_state is not None:
            # only as many filter taps as there are buffered steps
            weight = weight[:, -x_unfold.size(2) :]
            K = weight.size(1)

        if self.weight_softmax and self.renorm_padding:
            weight = F.softmax(weight, dim=1)

        weight = self.weight_dropout_module(weight, inplace=False)

        output = torch.bmm(x_unfold, weight.unsqueeze(2))  # T*B*H x R x 1
        output = output.view(T, B, C)
        return output

    def _forward_expanded(self, x, incremental_stat, query):
        """Turn the convolution filters into band matrices and do matrix multiplication.
        This is faster when the sequence is short, but less memory efficient.
        This is not used in the decoder during inference.
        """
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        R = C // H
        assert R * H == C == self.input_size
        if self.in_proj:
            proj = self.weight_linear(x)
            x = proj.narrow(2, 0, self.input_size).contiguous()
            weight = (
                proj.narrow(2, self.input_size, H * K).contiguous().view(T * B * H, -1)
            )
        else:
            weight = self.weight_linear(query).view(T * B * H, -1)

        if not self.renorm_padding:
            if self.weight_softmax:
                weight = F.softmax(weight, dim=1)
            weight = self.weight_dropout_module(weight, inplace=False)
        weight = weight.narrow(1, 0, K).contiguous()
        weight = weight.view(T, B * H, K).transpose(0, 1)

        x = x.view(T, B * H, R).transpose(0, 1)
        if self.weight_softmax and self.renorm_padding:
            # turn the convolution filters into band matrices
            weight_expanded = weight.new(B * H, T, T + K - 1).fill_(float("-inf"))
            weight_expanded.as_strided(
                (B * H, T, K), (T * (T + K - 1), T + K, 1)
            ).copy_(weight)
            weight_expanded = weight_expanded.narrow(2, self.padding_l, T)
            # normalize the weight over valid positions like self-attention
            weight_expanded = F.softmax(weight_expanded, dim=2)
            weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False)
        else:
            P = self.padding_l
            # For efficiency, we cut the kernel size and reduce the padding when the kernel is larger than the length
            if K > T and P == K - 1:
                weight = weight.narrow(2, K - T, T)
                K, P = T, T - 1
            # turn the convolution filters into band matrices
            weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False)
            weight_expanded.as_strided(
                (B * H, T, K), (T * (T + K - 1), T + K, 1)
            ).copy_(weight)
            weight_expanded = weight_expanded.narrow(2, P, T)  # B*H x T x T
        output = torch.bmm(weight_expanded, x)
        output = output.transpose(0, 1).contiguous().view(T, B, C)
        return output

    def reorder_incremental_state(self, incremental_state, new_order):
        """Reorder the buffered inputs along dim 1 according to ``new_order``."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            input_buffer = input_buffer.index_select(1, new_order)
            self._set_input_buffer(incremental_state, input_buffer)

    def _get_input_buffer(self, incremental_state):
        """Fetch this module's "input_buffer" entry from ``incremental_state``."""
        return utils.get_incremental_state(self, incremental_state, "input_buffer")

    def _set_input_buffer(self, incremental_state, new_buffer):
        """Store ``new_buffer`` as this module's "input_buffer" entry."""
        return utils.set_incremental_state(
            self, incremental_state, "input_buffer", new_buffer
        )

    def extra_repr(self):
        """Summarise the layer configuration for ``repr()``."""
        s = "{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, conv_bias={}, renorm_padding={}, in_proj={}".format(
            self.input_size,
            self.kernel_size,
            self.padding_l,
            self.num_heads,
            self.weight_softmax,
            self.conv_bias is not None,
            self.renorm_padding,
            self.in_proj,
        )

        if self.query_size != self.input_size:
            s += ", query_size={}".format(self.query_size)
        if self.weight_dropout_module.p > 0.0:
            s += ", weight_dropout={}".format(self.weight_dropout_module.p)
        return s
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +std::vector dynamicconv_cuda_forward( + at::Tensor input, + at::Tensor filters, + int padding_l); + +std::vector dynamicconv_cuda_backward( + at::Tensor gradOutput, + int padding_l, + at::Tensor input, + at::Tensor filters); + + +#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +std::vector dynamicconv_forward( + at::Tensor input, + at::Tensor filters, + int padding_l) { + + CHECK_INPUT(input); + CHECK_INPUT(filters); + + return dynamicconv_cuda_forward(input, filters, + padding_l); +} + +std::vector dynamicconv_backward( + at::Tensor gradOutput, + int padding_l, + at::Tensor input, + at::Tensor filters) { + + CHECK_INPUT(gradOutput); + CHECK_INPUT(input); + CHECK_INPUT(filters); + + return dynamicconv_cuda_backward(gradOutput, padding_l, + input, filters); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &dynamicconv_forward, "dynamicconv forward (CUDA)"); + m.def("backward", &dynamicconv_backward, "dynamicconv backward (CUDA)"); +} diff --git a/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..2196259433aefc88f96cd5bbcae57740a9a8c2dc --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh @@ -0,0 +1,51 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define SHFL_MASK 0xffffffff + +template +__global__ +void dynamicconv_forward_kernel(const scalar_t* input, + const scalar_t* weight, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + int numHeads, + scalar_t* output); + +template +__global__ +void dynamicconv_backward_kernel( + const scalar_t* gradOutput, // B * C * T + const scalar_t* input, // B * C * T + const scalar_t* weight, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + int numHeads, + scalar_t* gradWeight, + scalar_t* gradInput); // B * H * k * T diff --git a/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/setup.py b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..6a21f7e2ee0840a3b251522275a0b32a856951d7 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/setup.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + + +setup( + name="dynamicconv_layer", + ext_modules=[ + CUDAExtension( + name="dynamicconv_cuda", + sources=[ + "dynamicconv_cuda.cpp", + "dynamicconv_cuda_kernel.cu", + ], + ), + ], + cmdclass={"build_ext": BuildExtension}, +) diff --git a/fairseq-0.10.2/fairseq/modules/gelu.py b/fairseq-0.10.2/fairseq/modules/gelu.py new file mode 100644 index 0000000000000000000000000000000000000000..a2f1ecff4a3ae3de3eb7d327b9163c46b18a15ed --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/gelu.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
def gelu_accurate(x):
    """Apply the tanh-based GELU approximation to ``x``.

    The constant ``sqrt(2 / pi)`` is computed on first use and cached on the
    function object as ``gelu_accurate._a``.
    """
    try:
        scale = gelu_accurate._a
    except AttributeError:
        scale = gelu_accurate._a = math.sqrt(2 / math.pi)
    inner = x + 0.044715 * torch.pow(x, 3)
    return 0.5 * x * (1 + torch.tanh(scale * inner))


def gelu(x: torch.Tensor) -> torch.Tensor:
    """Apply ``torch.nn.functional.gelu`` in float32 and cast back to ``x``'s type."""
    as_float = x.float()
    activated = torch.nn.functional.gelu(as_float)
    return activated.type_as(x)
+ */ + +#include "lightconv_cuda.cuh" + +std::vector lightconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l) { + + at::DeviceGuard g(input.device()); + const auto minibatch = input.size(0); + const auto numFeatures = input.size(1); + const auto sequenceLength = input.size(2); + + const auto numHeads = filters.size(0); + const auto filterSize = filters.size(1); + + const auto numFiltersInBlock = numFeatures / numHeads; + + const dim3 blocks(minibatch, numFeatures); + + auto output = at::zeros_like(input); + auto stream = at::cuda::getCurrentCUDAStream(); +""" + + sequence_if = """ + if (sequenceLength <= {seq}) {{ + switch(filterSize) {{ +""" + + case_k = """ + case {k}: +""" + + main_block = """ + if (padding_l == {pad}) {{ + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_forward", ([&] {{ + lightconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t> + <<>>( + input.data(), + filters.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + output.data()); + }})); + }} else +""" + + bad_padding = """ + { + std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl; + } + break; +""" + + bad_filter = """ + default: + std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl; + } +""" + + con_else = """ + } else +""" + + final_else = """ + { + switch(filterSize) { +""" + + final_return = """ + } + + return {output}; +} +""" + + with open("lightconv_cuda_forward.cu", "w") as forward: + forward.write(head) + for seq in seqs: + forward.write(sequence_if.format(seq=seq)) + for k in kernels: + forward.write(case_k.format(k=k)) + for pad in [k // 2, k - 1]: + forward.write(main_block.format(k=k, b_size=seq, pad=pad)) + forward.write(bad_padding) + forward.write(bad_filter) + forward.write(con_else) + + forward.write(final_else) + for k in kernels: + forward.write(case_k.format(k=k)) + for pad in [k // 2, k - 1]: + forward.write(main_block.format(k=k, 
b_size=seq, pad=pad)) + forward.write(bad_padding) + forward.write(bad_filter) + forward.write(final_return) + + +def gen_backward(): + + head = """ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "lightconv_cuda.cuh" + +std::vector lightconv_cuda_backward( + at::Tensor gradOutput, + int padding_l, + at::Tensor input, + at::Tensor filters) { + + // gradWrtInput + const int minibatch = input.size(0); + const int numFeatures = input.size(1); + const int sequenceLength = input.size(2); + + const int numHeads = filters.size(0); + const int filterSize = filters.size(1); + + const dim3 gradBlocks(minibatch, numFeatures); + const dim3 weightGradFirstpassShortBlocks(minibatch, numHeads); + const dim3 weightGradSecondpassBlocks(numHeads, filterSize); + + const int numFiltersInBlock = numFeatures / numHeads; + + auto gradInput = at::zeros_like(input); + auto gradFilters = at::zeros_like(filters); + + at::DeviceGuard g(input.device()); + auto stream = at::cuda::getCurrentCUDAStream(); + + switch(filterSize) { +""" + + sequence_if = """ + if (sequenceLength <= {seq}) {{ +""" + + case_k = """ + case {k}: +""" + + main_block = """ + if (padding_l == {p}) {{ + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_backward", ([&] {{ + lightconv_grad_wrt_input_kernel<{k}, {b_size}, {p}, scalar_t> + <<>>( + gradOutput.data(), + filters.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + gradInput.data()); + +""" + + weight_grad_short = """ + at::Tensor tempSumGradFilters = at::zeros({{minibatch, numHeads, filterSize}}, input.options().dtype(at::kFloat)); + lightconv_grad_wrt_weights_firstpass_short_kernel<{k}, {b_size}, {p}, scalar_t> + <<>>( + input.data(), + gradOutput.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + numHeads, + tempSumGradFilters.data() + ); 
+ + lightconv_grad_wrt_weights_secondpass_short_kernel<{k}, {b_size}, scalar_t> + <<>>( + tempSumGradFilters.data(), + minibatch, + numFiltersInBlock, + gradFilters.data() + ); + }})); + }} else +""" + + weight_grad = """ + at::Tensor tempSumGradFilters = at::zeros({{minibatch, numFeatures, filterSize}}, input.options().dtype(at::kFloat)); + lightconv_grad_wrt_weights_firstpass_kernel<{k}, {b_size}, {p}, scalar_t> + <<>>( + input.data(), + gradOutput.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + tempSumGradFilters.data() + ); + + lightconv_grad_wrt_weights_secondpass_kernel<{k}, {b_size}, scalar_t> + <<>>( + tempSumGradFilters.data(), + minibatch, + numFiltersInBlock, + gradFilters.data() + ); + }})); + }} else +""" + + bad_padding = """ + { + std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl; + } +""" + + breakout = """ + break; +""" + + bad_filter = """ + default: + std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl; +""" + + con_else = """ + } else +""" + + final_else = """ + { + switch(filterSize) { +""" + + last_return = """ + } + return {gradInput, gradFilters}; +} +""" + + kernels = [3, 5, 7, 15, 31, 63, 127, 255] + seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] + thresh = [32, 32, 64, 128, 256, -1, -1, -1] + max_mem = [-1, -1, -1, -1, -1, 192, 96, 64] + + with open("lightconv_cuda_backward.cu", "w") as backward: + backward.write(head) + for (k, t, mem) in zip(kernels, thresh, max_mem): + backward.write(case_k.format(k=k)) + for seq in seqs: + if (t == -1 or seq <= t) and (mem == -1 or seq < mem): + backward.write(sequence_if.format(seq=seq)) + for p in [k // 2, k - 1]: + backward.write(main_block.format(k=k, b_size=seq, p=p)) + backward.write(weight_grad_short.format(k=k, b_size=seq, p=p)) + backward.write(bad_padding) + else: + for p in [k // 2, k - 1]: + backward.write(main_block.format(k=k, b_size=32, p=p)) + 
backward.write(weight_grad.format(k=k, b_size=32, p=p)) + backward.write(bad_padding) + backward.write(breakout) + break + backward.write(con_else) + backward.write(bad_filter) + backward.write(last_return) + + +if __name__ == "__main__": + gen_forward() + gen_backward() diff --git a/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda.cuh b/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..3cae57b68fc96872a5047a7a0d081b78456e8fae --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda.cuh @@ -0,0 +1,83 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define SHFL_MASK 0xffffffff + +template +__global__ +void lightconv_forward_kernel(const scalar_t* input, + const scalar_t* filters, + int minibatch, int sequenceLength, + int numFeatures, int numFiltersInBlock, + scalar_t* output); + +template +__global__ +void lightconv_grad_wrt_input_kernel( + const scalar_t* input, + const scalar_t* filters, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + scalar_t* output); + +template +__global__ +void lightconv_grad_wrt_weights_firstpass_short_kernel( + const scalar_t* input, + const scalar_t* gradInput, + int minibatch, + int sequenceLength, + int numFeatures, + int numFiltersInBlock, + int numHeads, + float* output); + +template +__global__ +void lightconv_grad_wrt_weights_secondpass_short_kernel( + const float* input, + const int minibatch, + const int numFiltersInBlock, + scalar_t* output); + +template +__global__ +void lightconv_grad_wrt_weights_firstpass_kernel( + const scalar_t* input, + const scalar_t* gradInput, + int minibatch, + int 
class lightconvFunction(Function):
    """Autograd wrapper around the compiled ``lightconv_cuda`` extension."""

    @staticmethod
    def forward(ctx, x, weights, padding_l):
        """Run the extension's forward pass and keep inputs for backward."""
        ctx.padding_l = padding_l
        outputs = lightconv_cuda.forward(x, weights, padding_l)
        variables = [x, weights]
        ctx.save_for_backward(*variables)
        # the extension returns a sequence; its first element is the output
        return outputs[0]

    @staticmethod
    def backward(ctx, grad_output):
        """Return gradients for ``x`` and ``weights``; ``padding_l`` gets None."""
        outputs = lightconv_cuda.backward(
            grad_output.contiguous(), ctx.padding_l, *ctx.saved_tensors
        )
        grad_input, grad_weights = outputs
        return grad_input, grad_weights, None


@with_incremental_state
class LightconvLayer(nn.Module):
    """Lightweight convolution layer backed by the ``lightconv_cuda`` kernels.

    ``weight`` is a learnable ``(num_heads, kernel_size)`` parameter. With
    ``incremental_state`` the output is computed step by step with ``bmm``;
    otherwise the CUDA kernel is used.
    """

    def __init__(
        self,
        input_size,
        kernel_size=1,
        padding_l=None,
        weight_softmax=False,
        num_heads=1,
        weight_dropout=0.0,
        bias=False,
    ):
        super(LightconvLayer, self).__init__()
        self.input_size = input_size
        self.kernel_size = kernel_size
        self.padding_l = padding_l
        self.num_heads = num_heads
        self.weight_softmax = weight_softmax
        self.weight_dropout_module = FairseqDropout(
            weight_dropout, module_name=self.__class__.__name__
        )

        self.weight = nn.Parameter(torch.Tensor(num_heads, kernel_size))
        # NOTE(review): `bias` is created and zero-initialised here but is not
        # referenced anywhere in forward() below -- confirm whether it is
        # meant to be added to the output.
        if bias:
            self.bias = nn.Parameter(torch.Tensor(input_size))
        else:
            self.bias = None
        self.reset_parameters()

    def upgrade_state_dict_named(self, state_dict, name):
        """Squeeze 3-D ``weight`` entries with a singleton dim 1 to 2-D, in place."""
        prefix = name + "." if name != "" else ""
        for k, v in state_dict.items():
            if k.endswith(prefix + "weight"):
                if v.dim() == 3 and v.size(1) == 1:
                    state_dict[k] = v.squeeze(1)

    def reset_parameters(self):
        """Xavier-uniform init for ``weight``; zero init for ``bias`` if present."""
        nn.init.xavier_uniform_(self.weight)
        if self.bias is not None:
            nn.init.constant_(self.bias, 0.0)

    def forward(self, x, incremental_state=None):
        """Apply the convolution to ``x`` (unpacked as T x B x C below).

        Returns a tensor viewed/permuted back to the input's axis order.
        """

        # during inference time, incremental BMM is faster
        if incremental_state is not None:
            T, B, C = x.size()
            K, H = self.kernel_size, self.num_heads
            R = C // H
            input_buffer = self._get_input_buffer(incremental_state)
            if input_buffer is None:
                input_buffer = x.new()
            # append the current step to the buffered history along dim 3
            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
            if self.kernel_size > 1:
                # keep only the last kernel_size - 1 steps for the next call
                self._set_input_buffer(
                    incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
                )
            x_unfold = x_unfold.view(T * B * H, R, -1)

            weight = self.weight
            if self.weight_softmax:
                # softmax is computed in float32, then cast back
                weight = F.softmax(weight.float(), dim=1).type_as(weight)

            # use only as many filter taps as there are buffered steps
            weight = weight[:, -x_unfold.size(2) :]

            K = weight.size(1)

            # broadcast the per-head filter to every (time, batch) position
            weight = (
                weight.view(1, H, K)
                .expand(T * B, H, K)
                .contiguous()
                .view(T * B * H, K, 1)
            )

            weight = self.weight_dropout_module(weight)
            output = torch.bmm(x_unfold, weight)  # T*B*H x R x 1
            output = output.view(T, B, C)
            return output

        # during training time, use CUDA kernel
        else:
            # reorder axes (1, 2, 0) for the kernel and make the data contiguous
            x = x.permute(1, 2, 0).contiguous()
            weight = self.weight
            if self.weight_softmax:
                weight = F.softmax(self.weight, -1)
            if self.weight_dropout_module.p:
                weight = self.weight_dropout_module(weight)
            # permute (2, 0, 1) undoes the reordering above
            return lightconvFunction.apply(x, weight, self.padding_l).permute(2, 0, 1)

    def reorder_incremental_state(self, incremental_state, new_order):
        """Reorder the buffered inputs along dim 1 according to ``new_order``."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            input_buffer = input_buffer.index_select(1, new_order)
            self._set_input_buffer(incremental_state, input_buffer)

    def _get_input_buffer(self, incremental_state):
        """Fetch this module's "input_buffer" entry from ``incremental_state``."""
        return utils.get_incremental_state(self, incremental_state, "input_buffer")

    def _set_input_buffer(self, incremental_state, new_buffer):
        """Store ``new_buffer`` as this module's "input_buffer" entry."""
        return utils.set_incremental_state(
            self, incremental_state, "input_buffer", new_buffer
        )

    def half(self):
        """Cast floating-point tensors of the module to half precision."""
        return self._apply(lambda t: t.half() if t.is_floating_point() else t)
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq import utils +from fairseq.incremental_decoding_utils import with_incremental_state +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.unfold import unfold1d + + +def LightweightConv( + input_size, + kernel_size=1, + padding_l=None, + num_heads=1, + weight_dropout=0.0, + weight_softmax=False, + bias=False, +): + if torch.cuda.is_available(): + try: + from fairseq.modules.lightconv_layer import LightconvLayer + + return LightconvLayer( + input_size, + kernel_size=kernel_size, + padding_l=padding_l, + num_heads=num_heads, + weight_dropout=weight_dropout, + weight_softmax=weight_softmax, + bias=bias, + ) + except ImportError as e: + print(e) + return LightweightConv1dTBC( + input_size, + kernel_size=kernel_size, + padding_l=padding_l, + num_heads=num_heads, + weight_dropout=weight_dropout, + weight_softmax=weight_softmax, + bias=bias, + ) + + +class LightweightConv1d(nn.Module): + """Lightweight Convolution assuming the input is BxCxT + This is just an example that explains LightConv clearer than the TBC version. + We don't use this module in the model. + + Args: + input_size: # of channels of the input and output + kernel_size: convolution channels + padding: padding + num_heads: number of heads used. The weight is of shape + `(num_heads, 1, kernel_size)` + weight_softmax: normalize the weight with softmax before the convolution + + Shape: + Input: BxCxT, i.e. (batch_size, input_size, timesteps) + Output: BxCxT, i.e. 
(batch_size, input_size, timesteps) + + Attributes: + weight: the learnable weights of the module of shape + `(num_heads, 1, kernel_size)` + bias: the learnable bias of the module of shape `(input_size)` + """ + + def __init__( + self, + input_size, + kernel_size=1, + padding=0, + num_heads=1, + weight_softmax=False, + bias=False, + weight_dropout=0.0, + ): + super().__init__() + self.input_size = input_size + self.kernel_size = kernel_size + self.num_heads = num_heads + self.padding = padding + self.weight_softmax = weight_softmax + self.weight = nn.Parameter(torch.Tensor(num_heads, 1, kernel_size)) + + if bias: + self.bias = nn.Parameter(torch.Tensor(input_size)) + else: + self.bias = None + self.weight_dropout_module = FairseqDropout( + weight_dropout, module_name=self.__class__.__name__ + ) + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.weight) + if self.bias is not None: + nn.init.constant_(self.bias, 0.0) + + def forward(self, input): + """ + input size: B x C x T + output size: B x C x T + """ + B, C, T = input.size() + H = self.num_heads + + weight = self.weight + if self.weight_softmax: + weight = F.softmax(weight, dim=-1) + + weight = self.weight_dropout_module(weight) + # Merge every C/H entries into the batch dimension (C = self.input_size) + # B x C x T -> (B * C/H) x H x T + # One can also expand the weight to C x 1 x K by a factor of C/H + # and do not reshape the input instead, which is slow though + input = input.view(-1, H, T) + output = F.conv1d(input, weight, padding=self.padding, groups=self.num_heads) + output = output.view(B, C, T) + if self.bias is not None: + output = output + self.bias.view(1, -1, 1) + + return output + + +@with_incremental_state +class LightweightConv1dTBC(nn.Module): + """Lightweight Convolution assuming the input is TxBxC + Args: + input_size: # of channels of the input + kernel_size: convolution channels + padding_l: padding to the left when using "same" padding + num_heads: 
number of heads used. The weight is of shape (num_heads, 1, kernel_size) + weight_dropout: the drop rate of the DropConnect to drop the weight + weight_softmax: normalize the weight with softmax before the convolution + bias: use bias + + Shape: + Input: TxBxC, i.e. (timesteps, batch_size, input_size) + Output: TxBxC, i.e. (timesteps, batch_size, input_size) + + Attributes: + weight: the learnable weights of the module of shape + `(num_heads, 1, kernel_size)` + bias: the learnable bias of the module of shape `(input_size)` + """ + + def __init__( + self, + input_size, + kernel_size=1, + padding_l=None, + num_heads=1, + weight_dropout=0.0, + weight_softmax=False, + bias=False, + ): + super().__init__() + self.input_size = input_size + self.kernel_size = kernel_size + self.padding_l = padding_l + self.num_heads = num_heads + self.weight_dropout_module = FairseqDropout( + weight_dropout, module_name=self.__class__.__name__ + ) + self.weight_softmax = weight_softmax + + self.weight = nn.Parameter(torch.Tensor(num_heads, 1, kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(input_size)) + else: + self.bias = None + + self.reset_parameters() + self.onnx_trace = False + + def reset_parameters(self): + nn.init.xavier_uniform_(self.weight) + if self.bias is not None: + nn.init.constant_(self.bias, 0.0) + + def forward(self, x, incremental_state=None, unfold=False): + """Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C + args: + x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size) + incremental_state: A dict to keep the state + unfold: unfold the input or not. 
If not, we use the matrix trick instead + """ + unfold = unfold or (incremental_state is not None) + + if unfold: + output = self._forward_unfolded(x, incremental_state) + else: + output = self._forward_expanded(x, incremental_state) + + if self.bias is not None: + output = output + self.bias.view(1, 1, -1) + return output + + def prepare_for_onnx_export_(self): + self.onnx_trace = True + + def _forward_unfolded(self, x, incremental_state): + """The conventional implementation of convolutions. + Unfolding the input by having a window shifting to the right.""" + T, B, C = x.size() + K, H = self.kernel_size, self.num_heads + R = C // H + assert R * H == C == self.input_size + + weight = self.weight.view(H, K) + if incremental_state is not None: + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is None: + input_buffer = x.new() + x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3) + if self.kernel_size > 1: + self._set_input_buffer( + incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :] + ) + x_unfold = x_unfold.view(T * B * H, R, -1) + else: + # unfold the input: T x B x C --> T' x B x C x K + x_unfold = unfold1d(x, self.kernel_size, self.padding_l, 0) + x_unfold = x_unfold.view(T * B * H, R, K) + + if self.weight_softmax: + weight = utils.softmax(weight, dim=1, onnx_trace=self.onnx_trace).type_as( + weight + ) + + if incremental_state is not None: + weight = weight[:, -x_unfold.size(2) :] + K = weight.size(1) + + weight = ( + weight.view(1, H, K).expand(T * B, H, K).contiguous().view(T * B * H, K, 1) + ) + + weight = self.weight_dropout_module(weight) + output = torch.bmm(x_unfold, weight) # T*B*H x R x 1 + output = output.view(T, B, C) + return output + + def _forward_expanded(self, x, incremental_state): + """Turn the convolution filters into band matrices and do matrix multiplication. + This is faster when the sequence is short, but less memory efficient. + This is not used in the decoder during inference. 
+ """ + T, B, C = x.size() + K, H = self.kernel_size, self.num_heads + R = C // H + assert R * H == C == self.input_size + + weight = self.weight.view(H, K) + if self.weight_softmax: + weight = utils.softmax(weight, dim=1, onnx_trace=self.onnx_trace).type_as( + weight + ) + weight = weight.view(1, H, K).expand(T * B, H, K).contiguous() + weight = weight.view(T, B * H, K).transpose(0, 1) + + x = x.view(T, B * H, R).transpose(0, 1) + P = self.padding_l + if K > T and P == K - 1: + weight = weight.narrow(2, K - T, T) + K, P = T, T - 1 + # turn the convolution filters into band matrices + weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False) + weight_expanded.as_strided((B * H, T, K), (T * (T + K - 1), T + K, 1)).copy_( + weight + ) + weight_expanded = weight_expanded.narrow(2, P, T) + weight_expanded = self.weight_dropout_module(weight_expanded) + + output = torch.bmm(weight_expanded, x) + output = output.transpose(0, 1).contiguous().view(T, B, C) + return output + + def reorder_incremental_state(self, incremental_state, new_order): + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is not None: + input_buffer = input_buffer.index_select(1, new_order) + self._set_input_buffer(incremental_state, input_buffer) + + def _get_input_buffer(self, incremental_state): + return utils.get_incremental_state(self, incremental_state, "input_buffer") + + def _set_input_buffer(self, incremental_state, new_buffer): + return utils.set_incremental_state( + self, incremental_state, "input_buffer", new_buffer + ) + + def extra_repr(self): + s = "{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, bias={}".format( + self.input_size, + self.kernel_size, + self.padding_l, + self.num_heads, + self.weight_softmax, + self.bias is not None, + ) + if self.weight_dropout_module.p > 0.0: + s += ", weight_dropout={}".format(self.weight_dropout_module.p) + return s diff --git a/fairseq-0.10.2/fairseq/modules/same_pad.py 
b/fairseq-0.10.2/fairseq/modules/same_pad.py new file mode 100644 index 0000000000000000000000000000000000000000..b46f94d6357888bde46035d8fcd57ceff5d24a88 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/same_pad.py @@ -0,0 +1,18 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +from torch import nn + + +class SamePad(nn.Module): + def __init__(self, kernel_size): + super().__init__() + self.remove = kernel_size % 2 == 0 + + def forward(self, x): + if self.remove: + x = x[:, :, :-1] + return x diff --git a/fairseq-0.10.2/fairseq/modules/scalar_bias.py b/fairseq-0.10.2/fairseq/modules/scalar_bias.py new file mode 100644 index 0000000000000000000000000000000000000000..c96247c75914fabb8a2b7ff731bb82b588f72690 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/scalar_bias.py @@ -0,0 +1,31 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+# + +import torch + + +class ScalarBias(torch.autograd.Function): + """ + Adds a vector of scalars, used in self-attention mechanism to allow + the model to optionally attend to this vector instead of the past + """ + + @staticmethod + def forward(ctx, input, dim, bias_init): + size = list(input.size()) + size[dim] += 1 + output = input.new(*size).fill_(bias_init) + output.narrow(dim, 1, size[dim] - 1).copy_(input) + ctx.dim = dim + return output + + @staticmethod + def backward(ctx, grad): + return grad.narrow(ctx.dim, 1, grad.size(ctx.dim) - 1), None, None + + +def scalar_bias(input, dim, bias_init=0): + return ScalarBias.apply(input, dim, bias_init) diff --git a/fairseq-0.10.2/fairseq/modules/sinusoidal_positional_embedding.py b/fairseq-0.10.2/fairseq/modules/sinusoidal_positional_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..857830faf7cb64950021947e2c5babcb906c48d3 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/sinusoidal_positional_embedding.py @@ -0,0 +1,105 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Any, Optional + +import torch +import torch.onnx.operators +from fairseq import utils +from torch import Tensor, nn + + +class SinusoidalPositionalEmbedding(nn.Module): + """This module produces sinusoidal positional embeddings of any length. + + Padding symbols are ignored. 
+ """ + + def __init__(self, embedding_dim, padding_idx, init_size=1024): + super().__init__() + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.weights = SinusoidalPositionalEmbedding.get_embedding( + init_size, embedding_dim, padding_idx + ) + self.onnx_trace = False + self.register_buffer("_float_tensor", torch.FloatTensor(1)) + self.max_positions = int(1e5) + + def prepare_for_onnx_export_(self): + self.onnx_trace = True + + @staticmethod + def get_embedding( + num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None + ): + """Build sinusoidal embeddings. + + This matches the implementation in tensor2tensor, but differs slightly + from the description in Section 3.5 of "Attention Is All You Need". + """ + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) + emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze( + 1 + ) * emb.unsqueeze(0) + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view( + num_embeddings, -1 + ) + if embedding_dim % 2 == 1: + # zero pad + emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) + if padding_idx is not None: + emb[padding_idx, :] = 0 + return emb + + def forward( + self, + input, + incremental_state: Optional[Any] = None, + timestep: Optional[Tensor] = None, + positions: Optional[Any] = None, + ): + """Input is expected to be of size [bsz x seqlen].""" + bspair = torch.onnx.operators.shape_as_tensor(input) + bsz, seq_len = bspair[0], bspair[1] + max_pos = self.padding_idx + 1 + seq_len + if self.weights is None or max_pos > self.weights.size(0): + # recompute/expand embeddings if needed + self.weights = SinusoidalPositionalEmbedding.get_embedding( + max_pos, self.embedding_dim, self.padding_idx + ) + self.weights = self.weights.to(self._float_tensor) + + if incremental_state is not None: + # positions is the same for every token when decoding a single step + pos = 
timestep.view(-1)[0] + 1 if timestep is not None else seq_len + if self.onnx_trace: + return ( + self.weights.index_select(index=self.padding_idx + pos, dim=0) + .unsqueeze(1) + .repeat(bsz, 1, 1) + ) + return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1) + + positions = utils.make_positions( + input, self.padding_idx, onnx_trace=self.onnx_trace + ) + if self.onnx_trace: + flat_embeddings = self.weights.detach().index_select(0, positions.view(-1)) + embedding_shape = torch.cat( + (bsz.view(1), seq_len.view(1), torch.tensor([-1], dtype=torch.long)) + ) + embeddings = torch.onnx.operators.reshape_from_tensor_shape( + flat_embeddings, embedding_shape + ) + return embeddings + return ( + self.weights.index_select(0, positions.view(-1)) + .view(bsz, seq_len, -1) + .detach() + ) diff --git a/fairseq-0.10.2/fairseq/modules/sparse_multihead_attention.py b/fairseq-0.10.2/fairseq/modules/sparse_multihead_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..3cbd9d6785886e319aab0601517e27df733b6f97 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/sparse_multihead_attention.py @@ -0,0 +1,140 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch + +from .multihead_attention import MultiheadAttention + + +class SparseMultiheadAttention(MultiheadAttention): + """Sparse Multi-Headed Attention. + + "Generating Long Sequences with Sparse Transformers". Implements + fixed factorized self attention, where l=stride and c=expressivity. + A(1) includes all words in the stride window and A(2) takes a summary of c + words from the end of each stride window. + If is_bidirectional=False, we do not include any words past the current word, + as in the paper. 
+ """ + + def __init__( + self, + embed_dim, + num_heads, + kdim=None, + vdim=None, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + self_attention=False, + encoder_decoder_attention=False, + stride=32, + expressivity=8, + is_bidirectional=True, + ): + + super().__init__( + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + self_attention, + encoder_decoder_attention, + ) + + self.is_bidirectional = is_bidirectional + self.stride = stride + self.expressivity = expressivity + assert self.stride > 0 and self.stride >= self.expressivity + + # Used for Ai(2) calculations - beginning of [l-c, l] range + def compute_checkpoint(self, word_index): + if word_index % self.stride == 0 and word_index != 0: + checkpoint_index = word_index - self.expressivity + else: + checkpoint_index = ( + math.floor(word_index / self.stride) * self.stride + + self.stride + - self.expressivity + ) + return checkpoint_index + + # Computes Ai(2) + def compute_subset_summaries(self, absolute_max): + checkpoint_index = self.compute_checkpoint(0) + subset_two = set() + while checkpoint_index <= absolute_max - 1: + summary = set( + range( + checkpoint_index, + min(checkpoint_index + self.expressivity + 1, absolute_max), + ) + ) + subset_two = subset_two.union(summary) + checkpoint_index = self.compute_checkpoint(checkpoint_index + self.stride) + return subset_two + + # Sparse Transformer Fixed Attention Pattern: https://arxiv.org/pdf/1904.10509.pdf + def compute_fixed_attention_subset(self, word_index, tgt_len): + # +1s account for range function; [min, max) -> [min, max] + if not self.is_bidirectional: + absolute_max = word_index + 1 + else: + absolute_max = tgt_len + + # Subset 1 - whole window + rounded_index = ( + math.floor((word_index + self.stride) / self.stride) * self.stride + ) + if word_index % self.stride == 0 and word_index != 0: + subset_one = set( + range(word_index - self.stride, min(absolute_max, word_index + 1)) + ) 
+ else: + subset_one = set( + range( + max(0, rounded_index - self.stride), + min(absolute_max, rounded_index + 1), + ) + ) + + # Subset 2 - summary per window + # If bidirectional, subset 2 is the same for every index + subset_two = set() + if not self.is_bidirectional: + subset_two = self.compute_subset_summaries(absolute_max) + + return subset_one.union(subset_two) + + # Compute sparse mask - if bidirectional, can pre-compute and store + def buffered_sparse_mask(self, tensor, tgt_len, src_len): + assert tgt_len > self.stride + sparse_mask = torch.empty((tgt_len, src_len)).float().fill_(float("-inf")) + + # If bidirectional, subset 2 is the same for every index + subset_summaries = set() + if self.is_bidirectional: + subset_summaries = self.compute_subset_summaries(tgt_len) + + for i in range(tgt_len): + fixed_attention_subset = self.compute_fixed_attention_subset(i, tgt_len) + fixed_attention_subset = fixed_attention_subset.union(subset_summaries) + included_word_indices = torch.LongTensor(list(fixed_attention_subset)) + sparse_mask[i].index_fill_(0, included_word_indices, 0) + return sparse_mask.type_as(tensor) + + def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz): + sparse_mask = self.buffered_sparse_mask(attn_weights, tgt_len, src_len) + sparse_mask = sparse_mask.unsqueeze(0).expand( + bsz * self.num_heads, tgt_len, src_len + ) + attn_weights += sparse_mask diff --git a/fairseq-0.10.2/fairseq/modules/transpose_last.py b/fairseq-0.10.2/fairseq/modules/transpose_last.py new file mode 100644 index 0000000000000000000000000000000000000000..e578b3ec5097bfac5c976b207ea46bec1d9bd4f5 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/transpose_last.py @@ -0,0 +1,20 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+""" +transpose last 2 dimensions of the input +""" + +import torch.nn as nn + + +class TransposeLast(nn.Module): + def __init__(self, deconstruct_idx=None): + super().__init__() + self.deconstruct_idx = deconstruct_idx + + def forward(self, x): + if self.deconstruct_idx is not None: + x = x[self.deconstruct_idx] + return x.transpose(-2, -1)