diff --git a/fairseq-0.10.2/fairseq/criterions/__init__.py b/fairseq-0.10.2/fairseq/criterions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a7eb5f6f3c272c86b15fdf697f72ee9e9382907f --- /dev/null +++ b/fairseq-0.10.2/fairseq/criterions/__init__.py @@ -0,0 +1,38 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +"""isort:skip_file""" + +import importlib +import os +from argparse import Namespace +from typing import Union + +from fairseq import registry +from fairseq.criterions.fairseq_criterion import ( # noqa + FairseqCriterion, + LegacyFairseqCriterion, +) +from omegaconf import DictConfig + + +( + build_criterion_, + register_criterion, + CRITERION_REGISTRY, + CRITERION_DATACLASS_REGISTRY, +) = registry.setup_registry( + "--criterion", base_class=FairseqCriterion, default="cross_entropy" +) + + +def build_criterion(criterion_cfg: Union[DictConfig, Namespace], task): + return build_criterion_(criterion_cfg, task) + + +# automatically import any Python files in the criterions/ directory +for file in os.listdir(os.path.dirname(__file__)): + if file.endswith(".py") and not file.startswith("_"): + file_name = file[: file.find(".py")] + importlib.import_module("fairseq.criterions." + file_name) diff --git a/fairseq-0.10.2/fairseq/criterions/__pycache__/ctc.cpython-310.pyc b/fairseq-0.10.2/fairseq/criterions/__pycache__/ctc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbef97433cc28e965c86281c5c72ecfc792ef3a1 Binary files /dev/null and b/fairseq-0.10.2/fairseq/criterions/__pycache__/ctc.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/criterions/__pycache__/fairseq_criterion.cpython-310.pyc b/fairseq-0.10.2/fairseq/criterions/__pycache__/fairseq_criterion.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b64fc17e45963bfad92bf845a05d4ed17f4cf8f3 Binary files /dev/null and b/fairseq-0.10.2/fairseq/criterions/__pycache__/fairseq_criterion.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/criterions/__pycache__/legacy_masked_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/criterions/__pycache__/legacy_masked_lm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b669b4c89eae47c072677df71802b84dcc10696 Binary files /dev/null and b/fairseq-0.10.2/fairseq/criterions/__pycache__/legacy_masked_lm.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/criterions/__pycache__/sentence_prediction.cpython-310.pyc b/fairseq-0.10.2/fairseq/criterions/__pycache__/sentence_prediction.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee152ccb93436cf15cfd892a8ed898865ec4f117 Binary files /dev/null and b/fairseq-0.10.2/fairseq/criterions/__pycache__/sentence_prediction.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/criterions/adaptive_loss.py b/fairseq-0.10.2/fairseq/criterions/adaptive_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..74ba37c321e7ba95c1cd97b5d9f0396dd313b4ee --- /dev/null +++ b/fairseq-0.10.2/fairseq/criterions/adaptive_loss.py @@ -0,0 +1,123 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from dataclasses import dataclass + +import torch.nn.functional as F +from fairseq import metrics, utils +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.dataclass import FairseqDataclass +from fairseq.dataclass.constants import DDP_BACKEND_CHOICES +from omegaconf import II + + +@dataclass +class AdaptiveLossConfig(FairseqDataclass): + sentence_avg: bool = II("params.optimization.sentence_avg") + ddp_backend: DDP_BACKEND_CHOICES = II("params.distributed_training.ddp_backend") + + +@register_criterion("adaptive_loss", dataclass=AdaptiveLossConfig) +class AdaptiveLoss(FairseqCriterion): + """This is an implementation of the loss function accompanying the adaptive softmax approximation for + graphical processing units (GPU), described in the paper "Efficient softmax approximation for GPUs" + (http://arxiv.org/abs/1609.04309).""" + + def __init__(self, task, sentence_avg): + super().__init__(task) + self.sentence_avg = sentence_avg + + @classmethod + def build_criterion(cls, args, task): + if getattr(args, "ddp_backend", None) == "c10d": + raise Exception( + "AdaptiveLoss is not compatible with the c10d " + "version of DistributedDataParallel. Please use " + "`--ddp-backend=no_c10d` instead." + ) + return cls(task, args.sentence_avg) + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + + assert ( + hasattr(model.decoder, "adaptive_softmax") + and model.decoder.adaptive_softmax is not None + ) + adaptive_softmax = model.decoder.adaptive_softmax + + net_output = model(**sample["net_input"]) + orig_target = model.get_targets(sample, net_output) + + nsentences = orig_target.size(0) + orig_target = orig_target.view(-1) + + bsz = orig_target.size(0) + + logits, target = adaptive_softmax(net_output[0], orig_target) + assert len(target) == len(logits) + + loss = net_output[0].new(1 if reduce else bsz).zero_() + + for i in range(len(target)): + if target[i] is not None: + assert target[i].min() >= 0 and target[i].max() <= logits[i].size(1) + loss += F.cross_entropy( + logits[i], + target[i], + ignore_index=self.padding_idx, + reduction="sum" if reduce else "none", + ) + + orig = utils.strip_pad(orig_target, self.padding_idx) + ntokens = orig.numel() + sample_size = sample["target"].size(0) if self.sentence_avg else ntokens + logging_output = { + "loss": loss.data, + "ntokens": ntokens, + "nsentences": nsentences, + "sample_size": sample_size, + } + return loss, sample_size, logging_output + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) + ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs)) + sample_size = utils.item( + sum(log.get("sample_size", 0) for log in logging_outputs) + ) + + metrics.log_scalar( + "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 + ) + if sample_size != ntokens: + metrics.log_scalar( + "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 + ) + metrics.log_derived( + "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg) + ) + else: + metrics.log_derived( + "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg) + ) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return True diff --git a/fairseq-0.10.2/fairseq/criterions/composite_loss.py b/fairseq-0.10.2/fairseq/criterions/composite_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..98e835fa6e4c0bcad062df9c519701bf795c98be --- /dev/null +++ b/fairseq-0.10.2/fairseq/criterions/composite_loss.py @@ -0,0 +1,100 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq import utils +from fairseq.criterions import LegacyFairseqCriterion, register_criterion +from torch import nn + + +@register_criterion("composite_loss") +class CompositeLoss(LegacyFairseqCriterion): + """This is a composite loss that, given a list of model outputs and a list of targets, + computes an average of losses for each output-target pair""" + + def __init__(self, args, task): + super().__init__(args, task) + self.underlying_criterion = args.underlying_criterion + + @staticmethod + def add_args(parser): + """Add criterion-specific arguments to the parser.""" + # fmt: off + parser.add_argument('--underlying-criterion', type=str, metavar='VAL', required=True, + help='underlying criterion to use for the composite loss') + # fmt: on + + @staticmethod + def build_underlying_criterion(args, task): + saved_criterion = args.criterion + args.criterion = args.underlying_criterion + assert saved_criterion != args.underlying_criterion + underlying_criterion = task.build_criterion(args) + args.criterion = saved_criterion + return underlying_criterion + + @classmethod + def build_criterion(cls, args, task): + underlying_criterion = CompositeLoss.build_underlying_criterion(args, task) + + class FakeModel(nn.Module): + def __init__(self, model, net_out, target): + super().__init__() + self.model = model + self.net_out = net_out + self.target = target + + def forward(self, **unused): + return self.net_out + + def get_normalized_probs(self, net_output, log_probs, sample=None): + return self.model.get_normalized_probs( + net_output, log_probs, sample=sample + ) + + def get_targets(self, *unused): + return self.target + + @property + def decoder(self): + return self.model.decoder + + class _CompositeLoss(LegacyFairseqCriterion): + def __init__(self, args, task, underlying_criterion): + super().__init__(args, task) + self.underlying_criterion = underlying_criterion + + def forward(self, model, sample, reduce=True): + net_outputs = model(**sample["net_input"]) + targets = sample["target"] + + bsz = targets[0].size(0) + loss = net_outputs[0][0].new(1 if reduce else bsz).float().zero_() + + sample_size = 0 + logging_output = {} + for o, t in zip(net_outputs[0], targets): + m = FakeModel(model, (o, net_outputs[1]), t) + sample["target"] = t + l, ss, logging_output = self.underlying_criterion(m, sample, reduce) + loss += l + sample_size += ss + + loss.div_(len(targets)) + sample_size /= len(targets) + + logging_output["loss"] = utils.item(loss.data) if reduce else loss.data + return loss, sample_size, logging_output + + @staticmethod + def aggregate_logging_outputs(logging_outputs): + return underlying_criterion.__class__.aggregate_logging_outputs( + logging_outputs + ) + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + underlying_criterion.__class__.reduce_metrics(logging_outputs) + + return _CompositeLoss(args, task, underlying_criterion) diff --git a/fairseq-0.10.2/fairseq/criterions/ctc.py b/fairseq-0.10.2/fairseq/criterions/ctc.py new file mode 100644 index 0000000000000000000000000000000000000000..4f93b3cbfd172f43449d2b80b6f3efd88416eba2 --- /dev/null +++ b/fairseq-0.10.2/fairseq/criterions/ctc.py @@ -0,0 +1,253 @@ +# All rights reserved. +# +# This source code is licensed under the license found in the LICENSE file in +# the root directory of this source tree. An additional grant of patent rights +# can be found in the PATENTS file in the same directory. + +import math +from argparse import Namespace + +import torch +import torch.nn.functional as F +from fairseq import metrics, utils +from fairseq.criterions import FairseqCriterion, register_criterion +from fairseq.data.data_utils import post_process +from fairseq.logging.meters import safe_round + + +@register_criterion("ctc") +class CtcCriterion(FairseqCriterion): + def __init__(self, task, wer_args, zero_infinity, sentence_avg, remove_bpe): + super().__init__(task) + self.blank_idx = task.target_dictionary.bos() + self.pad_idx = task.target_dictionary.pad() + self.eos_idx = task.target_dictionary.eos() + self.post_process = remove_bpe if remove_bpe else "letter" + + if wer_args is not None: + from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder + + wer_compute_kenlm, wer_lexicon, lm_w, ws_w = eval(wer_args) + + dec_args = Namespace() + dec_args.nbest = 1 + dec_args.criterion = "ctc" + dec_args.kenlm_model = wer_compute_kenlm + dec_args.lexicon = wer_lexicon + dec_args.beam = 50 + dec_args.beam_size_token = min(50, len(task.target_dictionary)) + dec_args.beam_threshold = min(50, len(task.target_dictionary)) + dec_args.lm_weight = lm_w + dec_args.word_score = ws_w + dec_args.unk_weight = -math.inf + dec_args.sil_weight = 0 + + self.w2l_decoder = W2lKenLMDecoder(dec_args, task.target_dictionary) + else: + self.w2l_decoder = None + + self.zero_infinity = zero_infinity + self.sentence_avg = sentence_avg + + @staticmethod + def add_args(parser): + """Add criterion-specific arguments to the parser.""" + parser.add_argument( + "--zero-infinity", action="store_true", help="zero inf loss" + ) + try: + parser.add_argument( + "--remove-bpe", + "--post-process", + default="letter", + help="remove BPE tokens before scoring (can be set to sentencepiece, letter, and more)", + ) + except: + pass # this option might have been added from eval args + parser.add_argument( + "--wer-args", + type=str, + default=None, + help="options for wer computation on valid set using 4 gram lm. this should be a tuple of 4 elements: path to 4-gram lm, \ + path to lexicon, lm score, word score", + ) + + def forward(self, model, sample, reduce=True): + net_output = model(**sample["net_input"]) + lprobs = model.get_normalized_probs( + net_output, log_probs=True + ).contiguous() # (T, B, C) from the encoder + + if "src_lengths" in sample["net_input"]: + input_lengths = sample["net_input"]["src_lengths"] + else: + non_padding_mask = ~net_output["padding_mask"] + input_lengths = non_padding_mask.long().sum(-1) + + pad_mask = (sample["target"] != self.pad_idx) & ( + sample["target"] != self.eos_idx + ) + targets_flat = sample["target"].masked_select(pad_mask) + target_lengths = sample["target_lengths"] + + with torch.backends.cudnn.flags(enabled=False): + loss = F.ctc_loss( + lprobs, + targets_flat, + input_lengths, + target_lengths, + blank=self.blank_idx, + reduction="sum", + zero_infinity=self.zero_infinity, + ) + + ntokens = ( + sample["ntokens"] if "ntokens" in sample else target_lengths.sum().item() + ) + + sample_size = sample["target"].size(0) if self.sentence_avg else ntokens + logging_output = { + "loss": utils.item(loss.data), # * sample['ntokens'], + "ntokens": ntokens, + "nsentences": sample["id"].numel(), + "sample_size": sample_size, + } + + if not model.training: + import editdistance + + with torch.no_grad(): + lprobs_t = lprobs.transpose(0, 1).float().contiguous().cpu() + + c_err = 0 + c_len = 0 + w_errs = 0 + w_len = 0 + wv_errs = 0 + for lp, t, inp_l in zip( + lprobs_t, + sample["target_label"] + if "target_label" in sample + else sample["target"], + input_lengths, + ): + lp = lp[:inp_l].unsqueeze(0) + + decoded = None + if self.w2l_decoder is not None: + decoded = self.w2l_decoder.decode(lp) + if len(decoded) < 1: + decoded = None + else: + decoded = decoded[0] + if len(decoded) < 1: + decoded = None + else: + decoded = decoded[0] + + p = (t != self.task.target_dictionary.pad()) & ( + t != self.task.target_dictionary.eos() + ) + targ = t[p] + targ_units = self.task.target_dictionary.string(targ) + targ_units_arr = targ.tolist() + + toks = lp.argmax(dim=-1).unique_consecutive() + pred_units_arr = toks[toks != self.blank_idx].tolist() + + c_err += editdistance.eval(pred_units_arr, targ_units_arr) + c_len += len(targ_units_arr) + + targ_words = post_process(targ_units, self.post_process).split() + + pred_units = self.task.target_dictionary.string(pred_units_arr) + pred_words_raw = post_process(pred_units, self.post_process).split() + + if decoded is not None and "words" in decoded: + pred_words = decoded["words"] + w_errs += editdistance.eval(pred_words, targ_words) + wv_errs += editdistance.eval(pred_words_raw, targ_words) + else: + dist = editdistance.eval(pred_words_raw, targ_words) + w_errs += dist + wv_errs += dist + + w_len += len(targ_words) + + logging_output["wv_errors"] = wv_errs + logging_output["w_errors"] = w_errs + logging_output["w_total"] = w_len + logging_output["c_errors"] = c_err + logging_output["c_total"] = c_len + + return loss, sample_size, logging_output + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + + loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) + ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs)) + nsentences = utils.item( + sum(log.get("nsentences", 0) for log in logging_outputs) + ) + sample_size = utils.item( + sum(log.get("sample_size", 0) for log in logging_outputs) + ) + + metrics.log_scalar( + "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 + ) + metrics.log_scalar("ntokens", ntokens) + metrics.log_scalar("nsentences", nsentences) + if sample_size != ntokens: + metrics.log_scalar( + "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 + ) + + c_errors = sum(log.get("c_errors", 0) for log in logging_outputs) + metrics.log_scalar("_c_errors", c_errors) + c_total = sum(log.get("c_total", 0) for log in logging_outputs) + metrics.log_scalar("_c_total", c_total) + w_errors = sum(log.get("w_errors", 0) for log in logging_outputs) + metrics.log_scalar("_w_errors", w_errors) + wv_errors = sum(log.get("wv_errors", 0) for log in logging_outputs) + metrics.log_scalar("_wv_errors", wv_errors) + w_total = sum(log.get("w_total", 0) for log in logging_outputs) + metrics.log_scalar("_w_total", w_total) + + if c_total > 0: + metrics.log_derived( + "uer", + lambda meters: safe_round( + meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3 + ) + if meters["_c_total"].sum > 0 + else float("nan"), + ) + if w_total > 0: + metrics.log_derived( + "wer", + lambda meters: safe_round( + meters["_w_errors"].sum * 100.0 / meters["_w_total"].sum, 3 + ) + if meters["_w_total"].sum > 0 + else float("nan"), + ) + metrics.log_derived( + "raw_wer", + lambda meters: safe_round( + meters["_wv_errors"].sum * 100.0 / meters["_w_total"].sum, 3 + ) + if meters["_w_total"].sum > 0 + else float("nan"), + ) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return True diff --git a/fairseq-0.10.2/fairseq/criterions/fairseq_criterion.py b/fairseq-0.10.2/fairseq/criterions/fairseq_criterion.py new file mode 100644 index 0000000000000000000000000000000000000000..ef94a863276d6569cb47028069ec199ec5f63055 --- /dev/null +++ b/fairseq-0.10.2/fairseq/criterions/fairseq_criterion.py @@ -0,0 +1,119 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import inspect +from typing import Any, Dict, List + +from fairseq import metrics, utils +from fairseq.dataclass.utils import gen_parser_from_dataclass +from torch.nn.modules.loss import _Loss + + +class FairseqCriterion(_Loss): + def __init__(self, task): + super().__init__() + self.task = task + if hasattr(task, "target_dictionary"): + tgt_dict = task.target_dictionary + self.padding_idx = tgt_dict.pad() if tgt_dict is not None else -100 + + @classmethod + def add_args(cls, parser): + """Add criterion-specific arguments to the parser.""" + dc = getattr(cls, "__dataclass", None) + if dc is not None: + gen_parser_from_dataclass(parser, dc()) + + @classmethod + def build_criterion(cls, args, task): + """Construct a criterion from command-line args.""" + # Criterions can override this, but for convenience we also try + # to automatically map argparse.Namespace keys to corresponding + # arguments in the __init__. + init_args = {} + for p in inspect.signature(cls).parameters.values(): + if ( + p.kind == p.POSITIONAL_ONLY + or p.kind == p.VAR_POSITIONAL + or p.kind == p.VAR_KEYWORD + ): + # we haven't implemented inference for these argument types, + # but PRs welcome :) + raise NotImplementedError("{} not supported".format(p.kind)) + + assert p.kind in {p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY} + + if p.name == "task": + init_args["task"] = task + elif hasattr(args, p.name): + init_args[p.name] = getattr(args, p.name) + elif p.default != p.empty: + pass # we'll use the default value + else: + raise NotImplementedError( + "Unable to infer Criterion arguments, please implement " + "{}.build_criterion".format(cls.__name__) + ) + return cls(**init_args) + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + raise NotImplementedError + + @staticmethod + def aggregate_logging_outputs( + logging_outputs: List[Dict[str, Any]], + ) -> Dict[str, Any]: + """Aggregate logging outputs from data parallel training.""" + utils.deprecation_warning( + "The aggregate_logging_outputs API is deprecated. " + "Please use the reduce_metrics API instead." + ) + raise NotImplementedError + + @classmethod + def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None: + """Aggregate logging outputs from data parallel training.""" + utils.deprecation_warning( + "Criterions should implement the reduce_metrics API. " + "Falling back to deprecated aggregate_logging_outputs API." + ) + agg_logging_outputs = cls.aggregate_logging_outputs(logging_outputs) + for k, v in agg_logging_outputs.items(): + if k in {"nsentences", "ntokens", "sample_size"}: + continue + metrics.log_scalar(k, v) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return False + + +class LegacyFairseqCriterion(FairseqCriterion): + def __init__(self, args, task): + super().__init__(task=task) + self.args = args + + utils.deprecation_warning( + "Criterions should take explicit arguments instead of an " + "argparse.Namespace object, please update your criterion by " + "extending FairseqCriterion instead of LegacyFairseqCriterion." + ) + + @classmethod + def build_criterion(cls, args, task): + """Construct a criterion from command-line args.""" + return cls(args, task) diff --git a/fairseq-0.10.2/fairseq/criterions/legacy_masked_lm.py b/fairseq-0.10.2/fairseq/criterions/legacy_masked_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..c70608c5a143b7b4fbd8c58dfcf9f873639d379c --- /dev/null +++ b/fairseq-0.10.2/fairseq/criterions/legacy_masked_lm.py @@ -0,0 +1,177 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch +import torch.nn.functional as F +from fairseq import metrics, utils +from fairseq.criterions import FairseqCriterion, register_criterion + + +def compute_cross_entropy_loss(logits, targets, ignore_index=-100): + """ + Function to compute the cross entropy loss. The default value of + ignore_index is the same as the default value for F.cross_entropy in + pytorch. + """ + assert logits.size(0) == targets.size( + -1 + ), "Logits and Targets tensor shapes don't match up" + + loss = F.nll_loss( + F.log_softmax(logits, -1, dtype=torch.float32), + targets, + reduction="sum", + ignore_index=ignore_index, + ) + return loss + + +@register_criterion("legacy_masked_lm_loss") +class LegacyMaskedLmLoss(FairseqCriterion): + """ + Implementation for the loss used in masked language model (MLM) training. + This optionally also computes the next sentence prediction (NSP) loss and + adds it to the overall loss based on the specified args. There are three + cases to consider: + 1) Generic MLM training without NSP loss. In this case sentence_targets + and sentence_logits are both None. + 2) BERT training without NSP loss. In this case sentence_targets is + not None but sentence_logits is None and we should not be computing + a sentence level loss. + 3) BERT training with NSP loss. In this case both sentence_targets and + sentence_logits are not None and we should be computing a sentence + level loss. The weight of the sentence level loss is specified as + an argument. + """ + + def __init__(self, task, masked_lm_only, nsp_loss_weight): + super().__init__(task) + self.masked_lm_only = masked_lm_only + self.nsp_loss_weight = nsp_loss_weight + + @staticmethod + def add_args(parser): + """Args for MaskedLM Loss""" + # Default for masked_lm_only is False so as to not break BERT training + parser.add_argument( + "--masked-lm-only", + default=False, + action="store_true", + help="compute MLM loss only", + ) + parser.add_argument( + "--nsp-loss-weight", + default=1.0, + type=float, + help="weight for next sentence prediction" " loss (default 1)", + ) + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + lm_logits, output_metadata = model(**sample["net_input"]) + + # reshape lm_logits from (N,T,C) to (N*T,C) + lm_logits = lm_logits.view(-1, lm_logits.size(-1)) + lm_targets = sample["lm_target"].view(-1) + lm_loss = compute_cross_entropy_loss(lm_logits, lm_targets, self.padding_idx) + + # compute the number of tokens for which loss is computed. This is used + # to normalize the loss + ntokens = utils.strip_pad(lm_targets, self.padding_idx).numel() + loss = lm_loss / ntokens + nsentences = sample["nsentences"] + # nsentences = 0 + + # Compute sentence loss if masked_lm_only is False + sentence_loss = None + if not self.masked_lm_only: + sentence_logits = output_metadata["sentence_logits"] + sentence_targets = sample["sentence_target"].view(-1) + # This needs to be recomputed due to some differences between + # TokenBlock and BlockPair dataset. This can be resolved with a + # refactor of BERTModel which we will do in the future. + # TODO: Remove this after refactor of BERTModel + nsentences = sentence_targets.size(0) + + # Check for logits being none which can happen when remove_heads + # is set to true in the BERT model. Ideally we should set + # masked_lm_only to true in this case, but that requires some + # refactor in the BERT model. + if sentence_logits is not None: + sentence_loss = compute_cross_entropy_loss( + sentence_logits, sentence_targets + ) + + loss += self.nsp_loss_weight * (sentence_loss / nsentences) + + # NOTE: as we are summing up per token mlm loss and per sentence nsp loss + # we don't need to use sample_size as denominator for the gradient + # here sample_size is just used for logging + sample_size = 1 + logging_output = { + "loss": utils.item(loss.data) if reduce else loss.data, + "lm_loss": utils.item(lm_loss.data) if reduce else lm_loss.data, + # sentence loss is not always computed + "sentence_loss": ( + (utils.item(sentence_loss.data) if reduce else sentence_loss.data) + if sentence_loss is not None + else 0.0 + ), + "ntokens": ntokens, + "nsentences": nsentences, + "sample_size": sample_size, + } + return loss, sample_size, logging_output + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + lm_loss_sum = sum(log.get("lm_loss", 0) for log in logging_outputs) + sentence_loss_sum = sum(log.get("sentence_loss", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + agg_loss = sum(log.get("loss", 0) for log in logging_outputs) + + metrics.log_scalar( + "loss", + agg_loss / sample_size / math.log(2) if sample_size > 0 else 0.0, + sample_size, + round=3, + ) + metrics.log_scalar( + "lm_loss", + lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0, + ntokens, + round=3, + ) + metrics.log_scalar( + "sentence_loss", + sentence_loss_sum / nsentences / math.log(2) if nsentences > 0 else 0.0, + nsentences, + round=3, + ) + metrics.log_scalar( + "nll_loss", + lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0, + ntokens, + round=3, + ) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return True diff --git a/fairseq-0.10.2/fairseq/criterions/sentence_prediction.py b/fairseq-0.10.2/fairseq/criterions/sentence_prediction.py new file mode 100644 index 0000000000000000000000000000000000000000..9519fdc56d7de86b727f74ef5b18db520382e562 --- /dev/null +++ b/fairseq-0.10.2/fairseq/criterions/sentence_prediction.py @@ -0,0 +1,99 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch +import torch.nn.functional as F +from fairseq import metrics, utils +from fairseq.criterions import FairseqCriterion, register_criterion + + +@register_criterion("sentence_prediction") +class SentencePredictionCriterion(FairseqCriterion): + def __init__(self, task, classification_head_name, regression_target): + super().__init__(task) + self.classification_head_name = classification_head_name + self.regression_target = regression_target + + @staticmethod + def add_args(parser): + # fmt: off + parser.add_argument('--classification-head-name', + default='sentence_classification_head', + help='name of the classification head to use') + # fmt: on + + def forward(self, model, sample, reduce=True): + """Compute the loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + assert ( + hasattr(model, "classification_heads") + and self.classification_head_name in model.classification_heads + ), "model must provide sentence classification head for --criterion=sentence_prediction" + + logits, _ = model( + **sample["net_input"], + features_only=True, + classification_head_name=self.classification_head_name, + ) + targets = model.get_targets(sample, [logits]).view(-1) + sample_size = targets.numel() + + if not self.regression_target: + lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) + loss = F.nll_loss(lprobs, targets, reduction="sum") + else: + logits = logits.view(-1).float() + targets = targets.float() + loss = F.mse_loss(logits, targets, reduction="sum") + + logging_output = { + "loss": loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample_size, + "sample_size": sample_size, + } + if not self.regression_target: + preds = logits.argmax(dim=1) + logging_output["ncorrect"] = (preds == targets).sum() + + return loss, sample_size, logging_output + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + + metrics.log_scalar( + "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 + ) + if sample_size != ntokens: + metrics.log_scalar( + "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 + ) + + if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]: + ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs) + metrics.log_scalar( + "accuracy", 100.0 * ncorrect / nsentences, nsentences, round=1 + ) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return True diff --git a/fairseq-0.10.2/fairseq/criterions/sentence_ranking.py b/fairseq-0.10.2/fairseq/criterions/sentence_ranking.py new file mode 100644 index 0000000000000000000000000000000000000000..d4c76341d4d87e6d0da21ac89e833ce0bda13a0c --- /dev/null +++ b/fairseq-0.10.2/fairseq/criterions/sentence_ranking.py @@ -0,0 +1,120 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math + +import torch +import torch.nn.functional as F +from fairseq import metrics, utils +from fairseq.criterions import FairseqCriterion, register_criterion + + +@register_criterion("sentence_ranking") +class SentenceRankingCriterion(FairseqCriterion): + def __init__(self, task, ranking_head_name, save_predictions, num_classes): + super().__init__(task) + self.ranking_head_name = ranking_head_name + if save_predictions is not None: + self.prediction_h = open(save_predictions, "w") + else: + self.prediction_h = None + self.num_classes = num_classes + + def __del__(self): + if self.prediction_h is not None: + self.prediction_h.close() + + @staticmethod + def add_args(parser): + # fmt: off + parser.add_argument('--save-predictions', metavar='FILE', + help='file to save predictions to') + parser.add_argument('--ranking-head-name', + default='sentence_classification_head', + help='name of the ranking head to use') + # fmt: on + + def forward(self, model, sample, reduce=True): + """Compute ranking loss for the given sample. + + Returns a tuple with three elements: + 1) the loss + 2) the sample size, which is used as the denominator for the gradient + 3) logging outputs to display while training + """ + assert ( + hasattr(model, "classification_heads") + and self.ranking_head_name in model.classification_heads + ), "model must provide sentence ranking head for --criterion=sentence_ranking" + + scores = [] + for idx in range(self.num_classes): + score, _ = model( + **sample["net_input{idx}".format(idx=idx + 1)], + classification_head_name=self.ranking_head_name, + ) + scores.append(score) + + logits = torch.cat(scores, dim=1) + sample_size = logits.size(0) + + if "target" in sample: + targets = model.get_targets(sample, [logits]).view(-1) + lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) + loss = F.nll_loss(lprobs, targets, reduction="sum") + else: + targets = None + loss = torch.tensor(0.0, requires_grad=True) + + if self.prediction_h is not None: + preds = logits.argmax(dim=1) + for i, (id, pred) in enumerate(zip(sample["id"].tolist(), preds.tolist())): + if targets is not None: + label = targets[i].item() + print("{}\t{}\t{}".format(id, pred, label), file=self.prediction_h) + else: + print("{}\t{}".format(id, pred), file=self.prediction_h) + + logging_output = { + "loss": loss.data, + "ntokens": sample["ntokens"], + "nsentences": sample_size, + "sample_size": sample_size, + } + if targets is not None: + logging_output["ncorrect"] = (logits.argmax(dim=1) == targets).sum() + + return loss, sample_size, logging_output + + @staticmethod + def reduce_metrics(logging_outputs) -> None: + """Aggregate logging outputs from data parallel training.""" + loss_sum = sum(log.get("loss", 0) for log in logging_outputs) + ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) + nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) + sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) + + metrics.log_scalar( + "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 + ) + if sample_size != ntokens: + metrics.log_scalar( + "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 + ) + + if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]: + ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs) + metrics.log_scalar( + "accuracy", 100.0 * ncorrect / nsentences, nsentences, round=1 + ) + + @staticmethod + def logging_outputs_can_be_summed() -> bool: + """ + Whether the logging outputs returned by `forward` can be summed + across workers prior to calling `reduce_metrics`. Setting this + to True will improves distributed training speed. + """ + return True diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/__pycache__/transformer.cpython-310.pyc b/fairseq-0.10.2/fairseq/model_parallel/models/__pycache__/transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65c5013e79484af9c146e46d80d022fdb3d9202c Binary files /dev/null and b/fairseq-0.10.2/fairseq/model_parallel/models/__pycache__/transformer.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py b/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py new file mode 100644 index 0000000000000000000000000000000000000000..cbfc6ae4a0bfb8e8c66403a621d5ad6e52996b1a --- /dev/null +++ b/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py @@ -0,0 +1,721 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq import utils +from fairseq.model_parallel.models.pipeline_parallel_transformer.layers import ( + Embedding, + TransformerDecoderEmbedding, + TransformerDecoderLayer, + TransformerDecoderOutputLayer, + TransformerEncoderEmbedding, + TransformerEncoderLayer, + TransformerEncoderLayerNorm, +) +from fairseq.models import ( + BaseFairseqModel, + FairseqDecoder, + FairseqEncoder, + register_model, + register_model_architecture, +) +from fairseq.models.fairseq_encoder import EncoderOut +from fairseq.models.transformer import ( + base_architecture, + transformer_iwslt_de_en, + transformer_wmt_en_de_big, +) +from fairseq.modules import SinusoidalPositionalEmbedding + + +logger = logging.getLogger(__name__) + + +DEFAULT_MAX_SOURCE_POSITIONS = 1024 +DEFAULT_MAX_TARGET_POSITIONS = 1024 + + +@register_model("pipeline_parallel_transformer") +class PipelineParallelTransformerModel(BaseFairseqModel): + def __init__(self, encoder, decoder, balance, devices, chunks, checkpoint): + try: + from fairscale.nn import Pipe + except ImportError: + raise ImportError("Please install fairscale with: pip install fairscale") + super().__init__() + assert isinstance(encoder, FairseqEncoder) + assert isinstance(decoder, FairseqDecoder) + encoder_module_list = ( + [encoder.embedding_layer] + + list(encoder.encoder_layers) + + [encoder.final_layer_norm] + ) + self.num_encoder_modules = len(encoder_module_list) + decoder_module_list = ( + [decoder.embedding_layer] + + list(decoder.decoder_layers) + + [decoder.decoder_output_layer] + ) + self.num_decoder_modules = len(decoder_module_list) + module_list = encoder_module_list + decoder_module_list + self.devices = devices + self.model = Pipe( + nn.Sequential(*module_list), + balance=balance, + devices=devices, + chunks=chunks, + checkpoint=checkpoint, + ) + self.encoder_max_positions = self.max_positions_helper( + encoder.embedding_layer, "max_source_positions" + ) + self.decoder_max_positions = self.max_positions_helper( + decoder.embedding_layer, "max_target_positions" + ) + self.adaptive_softmax = getattr(decoder, "adaptive_softmax", None) + # Note: To be populated during inference + self.encoder = None + self.decoder = None + + def forward(self, src_tokens, src_lengths, prev_output_tokens): + if self.training: + input_lst = [src_tokens, src_lengths, prev_output_tokens] + input = tuple(i.to(self.devices[0], non_blocking=True) for i in input_lst) + return self.model(input) + else: + assert self.encoder is not None and self.decoder is not None, ( + "encoder and decoder need to be initialized by " + + "calling the `prepare_for_inference_()` method" + ) + encoder_output_tuple = self.encoder(input) + return self.decoder(encoder_output_tuple) + + def prepare_for_inference_(self, args): + if self.encoder is not None and self.decoder is not None: + logger.info("Encoder and Decoder already initialized") + return + encoder_module_list = [] + decoder_module_list = [] + module_count = 0 + for partition in self.model.partitions: + for module in partition: + if module_count < self.num_encoder_modules: + encoder_module_list.append(module) + else: + decoder_module_list.append(module) + module_count += 1 + self.model = None + self.encoder = TransformerEncoder(args, None, None, encoder_module_list) + self.decoder = TransformerDecoder( + args, None, None, decoder_module_list=decoder_module_list + ) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + # fmt: off + parser.add_argument('--activation-fn', + choices=utils.get_available_activation_fns(), + help='activation function to use') + parser.add_argument('--dropout', type=float, metavar='D', + help='dropout probability') + parser.add_argument('--attention-dropout', type=float, metavar='D', + help='dropout probability for attention weights') + parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D', + help='dropout probability after activation in FFN.') + parser.add_argument('--encoder-embed-path', type=str, metavar='STR', + help='path to pre-trained encoder embedding') + parser.add_argument('--encoder-embed-dim', type=int, metavar='N', + help='encoder embedding dimension') + parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N', + help='encoder embedding dimension for FFN') + parser.add_argument('--encoder-layers', type=int, metavar='N', + help='num encoder layers') + parser.add_argument('--encoder-attention-heads', type=int, metavar='N', + help='num encoder attention heads') + parser.add_argument('--encoder-normalize-before', action='store_true', + help='apply layernorm before each encoder block') + parser.add_argument('--encoder-learned-pos', action='store_true', + help='use learned positional embeddings in the encoder') + parser.add_argument('--decoder-embed-path', type=str, metavar='STR', + help='path to pre-trained decoder embedding') + parser.add_argument('--decoder-embed-dim', type=int, metavar='N', + help='decoder embedding dimension') + parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', + help='decoder embedding dimension for FFN') + parser.add_argument('--decoder-layers', type=int, metavar='N', + help='num decoder layers') + parser.add_argument('--decoder-attention-heads', type=int, metavar='N', + help='num decoder attention heads') + parser.add_argument('--decoder-learned-pos', action='store_true', + help='use learned positional embeddings in the decoder') + parser.add_argument('--decoder-normalize-before', action='store_true', + help='apply layernorm before each decoder block') + parser.add_argument('--share-decoder-input-output-embed', action='store_true', + help='share decoder input and output embeddings') + parser.add_argument('--share-all-embeddings', action='store_true', + help='share encoder, decoder and output embeddings' + ' (requires shared dictionary and embed dim)') + parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', + help='if set, disables positional embeddings (outside self attention)') + parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', + help='comma separated list of adaptive softmax cutoff points. ' + 'Must be used with adaptive_loss criterion'), + parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D', + help='sets adaptive softmax dropout for the tail projections') + parser.add_argument('--num-embedding-chunks', type=int, metavar='N', default=1, + help='Number of embedding layer chunks (enables more even distribution' + 'of optimizer states across data parallel nodes' + 'when using optimizer state sharding and' + 'a big embedding vocabulary)') + # fmt: on + + @classmethod + def build_model_base(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + base_architecture(args) + + if not hasattr(args, "max_source_positions"): + args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS + if not hasattr(args, "max_target_positions"): + args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS + + src_dict, tgt_dict = task.source_dictionary, task.target_dictionary + + def build_embedding(dictionary, embed_dim, path=None, num_embed_chunks=1): + assert embed_dim % num_embed_chunks == 0, ( + f"Number of embedding chunks = {num_embed_chunks} should be " + + f"divisible by the embedding dimension = {embed_dim}" + ) + assert path is None or num_embed_chunks == 1, ( + "Loading embedding from a path with number of embedding chunks > 1" + + " is not yet supported" + ) + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + # if provided, load from preloaded dictionaries + if path: + emb = Embedding(num_embeddings, embed_dim, padding_idx) + embed_dict = utils.parse_embedding(path) + utils.load_embedding(embed_dict, dictionary, emb) + else: + embed_chunk_dim = embed_dim // num_embed_chunks + emb = nn.ModuleList() + for i in range(num_embed_chunks): + emb.append(Embedding(num_embeddings, embed_chunk_dim, padding_idx)) + return emb + + num_embed_chunks = args.num_embedding_chunks + if args.share_all_embeddings: + if src_dict != tgt_dict: + raise ValueError("--share-all-embeddings requires a joined dictionary") + if args.encoder_embed_dim != args.decoder_embed_dim: + raise ValueError( + "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" + ) + if args.decoder_embed_path and ( + args.decoder_embed_path != args.encoder_embed_path + ): + raise ValueError( + "--share-all-embeddings not compatible with --decoder-embed-path" + ) + encoder_embed_tokens = build_embedding( + src_dict, + args.encoder_embed_dim, + args.encoder_embed_path, + num_embed_chunks, + ) + decoder_embed_tokens = encoder_embed_tokens + args.share_decoder_input_output_embed = True + else: + assert args.share_decoder_input_output_embed or num_embed_chunks == 1, ( + "Not sharing decoder I/O embeddings is not yet supported with number of " + + "embedding chunks > 1" + ) + encoder_embed_tokens = build_embedding( + src_dict, + args.encoder_embed_dim, + args.encoder_embed_path, + num_embed_chunks, + ) + decoder_embed_tokens = build_embedding( + tgt_dict, + args.decoder_embed_dim, + args.decoder_embed_path, + num_embed_chunks, + ) + + encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens) + decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens) + return (encoder, decoder) + + @classmethod + def build_encoder(cls, args, src_dict, embed_tokens): + return TransformerEncoder(args, src_dict, embed_tokens) + + @classmethod + def build_decoder(cls, args, tgt_dict, embed_tokens): + return TransformerDecoder(args, tgt_dict, embed_tokens) + + @classmethod + def build_model(cls, args, task): + encoder, decoder = cls.build_model_base(args, task) + return PipelineParallelTransformerModel( + encoder=encoder, + decoder=decoder, + balance=utils.eval_str_list(args.pipeline_balance, type=int), + devices=utils.eval_str_list(args.pipeline_devices, type=int), + chunks=args.pipeline_chunks, + checkpoint=args.pipeline_checkpoint, + ) + + def output_layer(self, features, **kwargs): + """Project features to the default output size (typically vocabulary size).""" + return self.decoder.output_layer(features, **kwargs) + + def max_positions(self): + """Maximum length supported by the model.""" + return (self.encoder_max_positions, self.decoder_max_positions) + + def max_positions_helper( + self, embedding_layer, max_positions_field="max_source_positions" + ): + """Maximum input length supported by the encoder or decoder.""" + if embedding_layer.embed_positions is None: + return getattr(embedding_layer, max_positions_field) + return min( + getattr(embedding_layer, max_positions_field), + embedding_layer.embed_positions.max_positions, + ) + + def get_normalized_probs(self, net_output, log_probs, sample=None): + """Get normalized probabilities (or log probs) from a net's output.""" + + if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None: + if sample is not None: + assert "target" in sample + target = sample["target"] + else: + target = None + out = self.adaptive_softmax.get_log_prob(net_output, target=target) + return out.exp_() if not log_probs else out + + # A Pipe() module returns a tuple of tensors as the output. + # In this case, the tuple has one element - the output tensor of logits + logits = net_output if isinstance(net_output, torch.Tensor) else net_output[0] + if log_probs: + return utils.log_softmax(logits, dim=-1, onnx_trace=False) + else: + return utils.softmax(logits, dim=-1, onnx_trace=False) + + def max_decoder_positions(self): + """Maximum length supported by the decoder.""" + return self.decoder_max_positions + + def load_state_dict(self, state_dict, strict=True, args=None): + """Copies parameters and buffers from *state_dict* into this module and + its descendants. + + Overrides the method in :class:`nn.Module`. Compared with that method + this additionally "upgrades" *state_dicts* from old checkpoints. + """ + self.upgrade_state_dict(state_dict) + is_regular_transformer = not any("model.partitions" in k for k in state_dict) + if is_regular_transformer: + state_dict = self.convert_to_pipeline_parallel_state_dict(state_dict) + return super().load_state_dict(state_dict, strict) + + def convert_to_pipeline_parallel_state_dict(self, state_dict): + new_state_dict = self.state_dict() + encoder_layer_idx = 0 + decoder_layer_idx = 0 + encoder_key_suffixes = [ + "self_attn.k_proj.weight", + "self_attn.k_proj.bias", + "self_attn.v_proj.weight", + "self_attn.v_proj.bias", + "self_attn.q_proj.weight", + "self_attn.q_proj.bias", + "self_attn.out_proj.weight", + "self_attn.out_proj.bias", + "self_attn_layer_norm.weight", + "self_attn_layer_norm.bias", + "fc1.weight", + "fc1.bias", + "fc2.weight", + "fc2.bias", + "final_layer_norm.weight", + "final_layer_norm.bias", + ] + decoder_key_suffixes = [ + "self_attn.k_proj.weight", + "self_attn.k_proj.bias", + "self_attn.v_proj.weight", + "self_attn.v_proj.bias", + "self_attn.q_proj.weight", + "self_attn.q_proj.bias", + "self_attn.out_proj.weight", + "self_attn.out_proj.bias", + "self_attn_layer_norm.weight", + "self_attn_layer_norm.bias", + "encoder_attn.k_proj.weight", + "encoder_attn.k_proj.bias", + "encoder_attn.v_proj.weight", + "encoder_attn.v_proj.bias", + "encoder_attn.q_proj.weight", + "encoder_attn.q_proj.bias", + "encoder_attn.out_proj.weight", + "encoder_attn.out_proj.bias", + "encoder_attn_layer_norm.weight", + "encoder_attn_layer_norm.bias", + "fc1.weight", + "fc1.bias", + "fc2.weight", + "fc2.bias", + "final_layer_norm.weight", + "final_layer_norm.bias", + ] + for pid, partition in enumerate(self.model.partitions): + logger.info(f"Begin Partition {pid}") + for mid, module in enumerate(partition): + # fmt: off + if isinstance(module, TransformerEncoderEmbedding): + new_state_dict[f'model.partitions.{pid}.{mid}.embed_tokens.weight'] = state_dict['encoder.embed_tokens.weight'] + new_state_dict[f'model.partitions.{pid}.{mid}.embed_positions._float_tensor'] = state_dict['encoder.embed_positions._float_tensor'] + if isinstance(module, TransformerEncoderLayer): + for suffix in encoder_key_suffixes: + new_state_dict[f'model.partitions.{pid}.{mid}.{suffix}'] = state_dict[f'encoder.layers.{encoder_layer_idx}.{suffix}'] + encoder_layer_idx += 1 + if isinstance(module, TransformerDecoderLayer): + for suffix in decoder_key_suffixes: + new_state_dict[f'model.partitions.{pid}.{mid}.{suffix}'] = state_dict[f'decoder.layers.{decoder_layer_idx}.{suffix}'] + decoder_layer_idx += 1 + if isinstance(module, TransformerEncoderLayerNorm): + if 'encoder.layer_norm.weight' in state_dict: + new_state_dict[f'model.partitions.{pid}.{mid}.layer_norm.weight'] = state_dict['encoder.layer_norm.weight'] + new_state_dict[f'model.partitions.{pid}.{mid}.layer_norm.bias'] = state_dict['encoder.layer_norm.bias'] + if isinstance(module, TransformerDecoderEmbedding): + new_state_dict[f'model.partitions.{pid}.{mid}.embed_tokens.weight'] = state_dict['decoder.embed_tokens.weight'] + new_state_dict[f'model.partitions.{pid}.{mid}.embed_positions._float_tensor'] = state_dict['decoder.embed_positions._float_tensor'] + if isinstance(module, TransformerDecoderOutputLayer): + new_state_dict[f'model.partitions.{pid}.{mid}.output_projection.weight'] = state_dict['decoder.output_projection.weight'] + # fmt: on + return new_state_dict + + +class TransformerEncoder(FairseqEncoder): + """ + Transformer encoder consisting of *args.encoder_layers* layers. Each layer + is a :class:`TransformerEncoderLayer`. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): encoding dictionary + embed_tokens (torch.nn.Embedding): input embedding + """ + + def __init__(self, args, dictionary, embed_tokens, encoder_module_list=None): + super().__init__(dictionary) + self.register_buffer("version", torch.Tensor([3])) + try: + from fairscale.nn import Pipe + except ImportError: + raise ImportError("Please install fairscale with: pip install fairscale") + if encoder_module_list is None: + embedding_layer = TransformerEncoderEmbedding(args, embed_tokens) + layers = [TransformerEncoderLayer(args) for i in range(args.encoder_layers)] + if isinstance(embed_tokens, nn.ModuleList): + emb_dim = sum(e.embedding_dim for e in embed_tokens) + else: + emb_dim = embed_tokens.embedding_dim + final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim) + encoder_module_list = [embedding_layer] + layers + [final_layer_norm] + self.use_pipeline = getattr(args, "pipeline_encoder_balance", None) is not None + if self.use_pipeline: + encoder_balance = utils.eval_str_list( + args.pipeline_encoder_balance, type=int + ) + encoder_devices = utils.eval_str_list( + args.pipeline_encoder_devices, type=int + ) + assert sum(encoder_balance) == len(encoder_module_list), ( + f"Sum of encoder_balance={encoder_balance} is not equal " + + f"to num_encoder_modules={len(encoder_module_list)}" + ) + self.model = Pipe( + module=nn.Sequential(*encoder_module_list), + balance=encoder_balance, + devices=encoder_devices, + chunks=args.pipeline_chunks, + checkpoint=args.pipeline_checkpoint, + ) + else: + self.embedding_layer = encoder_module_list[0] + self.encoder_layers = nn.Sequential(*encoder_module_list[1:-1]) + self.final_layer_norm = encoder_module_list[-1] + + def forward(self, src_tokens, src_lengths): + """ + Args: + input_tuple( + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (torch.LongTensor): lengths of each source sentence of + shape `(batch)` + ) + + Returns: + output_tuple( + - **encoder_out** (Tensor): the last encoder layer's output of + shape `(src_len, batch, embed_dim)` + - **encoder_padding_mask** (ByteTensor): the positions of + padding elements of shape `(batch, src_len)` + - prev_output_tokens + - **encoder_states** (List[Tensor]): all intermediate + hidden states of shape `(src_len, batch, embed_dim)`. + Only populated if *return_all_hiddens* is True. + ) + """ + dummy_prev_output_tokens = torch.zeros( + 1, dtype=src_tokens.dtype, device=src_tokens.device + ) + input_tuple = (src_tokens, src_lengths, dummy_prev_output_tokens) + if self.use_pipeline: + input_tuple = tuple(i.to(self.model.devices[0]) for i in input_tuple) + encoder_out = self.model(input_tuple) + else: + encoder_embed_output_tuple = self.embedding_layer(input_tuple) + encoder_layers_output = self.encoder_layers(encoder_embed_output_tuple) + encoder_out = self.final_layer_norm(encoder_layers_output) + # first element is the encoder output + # second element is the encoder padding mask + # the remaining elements of EncoderOut are not computed by + # the PipelineParallelTransformer + return EncoderOut(encoder_out[0], encoder_out[1], None, None, None, None) + + def reorder_encoder_out(self, encoder_out, new_order): + """ + Reorder encoder output according to *new_order*. + + Args: + encoder_out: output from the ``forward()`` method + new_order (LongTensor): desired order + + Returns: + *encoder_out* rearranged according to *new_order* + """ + if encoder_out.encoder_out is not None: + encoder_out = encoder_out._replace( + encoder_out=encoder_out.encoder_out.index_select(1, new_order) + ) + if encoder_out.encoder_padding_mask is not None: + encoder_out = encoder_out._replace( + encoder_padding_mask=encoder_out.encoder_padding_mask.index_select( + 0, new_order + ) + ) + if encoder_out.encoder_embedding is not None: + encoder_out = encoder_out._replace( + encoder_embedding=encoder_out.encoder_embedding.index_select( + 0, new_order + ) + ) + if encoder_out.encoder_states is not None: + for idx, state in enumerate(encoder_out.encoder_states): + encoder_out.encoder_states[idx] = state.index_select(1, new_order) + return encoder_out + + def max_positions(self): + """Maximum input length supported by the encoder.""" + if self.embedding_layer.embed_positions is None: + return self.embedding_layer.max_source_positions + return min( + self.embedding_layer.max_source_positions, + self.embedding_layer.embed_positions.max_positions, + ) + + +class TransformerDecoder(FairseqDecoder): + """ + Transformer decoder consisting of *args.decoder_layers* layers. Each layer + is a :class:`TransformerDecoderLayer`. + + Args: + args (argparse.Namespace): parsed command-line arguments + dictionary (~fairseq.data.Dictionary): decoding dictionary + embed_tokens (torch.nn.Embedding): output embedding + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). + """ + + def __init__( + self, + args, + dictionary, + embed_tokens, + no_encoder_attn=False, + decoder_module_list=None, + ): + super().__init__(dictionary) + self.register_buffer("version", torch.Tensor([3])) + try: + from fairscale.nn import Pipe + except ImportError: + raise ImportError("Please install fairscale with: pip install fairscale") + if decoder_module_list is None: + embedding_layer = TransformerDecoderEmbedding(args, embed_tokens) + layers = [ + TransformerDecoderLayer(args, no_encoder_attn) + for _ in range(args.decoder_layers) + ] + decoder_output_layer = TransformerDecoderOutputLayer( + args, embed_tokens, dictionary + ) + decoder_module_list = [embedding_layer] + layers + [decoder_output_layer] + self.use_pipeline = getattr(args, "pipeline_decoder_balance", None) is not None + if self.use_pipeline: + decoder_balance = utils.eval_str_list( + args.pipeline_decoder_balance, type=int + ) + decoder_devices = utils.eval_str_list( + args.pipeline_decoder_devices, type=int + ) + assert sum(decoder_balance) == len(decoder_module_list), ( + f"Sum of decoder_balance={decoder_balance} is not equal " + + f"to num_decoder_modules={len(decoder_module_list)}" + ) + self.model = Pipe( + module=nn.Sequential(*decoder_module_list), + balance=decoder_balance, + devices=decoder_devices, + chunks=args.pipeline_chunks, + checkpoint=args.pipeline_checkpoint, + ) + else: + self.embedding_layer = decoder_module_list[0] + self.decoder_layers = nn.Sequential(*decoder_module_list[1:-1]) + self.decoder_output_layer = decoder_module_list[-1] + + def forward( + self, + prev_output_tokens, + encoder_out=None, + ): + """ + Args: + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (optional): output from the encoder, used for + encoder-side attention + incremental_state (dict): dictionary used for storing state during + :ref:`Incremental decoding` + features_only (bool, optional): only return features without + applying output layer (default: False). + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + input_tuple = ( + encoder_out.encoder_out, + encoder_out.encoder_padding_mask, + prev_output_tokens, + ) + if self.use_pipeline: + input_tuple = tuple(i.to(self.model.devices[0]) for i in input_tuple) + return (self.model(input_tuple),) + else: + embed_layer_output = self.embedding_layer(input_tuple) + state = self.decoder_layers(embed_layer_output) + return (self.decoder_output_layer(state),) + + def output_layer(self, features, **kwargs): + """Project features to the vocabulary size.""" + if self.adaptive_softmax is None: + # project back to size of vocabulary + if self.share_input_output_embed: + return F.linear(features, self.embed_tokens.weight) + else: + return F.linear(features, self.embed_out) + else: + return features + + def max_positions(self): + """Maximum output length supported by the decoder.""" + if self.embedding_layer.embed_positions is None: + return self.embedding_layer.max_target_positions + return min( + self.embedding_layer.max_target_positions, + self.embedding_layer.embed_positions.max_positions, + ) + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + if ( + not hasattr(self, "_future_mask") + or self._future_mask is None + or self._future_mask.device != tensor.device + or self._future_mask.size(0) < dim + ): + self._future_mask = torch.triu( + utils.fill_with_neg_inf(tensor.new(dim, dim)), 1 + ) + return self._future_mask[:dim, :dim] + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): + weights_key = "{}.embed_positions.weights".format(name) + if weights_key in state_dict: + del state_dict[weights_key] + state_dict[ + "{}.embed_positions._float_tensor".format(name) + ] = torch.FloatTensor(1) + + for i in range(len(self.layers)): + # update layer norms + layer_norm_map = { + "0": "self_attn_layer_norm", + "1": "encoder_attn_layer_norm", + "2": "final_layer_norm", + } + for old, new in layer_norm_map.items(): + for m in ("weight", "bias"): + k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m) + if k in state_dict: + state_dict[ + "{}.layers.{}.{}.{}".format(name, i, new, m) + ] = state_dict[k] + del state_dict[k] + + version_key = "{}.version".format(name) + if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: + # earlier checkpoints did not normalize after the stack of layers + self.layer_norm = None + self.normalize = False + state_dict[version_key] = torch.Tensor([1]) + + return state_dict + + +@register_model_architecture( + "pipeline_parallel_transformer", "transformer_iwslt_de_en_pipeline_parallel" +) +def transformer_iwslt_de_en_dist(args): + transformer_iwslt_de_en(args) + + +@register_model_architecture( + "pipeline_parallel_transformer", "transformer_wmt_en_de_big_pipeline_parallel" +) +def transformer_wmt_en_de_big_dist(args): + transformer_wmt_en_de_big(args) diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__init__.py b/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..117827c3e9c176477f33e3a6fd7fe19a922411a2 --- /dev/null +++ b/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .model import * # noqa diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__pycache__/__init__.cpython-310.pyc b/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86eb916eddcdc3781aaf02b7320c293c17dd45e1 Binary files /dev/null and b/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/__init__.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8ddc583ce9946e70655bd09bf2604029c9d390a2 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..46fa438688304fdc89f8fad8308b2896e8c00cc1 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..643549647692c6cc192d23d03168a4e5f511a7d3 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e3c50bc4650da9745c90d1d0ca817f1a01c4e10 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..489bb4adc7ac86fde068b9a796a06cfc5d74c2cc Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a80fca170ef22eef58a006b5115af2f4b8edc175 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fconv.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fconv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..acadfc2292198993a872422f37ce0c0ff93f399c Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fconv.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae8753142c67a74f88f7ae0e67f7e1d04eebfce0 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cfa043d5f5c1b1197feacfa1476f15c84d8603bc Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/lightconv.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/lightconv.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..69425245346f14bb98f484715caa190c745e14b7 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/lightconv.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4677bccd8ce34a3ee6e3c9b7c6c070c8286adc40 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/lstm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/lstm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ce08d2d28811eb3eb9364e1c7d25bcf21e486a1 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/lstm.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f4a287f918f3fd619a82979dbc57fadf5617431b Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/masked_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/masked_lm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e408484ea10c396dc44c691728d08c0b2ff8175a Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/masked_lm.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/model_utils.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/model_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8084a8b89b9b5554542a892ba96c68ad8a9668d0 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/model_utils.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d42f8d48e03bc4d4383a43cf38b0006c81d9c499 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/transformer.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/transformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f58f1f65706e34ecd620bfd007e3a2a971c8f3c2 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/transformer.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6938891548751ce0bd939a5f945ce33f3ff5ad17 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/bart/__init__.py b/fairseq-0.10.2/fairseq/models/bart/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a701923f7e5a2a8aa9b75e5580ddea22907f53ee --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/bart/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .hub_interface import * # noqa +from .model import * # noqa diff --git a/fairseq-0.10.2/fairseq/models/bart/hub_interface.py b/fairseq-0.10.2/fairseq/models/bart/hub_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..cdabe36010bdfde5680f7fd6439b9b2c56c660bd --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/bart/hub_interface.py @@ -0,0 +1,201 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import copy +import logging +from typing import List + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq import utils +from fairseq.data import encoders + + +logger = logging.getLogger(__name__) + + +class BARTHubInterface(nn.Module): + """A simple PyTorch Hub interface to BART. + + Usage: https://github.com/pytorch/fairseq/tree/master/examples/bart + """ + + def __init__(self, args, task, model): + super().__init__() + self.args = args + self.task = task + self.model = model + + self.bpe = encoders.build_bpe(args) + + self.max_positions = min( + utils.resolve_max_positions( + self.task.max_positions(), + self.model.max_positions(), + ) + ) + + # this is useful for determining the device + self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) + + @property + def device(self): + return self._float_tensor.device + + def encode( + self, sentence: str, *addl_sentences, no_separator=True + ) -> torch.LongTensor: + """ + BPE-encode a sentence (or multiple sentences). + + Every sequence begins with a beginning-of-sentence (``) symbol. + Every sentence ends with an end-of-sentence (``). + + Example (single sentence): ` a b c ` + Example (sentence pair): ` d e f 1 2 3 ` + + The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE + requires leading spaces. For example:: + + >>> bart.encode('Hello world').tolist() + [0, 31414, 232, 2] + >>> bart.encode(' world').tolist() + [0, 232, 2] + >>> bart.encode('world').tolist() + [0, 8331, 2] + """ + tokens = self.bpe.encode(sentence) + if len(tokens.split(" ")) > self.max_positions - 2: + tokens = " ".join(tokens.split(" ")[: self.max_positions - 2]) + bpe_sentence = " " + tokens + " " + for s in addl_sentences: + bpe_sentence += " " if not no_separator else "" + bpe_sentence += " " + self.bpe.encode(s) + " " + tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False) + return tokens.long() + + def decode(self, tokens: torch.LongTensor): + assert tokens.dim() == 1 + tokens = tokens.cpu().numpy() + if tokens[0] == self.task.source_dictionary.bos(): + tokens = tokens[1:] # remove + eos_mask = tokens == self.task.source_dictionary.eos() + doc_mask = eos_mask[1:] & eos_mask[:-1] + sentences = np.split(tokens, doc_mask.nonzero()[0] + 1) + sentences = [ + self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences + ] + if len(sentences) == 1: + return sentences[0] + return sentences + + def _build_sample(self, src_tokens: List[torch.LongTensor]): + # assert torch.is_tensor(src_tokens) + dataset = self.task.build_dataset_for_inference( + src_tokens, + [x.numel() for x in src_tokens], + ) + sample = dataset.collater(dataset) + sample = utils.apply_to_sample(lambda tensor: tensor.to(self.device), sample) + return sample + + def sample( + self, sentences: List[str], beam: int = 1, verbose: bool = False, **kwargs + ) -> str: + input = [self.encode(sentence) for sentence in sentences] + hypos = self.generate(input, beam, verbose, **kwargs) + return [self.decode(x["tokens"]) for x in hypos] + + def generate( + self, + tokens: List[torch.LongTensor], + beam: int = 5, + verbose: bool = False, + **kwargs + ) -> torch.LongTensor: + sample = self._build_sample(tokens) + + # build generator using current args as well as any kwargs + gen_args = copy.copy(self.args) + gen_args.beam = beam + for k, v in kwargs.items(): + setattr(gen_args, k, v) + generator = self.task.build_generator([self.model], gen_args) + translations = self.task.inference_step( + generator, + [self.model], + sample, + prefix_tokens=sample["net_input"]["src_tokens"] + .new_zeros((len(tokens), 1)) + .fill_(self.task.source_dictionary.bos()), + ) + + if verbose: + src_str_with_unk = self.string(tokens) + logger.info("S\t{}".format(src_str_with_unk)) + + def getarg(name, default): + return getattr(gen_args, name, getattr(self.args, name, default)) + + # Process top predictions + hypos = [x[0] for x in translations] + hypos = [v for _, v in sorted(zip(sample["id"].tolist(), hypos))] + return hypos + + def extract_features( + self, tokens: torch.LongTensor, return_all_hiddens: bool = False + ) -> torch.Tensor: + if tokens.dim() == 1: + tokens = tokens.unsqueeze(0) + if tokens.size(-1) > min(self.model.max_positions()): + raise ValueError( + "tokens exceeds maximum length: {} > {}".format( + tokens.size(-1), self.model.max_positions() + ) + ) + tokens.to(device=self.device), + prev_output_tokens = tokens.clone() + + prev_output_tokens[:, 0] = tokens.gather( + 1, + (tokens.ne(self.task.source_dictionary.pad()).sum(dim=1) - 1).unsqueeze(-1), + ).squeeze() + + prev_output_tokens[:, 1:] = tokens[:, :-1] + features, extra = self.model( + src_tokens=tokens, + src_lengths=None, + prev_output_tokens=prev_output_tokens, + features_only=True, + return_all_hiddens=return_all_hiddens, + ) + if return_all_hiddens: + # convert from T x B x C -> B x T x C + inner_states = extra["inner_states"] + return [inner_state.transpose(0, 1) for inner_state in inner_states] + else: + return features # just the last layer's features + + def register_classification_head( + self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs + ): + self.model.register_classification_head( + name, num_classes=num_classes, embedding_size=embedding_size, **kwargs + ) + + def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False): + if tokens.dim() == 1: + tokens = tokens.unsqueeze(0) + features = self.extract_features(tokens.to(device=self.device)) + sentence_representation = features[ + tokens.eq(self.task.source_dictionary.eos()), : + ].view(features.size(0), -1, features.size(-1))[:, -1, :] + + logits = self.model.classification_heads[head](sentence_representation) + if return_logits: + return logits + return F.log_softmax(logits, dim=-1) diff --git a/fairseq-0.10.2/fairseq/models/distributed_fairseq_model.py b/fairseq-0.10.2/fairseq/models/distributed_fairseq_model.py new file mode 100644 index 0000000000000000000000000000000000000000..ece10c6333f486176a8851c4b39b2e6617e37e51 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/distributed_fairseq_model.py @@ -0,0 +1,103 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import inspect + +import torch.nn as nn +from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel + + +_GOSSIP_DISABLED = False +try: + import gossip +except ImportError: + _GOSSIP_DISABLED = True + + +def DistributedFairseqModel(args, model, process_group=None): + """ + Wrap a *model* to support distributed data parallel training. + + This is similar to the built-in DistributedDataParallel, but allows + additional configuration of the DistributedDataParallel class to + use, and also provides easier access to the wrapped model by + forwarding requests for missing attributes to the wrapped model. + + Args: + args (argparse.Namespace): fairseq args + model (BaseFairseqModel): model to wrap + """ + # determine which DDP class to extend + assert isinstance(model, nn.Module) + if args.distributed_wrapper == "DDP" and args.ddp_backend == "c10d": + ddp_class = nn.parallel.DistributedDataParallel + init_kwargs = dict( + module=model, + device_ids=[args.device_id], + output_device=args.device_id, + broadcast_buffers=args.broadcast_buffers, + bucket_cap_mb=args.bucket_cap_mb, + process_group=process_group, + ) + # Maintain backward compatibility + if "check_reduction" in inspect.getargspec(ddp_class)[0]: + init_kwargs["check_reduction"] = True + if "find_unused_parameters" in inspect.getargspec(ddp_class)[0]: + init_kwargs["find_unused_parameters"] = args.find_unused_parameters + elif args.distributed_wrapper == "DDP" and args.ddp_backend == "no_c10d": + ddp_class = LegacyDistributedDataParallel + init_kwargs = dict( + module=model, + world_size=args.distributed_world_size, + buffer_size=2 ** 28, + process_group=process_group, + ) + elif args.distributed_wrapper == "SlowMo": + if _GOSSIP_DISABLED: + raise ImportError( + "Cannot find gossip library. Please install from: " + "github.com/facebookresearch/stochastic_gradient_push" + ) + ddp_class = gossip.GossipDataParallel + + # The values of slowmo_momentum below were obtained by tuning on the + # En-De 16 dataset by training the transformer_wmt_en_de_large model + if args.slowmo_momentum is None: + if args.distributed_world_size <= 16: + args.slowmo_momentum = 0.0 + elif args.distributed_world_size <= 32: + args.slowmo_momentum = 0.2 + elif args.distributed_world_size <= 64: + args.slowmo_momentum = 0.5 + else: + args.slowmo_momentum = 0.6 + + init_kwargs = dict( + module=model, + device_ids=[args.device_id], + output_device=args.device_id, + broadcast_buffers=args.broadcast_buffers, + nprocs_per_node=args.nprocs_per_node, + slowmo_momentum=args.slowmo_momentum, + localsgd=(args.slowmo_algorithm == "LocalSGD"), + localsgd_frequency=args.localsgd_frequency, + ) + else: + raise ValueError("Unknown --ddp-backend: " + args.ddp_backend) + + class _DistributedFairseqModel(ddp_class): + """Extend DistributedDataParallel to check for missing + attributes in the wrapped module.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __getattr__(self, name): + wrapped_module = super().__getattr__("module") + if hasattr(wrapped_module, name): + return getattr(wrapped_module, name) + return super().__getattr__(name) + + return _DistributedFairseqModel(**init_kwargs) diff --git a/fairseq-0.10.2/fairseq/models/fairseq_decoder.py b/fairseq-0.10.2/fairseq/models/fairseq_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..fb6c52dc7ffd95c63e0b43512db398cbb8b91582 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/fairseq_decoder.py @@ -0,0 +1,90 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, List, Optional, Tuple + +import torch.nn as nn +from fairseq import utils +from torch import Tensor + + +class FairseqDecoder(nn.Module): + """Base class for decoders.""" + + def __init__(self, dictionary): + super().__init__() + self.dictionary = dictionary + self.onnx_trace = False + + def forward(self, prev_output_tokens, encoder_out=None, **kwargs): + """ + Args: + prev_output_tokens (LongTensor): shifted output tokens of shape + `(batch, tgt_len)`, for teacher forcing + encoder_out (dict, optional): output from the encoder, used for + encoder-side attention + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + x, extra = self.extract_features( + prev_output_tokens, encoder_out=encoder_out, **kwargs + ) + x = self.output_layer(x) + return x, extra + + def extract_features(self, prev_output_tokens, encoder_out=None, **kwargs): + """ + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + raise NotImplementedError + + def output_layer(self, features, **kwargs): + """ + Project features to the default output size, e.g., vocabulary size. + + Args: + features (Tensor): features returned by *extract_features*. + """ + raise NotImplementedError + + def get_normalized_probs( + self, + net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): + """Get normalized probabilities (or log probs) from a net's output.""" + + if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None: + if sample is not None: + assert "target" in sample + target = sample["target"] + else: + target = None + out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) + return out.exp_() if not log_probs else out + + logits = net_output[0] + if log_probs: + return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace) + else: + return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace) + + def max_positions(self): + """Maximum input length supported by the decoder.""" + return 1e6 # an arbitrary large number + + def upgrade_state_dict(self, state_dict): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + return state_dict + + def prepare_for_onnx_export_(self): + self.onnx_trace = True diff --git a/fairseq-0.10.2/fairseq/models/fairseq_encoder.py b/fairseq-0.10.2/fairseq/models/fairseq_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..c8873daa283163881a7dc0190e8b25353abed410 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/fairseq_encoder.py @@ -0,0 +1,92 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, List, NamedTuple, Optional + +import torch +import torch.nn as nn +from torch import Tensor + + +EncoderOut = NamedTuple( + "EncoderOut", + [ + ("encoder_out", Tensor), # T x B x C + ("encoder_padding_mask", Optional[Tensor]), # B x T + ("encoder_embedding", Optional[Tensor]), # B x T x C + ("encoder_states", Optional[List[Tensor]]), # List[T x B x C] + ("src_tokens", Optional[Tensor]), # B x T + ("src_lengths", Optional[Tensor]), # B x 1 + ], +) + + +class FairseqEncoder(nn.Module): + """Base class for encoders.""" + + def __init__(self, dictionary): + super().__init__() + self.dictionary = dictionary + + def forward(self, src_tokens, src_lengths=None, **kwargs): + """ + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (LongTensor): lengths of each source sentence of shape + `(batch)` + """ + raise NotImplementedError + + def forward_torchscript(self, net_input: Dict[str, Tensor]): + """A TorchScript-compatible version of forward. + + Encoders which use additional arguments may want to override + this method for TorchScript compatibility. + """ + if torch.jit.is_scripting(): + return self.forward( + src_tokens=net_input["src_tokens"], + src_lengths=net_input["src_lengths"], + ) + else: + return self.forward_non_torchscript(net_input) + + @torch.jit.unused + def forward_non_torchscript(self, net_input: Dict[str, Tensor]): + encoder_input = { + k: v for k, v in net_input.items() if k != "prev_output_tokens" + } + return self.forward(**encoder_input) + + def reorder_encoder_out(self, encoder_out, new_order): + """ + Reorder encoder output according to `new_order`. + + Args: + encoder_out: output from the ``forward()`` method + new_order (LongTensor): desired order + + Returns: + `encoder_out` rearranged according to `new_order` + """ + raise NotImplementedError + + def max_positions(self): + """Maximum input length supported by the encoder.""" + return 1e6 # an arbitrary large number + + def upgrade_state_dict(self, state_dict): + """Upgrade a (possibly old) state dict for new versions of fairseq.""" + return state_dict + + def set_num_updates(self, num_updates): + """State from trainer to pass along to model at every update.""" + + def _apply(m): + if hasattr(m, "set_num_updates") and m != self: + m.set_num_updates(num_updates) + + self.apply(_apply) diff --git a/fairseq-0.10.2/fairseq/models/fairseq_model.py b/fairseq-0.10.2/fairseq/models/fairseq_model.py new file mode 100644 index 0000000000000000000000000000000000000000..092fba43ce16beb479412394b4efcb8e4a07bfbe --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/fairseq_model.py @@ -0,0 +1,556 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +Base classes for various fairseq models. +""" + +import logging +from typing import Dict, List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq import utils +# from fairseq.checkpoint_utils import prune_state_dict +from fairseq.data import Dictionary +from fairseq.dataclass.utils import gen_parser_from_dataclass +from fairseq.models import FairseqDecoder, FairseqEncoder +from torch import Tensor + + +logger = logging.getLogger(__name__) + + +class BaseFairseqModel(nn.Module): + """Base class for fairseq models.""" + + def __init__(self): + super().__init__() + self._is_generation_fast = False + + @classmethod + def add_args(cls, parser): + """Add model-specific arguments to the parser.""" + dc = getattr(cls, "__dataclass", None) + if dc is not None: + # do not set defaults so that settings defaults from various architectures still works + gen_parser_from_dataclass(parser, dc(), delete_default=True) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + raise NotImplementedError("Model must implement the build_model method") + + def get_targets(self, sample, net_output): + """Get targets from either the sample or the net's output.""" + return sample["target"] + + def get_normalized_probs( + self, + net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): + """Get normalized probabilities (or log probs) from a net's output.""" + return self.get_normalized_probs_scriptable(net_output, log_probs, sample) + + # TorchScript doesn't support super() method so that the scriptable Subclass + # can't access the base class model in Torchscript. + # Current workaround is to add a helper function with different name and + # call the helper function from scriptable Subclass. + def get_normalized_probs_scriptable( + self, + net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], + log_probs: bool, + sample: Optional[Dict[str, Tensor]] = None, + ): + """Scriptable helper function for get_normalized_probs in ~BaseFairseqModel""" + if hasattr(self, "decoder"): + return self.decoder.get_normalized_probs(net_output, log_probs, sample) + elif torch.is_tensor(net_output): + # syntactic sugar for simple models which don't have a decoder + # (e.g., the classification tutorial) + logits = net_output.float() + if log_probs: + return F.log_softmax(logits, dim=-1) + else: + return F.softmax(logits, dim=-1) + raise NotImplementedError + + def extract_features(self, *args, **kwargs): + """Similar to *forward* but only return features.""" + return self(*args, **kwargs) + + def max_positions(self): + """Maximum length supported by the model.""" + return None + + def load_state_dict(self, state_dict, strict=True, args=None): + """Copies parameters and buffers from *state_dict* into this module and + its descendants. + + Overrides the method in :class:`nn.Module`. Compared with that method + this additionally "upgrades" *state_dicts* from old checkpoints. + """ + self.upgrade_state_dict(state_dict) + from fairseq.checkpoint_utils import prune_state_dict + new_state_dict = prune_state_dict(state_dict, args) + return super().load_state_dict(new_state_dict, strict) + + def upgrade_state_dict(self, state_dict): + """Upgrade old state dicts to work with newer code.""" + self.upgrade_state_dict_named(state_dict, "") + + def upgrade_state_dict_named(self, state_dict, name): + """Upgrade old state dicts to work with newer code. + + Args: + state_dict (dict): state dictionary to upgrade, in place + name (str): the state dict key corresponding to the current module + """ + assert state_dict is not None + + def do_upgrade(m, prefix): + if len(prefix) > 0: + prefix += "." + + for n, c in m.named_children(): + name = prefix + n + if hasattr(c, "upgrade_state_dict_named"): + c.upgrade_state_dict_named(state_dict, name) + elif hasattr(c, "upgrade_state_dict"): + c.upgrade_state_dict(state_dict) + do_upgrade(c, name) + + do_upgrade(self, name) + + def set_num_updates(self, num_updates): + """State from trainer to pass along to model at every update.""" + + def _apply(m): + if hasattr(m, "set_num_updates") and m != self: + m.set_num_updates(num_updates) + + self.apply(_apply) + + def prepare_for_inference_(self, args): + """Prepare model for inference.""" + kwargs = {} + kwargs["beamable_mm_beam_size"] = ( + None if getattr(args, "no_beamable_mm", False) else getattr(args, "beam", 5) + ) + kwargs["need_attn"] = getattr(args, "print_alignment", False) + if hasattr(args, "retain_dropout"): + kwargs["retain_dropout"] = args.retain_dropout + kwargs["retain_dropout_modules"] = getattr( + args, "retain_dropout_modules", None + ) + self.make_generation_fast_(**kwargs) + + def make_generation_fast_(self, **kwargs): + """ + Legacy entry point to optimize model for faster generation. + Prefer prepare_for_inference_. + """ + if self._is_generation_fast: + return # only apply once + self._is_generation_fast = True + + # remove weight norm from all modules in the network + def apply_remove_weight_norm(module): + try: + nn.utils.remove_weight_norm(module) + except (AttributeError, ValueError): # this module didn't have weight norm + return + + self.apply(apply_remove_weight_norm) + + def apply_make_generation_fast_(module, prefix): + if len(prefix) > 0: + prefix += "." + + base_func = BaseFairseqModel.make_generation_fast_ + for n, m in module.named_modules(): + if ( + m != self + and hasattr(m, "make_generation_fast_") + # don't call this implementation again, e.g., if + # children modules also inherit from BaseFairseqModel + and m.make_generation_fast_.__func__ is not base_func + ): + name = prefix + n + m.make_generation_fast_(name=name, **kwargs) + + apply_make_generation_fast_(self, "") + + def train(mode=True): + if mode: + raise RuntimeError("cannot train after make_generation_fast") + + # this model should no longer be used for training + self.eval() + self.train = train + + def prepare_for_onnx_export_(self, **kwargs): + """Make model exportable via ONNX trace.""" + seen = set() + + def apply_prepare_for_onnx_export_(module): + if ( + module != self + and hasattr(module, "prepare_for_onnx_export_") + and module not in seen + ): + seen.add(module) + module.prepare_for_onnx_export_(**kwargs) + + self.apply(apply_prepare_for_onnx_export_) + + def prepare_for_tpu_(self, **kwargs): + """Optionally modify model for use on TPUs.""" + seen = set() + + def apply_prepare_for_tpu_(module): + if ( + module != self + and hasattr(module, "prepare_for_tpu_") + and module not in seen + ): + seen.add(module) + module.prepare_for_tpu_(**kwargs) + + self.apply(apply_prepare_for_tpu_) + + @classmethod + def upgrade_args(cls, args): + if hasattr(args, "max_sentences") and not hasattr(args, "batch_size"): + args.batch_size = args.max_sentences + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + **kwargs, + ): + """ + Load a :class:`~fairseq.models.FairseqModel` from a pre-trained model + file. Downloads and caches the pre-trained model file if needed. + + The base implementation returns a + :class:`~fairseq.hub_utils.GeneratorHubInterface`, which can be used to + generate translations or sample from language models. The underlying + :class:`~fairseq.models.FairseqModel` can be accessed via the + *generator.models* attribute. + + Other models may override this to implement custom hub interfaces. + + Args: + model_name_or_path (str): either the name of a pre-trained model to + load or a path/URL to a pre-trained model state dict + checkpoint_file (str, optional): colon-separated list of checkpoint + files in the model archive to ensemble (default: 'model.pt') + data_name_or_path (str, optional): point args.data to the archive + at the given path/URL. Can start with '.' or './' to reuse the + model archive path. + """ + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + **kwargs, + ) + + cls.upgrade_args(x["args"]) + + logger.info(x["args"]) + return hub_utils.GeneratorHubInterface(x["args"], x["task"], x["models"]) + + @classmethod + def hub_models(cls): + return {} + + +class FairseqEncoderDecoderModel(BaseFairseqModel): + """Base class for encoder-decoder models. + + Args: + encoder (FairseqEncoder): the encoder + decoder (FairseqDecoder): the decoder + """ + + def __init__(self, encoder, decoder): + super().__init__() + + self.encoder = encoder + self.decoder = decoder + assert isinstance(self.encoder, FairseqEncoder) + assert isinstance(self.decoder, FairseqDecoder) + + def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): + """ + Run the forward pass for an encoder-decoder model. + + First feed a batch of source tokens through the encoder. Then, feed the + encoder output and previous decoder outputs (i.e., teacher forcing) to + the decoder to produce the next outputs:: + + encoder_out = self.encoder(src_tokens, src_lengths) + return self.decoder(prev_output_tokens, encoder_out) + + Args: + src_tokens (LongTensor): tokens in the source language of shape + `(batch, src_len)` + src_lengths (LongTensor): source sentence lengths of shape `(batch)` + prev_output_tokens (LongTensor): previous decoder outputs of shape + `(batch, tgt_len)`, for teacher forcing + + Returns: + tuple: + - the decoder's output of shape `(batch, tgt_len, vocab)` + - a dictionary with any model-specific outputs + """ + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + decoder_out = self.decoder( + prev_output_tokens, encoder_out=encoder_out, **kwargs + ) + return decoder_out + + def forward_decoder(self, prev_output_tokens, **kwargs): + return self.decoder(prev_output_tokens, **kwargs) + + def extract_features(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): + """ + Similar to *forward* but only return features. + + Returns: + tuple: + - the decoder's features of shape `(batch, tgt_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + features = self.decoder.extract_features( + prev_output_tokens, encoder_out=encoder_out, **kwargs + ) + return features + + def output_layer(self, features, **kwargs): + """Project features to the default output size (typically vocabulary size).""" + return self.decoder.output_layer(features, **kwargs) + + def max_positions(self): + """Maximum length supported by the model.""" + return (self.encoder.max_positions(), self.decoder.max_positions()) + + def max_decoder_positions(self): + """Maximum length supported by the decoder.""" + return self.decoder.max_positions() + + +class FairseqModel(FairseqEncoderDecoderModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + utils.deprecation_warning( + "FairseqModel is deprecated, please use FairseqEncoderDecoderModel " + "or BaseFairseqModel instead", + stacklevel=4, + ) + + +class FairseqMultiModel(BaseFairseqModel): + """Base class for combining multiple encoder-decoder models.""" + + def __init__(self, encoders, decoders): + super().__init__() + assert encoders.keys() == decoders.keys() + self.keys = list(encoders.keys()) + for key in self.keys: + assert isinstance(encoders[key], FairseqEncoder) + assert isinstance(decoders[key], FairseqDecoder) + + self.models = nn.ModuleDict( + { + key: FairseqEncoderDecoderModel(encoders[key], decoders[key]) + for key in self.keys + } + ) + + @staticmethod + def build_shared_embeddings( + dicts: Dict[str, Dictionary], + langs: List[str], + embed_dim: int, + build_embedding: callable, + pretrained_embed_path: Optional[str] = None, + ): + """ + Helper function to build shared embeddings for a set of languages after + checking that all dicts corresponding to those languages are equivalent. + + Args: + dicts: Dict of lang_id to its corresponding Dictionary + langs: languages that we want to share embeddings for + embed_dim: embedding dimension + build_embedding: callable function to actually build the embedding + pretrained_embed_path: Optional path to load pretrained embeddings + """ + shared_dict = dicts[langs[0]] + if any(dicts[lang] != shared_dict for lang in langs): + raise ValueError( + "--share-*-embeddings requires a joined dictionary: " + "--share-encoder-embeddings requires a joined source " + "dictionary, --share-decoder-embeddings requires a joined " + "target dictionary, and --share-all-embeddings requires a " + "joint source + target dictionary." + ) + return build_embedding(shared_dict, embed_dim, pretrained_embed_path) + + def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): + raise NotImplementedError + + def max_positions(self): + """Maximum length supported by the model.""" + return { + key: ( + self.models[key].encoder.max_positions(), + self.models[key].decoder.max_positions(), + ) + for key in self.keys + } + + def max_decoder_positions(self): + """Maximum length supported by the decoder.""" + return min(model.decoder.max_positions() for model in self.models.values()) + + @property + def encoder(self): + return self.models[self.keys[0]].encoder + + @property + def decoder(self): + return self.models[self.keys[0]].decoder + + def forward_decoder(self, prev_output_tokens, **kwargs): + return self.decoder(prev_output_tokens, **kwargs) + + def load_state_dict(self, state_dict, strict=True, args=None): + """Copies parameters and buffers from *state_dict* into this module and + its descendants. + + Overrides the method in :class:`nn.Module`. Compared with that method + this additionally "upgrades" *state_dicts* from old checkpoints. + """ + self.upgrade_state_dict(state_dict) + from fairseq.checkpoint_utils import prune_state_dict + new_state_dict = prune_state_dict(state_dict, args) + return super().load_state_dict(new_state_dict, strict) + + +class FairseqLanguageModel(BaseFairseqModel): + """Base class for decoder-only models. + + Args: + decoder (FairseqDecoder): the decoder + """ + + def __init__(self, decoder): + super().__init__() + self.decoder = decoder + assert isinstance(self.decoder, FairseqDecoder) + + def forward(self, src_tokens, **kwargs): + """ + Run the forward pass for a decoder-only model. + + Feeds a batch of tokens through the decoder to predict the next tokens. + + Args: + src_tokens (LongTensor): tokens on which to condition the decoder, + of shape `(batch, tgt_len)` + src_lengths (LongTensor): source sentence lengths of shape `(batch)` + + Returns: + tuple: + - the decoder's output of shape `(batch, seq_len, vocab)` + - a dictionary with any model-specific outputs + """ + return self.decoder(src_tokens, **kwargs) + + def forward_decoder(self, prev_output_tokens, **kwargs): + return self.decoder(prev_output_tokens, **kwargs) + + def extract_features(self, src_tokens, **kwargs): + """ + Similar to *forward* but only return features. + + Returns: + tuple: + - the decoder's features of shape `(batch, seq_len, embed_dim)` + - a dictionary with any model-specific outputs + """ + return self.decoder.extract_features(src_tokens, **kwargs) + + def output_layer(self, features, **kwargs): + """Project features to the default output size (typically vocabulary size).""" + return self.decoder.output_layer(features, **kwargs) + + def max_positions(self): + """Maximum length supported by the model.""" + return self.decoder.max_positions() + + def max_decoder_positions(self): + """Maximum length supported by the decoder.""" + return self.decoder.max_positions() + + @property + def supported_targets(self): + return {"future"} + + +class FairseqEncoderModel(BaseFairseqModel): + """Base class for encoder-only models. + + Args: + encoder (FairseqEncoder): the encoder + """ + + def __init__(self, encoder): + super().__init__() + self.encoder = encoder + assert isinstance(self.encoder, FairseqEncoder) + + def forward(self, src_tokens, src_lengths, **kwargs): + """ + Run the forward pass for a encoder-only model. + + Feeds a batch of tokens through the encoder to generate features. + + Args: + src_tokens (LongTensor): input tokens of shape `(batch, src_len)` + src_lengths (LongTensor): source sentence lengths of shape `(batch)` + + Returns: + the encoder's output, typically of shape `(batch, src_len, features)` + """ + return self.encoder(src_tokens, src_lengths, **kwargs) + + def get_normalized_probs(self, net_output, log_probs, sample=None): + """Get normalized probabilities (or log probs) from a net's output.""" + encoder_out = net_output["encoder_out"] + if torch.is_tensor(encoder_out): + logits = encoder_out.float() + if log_probs: + return F.log_softmax(logits, dim=-1) + else: + return F.softmax(logits, dim=-1) + raise NotImplementedError + + def max_positions(self): + """Maximum length supported by the model.""" + return self.encoder.max_positions() diff --git a/fairseq-0.10.2/fairseq/models/huggingface/hf_gpt2.py b/fairseq-0.10.2/fairseq/models/huggingface/hf_gpt2.py new file mode 100644 index 0000000000000000000000000000000000000000..3a8eb78198f5808557092f814e92f1c9d72933ec --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/huggingface/hf_gpt2.py @@ -0,0 +1,168 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +import sys +from typing import Dict, List, Optional + +import torch +from fairseq.models import ( + FairseqIncrementalDecoder, + FairseqLanguageModel, + register_model, + register_model_architecture, +) + + +logger = logging.getLogger(__name__) + + +DEFAULT_MAX_TARGET_POSITIONS = 1024 + + +@register_model("hf_gpt2") +class HuggingFaceGPT2LanguageModel(FairseqLanguageModel): + def __init__(self, decoder): + super().__init__(decoder) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + # fmt: off + parser.add_argument('--embed-dim', type=int, metavar='N', + help='embedding dimension') + parser.add_argument('--num-attention-heads', type=int, metavar='N', + help='num attention heads') + parser.add_argument('--num-layers', type=int, metavar='N', + help='num layers') + parser.add_argument('--dropout', type=float, metavar='D', + help='dropout probability for all fully connected layers ' + 'in the embeddings, encoder, and pooler') + parser.add_argument('--attention-dropout', type=float, metavar='D', + help='dropout probability for attention weights') + # fmt: on + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + default_architecture(args) + return cls(HuggingFaceGPT2Decoder(args, task)) + + +class HuggingFaceGPT2Decoder(FairseqIncrementalDecoder): + def __init__(self, args, task): + try: + from transformers import GPT2Config, GPT2LMHeadModel + except ImportError: + raise ImportError( + "\n\nPlease install huggingface/transformers with:" + "\n\n pip install transformers" + ) + + super().__init__(task.target_dictionary) + + config = GPT2Config( + vocab_size=len(task.target_dictionary), + n_positions=args.max_target_positions + 1, + n_ctx=args.max_target_positions, + n_embd=args.embed_dim, + n_layer=args.num_layers, + n_head=args.num_attention_heads, + resid_pdrop=args.dropout, + embd_pdrop=args.dropout, + attn_pdrop=args.attention_dropout, + layer_norm_epsilon=1e-6, + ) + self.model = GPT2LMHeadModel(config) + + # set zero embedding for padding symbol + self.pad_idx = task.target_dictionary.pad() + self.model.transformer.wte.weight.data[self.pad_idx].zero_() + self.model.transformer.wpe.weight.data[0].zero_() + + def forward( + self, + prev_output_tokens, + src_lengths=None, + incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None, + encoder_out=None, + ): + features = self.extract_features(prev_output_tokens, incremental_state) + lm_logits = self.model.lm_head(features) + return (lm_logits,) + + def extract_features( + self, + prev_output_tokens, + incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None, + ): + if incremental_state: + past = self.get_incremental_state("past") + else: + past = None + + # don't attend to padding symbols + attention_mask = prev_output_tokens.ne(self.pad_idx).int() + + # set position ids to exclude padding symbols + position_ids = attention_mask * ( + torch.arange(1, 1 + prev_output_tokens.size(1)) + .to(prev_output_tokens) + .repeat(prev_output_tokens.size(0), 1) + ) + + outputs = self.model.transformer( + input_ids=prev_output_tokens, + past=past, + attention_mask=attention_mask, + position_ids=position_ids, + ) + last_hidden_states = outputs[0] + + if incremental_state: + self.set_incremental_state(incremental_state, "past", outputs[1]) + + return last_hidden_states + + def max_positions(self): + return self.model.config.n_positions - 1 + + +@register_model_architecture("hf_gpt2", "hf_gpt2") +def default_architecture(args): + if getattr(args, "max_target_positions", None) is None: + args.max_target_positions = getattr( + args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS + ) + args.embed_dim = getattr(args, "embed_dim", 768) + args.num_attention_heads = getattr(args, "num_attention_heads", 12) + args.num_layers = getattr(args, "num_layers", 12) + args.dropout = getattr(args, "dropout", 0.1) + args.attention_dropout = getattr(args, "attention_dropout", 0.1) + + +@register_model_architecture("hf_gpt2", "hf_gpt2_medium") +def hf_gpt2_medium(args): + args.embed_dim = getattr(args, "embed_dim", 1024) + args.num_attention_heads = getattr(args, "num_attention_heads", 16) + args.num_layers = getattr(args, "num_layers", 24) + default_architecture(args) + + +@register_model_architecture("hf_gpt2", "hf_gpt2_large") +def hf_gpt2_large(args): + args.embed_dim = getattr(args, "embed_dim", 1280) + args.num_attention_heads = getattr(args, "num_attention_heads", 20) + args.num_layers = getattr(args, "num_layers", 36) + default_architecture(args) + + +@register_model_architecture("hf_gpt2", "hf_gpt2_xl") +def hf_gpt2_xl(args): + args.embed_dim = getattr(args, "embed_dim", 1600) + args.num_attention_heads = getattr(args, "num_attention_heads", 25) + args.num_layers = getattr(args, "num_layers", 48) + default_architecture(args) diff --git a/fairseq-0.10.2/fairseq/models/lightconv_lm.py b/fairseq-0.10.2/fairseq/models/lightconv_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..1d9efc4e42a5ecc1b83338055f18ade5a83ea666 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/lightconv_lm.py @@ -0,0 +1,306 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq import utils +from fairseq.models import ( + FairseqLanguageModel, + register_model, + register_model_architecture, +) +from fairseq.models.lightconv import Embedding, LightConvDecoder +from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder + + +@register_model("lightconv_lm") +class LightConvLanguageModel(FairseqLanguageModel): + def __init__(self, decoder): + super().__init__(decoder) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + parser.add_argument( + "--dropout", + default=0.1, + type=float, + metavar="D", + help="dropout probability", + ) + parser.add_argument( + "--attention-dropout", + default=0.0, + type=float, + metavar="D", + help="dropout probability for attention weights", + ) + parser.add_argument( + "--relu-dropout", + default=0.0, + type=float, + metavar="D", + help="dropout probability after ReLU in FFN", + ) + parser.add_argument( + "--input-dropout", + type=float, + metavar="D", + help="dropout probability of the inputs", + ) + parser.add_argument( + "--decoder-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension", + ) + parser.add_argument( + "--decoder-output-dim", + type=int, + metavar="N", + help="decoder output dimension", + ) + parser.add_argument( + "--decoder-input-dim", type=int, metavar="N", help="decoder input dimension" + ) + parser.add_argument( + "--decoder-ffn-embed-dim", + type=int, + metavar="N", + help="decoder embedding dimension for FFN", + ) + parser.add_argument( + "--decoder-layers", type=int, metavar="N", help="num decoder layers" + ) + parser.add_argument( + "--decoder-attention-heads", + type=int, + metavar="N", + help="num decoder attention heads or LightConv/DynamicConv heads", + ) + parser.add_argument( + "--decoder-normalize-before", + default=False, + action="store_true", + help="apply layernorm before each decoder block", + ) + parser.add_argument( + "--adaptive-softmax-cutoff", + metavar="EXPR", + help="comma separated list of adaptive softmax cutoff points. " + "Must be used with adaptive_loss criterion", + ) + parser.add_argument( + "--adaptive-softmax-dropout", + type=float, + metavar="D", + help="sets adaptive softmax dropout for the tail projections", + ) + parser.add_argument( + "--adaptive-softmax-factor", + type=float, + metavar="N", + help="adaptive input factor", + ) + parser.add_argument( + "--no-token-positional-embeddings", + default=False, + action="store_true", + help="if set, disables positional embeddings (outside self attention)", + ) + parser.add_argument( + "--share-decoder-input-output-embed", + default=False, + action="store_true", + help="share decoder input and output embeddings", + ) + parser.add_argument( + "--character-embeddings", + default=False, + action="store_true", + help="if set, uses character embedding convolutions to produce token embeddings", + ) + parser.add_argument( + "--character-filters", + type=str, + metavar="LIST", + default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]", + help="size of character embeddings", + ) + parser.add_argument( + "--character-embedding-dim", + type=int, + metavar="N", + default=4, + help="size of character embeddings", + ) + parser.add_argument( + "--char-embedder-highway-layers", + type=int, + metavar="N", + default=2, + help="number of highway layers for character token embeddder", + ) + parser.add_argument( + "--adaptive-input", + default=False, + action="store_true", + help="if set, uses adaptive input", + ) + parser.add_argument( + "--adaptive-input-factor", + type=float, + metavar="N", + help="adaptive input factor", + ) + parser.add_argument( + "--adaptive-input-cutoff", + metavar="EXPR", + help="comma separated list of adaptive input cutoff points.", + ) + parser.add_argument( + "--tie-adaptive-weights", + action="store_true", + help="if set, ties the weights of adaptive softmax and adaptive input", + ) + parser.add_argument( + "--tie-adaptive-proj", + action="store_true", + help="if set, ties the projection weights of adaptive softmax and adaptive input", + ) + parser.add_argument( + "--decoder-learned-pos", + action="store_true", + help="use learned positional embeddings in the decoder", + ) + + """LightConv and DynamicConv arguments""" + parser.add_argument( + "--decoder-kernel-size-list", + type=lambda x: utils.eval_str_list(x, int), + help='list of kernel size (default: "[3,7,15,31,31,31]")', + ) + parser.add_argument( + "--decoder-glu", type=utils.eval_bool, help="glu after in proj" + ) + parser.add_argument( + "--decoder-conv-type", + default="dynamic", + type=str, + choices=["dynamic", "lightweight"], + help="type of convolution", + ) + parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool) + parser.add_argument( + "--weight-dropout", + type=float, + metavar="D", + help="dropout probability for conv weights", + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present in older models + base_lm_architecture(args) + + if getattr(args, "max_source_positions", None) is None: + args.max_source_positions = args.tokens_per_sample + if getattr(args, "max_target_positions", None) is None: + args.max_target_positions = args.tokens_per_sample + + if args.character_embeddings: + embed_tokens = CharacterTokenEmbedder( + task.dictionary, + eval(args.character_filters), + args.character_embedding_dim, + args.decoder_embed_dim, + args.char_embedder_highway_layers, + ) + elif args.adaptive_input: + embed_tokens = AdaptiveInput( + len(task.dictionary), + task.dictionary.pad(), + args.decoder_input_dim, + args.adaptive_input_factor, + args.decoder_embed_dim, + utils.eval_str_list(args.adaptive_input_cutoff, type=int), + ) + else: + embed_tokens = Embedding( + len(task.dictionary), args.decoder_input_dim, task.dictionary.pad() + ) + + if args.tie_adaptive_weights: + assert args.adaptive_input + assert args.adaptive_input_factor == args.adaptive_softmax_factor + assert ( + args.adaptive_softmax_cutoff == args.adaptive_input_cutoff + ), "{} != {}".format( + args.adaptive_softmax_cutoff, args.adaptive_input_cutoff + ) + assert args.decoder_input_dim == args.decoder_output_dim + + decoder = LightConvDecoder( + args, + task.output_dictionary, + embed_tokens, + no_encoder_attn=True, + final_norm=False, + ) + return LightConvLanguageModel(decoder) + + +@register_model_architecture("lightconv_lm", "lightconv_lm") +def base_lm_architecture(args): + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048) + args.decoder_layers = getattr(args, "decoder_layers", 6) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) + args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) + args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) + args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4) + args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) + + args.character_embeddings = getattr(args, "character_embeddings", False) + + args.decoder_output_dim = getattr( + args, "decoder_output_dim", args.decoder_embed_dim + ) + args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) + args.decoder_conv_dim = getattr(args, "decoder_conv_dim", args.decoder_embed_dim) + + # The model training is not stable without this + args.decoder_normalize_before = True + + args.adaptive_input = getattr(args, "adaptive_input", False) + args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4) + args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None) + + args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False) + args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False) + + args.decoder_kernel_size_list = getattr( + args, "decoder_kernel_size_list", [3, 7, 15, 31, 31, 31] + ) + if len(args.decoder_kernel_size_list) == 1: + args.decoder_kernel_size_list = ( + args.decoder_kernel_size_list * args.decoder_layers + ) + assert ( + len(args.decoder_kernel_size_list) == args.decoder_layers + ), "decoder_kernel_size_list doesn't match decoder_layers" + args.decoder_glu = getattr(args, "decoder_glu", True) + args.input_dropout = getattr(args, "input_dropout", 0.1) + args.weight_dropout = getattr(args, "weight_dropout", args.attention_dropout) + + +@register_model_architecture("lightconv_lm", "lightconv_lm_gbw") +def lightconv_lm_gbw(args): + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.dropout = getattr(args, "dropout", 0.1) + args.attention_dropout = getattr(args, "attention_dropout", 0.1) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) + base_lm_architecture(args) diff --git a/fairseq-0.10.2/fairseq/models/masked_lm.py b/fairseq-0.10.2/fairseq/models/masked_lm.py new file mode 100644 index 0000000000000000000000000000000000000000..c786de9125551f7247618b0a1d0867477894c755 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/masked_lm.py @@ -0,0 +1,403 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq import utils +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderModel, + register_model, + register_model_architecture, +) +from fairseq.modules import ( + LayerNorm, + SinusoidalPositionalEmbedding, + TransformerSentenceEncoder, +) +from fairseq.modules.transformer_sentence_encoder import init_bert_params + + +logger = logging.getLogger(__name__) + + +@register_model("masked_lm") +class MaskedLMModel(FairseqEncoderModel): + """ + Class for training a Masked Language Model. It also supports an + additional sentence level prediction if the sent-loss argument is set. + """ + + def __init__(self, args, encoder): + super().__init__(encoder) + self.args = args + + # if specified then apply bert initialization on the model. We need + # to explictly call this to make sure that the output embeddings + # and projection layers are also correctly initialized + if getattr(args, "apply_bert_init", False): + self.apply(init_bert_params) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + # Arguments related to dropout + parser.add_argument( + "--dropout", type=float, metavar="D", help="dropout probability" + ) + parser.add_argument( + "--attention-dropout", + type=float, + metavar="D", + help="dropout probability for" " attention weights", + ) + parser.add_argument( + "--act-dropout", + type=float, + metavar="D", + help="dropout probability after" " activation in FFN", + ) + + # Arguments related to hidden states and self-attention + parser.add_argument( + "--encoder-ffn-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension for FFN", + ) + parser.add_argument( + "--encoder-layers", type=int, metavar="N", help="num encoder layers" + ) + parser.add_argument( + "--encoder-attention-heads", + type=int, + metavar="N", + help="num encoder attention heads", + ) + + # Arguments related to input and output embeddings + parser.add_argument( + "--encoder-embed-dim", + type=int, + metavar="N", + help="encoder embedding dimension", + ) + parser.add_argument( + "--share-encoder-input-output-embed", + action="store_true", + help="share encoder input" " and output embeddings", + ) + parser.add_argument( + "--encoder-learned-pos", + action="store_true", + help="use learned positional embeddings in the encoder", + ) + parser.add_argument( + "--no-token-positional-embeddings", + action="store_true", + help="if set, disables positional embeddings" " (outside self attention)", + ) + parser.add_argument( + "--num-segment", type=int, metavar="N", help="num segment in the input" + ) + parser.add_argument( + "--max-positions", type=int, help="number of positional embeddings to learn" + ) + + # Arguments related to sentence level prediction + parser.add_argument( + "--sentence-class-num", + type=int, + metavar="N", + help="number of classes for sentence task", + ) + parser.add_argument( + "--sent-loss", + action="store_true", + help="if set," " calculate sentence level predictions", + ) + + # Arguments related to parameter initialization + parser.add_argument( + "--apply-bert-init", + action="store_true", + help="use custom param initialization for BERT", + ) + + # misc params + parser.add_argument( + "--activation-fn", + choices=utils.get_available_activation_fns(), + help="activation function to use", + ) + parser.add_argument( + "--pooler-activation-fn", + choices=utils.get_available_activation_fns(), + help="Which activation function to use for pooler layer.", + ) + parser.add_argument( + "--encoder-normalize-before", + action="store_true", + help="apply layernorm before each encoder block", + ) + + def forward(self, src_tokens, segment_labels=None, **kwargs): + return self.encoder(src_tokens, segment_labels=segment_labels, **kwargs) + + def max_positions(self): + return self.encoder.max_positions + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + # make sure all arguments are present in older models + base_architecture(args) + + if not hasattr(args, "max_positions"): + args.max_positions = args.tokens_per_sample + + logger.info(args) + + encoder = MaskedLMEncoder(args, task.dictionary) + return cls(args, encoder) + + +class MaskedLMEncoder(FairseqEncoder): + """ + Encoder for Masked Language Modelling. + """ + + def __init__(self, args, dictionary): + super().__init__(dictionary) + + self.padding_idx = dictionary.pad() + self.vocab_size = dictionary.__len__() + self.max_positions = args.max_positions + + self.sentence_encoder = TransformerSentenceEncoder( + padding_idx=self.padding_idx, + vocab_size=self.vocab_size, + num_encoder_layers=args.encoder_layers, + embedding_dim=args.encoder_embed_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=args.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.act_dropout, + max_seq_len=self.max_positions, + num_segments=args.num_segment, + use_position_embeddings=not args.no_token_positional_embeddings, + encoder_normalize_before=args.encoder_normalize_before, + apply_bert_init=args.apply_bert_init, + activation_fn=args.activation_fn, + learned_pos_embedding=args.encoder_learned_pos, + ) + + self.share_input_output_embed = args.share_encoder_input_output_embed + self.embed_out = None + self.sentence_projection_layer = None + self.sentence_out_dim = args.sentence_class_num + self.lm_output_learned_bias = None + + # Remove head is set to true during fine-tuning + self.load_softmax = not getattr(args, "remove_head", False) + + self.masked_lm_pooler = nn.Linear( + args.encoder_embed_dim, args.encoder_embed_dim + ) + self.pooler_activation = utils.get_activation_fn(args.pooler_activation_fn) + + self.lm_head_transform_weight = nn.Linear( + args.encoder_embed_dim, args.encoder_embed_dim + ) + self.activation_fn = utils.get_activation_fn(args.activation_fn) + self.layer_norm = LayerNorm(args.encoder_embed_dim) + + self.lm_output_learned_bias = None + if self.load_softmax: + self.lm_output_learned_bias = nn.Parameter(torch.zeros(self.vocab_size)) + + if not self.share_input_output_embed: + self.embed_out = nn.Linear( + args.encoder_embed_dim, self.vocab_size, bias=False + ) + + if args.sent_loss: + self.sentence_projection_layer = nn.Linear( + args.encoder_embed_dim, self.sentence_out_dim, bias=False + ) + + def forward(self, src_tokens, segment_labels=None, masked_tokens=None, **unused): + """ + Forward pass for Masked LM encoder. This first computes the token + embedding using the token embedding matrix, position embeddings (if + specified) and segment embeddings (if specified). + + Here we assume that the sentence representation corresponds to the + output of the classification_token (see bert_task or cross_lingual_lm + task for more details). + Args: + - src_tokens: B x T matrix representing sentences + - segment_labels: B x T matrix representing segment label for tokens + Returns: + - a tuple of the following: + - logits for predictions in format B x T x C to be used in + softmax afterwards + - a dictionary of additional data, where 'pooled_output' contains + the representation for classification_token and 'inner_states' + is a list of internal model states used to compute the + predictions (similar in ELMO). 'sentence_logits' + is the prediction logit for NSP task and is only computed if + this is specified in the input arguments. + """ + + inner_states, sentence_rep = self.sentence_encoder( + src_tokens, + segment_labels=segment_labels, + ) + + x = inner_states[-1].transpose(0, 1) + # project masked tokens only + if masked_tokens is not None: + x = x[masked_tokens, :] + x = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(x))) + + pooled_output = self.pooler_activation(self.masked_lm_pooler(sentence_rep)) + + # project back to size of vocabulary + if self.share_input_output_embed and hasattr( + self.sentence_encoder.embed_tokens, "weight" + ): + x = F.linear(x, self.sentence_encoder.embed_tokens.weight) + elif self.embed_out is not None: + x = self.embed_out(x) + if self.lm_output_learned_bias is not None: + x = x + self.lm_output_learned_bias + sentence_logits = None + if self.sentence_projection_layer: + sentence_logits = self.sentence_projection_layer(pooled_output) + + return x, { + "inner_states": inner_states, + "pooled_output": pooled_output, + "sentence_logits": sentence_logits, + } + + def max_positions(self): + """Maximum output length supported by the encoder.""" + return self.max_positions + + def upgrade_state_dict_named(self, state_dict, name): + if isinstance( + self.sentence_encoder.embed_positions, SinusoidalPositionalEmbedding + ): + state_dict[ + name + ".sentence_encoder.embed_positions._float_tensor" + ] = torch.FloatTensor(1) + if not self.load_softmax: + for k in list(state_dict.keys()): + if ( + "embed_out.weight" in k + or "sentence_projection_layer.weight" in k + or "lm_output_learned_bias" in k + ): + del state_dict[k] + return state_dict + + +@register_model_architecture("masked_lm", "masked_lm") +def base_architecture(args): + args.dropout = getattr(args, "dropout", 0.1) + args.attention_dropout = getattr(args, "attention_dropout", 0.1) + args.act_dropout = getattr(args, "act_dropout", 0.0) + + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.share_encoder_input_output_embed = getattr( + args, "share_encoder_input_output_embed", False + ) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.num_segment = getattr(args, "num_segment", 2) + + args.sentence_class_num = getattr(args, "sentence_class_num", 2) + args.sent_loss = getattr(args, "sent_loss", False) + + args.apply_bert_init = getattr(args, "apply_bert_init", False) + + args.activation_fn = getattr(args, "activation_fn", "relu") + args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + + +@register_model_architecture("masked_lm", "bert_base") +def bert_base_architecture(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) + args.share_encoder_input_output_embed = getattr( + args, "share_encoder_input_output_embed", True + ) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", True) + args.num_segment = getattr(args, "num_segment", 2) + + args.encoder_layers = getattr(args, "encoder_layers", 12) + + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072) + + args.sentence_class_num = getattr(args, "sentence_class_num", 2) + args.sent_loss = getattr(args, "sent_loss", True) + + args.apply_bert_init = getattr(args, "apply_bert_init", True) + + args.activation_fn = getattr(args, "activation_fn", "gelu") + args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True) + base_architecture(args) + + +@register_model_architecture("masked_lm", "bert_large") +def bert_large_architecture(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_layers = getattr(args, "encoder_layers", 24) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + bert_base_architecture(args) + + +@register_model_architecture("masked_lm", "xlm_base") +def xlm_architecture(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.share_encoder_input_output_embed = getattr( + args, "share_encoder_input_output_embed", True + ) + args.no_token_positional_embeddings = getattr( + args, "no_token_positional_embeddings", False + ) + args.encoder_learned_pos = getattr(args, "encoder_learned_pos", True) + args.num_segment = getattr(args, "num_segment", 1) + + args.encoder_layers = getattr(args, "encoder_layers", 6) + + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + + args.sent_loss = getattr(args, "sent_loss", False) + + args.activation_fn = getattr(args, "activation_fn", "gelu") + args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) + args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") + args.apply_bert_init = getattr(args, "apply_bert_init", True) + base_architecture(args) diff --git a/fairseq-0.10.2/fairseq/models/model_utils.py b/fairseq-0.10.2/fairseq/models/model_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..732d66b1d5f695151c26d29eb7f6b53179c269f1 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/model_utils.py @@ -0,0 +1,92 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional + +import torch +from torch import Tensor + + +@torch.jit.script +def script_skip_tensor_list(x: List[Tensor], mask): + res = [xi[mask] if xi.size(0) == mask.size(0) else xi[:, mask] for xi in x] + outputs = [] + for i, t in enumerate(res): + if t.numel() != 0: + outputs.append(t) + else: + outputs.append(x[i]) + return outputs + + +@torch.jit.script +def script_skip_tensor(x: Tensor, mask): + # None case + if x.size(0) == 0: + return x + res = x[mask] if x.size(0) == mask.size(0) else x[:, mask] + if res.numel() == 0: + return x + else: + return res + + +@torch.jit.script +def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int): + """ + Expand 2D/3D tensor on dim=1 + """ + if x is None: + return None + + assert x.dim() == 2 or x.dim() == 3 + assert trg_dim >= x.size(1), (trg_dim, x.size()) + if trg_dim == x.size(1): + return x + + dims = [x.size(0), trg_dim - x.size(1)] + if x.dim() == 3: + dims.append(x.size(2)) + x = torch.cat([x, torch.zeros(dims).to(x).fill_(padding_idx)], 1) + + return x + + +@torch.jit.script +def coalesce(x: Optional[Tensor], y: Tensor) -> Tensor: + return x if x is not None else y + + +@torch.jit.script +def fill_tensors( + x: Optional[Tensor], mask, y: Optional[Tensor], padding_idx: int +) -> Optional[Tensor]: + """ + Filling tensor x with y at masked positions (dim=0). + """ + if x is None or x.size()[0] == 0 or y is None: + return x + assert x.dim() == y.dim() and mask.size(0) == x.size(0) + assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2)) + + n_selected = mask.sum() + if n_selected == 0: + return x + assert n_selected == y.size(0) + if n_selected == x.size(0): + return y + + if x.size(1) < y.size(1): + x = expand_2d_or_3d_tensor(x, y.size(1), padding_idx) + x[mask] = y + elif x.size(1) > y.size(1): + x[mask] = torch.tensor(padding_idx).type_as(x) + if x.dim() == 2: + x[mask, : y.size(1)] = y + else: + x[mask, : y.size(1), :] = y + else: + x[mask] = y + return x diff --git a/fairseq-0.10.2/fairseq/models/multilingual_transformer.py b/fairseq-0.10.2/fairseq/models/multilingual_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..e3fbbd5710dfb10b16f5495c9131fa42b11544be --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/multilingual_transformer.py @@ -0,0 +1,228 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from collections import OrderedDict + +from fairseq import utils +from fairseq.models import ( + FairseqMultiModel, + register_model, + register_model_architecture, +) +from fairseq.models.transformer import ( + Embedding, + TransformerDecoder, + TransformerEncoder, + TransformerModel, + base_architecture, +) + + +@register_model("multilingual_transformer") +class MultilingualTransformerModel(FairseqMultiModel): + """Train Transformer models for multiple language pairs simultaneously. + + Requires `--task multilingual_translation`. + + We inherit all arguments from TransformerModel and assume that all language + pairs use a single Transformer architecture. In addition, we provide several + options that are specific to the multilingual setting. + + Args: + --share-encoder-embeddings: share encoder embeddings across all source languages + --share-decoder-embeddings: share decoder embeddings across all target languages + --share-encoders: share all encoder params (incl. embeddings) across all source languages + --share-decoders: share all decoder params (incl. embeddings) across all target languages + """ + + def __init__(self, encoders, decoders): + super().__init__(encoders, decoders) + + @staticmethod + def add_args(parser): + """Add model-specific arguments to the parser.""" + TransformerModel.add_args(parser) + parser.add_argument( + "--share-encoder-embeddings", + action="store_true", + help="share encoder embeddings across languages", + ) + parser.add_argument( + "--share-decoder-embeddings", + action="store_true", + help="share decoder embeddings across languages", + ) + parser.add_argument( + "--share-encoders", + action="store_true", + help="share encoders across languages", + ) + parser.add_argument( + "--share-decoders", + action="store_true", + help="share decoders across languages", + ) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + from fairseq.tasks.multilingual_translation import MultilingualTranslationTask + + assert isinstance(task, MultilingualTranslationTask) + + # make sure all arguments are present in older models + base_multilingual_architecture(args) + + if not hasattr(args, "max_source_positions"): + args.max_source_positions = 1024 + if not hasattr(args, "max_target_positions"): + args.max_target_positions = 1024 + + src_langs = [lang_pair.split("-")[0] for lang_pair in task.model_lang_pairs] + tgt_langs = [lang_pair.split("-")[1] for lang_pair in task.model_lang_pairs] + + if args.share_encoders: + args.share_encoder_embeddings = True + if args.share_decoders: + args.share_decoder_embeddings = True + + def build_embedding(dictionary, embed_dim, path=None): + num_embeddings = len(dictionary) + padding_idx = dictionary.pad() + emb = Embedding(num_embeddings, embed_dim, padding_idx) + # if provided, load from preloaded dictionaries + if path: + embed_dict = utils.parse_embedding(path) + utils.load_embedding(embed_dict, dictionary, emb) + return emb + + # build shared embeddings (if applicable) + shared_encoder_embed_tokens, shared_decoder_embed_tokens = None, None + if args.share_all_embeddings: + if args.encoder_embed_dim != args.decoder_embed_dim: + raise ValueError( + "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" + ) + if args.decoder_embed_path and ( + args.decoder_embed_path != args.encoder_embed_path + ): + raise ValueError( + "--share-all-embeddings not compatible with --decoder-embed-path" + ) + shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings( + dicts=task.dicts, + langs=task.langs, + embed_dim=args.encoder_embed_dim, + build_embedding=build_embedding, + pretrained_embed_path=args.encoder_embed_path, + ) + shared_decoder_embed_tokens = shared_encoder_embed_tokens + args.share_decoder_input_output_embed = True + else: + if args.share_encoder_embeddings: + shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings( + dicts=task.dicts, + langs=src_langs, + embed_dim=args.encoder_embed_dim, + build_embedding=build_embedding, + pretrained_embed_path=args.encoder_embed_path, + ) + if args.share_decoder_embeddings: + shared_decoder_embed_tokens = FairseqMultiModel.build_shared_embeddings( + dicts=task.dicts, + langs=tgt_langs, + embed_dim=args.decoder_embed_dim, + build_embedding=build_embedding, + pretrained_embed_path=args.decoder_embed_path, + ) + + # encoders/decoders for each language + lang_encoders, lang_decoders = {}, {} + + def get_encoder(lang): + if lang not in lang_encoders: + if shared_encoder_embed_tokens is not None: + encoder_embed_tokens = shared_encoder_embed_tokens + else: + encoder_embed_tokens = build_embedding( + task.dicts[lang], + args.encoder_embed_dim, + args.encoder_embed_path, + ) + lang_encoders[lang] = cls._get_module_class( + True, args, task.dicts[lang], encoder_embed_tokens, src_langs + ) + return lang_encoders[lang] + + def get_decoder(lang): + if lang not in lang_decoders: + if shared_decoder_embed_tokens is not None: + decoder_embed_tokens = shared_decoder_embed_tokens + else: + decoder_embed_tokens = build_embedding( + task.dicts[lang], + args.decoder_embed_dim, + args.decoder_embed_path, + ) + lang_decoders[lang] = cls._get_module_class( + False, args, task.dicts[lang], decoder_embed_tokens, tgt_langs + ) + return lang_decoders[lang] + + # shared encoders/decoders (if applicable) + shared_encoder, shared_decoder = None, None + if args.share_encoders: + shared_encoder = get_encoder(src_langs[0]) + if args.share_decoders: + shared_decoder = get_decoder(tgt_langs[0]) + + encoders, decoders = OrderedDict(), OrderedDict() + for lang_pair, src, tgt in zip(task.model_lang_pairs, src_langs, tgt_langs): + encoders[lang_pair] = ( + shared_encoder if shared_encoder is not None else get_encoder(src) + ) + decoders[lang_pair] = ( + shared_decoder if shared_decoder is not None else get_decoder(tgt) + ) + + return MultilingualTransformerModel(encoders, decoders) + + @classmethod + def _get_module_class(cls, is_encoder, args, lang_dict, embed_tokens, langs): + module_class = TransformerEncoder if is_encoder else TransformerDecoder + return module_class(args, lang_dict, embed_tokens) + + def load_state_dict(self, state_dict, strict=True, args=None): + state_dict_subset = state_dict.copy() + for k, _ in state_dict.items(): + assert k.startswith("models.") + lang_pair = k.split(".")[1] + if lang_pair not in self.models: + del state_dict_subset[k] + super().load_state_dict(state_dict_subset, strict=strict, args=args) + + +@register_model_architecture("multilingual_transformer", "multilingual_transformer") +def base_multilingual_architecture(args): + base_architecture(args) + args.share_encoder_embeddings = getattr(args, "share_encoder_embeddings", False) + args.share_decoder_embeddings = getattr(args, "share_decoder_embeddings", False) + args.share_encoders = getattr(args, "share_encoders", False) + args.share_decoders = getattr(args, "share_decoders", False) + + +@register_model_architecture( + "multilingual_transformer", "multilingual_transformer_iwslt_de_en" +) +def multilingual_transformer_iwslt_de_en(args): + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) + args.encoder_layers = getattr(args, "encoder_layers", 6) + args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) + args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024) + args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) + args.decoder_layers = getattr(args, "decoder_layers", 6) + base_multilingual_architecture(args) diff --git a/fairseq-0.10.2/fairseq/models/roberta/__pycache__/hub_interface.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/hub_interface.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d74a9bf6cb461e842e7b3293545fd2578e80bd49 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/hub_interface.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_camembert.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_camembert.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d9864f25948e3e4872391c25024dd3ced269985c Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_camembert.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_xlmr.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_xlmr.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9e40f8a4b5a91035e66c52d889067ea24d7185a3 Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_xlmr.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/models/roberta/model_xlmr.py b/fairseq-0.10.2/fairseq/models/roberta/model_xlmr.py new file mode 100644 index 0000000000000000000000000000000000000000..5886880f73bd1e2176c49e3d491a7d46eb3d9322 --- /dev/null +++ b/fairseq-0.10.2/fairseq/models/roberta/model_xlmr.py @@ -0,0 +1,44 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +Unsupervised Cross-lingual Representation Learning at Scale +""" + +from fairseq.models import register_model + +from .hub_interface import RobertaHubInterface +from .model import RobertaModel + + +@register_model("xlmr") +class XLMRModel(RobertaModel): + @classmethod + def hub_models(cls): + return { + "xlmr.base": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz", + "xlmr.large": "http://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz", + } + + @classmethod + def from_pretrained( + cls, + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + bpe="sentencepiece", + **kwargs + ): + from fairseq import hub_utils + + x = hub_utils.from_pretrained( + model_name_or_path, + checkpoint_file, + data_name_or_path, + archive_map=cls.hub_models(), + bpe=bpe, + load_checkpoint_heads=True, + **kwargs, + ) + return RobertaHubInterface(x["args"], x["task"], x["models"][0]) diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/beamable_mm.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/beamable_mm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7f54438a2eb96d62ce90ea5a3a9d6ed58fbb6098 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/beamable_mm.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/conv_tbc.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/conv_tbc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1adf927e347f1b940946ce74c275bb3248a684e8 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/conv_tbc.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/dynamic_crf_layer.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/dynamic_crf_layer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96569923675dcd69044b6420c57438690660edd3 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/dynamic_crf_layer.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/fp32_group_norm.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/fp32_group_norm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c0d2ed4af95a89c618698343413b3a749da1c7c6 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/fp32_group_norm.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/gelu.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/gelu.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70cdc44765423f38190818d92942bf854be3ce93 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/gelu.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/layer_norm.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/layer_norm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..32c155869dbc4b46ac741fa6f55bb370cd1fb158 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/layer_norm.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/quant_noise.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/quant_noise.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7447ddff6f874a457d3b63ff67726984ba6ad06a Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/quant_noise.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_layer.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_layer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..033dc4f4a609c2da4ba05f7635043177aeb03ae2 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_layer.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/downsampled_multihead_attention.py b/fairseq-0.10.2/fairseq/modules/downsampled_multihead_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..2cdece3f7fca2b830eb72999ce93f58667ed595b --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/downsampled_multihead_attention.py @@ -0,0 +1,316 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.scalar_bias import scalar_bias + + +class SingleHeadAttention(nn.Module): + """ + Single-head attention that supports Gating and Downsampling + """ + + def __init__( + self, + out_channels, + embed_dim, + head_dim, + head_index, + dropout=0.0, + bias=True, + project_input=True, + gated=False, + downsample=False, + num_heads=1, + ): + super().__init__() + self.embed_dim = embed_dim + self.dropout_module = FairseqDropout( + dropout, module_name=self.__class__.__name__ + ) + self.head_index = head_index + self.head_dim = head_dim + self.project_input = project_input + self.gated = gated + self.downsample = downsample + self.num_heads = num_heads + self.projection = None + + k_layers = [] + v_layers = [] + if self.downsample: + k_layers.append(Downsample(self.head_index)) + v_layers.append(Downsample(self.head_index)) + out_proj_size = self.head_dim + else: + out_proj_size = self.head_dim * self.num_heads + if self.gated: + k_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias)) + self.in_proj_q = GatedLinear(self.embed_dim, out_proj_size, bias=bias) + v_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias)) + else: + k_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias)) + self.in_proj_q = Linear(self.embed_dim, out_proj_size, bias=bias) + v_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias)) + + self.in_proj_k = nn.Sequential(*k_layers) + self.in_proj_v = nn.Sequential(*v_layers) + + if self.downsample: + self.out_proj = Linear(out_proj_size, self.head_dim, bias=bias) + else: + self.out_proj = Linear(out_proj_size, out_channels, bias=bias) + + self.scaling = self.head_dim ** -0.5 + + def forward( + self, + query, + key, + value, + mask_future_timesteps=False, + key_padding_mask=None, + use_scalar_bias=False, + ): + """Input shape: Time x Batch x Channel + Self-attention can be implemented by passing in the same arguments for + query, key and value. Future timesteps can be masked with the + `mask_future_timesteps` argument. Padding elements can be excluded from + the key by passing a binary ByteTensor (`key_padding_mask`) with shape: + batch x src_len, where padding elements are indicated by 1s. + """ + src_len, bsz, out_channels = key.size() + tgt_len = query.size(0) + assert list(query.size()) == [tgt_len, bsz, out_channels] + assert key.size() == value.size() + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if self.downsample: + size = bsz + else: + size = bsz * self.num_heads + + k = key + v = value + q = query + if self.project_input: + q = self.in_proj_q(q) + k = self.in_proj_k(k) + v = self.in_proj_v(v) + src_len = k.size()[0] + q *= self.scaling + + if not self.downsample: + q = q.view(tgt_len, size, self.head_dim) + k = k.view(src_len, size, self.head_dim) + v = v.view(src_len, size, self.head_dim) + + q = q.transpose(0, 1) + k = k.transpose(0, 1) + v = v.transpose(0, 1) + + attn_weights = torch.bmm(q, k.transpose(1, 2)) + if mask_future_timesteps: + assert ( + query.size() == key.size() + ), "mask_future_timesteps only applies to self-attention" + attn_weights *= torch.tril( + attn_weights.data.new([1]).expand(tgt_len, tgt_len).clone(), + diagonal=-1, + )[:, :: self.head_index + 1 if self.downsample else 1].unsqueeze(0) + attn_weights += torch.triu( + attn_weights.data.new([-math.inf]).expand(tgt_len, tgt_len).clone(), + diagonal=0, + )[:, :: self.head_index + 1 if self.downsample else 1].unsqueeze(0) + tgt_size = tgt_len + if use_scalar_bias: + attn_weights = scalar_bias(attn_weights, 2) + v = scalar_bias(v, 1) + tgt_size += 1 + + if key_padding_mask is not None: + # don't attend to padding symbols + if key_padding_mask.max() > 0: + if self.downsample: + attn_weights = attn_weights.view(bsz, 1, tgt_len, src_len) + else: + attn_weights = attn_weights.view( + size, self.num_heads, tgt_len, src_len + ) + attn_weights = attn_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + -math.inf, + ) + attn_weights = attn_weights.view(size, tgt_len, src_len) + attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = self.dropout_module(attn_weights) + + attn = torch.bmm(attn_weights, v) + if self.downsample: + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.head_dim) + else: + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim) + + attn = self.out_proj(attn) + + return attn, attn_weights + + +class DownsampledMultiHeadAttention(nn.ModuleList): + """ + Multi-headed attention with Gating and Downsampling + """ + + def __init__( + self, + out_channels, + embed_dim, + num_heads, + dropout=0.0, + bias=True, + project_input=True, + gated=False, + downsample=False, + ): + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.downsample = downsample + self.gated = gated + self.project_input = project_input + assert self.head_dim * num_heads == embed_dim + + if self.downsample: + attention_heads = [] + for index in range(self.num_heads): + attention_heads.append( + SingleHeadAttention( + out_channels, + self.embed_dim, + self.head_dim, + index, + dropout, + bias, + self.project_input, + self.gated, + self.downsample, + self.num_heads, + ) + ) + super().__init__(modules=attention_heads) + self.out_proj = Linear(embed_dim, out_channels, bias=bias) + else: + # either we have a list of attention heads, or just one attention head + # if not being downsampled, we can do the heads with one linear layer instead of separate ones + super().__init__() + self.attention_module = SingleHeadAttention( + out_channels, + self.embed_dim, + self.head_dim, + 1, + dropout, + bias, + self.project_input, + self.gated, + self.downsample, + self.num_heads, + ) + + def forward( + self, + query, + key, + value, + mask_future_timesteps=False, + key_padding_mask=None, + use_scalar_bias=False, + ): + src_len, bsz, embed_dim = key.size() + tgt_len = query.size(0) + assert embed_dim == self.embed_dim + assert list(query.size()) == [tgt_len, bsz, embed_dim] + assert key.size() == value.size() + + tgt_size = tgt_len + if use_scalar_bias: + tgt_size += 1 + + attn = [] + attn_weights = [] + if self.downsample: + for attention_head_number in range(self.num_heads): + # call the forward of each attention head + _attn, _attn_weight = self[attention_head_number]( + query, + key, + value, + mask_future_timesteps, + key_padding_mask, + use_scalar_bias, + ) + attn.append(_attn) + attn_weights.append(_attn_weight) + full_attn = torch.cat(attn, dim=2) + full_attn = self.out_proj(full_attn) + return full_attn, attn_weights[0].clone() + else: + _attn, _attn_weight = self.attention_module( + query, + key, + value, + mask_future_timesteps, + key_padding_mask, + use_scalar_bias, + ) + attn.append(_attn) + attn_weights.append(_attn_weight) + full_attn = torch.cat(attn, dim=2) + full_attn_weights = torch.cat(attn_weights) + full_attn_weights = full_attn_weights.view( + bsz, self.num_heads, tgt_size, src_len + ) + full_attn_weights = full_attn_weights.sum(dim=1) / self.num_heads + return full_attn, full_attn_weights + + +class Downsample(nn.Module): + """ + Selects every nth element, where n is the index + """ + + def __init__(self, index): + super().__init__() + self.index = index + + def forward(self, x): + return x[:: self.index + 1] + + +def Linear(in_features, out_features, dropout=0.0, bias=True): + """Weight-normalized Linear layer (input: B x T x C)""" + m = nn.Linear(in_features, out_features, bias=bias) + m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features)) + m.bias.data.zero_() + return nn.utils.weight_norm(m) + + +def GatedLinear(in_features, out_features, dropout=0.0, bias=True): + """Weight-normalized Linear layer (input: B x T x C) with interspersed GLU units""" + return nn.Sequential( + Linear(in_features, out_features * 4, dropout, bias), + nn.GLU(), + Linear(out_features * 2, out_features * 2, dropout, bias), + nn.GLU(), + Linear(out_features, out_features, dropout, bias), + ) diff --git a/fairseq-0.10.2/fairseq/modules/dynamic_crf_layer.py b/fairseq-0.10.2/fairseq/modules/dynamic_crf_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..8fcc6b8d2672d2eacc6d01b9688bac44d5e1ce26 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/dynamic_crf_layer.py @@ -0,0 +1,189 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +This file is to re-implemented the low-rank and beam approximation of CRF layer +Proposed by: + +Sun, Zhiqing, et al. +Fast Structured Decoding for Sequence Models +https://arxiv.org/abs/1910.11555 + +The CRF implementation is mainly borrowed from +https://github.com/kmkurn/pytorch-crf/blob/master/torchcrf/__init__.py + +""" + +import numpy as np +import torch +import torch.nn as nn + + +def logsumexp(x, dim=1): + return torch.logsumexp(x.float(), dim=dim).type_as(x) + + +class DynamicCRF(nn.Module): + """Dynamic CRF layer is used to approximate the traditional + Conditional Random Fields (CRF) + $P(y | x) = 1/Z(x) exp(sum_i s(y_i, x) + sum_i t(y_{i-1}, y_i, x))$ + + where in this function, we assume the emition scores (s) are given, + and the transition score is a |V| x |V| matrix $M$ + + in the following two aspects: + (1) it used a low-rank approximation for the transition matrix: + $M = E_1 E_2^T$ + (2) it used a beam to estimate the normalizing factor Z(x) + """ + + def __init__(self, num_embedding, low_rank=32, beam_size=64): + super().__init__() + + self.E1 = nn.Embedding(num_embedding, low_rank) + self.E2 = nn.Embedding(num_embedding, low_rank) + + self.vocb = num_embedding + self.rank = low_rank + self.beam = beam_size + + def extra_repr(self): + return "vocab_size={}, low_rank={}, beam_size={}".format( + self.vocb, self.rank, self.beam + ) + + def forward(self, emissions, targets, masks, beam=None): + """ + Compute the conditional log-likelihood of a sequence of target tokens given emission scores + + Args: + emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output + ``(batch_size, seq_len, vocab_size)``. We assume batch-first + targets (`~torch.LongTensor`): Sequence of target token indices + ``(batch_size, seq_len) + masks (`~torch.ByteTensor`): Mask tensor with the same size as targets + + Returns: + `~torch.Tensor`: approximated log-likelihood + """ + numerator = self._compute_score(emissions, targets, masks) + denominator = self._compute_normalizer(emissions, targets, masks, beam) + return numerator - denominator + + def forward_decoder(self, emissions, masks=None, beam=None): + """ + Find the most likely output sequence using Viterbi algorithm. + + Args: + emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output + ``(batch_size, seq_len, vocab_size)``. We assume batch-first + masks (`~torch.ByteTensor`): Mask tensor with the same size as targets + + Returns: + `~torch.LongTensor`: decoded sequence from the CRF model + """ + return self._viterbi_decode(emissions, masks, beam) + + def _compute_score(self, emissions, targets, masks=None): + batch_size, seq_len = targets.size() + emission_scores = emissions.gather(2, targets[:, :, None])[:, :, 0] # B x T + transition_scores = (self.E1(targets[:, :-1]) * self.E2(targets[:, 1:])).sum(2) + + scores = emission_scores + scores[:, 1:] += transition_scores + + if masks is not None: + scores = scores * masks.type_as(scores) + return scores.sum(-1) + + def _compute_normalizer(self, emissions, targets=None, masks=None, beam=None): + # HACK: we include "target" which is a hueristic for training + # HACK: we use a beam of tokens to approximate the normalizing factor (which is bad?) + + beam = beam if beam is not None else self.beam + batch_size, seq_len = emissions.size()[:2] + if targets is not None: + _emissions = emissions.scatter(2, targets[:, :, None], np.float("inf")) + beam_targets = _emissions.topk(beam, 2)[1] + beam_emission_scores = emissions.gather(2, beam_targets) + else: + beam_emission_scores, beam_targets = emissions.topk(beam, 2) + beam_transition_score1 = self.E1(beam_targets[:, :-1]) # B x (T-1) x K x D + beam_transition_score2 = self.E2(beam_targets[:, 1:]) # B x (T-1) x K x D + beam_transition_matrix = torch.bmm( + beam_transition_score1.view(-1, beam, self.rank), + beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2), + ) + beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam) + + # compute the normalizer in the log-space + score = beam_emission_scores[:, 0] # B x K + for i in range(1, seq_len): + next_score = score[:, :, None] + beam_transition_matrix[:, i - 1] + next_score = logsumexp(next_score, dim=1) + beam_emission_scores[:, i] + + if masks is not None: + score = torch.where(masks[:, i : i + 1], next_score, score) + else: + score = next_score + + # Sum (log-sum-exp) over all possible tags + return logsumexp(score, dim=1) + + def _viterbi_decode(self, emissions, masks=None, beam=None): + # HACK: we use a beam of tokens to approximate the normalizing factor (which is bad?) + + beam = beam if beam is not None else self.beam + batch_size, seq_len = emissions.size()[:2] + beam_emission_scores, beam_targets = emissions.topk(beam, 2) + beam_transition_score1 = self.E1(beam_targets[:, :-1]) # B x (T-1) x K x D + beam_transition_score2 = self.E2(beam_targets[:, 1:]) # B x (T-1) x K x D + beam_transition_matrix = torch.bmm( + beam_transition_score1.view(-1, beam, self.rank), + beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2), + ) + beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam) + + traj_tokens, traj_scores = [], [] + finalized_tokens, finalized_scores = [], [] + + # compute the normalizer in the log-space + score = beam_emission_scores[:, 0] # B x K + dummy = ( + torch.arange(beam, device=score.device).expand(*score.size()).contiguous() + ) + + for i in range(1, seq_len): + traj_scores.append(score) + _score = score[:, :, None] + beam_transition_matrix[:, i - 1] + _score, _index = _score.max(dim=1) + _score = _score + beam_emission_scores[:, i] + + if masks is not None: + score = torch.where(masks[:, i : i + 1], _score, score) + index = torch.where(masks[:, i : i + 1], _index, dummy) + else: + score, index = _score, _index + traj_tokens.append(index) + + # now running the back-tracing and find the best + best_score, best_index = score.max(dim=1) + finalized_tokens.append(best_index[:, None]) + finalized_scores.append(best_score[:, None]) + + for idx, scs in zip(reversed(traj_tokens), reversed(traj_scores)): + previous_index = finalized_tokens[-1] + finalized_tokens.append(idx.gather(1, previous_index)) + finalized_scores.append(scs.gather(1, previous_index)) + + finalized_tokens.reverse() + finalized_tokens = torch.cat(finalized_tokens, 1) + finalized_tokens = beam_targets.gather(2, finalized_tokens[:, :, None])[:, :, 0] + + finalized_scores.reverse() + finalized_scores = torch.cat(finalized_scores, 1) + finalized_scores[:, 1:] = finalized_scores[:, 1:] - finalized_scores[:, :-1] + + return finalized_scores, finalized_tokens diff --git a/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/cuda_function_gen.py b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/cuda_function_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..9304f99eb8169a614f39babc830c84cac80e080b --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/cuda_function_gen.py @@ -0,0 +1,223 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +def gen_forward(): + + kernels = [3, 5, 7, 15, 31, 63, 127, 255] + blocks = [32, 64, 128, 256] + + head = """ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "dynamicconv_cuda.cuh" + +std::vector dynamicconv_cuda_forward(at::Tensor input, at::Tensor weight, int padding_l) { + + at::DeviceGuard g(input.device()); + const auto minibatch = input.size(0); + const auto numFeatures = input.size(1); + const auto sequenceLength = input.size(2); + + const auto numHeads = weight.size(1); + const auto filterSize = weight.size(2); + + const auto numFiltersInBlock = numFeatures / numHeads; + const dim3 blocks(minibatch, numFeatures); + + auto output = at::zeros_like(input); + auto stream = at::cuda::getCurrentCUDAStream(); +""" + + switch = """ + switch(filterSize) { +""" + + case_k = """ + case {k}: +""" + + main_block = """ + if (padding_l == {pad}) {{ + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "dynamicconv_forward", ([&] {{ + dynamicconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t> + <<>>( + input.data(), + weight.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + numHeads, + output.data()); + }})); + }} else +""" + + bad_padding = """ + { + std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl; + } + break;\n +""" + + end = """ + default: + std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl; + } + + return {output}; +} +""" + + with open("dynamicconv_cuda_forward.cu", "w") as forward: + forward.write(head) + forward.write(switch) + for k in kernels: + b_size = 32 + for b in blocks: + if b > k: + b_size = b + break + forward.write(case_k.format(k=k)) + for pad in [k // 2, k - 1]: + forward.write(main_block.format(k=k, b_size=b_size, pad=pad)) + forward.write(bad_padding) + forward.write(end) + + +def gen_backward(): + + kernels = [3, 5, 7, 15, 31, 63, 127, 255] + thresh = [512, 512, 512, 512, 512, 380, 256, 256] + min_block = [64, 64, 64, 64, 64, 64, 128, 256] + seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] + + head = """ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "dynamicconv_cuda.cuh" + +std::vector dynamicconv_cuda_backward(at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor weight) { + + at::DeviceGuard g(input.device()); + const auto minibatch = input.size(0); + const auto numFeatures = input.size(1); + const auto sequenceLength = input.size(2); + + const auto numHeads = weight.size(1); + const auto filterSize = weight.size(2); + + const auto numFiltersInBlock = numFeatures / numHeads; + auto numChunks = 1; + + auto gradInput = at::zeros_like(input); + auto gradWeight = at::zeros_like(weight); + auto stream = at::cuda::getCurrentCUDAStream(); + + dim3 blocks(minibatch, numHeads, numChunks); +""" + + sequence_if = """ + if (sequenceLength < {seq}) {{ + switch(filterSize) {{ +""" + + case_k = """ + case {k}: +""" + + chunks_reset = """ + numChunks = int(ceilf(sequenceLength/float({b_size}))); + blocks = dim3(minibatch, numHeads, numChunks); +""" + + main_block = """ + if (padding_l == {p}) {{ + AT_DISPATCH_FLOATING_TYPES_AND_HALF(gradOutput.scalar_type(), "dynamicconv_backward", ([&] {{ + dynamicconv_backward_kernel<{k}, {b_size}, {p}, scalar_t> + <<>>( + gradOutput.data(), + input.data(), + weight.data(), + minibatch, + sequenceLength, + numFeatures, + numFiltersInBlock, + numHeads, + gradWeight.data(), + gradInput.data()); + }})); + }} else +""" + + bad_padding = """ + { + std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl; + } + break;\n +""" + + bad_filter = """ + default: + std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl; + } +""" + + con_else = """ + } else +""" + + final_else = """ + { + switch(filterSize) { +""" + + last_return = """ + } + return {gradInput, gradWeight}; +} +""" + + with open("dynamicconv_cuda_backward.cu", "w") as backward: + backward.write(head) + for seq in seqs: + backward.write(sequence_if.format(seq=seq)) + for k, t, m in zip(kernels, thresh, min_block): + backward.write(case_k.format(k=k)) + if seq <= t: + b_size = seq + else: + b_size = m + backward.write(chunks_reset.format(b_size=b_size)) + for p in [k // 2, k - 1]: + backward.write(main_block.format(k=k, b_size=b_size, p=p)) + backward.write(bad_padding) + backward.write(bad_filter) + backward.write(con_else) + backward.write(final_else) + for k, m in zip(kernels, min_block): + backward.write(case_k.format(k=k)) + backward.write(chunks_reset.format(b_size=m)) + for p in [k // 2, k - 1]: + backward.write(main_block.format(k=k, b_size=m, p=p)) + backward.write(bad_padding) + backward.write(bad_filter) + backward.write(last_return) + + +if __name__ == "__main__": + gen_forward() + gen_backward() diff --git a/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..4a683d2690d5e3058192afb1b3f4c1f3e2c41352 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py @@ -0,0 +1,227 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import dynamicconv_cuda +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.incremental_decoding_utils import with_incremental_state +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.unfold import unfold1d +from torch import nn +from torch.autograd import Function + + +class dynamicconvFunction(Function): + @staticmethod + def forward(ctx, x, weights, padding_l): + ctx.padding_l = padding_l + outputs = dynamicconv_cuda.forward(x, weights, padding_l) + variables = [x, weights] + ctx.save_for_backward(*variables) + return outputs[0] + + @staticmethod + def backward(ctx, grad_output): + outputs = dynamicconv_cuda.backward( + grad_output.contiguous(), ctx.padding_l, *ctx.saved_tensors + ) + grad_input, grad_weights = outputs + return grad_input, grad_weights, None + + +@with_incremental_state +class DynamicconvLayer(nn.Module): + def __init__( + self, + input_size, + kernel_size=1, + padding_l=None, + weight_softmax=False, + num_heads=1, + weight_dropout=0.0, + bias=False, + renorm_padding=False, + conv_bias=False, + query_size=None, + ): + + super(DynamicconvLayer, self).__init__() + self.input_size = input_size + self.query_size = input_size if query_size is None else query_size + self.kernel_size = kernel_size + self.padding_l = padding_l + self.num_heads = num_heads + self.weight_softmax = weight_softmax + self.weight_dropout_module = FairseqDropout( + weight_dropout, module_name=self.__class__.__name__ + ) + self.renorm_padding = renorm_padding + self.bias = bias + + self.weight_linear = nn.Linear(input_size, num_heads * kernel_size, bias) + if conv_bias: + self.conv_bias = nn.Parameter(torch.Tensor(input_size)) + else: + self.conv_bias = None + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.weight_linear.weight) + if self.conv_bias is not None: + nn.init.constant_(self.conv_bias, 0.0) + nn.init.constant_(self.weight_linaer.bias, 0.0) + + def forward(self, x, incremental_state=None, query=None, unfold=None): + + T, B, C = x.size() + K, H = self.kernel_size, self.num_heads + # R = C // H + + # during inference time, incremental BMM is faster + if incremental_state is not None: + unfold = ( + x.size(0) > 512 if unfold is None else unfold + ) # use unfold mode as default for long sequence to save memory + unfold = unfold or (incremental_state is not None) + assert query is None + + if query is None: + query = x + if unfold: + output = self._forward_unfolded(x, incremental_state, query) + else: + output = self._forward_expanded(x, incremental_state, query) + + if self.conv_bias is not None: + output = output + self.conv_bias.view(1, 1, -1) + + return output + + # during training time, use CUDA kernel + else: + weight = self.weight_linear(x).view(T, B, H, K) + if self.weight_softmax: + weight = F.softmax(weight, dim=-1) + if self.weight_dropout_module.p: + weight = self.weight_dropout_module(weight) + + weight = weight.permute(1, 2, 3, 0).contiguous() + self.filters = weight + x = x.permute(1, 2, 0).contiguous() + output = dynamicconvFunction.apply(x, weight, self.padding_l).permute( + 2, 0, 1 + ) + if self.conv_bias is not None: + output = output + self.conv_bias.view(1, 1, -1) + return output + + def reorder_incremental_state(self, incremental_state, new_order): + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is not None: + input_buffer = input_buffer.index_select(1, new_order) + self._set_input_buffer(incremental_state, input_buffer) + + def _get_input_buffer(self, incremental_state): + return utils.get_incremental_state(self, incremental_state, "input_buffer") + + def _set_input_buffer(self, incremental_state, new_buffer): + return utils.set_incremental_state( + self, incremental_state, "input_buffer", new_buffer + ) + + def _forward_unfolded(self, x, incremental_state, query): + """The conventional implementation of convolutions. + Unfolding the input by having a window shifting to the right.""" + T, B, C = x.size() + K, H = self.kernel_size, self.num_heads + R = C // H + assert R * H == C == self.input_size + + weight = self.weight_linear(query).view(T * B * H, -1) + + # renorm_padding is only implemented in _forward_expanded + assert not self.renorm_padding or incremental_state is not None + + if incremental_state is not None: + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is None: + input_buffer = x.new() + x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3) + if self.kernel_size > 1: + self._set_input_buffer( + incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :] + ) + x_unfold = x_unfold.view(T * B * H, R, -1) + else: + padding_l = self.padding_l + if K > T and padding_l == K - 1: + weight = weight.narrow(1, K - T, T) + K, padding_l = T, T - 1 + # unfold the input: T x B x C --> T' x B x C x K + x_unfold = unfold1d(x, K, padding_l, 0) + x_unfold = x_unfold.view(T * B * H, R, K) + + if self.weight_softmax and not self.renorm_padding: + weight = F.softmax(weight, dim=1) + weight = weight.narrow(1, 0, K) + + if incremental_state is not None: + weight = weight[:, -x_unfold.size(2) :] + K = weight.size(1) + + if self.weight_softmax and self.renorm_padding: + weight = F.softmax(weight, dim=1) + + weight = self.weight_dropout_module(weight, inplace=False) + + output = torch.bmm(x_unfold, weight.unsqueeze(2)) # T*B*H x R x 1 + output = output.view(T, B, C) + return output + + def _forward_expanded(self, x, incremental_stat, query): + """Turn the convolution filters into band matrices and do matrix multiplication. + This is faster when the sequence is short, but less memory efficient. + This is not used in the decoder during inference. + """ + T, B, C = x.size() + K, H = self.kernel_size, self.num_heads + R = C // H + assert R * H == C == self.input_size + weight = self.weight_linear(query).view(T * B * H, -1) + + if not self.renorm_padding: + if self.weight_softmax: + weight = F.softmax(weight, dim=1) + weight = self.weight_dropout_module(weight, inplace=False) + weight = weight.narrow(1, 0, K).contiguous() + weight = weight.view(T, B * H, K).transpose(0, 1) + + x = x.view(T, B * H, R).transpose(0, 1) + if self.weight_softmax and self.renorm_padding: + # turn the convolution filters into band matrices + weight_expanded = weight.new(B * H, T, T + K - 1).fill_(float("-inf")) + weight_expanded.as_strided( + (B * H, T, K), (T * (T + K - 1), T + K, 1) + ).copy_(weight) + weight_expanded = weight_expanded.narrow(2, self.padding_l, T) + # normalize the weight over valid positions like self-attention + weight_expanded = F.softmax(weight_expanded, dim=2) + weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False) + else: + P = self.padding_l + # For efficieny, we cut the kernel size and reduce the padding when the kernel is larger than the length + if K > T and P == K - 1: + weight = weight.narrow(2, K - T, T) + K, P = T, T - 1 + # turn the convolution filters into band matrices + weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False) + weight_expanded.as_strided( + (B * H, T, K), (T * (T + K - 1), T + K, 1) + ).copy_(weight) + weight_expanded = weight_expanded.narrow(2, P, T) # B*H x T x T + output = torch.bmm(weight_expanded, x) + output = output.transpose(0, 1).contiguous().view(T, B, C) + return output diff --git a/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8a6af4285da3c40a01383541acf1f455ffc060fb --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp @@ -0,0 +1,35 @@ +#include +#include + +std::vector dynamicconv_cpu_forward( + float* input, + float* filters, + int padding_l); + +std::vector dynamicconv_cpu_backward( + float* gradOutput, + int padding_l, + float* input, + float* filters); + +std::vector dynamicconv_forward( + float* input, + float* filters, + int padding_l) { + + return dynamicconv_cpu_forward(input, filters, padding_l); +} + +std::vector dynamicconv_backward( + float* gradOutput, + int padding_l, + float* input, + float* filters) { + + return dynamicconv_cpu_backward(gradOutput, padding_l, input, filters); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &dynamicconv_forward, "dynamicconv forward (CPU)"); + m.def("backward", &dynamicconv_backward, "dynamicconv backward (CPU)"); +} diff --git a/fairseq-0.10.2/fairseq/modules/fairseq_dropout.py b/fairseq-0.10.2/fairseq/modules/fairseq_dropout.py new file mode 100644 index 0000000000000000000000000000000000000000..f070a804e6c1e00b6c0db315b944305c2c41d807 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/fairseq_dropout.py @@ -0,0 +1,51 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from typing import List, Optional + +import torch.nn as nn +import torch.nn.functional as F + + +logger = logging.getLogger(__name__) + + +class FairseqDropout(nn.Module): + def __init__(self, p, module_name=None): + super().__init__() + self.p = p + self.module_name = module_name + self.apply_during_inference = False + + def forward(self, x, inplace: bool = False): + if self.training or self.apply_during_inference: + return F.dropout(x, p=self.p, training=True, inplace=inplace) + else: + return x + + def make_generation_fast_( + self, + name: str, + retain_dropout: bool = False, + retain_dropout_modules: Optional[List[str]] = None, + **kwargs + ): + if retain_dropout: + if retain_dropout_modules is not None and self.module_name is None: + logger.warning( + "Cannot enable dropout during inference for module {} " + "because module_name was not set".format(name) + ) + elif ( + retain_dropout_modules is None # if None, apply to all modules + or self.module_name in retain_dropout_modules + ): + logger.info( + "Enabling dropout during inference for module: {}".format(name) + ) + self.apply_during_inference = True + else: + logger.info("Disabling dropout for module: {}".format(name)) diff --git a/fairseq-0.10.2/fairseq/modules/grad_multiply.py b/fairseq-0.10.2/fairseq/modules/grad_multiply.py new file mode 100644 index 0000000000000000000000000000000000000000..08d15f55dfda9c61a1cf8641ea31424fe1d97f57 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/grad_multiply.py @@ -0,0 +1,18 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch + + +class GradMultiply(torch.autograd.Function): + @staticmethod + def forward(ctx, x, scale): + ctx.scale = scale + res = x.new(x) + return res + + @staticmethod + def backward(ctx, grad): + return grad * ctx.scale, None diff --git a/fairseq-0.10.2/fairseq/modules/gumbel_vector_quantizer.py b/fairseq-0.10.2/fairseq/modules/gumbel_vector_quantizer.py new file mode 100644 index 0000000000000000000000000000000000000000..47657bb0ab70864a3f7a0b00c226ccc9fc527fa3 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/gumbel_vector_quantizer.py @@ -0,0 +1,199 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class GumbelVectorQuantizer(nn.Module): + def __init__( + self, + dim, + num_vars, + temp, + groups, + combine_groups, + vq_dim, + time_first, + activation=nn.GELU(), + weight_proj_depth=1, + weight_proj_factor=1, + ): + """Vector quantization using gumbel softmax + + Args: + dim: input dimension (channels) + num_vars: number of quantized vectors per group + temp: temperature for training. this should be a tuple of 3 elements: (start, stop, decay factor) + groups: number of groups for vector quantization + combine_groups: whether to use the vectors for all groups + vq_dim: dimensionality of the resulting quantized vector + time_first: if true, expect input in BxTxC format, otherwise in BxCxT + activation: what activation to use (should be a module). this is only used if weight_proj_depth is > 1 + weight_proj_depth: number of layers (with activation in between) to project input before computing logits + weight_proj_factor: this is used only if weight_proj_depth is > 1. scales the inner dimensionality of + projections by this factor + """ + super().__init__() + + self.groups = groups + self.combine_groups = combine_groups + self.input_dim = dim + self.num_vars = num_vars + self.time_first = time_first + + assert ( + vq_dim % groups == 0 + ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation" + + var_dim = vq_dim // groups + num_groups = groups if not combine_groups else 1 + + self.vars = nn.Parameter(torch.FloatTensor(1, num_groups * num_vars, var_dim)) + nn.init.uniform_(self.vars) + + if weight_proj_depth > 1: + + def block(input_dim, output_dim): + return nn.Sequential(nn.Linear(input_dim, output_dim), activation) + + inner_dim = self.input_dim * weight_proj_factor + self.weight_proj = nn.Sequential( + *[ + block(self.input_dim if i == 0 else inner_dim, inner_dim) + for i in range(weight_proj_depth - 1) + ], + nn.Linear(inner_dim, groups * num_vars), + ) + else: + self.weight_proj = nn.Linear(self.input_dim, groups * num_vars) + nn.init.normal_(self.weight_proj.weight, mean=0, std=1) + nn.init.zeros_(self.weight_proj.bias) + + assert len(temp) == 3, temp + + self.max_temp, self.min_temp, self.temp_decay = temp + self.curr_temp = self.max_temp + self.codebook_indices = None + + def set_num_updates(self, num_updates): + self.curr_temp = max( + self.max_temp * self.temp_decay ** num_updates, self.min_temp + ) + + def get_codebook_indices(self): + if self.codebook_indices is None: + from itertools import product + + p = [range(self.num_vars)] * self.groups + inds = list(product(*p)) + self.codebook_indices = torch.tensor( + inds, dtype=torch.long, device=self.vars.device + ).flatten() + + if not self.combine_groups: + self.codebook_indices = self.codebook_indices.view( + self.num_vars ** self.groups, -1 + ) + for b in range(1, self.groups): + self.codebook_indices[:, b] += self.num_vars * b + self.codebook_indices = self.codebook_indices.flatten() + return self.codebook_indices + + def codebook(self): + indices = self.get_codebook_indices() + return ( + self.vars.squeeze(0) + .index_select(0, indices) + .view(self.num_vars ** self.groups, -1) + ) + + def sample_from_codebook(self, b, n): + indices = self.get_codebook_indices() + indices = indices.view(-1, self.groups) + cb_size = indices.size(0) + assert ( + n < cb_size + ), f"sample size {n} is greater than size of codebook {cb_size}" + sample_idx = torch.randint(low=0, high=cb_size, size=(b * n,)) + indices = indices[sample_idx] + + z = self.vars.squeeze(0).index_select(0, indices.flatten()).view(b, n, -1) + return z + + def to_codebook_index(self, indices): + res = indices.new_full(indices.shape[:-1], 0) + for i in range(self.groups): + exponent = self.groups - i - 1 + res += indices[..., i] * (self.num_vars ** exponent) + return res + + def forward_idx(self, x): + res = self.forward(x, produce_targets=True) + return res["x"], res["targets"] + + def forward(self, x, produce_targets=False): + + result = {"num_vars": self.num_vars * self.groups} + + if not self.time_first: + x = x.transpose(1, 2) + + bsz, tsz, fsz = x.shape + x = x.reshape(-1, fsz) + x = self.weight_proj(x) + x = x.view(bsz * tsz * self.groups, -1) + + _, k = x.max(-1) + hard_x = ( + x.new_zeros(*x.shape) + .scatter_(-1, k.view(-1, 1), 1.0) + .view(bsz * tsz, self.groups, -1) + ) + hard_probs = torch.mean(hard_x.float(), dim=0) + result["code_perplexity"] = torch.exp( + -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1) + ).sum() + + avg_probs = torch.softmax( + x.view(bsz * tsz, self.groups, -1).float(), dim=-1 + ).mean(dim=0) + result["prob_perplexity"] = torch.exp( + -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1) + ).sum() + + result["temp"] = self.curr_temp + + if self.training: + x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=True).type_as(x) + else: + x = hard_x + + x = x.view(bsz * tsz, -1) + + vars = self.vars + if self.combine_groups: + vars = vars.repeat(1, self.groups, 1) + + if produce_targets: + result["targets"] = ( + x.view(bsz * tsz * self.groups, -1) + .argmax(dim=-1) + .view(bsz, tsz, self.groups) + .detach() + ) + + x = x.unsqueeze(-1) * vars + x = x.view(bsz * tsz, self.groups, self.num_vars, -1) + x = x.sum(-2) + x = x.view(bsz, tsz, -1) + + if not self.time_first: + x = x.transpose(1, 2) # BTC -> BCT + + result["x"] = x + + return result diff --git a/fairseq-0.10.2/fairseq/modules/kmeans_vector_quantizer.py b/fairseq-0.10.2/fairseq/modules/kmeans_vector_quantizer.py new file mode 100644 index 0000000000000000000000000000000000000000..040db1e83e775a3bb59d5263d22aae9276a83f22 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/kmeans_vector_quantizer.py @@ -0,0 +1,127 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +from fairseq.modules import Fp32GroupNorm + + +class KmeansVectorQuantizer(nn.Module): + def __init__( + self, dim, num_vars, groups, combine_groups, vq_dim, time_first, gamma=0.25 + ): + """Vector quantization using straight pass-through estimator (i.e. kmeans) + + Args: + dim: input dimension (channels) + num_vars: number of quantized vectors per group + groups: number of groups for vector quantization + combine_groups: whether to use the vectors for all groups + vq_dim: dimensionality of the resulting quantized vector + time_first: if true, expect input in BxTxC format, otherwise in BxCxT + gamma: commitment loss coefficient + """ + super().__init__() + + self.groups = groups + self.combine_groups = combine_groups + self.input_dim = dim + self.num_vars = num_vars + self.vq_dim = vq_dim + self.time_first = time_first + + assert ( + vq_dim % groups == 0 + ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation" + + self.var_dim = vq_dim // groups + num_groups = groups if not combine_groups else 1 + + self.embedding = nn.Parameter( + 0.01 * torch.randn(num_vars, num_groups, self.var_dim) + ) + self.projection = nn.Sequential( + nn.Conv1d(dim, dim, kernel_size=1, groups=groups, bias=False), + Fp32GroupNorm(groups, dim), + ) + self.gamma = gamma + self.mse_mean = nn.MSELoss(reduction="mean") + + def _pass_grad(self, x, y): + """Manually set gradient for backward pass. + for y = f(x), ensure that during the backward pass, + dL/dy = dL/dx regardless of f(x). + Returns: + y, with the gradient forced to be dL/dy = dL/dx. + """ + + return y.detach() + (x - x.detach()) + + @property + def expand_embedding(self): + if self.combine_groups: + return self.embedding.expand(self.num_vars, self.groups, self.var_dim) + return self.embedding + + def forward_idx(self, x): + res = self.forward(x, produce_targets=True) + return res["x"], res["targets"] + + def forward(self, x, produce_targets=False): + + result = {"num_vars": self.num_vars} + + if self.time_first: + x = x.transpose(1, 2) + + bsz, fsz, tsz = x.shape + + ze = self.projection(x) + ze_ = ze.view(bsz, self.groups, self.var_dim, tsz).permute(0, 3, 1, 2) + d = ( + (ze_.unsqueeze(0) - self.expand_embedding.unsqueeze(1).unsqueeze(1)) + .view(self.num_vars, bsz, tsz, self.groups, -1) + .norm(dim=-1, p=2) + ) + idx = d.argmin(dim=0) + zq = ( + torch.stack( + [ + self.expand_embedding[idx[..., group], group] + for group in range(self.groups) + ], + dim=-2, + ) + .view(bsz, tsz, self.groups * self.var_dim) + .permute(0, 2, 1) + ) + assert ze.shape == zq.shape, (ze.shape, zq.shape) + x = self._pass_grad(ze, zq) + + hard_x = ( + idx.new_zeros(bsz * tsz * self.groups, self.num_vars) + .scatter_(-1, idx.view(-1, 1), 1.0) + .view(bsz * tsz, self.groups, -1) + ) + hard_probs = torch.mean(hard_x.float(), dim=0) + result["code_perplexity"] = torch.exp( + -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1) + ).sum() + + if produce_targets: + result["targets"] = idx + + if self.time_first: + x = x.transpose(1, 2) # BCT -> BTC + result["x"] = x + + ze = ze.float() + zq = zq.float() + latent_loss = self.mse_mean(zq, ze.detach()) + commitment_loss = self.mse_mean(ze, zq.detach()) + + result["kmeans_loss"] = latent_loss + self.gamma * commitment_loss + + return result diff --git a/fairseq-0.10.2/fairseq/modules/layer_norm.py b/fairseq-0.10.2/fairseq/modules/layer_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..234609d9e213a650e0032aaa0ca0462a818bfead --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/layer_norm.py @@ -0,0 +1,50 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +try: + from apex.normalization import FusedLayerNorm as _FusedLayerNorm + + has_fused_layernorm = True + + class FusedLayerNorm(_FusedLayerNorm): + @torch.jit.unused + def forward(self, x): + if not x.is_cuda: + return super().forward(x) + else: + with torch.cuda.device(x.device): + return super().forward(x) + + +except ImportError: + has_fused_layernorm = False + + +def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False): + if torch.jit.is_scripting(): + export = True + if not export and torch.cuda.is_available() and has_fused_layernorm: + return FusedLayerNorm(normalized_shape, eps, elementwise_affine) + return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) + + +class Fp32LayerNorm(nn.LayerNorm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, input): + output = F.layer_norm( + input.float(), + self.normalized_shape, + self.weight.float() if self.weight is not None else None, + self.bias.float() if self.bias is not None else None, + self.eps, + ) + return output.type_as(input) diff --git a/fairseq-0.10.2/fairseq/modules/learned_positional_embedding.py b/fairseq-0.10.2/fairseq/modules/learned_positional_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..378d0f707183dd344dbb9288dda394b11053acf0 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/learned_positional_embedding.py @@ -0,0 +1,61 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq import utils +from torch import Tensor + + +class LearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + Padding ids are ignored by either offsetting based on padding_idx + or by setting padding_idx to None and ensuring that the appropriate + position ids are passed to the forward function. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.onnx_trace = False + if self.padding_idx is not None: + self.max_positions = self.num_embeddings - self.padding_idx - 1 + else: + self.max_positions = self.num_embeddings + + def forward( + self, + input: Tensor, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + positions: Optional[Tensor] = None, + ): + """Input is expected to be of size [bsz x seqlen].""" + assert (positions is None) or ( + self.padding_idx is None + ), "If positions is pre-computed then padding_idx should not be set." + + if positions is None: + if incremental_state is not None: + # positions is the same for every token when decoding a single step + # Without the int() cast, it doesn't work in some cases when exporting to ONNX + positions = torch.zeros( + (1, 1), device=input.device, dtype=input.dtype + ).fill_(int(self.padding_idx + input.size(1))) + else: + positions = utils.make_positions( + input, self.padding_idx, onnx_trace=self.onnx_trace + ) + return F.embedding( + positions, + self.weight, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) diff --git a/fairseq-0.10.2/fairseq/modules/lightconv_layer/__init__.py b/fairseq-0.10.2/fairseq/modules/lightconv_layer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3b2a99c1227f827768911e5e22e79f6865ffbfd3 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/lightconv_layer/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .lightconv_layer import LightconvLayer # noqa diff --git a/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda.cpp b/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4bf6b5ad365d604bd91eda384bb422857b640744 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda.cpp @@ -0,0 +1,54 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +std::vector lightconv_cuda_forward( + at::Tensor input, + at::Tensor filters, + int padding_l); + +std::vector lightconv_cuda_backward( + at::Tensor gradOutput, + int padding_l, + at::Tensor input, + at::Tensor filters); + + +#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +std::vector lightconv_forward( + at::Tensor input, + at::Tensor filters, + int padding_l) { + + CHECK_INPUT(input); + CHECK_INPUT(filters); + + return lightconv_cuda_forward(input, filters, padding_l); +} + +std::vector lightconv_backward( + at::Tensor gradOutput, + int padding_l, + at::Tensor input, + at::Tensor filters) { + + CHECK_INPUT(gradOutput); + CHECK_INPUT(input); + CHECK_INPUT(filters); + + return lightconv_cuda_backward(gradOutput, padding_l, input, filters); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &lightconv_forward, "lighconv forward (CUDA)"); + m.def("backward", &lightconv_backward, "lighconv backward (CUDA)"); +} diff --git a/fairseq-0.10.2/fairseq/modules/lightconv_layer/setup.py b/fairseq-0.10.2/fairseq/modules/lightconv_layer/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..052635be79b466d0ad56cf5cf607bd10c2297ecf --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/lightconv_layer/setup.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + + +setup( + name="lightconv_layer", + ext_modules=[ + CUDAExtension( + "lightconv_cuda", + [ + "lightconv_cuda.cpp", + "lightconv_cuda_kernel.cu", + ], + ), + ], + cmdclass={"build_ext": BuildExtension}, +) diff --git a/fairseq-0.10.2/fairseq/modules/sparse_transformer_sentence_encoder_layer.py b/fairseq-0.10.2/fairseq/modules/sparse_transformer_sentence_encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..d95da59c2471bfa858fd627605196d7f41f9ec12 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/sparse_transformer_sentence_encoder_layer.py @@ -0,0 +1,51 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from fairseq.modules import TransformerSentenceEncoderLayer +from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention + + +class SparseTransformerSentenceEncoderLayer(TransformerSentenceEncoderLayer): + """ + Implements a Sprase Transformer Encoder Layer (see SparseMultiheadAttention) + """ + + def __init__( + self, + embedding_dim: int = 768, + ffn_embedding_dim: int = 3072, + num_attention_heads: int = 8, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + activation_fn: str = "relu", + export: bool = False, + is_bidirectional: bool = True, + stride: int = 32, + expressivity: int = 8, + ) -> None: + + super().__init__( + embedding_dim, + ffn_embedding_dim, + num_attention_heads, + dropout, + attention_dropout, + activation_dropout, + activation_fn, + export, + ) + + self.self_attn = SparseMultiheadAttention( + self.embedding_dim, + num_attention_heads, + dropout=attention_dropout, + add_bias_kv=False, + add_zero_attn=False, + self_attention=True, + is_bidirectional=is_bidirectional, + stride=stride, + expressivity=expressivity, + ) diff --git a/fairseq-0.10.2/fairseq/modules/transformer_layer.py b/fairseq-0.10.2/fairseq/modules/transformer_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..48cd4c731445ea9343fc4523f8379133015f4ed1 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/transformer_layer.py @@ -0,0 +1,423 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Dict, List, Optional + +import torch +import torch.nn as nn +from fairseq import utils +from fairseq.modules import LayerNorm, MultiheadAttention +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.quant_noise import quant_noise +from torch import Tensor + + +class TransformerEncoderLayer(nn.Module): + """Encoder layer block. + + In the original paper each operation (multi-head attention or FFN) is + postprocessed with: `dropout -> add residual -> layernorm`. In the + tensor2tensor code they suggest that learning is more robust when + preprocessing each layer with layernorm and postprocessing with: + `dropout -> add residual`. We default to the approach in the paper, but the + tensor2tensor approach can be enabled by setting + *args.encoder_normalize_before* to ``True``. + + Args: + args (argparse.Namespace): parsed command-line arguments + """ + + def __init__(self, args): + super().__init__() + self.embed_dim = args.encoder_embed_dim + self.quant_noise = getattr(args, "quant_noise_pq", 0) + self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8) + self.self_attn = self.build_self_attention(self.embed_dim, args) + self.self_attn_layer_norm = LayerNorm(self.embed_dim) + self.dropout_module = FairseqDropout( + args.dropout, module_name=self.__class__.__name__ + ) + self.activation_fn = utils.get_activation_fn( + activation=getattr(args, "activation_fn", "relu") + ) + activation_dropout_p = getattr(args, "activation_dropout", 0) + if activation_dropout_p == 0: + # for backwards compatibility with models that use args.relu_dropout + activation_dropout_p = getattr(args, "relu_dropout", 0) + self.activation_dropout_module = FairseqDropout( + float(activation_dropout_p), module_name=self.__class__.__name__ + ) + self.normalize_before = args.encoder_normalize_before + self.fc1 = self.build_fc1( + self.embed_dim, + args.encoder_ffn_embed_dim, + self.quant_noise, + self.quant_noise_block_size, + ) + self.fc2 = self.build_fc2( + args.encoder_ffn_embed_dim, + self.embed_dim, + self.quant_noise, + self.quant_noise_block_size, + ) + + self.final_layer_norm = LayerNorm(self.embed_dim) + + def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): + return quant_noise( + nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size + ) + + def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): + return quant_noise( + nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size + ) + + def build_self_attention(self, embed_dim, args): + return MultiheadAttention( + embed_dim, + args.encoder_attention_heads, + dropout=args.attention_dropout, + self_attention=True, + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + ) + + def residual_connection(self, x, residual): + return residual + x + + def upgrade_state_dict_named(self, state_dict, name): + """ + Rename layer norm states from `...layer_norms.0.weight` to + `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to + `...final_layer_norm.weight` + """ + layer_norm_map = {"0": "self_attn_layer_norm", "1": "final_layer_norm"} + for old, new in layer_norm_map.items(): + for m in ("weight", "bias"): + k = "{}.layer_norms.{}.{}".format(name, old, m) + if k in state_dict: + state_dict["{}.{}.{}".format(name, new, m)] = state_dict[k] + del state_dict[k] + + def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None): + """ + Args: + x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor): binary ByteTensor of shape + `(batch, seq_len)` where padding elements are indicated by ``1``. + attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`, + where `tgt_len` is the length of output and `src_len` is the + length of input, though here both are equal to `seq_len`. + `attn_mask[tgt_i, src_j] = 1` means that when calculating the + embedding for `tgt_i`, we exclude (mask out) `src_j`. This is + useful for strided self-attention. + + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + # anything in original attn_mask = 1, becomes -1e8 + # anything in original attn_mask = 0, becomes 0 + # Note that we cannot use -inf here, because at some edge cases, + # the attention weight (before softmax) for some padded element in query + # will become -inf, which results in NaN in model parameters + if attn_mask is not None: + attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8) + + residual = x + if self.normalize_before: + x = self.self_attn_layer_norm(x) + x, _ = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=encoder_padding_mask, + attn_mask=attn_mask, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) + + x = self.activation_fn(self.fc1(x)) + x = self.activation_dropout_module(x) + x = self.fc2(x) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.final_layer_norm(x) + return x + + +class TransformerDecoderLayer(nn.Module): + """Decoder layer block. + + In the original paper each operation (multi-head attention, encoder + attention or FFN) is postprocessed with: `dropout -> add residual -> + layernorm`. In the tensor2tensor code they suggest that learning is more + robust when preprocessing each layer with layernorm and postprocessing with: + `dropout -> add residual`. We default to the approach in the paper, but the + tensor2tensor approach can be enabled by setting + *args.decoder_normalize_before* to ``True``. + + Args: + args (argparse.Namespace): parsed command-line arguments + no_encoder_attn (bool, optional): whether to attend to encoder outputs + (default: False). + """ + + def __init__( + self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False + ): + super().__init__() + self.embed_dim = args.decoder_embed_dim + self.dropout_module = FairseqDropout( + args.dropout, module_name=self.__class__.__name__ + ) + self.quant_noise = getattr(args, "quant_noise_pq", 0) + self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8) + + self.cross_self_attention = getattr(args, "cross_self_attention", False) + + self.self_attn = self.build_self_attention( + self.embed_dim, + args, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + ) + + self.activation_fn = utils.get_activation_fn( + activation=str(args.activation_fn) + if getattr(args, "activation_fn", None) is not None + else "relu" + ) + activation_dropout_p = getattr(args, "activation_dropout", 0) + if activation_dropout_p == 0: + # for backwards compatibility with models that use args.relu_dropout + activation_dropout_p = getattr(args, "relu_dropout", 0) + self.activation_dropout_module = FairseqDropout( + float(activation_dropout_p), module_name=self.__class__.__name__ + ) + self.normalize_before = args.decoder_normalize_before + + # use layerNorm rather than FusedLayerNorm for exporting. + # char_inputs can be used to determint this. + # TODO remove this once we update apex with the fix + export = getattr(args, "char_inputs", False) + self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + + if no_encoder_attn: + self.encoder_attn = None + self.encoder_attn_layer_norm = None + else: + self.encoder_attn = self.build_encoder_attention(self.embed_dim, args) + self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + + self.fc1 = self.build_fc1( + self.embed_dim, + args.decoder_ffn_embed_dim, + self.quant_noise, + self.quant_noise_block_size, + ) + self.fc2 = self.build_fc2( + args.decoder_ffn_embed_dim, + self.embed_dim, + self.quant_noise, + self.quant_noise_block_size, + ) + + self.final_layer_norm = LayerNorm(self.embed_dim, export=export) + self.need_attn = True + + self.onnx_trace = False + + def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): + return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) + + def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): + return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) + + def build_self_attention( + self, embed_dim, args, add_bias_kv=False, add_zero_attn=False + ): + return MultiheadAttention( + embed_dim, + args.decoder_attention_heads, + dropout=args.attention_dropout, + add_bias_kv=add_bias_kv, + add_zero_attn=add_zero_attn, + self_attention=not getattr(args, "cross_self_attention", False), + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + ) + + def build_encoder_attention(self, embed_dim, args): + return MultiheadAttention( + embed_dim, + args.decoder_attention_heads, + kdim=getattr(args, "encoder_embed_dim", None), + vdim=getattr(args, "encoder_embed_dim", None), + dropout=args.attention_dropout, + encoder_decoder_attention=True, + q_noise=self.quant_noise, + qn_block_size=self.quant_noise_block_size, + ) + + def prepare_for_onnx_export_(self): + self.onnx_trace = True + + def residual_connection(self, x, residual): + return residual + x + + def forward( + self, + x, + encoder_out: Optional[torch.Tensor] = None, + encoder_padding_mask: Optional[torch.Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + prev_self_attn_state: Optional[List[torch.Tensor]] = None, + prev_attn_state: Optional[List[torch.Tensor]] = None, + self_attn_mask: Optional[torch.Tensor] = None, + self_attn_padding_mask: Optional[torch.Tensor] = None, + need_attn: bool = False, + need_head_weights: bool = False, + ): + """ + Args: + x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_padding_mask (ByteTensor, optional): binary + ByteTensor of shape `(batch, src_len)` where padding + elements are indicated by ``1``. + need_attn (bool, optional): return attention weights + need_head_weights (bool, optional): return attention weights + for each head (default: return average over heads). + + Returns: + encoded output of shape `(seq_len, batch, embed_dim)` + """ + if need_head_weights: + need_attn = True + + residual = x + if self.normalize_before: + x = self.self_attn_layer_norm(x) + if prev_self_attn_state is not None: + prev_key, prev_value = prev_self_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_self_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_self_attn_state[2] + assert incremental_state is not None + self.self_attn._set_input_buffer(incremental_state, saved_state) + _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state) + if self.cross_self_attention and not ( + incremental_state is not None + and _self_attn_input_buffer is not None + and "prev_key" in _self_attn_input_buffer + ): + if self_attn_mask is not None: + assert encoder_out is not None + self_attn_mask = torch.cat( + (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1 + ) + if self_attn_padding_mask is not None: + if encoder_padding_mask is None: + assert encoder_out is not None + encoder_padding_mask = self_attn_padding_mask.new_zeros( + encoder_out.size(1), encoder_out.size(0) + ) + self_attn_padding_mask = torch.cat( + (encoder_padding_mask, self_attn_padding_mask), dim=1 + ) + assert encoder_out is not None + y = torch.cat((encoder_out, x), dim=0) + else: + y = x + + x, attn = self.self_attn( + query=x, + key=y, + value=y, + key_padding_mask=self_attn_padding_mask, + incremental_state=incremental_state, + need_weights=False, + attn_mask=self_attn_mask, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.self_attn_layer_norm(x) + + if self.encoder_attn is not None and encoder_out is not None: + residual = x + if self.normalize_before: + x = self.encoder_attn_layer_norm(x) + if prev_attn_state is not None: + prev_key, prev_value = prev_attn_state[:2] + saved_state: Dict[str, Optional[Tensor]] = { + "prev_key": prev_key, + "prev_value": prev_value, + } + if len(prev_attn_state) >= 3: + saved_state["prev_key_padding_mask"] = prev_attn_state[2] + assert incremental_state is not None + self.encoder_attn._set_input_buffer(incremental_state, saved_state) + + x, attn = self.encoder_attn( + query=x, + key=encoder_out, + value=encoder_out, + key_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + static_kv=True, + need_weights=need_attn or (not self.training and self.need_attn), + need_head_weights=need_head_weights, + ) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.encoder_attn_layer_norm(x) + + residual = x + if self.normalize_before: + x = self.final_layer_norm(x) + + x = self.activation_fn(self.fc1(x)) + x = self.activation_dropout_module(x) + x = self.fc2(x) + x = self.dropout_module(x) + x = self.residual_connection(x, residual) + if not self.normalize_before: + x = self.final_layer_norm(x) + if self.onnx_trace and incremental_state is not None: + saved_state = self.self_attn._get_input_buffer(incremental_state) + assert saved_state is not None + if self_attn_padding_mask is not None: + self_attn_state = [ + saved_state["prev_key"], + saved_state["prev_value"], + saved_state["prev_key_padding_mask"], + ] + else: + self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]] + return x, attn, self_attn_state + return x, attn, None + + def make_generation_fast_(self, need_attn: bool = False, **kwargs): + self.need_attn = need_attn + + +def Linear(in_features, out_features, bias=True): + m = nn.Linear(in_features, out_features, bias) + nn.init.xavier_uniform_(m.weight) + if bias: + nn.init.constant_(m.bias, 0.0) + return m diff --git a/fairseq-0.10.2/fairseq/modules/transformer_sentence_encoder_layer.py b/fairseq-0.10.2/fairseq/modules/transformer_sentence_encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..3589c60fe6843c549cfcb94a26cd27bad1fd8033 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/transformer_sentence_encoder_layer.py @@ -0,0 +1,134 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Callable, Optional + +import torch +import torch.nn as nn +from fairseq import utils +from fairseq.modules import LayerNorm, MultiheadAttention +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.quant_noise import quant_noise + + +class TransformerSentenceEncoderLayer(nn.Module): + """ + Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained + models. + """ + + def __init__( + self, + embedding_dim: int = 768, + ffn_embedding_dim: int = 3072, + num_attention_heads: int = 8, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + activation_fn: str = "relu", + export: bool = False, + q_noise: float = 0.0, + qn_block_size: int = 8, + init_fn: Callable = None, + ) -> None: + super().__init__() + + if init_fn is not None: + init_fn() + + # Initialize parameters + self.embedding_dim = embedding_dim + self.dropout_module = FairseqDropout( + dropout, module_name=self.__class__.__name__ + ) + self.activation_dropout_module = FairseqDropout( + activation_dropout, module_name=self.__class__.__name__ + ) + + # Initialize blocks + self.activation_fn = utils.get_activation_fn(activation_fn) + self.self_attn = self.build_self_attention( + self.embedding_dim, + num_attention_heads, + dropout=attention_dropout, + self_attention=True, + q_noise=q_noise, + qn_block_size=qn_block_size, + ) + + # layer norm associated with the self attention layer + self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export) + + self.fc1 = self.build_fc1( + self.embedding_dim, + ffn_embedding_dim, + q_noise=q_noise, + qn_block_size=qn_block_size, + ) + self.fc2 = self.build_fc2( + ffn_embedding_dim, + self.embedding_dim, + q_noise=q_noise, + qn_block_size=qn_block_size, + ) + + # layer norm associated with the position wise feed-forward NN + self.final_layer_norm = LayerNorm(self.embedding_dim, export=export) + + def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): + return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) + + def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): + return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) + + def build_self_attention( + self, + embed_dim, + num_attention_heads, + dropout, + self_attention, + q_noise, + qn_block_size, + ): + return MultiheadAttention( + embed_dim, + num_attention_heads, + dropout=dropout, + self_attention=True, + q_noise=q_noise, + qn_block_size=qn_block_size, + ) + + def forward( + self, + x: torch.Tensor, + self_attn_mask: Optional[torch.Tensor] = None, + self_attn_padding_mask: Optional[torch.Tensor] = None, + ): + """ + LayerNorm is applied either before or after the self-attention/ffn + modules similar to the original Transformer implementation. + """ + residual = x + x, attn = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=self_attn_padding_mask, + need_weights=False, + attn_mask=self_attn_mask, + ) + x = self.dropout_module(x) + x = residual + x + x = self.self_attn_layer_norm(x) + + residual = x + x = self.activation_fn(self.fc1(x)) + x = self.activation_dropout_module(x) + x = self.fc2(x) + x = self.dropout_module(x) + x = residual + x + x = self.final_layer_norm(x) + return x, attn diff --git a/fairseq-0.10.2/fairseq/modules/unfold.py b/fairseq-0.10.2/fairseq/modules/unfold.py new file mode 100644 index 0000000000000000000000000000000000000000..138272f1ef4f673b29e36aed4531106f7ce95968 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/unfold.py @@ -0,0 +1,19 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch.nn.functional as F + + +def unfold1d(x, kernel_size, padding_l, pad_value=0): + """unfold T x B x C to T x B x C x K""" + if kernel_size > 1: + T, B, C = x.size() + x = F.pad( + x, (0, 0, 0, 0, padding_l, kernel_size - 1 - padding_l), value=pad_value + ) + x = x.as_strided((T, B, C, kernel_size), (B * C, C, 1, B * C)) + else: + x = x.unsqueeze(3) + return x