diff --git a/fairseq-0.10.2/fairseq/criterions/__init__.py b/fairseq-0.10.2/fairseq/criterions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7eb5f6f3c272c86b15fdf697f72ee9e9382907f
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/criterions/__init__.py
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""isort:skip_file"""
+
+import importlib
+import os
+from argparse import Namespace
+from typing import Union
+
+from fairseq import registry
+from fairseq.criterions.fairseq_criterion import ( # noqa
+ FairseqCriterion,
+ LegacyFairseqCriterion,
+)
+from omegaconf import DictConfig
+
+
# Set up the criterion registry. This exposes:
#   - build_criterion_: low-level factory (wrapped by build_criterion below)
#   - register_criterion: class decorator used by each criterion module
#   - CRITERION_REGISTRY: mapping of name -> criterion class
#   - CRITERION_DATACLASS_REGISTRY: mapping of name -> config dataclass
# "cross_entropy" is the default when --criterion is not given.
(
    build_criterion_,
    register_criterion,
    CRITERION_REGISTRY,
    CRITERION_DATACLASS_REGISTRY,
) = registry.setup_registry(
    "--criterion", base_class=FairseqCriterion, default="cross_entropy"
)
+
+
def build_criterion(criterion_cfg: Union[DictConfig, Namespace], task):
    """Instantiate the criterion selected by *criterion_cfg* for *task*.

    Thin, stable entry point over the registry-generated factory
    ``build_criterion_``; accepts either a structured config or a legacy
    argparse namespace.
    """
    criterion = build_criterion_(criterion_cfg, task)
    return criterion
+
+
# automatically import any Python files in the criterions/ directory so that
# their @register_criterion decorators run and populate the registry
for file in os.listdir(os.path.dirname(__file__)):
    if file.endswith(".py") and not file.startswith("_"):
        # os.path.splitext drops only the trailing ".py" extension.
        # The previous `file[: file.find(".py")]` truncated at the FIRST
        # ".py" substring anywhere in the name, mangling module names such
        # as "my.pyx_helper.py" -> "my" instead of "my.pyx_helper".
        file_name = os.path.splitext(file)[0]
        importlib.import_module("fairseq.criterions." + file_name)
diff --git a/fairseq-0.10.2/fairseq/criterions/__pycache__/ctc.cpython-310.pyc b/fairseq-0.10.2/fairseq/criterions/__pycache__/ctc.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dbef97433cc28e965c86281c5c72ecfc792ef3a1
Binary files /dev/null and b/fairseq-0.10.2/fairseq/criterions/__pycache__/ctc.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/criterions/__pycache__/fairseq_criterion.cpython-310.pyc b/fairseq-0.10.2/fairseq/criterions/__pycache__/fairseq_criterion.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b64fc17e45963bfad92bf845a05d4ed17f4cf8f3
Binary files /dev/null and b/fairseq-0.10.2/fairseq/criterions/__pycache__/fairseq_criterion.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/criterions/__pycache__/legacy_masked_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/criterions/__pycache__/legacy_masked_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b669b4c89eae47c072677df71802b84dcc10696
Binary files /dev/null and b/fairseq-0.10.2/fairseq/criterions/__pycache__/legacy_masked_lm.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/criterions/__pycache__/sentence_prediction.cpython-310.pyc b/fairseq-0.10.2/fairseq/criterions/__pycache__/sentence_prediction.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee152ccb93436cf15cfd892a8ed898865ec4f117
Binary files /dev/null and b/fairseq-0.10.2/fairseq/criterions/__pycache__/sentence_prediction.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/criterions/adaptive_loss.py b/fairseq-0.10.2/fairseq/criterions/adaptive_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..74ba37c321e7ba95c1cd97b5d9f0396dd313b4ee
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/criterions/adaptive_loss.py
@@ -0,0 +1,123 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from dataclasses import dataclass
+
+import torch.nn.functional as F
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.dataclass import FairseqDataclass
+from fairseq.dataclass.constants import DDP_BACKEND_CHOICES
+from omegaconf import II
+
+
@dataclass
class AdaptiveLossConfig(FairseqDataclass):
    # Both fields are filled in at runtime via omegaconf II() interpolation
    # from the top-level training configuration, rather than set directly.
    sentence_avg: bool = II("params.optimization.sentence_avg")
    ddp_backend: DDP_BACKEND_CHOICES = II("params.distributed_training.ddp_backend")
+
+
+@register_criterion("adaptive_loss", dataclass=AdaptiveLossConfig)
+class AdaptiveLoss(FairseqCriterion):
+ """This is an implementation of the loss function accompanying the adaptive softmax approximation for
+ graphical processing units (GPU), described in the paper "Efficient softmax approximation for GPUs"
+ (http://arxiv.org/abs/1609.04309)."""
+
+ def __init__(self, task, sentence_avg):
+ super().__init__(task)
+ self.sentence_avg = sentence_avg
+
+ @classmethod
+ def build_criterion(cls, args, task):
+ if getattr(args, "ddp_backend", None) == "c10d":
+ raise Exception(
+ "AdaptiveLoss is not compatible with the c10d "
+ "version of DistributedDataParallel. Please use "
+ "`--ddp-backend=no_c10d` instead."
+ )
+ return cls(task, args.sentence_avg)
+
+ def forward(self, model, sample, reduce=True):
+ """Compute the loss for the given sample.
+
+ Returns a tuple with three elements:
+ 1) the loss
+ 2) the sample size, which is used as the denominator for the gradient
+ 3) logging outputs to display while training
+ """
+
+ assert (
+ hasattr(model.decoder, "adaptive_softmax")
+ and model.decoder.adaptive_softmax is not None
+ )
+ adaptive_softmax = model.decoder.adaptive_softmax
+
+ net_output = model(**sample["net_input"])
+ orig_target = model.get_targets(sample, net_output)
+
+ nsentences = orig_target.size(0)
+ orig_target = orig_target.view(-1)
+
+ bsz = orig_target.size(0)
+
+ logits, target = adaptive_softmax(net_output[0], orig_target)
+ assert len(target) == len(logits)
+
+ loss = net_output[0].new(1 if reduce else bsz).zero_()
+
+ for i in range(len(target)):
+ if target[i] is not None:
+ assert target[i].min() >= 0 and target[i].max() <= logits[i].size(1)
+ loss += F.cross_entropy(
+ logits[i],
+ target[i],
+ ignore_index=self.padding_idx,
+ reduction="sum" if reduce else "none",
+ )
+
+ orig = utils.strip_pad(orig_target, self.padding_idx)
+ ntokens = orig.numel()
+ sample_size = sample["target"].size(0) if self.sentence_avg else ntokens
+ logging_output = {
+ "loss": loss.data,
+ "ntokens": ntokens,
+ "nsentences": nsentences,
+ "sample_size": sample_size,
+ }
+ return loss, sample_size, logging_output
+
+ @staticmethod
+ def reduce_metrics(logging_outputs) -> None:
+ """Aggregate logging outputs from data parallel training."""
+ loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+ ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs))
+ sample_size = utils.item(
+ sum(log.get("sample_size", 0) for log in logging_outputs)
+ )
+
+ metrics.log_scalar(
+ "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+ )
+ if sample_size != ntokens:
+ metrics.log_scalar(
+ "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
+ )
+ metrics.log_derived(
+ "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
+ )
+ else:
+ metrics.log_derived(
+ "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
+ )
+
+ @staticmethod
+ def logging_outputs_can_be_summed() -> bool:
+ """
+ Whether the logging outputs returned by `forward` can be summed
+ across workers prior to calling `reduce_metrics`. Setting this
+ to True will improves distributed training speed.
+ """
+ return True
diff --git a/fairseq-0.10.2/fairseq/criterions/composite_loss.py b/fairseq-0.10.2/fairseq/criterions/composite_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..98e835fa6e4c0bcad062df9c519701bf795c98be
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/criterions/composite_loss.py
@@ -0,0 +1,100 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import utils
+from fairseq.criterions import LegacyFairseqCriterion, register_criterion
+from torch import nn
+
+
+@register_criterion("composite_loss")
+class CompositeLoss(LegacyFairseqCriterion):
+ """This is a composite loss that, given a list of model outputs and a list of targets,
+ computes an average of losses for each output-target pair"""
+
+ def __init__(self, args, task):
+ super().__init__(args, task)
+ self.underlying_criterion = args.underlying_criterion
+
+ @staticmethod
+ def add_args(parser):
+ """Add criterion-specific arguments to the parser."""
+ # fmt: off
+ parser.add_argument('--underlying-criterion', type=str, metavar='VAL', required=True,
+ help='underlying criterion to use for the composite loss')
+ # fmt: on
+
+ @staticmethod
+ def build_underlying_criterion(args, task):
+ saved_criterion = args.criterion
+ args.criterion = args.underlying_criterion
+ assert saved_criterion != args.underlying_criterion
+ underlying_criterion = task.build_criterion(args)
+ args.criterion = saved_criterion
+ return underlying_criterion
+
+ @classmethod
+ def build_criterion(cls, args, task):
+ underlying_criterion = CompositeLoss.build_underlying_criterion(args, task)
+
+ class FakeModel(nn.Module):
+ def __init__(self, model, net_out, target):
+ super().__init__()
+ self.model = model
+ self.net_out = net_out
+ self.target = target
+
+ def forward(self, **unused):
+ return self.net_out
+
+ def get_normalized_probs(self, net_output, log_probs, sample=None):
+ return self.model.get_normalized_probs(
+ net_output, log_probs, sample=sample
+ )
+
+ def get_targets(self, *unused):
+ return self.target
+
+ @property
+ def decoder(self):
+ return self.model.decoder
+
+ class _CompositeLoss(LegacyFairseqCriterion):
+ def __init__(self, args, task, underlying_criterion):
+ super().__init__(args, task)
+ self.underlying_criterion = underlying_criterion
+
+ def forward(self, model, sample, reduce=True):
+ net_outputs = model(**sample["net_input"])
+ targets = sample["target"]
+
+ bsz = targets[0].size(0)
+ loss = net_outputs[0][0].new(1 if reduce else bsz).float().zero_()
+
+ sample_size = 0
+ logging_output = {}
+ for o, t in zip(net_outputs[0], targets):
+ m = FakeModel(model, (o, net_outputs[1]), t)
+ sample["target"] = t
+ l, ss, logging_output = self.underlying_criterion(m, sample, reduce)
+ loss += l
+ sample_size += ss
+
+ loss.div_(len(targets))
+ sample_size /= len(targets)
+
+ logging_output["loss"] = utils.item(loss.data) if reduce else loss.data
+ return loss, sample_size, logging_output
+
+ @staticmethod
+ def aggregate_logging_outputs(logging_outputs):
+ return underlying_criterion.__class__.aggregate_logging_outputs(
+ logging_outputs
+ )
+
+ @staticmethod
+ def reduce_metrics(logging_outputs) -> None:
+ underlying_criterion.__class__.reduce_metrics(logging_outputs)
+
+ return _CompositeLoss(args, task, underlying_criterion)
diff --git a/fairseq-0.10.2/fairseq/criterions/ctc.py b/fairseq-0.10.2/fairseq/criterions/ctc.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f93b3cbfd172f43449d2b80b6f3efd88416eba2
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/criterions/ctc.py
@@ -0,0 +1,253 @@
+# All rights reserved.
+#
+# This source code is licensed under the license found in the LICENSE file in
+# the root directory of this source tree. An additional grant of patent rights
+# can be found in the PATENTS file in the same directory.
+
+import math
+from argparse import Namespace
+
+import torch
+import torch.nn.functional as F
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+from fairseq.data.data_utils import post_process
+from fairseq.logging.meters import safe_round
+
+
+@register_criterion("ctc")
+class CtcCriterion(FairseqCriterion):
+ def __init__(self, task, wer_args, zero_infinity, sentence_avg, remove_bpe):
+ super().__init__(task)
+ self.blank_idx = task.target_dictionary.bos()
+ self.pad_idx = task.target_dictionary.pad()
+ self.eos_idx = task.target_dictionary.eos()
+ self.post_process = remove_bpe if remove_bpe else "letter"
+
+ if wer_args is not None:
+ from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder
+
+ wer_compute_kenlm, wer_lexicon, lm_w, ws_w = eval(wer_args)
+
+ dec_args = Namespace()
+ dec_args.nbest = 1
+ dec_args.criterion = "ctc"
+ dec_args.kenlm_model = wer_compute_kenlm
+ dec_args.lexicon = wer_lexicon
+ dec_args.beam = 50
+ dec_args.beam_size_token = min(50, len(task.target_dictionary))
+ dec_args.beam_threshold = min(50, len(task.target_dictionary))
+ dec_args.lm_weight = lm_w
+ dec_args.word_score = ws_w
+ dec_args.unk_weight = -math.inf
+ dec_args.sil_weight = 0
+
+ self.w2l_decoder = W2lKenLMDecoder(dec_args, task.target_dictionary)
+ else:
+ self.w2l_decoder = None
+
+ self.zero_infinity = zero_infinity
+ self.sentence_avg = sentence_avg
+
+ @staticmethod
+ def add_args(parser):
+ """Add criterion-specific arguments to the parser."""
+ parser.add_argument(
+ "--zero-infinity", action="store_true", help="zero inf loss"
+ )
+ try:
+ parser.add_argument(
+ "--remove-bpe",
+ "--post-process",
+ default="letter",
+ help="remove BPE tokens before scoring (can be set to sentencepiece, letter, and more)",
+ )
+ except:
+ pass # this option might have been added from eval args
+ parser.add_argument(
+ "--wer-args",
+ type=str,
+ default=None,
+ help="options for wer computation on valid set using 4 gram lm. this should be a tuple of 4 elements: path to 4-gram lm, \
+ path to lexicon, lm score, word score",
+ )
+
+ def forward(self, model, sample, reduce=True):
+ net_output = model(**sample["net_input"])
+ lprobs = model.get_normalized_probs(
+ net_output, log_probs=True
+ ).contiguous() # (T, B, C) from the encoder
+
+ if "src_lengths" in sample["net_input"]:
+ input_lengths = sample["net_input"]["src_lengths"]
+ else:
+ non_padding_mask = ~net_output["padding_mask"]
+ input_lengths = non_padding_mask.long().sum(-1)
+
+ pad_mask = (sample["target"] != self.pad_idx) & (
+ sample["target"] != self.eos_idx
+ )
+ targets_flat = sample["target"].masked_select(pad_mask)
+ target_lengths = sample["target_lengths"]
+
+ with torch.backends.cudnn.flags(enabled=False):
+ loss = F.ctc_loss(
+ lprobs,
+ targets_flat,
+ input_lengths,
+ target_lengths,
+ blank=self.blank_idx,
+ reduction="sum",
+ zero_infinity=self.zero_infinity,
+ )
+
+ ntokens = (
+ sample["ntokens"] if "ntokens" in sample else target_lengths.sum().item()
+ )
+
+ sample_size = sample["target"].size(0) if self.sentence_avg else ntokens
+ logging_output = {
+ "loss": utils.item(loss.data), # * sample['ntokens'],
+ "ntokens": ntokens,
+ "nsentences": sample["id"].numel(),
+ "sample_size": sample_size,
+ }
+
+ if not model.training:
+ import editdistance
+
+ with torch.no_grad():
+ lprobs_t = lprobs.transpose(0, 1).float().contiguous().cpu()
+
+ c_err = 0
+ c_len = 0
+ w_errs = 0
+ w_len = 0
+ wv_errs = 0
+ for lp, t, inp_l in zip(
+ lprobs_t,
+ sample["target_label"]
+ if "target_label" in sample
+ else sample["target"],
+ input_lengths,
+ ):
+ lp = lp[:inp_l].unsqueeze(0)
+
+ decoded = None
+ if self.w2l_decoder is not None:
+ decoded = self.w2l_decoder.decode(lp)
+ if len(decoded) < 1:
+ decoded = None
+ else:
+ decoded = decoded[0]
+ if len(decoded) < 1:
+ decoded = None
+ else:
+ decoded = decoded[0]
+
+ p = (t != self.task.target_dictionary.pad()) & (
+ t != self.task.target_dictionary.eos()
+ )
+ targ = t[p]
+ targ_units = self.task.target_dictionary.string(targ)
+ targ_units_arr = targ.tolist()
+
+ toks = lp.argmax(dim=-1).unique_consecutive()
+ pred_units_arr = toks[toks != self.blank_idx].tolist()
+
+ c_err += editdistance.eval(pred_units_arr, targ_units_arr)
+ c_len += len(targ_units_arr)
+
+ targ_words = post_process(targ_units, self.post_process).split()
+
+ pred_units = self.task.target_dictionary.string(pred_units_arr)
+ pred_words_raw = post_process(pred_units, self.post_process).split()
+
+ if decoded is not None and "words" in decoded:
+ pred_words = decoded["words"]
+ w_errs += editdistance.eval(pred_words, targ_words)
+ wv_errs += editdistance.eval(pred_words_raw, targ_words)
+ else:
+ dist = editdistance.eval(pred_words_raw, targ_words)
+ w_errs += dist
+ wv_errs += dist
+
+ w_len += len(targ_words)
+
+ logging_output["wv_errors"] = wv_errs
+ logging_output["w_errors"] = w_errs
+ logging_output["w_total"] = w_len
+ logging_output["c_errors"] = c_err
+ logging_output["c_total"] = c_len
+
+ return loss, sample_size, logging_output
+
+ @staticmethod
+ def reduce_metrics(logging_outputs) -> None:
+ """Aggregate logging outputs from data parallel training."""
+
+ loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
+ ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs))
+ nsentences = utils.item(
+ sum(log.get("nsentences", 0) for log in logging_outputs)
+ )
+ sample_size = utils.item(
+ sum(log.get("sample_size", 0) for log in logging_outputs)
+ )
+
+ metrics.log_scalar(
+ "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+ )
+ metrics.log_scalar("ntokens", ntokens)
+ metrics.log_scalar("nsentences", nsentences)
+ if sample_size != ntokens:
+ metrics.log_scalar(
+ "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
+ )
+
+ c_errors = sum(log.get("c_errors", 0) for log in logging_outputs)
+ metrics.log_scalar("_c_errors", c_errors)
+ c_total = sum(log.get("c_total", 0) for log in logging_outputs)
+ metrics.log_scalar("_c_total", c_total)
+ w_errors = sum(log.get("w_errors", 0) for log in logging_outputs)
+ metrics.log_scalar("_w_errors", w_errors)
+ wv_errors = sum(log.get("wv_errors", 0) for log in logging_outputs)
+ metrics.log_scalar("_wv_errors", wv_errors)
+ w_total = sum(log.get("w_total", 0) for log in logging_outputs)
+ metrics.log_scalar("_w_total", w_total)
+
+ if c_total > 0:
+ metrics.log_derived(
+ "uer",
+ lambda meters: safe_round(
+ meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3
+ )
+ if meters["_c_total"].sum > 0
+ else float("nan"),
+ )
+ if w_total > 0:
+ metrics.log_derived(
+ "wer",
+ lambda meters: safe_round(
+ meters["_w_errors"].sum * 100.0 / meters["_w_total"].sum, 3
+ )
+ if meters["_w_total"].sum > 0
+ else float("nan"),
+ )
+ metrics.log_derived(
+ "raw_wer",
+ lambda meters: safe_round(
+ meters["_wv_errors"].sum * 100.0 / meters["_w_total"].sum, 3
+ )
+ if meters["_w_total"].sum > 0
+ else float("nan"),
+ )
+
+ @staticmethod
+ def logging_outputs_can_be_summed() -> bool:
+ """
+ Whether the logging outputs returned by `forward` can be summed
+ across workers prior to calling `reduce_metrics`. Setting this
+ to True will improves distributed training speed.
+ """
+ return True
diff --git a/fairseq-0.10.2/fairseq/criterions/fairseq_criterion.py b/fairseq-0.10.2/fairseq/criterions/fairseq_criterion.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef94a863276d6569cb47028069ec199ec5f63055
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/criterions/fairseq_criterion.py
@@ -0,0 +1,119 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+from typing import Any, Dict, List
+
+from fairseq import metrics, utils
+from fairseq.dataclass.utils import gen_parser_from_dataclass
+from torch.nn.modules.loss import _Loss
+
+
class FairseqCriterion(_Loss):
    """Base class for fairseq criterions (loss functions).

    Subclasses implement :meth:`forward`, which returns a
    ``(loss, sample_size, logging_output)`` tuple, and may override the
    classmethods below to customize construction from command-line args.
    """

    def __init__(self, task):
        super().__init__()
        self.task = task
        # Default to -100 — the "ignore" index convention of F.nll_loss /
        # F.cross_entropy — then prefer the task's pad index when one is
        # available. Assigning unconditionally fixes an AttributeError that
        # previously occurred when a criterion accessed self.padding_idx
        # under a task without a target_dictionary attribute (the attribute
        # was simply never set in that case).
        self.padding_idx = -100
        if hasattr(task, "target_dictionary"):
            tgt_dict = task.target_dictionary
            if tgt_dict is not None:
                self.padding_idx = tgt_dict.pad()

    @classmethod
    def add_args(cls, parser):
        """Add criterion-specific arguments to the parser."""
        # If the subclass declares a config dataclass, derive its CLI
        # arguments from it automatically.
        dc = getattr(cls, "__dataclass", None)
        if dc is not None:
            gen_parser_from_dataclass(parser, dc())

    @classmethod
    def build_criterion(cls, args, task):
        """Construct a criterion from command-line args.

        Criterions can override this, but for convenience we also try
        to automatically map argparse.Namespace keys to corresponding
        arguments in the __init__.
        """
        init_args = {}
        for p in inspect.signature(cls).parameters.values():
            if (
                p.kind == p.POSITIONAL_ONLY
                or p.kind == p.VAR_POSITIONAL
                or p.kind == p.VAR_KEYWORD
            ):
                # we haven't implemented inference for these argument types,
                # but PRs welcome :)
                raise NotImplementedError("{} not supported".format(p.kind))

            assert p.kind in {p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY}

            if p.name == "task":
                init_args["task"] = task
            elif hasattr(args, p.name):
                init_args[p.name] = getattr(args, p.name)
            elif p.default != p.empty:
                pass  # we'll use the default value
            else:
                raise NotImplementedError(
                    "Unable to infer Criterion arguments, please implement "
                    "{}.build_criterion".format(cls.__name__)
                )
        return cls(**init_args)

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        raise NotImplementedError

    @staticmethod
    def aggregate_logging_outputs(
        logging_outputs: List[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Aggregate logging outputs from data parallel training."""
        utils.deprecation_warning(
            "The aggregate_logging_outputs API is deprecated. "
            "Please use the reduce_metrics API instead."
        )
        raise NotImplementedError

    @classmethod
    def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None:
        """Aggregate logging outputs from data parallel training."""
        utils.deprecation_warning(
            "Criterions should implement the reduce_metrics API. "
            "Falling back to deprecated aggregate_logging_outputs API."
        )
        agg_logging_outputs = cls.aggregate_logging_outputs(logging_outputs)
        for k, v in agg_logging_outputs.items():
            # These counters are bookkeeping, not metrics to display.
            if k in {"nsentences", "ntokens", "sample_size"}:
                continue
            metrics.log_scalar(k, v)

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Setting this
        to True will improves distributed training speed.
        """
        return False
+
+
class LegacyFairseqCriterion(FairseqCriterion):
    """Backwards-compatibility base for criterions that still consume a full
    ``argparse.Namespace`` rather than explicit constructor arguments.

    New criterions should subclass :class:`FairseqCriterion` directly; a
    deprecation warning is emitted on every construction.
    """

    def __init__(self, args, task):
        super().__init__(task=task)
        # Retain the raw namespace for subclasses that read arbitrary flags.
        self.args = args
        utils.deprecation_warning(
            "Criterions should take explicit arguments instead of an "
            "argparse.Namespace object, please update your criterion by "
            "extending FairseqCriterion instead of LegacyFairseqCriterion."
        )

    @classmethod
    def build_criterion(cls, args, task):
        """Construct a criterion from command-line args."""
        return cls(args, task)
diff --git a/fairseq-0.10.2/fairseq/criterions/legacy_masked_lm.py b/fairseq-0.10.2/fairseq/criterions/legacy_masked_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..c70608c5a143b7b4fbd8c58dfcf9f873639d379c
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/criterions/legacy_masked_lm.py
@@ -0,0 +1,177 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
def compute_cross_entropy_loss(logits, targets, ignore_index=-100):
    """Sum-reduced cross-entropy between *logits* and integer *targets*.

    The log-softmax is taken in float32 for numerical stability before the
    NLL reduction. ``ignore_index`` defaults to -100, matching the default
    of ``F.cross_entropy`` in PyTorch.
    """
    assert logits.size(0) == targets.size(-1), "Logits and Targets tensor shapes don't match up"

    log_probs = F.log_softmax(logits, -1, dtype=torch.float32)
    return F.nll_loss(
        log_probs,
        targets,
        reduction="sum",
        ignore_index=ignore_index,
    )
+
+
+@register_criterion("legacy_masked_lm_loss")
+class LegacyMaskedLmLoss(FairseqCriterion):
+ """
+ Implementation for the loss used in masked language model (MLM) training.
+ This optionally also computes the next sentence prediction (NSP) loss and
+ adds it to the overall loss based on the specified args. There are three
+ cases to consider:
+ 1) Generic MLM training without NSP loss. In this case sentence_targets
+ and sentence_logits are both None.
+ 2) BERT training without NSP loss. In this case sentence_targets is
+ not None but sentence_logits is None and we should not be computing
+ a sentence level loss.
+ 3) BERT training with NSP loss. In this case both sentence_targets and
+ sentence_logits are not None and we should be computing a sentence
+ level loss. The weight of the sentence level loss is specified as
+ an argument.
+ """
+
+ def __init__(self, task, masked_lm_only, nsp_loss_weight):
+ super().__init__(task)
+ self.masked_lm_only = masked_lm_only
+ self.nsp_loss_weight = nsp_loss_weight
+
+ @staticmethod
+ def add_args(parser):
+ """Args for MaskedLM Loss"""
+ # Default for masked_lm_only is False so as to not break BERT training
+ parser.add_argument(
+ "--masked-lm-only",
+ default=False,
+ action="store_true",
+ help="compute MLM loss only",
+ )
+ parser.add_argument(
+ "--nsp-loss-weight",
+ default=1.0,
+ type=float,
+ help="weight for next sentence prediction" " loss (default 1)",
+ )
+
+ def forward(self, model, sample, reduce=True):
+ """Compute the loss for the given sample.
+ Returns a tuple with three elements:
+ 1) the loss
+ 2) the sample size, which is used as the denominator for the gradient
+ 3) logging outputs to display while training
+ """
+ lm_logits, output_metadata = model(**sample["net_input"])
+
+ # reshape lm_logits from (N,T,C) to (N*T,C)
+ lm_logits = lm_logits.view(-1, lm_logits.size(-1))
+ lm_targets = sample["lm_target"].view(-1)
+ lm_loss = compute_cross_entropy_loss(lm_logits, lm_targets, self.padding_idx)
+
+ # compute the number of tokens for which loss is computed. This is used
+ # to normalize the loss
+ ntokens = utils.strip_pad(lm_targets, self.padding_idx).numel()
+ loss = lm_loss / ntokens
+ nsentences = sample["nsentences"]
+ # nsentences = 0
+
+ # Compute sentence loss if masked_lm_only is False
+ sentence_loss = None
+ if not self.masked_lm_only:
+ sentence_logits = output_metadata["sentence_logits"]
+ sentence_targets = sample["sentence_target"].view(-1)
+ # This needs to be recomputed due to some differences between
+ # TokenBlock and BlockPair dataset. This can be resolved with a
+ # refactor of BERTModel which we will do in the future.
+ # TODO: Remove this after refactor of BERTModel
+ nsentences = sentence_targets.size(0)
+
+ # Check for logits being none which can happen when remove_heads
+ # is set to true in the BERT model. Ideally we should set
+ # masked_lm_only to true in this case, but that requires some
+ # refactor in the BERT model.
+ if sentence_logits is not None:
+ sentence_loss = compute_cross_entropy_loss(
+ sentence_logits, sentence_targets
+ )
+
+ loss += self.nsp_loss_weight * (sentence_loss / nsentences)
+
+ # NOTE: as we are summing up per token mlm loss and per sentence nsp loss
+ # we don't need to use sample_size as denominator for the gradient
+ # here sample_size is just used for logging
+ sample_size = 1
+ logging_output = {
+ "loss": utils.item(loss.data) if reduce else loss.data,
+ "lm_loss": utils.item(lm_loss.data) if reduce else lm_loss.data,
+ # sentence loss is not always computed
+ "sentence_loss": (
+ (utils.item(sentence_loss.data) if reduce else sentence_loss.data)
+ if sentence_loss is not None
+ else 0.0
+ ),
+ "ntokens": ntokens,
+ "nsentences": nsentences,
+ "sample_size": sample_size,
+ }
+ return loss, sample_size, logging_output
+
+ @staticmethod
+ def reduce_metrics(logging_outputs) -> None:
+ """Aggregate logging outputs from data parallel training."""
+ lm_loss_sum = sum(log.get("lm_loss", 0) for log in logging_outputs)
+ sentence_loss_sum = sum(log.get("sentence_loss", 0) for log in logging_outputs)
+ ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+ nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+ sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+ agg_loss = sum(log.get("loss", 0) for log in logging_outputs)
+
+ metrics.log_scalar(
+ "loss",
+ agg_loss / sample_size / math.log(2) if sample_size > 0 else 0.0,
+ sample_size,
+ round=3,
+ )
+ metrics.log_scalar(
+ "lm_loss",
+ lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0,
+ ntokens,
+ round=3,
+ )
+ metrics.log_scalar(
+ "sentence_loss",
+ sentence_loss_sum / nsentences / math.log(2) if nsentences > 0 else 0.0,
+ nsentences,
+ round=3,
+ )
+ metrics.log_scalar(
+ "nll_loss",
+ lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0,
+ ntokens,
+ round=3,
+ )
+
+ @staticmethod
+ def logging_outputs_can_be_summed() -> bool:
+ """
+ Whether the logging outputs returned by `forward` can be summed
+ across workers prior to calling `reduce_metrics`. Setting this
+ to True will improves distributed training speed.
+ """
+ return True
diff --git a/fairseq-0.10.2/fairseq/criterions/sentence_prediction.py b/fairseq-0.10.2/fairseq/criterions/sentence_prediction.py
new file mode 100644
index 0000000000000000000000000000000000000000..9519fdc56d7de86b727f74ef5b18db520382e562
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/criterions/sentence_prediction.py
@@ -0,0 +1,99 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
+@register_criterion("sentence_prediction")
+class SentencePredictionCriterion(FairseqCriterion):
+ def __init__(self, task, classification_head_name, regression_target):
+ super().__init__(task)
+ self.classification_head_name = classification_head_name
+ self.regression_target = regression_target
+
+ @staticmethod
+ def add_args(parser):
+ # fmt: off
+ parser.add_argument('--classification-head-name',
+ default='sentence_classification_head',
+ help='name of the classification head to use')
+ # fmt: on
+
+ def forward(self, model, sample, reduce=True):
+ """Compute the loss for the given sample.
+
+ Returns a tuple with three elements:
+ 1) the loss
+ 2) the sample size, which is used as the denominator for the gradient
+ 3) logging outputs to display while training
+ """
+ assert (
+ hasattr(model, "classification_heads")
+ and self.classification_head_name in model.classification_heads
+ ), "model must provide sentence classification head for --criterion=sentence_prediction"
+
+ logits, _ = model(
+ **sample["net_input"],
+ features_only=True,
+ classification_head_name=self.classification_head_name,
+ )
+ targets = model.get_targets(sample, [logits]).view(-1)
+ sample_size = targets.numel()
+
+ if not self.regression_target:
+ lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+ loss = F.nll_loss(lprobs, targets, reduction="sum")
+ else:
+ logits = logits.view(-1).float()
+ targets = targets.float()
+ loss = F.mse_loss(logits, targets, reduction="sum")
+
+ logging_output = {
+ "loss": loss.data,
+ "ntokens": sample["ntokens"],
+ "nsentences": sample_size,
+ "sample_size": sample_size,
+ }
+ if not self.regression_target:
+ preds = logits.argmax(dim=1)
+ logging_output["ncorrect"] = (preds == targets).sum()
+
+ return loss, sample_size, logging_output
+
+ @staticmethod
+ def reduce_metrics(logging_outputs) -> None:
+ """Aggregate logging outputs from data parallel training."""
+ loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
+ ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
+ nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
+ sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
+
+ metrics.log_scalar(
+ "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
+ )
+ if sample_size != ntokens:
+ metrics.log_scalar(
+ "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
+ )
+
+ if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]:
+ ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs)
+ metrics.log_scalar(
+ "accuracy", 100.0 * ncorrect / nsentences, nsentences, round=1
+ )
+
+ @staticmethod
+ def logging_outputs_can_be_summed() -> bool:
+ """
+ Whether the logging outputs returned by `forward` can be summed
+ across workers prior to calling `reduce_metrics`. Setting this
+ to True will improves distributed training speed.
+ """
+ return True
diff --git a/fairseq-0.10.2/fairseq/criterions/sentence_ranking.py b/fairseq-0.10.2/fairseq/criterions/sentence_ranking.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4c76341d4d87e6d0da21ac89e833ce0bda13a0c
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/criterions/sentence_ranking.py
@@ -0,0 +1,120 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+
+import torch
+import torch.nn.functional as F
+from fairseq import metrics, utils
+from fairseq.criterions import FairseqCriterion, register_criterion
+
+
@register_criterion("sentence_ranking")
class SentenceRankingCriterion(FairseqCriterion):
    """Ranking criterion: scores each of ``num_classes`` candidate inputs with
    a shared classification head and trains with cross-entropy over the
    per-candidate scores. Optionally streams predictions to a file.
    """

    def __init__(self, task, ranking_head_name, save_predictions, num_classes):
        super().__init__(task)
        self.ranking_head_name = ranking_head_name
        # Open the predictions file eagerly if one was requested; it stays
        # open for the criterion's lifetime and is closed in __del__.
        self.prediction_h = (
            open(save_predictions, "w") if save_predictions is not None else None
        )
        self.num_classes = num_classes

    def __del__(self):
        # Best-effort cleanup of the predictions file handle.
        if self.prediction_h is not None:
            self.prediction_h.close()

    @staticmethod
    def add_args(parser):
        # fmt: off
        parser.add_argument('--save-predictions', metavar='FILE',
                            help='file to save predictions to')
        parser.add_argument('--ranking-head-name',
                            default='sentence_classification_head',
                            help='name of the ranking head to use')
        # fmt: on

    def forward(self, model, sample, reduce=True):
        """Compute ranking loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        assert (
            hasattr(model, "classification_heads")
            and self.ranking_head_name in model.classification_heads
        ), "model must provide sentence ranking head for --criterion=sentence_ranking"

        # Score each candidate with the shared ranking head. Candidates are
        # 1-indexed in the sample: net_input1 .. net_input<num_classes>.
        scores = []
        for candidate in range(self.num_classes):
            candidate_score, _ = model(
                **sample["net_input{idx}".format(idx=candidate + 1)],
                classification_head_name=self.ranking_head_name,
            )
            scores.append(candidate_score)

        # One column of logits per candidate; rows are sentences.
        logits = torch.cat(scores, dim=1)
        sample_size = logits.size(0)

        if "target" in sample:
            targets = model.get_targets(sample, [logits]).view(-1)
            lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
            loss = F.nll_loss(lprobs, targets, reduction="sum")
        else:
            # Inference without gold labels: emit a zero loss that still
            # participates in autograd.
            targets = None
            loss = torch.tensor(0.0, requires_grad=True)

        if self.prediction_h is not None:
            preds = logits.argmax(dim=1)
            for row, (sample_id, pred) in enumerate(
                zip(sample["id"].tolist(), preds.tolist())
            ):
                if targets is not None:
                    label = targets[row].item()
                    print(
                        "{}\t{}\t{}".format(sample_id, pred, label),
                        file=self.prediction_h,
                    )
                else:
                    print("{}\t{}".format(sample_id, pred), file=self.prediction_h)

        logging_output = {
            "loss": loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample_size,
            "sample_size": sample_size,
        }
        if targets is not None:
            logging_output["ncorrect"] = (logits.argmax(dim=1) == targets).sum()

        return loss, sample_size, logging_output

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""

        def total(key):
            # Sum one logged quantity across workers; missing keys count as 0.
            return sum(log.get(key, 0) for log in logging_outputs)

        loss_total = total("loss")
        token_total = total("ntokens")
        sentence_total = total("nsentences")
        size_total = total("sample_size")

        # Loss is reported per sample, converted to base 2 (bits).
        metrics.log_scalar(
            "loss", loss_total / size_total / math.log(2), size_total, round=3
        )
        if size_total != token_total:
            metrics.log_scalar(
                "nll_loss", loss_total / token_total / math.log(2), token_total, round=3
            )

        if logging_outputs and "ncorrect" in logging_outputs[0]:
            metrics.log_scalar(
                "accuracy",
                100.0 * total("ncorrect") / sentence_total,
                sentence_total,
                round=1,
            )

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """Return True: logging outputs are plain sums, so workers may add
        them elementwise before `reduce_metrics`, which improves distributed
        training speed.
        """
        return True
diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/__pycache__/transformer.cpython-310.pyc b/fairseq-0.10.2/fairseq/model_parallel/models/__pycache__/transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65c5013e79484af9c146e46d80d022fdb3d9202c
Binary files /dev/null and b/fairseq-0.10.2/fairseq/model_parallel/models/__pycache__/transformer.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py b/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbfc6ae4a0bfb8e8c66403a621d5ad6e52996b1a
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/model.py
@@ -0,0 +1,721 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.model_parallel.models.pipeline_parallel_transformer.layers import (
+ Embedding,
+ TransformerDecoderEmbedding,
+ TransformerDecoderLayer,
+ TransformerDecoderOutputLayer,
+ TransformerEncoderEmbedding,
+ TransformerEncoderLayer,
+ TransformerEncoderLayerNorm,
+)
+from fairseq.models import (
+ BaseFairseqModel,
+ FairseqDecoder,
+ FairseqEncoder,
+ register_model,
+ register_model_architecture,
+)
+from fairseq.models.fairseq_encoder import EncoderOut
+from fairseq.models.transformer import (
+ base_architecture,
+ transformer_iwslt_de_en,
+ transformer_wmt_en_de_big,
+)
+from fairseq.modules import SinusoidalPositionalEmbedding
+
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_MAX_SOURCE_POSITIONS = 1024
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+
+
@register_model("pipeline_parallel_transformer")
class PipelineParallelTransformerModel(BaseFairseqModel):
    """Transformer whose encoder+decoder stack is partitioned across devices
    with fairscale's ``Pipe`` for pipeline-parallel training.

    During training the whole stack runs inside a single ``Pipe``; for
    inference, ``prepare_for_inference_()`` unpacks the pipeline partitions
    back into separate encoder/decoder modules.
    """

    def __init__(self, encoder, decoder, balance, devices, chunks, checkpoint):
        try:
            from fairscale.nn import Pipe
        except ImportError:
            raise ImportError("Please install fairscale with: pip install fairscale")
        super().__init__()
        assert isinstance(encoder, FairseqEncoder)
        assert isinstance(decoder, FairseqDecoder)
        # Flatten both halves into one sequential list of pipeline stages:
        # [enc embed, enc layers..., enc final LN, dec embed, dec layers..., dec output].
        encoder_module_list = (
            [encoder.embedding_layer]
            + list(encoder.encoder_layers)
            + [encoder.final_layer_norm]
        )
        self.num_encoder_modules = len(encoder_module_list)
        decoder_module_list = (
            [decoder.embedding_layer]
            + list(decoder.decoder_layers)
            + [decoder.decoder_output_layer]
        )
        self.num_decoder_modules = len(decoder_module_list)
        module_list = encoder_module_list + decoder_module_list
        self.devices = devices
        self.model = Pipe(
            nn.Sequential(*module_list),
            balance=balance,
            devices=devices,
            chunks=chunks,
            checkpoint=checkpoint,
        )
        self.encoder_max_positions = self.max_positions_helper(
            encoder.embedding_layer, "max_source_positions"
        )
        self.decoder_max_positions = self.max_positions_helper(
            decoder.embedding_layer, "max_target_positions"
        )
        self.adaptive_softmax = getattr(decoder, "adaptive_softmax", None)
        # Note: To be populated during inference via prepare_for_inference_().
        self.encoder = None
        self.decoder = None

    def forward(self, src_tokens, src_lengths, prev_output_tokens):
        """Run the model on one batch.

        Training uses the Pipe-wrapped stack; evaluation uses the unpacked
        encoder/decoder produced by ``prepare_for_inference_()``.
        """
        if self.training:
            input_lst = [src_tokens, src_lengths, prev_output_tokens]
            # Pipe expects a tuple of tensors already resident on the first device.
            input = tuple(i.to(self.devices[0], non_blocking=True) for i in input_lst)
            return self.model(input)
        else:
            assert self.encoder is not None and self.decoder is not None, (
                "encoder and decoder need to be initialized by "
                + "calling the `prepare_for_inference_()` method"
            )
            # BUG FIX: this branch previously called `self.encoder(input)`,
            # where `input` was the Python builtin (no such local exists
            # here), so inference always crashed; it also passed the encoder
            # output as `prev_output_tokens` to the decoder. Forward the
            # actual batch, matching TransformerEncoder.forward(src_tokens,
            # src_lengths) and TransformerDecoder.forward(prev_output_tokens,
            # encoder_out).
            encoder_output_tuple = self.encoder(src_tokens, src_lengths)
            return self.decoder(prev_output_tokens, encoder_output_tuple)

    def prepare_for_inference_(self, args):
        """Unpack the Pipe partitions into standalone encoder/decoder modules."""
        if self.encoder is not None and self.decoder is not None:
            logger.info("Encoder and Decoder already initialized")
            return
        encoder_module_list = []
        decoder_module_list = []
        module_count = 0
        # Modules were laid out encoder-first (see __init__), so the first
        # num_encoder_modules belong to the encoder.
        for partition in self.model.partitions:
            for module in partition:
                if module_count < self.num_encoder_modules:
                    encoder_module_list.append(module)
                else:
                    decoder_module_list.append(module)
                module_count += 1
        self.model = None
        self.encoder = TransformerEncoder(args, None, None, encoder_module_list)
        self.decoder = TransformerDecoder(
            args, None, None, decoder_module_list=decoder_module_list
        )

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--activation-fn',
                            choices=utils.get_available_activation_fns(),
                            help='activation function to use')
        parser.add_argument('--dropout', type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--attention-dropout', type=float, metavar='D',
                            help='dropout probability for attention weights')
        parser.add_argument('--activation-dropout', '--relu-dropout', type=float, metavar='D',
                            help='dropout probability after activation in FFN.')
        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                            help='path to pre-trained encoder embedding')
        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                            help='encoder embedding dimension')
        parser.add_argument('--encoder-ffn-embed-dim', type=int, metavar='N',
                            help='encoder embedding dimension for FFN')
        parser.add_argument('--encoder-layers', type=int, metavar='N',
                            help='num encoder layers')
        parser.add_argument('--encoder-attention-heads', type=int, metavar='N',
                            help='num encoder attention heads')
        parser.add_argument('--encoder-normalize-before', action='store_true',
                            help='apply layernorm before each encoder block')
        parser.add_argument('--encoder-learned-pos', action='store_true',
                            help='use learned positional embeddings in the encoder')
        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                            help='path to pre-trained decoder embedding')
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension for FFN')
        parser.add_argument('--decoder-layers', type=int, metavar='N',
                            help='num decoder layers')
        parser.add_argument('--decoder-attention-heads', type=int, metavar='N',
                            help='num decoder attention heads')
        parser.add_argument('--decoder-learned-pos', action='store_true',
                            help='use learned positional embeddings in the decoder')
        parser.add_argument('--decoder-normalize-before', action='store_true',
                            help='apply layernorm before each decoder block')
        parser.add_argument('--share-decoder-input-output-embed', action='store_true',
                            help='share decoder input and output embeddings')
        parser.add_argument('--share-all-embeddings', action='store_true',
                            help='share encoder, decoder and output embeddings'
                                 ' (requires shared dictionary and embed dim)')
        parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true',
                            help='if set, disables positional embeddings (outside self attention)')
        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive softmax cutoff points. '
                                 'Must be used with adaptive_loss criterion'),
        parser.add_argument('--adaptive-softmax-dropout', type=float, metavar='D',
                            help='sets adaptive softmax dropout for the tail projections')
        parser.add_argument('--num-embedding-chunks', type=int, metavar='N', default=1,
                            help='Number of embedding layer chunks (enables more even distribution'
                                 'of optimizer states across data parallel nodes'
                                 'when using optimizer state sharding and'
                                 'a big embedding vocabulary)')
        # fmt: on

    @classmethod
    def build_model_base(cls, args, task):
        """Build a new model instance (returns an ``(encoder, decoder)`` pair)."""

        # make sure all arguments are present in older models
        base_architecture(args)

        if not hasattr(args, "max_source_positions"):
            args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
        if not hasattr(args, "max_target_positions"):
            args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS

        src_dict, tgt_dict = task.source_dictionary, task.target_dictionary

        def build_embedding(dictionary, embed_dim, path=None, num_embed_chunks=1):
            # Build one Embedding, or a ModuleList of equally sized chunks
            # when num_embed_chunks > 1 (for optimizer-state sharding).
            assert embed_dim % num_embed_chunks == 0, (
                f"Number of embedding chunks = {num_embed_chunks} should be "
                + f"divisible by the embedding dimension = {embed_dim}"
            )
            assert path is None or num_embed_chunks == 1, (
                "Loading embedding from a path with number of embedding chunks > 1"
                + " is not yet supported"
            )
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            # if provided, load from preloaded dictionaries
            if path:
                emb = Embedding(num_embeddings, embed_dim, padding_idx)
                embed_dict = utils.parse_embedding(path)
                utils.load_embedding(embed_dict, dictionary, emb)
            else:
                embed_chunk_dim = embed_dim // num_embed_chunks
                emb = nn.ModuleList()
                for i in range(num_embed_chunks):
                    emb.append(Embedding(num_embeddings, embed_chunk_dim, padding_idx))
            return emb

        num_embed_chunks = args.num_embedding_chunks
        if args.share_all_embeddings:
            if src_dict != tgt_dict:
                raise ValueError("--share-all-embeddings requires a joined dictionary")
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
                )
            if args.decoder_embed_path and (
                args.decoder_embed_path != args.encoder_embed_path
            ):
                raise ValueError(
                    "--share-all-embeddings not compatible with --decoder-embed-path"
                )
            encoder_embed_tokens = build_embedding(
                src_dict,
                args.encoder_embed_dim,
                args.encoder_embed_path,
                num_embed_chunks,
            )
            # Sharing all embeddings implies sharing decoder input/output too.
            decoder_embed_tokens = encoder_embed_tokens
            args.share_decoder_input_output_embed = True
        else:
            assert args.share_decoder_input_output_embed or num_embed_chunks == 1, (
                "Not sharing decoder I/O embeddings is not yet supported with number of "
                + "embedding chunks > 1"
            )
            encoder_embed_tokens = build_embedding(
                src_dict,
                args.encoder_embed_dim,
                args.encoder_embed_path,
                num_embed_chunks,
            )
            decoder_embed_tokens = build_embedding(
                tgt_dict,
                args.decoder_embed_dim,
                args.decoder_embed_path,
                num_embed_chunks,
            )

        encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
        decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
        return (encoder, decoder)

    @classmethod
    def build_encoder(cls, args, src_dict, embed_tokens):
        return TransformerEncoder(args, src_dict, embed_tokens)

    @classmethod
    def build_decoder(cls, args, tgt_dict, embed_tokens):
        return TransformerDecoder(args, tgt_dict, embed_tokens)

    @classmethod
    def build_model(cls, args, task):
        encoder, decoder = cls.build_model_base(args, task)
        return PipelineParallelTransformerModel(
            encoder=encoder,
            decoder=decoder,
            balance=utils.eval_str_list(args.pipeline_balance, type=int),
            devices=utils.eval_str_list(args.pipeline_devices, type=int),
            chunks=args.pipeline_chunks,
            checkpoint=args.pipeline_checkpoint,
        )

    def output_layer(self, features, **kwargs):
        """Project features to the default output size (typically vocabulary size)."""
        return self.decoder.output_layer(features, **kwargs)

    def max_positions(self):
        """Maximum length supported by the model."""
        return (self.encoder_max_positions, self.decoder_max_positions)

    def max_positions_helper(
        self, embedding_layer, max_positions_field="max_source_positions"
    ):
        """Maximum input length supported by the encoder or decoder."""
        if embedding_layer.embed_positions is None:
            return getattr(embedding_layer, max_positions_field)
        # Positional embeddings may cap the length below the configured max.
        return min(
            getattr(embedding_layer, max_positions_field),
            embedding_layer.embed_positions.max_positions,
        )

    def get_normalized_probs(self, net_output, log_probs, sample=None):
        """Get normalized probabilities (or log probs) from a net's output."""

        if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None:
            if sample is not None:
                assert "target" in sample
                target = sample["target"]
            else:
                target = None
            out = self.adaptive_softmax.get_log_prob(net_output, target=target)
            return out.exp_() if not log_probs else out

        # A Pipe() module returns a tuple of tensors as the output.
        # In this case, the tuple has one element - the output tensor of logits
        logits = net_output if isinstance(net_output, torch.Tensor) else net_output[0]
        if log_probs:
            return utils.log_softmax(logits, dim=-1, onnx_trace=False)
        else:
            return utils.softmax(logits, dim=-1, onnx_trace=False)

    def max_decoder_positions(self):
        """Maximum length supported by the decoder."""
        return self.decoder_max_positions

    def load_state_dict(self, state_dict, strict=True, args=None):
        """Copies parameters and buffers from *state_dict* into this module and
        its descendants.

        Overrides the method in :class:`nn.Module`. Compared with that method
        this additionally "upgrades" *state_dicts* from old checkpoints.
        """
        self.upgrade_state_dict(state_dict)
        # Checkpoints from a non-pipeline transformer have no
        # 'model.partitions' keys; remap them onto the Pipe layout first.
        is_regular_transformer = not any("model.partitions" in k for k in state_dict)
        if is_regular_transformer:
            state_dict = self.convert_to_pipeline_parallel_state_dict(state_dict)
        return super().load_state_dict(state_dict, strict)

    def convert_to_pipeline_parallel_state_dict(self, state_dict):
        """Remap a regular transformer checkpoint onto this model's Pipe
        partition layout, keyed as ``model.partitions.<pid>.<mid>.*``.
        """
        new_state_dict = self.state_dict()
        encoder_layer_idx = 0
        decoder_layer_idx = 0
        encoder_key_suffixes = [
            "self_attn.k_proj.weight",
            "self_attn.k_proj.bias",
            "self_attn.v_proj.weight",
            "self_attn.v_proj.bias",
            "self_attn.q_proj.weight",
            "self_attn.q_proj.bias",
            "self_attn.out_proj.weight",
            "self_attn.out_proj.bias",
            "self_attn_layer_norm.weight",
            "self_attn_layer_norm.bias",
            "fc1.weight",
            "fc1.bias",
            "fc2.weight",
            "fc2.bias",
            "final_layer_norm.weight",
            "final_layer_norm.bias",
        ]
        decoder_key_suffixes = [
            "self_attn.k_proj.weight",
            "self_attn.k_proj.bias",
            "self_attn.v_proj.weight",
            "self_attn.v_proj.bias",
            "self_attn.q_proj.weight",
            "self_attn.q_proj.bias",
            "self_attn.out_proj.weight",
            "self_attn.out_proj.bias",
            "self_attn_layer_norm.weight",
            "self_attn_layer_norm.bias",
            "encoder_attn.k_proj.weight",
            "encoder_attn.k_proj.bias",
            "encoder_attn.v_proj.weight",
            "encoder_attn.v_proj.bias",
            "encoder_attn.q_proj.weight",
            "encoder_attn.q_proj.bias",
            "encoder_attn.out_proj.weight",
            "encoder_attn.out_proj.bias",
            "encoder_attn_layer_norm.weight",
            "encoder_attn_layer_norm.bias",
            "fc1.weight",
            "fc1.bias",
            "fc2.weight",
            "fc2.bias",
            "final_layer_norm.weight",
            "final_layer_norm.bias",
        ]
        for pid, partition in enumerate(self.model.partitions):
            logger.info(f"Begin Partition {pid}")
            for mid, module in enumerate(partition):
                # Each module type maps to a fixed set of source keys; the
                # running layer indices track position within enc/dec stacks.
                # fmt: off
                if isinstance(module, TransformerEncoderEmbedding):
                    new_state_dict[f'model.partitions.{pid}.{mid}.embed_tokens.weight'] = state_dict['encoder.embed_tokens.weight']
                    new_state_dict[f'model.partitions.{pid}.{mid}.embed_positions._float_tensor'] = state_dict['encoder.embed_positions._float_tensor']
                if isinstance(module, TransformerEncoderLayer):
                    for suffix in encoder_key_suffixes:
                        new_state_dict[f'model.partitions.{pid}.{mid}.{suffix}'] = state_dict[f'encoder.layers.{encoder_layer_idx}.{suffix}']
                    encoder_layer_idx += 1
                if isinstance(module, TransformerDecoderLayer):
                    for suffix in decoder_key_suffixes:
                        new_state_dict[f'model.partitions.{pid}.{mid}.{suffix}'] = state_dict[f'decoder.layers.{decoder_layer_idx}.{suffix}']
                    decoder_layer_idx += 1
                if isinstance(module, TransformerEncoderLayerNorm):
                    if 'encoder.layer_norm.weight' in state_dict:
                        new_state_dict[f'model.partitions.{pid}.{mid}.layer_norm.weight'] = state_dict['encoder.layer_norm.weight']
                        new_state_dict[f'model.partitions.{pid}.{mid}.layer_norm.bias'] = state_dict['encoder.layer_norm.bias']
                if isinstance(module, TransformerDecoderEmbedding):
                    new_state_dict[f'model.partitions.{pid}.{mid}.embed_tokens.weight'] = state_dict['decoder.embed_tokens.weight']
                    new_state_dict[f'model.partitions.{pid}.{mid}.embed_positions._float_tensor'] = state_dict['decoder.embed_positions._float_tensor']
                if isinstance(module, TransformerDecoderOutputLayer):
                    new_state_dict[f'model.partitions.{pid}.{mid}.output_projection.weight'] = state_dict['decoder.output_projection.weight']
                # fmt: on
        return new_state_dict
+
+
class TransformerEncoder(FairseqEncoder):
    """
    Transformer encoder consisting of *args.encoder_layers* layers. Each layer
    is a :class:`TransformerEncoderLayer`.

    The encoder is built as a flat pipeline of modules
    (embedding -> layers -> final layer norm) so it can optionally be wrapped
    in fairscale's ``Pipe`` when ``--pipeline-encoder-balance`` is set.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): encoding dictionary
        embed_tokens (torch.nn.Embedding): input embedding
    """

    def __init__(self, args, dictionary, embed_tokens, encoder_module_list=None):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        try:
            from fairscale.nn import Pipe
        except ImportError:
            raise ImportError("Please install fairscale with: pip install fairscale")
        if encoder_module_list is None:
            # Build the standard stage list: embedding -> N layers -> final LN.
            # (A prebuilt list is passed in when unpacking Pipe partitions
            # during prepare_for_inference_.)
            embedding_layer = TransformerEncoderEmbedding(args, embed_tokens)
            layers = [TransformerEncoderLayer(args) for i in range(args.encoder_layers)]
            if isinstance(embed_tokens, nn.ModuleList):
                # Chunked embeddings: total dim is the sum of the chunk dims.
                emb_dim = sum(e.embedding_dim for e in embed_tokens)
            else:
                emb_dim = embed_tokens.embedding_dim
            final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim)
            encoder_module_list = [embedding_layer] + layers + [final_layer_norm]
        # Only wrap in Pipe when an encoder-specific balance was configured.
        self.use_pipeline = getattr(args, "pipeline_encoder_balance", None) is not None
        if self.use_pipeline:
            encoder_balance = utils.eval_str_list(
                args.pipeline_encoder_balance, type=int
            )
            encoder_devices = utils.eval_str_list(
                args.pipeline_encoder_devices, type=int
            )
            assert sum(encoder_balance) == len(encoder_module_list), (
                f"Sum of encoder_balance={encoder_balance} is not equal "
                + f"to num_encoder_modules={len(encoder_module_list)}"
            )
            self.model = Pipe(
                module=nn.Sequential(*encoder_module_list),
                balance=encoder_balance,
                devices=encoder_devices,
                chunks=args.pipeline_chunks,
                checkpoint=args.pipeline_checkpoint,
            )
        else:
            # Plain sequential execution: keep stages as separate attributes
            # so PipelineParallelTransformerModel.__init__ can re-flatten them.
            self.embedding_layer = encoder_module_list[0]
            self.encoder_layers = nn.Sequential(*encoder_module_list[1:-1])
            self.final_layer_norm = encoder_module_list[-1]

    def forward(self, src_tokens, src_lengths):
        """
        Args:
            input_tuple(
                src_tokens (LongTensor): tokens in the source language of shape
                    `(batch, src_len)`
                src_lengths (torch.LongTensor): lengths of each source sentence of
                    shape `(batch)`
            )

        Returns:
            output_tuple(
                - **encoder_out** (Tensor): the last encoder layer's output of
                  shape `(src_len, batch, embed_dim)`
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`
                - prev_output_tokens
                - **encoder_states** (List[Tensor]): all intermediate
                  hidden states of shape `(src_len, batch, embed_dim)`.
                  Only populated if *return_all_hiddens* is True.
            )
        """
        # The pipeline stages thread a fixed-arity tuple through; a dummy
        # prev_output_tokens slot keeps the tuple shape stable.
        dummy_prev_output_tokens = torch.zeros(
            1, dtype=src_tokens.dtype, device=src_tokens.device
        )
        input_tuple = (src_tokens, src_lengths, dummy_prev_output_tokens)
        if self.use_pipeline:
            # Pipe requires inputs on its first device.
            input_tuple = tuple(i.to(self.model.devices[0]) for i in input_tuple)
            encoder_out = self.model(input_tuple)
        else:
            encoder_embed_output_tuple = self.embedding_layer(input_tuple)
            encoder_layers_output = self.encoder_layers(encoder_embed_output_tuple)
            encoder_out = self.final_layer_norm(encoder_layers_output)
        # first element is the encoder output
        # second element is the encoder padding mask
        # the remaining elements of EncoderOut are not computed by
        # the PipelineParallelTransformer
        return EncoderOut(encoder_out[0], encoder_out[1], None, None, None, None)

    def reorder_encoder_out(self, encoder_out, new_order):
        """
        Reorder encoder output according to *new_order*.

        Args:
            encoder_out: output from the ``forward()`` method
            new_order (LongTensor): desired order

        Returns:
            *encoder_out* rearranged according to *new_order*
        """
        # encoder_out is time-major (dim 1 is batch); the masks/embeddings
        # are batch-major (dim 0 is batch) — hence the differing dims below.
        if encoder_out.encoder_out is not None:
            encoder_out = encoder_out._replace(
                encoder_out=encoder_out.encoder_out.index_select(1, new_order)
            )
        if encoder_out.encoder_padding_mask is not None:
            encoder_out = encoder_out._replace(
                encoder_padding_mask=encoder_out.encoder_padding_mask.index_select(
                    0, new_order
                )
            )
        if encoder_out.encoder_embedding is not None:
            encoder_out = encoder_out._replace(
                encoder_embedding=encoder_out.encoder_embedding.index_select(
                    0, new_order
                )
            )
        if encoder_out.encoder_states is not None:
            for idx, state in enumerate(encoder_out.encoder_states):
                encoder_out.encoder_states[idx] = state.index_select(1, new_order)
        return encoder_out

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.embedding_layer.embed_positions is None:
            return self.embedding_layer.max_source_positions
        # Positional embeddings may cap the length below the configured max.
        return min(
            self.embedding_layer.max_source_positions,
            self.embedding_layer.embed_positions.max_positions,
        )
+
+
class TransformerDecoder(FairseqDecoder):
    """
    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`TransformerDecoderLayer`.

    Like :class:`TransformerEncoder`, the decoder is a flat pipeline of
    modules (embedding -> layers -> output layer) optionally wrapped in
    fairscale's ``Pipe`` when ``--pipeline-decoder-balance`` is set.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
    """

    def __init__(
        self,
        args,
        dictionary,
        embed_tokens,
        no_encoder_attn=False,
        decoder_module_list=None,
    ):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        try:
            from fairscale.nn import Pipe
        except ImportError:
            raise ImportError("Please install fairscale with: pip install fairscale")
        if decoder_module_list is None:
            # Build the standard stage list: embedding -> N layers -> output
            # projection. (A prebuilt list is passed in when unpacking Pipe
            # partitions during prepare_for_inference_.)
            embedding_layer = TransformerDecoderEmbedding(args, embed_tokens)
            layers = [
                TransformerDecoderLayer(args, no_encoder_attn)
                for _ in range(args.decoder_layers)
            ]
            decoder_output_layer = TransformerDecoderOutputLayer(
                args, embed_tokens, dictionary
            )
            decoder_module_list = [embedding_layer] + layers + [decoder_output_layer]
        # Only wrap in Pipe when a decoder-specific balance was configured.
        self.use_pipeline = getattr(args, "pipeline_decoder_balance", None) is not None
        if self.use_pipeline:
            decoder_balance = utils.eval_str_list(
                args.pipeline_decoder_balance, type=int
            )
            decoder_devices = utils.eval_str_list(
                args.pipeline_decoder_devices, type=int
            )
            assert sum(decoder_balance) == len(decoder_module_list), (
                f"Sum of decoder_balance={decoder_balance} is not equal "
                + f"to num_decoder_modules={len(decoder_module_list)}"
            )
            self.model = Pipe(
                module=nn.Sequential(*decoder_module_list),
                balance=decoder_balance,
                devices=decoder_devices,
                chunks=args.pipeline_chunks,
                checkpoint=args.pipeline_checkpoint,
            )
        else:
            # Plain sequential execution; stages kept as attributes so
            # PipelineParallelTransformerModel.__init__ can re-flatten them.
            self.embedding_layer = decoder_module_list[0]
            self.decoder_layers = nn.Sequential(*decoder_module_list[1:-1])
            self.decoder_output_layer = decoder_module_list[-1]

    def forward(
        self,
        prev_output_tokens,
        encoder_out=None,
    ):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`
            features_only (bool, optional): only return features without
                applying output layer (default: False).

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        """
        # NOTE(review): despite the docstring above, incremental_state and
        # features_only are not accepted by this implementation, and
        # encoder_out=None would fail at the attribute access below —
        # callers appear expected to always pass an EncoderOut.
        input_tuple = (
            encoder_out.encoder_out,
            encoder_out.encoder_padding_mask,
            prev_output_tokens,
        )
        if self.use_pipeline:
            # Pipe requires inputs on its first device.
            input_tuple = tuple(i.to(self.model.devices[0]) for i in input_tuple)
            return (self.model(input_tuple),)
        else:
            embed_layer_output = self.embedding_layer(input_tuple)
            state = self.decoder_layers(embed_layer_output)
            return (self.decoder_output_layer(state),)

    def output_layer(self, features, **kwargs):
        """Project features to the vocabulary size."""
        # NOTE(review): self.adaptive_softmax, self.share_input_output_embed,
        # self.embed_tokens and self.embed_out are never assigned in this
        # class's visible code — presumably vestigial from the non-pipeline
        # TransformerDecoder; calling this would raise AttributeError. Verify
        # against callers before relying on it.
        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            if self.share_input_output_embed:
                return F.linear(features, self.embed_tokens.weight)
            else:
                return F.linear(features, self.embed_out)
        else:
            return features

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        if self.embedding_layer.embed_positions is None:
            return self.embedding_layer.max_target_positions
        # Positional embeddings may cap the length below the configured max.
        return min(
            self.embedding_layer.max_target_positions,
            self.embedding_layer.embed_positions.max_positions,
        )

    def buffered_future_mask(self, tensor):
        # Lazily (re)build an upper-triangular -inf mask for causal
        # self-attention, cached across calls and resized/moved as needed.
        dim = tensor.size(0)
        if (
            not hasattr(self, "_future_mask")
            or self._future_mask is None
            or self._future_mask.device != tensor.device
            or self._future_mask.size(0) < dim
        ):
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1
            )
        return self._future_mask[:dim, :dim]

    def upgrade_state_dict_named(self, state_dict, name):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        # NOTE(review): self.embed_positions and self.layers are never set in
        # this class's visible code (the pipeline variant stores stages in
        # self.embedding_layer / self.decoder_layers) — this method looks
        # copied from the non-pipeline decoder and would raise
        # AttributeError if invoked; confirm whether it is ever called.
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            weights_key = "{}.embed_positions.weights".format(name)
            if weights_key in state_dict:
                del state_dict[weights_key]
            state_dict[
                "{}.embed_positions._float_tensor".format(name)
            ] = torch.FloatTensor(1)

        for i in range(len(self.layers)):
            # update layer norms
            layer_norm_map = {
                "0": "self_attn_layer_norm",
                "1": "encoder_attn_layer_norm",
                "2": "final_layer_norm",
            }
            for old, new in layer_norm_map.items():
                for m in ("weight", "bias"):
                    k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m)
                    if k in state_dict:
                        state_dict[
                            "{}.layers.{}.{}.{}".format(name, i, new, m)
                        ] = state_dict[k]
                        del state_dict[k]

        version_key = "{}.version".format(name)
        if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2:
            # earlier checkpoints did not normalize after the stack of layers
            self.layer_norm = None
            self.normalize = False
            state_dict[version_key] = torch.Tensor([1])

        return state_dict
+
+
@register_model_architecture(
    "pipeline_parallel_transformer", "transformer_iwslt_de_en_pipeline_parallel"
)
def transformer_iwslt_de_en_dist(args):
    """Pipeline-parallel registration of the transformer_iwslt_de_en
    architecture; fills in that architecture's default hyperparameters."""
    transformer_iwslt_de_en(args)
+
+
@register_model_architecture(
    "pipeline_parallel_transformer", "transformer_wmt_en_de_big_pipeline_parallel"
)
def transformer_wmt_en_de_big_dist(args):
    """Pipeline-parallel registration of the transformer_wmt_en_de_big
    architecture; fills in that architecture's default hyperparameters."""
    transformer_wmt_en_de_big(args)
diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__init__.py b/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..117827c3e9c176477f33e3a6fd7fe19a922411a2
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .model import * # noqa
diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__pycache__/__init__.cpython-310.pyc b/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86eb916eddcdc3781aaf02b7320c293c17dd45e1
Binary files /dev/null and b/fairseq-0.10.2/fairseq/model_parallel/models/roberta/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/__init__.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ddc583ce9946e70655bd09bf2604029c9d390a2
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..46fa438688304fdc89f8fad8308b2896e8c00cc1
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/distributed_fairseq_model.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..643549647692c6cc192d23d03168a4e5f511a7d3
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_decoder.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e3c50bc4650da9745c90d1d0ca817f1a01c4e10
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_encoder.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..489bb4adc7ac86fde068b9a796a06cfc5d74c2cc
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_incremental_decoder.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a80fca170ef22eef58a006b5115af2f4b8edc175
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fairseq_model.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fconv.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fconv.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..acadfc2292198993a872422f37ce0c0ff93f399c
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fconv.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae8753142c67a74f88f7ae0e67f7e1d04eebfce0
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fconv_lm.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cfa043d5f5c1b1197feacfa1476f15c84d8603bc
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/fconv_self_att.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/lightconv.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/lightconv.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..69425245346f14bb98f484715caa190c745e14b7
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/lightconv.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4677bccd8ce34a3ee6e3c9b7c6c070c8286adc40
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/lightconv_lm.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/lstm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/lstm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1ce08d2d28811eb3eb9364e1c7d25bcf21e486a1
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/lstm.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4a287f918f3fd619a82979dbc57fadf5617431b
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/lstm_lm.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/masked_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/masked_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e408484ea10c396dc44c691728d08c0b2ff8175a
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/masked_lm.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/model_utils.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/model_utils.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8084a8b89b9b5554542a892ba96c68ad8a9668d0
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/model_utils.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d42f8d48e03bc4d4383a43cf38b0006c81d9c499
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/multilingual_transformer.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/transformer.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/transformer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f58f1f65706e34ecd620bfd007e3a2a971c8f3c2
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/transformer.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6938891548751ce0bd939a5f945ce33f3ff5ad17
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/__pycache__/transformer_lm.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/bart/__init__.py b/fairseq-0.10.2/fairseq/models/bart/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a701923f7e5a2a8aa9b75e5580ddea22907f53ee
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/bart/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .hub_interface import * # noqa
+from .model import * # noqa
diff --git a/fairseq-0.10.2/fairseq/models/bart/hub_interface.py b/fairseq-0.10.2/fairseq/models/bart/hub_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdabe36010bdfde5680f7fd6439b9b2c56c660bd
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/bart/hub_interface.py
@@ -0,0 +1,201 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import logging
+from typing import List
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.data import encoders
+
+
+logger = logging.getLogger(__name__)
+
+
class BARTHubInterface(nn.Module):
    """A simple PyTorch Hub interface to BART.

    Usage: https://github.com/pytorch/fairseq/tree/master/examples/bart
    """

    def __init__(self, args, task, model):
        super().__init__()
        self.args = args
        self.task = task
        self.model = model

        self.bpe = encoders.build_bpe(args)

        # tightest length limit across the task and the model
        self.max_positions = min(
            utils.resolve_max_positions(
                self.task.max_positions(),
                self.model.max_positions(),
            )
        )

        # this is useful for determining the device
        self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float))

    @property
    def device(self):
        return self._float_tensor.device

    def encode(
        self, sentence: str, *addl_sentences, no_separator=True
    ) -> torch.LongTensor:
        """
        BPE-encode a sentence (or multiple sentences).

        Every sequence begins with a beginning-of-sentence (``<s>``) symbol.
        Every sentence ends with an end-of-sentence (``</s>``).

        Example (single sentence): ``<s> a b c </s>``
        Example (sentence pair): ``<s> d e f </s> 1 2 3 </s>``

        The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE
        requires leading spaces. For example::

            >>> bart.encode('Hello world').tolist()
            [0, 31414, 232, 2]
            >>> bart.encode(' world').tolist()
            [0, 232, 2]
            >>> bart.encode('world').tolist()
            [0, 8331, 2]
        """
        tokens = self.bpe.encode(sentence)
        if len(tokens.split(" ")) > self.max_positions - 2:
            # truncate, reserving two positions for <s> and </s>
            tokens = " ".join(tokens.split(" ")[: self.max_positions - 2])
        # wrap in the <s>/</s> special symbols so the sequence matches the
        # [bos, ..., eos] format shown in the docstring examples (0 ... 2)
        bpe_sentence = "<s> " + tokens + " </s>"
        for s in addl_sentences:
            bpe_sentence += " </s>" if not no_separator else ""
            bpe_sentence += " " + self.bpe.encode(s) + " </s>"
        tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False)
        return tokens.long()

    def decode(self, tokens: torch.LongTensor):
        """Inverse of :func:`encode`: turn a 1-D token tensor back into text.

        Returns a single string, or a list of strings when the sequence
        contains multiple document segments.
        """
        assert tokens.dim() == 1
        tokens = tokens.cpu().numpy()
        if tokens[0] == self.task.source_dictionary.bos():
            tokens = tokens[1:]  # remove <s>
        eos_mask = tokens == self.task.source_dictionary.eos()
        # two consecutive </s> symbols mark a document boundary
        doc_mask = eos_mask[1:] & eos_mask[:-1]
        sentences = np.split(tokens, doc_mask.nonzero()[0] + 1)
        sentences = [
            self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences
        ]
        if len(sentences) == 1:
            return sentences[0]
        return sentences

    def _build_sample(self, src_tokens: List[torch.LongTensor]):
        """Collate a list of token tensors into a model-ready sample placed on
        this interface's device."""
        dataset = self.task.build_dataset_for_inference(
            src_tokens,
            [x.numel() for x in src_tokens],
        )
        sample = dataset.collater(dataset)
        sample = utils.apply_to_sample(lambda tensor: tensor.to(self.device), sample)
        return sample

    def sample(
        self, sentences: List[str], beam: int = 1, verbose: bool = False, **kwargs
    ) -> List[str]:
        """Generate from each input sentence and decode the top hypothesis
        back to a string."""
        input = [self.encode(sentence) for sentence in sentences]
        hypos = self.generate(input, beam, verbose, **kwargs)
        return [self.decode(x["tokens"]) for x in hypos]

    def generate(
        self,
        tokens: List[torch.LongTensor],
        beam: int = 5,
        verbose: bool = False,
        **kwargs
    ) -> torch.LongTensor:
        """Run generation and return the top hypothesis (a dict containing at
        least a ``tokens`` entry) for each input, in input order."""
        sample = self._build_sample(tokens)

        # build generator using current args as well as any kwargs
        gen_args = copy.copy(self.args)
        gen_args.beam = beam
        for k, v in kwargs.items():
            setattr(gen_args, k, v)
        generator = self.task.build_generator([self.model], gen_args)
        translations = self.task.inference_step(
            generator,
            [self.model],
            sample,
            # BART decoding starts from the <s> symbol
            prefix_tokens=sample["net_input"]["src_tokens"]
            .new_zeros((len(tokens), 1))
            .fill_(self.task.source_dictionary.bos()),
        )

        if verbose:
            # NOTE(review): ``self.string`` is not defined on this class, so
            # this branch appears broken -- confirm before relying on verbose
            # output
            src_str_with_unk = self.string(tokens)
            logger.info("S\t{}".format(src_str_with_unk))

        # Process top predictions, restoring the original input order
        hypos = [x[0] for x in translations]
        hypos = [v for _, v in sorted(zip(sample["id"].tolist(), hypos))]
        return hypos

    def extract_features(
        self, tokens: torch.LongTensor, return_all_hiddens: bool = False
    ) -> torch.Tensor:
        """Return decoder features for *tokens*, or every layer's hidden
        states (as ``B x T x C`` tensors) when *return_all_hiddens* is set.

        Raises:
            ValueError: if the input exceeds the model's maximum length
        """
        if tokens.dim() == 1:
            tokens = tokens.unsqueeze(0)
        if tokens.size(-1) > min(self.model.max_positions()):
            raise ValueError(
                "tokens exceeds maximum length: {} > {}".format(
                    tokens.size(-1), self.model.max_positions()
                )
            )
        # Tensor.to() is not in-place: keep the returned tensor (the original
        # code discarded it, leaving `tokens` on its old device)
        tokens = tokens.to(device=self.device)
        prev_output_tokens = tokens.clone()

        # shift right for teacher forcing: the last non-pad token moves to
        # position 0 ...
        prev_output_tokens[:, 0] = tokens.gather(
            1,
            (tokens.ne(self.task.source_dictionary.pad()).sum(dim=1) - 1).unsqueeze(-1),
        ).squeeze()
        # ... and everything else shifts one position to the right
        prev_output_tokens[:, 1:] = tokens[:, :-1]
        features, extra = self.model(
            src_tokens=tokens,
            src_lengths=None,
            prev_output_tokens=prev_output_tokens,
            features_only=True,
            return_all_hiddens=return_all_hiddens,
        )
        if return_all_hiddens:
            # convert from T x B x C -> B x T x C
            inner_states = extra["inner_states"]
            return [inner_state.transpose(0, 1) for inner_state in inner_states]
        else:
            return features  # just the last layer's features

    def register_classification_head(
        self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs
    ):
        """Attach a named classification head to the underlying model."""
        self.model.register_classification_head(
            name, num_classes=num_classes, embedding_size=embedding_size, **kwargs
        )

    def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False):
        """Classify *tokens* with a registered head, pooling the feature at
        the final eos position; returns log-probs unless *return_logits*."""
        if tokens.dim() == 1:
            tokens = tokens.unsqueeze(0)
        features = self.extract_features(tokens.to(device=self.device))
        sentence_representation = features[
            tokens.eq(self.task.source_dictionary.eos()), :
        ].view(features.size(0), -1, features.size(-1))[:, -1, :]

        logits = self.model.classification_heads[head](sentence_representation)
        if return_logits:
            return logits
        return F.log_softmax(logits, dim=-1)
diff --git a/fairseq-0.10.2/fairseq/models/distributed_fairseq_model.py b/fairseq-0.10.2/fairseq/models/distributed_fairseq_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..ece10c6333f486176a8851c4b39b2e6617e37e51
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/distributed_fairseq_model.py
@@ -0,0 +1,103 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import inspect
+
+import torch.nn as nn
+from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel
+
+
+_GOSSIP_DISABLED = False
+try:
+ import gossip
+except ImportError:
+ _GOSSIP_DISABLED = True
+
+
def DistributedFairseqModel(args, model, process_group=None):
    """
    Wrap a *model* to support distributed data parallel training.

    This is similar to the built-in DistributedDataParallel, but allows
    additional configuration of the DistributedDataParallel class to
    use, and also provides easier access to the wrapped model by
    forwarding requests for missing attributes to the wrapped model.

    Args:
        args (argparse.Namespace): fairseq args
        model (BaseFairseqModel): model to wrap
        process_group (optional): process group for gradient communication
            (defaults to the global group)

    Raises:
        ValueError: if the ``--distributed-wrapper``/``--ddp-backend``
            combination is not recognized
        ImportError: if ``--distributed-wrapper=SlowMo`` is requested but the
            gossip library is not installed
    """
    # determine which DDP class to extend
    assert isinstance(model, nn.Module)
    if args.distributed_wrapper == "DDP" and args.ddp_backend == "c10d":
        ddp_class = nn.parallel.DistributedDataParallel
        init_kwargs = dict(
            module=model,
            device_ids=[args.device_id],
            output_device=args.device_id,
            broadcast_buffers=args.broadcast_buffers,
            bucket_cap_mb=args.bucket_cap_mb,
            process_group=process_group,
        )
        # Maintain backward compatibility with torch versions whose DDP lacks
        # these keyword arguments. NOTE: inspect.getargspec() was deprecated
        # and removed in Python 3.11; getfullargspec accepts a class and
        # inspects its __init__, so use it (and call it once).
        ddp_init_args = inspect.getfullargspec(ddp_class).args
        if "check_reduction" in ddp_init_args:
            init_kwargs["check_reduction"] = True
        if "find_unused_parameters" in ddp_init_args:
            init_kwargs["find_unused_parameters"] = args.find_unused_parameters
    elif args.distributed_wrapper == "DDP" and args.ddp_backend == "no_c10d":
        ddp_class = LegacyDistributedDataParallel
        init_kwargs = dict(
            module=model,
            world_size=args.distributed_world_size,
            buffer_size=2 ** 28,  # size of the gradient batching buffer
            process_group=process_group,
        )
    elif args.distributed_wrapper == "SlowMo":
        if _GOSSIP_DISABLED:
            raise ImportError(
                "Cannot find gossip library. Please install from: "
                "github.com/facebookresearch/stochastic_gradient_push"
            )
        ddp_class = gossip.GossipDataParallel

        # The values of slowmo_momentum below were obtained by tuning on the
        # En-De 16 dataset by training the transformer_wmt_en_de_large model
        if args.slowmo_momentum is None:
            if args.distributed_world_size <= 16:
                args.slowmo_momentum = 0.0
            elif args.distributed_world_size <= 32:
                args.slowmo_momentum = 0.2
            elif args.distributed_world_size <= 64:
                args.slowmo_momentum = 0.5
            else:
                args.slowmo_momentum = 0.6

        init_kwargs = dict(
            module=model,
            device_ids=[args.device_id],
            output_device=args.device_id,
            broadcast_buffers=args.broadcast_buffers,
            nprocs_per_node=args.nprocs_per_node,
            slowmo_momentum=args.slowmo_momentum,
            localsgd=(args.slowmo_algorithm == "LocalSGD"),
            localsgd_frequency=args.localsgd_frequency,
        )
    else:
        raise ValueError("Unknown --ddp-backend: " + args.ddp_backend)

    class _DistributedFairseqModel(ddp_class):
        """Extend DistributedDataParallel to check for missing
        attributes in the wrapped module."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

        def __getattr__(self, name):
            # forward attribute lookups that the DDP wrapper itself cannot
            # satisfy to the wrapped module, so callers can use the wrapper
            # transparently
            wrapped_module = super().__getattr__("module")
            if hasattr(wrapped_module, name):
                return getattr(wrapped_module, name)
            return super().__getattr__(name)

    return _DistributedFairseqModel(**init_kwargs)
diff --git a/fairseq-0.10.2/fairseq/models/fairseq_decoder.py b/fairseq-0.10.2/fairseq/models/fairseq_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb6c52dc7ffd95c63e0b43512db398cbb8b91582
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/fairseq_decoder.py
@@ -0,0 +1,90 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Optional, Tuple
+
+import torch.nn as nn
+from fairseq import utils
+from torch import Tensor
+
+
class FairseqDecoder(nn.Module):
    """Common base class for all fairseq decoders."""

    def __init__(self, dictionary):
        super().__init__()
        self.dictionary = dictionary
        self.onnx_trace = False

    def forward(self, prev_output_tokens, encoder_out=None, **kwargs):
        """Run the decoder: extract features, then project to the vocabulary.

        Args:
            prev_output_tokens (LongTensor): shifted target tokens of shape
                `(batch, tgt_len)`, used for teacher forcing
            encoder_out (dict, optional): encoder output, consumed by
                encoder-side attention

        Returns:
            tuple:
                - scores of shape `(batch, tgt_len, vocab)`
                - a dictionary of model-specific extras
        """
        features, extra = self.extract_features(
            prev_output_tokens, encoder_out=encoder_out, **kwargs
        )
        return self.output_layer(features), extra

    def extract_features(self, prev_output_tokens, encoder_out=None, **kwargs):
        """Return `(batch, tgt_len, embed_dim)` features and an extras dict.

        Subclasses must override this.
        """
        raise NotImplementedError

    def output_layer(self, features, **kwargs):
        """Project *features* (from :func:`extract_features`) to the default
        output size, e.g. the vocabulary. Subclasses must override this."""
        raise NotImplementedError

    def get_normalized_probs(
        self,
        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
        log_probs: bool,
        sample: Optional[Dict[str, Tensor]] = None,
    ):
        """Get normalized probabilities (or log probs) from a net's output."""
        adaptive_softmax = getattr(self, "adaptive_softmax", None)
        if adaptive_softmax is not None:
            # adaptive softmax path: probabilities come from its own head
            target = None
            if sample is not None:
                assert "target" in sample
                target = sample["target"]
            log_prob = adaptive_softmax.get_log_prob(net_output[0], target=target)
            return log_prob if log_probs else log_prob.exp_()

        logits = net_output[0]
        if log_probs:
            return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace)
        return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace)

    def max_positions(self):
        """Maximum input length supported by the decoder."""
        return 1e6  # an arbitrary large number

    def upgrade_state_dict(self, state_dict):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        return state_dict

    def prepare_for_onnx_export_(self):
        # switch softmax/log_softmax into their ONNX-traceable variants
        self.onnx_trace = True
diff --git a/fairseq-0.10.2/fairseq/models/fairseq_encoder.py b/fairseq-0.10.2/fairseq/models/fairseq_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8873daa283163881a7dc0190e8b25353abed410
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/fairseq_encoder.py
@@ -0,0 +1,92 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, NamedTuple, Optional
+
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+
# TorchScript-compatible named tuple bundling the outputs of an encoder
# forward pass; per-field shapes are annotated below (T=time, B=batch,
# C=channels).
EncoderOut = NamedTuple(
    "EncoderOut",
    [
        ("encoder_out", Tensor),  # T x B x C
        ("encoder_padding_mask", Optional[Tensor]),  # B x T
        ("encoder_embedding", Optional[Tensor]),  # B x T x C
        ("encoder_states", Optional[List[Tensor]]),  # List[T x B x C]
        ("src_tokens", Optional[Tensor]),  # B x T
        ("src_lengths", Optional[Tensor]),  # B x 1
    ],
)
+
+
class FairseqEncoder(nn.Module):
    """Common base class for all fairseq encoders."""

    def __init__(self, dictionary):
        super().__init__()
        self.dictionary = dictionary

    def forward(self, src_tokens, src_lengths=None, **kwargs):
        """Encode a batch of source sentences.

        Args:
            src_tokens (LongTensor): source tokens of shape
                `(batch, src_len)`
            src_lengths (LongTensor): length of each source sentence,
                shape `(batch)`
        """
        raise NotImplementedError

    def forward_torchscript(self, net_input: Dict[str, Tensor]):
        """A TorchScript-compatible version of forward.

        Encoders which use additional arguments may want to override
        this method for TorchScript compatibility.
        """
        if torch.jit.is_scripting():
            return self.forward(
                src_tokens=net_input["src_tokens"],
                src_lengths=net_input["src_lengths"],
            )
        else:
            return self.forward_non_torchscript(net_input)

    @torch.jit.unused
    def forward_non_torchscript(self, net_input: Dict[str, Tensor]):
        # drop the decoder-side input before delegating to forward()
        encoder_input = dict(net_input)
        encoder_input.pop("prev_output_tokens", None)
        return self.forward(**encoder_input)

    def reorder_encoder_out(self, encoder_out, new_order):
        """Reorder *encoder_out* (as produced by :func:`forward`) according to
        *new_order* (LongTensor) and return the rearranged result."""
        raise NotImplementedError

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return 1e6  # an arbitrary large number

    def upgrade_state_dict(self, state_dict):
        """Upgrade a (possibly old) state dict for new versions of fairseq."""
        return state_dict

    def set_num_updates(self, num_updates):
        """State from trainer to pass along to model at every update."""

        def _propagate(module):
            # skip self to avoid infinite recursion through apply()
            if module is not self and hasattr(module, "set_num_updates"):
                module.set_num_updates(num_updates)

        self.apply(_propagate)
diff --git a/fairseq-0.10.2/fairseq/models/fairseq_model.py b/fairseq-0.10.2/fairseq/models/fairseq_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..092fba43ce16beb479412394b4efcb8e4a07bfbe
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/fairseq_model.py
@@ -0,0 +1,556 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Base classes for various fairseq models.
+"""
+
+import logging
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+# from fairseq.checkpoint_utils import prune_state_dict
+from fairseq.data import Dictionary
+from fairseq.dataclass.utils import gen_parser_from_dataclass
+from fairseq.models import FairseqDecoder, FairseqEncoder
+from torch import Tensor
+
+
+logger = logging.getLogger(__name__)
+
+
+class BaseFairseqModel(nn.Module):
+ """Base class for fairseq models."""
+
+ def __init__(self):
+ super().__init__()
+ self._is_generation_fast = False
+
+ @classmethod
+ def add_args(cls, parser):
+ """Add model-specific arguments to the parser."""
+ dc = getattr(cls, "__dataclass", None)
+ if dc is not None:
+ # do not set defaults so that settings defaults from various architectures still works
+ gen_parser_from_dataclass(parser, dc(), delete_default=True)
+
+ @classmethod
+ def build_model(cls, args, task):
+ """Build a new model instance."""
+ raise NotImplementedError("Model must implement the build_model method")
+
+ def get_targets(self, sample, net_output):
+ """Get targets from either the sample or the net's output."""
+ return sample["target"]
+
+ def get_normalized_probs(
+ self,
+ net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+ log_probs: bool,
+ sample: Optional[Dict[str, Tensor]] = None,
+ ):
+ """Get normalized probabilities (or log probs) from a net's output."""
+ return self.get_normalized_probs_scriptable(net_output, log_probs, sample)
+
+ # TorchScript doesn't support super() method so that the scriptable Subclass
+ # can't access the base class model in Torchscript.
+ # Current workaround is to add a helper function with different name and
+ # call the helper function from scriptable Subclass.
+ def get_normalized_probs_scriptable(
+ self,
+ net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+ log_probs: bool,
+ sample: Optional[Dict[str, Tensor]] = None,
+ ):
+ """Scriptable helper function for get_normalized_probs in ~BaseFairseqModel"""
+ if hasattr(self, "decoder"):
+ return self.decoder.get_normalized_probs(net_output, log_probs, sample)
+ elif torch.is_tensor(net_output):
+ # syntactic sugar for simple models which don't have a decoder
+ # (e.g., the classification tutorial)
+ logits = net_output.float()
+ if log_probs:
+ return F.log_softmax(logits, dim=-1)
+ else:
+ return F.softmax(logits, dim=-1)
+ raise NotImplementedError
+
+ def extract_features(self, *args, **kwargs):
+ """Similar to *forward* but only return features."""
+ return self(*args, **kwargs)
+
+ def max_positions(self):
+ """Maximum length supported by the model."""
+ return None
+
+ def load_state_dict(self, state_dict, strict=True, args=None):
+ """Copies parameters and buffers from *state_dict* into this module and
+ its descendants.
+
+ Overrides the method in :class:`nn.Module`. Compared with that method
+ this additionally "upgrades" *state_dicts* from old checkpoints.
+ """
+ self.upgrade_state_dict(state_dict)
+ from fairseq.checkpoint_utils import prune_state_dict
+ new_state_dict = prune_state_dict(state_dict, args)
+ return super().load_state_dict(new_state_dict, strict)
+
+ def upgrade_state_dict(self, state_dict):
+ """Upgrade old state dicts to work with newer code."""
+ self.upgrade_state_dict_named(state_dict, "")
+
+ def upgrade_state_dict_named(self, state_dict, name):
+ """Upgrade old state dicts to work with newer code.
+
+ Args:
+ state_dict (dict): state dictionary to upgrade, in place
+ name (str): the state dict key corresponding to the current module
+ """
+ assert state_dict is not None
+
+ def do_upgrade(m, prefix):
+ if len(prefix) > 0:
+ prefix += "."
+
+ for n, c in m.named_children():
+ name = prefix + n
+ if hasattr(c, "upgrade_state_dict_named"):
+ c.upgrade_state_dict_named(state_dict, name)
+ elif hasattr(c, "upgrade_state_dict"):
+ c.upgrade_state_dict(state_dict)
+ do_upgrade(c, name)
+
+ do_upgrade(self, name)
+
+ def set_num_updates(self, num_updates):
+ """State from trainer to pass along to model at every update."""
+
+ def _apply(m):
+ if hasattr(m, "set_num_updates") and m != self:
+ m.set_num_updates(num_updates)
+
+ self.apply(_apply)
+
+ def prepare_for_inference_(self, args):
+ """Prepare model for inference."""
+ kwargs = {}
+ kwargs["beamable_mm_beam_size"] = (
+ None if getattr(args, "no_beamable_mm", False) else getattr(args, "beam", 5)
+ )
+ kwargs["need_attn"] = getattr(args, "print_alignment", False)
+ if hasattr(args, "retain_dropout"):
+ kwargs["retain_dropout"] = args.retain_dropout
+ kwargs["retain_dropout_modules"] = getattr(
+ args, "retain_dropout_modules", None
+ )
+ self.make_generation_fast_(**kwargs)
+
+ def make_generation_fast_(self, **kwargs):
+ """
+ Legacy entry point to optimize model for faster generation.
+ Prefer prepare_for_inference_.
+ """
+ if self._is_generation_fast:
+ return # only apply once
+ self._is_generation_fast = True
+
+ # remove weight norm from all modules in the network
+ def apply_remove_weight_norm(module):
+ try:
+ nn.utils.remove_weight_norm(module)
+ except (AttributeError, ValueError): # this module didn't have weight norm
+ return
+
+ self.apply(apply_remove_weight_norm)
+
+ def apply_make_generation_fast_(module, prefix):
+ if len(prefix) > 0:
+ prefix += "."
+
+ base_func = BaseFairseqModel.make_generation_fast_
+ for n, m in module.named_modules():
+ if (
+ m != self
+ and hasattr(m, "make_generation_fast_")
+ # don't call this implementation again, e.g., if
+ # children modules also inherit from BaseFairseqModel
+ and m.make_generation_fast_.__func__ is not base_func
+ ):
+ name = prefix + n
+ m.make_generation_fast_(name=name, **kwargs)
+
+ apply_make_generation_fast_(self, "")
+
+ def train(mode=True):
+ if mode:
+ raise RuntimeError("cannot train after make_generation_fast")
+
+ # this model should no longer be used for training
+ self.eval()
+ self.train = train
+
+ def prepare_for_onnx_export_(self, **kwargs):
+ """Make model exportable via ONNX trace."""
+ seen = set()
+
+ def apply_prepare_for_onnx_export_(module):
+ if (
+ module != self
+ and hasattr(module, "prepare_for_onnx_export_")
+ and module not in seen
+ ):
+ seen.add(module)
+ module.prepare_for_onnx_export_(**kwargs)
+
+ self.apply(apply_prepare_for_onnx_export_)
+
+ def prepare_for_tpu_(self, **kwargs):
+ """Optionally modify model for use on TPUs."""
+ seen = set()
+
+ def apply_prepare_for_tpu_(module):
+ if (
+ module != self
+ and hasattr(module, "prepare_for_tpu_")
+ and module not in seen
+ ):
+ seen.add(module)
+ module.prepare_for_tpu_(**kwargs)
+
+ self.apply(apply_prepare_for_tpu_)
+
+ @classmethod
+ def upgrade_args(cls, args):
+ if hasattr(args, "max_sentences") and not hasattr(args, "batch_size"):
+ args.batch_size = args.max_sentences
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_name_or_path,
+        checkpoint_file="model.pt",
+        data_name_or_path=".",
+        **kwargs,
+    ):
+        """
+        Load a :class:`~fairseq.models.FairseqModel` from a pre-trained model
+        file. Downloads and caches the pre-trained model file if needed.
+
+        The base implementation returns a
+        :class:`~fairseq.hub_utils.GeneratorHubInterface`, which can be used to
+        generate translations or sample from language models. The underlying
+        :class:`~fairseq.models.FairseqModel` can be accessed via the
+        *generator.models* attribute.
+
+        Other models may override this to implement custom hub interfaces.
+
+        Args:
+            model_name_or_path (str): either the name of a pre-trained model to
+                load or a path/URL to a pre-trained model state dict
+            checkpoint_file (str, optional): colon-separated list of checkpoint
+                files in the model archive to ensemble (default: 'model.pt')
+            data_name_or_path (str, optional): point args.data to the archive
+                at the given path/URL. Can start with '.' or './' to reuse the
+                model archive path.
+        """
+        from fairseq import hub_utils
+
+        x = hub_utils.from_pretrained(
+            model_name_or_path,
+            checkpoint_file,
+            data_name_or_path,
+            archive_map=cls.hub_models(),
+            **kwargs,
+        )
+
+        # normalize legacy argument names from older checkpoints
+        # (e.g. max_sentences -> batch_size)
+        cls.upgrade_args(x["args"])
+
+        logger.info(x["args"])
+        return hub_utils.GeneratorHubInterface(x["args"], x["task"], x["models"])
+
+    @classmethod
+    def hub_models(cls):
+        """Archive map used by :func:`from_pretrained`; empty by default,
+        subclasses may override."""
+        return {}
+
+
+class FairseqEncoderDecoderModel(BaseFairseqModel):
+    """Base class for encoder-decoder models.
+
+    Args:
+        encoder (FairseqEncoder): the encoder
+        decoder (FairseqDecoder): the decoder
+    """
+
+    def __init__(self, encoder, decoder):
+        super().__init__()
+
+        self.encoder = encoder
+        self.decoder = decoder
+        # both components must implement the fairseq encoder/decoder interfaces
+        assert isinstance(self.encoder, FairseqEncoder)
+        assert isinstance(self.decoder, FairseqDecoder)
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
+        """
+        Run the forward pass for an encoder-decoder model.
+
+        First feed a batch of source tokens through the encoder. Then, feed the
+        encoder output and previous decoder outputs (i.e., teacher forcing) to
+        the decoder to produce the next outputs::
+
+            encoder_out = self.encoder(src_tokens, src_lengths)
+            return self.decoder(prev_output_tokens, encoder_out)
+
+        Args:
+            src_tokens (LongTensor): tokens in the source language of shape
+                `(batch, src_len)`
+            src_lengths (LongTensor): source sentence lengths of shape `(batch)`
+            prev_output_tokens (LongTensor): previous decoder outputs of shape
+                `(batch, tgt_len)`, for teacher forcing
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, tgt_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+        decoder_out = self.decoder(
+            prev_output_tokens, encoder_out=encoder_out, **kwargs
+        )
+        return decoder_out
+
+    def forward_decoder(self, prev_output_tokens, **kwargs):
+        """Run only the decoder on *prev_output_tokens*."""
+        return self.decoder(prev_output_tokens, **kwargs)
+
+    def extract_features(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
+        """
+        Similar to *forward* but only return features.
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs)
+        features = self.decoder.extract_features(
+            prev_output_tokens, encoder_out=encoder_out, **kwargs
+        )
+        return features
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the default output size (typically vocabulary size)."""
+        return self.decoder.output_layer(features, **kwargs)
+
+    def max_positions(self):
+        """Maximum length supported by the model (encoder, decoder) tuple."""
+        return (self.encoder.max_positions(), self.decoder.max_positions())
+
+    def max_decoder_positions(self):
+        """Maximum length supported by the decoder."""
+        return self.decoder.max_positions()
+
+
+class FairseqModel(FairseqEncoderDecoderModel):
+    """Deprecated alias of :class:`FairseqEncoderDecoderModel`; emits a
+    deprecation warning on construction."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        utils.deprecation_warning(
+            "FairseqModel is deprecated, please use FairseqEncoderDecoderModel "
+            "or BaseFairseqModel instead",
+            stacklevel=4,
+        )
+
+
+class FairseqMultiModel(BaseFairseqModel):
+    """Base class for combining multiple encoder-decoder models."""
+
+    def __init__(self, encoders, decoders):
+        super().__init__()
+        # *encoders* and *decoders* are dicts and must share exactly the same keys
+        assert encoders.keys() == decoders.keys()
+        self.keys = list(encoders.keys())
+        for key in self.keys:
+            assert isinstance(encoders[key], FairseqEncoder)
+            assert isinstance(decoders[key], FairseqDecoder)
+
+        # one full encoder-decoder model per key
+        self.models = nn.ModuleDict(
+            {
+                key: FairseqEncoderDecoderModel(encoders[key], decoders[key])
+                for key in self.keys
+            }
+        )
+
+    @staticmethod
+    def build_shared_embeddings(
+        dicts: Dict[str, Dictionary],
+        langs: List[str],
+        embed_dim: int,
+        build_embedding: callable,
+        pretrained_embed_path: Optional[str] = None,
+    ):
+        """
+        Helper function to build shared embeddings for a set of languages after
+        checking that all dicts corresponding to those languages are equivalent.
+
+        Args:
+            dicts: Dict of lang_id to its corresponding Dictionary
+            langs: languages that we want to share embeddings for
+            embed_dim: embedding dimension
+            build_embedding: callable function to actually build the embedding
+            pretrained_embed_path: Optional path to load pretrained embeddings
+        """
+        shared_dict = dicts[langs[0]]
+        # sharing is only valid when every language uses an identical dictionary
+        if any(dicts[lang] != shared_dict for lang in langs):
+            raise ValueError(
+                "--share-*-embeddings requires a joined dictionary: "
+                "--share-encoder-embeddings requires a joined source "
+                "dictionary, --share-decoder-embeddings requires a joined "
+                "target dictionary, and --share-all-embeddings requires a "
+                "joint source + target dictionary."
+            )
+        return build_embedding(shared_dict, embed_dim, pretrained_embed_path)
+
+    def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs):
+        # subclasses decide how the per-key models are combined
+        raise NotImplementedError
+
+    def max_positions(self):
+        """Maximum length supported by the model, per key."""
+        return {
+            key: (
+                self.models[key].encoder.max_positions(),
+                self.models[key].decoder.max_positions(),
+            )
+            for key in self.keys
+        }
+
+    def max_decoder_positions(self):
+        """Maximum length supported by the decoder (min across all models)."""
+        return min(model.decoder.max_positions() for model in self.models.values())
+
+    @property
+    def encoder(self):
+        # convenience accessor: the first model's encoder
+        return self.models[self.keys[0]].encoder
+
+    @property
+    def decoder(self):
+        # convenience accessor: the first model's decoder
+        return self.models[self.keys[0]].decoder
+
+    def forward_decoder(self, prev_output_tokens, **kwargs):
+        """Run only the (first model's) decoder on *prev_output_tokens*."""
+        return self.decoder(prev_output_tokens, **kwargs)
+
+    def load_state_dict(self, state_dict, strict=True, args=None):
+        """Copies parameters and buffers from *state_dict* into this module and
+        its descendants.
+
+        Overrides the method in :class:`nn.Module`. Compared with that method
+        this additionally "upgrades" *state_dicts* from old checkpoints.
+        """
+        self.upgrade_state_dict(state_dict)
+        from fairseq.checkpoint_utils import prune_state_dict
+        # optionally drop entries from the state dict according to *args*
+        # before delegating to nn.Module.load_state_dict
+        new_state_dict = prune_state_dict(state_dict, args)
+        return super().load_state_dict(new_state_dict, strict)
+
+
+class FairseqLanguageModel(BaseFairseqModel):
+    """Base class for decoder-only models.
+
+    Args:
+        decoder (FairseqDecoder): the decoder
+    """
+
+    def __init__(self, decoder):
+        super().__init__()
+        self.decoder = decoder
+        assert isinstance(self.decoder, FairseqDecoder)
+
+    def forward(self, src_tokens, **kwargs):
+        """
+        Run the forward pass for a decoder-only model.
+
+        Feeds a batch of tokens through the decoder to predict the next tokens.
+
+        Args:
+            src_tokens (LongTensor): tokens on which to condition the decoder,
+                of shape `(batch, tgt_len)`
+            src_lengths (LongTensor): source sentence lengths of shape `(batch)`
+
+        Returns:
+            tuple:
+                - the decoder's output of shape `(batch, seq_len, vocab)`
+                - a dictionary with any model-specific outputs
+        """
+        return self.decoder(src_tokens, **kwargs)
+
+    def forward_decoder(self, prev_output_tokens, **kwargs):
+        """Alias of :func:`forward` used by sequence generators."""
+        return self.decoder(prev_output_tokens, **kwargs)
+
+    def extract_features(self, src_tokens, **kwargs):
+        """
+        Similar to *forward* but only return features.
+
+        Returns:
+            tuple:
+                - the decoder's features of shape `(batch, seq_len, embed_dim)`
+                - a dictionary with any model-specific outputs
+        """
+        return self.decoder.extract_features(src_tokens, **kwargs)
+
+    def output_layer(self, features, **kwargs):
+        """Project features to the default output size (typically vocabulary size)."""
+        return self.decoder.output_layer(features, **kwargs)
+
+    def max_positions(self):
+        """Maximum length supported by the model."""
+        return self.decoder.max_positions()
+
+    def max_decoder_positions(self):
+        """Maximum length supported by the decoder."""
+        return self.decoder.max_positions()
+
+    @property
+    def supported_targets(self):
+        # language models predict the "future" target only
+        return {"future"}
+
+
+class FairseqEncoderModel(BaseFairseqModel):
+    """Base class for encoder-only models.
+
+    Args:
+        encoder (FairseqEncoder): the encoder
+    """
+
+    def __init__(self, encoder):
+        super().__init__()
+        self.encoder = encoder
+        assert isinstance(self.encoder, FairseqEncoder)
+
+    def forward(self, src_tokens, src_lengths, **kwargs):
+        """
+        Run the forward pass for a encoder-only model.
+
+        Feeds a batch of tokens through the encoder to generate features.
+
+        Args:
+            src_tokens (LongTensor): input tokens of shape `(batch, src_len)`
+            src_lengths (LongTensor): source sentence lengths of shape `(batch)`
+
+        Returns:
+            the encoder's output, typically of shape `(batch, src_len, features)`
+        """
+        return self.encoder(src_tokens, src_lengths, **kwargs)
+
+    def get_normalized_probs(self, net_output, log_probs, sample=None):
+        """Get normalized probabilities (or log probs) from a net's output.
+
+        Only supports a plain tensor under ``net_output["encoder_out"]``;
+        subclasses with richer outputs must override.
+        """
+        encoder_out = net_output["encoder_out"]
+        if torch.is_tensor(encoder_out):
+            # softmax in float32 for numerical stability
+            logits = encoder_out.float()
+            if log_probs:
+                return F.log_softmax(logits, dim=-1)
+            else:
+                return F.softmax(logits, dim=-1)
+        raise NotImplementedError
+
+    def max_positions(self):
+        """Maximum length supported by the model."""
+        return self.encoder.max_positions()
diff --git a/fairseq-0.10.2/fairseq/models/huggingface/hf_gpt2.py b/fairseq-0.10.2/fairseq/models/huggingface/hf_gpt2.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a8eb78198f5808557092f814e92f1c9d72933ec
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/huggingface/hf_gpt2.py
@@ -0,0 +1,168 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+import os
+import sys
+from typing import Dict, List, Optional
+
+import torch
+from fairseq.models import (
+ FairseqIncrementalDecoder,
+ FairseqLanguageModel,
+ register_model,
+ register_model_architecture,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_MAX_TARGET_POSITIONS = 1024
+
+
+@register_model("hf_gpt2")
+class HuggingFaceGPT2LanguageModel(FairseqLanguageModel):
+    """Wraps a HuggingFace GPT-2 model as a fairseq decoder-only language model."""
+
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # fmt: off
+        parser.add_argument('--embed-dim', type=int, metavar='N',
+                            help='embedding dimension')
+        parser.add_argument('--num-attention-heads', type=int, metavar='N',
+                            help='num attention heads')
+        parser.add_argument('--num-layers', type=int, metavar='N',
+                            help='num layers')
+        parser.add_argument('--dropout', type=float, metavar='D',
+                            help='dropout probability for all fully connected layers '
+                                 'in the embeddings, encoder, and pooler')
+        parser.add_argument('--attention-dropout', type=float, metavar='D',
+                            help='dropout probability for attention weights')
+        # fmt: on
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        # fill in any missing args with the base architecture defaults
+        default_architecture(args)
+        return cls(HuggingFaceGPT2Decoder(args, task))
+
+
+class HuggingFaceGPT2Decoder(FairseqIncrementalDecoder):
+    """Fairseq incremental decoder backed by a HuggingFace GPT2LMHeadModel."""
+
+    def __init__(self, args, task):
+        # transformers is an optional dependency; fail with install instructions
+        try:
+            from transformers import GPT2Config, GPT2LMHeadModel
+        except ImportError:
+            raise ImportError(
+                "\n\nPlease install huggingface/transformers with:"
+                "\n\n  pip install transformers"
+            )
+
+        super().__init__(task.target_dictionary)
+
+        config = GPT2Config(
+            vocab_size=len(task.target_dictionary),
+            # position 0 is reserved (zeroed below) and real positions start
+            # at 1, hence the +1
+            n_positions=args.max_target_positions + 1,
+            n_ctx=args.max_target_positions,
+            n_embd=args.embed_dim,
+            n_layer=args.num_layers,
+            n_head=args.num_attention_heads,
+            resid_pdrop=args.dropout,
+            embd_pdrop=args.dropout,
+            attn_pdrop=args.attention_dropout,
+            layer_norm_epsilon=1e-6,
+        )
+        self.model = GPT2LMHeadModel(config)
+
+        # set zero embedding for padding symbol
+        self.pad_idx = task.target_dictionary.pad()
+        self.model.transformer.wte.weight.data[self.pad_idx].zero_()
+        self.model.transformer.wpe.weight.data[0].zero_()
+
+    def forward(
+        self,
+        prev_output_tokens,
+        src_lengths=None,
+        incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
+        encoder_out=None,
+    ):
+        features = self.extract_features(prev_output_tokens, incremental_state)
+        lm_logits = self.model.lm_head(features)
+        # return a 1-tuple containing the logits
+        return (lm_logits,)
+
+    def extract_features(
+        self,
+        prev_output_tokens,
+        incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
+    ):
+        if incremental_state:
+            # NOTE(review): get_incremental_state is called without the
+            # incremental_state dict while set_incremental_state below passes
+            # it -- confirm against the FairseqIncrementalDecoder API in use.
+            past = self.get_incremental_state("past")
+        else:
+            past = None
+
+        # don't attend to padding symbols
+        attention_mask = prev_output_tokens.ne(self.pad_idx).int()
+
+        # set position ids to exclude padding symbols; positions start at 1
+        # (index 0 is the zeroed padding position)
+        position_ids = attention_mask * (
+            torch.arange(1, 1 + prev_output_tokens.size(1))
+            .to(prev_output_tokens)
+            .repeat(prev_output_tokens.size(0), 1)
+        )
+
+        # NOTE(review): the `past` kwarg matches older transformers releases
+        # (later renamed `past_key_values`) -- verify against the installed
+        # transformers version.
+        outputs = self.model.transformer(
+            input_ids=prev_output_tokens,
+            past=past,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+        )
+        last_hidden_states = outputs[0]
+
+        if incremental_state:
+            # cache the key/value states for the next incremental step
+            self.set_incremental_state(incremental_state, "past", outputs[1])
+
+        return last_hidden_states
+
+    def max_positions(self):
+        # one position is reserved for padding (see n_positions = max + 1)
+        return self.model.config.n_positions - 1
+
+
+@register_model_architecture("hf_gpt2", "hf_gpt2")
+def default_architecture(args):
+    """Fill in defaults for the base hf_gpt2 architecture."""
+    if getattr(args, "max_target_positions", None) is None:
+        # fall back to the task's tokens-per-sample setting
+        args.max_target_positions = getattr(
+            args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
+        )
+    args.embed_dim = getattr(args, "embed_dim", 768)
+    args.num_attention_heads = getattr(args, "num_attention_heads", 12)
+    args.num_layers = getattr(args, "num_layers", 12)
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
+
+
+@register_model_architecture("hf_gpt2", "hf_gpt2_medium")
+def hf_gpt2_medium(args):
+    """hf_gpt2_medium: larger size overrides, remaining defaults from base."""
+    args.embed_dim = getattr(args, "embed_dim", 1024)
+    args.num_attention_heads = getattr(args, "num_attention_heads", 16)
+    args.num_layers = getattr(args, "num_layers", 24)
+    default_architecture(args)
+
+
+@register_model_architecture("hf_gpt2", "hf_gpt2_large")
+def hf_gpt2_large(args):
+    """hf_gpt2_large: larger size overrides, remaining defaults from base."""
+    args.embed_dim = getattr(args, "embed_dim", 1280)
+    args.num_attention_heads = getattr(args, "num_attention_heads", 20)
+    args.num_layers = getattr(args, "num_layers", 36)
+    default_architecture(args)
+
+
+@register_model_architecture("hf_gpt2", "hf_gpt2_xl")
+def hf_gpt2_xl(args):
+    """hf_gpt2_xl: largest size overrides, remaining defaults from base."""
+    args.embed_dim = getattr(args, "embed_dim", 1600)
+    args.num_attention_heads = getattr(args, "num_attention_heads", 25)
+    args.num_layers = getattr(args, "num_layers", 48)
+    default_architecture(args)
diff --git a/fairseq-0.10.2/fairseq/models/lightconv_lm.py b/fairseq-0.10.2/fairseq/models/lightconv_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d9efc4e42a5ecc1b83338055f18ade5a83ea666
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/lightconv_lm.py
@@ -0,0 +1,306 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq import utils
+from fairseq.models import (
+ FairseqLanguageModel,
+ register_model,
+ register_model_architecture,
+)
+from fairseq.models.lightconv import Embedding, LightConvDecoder
+from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder
+
+
+@register_model("lightconv_lm")
+class LightConvLanguageModel(FairseqLanguageModel):
+    """Decoder-only language model built on LightConv/DynamicConv layers."""
+
+    def __init__(self, decoder):
+        super().__init__(decoder)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        parser.add_argument(
+            "--dropout",
+            default=0.1,
+            type=float,
+            metavar="D",
+            help="dropout probability",
+        )
+        parser.add_argument(
+            "--attention-dropout",
+            default=0.0,
+            type=float,
+            metavar="D",
+            help="dropout probability for attention weights",
+        )
+        parser.add_argument(
+            "--relu-dropout",
+            default=0.0,
+            type=float,
+            metavar="D",
+            help="dropout probability after ReLU in FFN",
+        )
+        parser.add_argument(
+            "--input-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability of the inputs",
+        )
+        parser.add_argument(
+            "--decoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension",
+        )
+        parser.add_argument(
+            "--decoder-output-dim",
+            type=int,
+            metavar="N",
+            help="decoder output dimension",
+        )
+        parser.add_argument(
+            "--decoder-input-dim", type=int, metavar="N", help="decoder input dimension"
+        )
+        parser.add_argument(
+            "--decoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="decoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
+        )
+        parser.add_argument(
+            "--decoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num decoder attention heads or LightConv/DynamicConv heads",
+        )
+        parser.add_argument(
+            "--decoder-normalize-before",
+            default=False,
+            action="store_true",
+            help="apply layernorm before each decoder block",
+        )
+        parser.add_argument(
+            "--adaptive-softmax-cutoff",
+            metavar="EXPR",
+            help="comma separated list of adaptive softmax cutoff points. "
+            "Must be used with adaptive_loss criterion",
+        )
+        parser.add_argument(
+            "--adaptive-softmax-dropout",
+            type=float,
+            metavar="D",
+            help="sets adaptive softmax dropout for the tail projections",
+        )
+        parser.add_argument(
+            "--adaptive-softmax-factor",
+            type=float,
+            metavar="N",
+            help="adaptive input factor",
+        )
+        parser.add_argument(
+            "--no-token-positional-embeddings",
+            default=False,
+            action="store_true",
+            help="if set, disables positional embeddings (outside self attention)",
+        )
+        parser.add_argument(
+            "--share-decoder-input-output-embed",
+            default=False,
+            action="store_true",
+            help="share decoder input and output embeddings",
+        )
+        parser.add_argument(
+            "--character-embeddings",
+            default=False,
+            action="store_true",
+            help="if set, uses character embedding convolutions to produce token embeddings",
+        )
+        parser.add_argument(
+            "--character-filters",
+            type=str,
+            metavar="LIST",
+            default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
+            help="size of character embeddings",
+        )
+        parser.add_argument(
+            "--character-embedding-dim",
+            type=int,
+            metavar="N",
+            default=4,
+            help="size of character embeddings",
+        )
+        parser.add_argument(
+            "--char-embedder-highway-layers",
+            type=int,
+            metavar="N",
+            default=2,
+            help="number of highway layers for character token embeddder",
+        )
+        parser.add_argument(
+            "--adaptive-input",
+            default=False,
+            action="store_true",
+            help="if set, uses adaptive input",
+        )
+        parser.add_argument(
+            "--adaptive-input-factor",
+            type=float,
+            metavar="N",
+            help="adaptive input factor",
+        )
+        parser.add_argument(
+            "--adaptive-input-cutoff",
+            metavar="EXPR",
+            help="comma separated list of adaptive input cutoff points.",
+        )
+        parser.add_argument(
+            "--tie-adaptive-weights",
+            action="store_true",
+            help="if set, ties the weights of adaptive softmax and adaptive input",
+        )
+        parser.add_argument(
+            "--tie-adaptive-proj",
+            action="store_true",
+            help="if set, ties the projection weights of adaptive softmax and adaptive input",
+        )
+        parser.add_argument(
+            "--decoder-learned-pos",
+            action="store_true",
+            help="use learned positional embeddings in the decoder",
+        )
+
+        """LightConv and DynamicConv arguments"""
+        parser.add_argument(
+            "--decoder-kernel-size-list",
+            type=lambda x: utils.eval_str_list(x, int),
+            help='list of kernel size (default: "[3,7,15,31,31,31]")',
+        )
+        parser.add_argument(
+            "--decoder-glu", type=utils.eval_bool, help="glu after in proj"
+        )
+        parser.add_argument(
+            "--decoder-conv-type",
+            default="dynamic",
+            type=str,
+            choices=["dynamic", "lightweight"],
+            help="type of convolution",
+        )
+        parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool)
+        parser.add_argument(
+            "--weight-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for conv weights",
+        )
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+
+        # make sure all arguments are present in older models
+        base_lm_architecture(args)
+
+        if getattr(args, "max_source_positions", None) is None:
+            args.max_source_positions = args.tokens_per_sample
+        if getattr(args, "max_target_positions", None) is None:
+            args.max_target_positions = args.tokens_per_sample
+
+        # choose the token embedding: character CNN, adaptive input, or a
+        # plain embedding table
+        if args.character_embeddings:
+            embed_tokens = CharacterTokenEmbedder(
+                task.dictionary,
+                eval(args.character_filters),
+                args.character_embedding_dim,
+                args.decoder_embed_dim,
+                args.char_embedder_highway_layers,
+            )
+        elif args.adaptive_input:
+            embed_tokens = AdaptiveInput(
+                len(task.dictionary),
+                task.dictionary.pad(),
+                args.decoder_input_dim,
+                args.adaptive_input_factor,
+                args.decoder_embed_dim,
+                utils.eval_str_list(args.adaptive_input_cutoff, type=int),
+            )
+        else:
+            embed_tokens = Embedding(
+                len(task.dictionary), args.decoder_input_dim, task.dictionary.pad()
+            )
+
+        if args.tie_adaptive_weights:
+            # tying adaptive softmax to adaptive input requires matching
+            # factors, cutoffs, and dimensions
+            assert args.adaptive_input
+            assert args.adaptive_input_factor == args.adaptive_softmax_factor
+            assert (
+                args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
+            ), "{} != {}".format(
+                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
+            )
+            assert args.decoder_input_dim == args.decoder_output_dim
+
+        decoder = LightConvDecoder(
+            args,
+            task.output_dictionary,
+            embed_tokens,
+            no_encoder_attn=True,
+            final_norm=False,
+        )
+        return LightConvLanguageModel(decoder)
+
+
+@register_model_architecture("lightconv_lm", "lightconv_lm")
+def base_lm_architecture(args):
+    """Fill in defaults for the base lightconv_lm architecture."""
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+
+    args.character_embeddings = getattr(args, "character_embeddings", False)
+
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+    args.decoder_conv_dim = getattr(args, "decoder_conv_dim", args.decoder_embed_dim)
+
+    # The model training is not stable without this
+    # (forced unconditionally, overriding any user-supplied value)
+    args.decoder_normalize_before = True
+
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4)
+    args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None)
+
+    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
+    args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False)
+
+    args.decoder_kernel_size_list = getattr(
+        args, "decoder_kernel_size_list", [3, 7, 15, 31, 31, 31]
+    )
+    # a single kernel size is broadcast to all decoder layers
+    if len(args.decoder_kernel_size_list) == 1:
+        args.decoder_kernel_size_list = (
+            args.decoder_kernel_size_list * args.decoder_layers
+        )
+    assert (
+        len(args.decoder_kernel_size_list) == args.decoder_layers
+    ), "decoder_kernel_size_list doesn't match decoder_layers"
+    args.decoder_glu = getattr(args, "decoder_glu", True)
+    args.input_dropout = getattr(args, "input_dropout", 0.1)
+    args.weight_dropout = getattr(args, "weight_dropout", args.attention_dropout)
+
+
+@register_model_architecture("lightconv_lm", "lightconv_lm_gbw")
+def lightconv_lm_gbw(args):
+    """lightconv_lm_gbw: size/dropout overrides, remaining defaults from base."""
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
+    args.dropout = getattr(args, "dropout", 0.1)
+    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
+    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
+    base_lm_architecture(args)
diff --git a/fairseq-0.10.2/fairseq/models/masked_lm.py b/fairseq-0.10.2/fairseq/models/masked_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..c786de9125551f7247618b0a1d0867477894c755
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/masked_lm.py
@@ -0,0 +1,403 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.models import (
+ FairseqEncoder,
+ FairseqEncoderModel,
+ register_model,
+ register_model_architecture,
+)
+from fairseq.modules import (
+ LayerNorm,
+ SinusoidalPositionalEmbedding,
+ TransformerSentenceEncoder,
+)
+from fairseq.modules.transformer_sentence_encoder import init_bert_params
+
+
+logger = logging.getLogger(__name__)
+
+
+@register_model("masked_lm")
+class MaskedLMModel(FairseqEncoderModel):
+    """
+    Class for training a Masked Language Model. It also supports an
+    additional sentence level prediction if the sent-loss argument is set.
+    """
+
+    def __init__(self, args, encoder):
+        super().__init__(encoder)
+        self.args = args
+
+        # if specified then apply bert initialization on the model. We need
+        # to explicitly call this to make sure that the output embeddings
+        # and projection layers are also correctly initialized
+        if getattr(args, "apply_bert_init", False):
+            self.apply(init_bert_params)
+
+    @staticmethod
+    def add_args(parser):
+        """Add model-specific arguments to the parser."""
+        # Arguments related to dropout
+        parser.add_argument(
+            "--dropout", type=float, metavar="D", help="dropout probability"
+        )
+        parser.add_argument(
+            "--attention-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability for" " attention weights",
+        )
+        parser.add_argument(
+            "--act-dropout",
+            type=float,
+            metavar="D",
+            help="dropout probability after" " activation in FFN",
+        )
+
+        # Arguments related to hidden states and self-attention
+        parser.add_argument(
+            "--encoder-ffn-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension for FFN",
+        )
+        parser.add_argument(
+            "--encoder-layers", type=int, metavar="N", help="num encoder layers"
+        )
+        parser.add_argument(
+            "--encoder-attention-heads",
+            type=int,
+            metavar="N",
+            help="num encoder attention heads",
+        )
+
+        # Arguments related to input and output embeddings
+        parser.add_argument(
+            "--encoder-embed-dim",
+            type=int,
+            metavar="N",
+            help="encoder embedding dimension",
+        )
+        parser.add_argument(
+            "--share-encoder-input-output-embed",
+            action="store_true",
+            help="share encoder input" " and output embeddings",
+        )
+        parser.add_argument(
+            "--encoder-learned-pos",
+            action="store_true",
+            help="use learned positional embeddings in the encoder",
+        )
+        parser.add_argument(
+            "--no-token-positional-embeddings",
+            action="store_true",
+            help="if set, disables positional embeddings" " (outside self attention)",
+        )
+        parser.add_argument(
+            "--num-segment", type=int, metavar="N", help="num segment in the input"
+        )
+        parser.add_argument(
+            "--max-positions", type=int, help="number of positional embeddings to learn"
+        )
+
+        # Arguments related to sentence level prediction
+        parser.add_argument(
+            "--sentence-class-num",
+            type=int,
+            metavar="N",
+            help="number of classes for sentence task",
+        )
+        parser.add_argument(
+            "--sent-loss",
+            action="store_true",
+            help="if set," " calculate sentence level predictions",
+        )
+
+        # Arguments related to parameter initialization
+        parser.add_argument(
+            "--apply-bert-init",
+            action="store_true",
+            help="use custom param initialization for BERT",
+        )
+
+        # misc params
+        parser.add_argument(
+            "--activation-fn",
+            choices=utils.get_available_activation_fns(),
+            help="activation function to use",
+        )
+        parser.add_argument(
+            "--pooler-activation-fn",
+            choices=utils.get_available_activation_fns(),
+            help="Which activation function to use for pooler layer.",
+        )
+        parser.add_argument(
+            "--encoder-normalize-before",
+            action="store_true",
+            help="apply layernorm before each encoder block",
+        )
+
+    def forward(self, src_tokens, segment_labels=None, **kwargs):
+        return self.encoder(src_tokens, segment_labels=segment_labels, **kwargs)
+
+    def max_positions(self):
+        # NOTE: encoder.max_positions is an int attribute set in
+        # MaskedLMEncoder.__init__ (it shadows the method of the same name),
+        # so it is returned here without calling.
+        return self.encoder.max_positions
+
+    @classmethod
+    def build_model(cls, args, task):
+        """Build a new model instance."""
+        # make sure all arguments are present in older models
+        base_architecture(args)
+
+        if not hasattr(args, "max_positions"):
+            args.max_positions = args.tokens_per_sample
+
+        logger.info(args)
+
+        encoder = MaskedLMEncoder(args, task.dictionary)
+        return cls(args, encoder)
+
+
+class MaskedLMEncoder(FairseqEncoder):
+ """
+ Encoder for Masked Language Modelling.
+ """
+
+ def __init__(self, args, dictionary):
+ super().__init__(dictionary)
+
+ self.padding_idx = dictionary.pad()
+ self.vocab_size = dictionary.__len__()
+ self.max_positions = args.max_positions
+
+ self.sentence_encoder = TransformerSentenceEncoder(
+ padding_idx=self.padding_idx,
+ vocab_size=self.vocab_size,
+ num_encoder_layers=args.encoder_layers,
+ embedding_dim=args.encoder_embed_dim,
+ ffn_embedding_dim=args.encoder_ffn_embed_dim,
+ num_attention_heads=args.encoder_attention_heads,
+ dropout=args.dropout,
+ attention_dropout=args.attention_dropout,
+ activation_dropout=args.act_dropout,
+ max_seq_len=self.max_positions,
+ num_segments=args.num_segment,
+ use_position_embeddings=not args.no_token_positional_embeddings,
+ encoder_normalize_before=args.encoder_normalize_before,
+ apply_bert_init=args.apply_bert_init,
+ activation_fn=args.activation_fn,
+ learned_pos_embedding=args.encoder_learned_pos,
+ )
+
+ self.share_input_output_embed = args.share_encoder_input_output_embed
+ self.embed_out = None
+ self.sentence_projection_layer = None
+ self.sentence_out_dim = args.sentence_class_num
+ self.lm_output_learned_bias = None
+
+ # Remove head is set to true during fine-tuning
+ self.load_softmax = not getattr(args, "remove_head", False)
+
+ self.masked_lm_pooler = nn.Linear(
+ args.encoder_embed_dim, args.encoder_embed_dim
+ )
+ self.pooler_activation = utils.get_activation_fn(args.pooler_activation_fn)
+
+ self.lm_head_transform_weight = nn.Linear(
+ args.encoder_embed_dim, args.encoder_embed_dim
+ )
+ self.activation_fn = utils.get_activation_fn(args.activation_fn)
+ self.layer_norm = LayerNorm(args.encoder_embed_dim)
+
+ self.lm_output_learned_bias = None
+ if self.load_softmax:
+ self.lm_output_learned_bias = nn.Parameter(torch.zeros(self.vocab_size))
+
+ if not self.share_input_output_embed:
+ self.embed_out = nn.Linear(
+ args.encoder_embed_dim, self.vocab_size, bias=False
+ )
+
+ if args.sent_loss:
+ self.sentence_projection_layer = nn.Linear(
+ args.encoder_embed_dim, self.sentence_out_dim, bias=False
+ )
+
    def forward(self, src_tokens, segment_labels=None, masked_tokens=None, **unused):
        """
        Forward pass for Masked LM encoder. This first computes the token
        embedding using the token embedding matrix, position embeddings (if
        specified) and segment embeddings (if specified).

        Here we assume that the sentence representation corresponds to the
        output of the classification_token (see bert_task or cross_lingual_lm
        task for more details).
        Args:
            - src_tokens: B x T matrix representing sentences
            - segment_labels: B x T matrix representing segment label for tokens
            - masked_tokens: optional boolean mask; when given, the LM head is
              applied only at the masked positions
        Returns:
            - a tuple of the following:
                - logits for predictions in format B x T x C to be used in
                  softmax afterwards
                - a dictionary of additional data, where 'pooled_output' contains
                  the representation for classification_token and 'inner_states'
                  is a list of internal model states used to compute the
                  predictions (similar to ELMo). 'sentence_logits'
                  is the prediction logit for NSP task and is only computed if
                  this is specified in the input arguments.
        """

        inner_states, sentence_rep = self.sentence_encoder(
            src_tokens,
            segment_labels=segment_labels,
        )

        # Encoder states are T x B x C; switch to batch-first for the LM head.
        x = inner_states[-1].transpose(0, 1)
        # project masked tokens only
        if masked_tokens is not None:
            x = x[masked_tokens, :]
        x = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(x)))

        # Sentence-level representation (classification token) for NSP-style loss.
        pooled_output = self.pooler_activation(self.masked_lm_pooler(sentence_rep))

        # project back to size of vocabulary
        if self.share_input_output_embed and hasattr(
            self.sentence_encoder.embed_tokens, "weight"
        ):
            x = F.linear(x, self.sentence_encoder.embed_tokens.weight)
        elif self.embed_out is not None:
            # NOTE(review): `self.embed_out` is only created when input/output
            # embeddings are NOT shared — this branch relies on that invariant.
            x = self.embed_out(x)
        if self.lm_output_learned_bias is not None:
            x = x + self.lm_output_learned_bias
        sentence_logits = None
        if self.sentence_projection_layer:
            sentence_logits = self.sentence_projection_layer(pooled_output)

        return x, {
            "inner_states": inner_states,
            "pooled_output": pooled_output,
            "sentence_logits": sentence_logits,
        }
+
    def max_positions(self):
        """Maximum output length supported by the encoder.

        NOTE(review): this reads ``self.max_positions``, which must be an
        instance attribute (presumably set in ``__init__`` from
        ``args.max_positions``) that shadows this method; otherwise this would
        return the bound method itself — confirm against the constructor.
        """
        return self.max_positions
+
+ def upgrade_state_dict_named(self, state_dict, name):
+ if isinstance(
+ self.sentence_encoder.embed_positions, SinusoidalPositionalEmbedding
+ ):
+ state_dict[
+ name + ".sentence_encoder.embed_positions._float_tensor"
+ ] = torch.FloatTensor(1)
+ if not self.load_softmax:
+ for k in list(state_dict.keys()):
+ if (
+ "embed_out.weight" in k
+ or "sentence_projection_layer.weight" in k
+ or "lm_output_learned_bias" in k
+ ):
+ del state_dict[k]
+ return state_dict
+
+
@register_model_architecture("masked_lm", "masked_lm")
def base_architecture(args):
    """Fill in default hyperparameters for the base masked LM architecture.

    Attributes already present on ``args`` are left untouched; only missing
    ones receive their default value.
    """
    defaults = {
        "dropout": 0.1,
        "attention_dropout": 0.1,
        "act_dropout": 0.0,
        "encoder_ffn_embed_dim": 4096,
        "encoder_layers": 6,
        "encoder_attention_heads": 8,
        "encoder_embed_dim": 1024,
        "share_encoder_input_output_embed": False,
        "encoder_learned_pos": False,
        "no_token_positional_embeddings": False,
        "num_segment": 2,
        "sentence_class_num": 2,
        "sent_loss": False,
        "apply_bert_init": False,
        "activation_fn": "relu",
        "pooler_activation_fn": "tanh",
        "encoder_normalize_before": False,
    }
    for attr, value in defaults.items():
        if not hasattr(args, attr):
            setattr(args, attr, value)
+
+
@register_model_architecture("masked_lm", "bert_base")
def bert_base_architecture(args):
    """BERT-base defaults: 12 layers, 768 dim, 12 heads, GELU, NSP loss on.

    Values are set only when missing from ``args``; remaining defaults come
    from :func:`base_architecture`.
    """
    defaults = {
        "encoder_embed_dim": 768,
        "share_encoder_input_output_embed": True,
        "no_token_positional_embeddings": False,
        "encoder_learned_pos": True,
        "num_segment": 2,
        "encoder_layers": 12,
        "encoder_attention_heads": 12,
        "encoder_ffn_embed_dim": 3072,
        "sentence_class_num": 2,
        "sent_loss": True,
        "apply_bert_init": True,
        "activation_fn": "gelu",
        "pooler_activation_fn": "tanh",
        "encoder_normalize_before": True,
    }
    for attr, value in defaults.items():
        if not hasattr(args, attr):
            setattr(args, attr, value)
    base_architecture(args)
+
+
@register_model_architecture("masked_lm", "bert_large")
def bert_large_architecture(args):
    """BERT-large defaults: 24 layers, 1024 dim, 16 heads; rest from bert_base."""
    defaults = {
        "encoder_embed_dim": 1024,
        "encoder_layers": 24,
        "encoder_attention_heads": 16,
        "encoder_ffn_embed_dim": 4096,
    }
    for attr, value in defaults.items():
        if not hasattr(args, attr):
            setattr(args, attr, value)
    bert_base_architecture(args)
+
+
@register_model_architecture("masked_lm", "xlm_base")
def xlm_architecture(args):
    """XLM defaults: 6 layers, 1024 dim, single segment, no NSP loss."""
    defaults = {
        "encoder_embed_dim": 1024,
        "share_encoder_input_output_embed": True,
        "no_token_positional_embeddings": False,
        "encoder_learned_pos": True,
        "num_segment": 1,
        "encoder_layers": 6,
        "encoder_attention_heads": 8,
        "encoder_ffn_embed_dim": 4096,
        "sent_loss": False,
        "activation_fn": "gelu",
        "encoder_normalize_before": False,
        "pooler_activation_fn": "tanh",
        "apply_bert_init": True,
    }
    for attr, value in defaults.items():
        if not hasattr(args, attr):
            setattr(args, attr, value)
    base_architecture(args)
diff --git a/fairseq-0.10.2/fairseq/models/model_utils.py b/fairseq-0.10.2/fairseq/models/model_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..732d66b1d5f695151c26d29eb7f6b53179c269f1
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/model_utils.py
@@ -0,0 +1,92 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Optional
+
+import torch
+from torch import Tensor
+
+
@torch.jit.script
def script_skip_tensor_list(x: List[Tensor], mask):
    """Apply a boolean ``mask`` to every tensor in ``x``.

    Each tensor is indexed along dim 0 when its length matches the mask,
    otherwise along dim 1.  If masking a tensor selects nothing, the original
    (unmasked) tensor is kept instead.
    """
    # NOTE(review): `mask` is unannotated, so TorchScript types it as Tensor.
    res = [xi[mask] if xi.size(0) == mask.size(0) else xi[:, mask] for xi in x]
    outputs = []
    for i, t in enumerate(res):
        if t.numel() != 0:
            outputs.append(t)
        else:
            # Empty selection: fall back to the untouched input tensor.
            outputs.append(x[i])
    return outputs
+
+
@torch.jit.script
def script_skip_tensor(x: Tensor, mask):
    """Apply boolean ``mask`` to ``x`` (dim 0 if lengths match, else dim 1).

    Returns ``x`` unchanged when it is empty or when the mask selects nothing.
    """
    # None case
    if x.size(0) == 0:
        return x
    res = x[mask] if x.size(0) == mask.size(0) else x[:, mask]
    if res.numel() == 0:
        return x
    else:
        return res
+
+
@torch.jit.script
def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int):
    """
    Expand 2D/3D tensor on dim=1

    Pads dim 1 of ``x`` up to ``trg_dim`` with ``padding_idx``; dim 0 (and
    dim 2, when present) are preserved.
    """
    # NOTE(review): `x` is unannotated, so TorchScript types it as Tensor;
    # this None branch is likely unreachable when scripted — confirm intent.
    if x is None:
        return None

    assert x.dim() == 2 or x.dim() == 3
    assert trg_dim >= x.size(1), (trg_dim, x.size())
    if trg_dim == x.size(1):
        return x

    dims = [x.size(0), trg_dim - x.size(1)]
    if x.dim() == 3:
        dims.append(x.size(2))
    # `.to(x)` matches device/dtype; `fill_` writes the padding value.
    x = torch.cat([x, torch.zeros(dims).to(x).fill_(padding_idx)], 1)

    return x
+
+
@torch.jit.script
def coalesce(x: Optional[Tensor], y: Tensor) -> Tensor:
    """Return ``x`` unless it is None, in which case return ``y``."""
    if x is None:
        return y
    return x
+
+
@torch.jit.script
def fill_tensors(
    x: Optional[Tensor], mask, y: Optional[Tensor], padding_idx: int
) -> Optional[Tensor]:
    """
    Filling tensor x with y at masked positions (dim=0).

    ``x`` is (B, T[, C]); ``mask`` is a boolean tensor of length B; ``y``
    holds one replacement row per masked position, so y.size(0) == mask.sum().
    """
    if x is None or x.size()[0] == 0 or y is None:
        return x
    assert x.dim() == y.dim() and mask.size(0) == x.size(0)
    assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2))

    n_selected = mask.sum()
    if n_selected == 0:
        # Nothing to fill.
        return x
    assert n_selected == y.size(0)
    if n_selected == x.size(0):
        # Every row is replaced; y is the whole result.
        return y

    if x.size(1) < y.size(1):
        # Grow x along dim 1 so y's rows fit, padding the new slots.
        x = expand_2d_or_3d_tensor(x, y.size(1), padding_idx)
        x[mask] = y
    elif x.size(1) > y.size(1):
        # y is shorter: blank the masked rows first, then copy y's prefix.
        x[mask] = torch.tensor(padding_idx).type_as(x)
        if x.dim() == 2:
            x[mask, : y.size(1)] = y
        else:
            x[mask, : y.size(1), :] = y
    else:
        x[mask] = y
    return x
diff --git a/fairseq-0.10.2/fairseq/models/multilingual_transformer.py b/fairseq-0.10.2/fairseq/models/multilingual_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3fbbd5710dfb10b16f5495c9131fa42b11544be
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/multilingual_transformer.py
@@ -0,0 +1,228 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections import OrderedDict
+
+from fairseq import utils
+from fairseq.models import (
+ FairseqMultiModel,
+ register_model,
+ register_model_architecture,
+)
+from fairseq.models.transformer import (
+ Embedding,
+ TransformerDecoder,
+ TransformerEncoder,
+ TransformerModel,
+ base_architecture,
+)
+
+
@register_model("multilingual_transformer")
class MultilingualTransformerModel(FairseqMultiModel):
    """Train Transformer models for multiple language pairs simultaneously.

    Requires `--task multilingual_translation`.

    We inherit all arguments from TransformerModel and assume that all language
    pairs use a single Transformer architecture. In addition, we provide several
    options that are specific to the multilingual setting.

    Args:
        --share-encoder-embeddings: share encoder embeddings across all source languages
        --share-decoder-embeddings: share decoder embeddings across all target languages
        --share-encoders: share all encoder params (incl. embeddings) across all source languages
        --share-decoders: share all decoder params (incl. embeddings) across all target languages
    """

    def __init__(self, encoders, decoders):
        # encoders/decoders: OrderedDicts keyed by "src-tgt" lang-pair strings.
        super().__init__(encoders, decoders)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        TransformerModel.add_args(parser)
        parser.add_argument(
            "--share-encoder-embeddings",
            action="store_true",
            help="share encoder embeddings across languages",
        )
        parser.add_argument(
            "--share-decoder-embeddings",
            action="store_true",
            help="share decoder embeddings across languages",
        )
        parser.add_argument(
            "--share-encoders",
            action="store_true",
            help="share encoders across languages",
        )
        parser.add_argument(
            "--share-decoders",
            action="store_true",
            help="share decoders across languages",
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance.

        Constructs one encoder per source language and one decoder per target
        language (shared across pairs when the corresponding --share-* flags
        are set) and bundles them per language pair.
        """
        from fairseq.tasks.multilingual_translation import MultilingualTranslationTask

        assert isinstance(task, MultilingualTranslationTask)

        # make sure all arguments are present in older models
        base_multilingual_architecture(args)

        if not hasattr(args, "max_source_positions"):
            args.max_source_positions = 1024
        if not hasattr(args, "max_target_positions"):
            args.max_target_positions = 1024

        src_langs = [lang_pair.split("-")[0] for lang_pair in task.model_lang_pairs]
        tgt_langs = [lang_pair.split("-")[1] for lang_pair in task.model_lang_pairs]

        # Sharing full encoders/decoders implies sharing their embeddings.
        if args.share_encoders:
            args.share_encoder_embeddings = True
        if args.share_decoders:
            args.share_decoder_embeddings = True

        def build_embedding(dictionary, embed_dim, path=None):
            # Build one embedding table sized to `dictionary`.
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            emb = Embedding(num_embeddings, embed_dim, padding_idx)
            # if provided, load from preloaded dictionaries
            if path:
                embed_dict = utils.parse_embedding(path)
                utils.load_embedding(embed_dict, dictionary, emb)
            return emb

        # build shared embeddings (if applicable)
        shared_encoder_embed_tokens, shared_decoder_embed_tokens = None, None
        if args.share_all_embeddings:
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
                )
            if args.decoder_embed_path and (
                args.decoder_embed_path != args.encoder_embed_path
            ):
                raise ValueError(
                    "--share-all-embeddings not compatible with --decoder-embed-path"
                )
            shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
                dicts=task.dicts,
                langs=task.langs,
                embed_dim=args.encoder_embed_dim,
                build_embedding=build_embedding,
                pretrained_embed_path=args.encoder_embed_path,
            )
            shared_decoder_embed_tokens = shared_encoder_embed_tokens
            args.share_decoder_input_output_embed = True
        else:
            if args.share_encoder_embeddings:
                shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
                    dicts=task.dicts,
                    langs=src_langs,
                    embed_dim=args.encoder_embed_dim,
                    build_embedding=build_embedding,
                    pretrained_embed_path=args.encoder_embed_path,
                )
            if args.share_decoder_embeddings:
                shared_decoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
                    dicts=task.dicts,
                    langs=tgt_langs,
                    embed_dim=args.decoder_embed_dim,
                    build_embedding=build_embedding,
                    pretrained_embed_path=args.decoder_embed_path,
                )

        # encoders/decoders for each language
        lang_encoders, lang_decoders = {}, {}

        def get_encoder(lang):
            # Lazily build (or reuse) the encoder for `lang`.
            if lang not in lang_encoders:
                if shared_encoder_embed_tokens is not None:
                    encoder_embed_tokens = shared_encoder_embed_tokens
                else:
                    encoder_embed_tokens = build_embedding(
                        task.dicts[lang],
                        args.encoder_embed_dim,
                        args.encoder_embed_path,
                    )
                lang_encoders[lang] = cls._get_module_class(
                    True, args, task.dicts[lang], encoder_embed_tokens, src_langs
                )
            return lang_encoders[lang]

        def get_decoder(lang):
            # Lazily build (or reuse) the decoder for `lang`.
            if lang not in lang_decoders:
                if shared_decoder_embed_tokens is not None:
                    decoder_embed_tokens = shared_decoder_embed_tokens
                else:
                    decoder_embed_tokens = build_embedding(
                        task.dicts[lang],
                        args.decoder_embed_dim,
                        args.decoder_embed_path,
                    )
                lang_decoders[lang] = cls._get_module_class(
                    False, args, task.dicts[lang], decoder_embed_tokens, tgt_langs
                )
            return lang_decoders[lang]

        # shared encoders/decoders (if applicable)
        shared_encoder, shared_decoder = None, None
        if args.share_encoders:
            shared_encoder = get_encoder(src_langs[0])
        if args.share_decoders:
            shared_decoder = get_decoder(tgt_langs[0])

        encoders, decoders = OrderedDict(), OrderedDict()
        for lang_pair, src, tgt in zip(task.model_lang_pairs, src_langs, tgt_langs):
            encoders[lang_pair] = (
                shared_encoder if shared_encoder is not None else get_encoder(src)
            )
            decoders[lang_pair] = (
                shared_decoder if shared_decoder is not None else get_decoder(tgt)
            )

        return MultilingualTransformerModel(encoders, decoders)

    @classmethod
    def _get_module_class(cls, is_encoder, args, lang_dict, embed_tokens, langs):
        """Instantiate an encoder or decoder for one language.

        NOTE(review): `langs` is unused here; it appears to be accepted for
        subclass overrides — confirm before removing.
        """
        module_class = TransformerEncoder if is_encoder else TransformerDecoder
        return module_class(args, lang_dict, embed_tokens)

    def load_state_dict(self, state_dict, strict=True, args=None):
        """Load parameters, dropping entries for language pairs this model lacks."""
        state_dict_subset = state_dict.copy()
        for k, _ in state_dict.items():
            assert k.startswith("models.")
            lang_pair = k.split(".")[1]
            if lang_pair not in self.models:
                del state_dict_subset[k]
        super().load_state_dict(state_dict_subset, strict=strict, args=args)
+
+
@register_model_architecture("multilingual_transformer", "multilingual_transformer")
def base_multilingual_architecture(args):
    """Base transformer defaults plus the four sharing flags (off by default)."""
    base_architecture(args)
    for flag in (
        "share_encoder_embeddings",
        "share_decoder_embeddings",
        "share_encoders",
        "share_decoders",
    ):
        if not hasattr(args, flag):
            setattr(args, flag, False)
+
+
@register_model_architecture(
    "multilingual_transformer", "multilingual_transformer_iwslt_de_en"
)
def multilingual_transformer_iwslt_de_en(args):
    """IWSLT De-En sized multilingual transformer (smaller than the base model)."""
    sizes = {
        "encoder_embed_dim": 512,
        "encoder_ffn_embed_dim": 1024,
        "encoder_attention_heads": 4,
        "encoder_layers": 6,
        "decoder_embed_dim": 512,
        "decoder_ffn_embed_dim": 1024,
        "decoder_attention_heads": 4,
        "decoder_layers": 6,
    }
    for attr, value in sizes.items():
        if not hasattr(args, attr):
            setattr(args, attr, value)
    base_multilingual_architecture(args)
diff --git a/fairseq-0.10.2/fairseq/models/roberta/__pycache__/hub_interface.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/hub_interface.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d74a9bf6cb461e842e7b3293545fd2578e80bd49
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/hub_interface.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_camembert.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_camembert.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9864f25948e3e4872391c25024dd3ced269985c
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_camembert.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_xlmr.cpython-310.pyc b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_xlmr.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e40f8a4b5a91035e66c52d889067ea24d7185a3
Binary files /dev/null and b/fairseq-0.10.2/fairseq/models/roberta/__pycache__/model_xlmr.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/models/roberta/model_xlmr.py b/fairseq-0.10.2/fairseq/models/roberta/model_xlmr.py
new file mode 100644
index 0000000000000000000000000000000000000000..5886880f73bd1e2176c49e3d491a7d46eb3d9322
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/models/roberta/model_xlmr.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Unsupervised Cross-lingual Representation Learning at Scale
+"""
+
+from fairseq.models import register_model
+
+from .hub_interface import RobertaHubInterface
+from .model import RobertaModel
+
+
@register_model("xlmr")
class XLMRModel(RobertaModel):
    """XLM-R: RoBERTa pretrained on multilingual data (see module docstring)."""

    @classmethod
    def hub_models(cls):
        """Map hub model names to their download archives."""
        prefix = "http://dl.fbaipublicfiles.com/fairseq/models/"
        return {
            "xlmr.base": prefix + "xlmr.base.tar.gz",
            "xlmr.large": prefix + "xlmr.large.tar.gz",
        }

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path,
        checkpoint_file="model.pt",
        data_name_or_path=".",
        bpe="sentencepiece",
        **kwargs
    ):
        """Load a pretrained checkpoint and wrap it in a hub interface."""
        from fairseq import hub_utils

        loaded = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            **kwargs,
        )
        return RobertaHubInterface(loaded["args"], loaded["task"], loaded["models"][0])
diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/beamable_mm.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/beamable_mm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f54438a2eb96d62ce90ea5a3a9d6ed58fbb6098
Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/beamable_mm.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/conv_tbc.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/conv_tbc.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1adf927e347f1b940946ce74c275bb3248a684e8
Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/conv_tbc.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/dynamic_crf_layer.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/dynamic_crf_layer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96569923675dcd69044b6420c57438690660edd3
Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/dynamic_crf_layer.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/fp32_group_norm.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/fp32_group_norm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0d2ed4af95a89c618698343413b3a749da1c7c6
Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/fp32_group_norm.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/gelu.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/gelu.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70cdc44765423f38190818d92942bf854be3ce93
Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/gelu.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/layer_norm.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/layer_norm.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32c155869dbc4b46ac741fa6f55bb370cd1fb158
Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/layer_norm.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/quant_noise.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/quant_noise.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7447ddff6f874a457d3b63ff67726984ba6ad06a
Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/quant_noise.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_layer.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_layer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..033dc4f4a609c2da4ba05f7635043177aeb03ae2
Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_layer.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/modules/downsampled_multihead_attention.py b/fairseq-0.10.2/fairseq/modules/downsampled_multihead_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cdece3f7fca2b830eb72999ce93f58667ed595b
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/downsampled_multihead_attention.py
@@ -0,0 +1,316 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.scalar_bias import scalar_bias
+
+
class SingleHeadAttention(nn.Module):
    """
    Single-head attention that supports Gating and Downsampling
    """

    def __init__(
        self,
        out_channels,
        embed_dim,
        head_dim,
        head_index,
        dropout=0.0,
        bias=True,
        project_input=True,
        gated=False,
        downsample=False,
        num_heads=1,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )
        self.head_index = head_index
        self.head_dim = head_dim
        self.project_input = project_input
        self.gated = gated
        self.downsample = downsample
        self.num_heads = num_heads
        self.projection = None

        k_layers = []
        v_layers = []
        # When downsampling, keys/values are first strided by (head_index + 1)
        # and each head projects to a single head_dim; otherwise one projection
        # covers all heads at once.
        if self.downsample:
            k_layers.append(Downsample(self.head_index))
            v_layers.append(Downsample(self.head_index))
            out_proj_size = self.head_dim
        else:
            out_proj_size = self.head_dim * self.num_heads
        if self.gated:
            k_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias))
            self.in_proj_q = GatedLinear(self.embed_dim, out_proj_size, bias=bias)
            v_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias))
        else:
            k_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias))
            self.in_proj_q = Linear(self.embed_dim, out_proj_size, bias=bias)
            v_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias))

        self.in_proj_k = nn.Sequential(*k_layers)
        self.in_proj_v = nn.Sequential(*v_layers)

        if self.downsample:
            self.out_proj = Linear(out_proj_size, self.head_dim, bias=bias)
        else:
            self.out_proj = Linear(out_proj_size, out_channels, bias=bias)

        # Standard scaled dot-product attention scaling factor.
        self.scaling = self.head_dim ** -0.5

    def forward(
        self,
        query,
        key,
        value,
        mask_future_timesteps=False,
        key_padding_mask=None,
        use_scalar_bias=False,
    ):
        """Input shape: Time x Batch x Channel
        Self-attention can be implemented by passing in the same arguments for
        query, key and value. Future timesteps can be masked with the
        `mask_future_timesteps` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.
        """
        src_len, bsz, out_channels = key.size()
        tgt_len = query.size(0)
        assert list(query.size()) == [tgt_len, bsz, out_channels]
        assert key.size() == value.size()

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        # Effective batch size for bmm: per-head when not downsampling.
        if self.downsample:
            size = bsz
        else:
            size = bsz * self.num_heads

        k = key
        v = value
        q = query
        if self.project_input:
            q = self.in_proj_q(q)
            k = self.in_proj_k(k)
            v = self.in_proj_v(v)
            src_len = k.size()[0]  # downsampling may have shortened k/v
        q *= self.scaling

        if not self.downsample:
            q = q.view(tgt_len, size, self.head_dim)
            k = k.view(src_len, size, self.head_dim)
            v = v.view(src_len, size, self.head_dim)

        q = q.transpose(0, 1)
        k = k.transpose(0, 1)
        v = v.transpose(0, 1)

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        if mask_future_timesteps:
            assert (
                query.size() == key.size()
            ), "mask_future_timesteps only applies to self-attention"
            # Zero the upper triangle and add -inf there so softmax assigns no
            # probability to future positions; the stride matches the key
            # downsampling rate.
            attn_weights *= torch.tril(
                attn_weights.data.new([1]).expand(tgt_len, tgt_len).clone(),
                diagonal=-1,
            )[:, :: self.head_index + 1 if self.downsample else 1].unsqueeze(0)
            attn_weights += torch.triu(
                attn_weights.data.new([-math.inf]).expand(tgt_len, tgt_len).clone(),
                diagonal=0,
            )[:, :: self.head_index + 1 if self.downsample else 1].unsqueeze(0)
        # NOTE(review): tgt_size is unused in this method; the weight reshape
        # that needs it happens in DownsampledMultiHeadAttention.forward.
        tgt_size = tgt_len
        if use_scalar_bias:
            attn_weights = scalar_bias(attn_weights, 2)
            v = scalar_bias(v, 1)
            tgt_size += 1

        if key_padding_mask is not None:
            # don't attend to padding symbols
            if key_padding_mask.max() > 0:
                if self.downsample:
                    attn_weights = attn_weights.view(bsz, 1, tgt_len, src_len)
                else:
                    attn_weights = attn_weights.view(
                        size, self.num_heads, tgt_len, src_len
                    )
                attn_weights = attn_weights.masked_fill(
                    key_padding_mask.unsqueeze(1).unsqueeze(2),
                    -math.inf,
                )
                attn_weights = attn_weights.view(size, tgt_len, src_len)
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = self.dropout_module(attn_weights)

        attn = torch.bmm(attn_weights, v)
        if self.downsample:
            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.head_dim)
        else:
            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim)

        attn = self.out_proj(attn)

        return attn, attn_weights
+
+
class DownsampledMultiHeadAttention(nn.ModuleList):
    """
    Multi-headed attention with Gating and Downsampling
    """

    def __init__(
        self,
        out_channels,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=True,
        project_input=True,
        gated=False,
        downsample=False,
    ):
        # Plain (non-module) attributes are assigned before super().__init__();
        # nn.Module.__setattr__ stores such values directly, so this works.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.downsample = downsample
        self.gated = gated
        self.project_input = project_input
        assert self.head_dim * num_heads == embed_dim

        if self.downsample:
            # One SingleHeadAttention per head, each with its own stride.
            attention_heads = []
            for index in range(self.num_heads):
                attention_heads.append(
                    SingleHeadAttention(
                        out_channels,
                        self.embed_dim,
                        self.head_dim,
                        index,
                        dropout,
                        bias,
                        self.project_input,
                        self.gated,
                        self.downsample,
                        self.num_heads,
                    )
                )
            super().__init__(modules=attention_heads)
            self.out_proj = Linear(embed_dim, out_channels, bias=bias)
        else:
            # either we have a list of attention heads, or just one attention head
            # if not being downsampled, we can do the heads with one linear layer instead of separate ones
            super().__init__()
            self.attention_module = SingleHeadAttention(
                out_channels,
                self.embed_dim,
                self.head_dim,
                1,
                dropout,
                bias,
                self.project_input,
                self.gated,
                self.downsample,
                self.num_heads,
            )

    def forward(
        self,
        query,
        key,
        value,
        mask_future_timesteps=False,
        key_padding_mask=None,
        use_scalar_bias=False,
    ):
        """Run all heads and combine their outputs.

        Inputs are Time x Batch x Channel (see SingleHeadAttention.forward).
        Returns a tuple of (attention output, attention weights).
        """
        src_len, bsz, embed_dim = key.size()
        tgt_len = query.size(0)
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        assert key.size() == value.size()

        tgt_size = tgt_len
        if use_scalar_bias:
            # scalar_bias prepends one extra position along the target axis.
            tgt_size += 1

        attn = []
        attn_weights = []
        if self.downsample:
            for attention_head_number in range(self.num_heads):
                # call the forward of each attention head
                _attn, _attn_weight = self[attention_head_number](
                    query,
                    key,
                    value,
                    mask_future_timesteps,
                    key_padding_mask,
                    use_scalar_bias,
                )
                attn.append(_attn)
                attn_weights.append(_attn_weight)
            full_attn = torch.cat(attn, dim=2)
            full_attn = self.out_proj(full_attn)
            # Only the first head's weights are returned in the downsampled case.
            return full_attn, attn_weights[0].clone()
        else:
            _attn, _attn_weight = self.attention_module(
                query,
                key,
                value,
                mask_future_timesteps,
                key_padding_mask,
                use_scalar_bias,
            )
            attn.append(_attn)
            attn_weights.append(_attn_weight)
            full_attn = torch.cat(attn, dim=2)
            full_attn_weights = torch.cat(attn_weights)
            full_attn_weights = full_attn_weights.view(
                bsz, self.num_heads, tgt_size, src_len
            )
            # Average the per-head attention weights.
            full_attn_weights = full_attn_weights.sum(dim=1) / self.num_heads
            return full_attn, full_attn_weights
+
+
class Downsample(nn.Module):
    """Select every (index + 1)-th element along the first dimension."""

    def __init__(self, index):
        super().__init__()
        self.index = index

    def forward(self, x):
        # Keep rows 0, index+1, 2*(index+1), ...
        stride = self.index + 1
        return x[::stride]
+
+
def Linear(in_features, out_features, dropout=0.0, bias=True):
    """Weight-normalized Linear layer (input: B x T x C)."""
    layer = nn.Linear(in_features, out_features, bias=bias)
    std = math.sqrt((1 - dropout) / in_features)
    layer.weight.data.normal_(mean=0, std=std)
    # NOTE(review): assumes bias=True; bias=False would raise here, exactly as
    # in the original implementation.
    layer.bias.data.zero_()
    return nn.utils.weight_norm(layer)


def GatedLinear(in_features, out_features, dropout=0.0, bias=True):
    """Weight-normalized Linear layer (input: B x T x C) with interspersed GLU units."""
    stages = [
        Linear(in_features, out_features * 4, dropout, bias),
        nn.GLU(),
        Linear(out_features * 2, out_features * 2, dropout, bias),
        nn.GLU(),
        Linear(out_features, out_features, dropout, bias),
    ]
    return nn.Sequential(*stages)
diff --git a/fairseq-0.10.2/fairseq/modules/dynamic_crf_layer.py b/fairseq-0.10.2/fairseq/modules/dynamic_crf_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fcc6b8d2672d2eacc6d01b9688bac44d5e1ce26
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/dynamic_crf_layer.py
@@ -0,0 +1,189 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+This file is to re-implemented the low-rank and beam approximation of CRF layer
+Proposed by:
+
+Sun, Zhiqing, et al.
+Fast Structured Decoding for Sequence Models
+https://arxiv.org/abs/1910.11555
+
+The CRF implementation is mainly borrowed from
+https://github.com/kmkurn/pytorch-crf/blob/master/torchcrf/__init__.py
+
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+
def logsumexp(x, dim=1):
    """Log-sum-exp over ``dim``, computed in fp32 for stability and cast back."""
    result = torch.logsumexp(x.float(), dim=dim)
    return result.type_as(x)
+
+
class DynamicCRF(nn.Module):
    """Dynamic CRF layer is used to approximate the traditional
    Conditional Random Fields (CRF)
    $P(y | x) = 1/Z(x) exp(sum_i s(y_i, x) + sum_i t(y_{i-1}, y_i, x))$

    where in this function, we assume the emition scores (s) are given,
    and the transition score is a |V| x |V| matrix $M$

    in the following two aspects:
    (1) it used a low-rank approximation for the transition matrix:
        $M = E_1 E_2^T$
    (2) it used a beam to estimate the normalizing factor Z(x)
    """

    def __init__(self, num_embedding, low_rank=32, beam_size=64):
        super().__init__()

        # Low-rank factors of the |V| x |V| transition matrix: M = E1 @ E2^T.
        self.E1 = nn.Embedding(num_embedding, low_rank)
        self.E2 = nn.Embedding(num_embedding, low_rank)

        # NOTE: attribute name "vocb" (sic) kept for backward compatibility.
        self.vocb = num_embedding
        self.rank = low_rank
        self.beam = beam_size

    @staticmethod
    def _logsumexp(x, dim=1):
        # fp32 log-sum-exp for numerical stability, cast back to input dtype.
        return torch.logsumexp(x.float(), dim=dim).type_as(x)

    def extra_repr(self):
        return "vocab_size={}, low_rank={}, beam_size={}".format(
            self.vocb, self.rank, self.beam
        )

    def forward(self, emissions, targets, masks, beam=None):
        """
        Compute the conditional log-likelihood of a sequence of target tokens given emission scores

        Args:
            emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output
                ``(batch_size, seq_len, vocab_size)``. We assume batch-first
            targets (`~torch.LongTensor`): Sequence of target token indices
                ``(batch_size, seq_len)``
            masks (`~torch.ByteTensor`): Mask tensor with the same size as targets
            beam (int, optional): beam width; defaults to ``self.beam``

        Returns:
            `~torch.Tensor`: approximated log-likelihood, shape ``(batch_size,)``
        """
        numerator = self._compute_score(emissions, targets, masks)
        denominator = self._compute_normalizer(emissions, targets, masks, beam)
        # log P(y|x) ~= score(y) - log Z(x); <= 0 since y's path is in the beam.
        return numerator - denominator

    def forward_decoder(self, emissions, masks=None, beam=None):
        """
        Find the most likely output sequence using Viterbi algorithm.

        Args:
            emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output
                ``(batch_size, seq_len, vocab_size)``. We assume batch-first
            masks (`~torch.ByteTensor`): Mask tensor with the same size as targets

        Returns:
            `~torch.LongTensor`: decoded sequence from the CRF model
        """
        return self._viterbi_decode(emissions, masks, beam)

    def _compute_score(self, emissions, targets, masks=None):
        """Unnormalized path score of ``targets``: emissions + pairwise transitions."""
        batch_size, seq_len = targets.size()
        emission_scores = emissions.gather(2, targets[:, :, None])[:, :, 0]  # B x T
        # Transition t(y_{i-1}, y_i) via the low-rank factors.
        transition_scores = (self.E1(targets[:, :-1]) * self.E2(targets[:, 1:])).sum(2)

        scores = emission_scores
        scores[:, 1:] += transition_scores

        if masks is not None:
            scores = scores * masks.type_as(scores)
        return scores.sum(-1)

    def _compute_normalizer(self, emissions, targets=None, masks=None, beam=None):
        # HACK: we include "target" which is a hueristic for training
        # HACK: we use a beam of tokens to approximate the normalizing factor (which is bad?)

        beam = beam if beam is not None else self.beam
        batch_size, seq_len = emissions.size()[:2]
        if targets is not None:
            # Force the gold token into the beam by giving it +inf for top-k
            # selection only (scores are re-gathered from raw emissions).
            # Fix: was ``np.float("inf")`` — the ``np.float`` alias was removed
            # in NumPy 1.24 and this raised an AttributeError.
            _emissions = emissions.scatter(2, targets[:, :, None], float("inf"))
            beam_targets = _emissions.topk(beam, 2)[1]
            beam_emission_scores = emissions.gather(2, beam_targets)
        else:
            beam_emission_scores, beam_targets = emissions.topk(beam, 2)
        beam_transition_score1 = self.E1(beam_targets[:, :-1])  # B x (T-1) x K x D
        beam_transition_score2 = self.E2(beam_targets[:, 1:])  # B x (T-1) x K x D
        beam_transition_matrix = torch.bmm(
            beam_transition_score1.view(-1, beam, self.rank),
            beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2),
        )
        beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam)

        # compute the normalizer in the log-space
        score = beam_emission_scores[:, 0]  # B x K
        for i in range(1, seq_len):
            next_score = score[:, :, None] + beam_transition_matrix[:, i - 1]
            next_score = self._logsumexp(next_score, dim=1) + beam_emission_scores[:, i]

            if masks is not None:
                # Carry the previous score through padded positions.
                score = torch.where(masks[:, i : i + 1], next_score, score)
            else:
                score = next_score

        # Sum (log-sum-exp) over all possible tags
        return self._logsumexp(score, dim=1)

    def _viterbi_decode(self, emissions, masks=None, beam=None):
        # HACK: we use a beam of tokens to approximate the normalizing factor (which is bad?)

        beam = beam if beam is not None else self.beam
        batch_size, seq_len = emissions.size()[:2]
        beam_emission_scores, beam_targets = emissions.topk(beam, 2)
        beam_transition_score1 = self.E1(beam_targets[:, :-1])  # B x (T-1) x K x D
        beam_transition_score2 = self.E2(beam_targets[:, 1:])  # B x (T-1) x K x D
        beam_transition_matrix = torch.bmm(
            beam_transition_score1.view(-1, beam, self.rank),
            beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2),
        )
        beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam)

        traj_tokens, traj_scores = [], []
        finalized_tokens, finalized_scores = [], []

        # compute the normalizer in the log-space
        score = beam_emission_scores[:, 0]  # B x K
        # Identity back-pointers used at masked (padded) positions.
        dummy = (
            torch.arange(beam, device=score.device).expand(*score.size()).contiguous()
        )

        for i in range(1, seq_len):
            traj_scores.append(score)
            _score = score[:, :, None] + beam_transition_matrix[:, i - 1]
            _score, _index = _score.max(dim=1)
            _score = _score + beam_emission_scores[:, i]

            if masks is not None:
                score = torch.where(masks[:, i : i + 1], _score, score)
                index = torch.where(masks[:, i : i + 1], _index, dummy)
            else:
                score, index = _score, _index
            traj_tokens.append(index)

        # now running the back-tracing and find the best
        best_score, best_index = score.max(dim=1)
        finalized_tokens.append(best_index[:, None])
        finalized_scores.append(best_score[:, None])

        for idx, scs in zip(reversed(traj_tokens), reversed(traj_scores)):
            previous_index = finalized_tokens[-1]
            finalized_tokens.append(idx.gather(1, previous_index))
            finalized_scores.append(scs.gather(1, previous_index))

        finalized_tokens.reverse()
        finalized_tokens = torch.cat(finalized_tokens, 1)
        # Map beam-local indices back to vocabulary ids.
        finalized_tokens = beam_targets.gather(2, finalized_tokens[:, :, None])[:, :, 0]

        finalized_scores.reverse()
        finalized_scores = torch.cat(finalized_scores, 1)
        # Convert cumulative prefix scores into per-step score increments.
        finalized_scores[:, 1:] = finalized_scores[:, 1:] - finalized_scores[:, :-1]

        return finalized_scores, finalized_tokens
diff --git a/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/cuda_function_gen.py b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..9304f99eb8169a614f39babc830c84cac80e080b
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/cuda_function_gen.py
@@ -0,0 +1,223 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
def gen_forward():
    """Generate ``dynamicconv_cuda_forward.cu`` in the current directory.

    Emits a switch over the supported filter sizes / paddings that dispatches
    to the templated forward CUDA kernel.

    Fix: the C++/CUDA snippets had their angle-bracketed segments stripped
    (``std::vector`` return type, ``<<<...>>>`` launch configuration, and
    ``.data<scalar_t>()`` accessors), so the generated file could not compile;
    they are restored here to match the upstream fairseq generator.
    """

    # Supported convolution filter widths and candidate CUDA block sizes.
    kernels = [3, 5, 7, 15, 31, 63, 127, 255]
    blocks = [32, 64, 128, 256]

    head = """
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "dynamicconv_cuda.cuh"

std::vector<at::Tensor> dynamicconv_cuda_forward(at::Tensor input, at::Tensor weight, int padding_l) {

    at::DeviceGuard g(input.device());
    const auto minibatch = input.size(0);
    const auto numFeatures = input.size(1);
    const auto sequenceLength = input.size(2);

    const auto numHeads = weight.size(1);
    const auto filterSize = weight.size(2);

    const auto numFiltersInBlock = numFeatures / numHeads;
    const dim3 blocks(minibatch, numFeatures);

    auto output = at::zeros_like(input);
    auto stream = at::cuda::getCurrentCUDAStream();
"""

    switch = """
    switch(filterSize) {
"""

    case_k = """
        case {k}:
"""

    main_block = """
            if (padding_l == {pad}) {{
                AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "dynamicconv_forward", ([&] {{
                    dynamicconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t>
                    <<<blocks, {b_size}, 0, stream>>>(
                        input.data<scalar_t>(),
                        weight.data<scalar_t>(),
                        minibatch,
                        sequenceLength,
                        numFeatures,
                        numFiltersInBlock,
                        numHeads,
                        output.data<scalar_t>());
                }}));
            }} else
"""

    bad_padding = """
            {
                std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl;
            }
            break;\n
"""

    end = """
        default:
            std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl;
    }

    return {output};
}
"""

    with open("dynamicconv_cuda_forward.cu", "w") as forward:
        forward.write(head)
        forward.write(switch)
        for k in kernels:
            # Smallest candidate block size strictly larger than the filter
            # width; falls back to 32 when the filter exceeds every candidate.
            b_size = 32
            for b in blocks:
                if b > k:
                    b_size = b
                    break
            forward.write(case_k.format(k=k))
            # Each filter width supports "same" (k // 2) and causal (k - 1)
            # padding only.
            for pad in [k // 2, k - 1]:
                forward.write(main_block.format(k=k, b_size=b_size, pad=pad))
            forward.write(bad_padding)
        forward.write(end)
+
+
def gen_backward():
    """Generate ``dynamicconv_cuda_backward.cu`` in the current directory.

    Emits nested dispatch: first on sequence length (which fixes the CUDA
    block size / chunking), then on filter size and padding, calling the
    templated backward kernel.

    Fix: restores the angle-bracketed segments (``std::vector<at::Tensor>``,
    ``<<<...>>>`` launch configuration, ``.data<scalar_t>()``) that had been
    stripped from the generated snippets, matching the upstream generator.
    """

    # Filter widths, the max sequence length each supports as a block size,
    # and the fallback block size once that threshold is exceeded.
    kernels = [3, 5, 7, 15, 31, 63, 127, 255]
    thresh = [512, 512, 512, 512, 512, 380, 256, 256]
    min_block = [64, 64, 64, 64, 64, 64, 128, 256]
    seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]

    head = """
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "dynamicconv_cuda.cuh"

std::vector<at::Tensor> dynamicconv_cuda_backward(at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor weight) {

    at::DeviceGuard g(input.device());
    const auto minibatch = input.size(0);
    const auto numFeatures = input.size(1);
    const auto sequenceLength = input.size(2);

    const auto numHeads = weight.size(1);
    const auto filterSize = weight.size(2);

    const auto numFiltersInBlock = numFeatures / numHeads;
    auto numChunks = 1;

    auto gradInput = at::zeros_like(input);
    auto gradWeight = at::zeros_like(weight);
    auto stream = at::cuda::getCurrentCUDAStream();

    dim3 blocks(minibatch, numHeads, numChunks);
"""

    sequence_if = """
    if (sequenceLength < {seq}) {{
        switch(filterSize) {{
"""

    case_k = """
            case {k}:
"""

    chunks_reset = """
                numChunks = int(ceilf(sequenceLength/float({b_size})));
                blocks = dim3(minibatch, numHeads, numChunks);
"""

    main_block = """
                if (padding_l == {p}) {{
                    AT_DISPATCH_FLOATING_TYPES_AND_HALF(gradOutput.scalar_type(), "dynamicconv_backward", ([&] {{
                        dynamicconv_backward_kernel<{k}, {b_size}, {p}, scalar_t>
                        <<<blocks, {b_size}, 0, stream>>>(
                            gradOutput.data<scalar_t>(),
                            input.data<scalar_t>(),
                            weight.data<scalar_t>(),
                            minibatch,
                            sequenceLength,
                            numFeatures,
                            numFiltersInBlock,
                            numHeads,
                            gradWeight.data<scalar_t>(),
                            gradInput.data<scalar_t>());
                    }}));
                }} else
"""

    bad_padding = """
                {
                    std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl;
                }
                break;\n
"""

    bad_filter = """
            default:
                std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl;
        }
"""

    con_else = """
    } else
"""

    final_else = """
    {
        switch(filterSize) {
"""

    last_return = """
    }
    return {gradInput, gradWeight};
}
"""

    with open("dynamicconv_cuda_backward.cu", "w") as backward:
        backward.write(head)
        for seq in seqs:
            backward.write(sequence_if.format(seq=seq))
            for k, t, m in zip(kernels, thresh, min_block):
                backward.write(case_k.format(k=k))
                # Use the sequence length itself as the block size while it is
                # below the per-filter threshold; otherwise fall back.
                if seq <= t:
                    b_size = seq
                else:
                    b_size = m
                backward.write(chunks_reset.format(b_size=b_size))
                for p in [k // 2, k - 1]:
                    backward.write(main_block.format(k=k, b_size=b_size, p=p))
                backward.write(bad_padding)
            backward.write(bad_filter)
            backward.write(con_else)
        # Catch-all branch for sequences longer than every threshold.
        backward.write(final_else)
        for k, m in zip(kernels, min_block):
            backward.write(case_k.format(k=k))
            backward.write(chunks_reset.format(b_size=m))
            for p in [k // 2, k - 1]:
                backward.write(main_block.format(k=k, b_size=m, p=p))
            backward.write(bad_padding)
        backward.write(bad_filter)
        backward.write(last_return)
+
+
+if __name__ == "__main__":
+ gen_forward()
+ gen_backward()
diff --git a/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a683d2690d5e3058192afb1b3f4c1f3e2c41352
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamicconv_layer.py
@@ -0,0 +1,227 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import dynamicconv_cuda
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.incremental_decoding_utils import with_incremental_state
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.unfold import unfold1d
+from torch import nn
+from torch.autograd import Function
+
+
class dynamicconvFunction(Function):
    """Autograd bridge to the fused ``dynamicconv_cuda`` extension kernels."""

    @staticmethod
    def forward(ctx, x, weights, padding_l):
        # Remember the padding and both inputs for the backward kernel.
        ctx.padding_l = padding_l
        ctx.save_for_backward(x, weights)
        out = dynamicconv_cuda.forward(x, weights, padding_l)
        return out[0]

    @staticmethod
    def backward(ctx, grad_output):
        x, weights = ctx.saved_tensors
        grad_input, grad_weights = dynamicconv_cuda.backward(
            grad_output.contiguous(), ctx.padding_l, x, weights
        )
        # No gradient flows to the integer ``padding_l`` argument.
        return grad_input, grad_weights, None
+
+
@with_incremental_state
class DynamicconvLayer(nn.Module):
    """Dynamic convolution layer backed by the fused ``dynamicconv_cuda``
    kernel during training, with pure-PyTorch fallbacks (unfolded or
    band-matrix form) used for incremental decoding.

    Per time-step convolution weights are predicted from the input (or a
    separate ``query``) by ``weight_linear`` and shared across ``num_heads``
    channel groups. Input layout is T x B x C.
    """

    def __init__(
        self,
        input_size,
        kernel_size=1,
        padding_l=None,
        weight_softmax=False,
        num_heads=1,
        weight_dropout=0.0,
        bias=False,
        renorm_padding=False,
        conv_bias=False,
        query_size=None,
    ):

        super(DynamicconvLayer, self).__init__()
        self.input_size = input_size
        self.query_size = input_size if query_size is None else query_size
        self.kernel_size = kernel_size
        self.padding_l = padding_l
        self.num_heads = num_heads
        self.weight_softmax = weight_softmax
        self.weight_dropout_module = FairseqDropout(
            weight_dropout, module_name=self.__class__.__name__
        )
        self.renorm_padding = renorm_padding
        self.bias = bias

        # Predicts one kernel of size K per head for every time-step.
        self.weight_linear = nn.Linear(input_size, num_heads * kernel_size, bias)
        if conv_bias:
            self.conv_bias = nn.Parameter(torch.Tensor(input_size))
        else:
            self.conv_bias = None
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialize the weight predictor and zero all biases."""
        nn.init.xavier_uniform_(self.weight_linear.weight)
        if self.conv_bias is not None:
            nn.init.constant_(self.conv_bias, 0.0)
        # Fix: the original referenced ``self.weight_linaer`` (typo), raising
        # AttributeError whenever conv_bias=True, and it assumed
        # ``weight_linear`` always has a bias, which is false for bias=False
        # (the default). Guard and use the correct attribute.
        if self.weight_linear.bias is not None:
            nn.init.constant_(self.weight_linear.bias, 0.0)

    def forward(self, x, incremental_state=None, query=None, unfold=None):
        """Apply dynamic convolution.

        Args:
            x: input of shape T x B x C
            incremental_state: decoder state dict; when given, the slower but
                incremental-friendly BMM paths are used instead of the kernel
            query: optional tensor the weights are predicted from (defaults
                to ``x``; only valid without incremental state)
            unfold: force the unfolded path (default: auto for long inputs)

        Returns:
            output of shape T x B x C
        """
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        # R = C // H

        # during inference time, incremental BMM is faster
        if incremental_state is not None:
            unfold = (
                x.size(0) > 512 if unfold is None else unfold
            )  # use unfold mode as default for long sequence to save memory
            unfold = unfold or (incremental_state is not None)
            assert query is None

            if query is None:
                query = x
            if unfold:
                output = self._forward_unfolded(x, incremental_state, query)
            else:
                output = self._forward_expanded(x, incremental_state, query)

            if self.conv_bias is not None:
                output = output + self.conv_bias.view(1, 1, -1)

            return output

        # during training time, use CUDA kernel
        else:
            weight = self.weight_linear(x).view(T, B, H, K)
            if self.weight_softmax:
                weight = F.softmax(weight, dim=-1)
            if self.weight_dropout_module.p:
                weight = self.weight_dropout_module(weight)

            weight = weight.permute(1, 2, 3, 0).contiguous()
            # Cache the predicted filters for external inspection.
            self.filters = weight
            x = x.permute(1, 2, 0).contiguous()
            output = dynamicconvFunction.apply(x, weight, self.padding_l).permute(
                2, 0, 1
            )
            if self.conv_bias is not None:
                output = output + self.conv_bias.view(1, 1, -1)
            return output

    def reorder_incremental_state(self, incremental_state, new_order):
        """Reorder the buffered input along the batch dim (beam reordering)."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            input_buffer = input_buffer.index_select(1, new_order)
            self._set_input_buffer(incremental_state, input_buffer)

    def _get_input_buffer(self, incremental_state):
        return utils.get_incremental_state(self, incremental_state, "input_buffer")

    def _set_input_buffer(self, incremental_state, new_buffer):
        return utils.set_incremental_state(
            self, incremental_state, "input_buffer", new_buffer
        )

    def _forward_unfolded(self, x, incremental_state, query):
        """The conventional implementation of convolutions.
        Unfolding the input by having a window shifting to the right."""
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        R = C // H
        assert R * H == C == self.input_size

        weight = self.weight_linear(query).view(T * B * H, -1)

        # renorm_padding is only implemented in _forward_expanded
        assert not self.renorm_padding or incremental_state is not None

        if incremental_state is not None:
            # Append the new step to the rolling window of past inputs.
            input_buffer = self._get_input_buffer(incremental_state)
            if input_buffer is None:
                input_buffer = x.new()
            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
            if self.kernel_size > 1:
                # Keep only the most recent K-1 steps for the next call.
                self._set_input_buffer(
                    incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
                )
            x_unfold = x_unfold.view(T * B * H, R, -1)
        else:
            padding_l = self.padding_l
            if K > T and padding_l == K - 1:
                # Filter longer than the sequence: trim kernel and padding.
                weight = weight.narrow(1, K - T, T)
                K, padding_l = T, T - 1
            # unfold the input: T x B x C --> T' x B x C x K
            x_unfold = unfold1d(x, K, padding_l, 0)
            x_unfold = x_unfold.view(T * B * H, R, K)

        if self.weight_softmax and not self.renorm_padding:
            weight = F.softmax(weight, dim=1)
        weight = weight.narrow(1, 0, K)

        if incremental_state is not None:
            # Use only as many taps as buffered steps are available.
            weight = weight[:, -x_unfold.size(2) :]
            K = weight.size(1)

        if self.weight_softmax and self.renorm_padding:
            weight = F.softmax(weight, dim=1)

        weight = self.weight_dropout_module(weight, inplace=False)

        output = torch.bmm(x_unfold, weight.unsqueeze(2))  # T*B*H x R x 1
        output = output.view(T, B, C)
        return output

    def _forward_expanded(self, x, incremental_stat, query):
        """Turn the convolution filters into band matrices and do matrix multiplication.
        This is faster when the sequence is short, but less memory efficient.
        This is not used in the decoder during inference.
        """
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        R = C // H
        assert R * H == C == self.input_size
        weight = self.weight_linear(query).view(T * B * H, -1)

        if not self.renorm_padding:
            if self.weight_softmax:
                weight = F.softmax(weight, dim=1)
            weight = self.weight_dropout_module(weight, inplace=False)
        weight = weight.narrow(1, 0, K).contiguous()
        weight = weight.view(T, B * H, K).transpose(0, 1)

        x = x.view(T, B * H, R).transpose(0, 1)
        if self.weight_softmax and self.renorm_padding:
            # turn the convolution filters into band matrices
            weight_expanded = weight.new(B * H, T, T + K - 1).fill_(float("-inf"))
            weight_expanded.as_strided(
                (B * H, T, K), (T * (T + K - 1), T + K, 1)
            ).copy_(weight)
            weight_expanded = weight_expanded.narrow(2, self.padding_l, T)
            # normalize the weight over valid positions like self-attention
            weight_expanded = F.softmax(weight_expanded, dim=2)
            weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False)
        else:
            P = self.padding_l
            # For efficieny, we cut the kernel size and reduce the padding when the kernel is larger than the length
            if K > T and P == K - 1:
                weight = weight.narrow(2, K - T, T)
                K, P = T, T - 1
            # turn the convolution filters into band matrices
            weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False)
            weight_expanded.as_strided(
                (B * H, T, K), (T * (T + K - 1), T + K, 1)
            ).copy_(weight)
            weight_expanded = weight_expanded.narrow(2, P, T)  # B*H x T x T
        output = torch.bmm(weight_expanded, x)
        output = output.transpose(0, 1).contiguous().view(T, B, C)
        return output
diff --git a/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8a6af4285da3c40a01383541acf1f455ffc060fb
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/dynamicconv_layer/dynamiconv_cpu.cpp
@@ -0,0 +1,35 @@
// CPU pybind11 shim for the dynamicconv extension: forwards the Python-facing
// entry points to the CPU kernel implementations declared below.
// NOTE(review): the include targets and the std::vector element types appear
// to have been stripped in transit (bare "#include", "std::vector" with no
// template argument) — compare against the upstream fairseq
// dynamiconv_cpu.cpp (expected includes: <torch/extension.h>, <vector>)
// before building. TODO confirm the intended element type.
#include
#include

// Implemented elsewhere (CPU kernels); declared here for the wrappers below.
std::vector dynamicconv_cpu_forward(
    float* input,
    float* filters,
    int padding_l);

std::vector dynamicconv_cpu_backward(
    float* gradOutput,
    int padding_l,
    float* input,
    float* filters);

// Thin wrapper exposed to Python as "forward".
std::vector dynamicconv_forward(
    float* input,
    float* filters,
    int padding_l) {

    return dynamicconv_cpu_forward(input, filters, padding_l);
}

// Thin wrapper exposed to Python as "backward".
std::vector dynamicconv_backward(
    float* gradOutput,
    int padding_l,
    float* input,
    float* filters) {

    return dynamicconv_cpu_backward(gradOutput, padding_l, input, filters);
}

// Register both wrappers as the extension module's API.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &dynamicconv_forward, "dynamicconv forward (CPU)");
    m.def("backward", &dynamicconv_backward, "dynamicconv backward (CPU)");
}
diff --git a/fairseq-0.10.2/fairseq/modules/fairseq_dropout.py b/fairseq-0.10.2/fairseq/modules/fairseq_dropout.py
new file mode 100644
index 0000000000000000000000000000000000000000..f070a804e6c1e00b6c0db315b944305c2c41d807
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/fairseq_dropout.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from typing import List, Optional
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+logger = logging.getLogger(__name__)
+
+
class FairseqDropout(nn.Module):
    """Dropout that can optionally remain active at inference time.

    Behaves like ``F.dropout`` during training; at evaluation time it is a
    no-op unless ``make_generation_fast_`` enabled inference dropout for this
    module.
    """

    def __init__(self, p, module_name=None):
        super().__init__()
        # Dropout probability and the (optional) name used to match against
        # retain_dropout_modules.
        self.p = p
        self.module_name = module_name
        self.apply_during_inference = False

    def forward(self, x, inplace: bool = False):
        # Identity unless training, or inference dropout was requested.
        if not (self.training or self.apply_during_inference):
            return x
        return F.dropout(x, p=self.p, training=True, inplace=inplace)

    def make_generation_fast_(
        self,
        name: str,
        retain_dropout: bool = False,
        retain_dropout_modules: Optional[List[str]] = None,
        **kwargs
    ):
        if not retain_dropout:
            logger.info("Disabling dropout for module: {}".format(name))
            return

        if retain_dropout_modules is not None and self.module_name is None:
            # Cannot match against the retain list without a module name.
            logger.warning(
                "Cannot enable dropout during inference for module {} "
                "because module_name was not set".format(name)
            )
            return

        # A retain list of None means "retain for every module".
        if retain_dropout_modules is None or self.module_name in retain_dropout_modules:
            logger.info(
                "Enabling dropout during inference for module: {}".format(name)
            )
            self.apply_during_inference = True
diff --git a/fairseq-0.10.2/fairseq/modules/grad_multiply.py b/fairseq-0.10.2/fairseq/modules/grad_multiply.py
new file mode 100644
index 0000000000000000000000000000000000000000..08d15f55dfda9c61a1cf8641ea31424fe1d97f57
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/grad_multiply.py
@@ -0,0 +1,18 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+
class GradMultiply(torch.autograd.Function):
    """Identity in the forward pass; scales gradients by a constant factor
    in the backward pass."""

    @staticmethod
    def forward(ctx, x, scale):
        # Remember the multiplier for backward; return a copy of x.
        ctx.scale = scale
        return x.new(x)

    @staticmethod
    def backward(ctx, grad):
        # No gradient w.r.t. the scale argument itself.
        return grad * ctx.scale, None
diff --git a/fairseq-0.10.2/fairseq/modules/gumbel_vector_quantizer.py b/fairseq-0.10.2/fairseq/modules/gumbel_vector_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..47657bb0ab70864a3f7a0b00c226ccc9fc527fa3
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/gumbel_vector_quantizer.py
@@ -0,0 +1,199 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
class GumbelVectorQuantizer(nn.Module):
    def __init__(
        self,
        dim,
        num_vars,
        temp,
        groups,
        combine_groups,
        vq_dim,
        time_first,
        activation=nn.GELU(),
        weight_proj_depth=1,
        weight_proj_factor=1,
    ):
        """Vector quantization using gumbel softmax

        Args:
            dim: input dimension (channels)
            num_vars: number of quantized vectors per group
            temp: temperature for training. this should be a tuple of 3 elements: (start, stop, decay factor)
            groups: number of groups for vector quantization
            combine_groups: whether to use the vectors for all groups
            vq_dim: dimensionality of the resulting quantized vector
            time_first: if true, expect input in BxTxC format, otherwise in BxCxT
            activation: what activation to use (should be a module). this is only used if weight_proj_depth is > 1
            weight_proj_depth: number of layers (with activation in between) to project input before computing logits
            weight_proj_factor: this is used only if weight_proj_depth is > 1. scales the inner dimensionality of
                projections by this factor

        NOTE(review): the shared ``nn.GELU()`` default instance is harmless
        because GELU is stateless.
        """
        super().__init__()

        self.groups = groups
        self.combine_groups = combine_groups
        self.input_dim = dim
        self.num_vars = num_vars
        self.time_first = time_first

        assert (
            vq_dim % groups == 0
        ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation"

        # Each group contributes var_dim channels of the quantized output.
        var_dim = vq_dim // groups
        # With combine_groups, one codebook is shared by all groups.
        num_groups = groups if not combine_groups else 1

        # Codebook: num_groups blocks of num_vars entries, each of size var_dim.
        self.vars = nn.Parameter(torch.FloatTensor(1, num_groups * num_vars, var_dim))
        nn.init.uniform_(self.vars)

        if weight_proj_depth > 1:
            # MLP that projects inputs to per-group codebook logits.
            def block(input_dim, output_dim):
                return nn.Sequential(nn.Linear(input_dim, output_dim), activation)

            inner_dim = self.input_dim * weight_proj_factor
            self.weight_proj = nn.Sequential(
                *[
                    block(self.input_dim if i == 0 else inner_dim, inner_dim)
                    for i in range(weight_proj_depth - 1)
                ],
                nn.Linear(inner_dim, groups * num_vars),
            )
        else:
            # Single linear projection to the logits.
            self.weight_proj = nn.Linear(self.input_dim, groups * num_vars)
            nn.init.normal_(self.weight_proj.weight, mean=0, std=1)
            nn.init.zeros_(self.weight_proj.bias)

        assert len(temp) == 3, temp

        # Gumbel temperature schedule: (start, floor, per-update decay factor).
        self.max_temp, self.min_temp, self.temp_decay = temp
        self.curr_temp = self.max_temp
        # Lazily-built cache of flattened codebook indices (see below).
        self.codebook_indices = None

    def set_num_updates(self, num_updates):
        # Exponentially anneal the temperature, floored at min_temp.
        self.curr_temp = max(
            self.max_temp * self.temp_decay ** num_updates, self.min_temp
        )

    def get_codebook_indices(self):
        """Return flat indices into ``self.vars`` enumerating every possible
        combination of per-group codebook entries (built once, then cached)."""
        if self.codebook_indices is None:
            from itertools import product

            # Cartesian product over groups: num_vars ** groups combinations.
            p = [range(self.num_vars)] * self.groups
            inds = list(product(*p))
            self.codebook_indices = torch.tensor(
                inds, dtype=torch.long, device=self.vars.device
            ).flatten()

            if not self.combine_groups:
                # Offset each group's indices into its own codebook block.
                self.codebook_indices = self.codebook_indices.view(
                    self.num_vars ** self.groups, -1
                )
                for b in range(1, self.groups):
                    self.codebook_indices[:, b] += self.num_vars * b
                self.codebook_indices = self.codebook_indices.flatten()
        return self.codebook_indices

    def codebook(self):
        # Full codebook: one concatenated vector per index combination.
        indices = self.get_codebook_indices()
        return (
            self.vars.squeeze(0)
            .index_select(0, indices)
            .view(self.num_vars ** self.groups, -1)
        )

    def sample_from_codebook(self, b, n):
        # Draw b*n random full codewords (with replacement), shaped (b, n, -1).
        indices = self.get_codebook_indices()
        indices = indices.view(-1, self.groups)
        cb_size = indices.size(0)
        assert (
            n < cb_size
        ), f"sample size {n} is greater than size of codebook {cb_size}"
        sample_idx = torch.randint(low=0, high=cb_size, size=(b * n,))
        indices = indices[sample_idx]

        z = self.vars.squeeze(0).index_select(0, indices.flatten()).view(b, n, -1)
        return z

    def to_codebook_index(self, indices):
        # Collapse per-group indices into a single base-num_vars integer.
        res = indices.new_full(indices.shape[:-1], 0)
        for i in range(self.groups):
            exponent = self.groups - i - 1
            res += indices[..., i] * (self.num_vars ** exponent)
        return res

    def forward_idx(self, x):
        # Convenience wrapper returning (quantized x, per-group target indices).
        res = self.forward(x, produce_targets=True)
        return res["x"], res["targets"]

    def forward(self, x, produce_targets=False):

        result = {"num_vars": self.num_vars * self.groups}

        if not self.time_first:
            x = x.transpose(1, 2)  # BxCxT -> BxTxC

        bsz, tsz, fsz = x.shape
        # Project every (batch, time) position to per-group codebook logits.
        x = x.reshape(-1, fsz)
        x = self.weight_proj(x)
        x = x.view(bsz * tsz * self.groups, -1)

        # Hard (argmax) one-hot assignment, used at eval time and for the
        # code-usage perplexity diagnostic.
        _, k = x.max(-1)
        hard_x = (
            x.new_zeros(*x.shape)
            .scatter_(-1, k.view(-1, 1), 1.0)
            .view(bsz * tsz, self.groups, -1)
        )
        hard_probs = torch.mean(hard_x.float(), dim=0)
        result["code_perplexity"] = torch.exp(
            -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1)
        ).sum()

        # Perplexity of the soft (softmax) code distribution.
        avg_probs = torch.softmax(
            x.view(bsz * tsz, self.groups, -1).float(), dim=-1
        ).mean(dim=0)
        result["prob_perplexity"] = torch.exp(
            -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1)
        ).sum()

        result["temp"] = self.curr_temp

        if self.training:
            # Straight-through gumbel-softmax sample (one-hot forward,
            # soft gradients).
            x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=True).type_as(x)
        else:
            x = hard_x

        x = x.view(bsz * tsz, -1)

        vars = self.vars
        if self.combine_groups:
            # Reuse the single shared codebook for every group.
            vars = vars.repeat(1, self.groups, 1)

        if produce_targets:
            # Selected codebook index per group, detached for use as targets.
            result["targets"] = (
                x.view(bsz * tsz * self.groups, -1)
                .argmax(dim=-1)
                .view(bsz, tsz, self.groups)
                .detach()
            )

        # Select (via the one-hot weights) and concatenate per-group vectors.
        x = x.unsqueeze(-1) * vars
        x = x.view(bsz * tsz, self.groups, self.num_vars, -1)
        x = x.sum(-2)
        x = x.view(bsz, tsz, -1)

        if not self.time_first:
            x = x.transpose(1, 2)  # BTC -> BCT

        result["x"] = x

        return result
diff --git a/fairseq-0.10.2/fairseq/modules/kmeans_vector_quantizer.py b/fairseq-0.10.2/fairseq/modules/kmeans_vector_quantizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..040db1e83e775a3bb59d5263d22aae9276a83f22
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/kmeans_vector_quantizer.py
@@ -0,0 +1,127 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+from fairseq.modules import Fp32GroupNorm
+
+
class KmeansVectorQuantizer(nn.Module):
    def __init__(
        self, dim, num_vars, groups, combine_groups, vq_dim, time_first, gamma=0.25
    ):
        """Vector quantization using straight pass-through estimator (i.e. kmeans)

        Args:
            dim: input dimension (channels)
            num_vars: number of quantized vectors per group
            groups: number of groups for vector quantization
            combine_groups: whether to use the vectors for all groups
            vq_dim: dimensionality of the resulting quantized vector
            time_first: if true, expect input in BxTxC format, otherwise in BxCxT
            gamma: commitment loss coefficient
        """
        super().__init__()

        self.groups = groups
        self.combine_groups = combine_groups
        self.input_dim = dim
        self.num_vars = num_vars
        self.vq_dim = vq_dim
        self.time_first = time_first

        assert (
            vq_dim % groups == 0
        ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation"

        # Per-group codeword dimensionality; group outputs are concatenated.
        self.var_dim = vq_dim // groups
        # With combine_groups, one codebook is shared by all groups.
        num_groups = groups if not combine_groups else 1

        # Codebook of shape (num_vars, num_groups, var_dim), small random init.
        self.embedding = nn.Parameter(
            0.01 * torch.randn(num_vars, num_groups, self.var_dim)
        )
        # Grouped 1x1 conv + fp32 group norm maps inputs into codebook space.
        # NOTE(review): forward's view of the projection output as
        # (bsz, groups, var_dim, tsz) implies dim == vq_dim — confirm callers.
        self.projection = nn.Sequential(
            nn.Conv1d(dim, dim, kernel_size=1, groups=groups, bias=False),
            Fp32GroupNorm(groups, dim),
        )
        self.gamma = gamma
        self.mse_mean = nn.MSELoss(reduction="mean")

    def _pass_grad(self, x, y):
        """Manually set gradient for backward pass.
        for y = f(x), ensure that during the backward pass,
        dL/dy = dL/dx regardless of f(x).
        Returns:
            y, with the gradient forced to be dL/dy = dL/dx.
        """

        return y.detach() + (x - x.detach())

    @property
    def expand_embedding(self):
        # Tile the shared codebook across groups when combine_groups is set,
        # so downstream indexing can always use a per-group axis.
        if self.combine_groups:
            return self.embedding.expand(self.num_vars, self.groups, self.var_dim)
        return self.embedding

    def forward_idx(self, x):
        """Quantize x and also return the selected codebook indices."""
        res = self.forward(x, produce_targets=True)
        return res["x"], res["targets"]

    def forward(self, x, produce_targets=False):
        """Quantize inputs by nearest-codeword (kmeans-style) assignment.

        Returns a dict with the straight-through quantized output ``"x"``,
        ``"num_vars"``, ``"code_perplexity"``, ``"kmeans_loss"``, and
        optionally the chosen indices ``"targets"``.
        """

        result = {"num_vars": self.num_vars}

        # Work internally in BxCxT; convert from BxTxC if needed.
        if self.time_first:
            x = x.transpose(1, 2)

        bsz, fsz, tsz = x.shape

        ze = self.projection(x)
        # (bsz, tsz, groups, var_dim): one candidate vector per group/frame.
        ze_ = ze.view(bsz, self.groups, self.var_dim, tsz).permute(0, 3, 1, 2)
        # L2 distance of every frame to every codeword:
        # d has shape (num_vars, bsz, tsz, groups).
        d = (
            (ze_.unsqueeze(0) - self.expand_embedding.unsqueeze(1).unsqueeze(1))
            .view(self.num_vars, bsz, tsz, self.groups, -1)
            .norm(dim=-1, p=2)
        )
        # Nearest codeword per (batch, time, group).
        idx = d.argmin(dim=0)
        # Gather the chosen codewords and concatenate groups back to BxCxT.
        zq = (
            torch.stack(
                [
                    self.expand_embedding[idx[..., group], group]
                    for group in range(self.groups)
                ],
                dim=-2,
            )
            .view(bsz, tsz, self.groups * self.var_dim)
            .permute(0, 2, 1)
        )
        assert ze.shape == zq.shape, (ze.shape, zq.shape)
        # Straight-through estimator: forward zq, backward grads flow to ze.
        x = self._pass_grad(ze, zq)

        # Perplexity of empirical codeword usage (diagnostic); 1e-7 guards
        # log(0) for codewords that were never selected.
        hard_x = (
            idx.new_zeros(bsz * tsz * self.groups, self.num_vars)
            .scatter_(-1, idx.view(-1, 1), 1.0)
            .view(bsz * tsz, self.groups, -1)
        )
        hard_probs = torch.mean(hard_x.float(), dim=0)
        result["code_perplexity"] = torch.exp(
            -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1)
        ).sum()

        if produce_targets:
            result["targets"] = idx

        if self.time_first:
            x = x.transpose(1, 2)  # BCT -> BTC
        result["x"] = x

        # Compute both VQ losses in fp32: codebook (latent) loss pulls
        # codewords toward the encoder output; commitment loss (scaled by
        # gamma) pulls the encoder output toward its assigned codeword.
        ze = ze.float()
        zq = zq.float()
        latent_loss = self.mse_mean(zq, ze.detach())
        commitment_loss = self.mse_mean(ze, zq.detach())

        result["kmeans_loss"] = latent_loss + self.gamma * commitment_loss

        return result
diff --git a/fairseq-0.10.2/fairseq/modules/layer_norm.py b/fairseq-0.10.2/fairseq/modules/layer_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..234609d9e213a650e0032aaa0ca0462a818bfead
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/layer_norm.py
@@ -0,0 +1,50 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
# apex's FusedLayerNorm is an optional, faster CUDA implementation; fall back
# to torch.nn.LayerNorm (see LayerNorm below) when apex is not installed.
try:
    from apex.normalization import FusedLayerNorm as _FusedLayerNorm

    has_fused_layernorm = True

    class FusedLayerNorm(_FusedLayerNorm):
        # Excluded from TorchScript: the apex op cannot be scripted.
        @torch.jit.unused
        def forward(self, x):
            if not x.is_cuda:
                return super().forward(x)
            else:
                # Pin the CUDA device context so the fused kernel launches
                # on the tensor's own device.
                with torch.cuda.device(x.device):
                    return super().forward(x)


except ImportError:
    has_fused_layernorm = False
+
+
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
    """Build a LayerNorm, preferring apex's fused CUDA kernel when usable.

    ``export=True`` (or running under TorchScript) forces the plain
    ``torch.nn.LayerNorm`` implementation.
    """
    if torch.jit.is_scripting():
        # TorchScript cannot go through the apex extension.
        export = True
    use_fused = not export and torch.cuda.is_available() and has_fused_layernorm
    if use_fused:
        return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
+
+
class Fp32LayerNorm(nn.LayerNorm):
    """LayerNorm that always normalizes in float32.

    The input is upcast to fp32, normalized (with fp32 affine parameters),
    and the result is cast back to the input's dtype — useful for numerical
    stability under mixed-precision training.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, input):
        weight = None if self.weight is None else self.weight.float()
        bias = None if self.bias is None else self.bias.float()
        normalized = F.layer_norm(
            input.float(), self.normalized_shape, weight, bias, self.eps
        )
        return normalized.type_as(input)
diff --git a/fairseq-0.10.2/fairseq/modules/learned_positional_embedding.py b/fairseq-0.10.2/fairseq/modules/learned_positional_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..378d0f707183dd344dbb9288dda394b11053acf0
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/learned_positional_embedding.py
@@ -0,0 +1,61 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import utils
+from torch import Tensor
+
+
class LearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    Padding ids are ignored by either offsetting based on padding_idx
    or by setting padding_idx to None and ensuring that the appropriate
    position ids are passed to the forward function.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.onnx_trace = False
        # When a padding index exists, positions 0..padding_idx are reserved,
        # which shrinks the number of usable positions accordingly.
        if self.padding_idx is None:
            self.max_positions = self.num_embeddings
        else:
            self.max_positions = self.num_embeddings - self.padding_idx - 1

    def forward(
        self,
        input: Tensor,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        positions: Optional[Tensor] = None,
    ):
        """Input is expected to be of size [bsz x seqlen]."""
        assert (positions is None) or (
            self.padding_idx is None
        ), "If positions is pre-computed then padding_idx should not be set."

        if positions is None:
            if incremental_state is None:
                positions = utils.make_positions(
                    input, self.padding_idx, onnx_trace=self.onnx_trace
                )
            else:
                # Single-step decoding: every token in the batch sits at the
                # same position. The int() cast works around ONNX export
                # failures seen without it.
                step = int(self.padding_idx + input.size(1))
                positions = torch.zeros(
                    (1, 1), device=input.device, dtype=input.dtype
                ).fill_(step)
        return F.embedding(
            positions,
            self.weight,
            self.padding_idx,
            self.max_norm,
            self.norm_type,
            self.scale_grad_by_freq,
            self.sparse,
        )
diff --git a/fairseq-0.10.2/fairseq/modules/lightconv_layer/__init__.py b/fairseq-0.10.2/fairseq/modules/lightconv_layer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b2a99c1227f827768911e5e22e79f6865ffbfd3
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/lightconv_layer/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .lightconv_layer import LightconvLayer # noqa
diff --git a/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda.cpp b/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4bf6b5ad365d604bd91eda384bb422857b640744
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda.cpp
@@ -0,0 +1,54 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
#include <torch/extension.h>
#include <vector>
+
+std::vector lightconv_cuda_forward(
+ at::Tensor input,
+ at::Tensor filters,
+ int padding_l);
+
+std::vector lightconv_cuda_backward(
+ at::Tensor gradOutput,
+ int padding_l,
+ at::Tensor input,
+ at::Tensor filters);
+
+
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+std::vector lightconv_forward(
+ at::Tensor input,
+ at::Tensor filters,
+ int padding_l) {
+
+ CHECK_INPUT(input);
+ CHECK_INPUT(filters);
+
+ return lightconv_cuda_forward(input, filters, padding_l);
+}
+
+std::vector lightconv_backward(
+ at::Tensor gradOutput,
+ int padding_l,
+ at::Tensor input,
+ at::Tensor filters) {
+
+ CHECK_INPUT(gradOutput);
+ CHECK_INPUT(input);
+ CHECK_INPUT(filters);
+
+ return lightconv_cuda_backward(gradOutput, padding_l, input, filters);
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("forward", &lightconv_forward, "lighconv forward (CUDA)");
+ m.def("backward", &lightconv_backward, "lighconv backward (CUDA)");
+}
diff --git a/fairseq-0.10.2/fairseq/modules/lightconv_layer/setup.py b/fairseq-0.10.2/fairseq/modules/lightconv_layer/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..052635be79b466d0ad56cf5cf607bd10c2297ecf
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/lightconv_layer/setup.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+
# Build the fused lightconv CUDA extension (module name: lightconv_cuda).
# Typical usage: `python setup.py build_ext --inplace` from this directory;
# requires a CUDA toolchain compatible with the installed torch.
setup(
    name="lightconv_layer",
    ext_modules=[
        CUDAExtension(
            "lightconv_cuda",
            [
                "lightconv_cuda.cpp",
                "lightconv_cuda_kernel.cu",
            ],
        ),
    ],
    cmdclass={"build_ext": BuildExtension},
)
diff --git a/fairseq-0.10.2/fairseq/modules/sparse_transformer_sentence_encoder_layer.py b/fairseq-0.10.2/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d95da59c2471bfa858fd627605196d7f41f9ec12
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/sparse_transformer_sentence_encoder_layer.py
@@ -0,0 +1,51 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from fairseq.modules import TransformerSentenceEncoderLayer
+from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention
+
+
class SparseTransformerSentenceEncoderLayer(TransformerSentenceEncoderLayer):
    """
    Implements a Sparse Transformer Encoder Layer (see SparseMultiheadAttention)
    """

    def __init__(
        self,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = "relu",
        export: bool = False,
        is_bidirectional: bool = True,
        stride: int = 32,
        expressivity: int = 8,
    ) -> None:

        # Build the standard layer first (FFN, layer norms, dense attention)...
        super().__init__(
            embedding_dim,
            ffn_embedding_dim,
            num_attention_heads,
            dropout,
            attention_dropout,
            activation_dropout,
            activation_fn,
            export,
        )

        # ...then replace the dense self-attention with the sparse variant.
        # stride/expressivity control the fixed sparse attention pattern;
        # see SparseMultiheadAttention for their exact semantics.
        self.self_attn = SparseMultiheadAttention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            add_bias_kv=False,
            add_zero_attn=False,
            self_attention=True,
            is_bidirectional=is_bidirectional,
            stride=stride,
            expressivity=expressivity,
        )
diff --git a/fairseq-0.10.2/fairseq/modules/transformer_layer.py b/fairseq-0.10.2/fairseq/modules/transformer_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..48cd4c731445ea9343fc4523f8379133015f4ed1
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/transformer_layer.py
@@ -0,0 +1,423 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Optional
+
+import torch
+import torch.nn as nn
+from fairseq import utils
+from fairseq.modules import LayerNorm, MultiheadAttention
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.quant_noise import quant_noise
+from torch import Tensor
+
+
class TransformerEncoderLayer(nn.Module):
    """Encoder layer block.

    In the original paper each operation (multi-head attention or FFN) is
    postprocessed with: `dropout -> add residual -> layernorm`. In the
    tensor2tensor code they suggest that learning is more robust when
    preprocessing each layer with layernorm and postprocessing with:
    `dropout -> add residual`. We default to the approach in the paper, but the
    tensor2tensor approach can be enabled by setting
    *args.encoder_normalize_before* to ``True``.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """

    def __init__(self, args):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        # Quant-Noise settings (structured weight dropout for quantization).
        self.quant_noise = getattr(args, "quant_noise_pq", 0)
        self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8)
        self.self_attn = self.build_self_attention(self.embed_dim, args)
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, "activation_fn", "relu")
        )
        activation_dropout_p = getattr(args, "activation_dropout", 0)
        if activation_dropout_p == 0:
            # for backwards compatibility with models that use args.relu_dropout
            activation_dropout_p = getattr(args, "relu_dropout", 0)
        self.activation_dropout_module = FairseqDropout(
            float(activation_dropout_p), module_name=self.__class__.__name__
        )
        # True => pre-norm (tensor2tensor style); False => post-norm (paper).
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = self.build_fc1(
            self.embed_dim,
            args.encoder_ffn_embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )
        self.fc2 = self.build_fc2(
            args.encoder_ffn_embed_dim,
            self.embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )

        self.final_layer_norm = LayerNorm(self.embed_dim)

    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
        # Factored out so subclasses can swap the FFN projection type.
        return quant_noise(
            nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size
        )

    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
        return quant_noise(
            nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size
        )

    def build_self_attention(self, embed_dim, args):
        # Factored out so subclasses can substitute a different attention.
        return MultiheadAttention(
            embed_dim,
            args.encoder_attention_heads,
            dropout=args.attention_dropout,
            self_attention=True,
            q_noise=self.quant_noise,
            qn_block_size=self.quant_noise_block_size,
        )

    def residual_connection(self, x, residual):
        return residual + x

    def upgrade_state_dict_named(self, state_dict, name):
        """
        Rename layer norm states from `...layer_norms.0.weight` to
        `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to
        `...final_layer_norm.weight`
        """
        layer_norm_map = {"0": "self_attn_layer_norm", "1": "final_layer_norm"}
        for old, new in layer_norm_map.items():
            for m in ("weight", "bias"):
                k = "{}.layer_norms.{}.{}".format(name, old, m)
                if k in state_dict:
                    state_dict["{}.{}.{}".format(name, new, m)] = state_dict[k]
                    del state_dict[k]

    def forward(self, x, encoder_padding_mask, attn_mask: Optional[Tensor] = None):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, seq_len)` where padding elements are indicated by ``1``.
            attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`,
                where `tgt_len` is the length of output and `src_len` is the
                length of input, though here both are equal to `seq_len`.
                `attn_mask[tgt_i, src_j] = 1` means that when calculating the
                embedding for `tgt_i`, we exclude (mask out) `src_j`. This is
                useful for strided self-attention.

        Returns:
            encoded output of shape `(seq_len, batch, embed_dim)`
        """
        # anything in original attn_mask = 1, becomes -1e8
        # anything in original attn_mask = 0, becomes 0
        # Note that we cannot use -inf here, because at some edge cases,
        # the attention weight (before softmax) for some padded element in query
        # will become -inf, which results in NaN in model parameters
        if attn_mask is not None:
            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)

        # --- Self-attention sub-block: (pre-)norm, attend, dropout, residual.
        residual = x
        if self.normalize_before:
            x = self.self_attn_layer_norm(x)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=encoder_padding_mask,
            attn_mask=attn_mask,
        )
        x = self.dropout_module(x)
        x = self.residual_connection(x, residual)
        if not self.normalize_before:
            x = self.self_attn_layer_norm(x)

        # --- Feed-forward sub-block: (pre-)norm, FFN, dropout, residual.
        residual = x
        if self.normalize_before:
            x = self.final_layer_norm(x)

        x = self.activation_fn(self.fc1(x))
        x = self.activation_dropout_module(x)
        x = self.fc2(x)
        x = self.dropout_module(x)
        x = self.residual_connection(x, residual)
        if not self.normalize_before:
            x = self.final_layer_norm(x)
        return x
+
+
class TransformerDecoderLayer(nn.Module):
    """Decoder layer block.

    In the original paper each operation (multi-head attention, encoder
    attention or FFN) is postprocessed with: `dropout -> add residual ->
    layernorm`. In the tensor2tensor code they suggest that learning is more
    robust when preprocessing each layer with layernorm and postprocessing with:
    `dropout -> add residual`. We default to the approach in the paper, but the
    tensor2tensor approach can be enabled by setting
    *args.decoder_normalize_before* to ``True``.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
    """

    def __init__(
        self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
    ):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        # Quant-Noise settings (structured weight dropout for quantization).
        self.quant_noise = getattr(args, "quant_noise_pq", 0)
        self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8)

        # If set, self-attention also attends over the encoder output
        # (keys/values are [encoder_out; x] in forward below).
        self.cross_self_attention = getattr(args, "cross_self_attention", False)

        self.self_attn = self.build_self_attention(
            self.embed_dim,
            args,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
        )

        self.activation_fn = utils.get_activation_fn(
            activation=str(args.activation_fn)
            if getattr(args, "activation_fn", None) is not None
            else "relu"
        )
        activation_dropout_p = getattr(args, "activation_dropout", 0)
        if activation_dropout_p == 0:
            # for backwards compatibility with models that use args.relu_dropout
            activation_dropout_p = getattr(args, "relu_dropout", 0)
        self.activation_dropout_module = FairseqDropout(
            float(activation_dropout_p), module_name=self.__class__.__name__
        )
        # True => pre-norm (tensor2tensor style); False => post-norm (paper).
        self.normalize_before = args.decoder_normalize_before

        # use layerNorm rather than FusedLayerNorm for exporting.
        # char_inputs can be used to determint this.
        # TODO  remove this once we update apex with the fix
        export = getattr(args, "char_inputs", False)
        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = self.build_encoder_attention(self.embed_dim, args)
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

        self.fc1 = self.build_fc1(
            self.embed_dim,
            args.decoder_ffn_embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )
        self.fc2 = self.build_fc2(
            args.decoder_ffn_embed_dim,
            self.embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )

        self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
        self.need_attn = True

        self.onnx_trace = False

    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
        # Factored out so subclasses can swap the FFN projection type.
        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)

    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)

    def build_self_attention(
        self, embed_dim, args, add_bias_kv=False, add_zero_attn=False
    ):
        return MultiheadAttention(
            embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            # Not pure self-attention when cross_self_attention mixes in
            # encoder states as additional keys/values.
            self_attention=not getattr(args, "cross_self_attention", False),
            q_noise=self.quant_noise,
            qn_block_size=self.quant_noise_block_size,
        )

    def build_encoder_attention(self, embed_dim, args):
        return MultiheadAttention(
            embed_dim,
            args.decoder_attention_heads,
            kdim=getattr(args, "encoder_embed_dim", None),
            vdim=getattr(args, "encoder_embed_dim", None),
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
            q_noise=self.quant_noise,
            qn_block_size=self.quant_noise_block_size,
        )

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def residual_connection(self, x, residual):
        return residual + x

    def forward(
        self,
        x,
        encoder_out: Optional[torch.Tensor] = None,
        encoder_padding_mask: Optional[torch.Tensor] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        prev_self_attn_state: Optional[List[torch.Tensor]] = None,
        prev_attn_state: Optional[List[torch.Tensor]] = None,
        self_attn_mask: Optional[torch.Tensor] = None,
        self_attn_padding_mask: Optional[torch.Tensor] = None,
        need_attn: bool = False,
        need_head_weights: bool = False,
    ):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor, optional): binary
                ByteTensor of shape `(batch, src_len)` where padding
                elements are indicated by ``1``.
            need_attn (bool, optional): return attention weights
            need_head_weights (bool, optional): return attention weights
                for each head (default: return average over heads).

        Returns:
            encoded output of shape `(seq_len, batch, embed_dim)`
        """
        if need_head_weights:
            need_attn = True

        # --- Self-attention sub-block.
        residual = x
        if self.normalize_before:
            x = self.self_attn_layer_norm(x)
        # Restore a caller-provided cached key/value state (used e.g. when
        # re-scoring) into the incremental-state buffer before attending.
        if prev_self_attn_state is not None:
            prev_key, prev_value = prev_self_attn_state[:2]
            saved_state: Dict[str, Optional[Tensor]] = {
                "prev_key": prev_key,
                "prev_value": prev_value,
            }
            if len(prev_self_attn_state) >= 3:
                saved_state["prev_key_padding_mask"] = prev_self_attn_state[2]
            assert incremental_state is not None
            self.self_attn._set_input_buffer(incremental_state, saved_state)
        _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state)
        if self.cross_self_attention and not (
            incremental_state is not None
            and _self_attn_input_buffer is not None
            and "prev_key" in _self_attn_input_buffer
        ):
            # cross_self_attention: prepend encoder states to the keys/values
            # and pad the masks accordingly (encoder positions are never
            # masked here — zeros are concatenated on the left).
            if self_attn_mask is not None:
                assert encoder_out is not None
                self_attn_mask = torch.cat(
                    (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1
                )
            if self_attn_padding_mask is not None:
                if encoder_padding_mask is None:
                    assert encoder_out is not None
                    encoder_padding_mask = self_attn_padding_mask.new_zeros(
                        encoder_out.size(1), encoder_out.size(0)
                    )
                self_attn_padding_mask = torch.cat(
                    (encoder_padding_mask, self_attn_padding_mask), dim=1
                )
            assert encoder_out is not None
            y = torch.cat((encoder_out, x), dim=0)
        else:
            y = x

        x, attn = self.self_attn(
            query=x,
            key=y,
            value=y,
            key_padding_mask=self_attn_padding_mask,
            incremental_state=incremental_state,
            need_weights=False,
            attn_mask=self_attn_mask,
        )
        x = self.dropout_module(x)
        x = self.residual_connection(x, residual)
        if not self.normalize_before:
            x = self.self_attn_layer_norm(x)

        # --- Encoder-decoder attention sub-block (skipped if no encoder).
        if self.encoder_attn is not None and encoder_out is not None:
            residual = x
            if self.normalize_before:
                x = self.encoder_attn_layer_norm(x)
            if prev_attn_state is not None:
                prev_key, prev_value = prev_attn_state[:2]
                saved_state: Dict[str, Optional[Tensor]] = {
                    "prev_key": prev_key,
                    "prev_value": prev_value,
                }
                if len(prev_attn_state) >= 3:
                    saved_state["prev_key_padding_mask"] = prev_attn_state[2]
                assert incremental_state is not None
                self.encoder_attn._set_input_buffer(incremental_state, saved_state)

            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                static_kv=True,
                need_weights=need_attn or (not self.training and self.need_attn),
                need_head_weights=need_head_weights,
            )
            x = self.dropout_module(x)
            x = self.residual_connection(x, residual)
            if not self.normalize_before:
                x = self.encoder_attn_layer_norm(x)

        # --- Feed-forward sub-block.
        residual = x
        if self.normalize_before:
            x = self.final_layer_norm(x)

        x = self.activation_fn(self.fc1(x))
        x = self.activation_dropout_module(x)
        x = self.fc2(x)
        x = self.dropout_module(x)
        x = self.residual_connection(x, residual)
        if not self.normalize_before:
            x = self.final_layer_norm(x)
        # ONNX export additionally returns the cached self-attention state so
        # the exported graph can thread it explicitly between steps.
        if self.onnx_trace and incremental_state is not None:
            saved_state = self.self_attn._get_input_buffer(incremental_state)
            assert saved_state is not None
            if self_attn_padding_mask is not None:
                self_attn_state = [
                    saved_state["prev_key"],
                    saved_state["prev_value"],
                    saved_state["prev_key_padding_mask"],
                ]
            else:
                self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]]
            return x, attn, self_attn_state
        return x, attn, None

    def make_generation_fast_(self, need_attn: bool = False, **kwargs):
        self.need_attn = need_attn
+
+
def Linear(in_features, out_features, bias=True):
    """Build an ``nn.Linear`` with Xavier-uniform weights and a zero bias."""
    layer = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(layer.weight)
    if bias:
        nn.init.constant_(layer.bias, 0.0)
    return layer
diff --git a/fairseq-0.10.2/fairseq/modules/transformer_sentence_encoder_layer.py b/fairseq-0.10.2/fairseq/modules/transformer_sentence_encoder_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..3589c60fe6843c549cfcb94a26cd27bad1fd8033
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/transformer_sentence_encoder_layer.py
@@ -0,0 +1,134 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Callable, Optional
+
+import torch
+import torch.nn as nn
+from fairseq import utils
+from fairseq.modules import LayerNorm, MultiheadAttention
+from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.quant_noise import quant_noise
+
+
class TransformerSentenceEncoderLayer(nn.Module):
    """
    Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained
    models.

    Uses post-norm ordering throughout: attend/FFN -> dropout -> residual ->
    layer norm (see forward).
    """

    def __init__(
        self,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = "relu",
        export: bool = False,
        q_noise: float = 0.0,
        qn_block_size: int = 8,
        init_fn: Callable = None,
    ) -> None:
        super().__init__()

        # Optional hook run before any submodule is constructed (e.g. to
        # seed RNGs for deterministic initialization).
        if init_fn is not None:
            init_fn()

        # Initialize parameters
        self.embedding_dim = embedding_dim
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )
        self.activation_dropout_module = FairseqDropout(
            activation_dropout, module_name=self.__class__.__name__
        )

        # Initialize blocks
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.self_attn = self.build_self_attention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            self_attention=True,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )

        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)

        self.fc1 = self.build_fc1(
            self.embedding_dim,
            ffn_embedding_dim,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )
        self.fc2 = self.build_fc2(
            ffn_embedding_dim,
            self.embedding_dim,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )

        # layer norm associated with the position wise feed-forward NN
        self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)

    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
        # Factored out so subclasses can swap the FFN projection type.
        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)

    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)

    def build_self_attention(
        self,
        embed_dim,
        num_attention_heads,
        dropout,
        self_attention,
        q_noise,
        qn_block_size,
    ):
        # NOTE(review): the `self_attention` parameter is accepted but not
        # forwarded — MultiheadAttention is always built with
        # self_attention=True here.
        return MultiheadAttention(
            embed_dim,
            num_attention_heads,
            dropout=dropout,
            self_attention=True,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )

    def forward(
        self,
        x: torch.Tensor,
        self_attn_mask: Optional[torch.Tensor] = None,
        self_attn_padding_mask: Optional[torch.Tensor] = None,
    ):
        """
        LayerNorm is applied either before or after the self-attention/ffn
        modules similar to the original Transformer implementation.
        """
        # Self-attention sub-block (post-norm ordering).
        residual = x
        x, attn = self.self_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=self_attn_padding_mask,
            need_weights=False,
            attn_mask=self_attn_mask,
        )
        x = self.dropout_module(x)
        x = residual + x
        x = self.self_attn_layer_norm(x)

        # Feed-forward sub-block (post-norm ordering).
        residual = x
        x = self.activation_fn(self.fc1(x))
        x = self.activation_dropout_module(x)
        x = self.fc2(x)
        x = self.dropout_module(x)
        x = residual + x
        x = self.final_layer_norm(x)
        return x, attn
diff --git a/fairseq-0.10.2/fairseq/modules/unfold.py b/fairseq-0.10.2/fairseq/modules/unfold.py
new file mode 100644
index 0000000000000000000000000000000000000000..138272f1ef4f673b29e36aed4531106f7ce95968
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/unfold.py
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch.nn.functional as F
+
+
def unfold1d(x, kernel_size, padding_l, pad_value=0):
    """unfold T x B x C to T x B x C x K"""
    if kernel_size <= 1:
        # A window of one element is just the input with a trailing axis.
        return x.unsqueeze(3)
    T, B, C = x.size()
    # Pad the time axis so every position has a full window of K frames:
    # padding_l frames on the left, the remainder on the right.
    padded = F.pad(
        x, (0, 0, 0, 0, padding_l, kernel_size - 1 - padding_l), value=pad_value
    )
    # Zero-copy sliding window: element (t, b, c, k) aliases padded[t + k, b, c].
    return padded.as_strided((T, B, C, kernel_size), (B * C, C, 1, B * C))