diff --git a/fairseq-0.10.2/fairseq/__pycache__/pdb.cpython-310.pyc b/fairseq-0.10.2/fairseq/__pycache__/pdb.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..242e3e5e576190fb606eae04d47cafe65f7e1c92 Binary files /dev/null and b/fairseq-0.10.2/fairseq/__pycache__/pdb.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/hub_utils.py b/fairseq-0.10.2/fairseq/hub_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b293e54e2a5b77df69b8044b526837b6fbceb3aa --- /dev/null +++ b/fairseq-0.10.2/fairseq/hub_utils.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import copy +import logging +import os +from typing import Any, Dict, Iterator, List, Tuple + +import torch +from fairseq import utils +from fairseq.data import encoders +from torch import nn + + +logger = logging.getLogger(__name__) + + +def from_pretrained( + model_name_or_path, + checkpoint_file="model.pt", + data_name_or_path=".", + archive_map=None, + **kwargs +): + from fairseq import checkpoint_utils, file_utils + + if archive_map is not None: + if model_name_or_path in archive_map: + model_name_or_path = archive_map[model_name_or_path] + if data_name_or_path is not None and data_name_or_path in archive_map: + data_name_or_path = archive_map[data_name_or_path] + + # allow archive_map to set default arg_overrides (e.g., tokenizer, bpe) + # for each model + if isinstance(model_name_or_path, dict): + for k, v in model_name_or_path.items(): + if k == "checkpoint_file": + checkpoint_file = v + elif ( + k != "path" + # only set kwargs that don't already have overrides + and k not in kwargs + ): + kwargs[k] = v + model_name_or_path = model_name_or_path["path"] + + model_path = file_utils.load_archive_file(model_name_or_path) + + # convenience hack for loading data and BPE codes from model archive + if data_name_or_path.startswith("."): + kwargs["data"] = os.path.abspath(os.path.join(model_path, data_name_or_path)) + else: + kwargs["data"] = file_utils.load_archive_file(data_name_or_path) + for file, arg in { + "code": "bpe_codes", + "bpecodes": "bpe_codes", + "sentencepiece.bpe.model": "sentencepiece_model", + }.items(): + path = os.path.join(model_path, file) + if os.path.exists(path): + kwargs[arg] = path + + if "user_dir" in kwargs: + utils.import_user_module(argparse.Namespace(user_dir=kwargs["user_dir"])) + + models, args, task = checkpoint_utils.load_model_ensemble_and_task( + [os.path.join(model_path, cpt) for cpt in checkpoint_file.split(os.pathsep)], + arg_overrides=kwargs, + ) + + return { + "args": args, + "task": task, + "models": models, + } + + +class GeneratorHubInterface(nn.Module): + """ + PyTorch Hub interface for generating sequences from a pre-trained + translation or language model. 
+ """ + + def __init__(self, args, task, models): + super().__init__() + self.args = args + self.task = task + self.models = nn.ModuleList(models) + self.src_dict = task.source_dictionary + self.tgt_dict = task.target_dictionary + + # optimize model for generation + for model in self.models: + model.prepare_for_inference_(args) + + # Load alignment dictionary for unknown word replacement + # (None if no unknown word replacement, empty if no path to align dictionary) + self.align_dict = utils.load_align_dict(getattr(args, "replace_unk", None)) + + self.tokenizer = encoders.build_tokenizer(args) + self.bpe = encoders.build_bpe(args) + + self.max_positions = utils.resolve_max_positions( + self.task.max_positions(), *[model.max_positions() for model in models] + ) + + # this is useful for determining the device + self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float)) + + @property + def device(self): + return self._float_tensor.device + + def translate( + self, sentences: List[str], beam: int = 5, verbose: bool = False, **kwargs + ) -> List[str]: + return self.sample(sentences, beam, verbose, **kwargs) + + def sample( + self, sentences: List[str], beam: int = 1, verbose: bool = False, **kwargs + ) -> List[str]: + if isinstance(sentences, str): + return self.sample([sentences], beam=beam, verbose=verbose, **kwargs)[0] + tokenized_sentences = [self.encode(sentence) for sentence in sentences] + batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs) + return [self.decode(hypos[0]["tokens"]) for hypos in batched_hypos] + + def score(self, sentences: List[str], **kwargs): + if isinstance(sentences, str): + return self.score([sentences], **kwargs)[0] + # NOTE: this doesn't support translation tasks currently + tokenized_sentences = [self.encode(sentence) for sentence in sentences] + return [ + hypos[0] + for hypos in self.generate( + tokenized_sentences, score_reference=True, **kwargs + ) + ] + + def generate( + self, + tokenized_sentences: List[torch.LongTensor], + beam: int = 5, + verbose: bool = False, + skip_invalid_size_inputs=False, + inference_step_args=None, + **kwargs + ) -> List[List[Dict[str, torch.Tensor]]]: + if torch.is_tensor(tokenized_sentences) and tokenized_sentences.dim() == 1: + return self.generate( + tokenized_sentences.unsqueeze(0), beam=beam, verbose=verbose, **kwargs + )[0] + + # build generator using current args as well as any kwargs + gen_args = copy.copy(self.args) + gen_args.beam = beam + for k, v in kwargs.items(): + setattr(gen_args, k, v) + generator = self.task.build_generator(self.models, gen_args) + + inference_step_args = inference_step_args or {} + results = [] + for batch in self._build_batches(tokenized_sentences, skip_invalid_size_inputs): + batch = utils.apply_to_sample(lambda t: t.to(self.device), batch) + translations = self.task.inference_step( + generator, self.models, batch, **inference_step_args + ) + for id, hypos in zip(batch["id"].tolist(), translations): + results.append((id, hypos)) + + # sort output to match input order + outputs = [hypos for _, hypos in sorted(results, key=lambda x: x[0])] + + if verbose: + + def getarg(name, default): + return getattr(gen_args, name, getattr(self.args, name, default)) + + for source_tokens, target_hypotheses in zip(tokenized_sentences, outputs): + src_str_with_unk = self.string(source_tokens) + logger.info("S\t{}".format(src_str_with_unk)) + for hypo in target_hypotheses: + hypo_str = self.decode(hypo["tokens"]) + logger.info("H\t{}\t{}".format(hypo["score"], hypo_str)) 
+ logger.info( + "P\t{}".format( + " ".join( + map( + lambda x: "{:.4f}".format(x), + hypo["positional_scores"].tolist(), + ) + ) + ) + ) + if hypo["alignment"] is not None and getarg( + "print_alignment", False + ): + logger.info( + "A\t{}".format( + " ".join( + [ + "{}-{}".format(src_idx, tgt_idx) + for src_idx, tgt_idx in hypo["alignment"] + ] + ) + ) + ) + return outputs + + def encode(self, sentence: str) -> torch.LongTensor: + sentence = self.tokenize(sentence) + sentence = self.apply_bpe(sentence) + return self.binarize(sentence) + + def decode(self, tokens: torch.LongTensor) -> str: + sentence = self.string(tokens) + sentence = self.remove_bpe(sentence) + return self.detokenize(sentence) + + def tokenize(self, sentence: str) -> str: + if self.tokenizer is not None: + sentence = self.tokenizer.encode(sentence) + return sentence + + def detokenize(self, sentence: str) -> str: + if self.tokenizer is not None: + sentence = self.tokenizer.decode(sentence) + return sentence + + def apply_bpe(self, sentence: str) -> str: + if self.bpe is not None: + sentence = self.bpe.encode(sentence) + return sentence + + def remove_bpe(self, sentence: str) -> str: + if self.bpe is not None: + sentence = self.bpe.decode(sentence) + return sentence + + def binarize(self, sentence: str) -> torch.LongTensor: + return self.src_dict.encode_line(sentence, add_if_not_exist=False).long() + + def string(self, tokens: torch.LongTensor) -> str: + return self.tgt_dict.string(tokens) + + def _build_batches( + self, tokens: List[List[int]], skip_invalid_size_inputs: bool + ) -> Iterator[Dict[str, Any]]: + lengths = torch.LongTensor([t.numel() for t in tokens]) + batch_iterator = self.task.get_batch_iterator( + dataset=self.task.build_dataset_for_inference(tokens, lengths), + max_tokens=self.args.max_tokens, + max_sentences=self.args.batch_size, + max_positions=self.max_positions, + ignore_invalid_inputs=skip_invalid_size_inputs, + disable_iterator_cache=True, + ).next_epoch_itr(shuffle=False) + return batch_iterator + + +class BPEHubInterface(object): + """PyTorch Hub interface for Byte-Pair Encoding (BPE).""" + + def __init__(self, bpe, **kwargs): + super().__init__() + args = argparse.Namespace(bpe=bpe, **kwargs) + self.bpe = encoders.build_bpe(args) + assert self.bpe is not None + + def encode(self, sentence: str) -> str: + return self.bpe.encode(sentence) + + def decode(self, sentence: str) -> str: + return self.bpe.decode(sentence) + + +class TokenizerHubInterface(object): + """PyTorch Hub interface for tokenization.""" + + def __init__(self, tokenizer, **kwargs): + super().__init__() + args = argparse.Namespace(tokenizer=tokenizer, **kwargs) + self.tokenizer = encoders.build_tokenizer(args) + assert self.tokenizer is not None + + def encode(self, sentence: str) -> str: + return self.tokenizer.encode(sentence) + + def decode(self, sentence: str) -> str: + return self.tokenizer.decode(sentence) diff --git a/fairseq-0.10.2/fairseq/iterative_refinement_generator.py b/fairseq-0.10.2/fairseq/iterative_refinement_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..4fb0946f499329ceb130761b59675d761df1c158 --- /dev/null +++ b/fairseq-0.10.2/fairseq/iterative_refinement_generator.py @@ -0,0 +1,359 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
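For reference, the `from_pretrained` helper and `GeneratorHubInterface` added in hub_utils.py above are normally combined as follows. This is a minimal sketch; the checkpoint directory, tokenizer, and BPE choices are hypothetical placeholders, not part of this patch:

from fairseq import hub_utils
from fairseq.hub_utils import GeneratorHubInterface

x = hub_utils.from_pretrained(
    "checkpoints/wmt14.en-fr",  # hypothetical model archive directory
    checkpoint_file="model.pt",
    data_name_or_path=".",      # resolve data/BPE files inside the archive
    tokenizer="moses",          # assumed tokenizer/BPE for this model
    bpe="subword_nmt",
)
hub = GeneratorHubInterface(x["args"], x["task"], x["models"])
print(hub.translate(["Hello world!"], beam=5))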
+
+from collections import namedtuple
+
+import numpy as np
+import torch
+from fairseq import utils
+
+
+DecoderOut = namedtuple(
+    "IterativeRefinementDecoderOut",
+    ["output_tokens", "output_scores", "attn", "step", "max_step", "history"],
+)
+
+
+class IterativeRefinementGenerator(object):
+    def __init__(
+        self,
+        tgt_dict,
+        models=None,
+        eos_penalty=0.0,
+        max_iter=10,
+        max_ratio=2,
+        beam_size=1,
+        decoding_format=None,
+        retain_dropout=False,
+        adaptive=True,
+        retain_history=False,
+        reranking=False,
+    ):
+        """
+        Generates translations based on iterative refinement.
+
+        Args:
+            tgt_dict: target dictionary
+            eos_penalty: if > 0.0, it penalizes early stopping in decoding
+            max_iter: maximum number of refinement iterations
+            max_ratio: generate sequences of maximum length ax, where x is the source length
+            decoding_format: decoding mode in {'unigram', 'ensemble', 'vote', 'dp', 'bs'}
+            retain_dropout: retain dropout during inference
+            adaptive: decode with early stopping
+        """
+        self.bos = tgt_dict.bos()
+        self.pad = tgt_dict.pad()
+        self.unk = tgt_dict.unk()
+        self.eos = tgt_dict.eos()
+        self.vocab_size = len(tgt_dict)
+        self.eos_penalty = eos_penalty
+        self.max_iter = max_iter
+        self.max_ratio = max_ratio
+        self.beam_size = beam_size
+        self.reranking = reranking
+        self.decoding_format = decoding_format
+        self.retain_dropout = retain_dropout
+        self.retain_history = retain_history
+        self.adaptive = adaptive
+        self.models = models
+
+    def generate_batched_itr(
+        self,
+        data_itr,
+        maxlen_a=None,
+        maxlen_b=None,
+        cuda=False,
+        timer=None,
+        prefix_size=0,
+    ):
+        """Iterate over a batched dataset and yield individual translations.
+
+        Args:
+            maxlen_a/b: generate sequences of maximum length ax + b,
+                where x is the source sentence length.
+            cuda: use GPU for generation
+            timer: StopwatchMeter for timing generations.
+        """
+
+        for sample in data_itr:
+            if "net_input" not in sample:
+                continue
+            if timer is not None:
+                timer.start()
+            with torch.no_grad():
+                hypos = self.generate(
+                    self.models,
+                    sample,
+                    prefix_tokens=sample["target"][:, :prefix_size]
+                    if prefix_size > 0
+                    else None,
+                )
+            if timer is not None:
+                timer.stop(sample["ntokens"])
+            for i, id in enumerate(sample["id"]):
+                # remove padding
+                src = utils.strip_pad(sample["net_input"]["src_tokens"][i, :], self.pad)
+                ref = utils.strip_pad(sample["target"][i, :], self.pad)
+                yield id, src, ref, hypos[i]
+
+    @torch.no_grad()
+    def generate(self, models, sample, prefix_tokens=None, constraints=None):
+        if constraints is not None:
+            raise NotImplementedError(
+                "Constrained decoding with the IterativeRefinementGenerator is not supported"
+            )
+
+        # TODO: iterative refinement generator does not support ensemble for now.
+        if not self.retain_dropout:
+            for model in models:
+                model.eval()
+
+        model, reranker = models[0], None
+        if self.reranking:
+            assert len(models) > 1, "Assuming the last checkpoint is the reranker"
+            assert (
+                self.beam_size > 1
+            ), "Reranking requires multiple translations for each example"
+
+            reranker = models[-1]
+            models = models[:-1]
+
+        if len(models) > 1 and hasattr(model, "enable_ensemble"):
+            assert model.allow_ensemble, "{} does not support ensembling".format(
+                model.__class__.__name__
+            )
+            model.enable_ensemble(models)
+
+        # TODO: better encoder inputs?
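+        # The encoder is run exactly once per batch; every refinement
+        # iteration below re-invokes only the decoder on its own output.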
+ src_tokens = sample["net_input"]["src_tokens"] + src_lengths = sample["net_input"]["src_lengths"] + bsz, src_len = src_tokens.size() + + # initialize + encoder_out = model.forward_encoder([src_tokens, src_lengths]) + prev_decoder_out = model.initialize_output_tokens(encoder_out, src_tokens) + + if self.beam_size > 1: + assert ( + model.allow_length_beam + ), "{} does not support decoding with length beam.".format( + model.__class__.__name__ + ) + + # regenerate data based on length-beam + length_beam_order = ( + utils.new_arange(src_tokens, self.beam_size, bsz).t().reshape(-1) + ) + encoder_out = model.encoder.reorder_encoder_out( + encoder_out, length_beam_order + ) + prev_decoder_out = model.regenerate_length_beam( + prev_decoder_out, self.beam_size + ) + bsz = bsz * self.beam_size + + sent_idxs = torch.arange(bsz) + prev_output_tokens = prev_decoder_out.output_tokens.clone() + + if self.retain_history: + prev_decoder_out = prev_decoder_out._replace(history=[prev_output_tokens]) + + finalized = [[] for _ in range(bsz)] + + def is_a_loop(x, y, s, a): + b, l_x, l_y = x.size(0), x.size(1), y.size(1) + if l_x > l_y: + y = torch.cat([y, x.new_zeros(b, l_x - l_y).fill_(self.pad)], 1) + s = torch.cat([s, s.new_zeros(b, l_x - l_y)], 1) + if a is not None: + a = torch.cat([a, a.new_zeros(b, l_x - l_y, a.size(2))], 1) + elif l_x < l_y: + x = torch.cat([x, y.new_zeros(b, l_y - l_x).fill_(self.pad)], 1) + return (x == y).all(1), y, s, a + + def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn): + cutoff = prev_out_token.ne(self.pad) + tokens = prev_out_token[cutoff] + if prev_out_score is None: + scores, score = None, None + else: + scores = prev_out_score[cutoff] + score = scores.mean() + + if prev_out_attn is None: + hypo_attn, alignment = None, None + else: + hypo_attn = prev_out_attn[cutoff] + alignment = hypo_attn.max(dim=1)[1] + return { + "steps": step, + "tokens": tokens, + "positional_scores": scores, + "score": score, + "hypo_attn": hypo_attn, + "alignment": alignment, + } + + for step in range(self.max_iter + 1): + + decoder_options = { + "eos_penalty": self.eos_penalty, + "max_ratio": self.max_ratio, + "decoding_format": self.decoding_format, + } + prev_decoder_out = prev_decoder_out._replace( + step=step, + max_step=self.max_iter + 1, + ) + + decoder_out = model.forward_decoder( + prev_decoder_out, encoder_out, **decoder_options + ) + + if self.adaptive: + # terminate if there is a loop + terminated, out_tokens, out_scores, out_attn = is_a_loop( + prev_output_tokens, + decoder_out.output_tokens, + decoder_out.output_scores, + decoder_out.attn, + ) + decoder_out = decoder_out._replace( + output_tokens=out_tokens, + output_scores=out_scores, + attn=out_attn, + ) + + else: + terminated = decoder_out.output_tokens.new_zeros( + decoder_out.output_tokens.size(0) + ).bool() + + if step == self.max_iter: # reach last iteration, terminate + terminated.fill_(1) + + # collect finalized sentences + finalized_idxs = sent_idxs[terminated] + finalized_tokens = decoder_out.output_tokens[terminated] + finalized_scores = decoder_out.output_scores[terminated] + finalized_attn = ( + None + if (decoder_out.attn is None or decoder_out.attn.size(0) == 0) + else decoder_out.attn[terminated] + ) + + if self.retain_history: + finalized_history_tokens = [h[terminated] for h in decoder_out.history] + + for i in range(finalized_idxs.size(0)): + finalized[finalized_idxs[i]] = [ + finalized_hypos( + step, + finalized_tokens[i], + finalized_scores[i], + None if finalized_attn is None else 
finalized_attn[i], + ) + ] + + if self.retain_history: + finalized[finalized_idxs[i]][0]["history"] = [] + for j in range(len(finalized_history_tokens)): + finalized[finalized_idxs[i]][0]["history"].append( + finalized_hypos( + step, finalized_history_tokens[j][i], None, None + ) + ) + + # check if all terminated + if terminated.sum() == terminated.size(0): + break + + # for next step + not_terminated = ~terminated + prev_decoder_out = decoder_out._replace( + output_tokens=decoder_out.output_tokens[not_terminated], + output_scores=decoder_out.output_scores[not_terminated], + attn=decoder_out.attn[not_terminated] + if (decoder_out.attn is not None and decoder_out.attn.size(0) > 0) + else None, + history=[h[not_terminated] for h in decoder_out.history] + if decoder_out.history is not None + else None, + ) + encoder_out = model.encoder.reorder_encoder_out( + encoder_out, not_terminated.nonzero(as_tuple=False).squeeze() + ) + sent_idxs = sent_idxs[not_terminated] + prev_output_tokens = prev_decoder_out.output_tokens.clone() + + if self.beam_size > 1: + if reranker is not None: + finalized = self.rerank( + reranker, finalized, [src_tokens, src_lengths], self.beam_size + ) + + # aggregate information from length beam + finalized = [ + finalized[ + np.argmax( + [ + finalized[self.beam_size * i + j][0]["score"] + for j in range(self.beam_size) + ] + ) + + self.beam_size * i + ] + for i in range(len(finalized) // self.beam_size) + ] + + return finalized + + def rerank(self, reranker, finalized, encoder_input, beam_size): + def rebuild_batch(finalized): + finalized_tokens = [f[0]["tokens"] for f in finalized] + finalized_maxlen = max(f.size(0) for f in finalized_tokens) + final_output_tokens = ( + finalized_tokens[0] + .new_zeros(len(finalized_tokens), finalized_maxlen) + .fill_(self.pad) + ) + for i, f in enumerate(finalized_tokens): + final_output_tokens[i, : f.size(0)] = f + return final_output_tokens + + final_output_tokens = rebuild_batch(finalized) + final_output_tokens[ + :, 0 + ] = self.eos # autoregressive model assumes starting with EOS + + reranker_encoder_out = reranker.encoder(*encoder_input) + length_beam_order = ( + utils.new_arange( + final_output_tokens, beam_size, reranker_encoder_out.encoder_out.size(1) + ) + .t() + .reshape(-1) + ) + reranker_encoder_out = reranker.encoder.reorder_encoder_out( + reranker_encoder_out, length_beam_order + ) + reranking_scores = reranker.get_normalized_probs( + reranker.decoder(final_output_tokens[:, :-1], reranker_encoder_out), + True, + None, + ) + reranking_scores = reranking_scores.gather(2, final_output_tokens[:, 1:, None]) + reranking_masks = final_output_tokens[:, 1:].ne(self.pad) + reranking_scores = ( + reranking_scores[:, :, 0].masked_fill_(~reranking_masks, 0).sum(1) + ) + reranking_scores = reranking_scores / reranking_masks.sum(1).type_as( + reranking_scores + ) + + for i in range(len(finalized)): + finalized[i][0]["score"] = reranking_scores[i] + + return finalized diff --git a/fairseq-0.10.2/fairseq/legacy_distributed_data_parallel.py b/fairseq-0.10.2/fairseq/legacy_distributed_data_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..44f87c7c4253b18179e2174e8657e1e2ecf18176 --- /dev/null +++ b/fairseq-0.10.2/fairseq/legacy_distributed_data_parallel.py @@ -0,0 +1,171 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
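For reference, a hedged sketch of driving the IterativeRefinementGenerator above outside of fairseq-generate. The checkpoint path is hypothetical (any non-autoregressive model, e.g. a Levenshtein Transformer), and `sample` is assumed to be one batch from the task's inference iterator:

import torch
from fairseq import checkpoint_utils
from fairseq.iterative_refinement_generator import IterativeRefinementGenerator

models, args, task = checkpoint_utils.load_model_ensemble_and_task(
    ["nat_checkpoint.pt"]  # hypothetical NAT checkpoint
)
generator = IterativeRefinementGenerator(
    task.target_dictionary,
    models=models,
    max_iter=10,    # at most 10 refinement passes per sentence
    adaptive=True,  # stop early once the output reaches a fixed point
)
# `sample` is assumed to be one batch from task.get_batch_iterator(...)
with torch.no_grad():
    hypos = generator.generate(models, sample)
best = hypos[0][0]  # first sentence, top hypothesis
print(task.target_dictionary.string(best["tokens"]))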
+ +""" +A modified version of the legacy DistributedDataParallel module that uses c10d +communication primitives. This version is simpler than the latest PyTorch +version and is useful for debugging. Notably it does not overlap gradient +communication with the backward pass, which makes it slower but more robust +than the PyTorch version. + +This version also supports the *no_sync* context manager, which allows faster +training with `--update-freq`. +""" + +import copy +from collections import OrderedDict +from contextlib import contextmanager + +import torch +from torch import nn +from torch.autograd import Variable + +from . import distributed_utils + + +class LegacyDistributedDataParallel(nn.Module): + """Implements distributed data parallelism at the module level. + + A simplified version of :class:`torch.nn.parallel.DistributedDataParallel`. + This version uses a c10d process group for communication and does not + broadcast buffers. + + Args: + module (~torch.nn.Module): module to be parallelized + world_size (int): number of parallel workers + process_group (optional): the c10d process group to be used for + distributed data all-reduction. If None, the default process group + will be used. + buffer_size (int, optional): number of elements to buffer before + performing all-reduce (default: 256M). + """ + + def __init__(self, module, world_size, process_group=None, buffer_size=2 ** 28): + super().__init__() + + self.module = module + self.world_size = world_size + self.process_group = process_group + + # Never use a bigger buffer than the number of model params + self.buffer_size = min(buffer_size, sum(p.numel() for p in module.parameters())) + self.buffer = None + + # We can also forcibly accumulate grads locally and only do the + # all-reduce at some later time + self.accumulate_grads = False + + # make per-device lists of parameters + paramlists = OrderedDict() + for param in self.module.parameters(): + device = param.device + if paramlists.get(device) is None: + paramlists[device] = [] + paramlists[device] += [param] + self.per_device_params = list(paramlists.values()) + + def __getstate__(self): + attrs = copy.copy(self.__dict__) + return attrs + + def __setstate__(self, state): + super().__setstate__(state) + + @contextmanager + def no_sync(self): + """A context manager to disable gradient synchronization.""" + old_accumulate_grads = self.accumulate_grads + self.accumulate_grads = True + yield + self.accumulate_grads = old_accumulate_grads + + def forward(self, *inputs, **kwargs): + return self.module(*inputs, **kwargs) + + def all_reduce(self): + """ + This function must be called explicitly after backward to reduce + gradients. There is no automatic hook like c10d. 
+ """ + + def all_reduce_params(params): + buffer = self.buffer + nonzero_buffer = False + if len(params) > 1: + offset = 0 + for p in params: + sz = p.numel() + if p.grad is not None: + buffer[offset : offset + sz].copy_(p.grad.data.view(-1)) + nonzero_buffer = True + else: + buffer[offset : offset + sz].zero_() + offset += sz + else: + # we only have a single grad to all-reduce + p = params[0] + if p.grad is not None: + buffer = p.grad.data + nonzero_buffer = True + elif p.numel() <= self.buffer.numel(): + buffer = buffer[: p.numel()] + buffer.zero_() + else: + buffer = torch.zeros_like(p) + + if nonzero_buffer: + buffer.div_(self.world_size) + + distributed_utils.all_reduce(buffer, self.process_group) + + # copy all-reduced grads back into their original place + offset = 0 + for p in params: + sz = p.numel() + if p.grad is not None: + p.grad.data.copy_(buffer[offset : offset + sz].view_as(p)) + else: + p.grad = buffer[offset : offset + sz].view_as(p).clone() + offset += sz + + def reduction_fn(): + # This function only needs to be called once + if self.accumulate_grads: + return + + if self.buffer is None: + self.buffer = next(self.module.parameters()).new(self.buffer_size) + + for params in self.per_device_params: + # All-reduce the gradients in buckets + offset = 0 + buffered_params = [] + for param in params: + if not param.requires_grad: + continue + if param.grad is None: + param.grad = torch.zeros_like(param) + if param.grad.requires_grad: + raise RuntimeError( + "DistributedDataParallel only works " + "with gradients that don't require " + "grad" + ) + sz = param.numel() + if sz > self.buffer.numel(): + # all-reduce big params directly + all_reduce_params([param]) + else: + if offset + sz > self.buffer.numel(): + all_reduce_params(buffered_params) + offset = 0 + buffered_params.clear() + buffered_params.append(param) + offset += sz + + if len(buffered_params) > 0: + all_reduce_params(buffered_params) + + reduction_fn() diff --git a/fairseq-0.10.2/fairseq/model_parallel/__pycache__/megatron_trainer.cpython-310.pyc b/fairseq-0.10.2/fairseq/model_parallel/__pycache__/megatron_trainer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dab83f81cdf6a4b8e6b17b0a6020a9b6ad21a897 Binary files /dev/null and b/fairseq-0.10.2/fairseq/model_parallel/__pycache__/megatron_trainer.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/model_parallel/criterions/__pycache__/vocab_parallel_cross_entropy.cpython-310.pyc b/fairseq-0.10.2/fairseq/model_parallel/criterions/__pycache__/vocab_parallel_cross_entropy.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6005f8b511d9c6fff1d9011cc5627e763665f8c3 Binary files /dev/null and b/fairseq-0.10.2/fairseq/model_parallel/criterions/__pycache__/vocab_parallel_cross_entropy.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/model_parallel/megatron_trainer.py b/fairseq-0.10.2/fairseq/model_parallel/megatron_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..761ffc8e61b1181689f117fb670218dce15994f5 --- /dev/null +++ b/fairseq-0.10.2/fairseq/model_parallel/megatron_trainer.py @@ -0,0 +1,66 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +Train a network across multiple GPUs. 
+""" + +from fairseq import distributed_utils +from fairseq.trainer import Trainer + + +try: + from fairseq.model_parallel.megatron.mpu import ( + get_data_parallel_group, + get_data_parallel_rank, + get_data_parallel_world_size, + get_model_parallel_group, + get_model_parallel_src_rank, + ) + + has_megatron_submodule = True +except (ImportError, ModuleNotFoundError): + has_megatron_submodule = False + + +class MegatronTrainer(Trainer): + """Main class for model parallel with data parallel training.""" + + def __init__(self, args, task, model, criterion): + if not has_megatron_submodule: + raise ImportError( + "\n\nPlease install the megatron submodule:" + "\n\n git submodule update --init " + "fairseq/model_parallel/megatron" + ) + super().__init__(args, task, model, criterion) + + @property + def data_parallel_world_size(self): + return get_data_parallel_world_size() + + @property + def data_parallel_process_group(self): + return get_data_parallel_group() + + @property + def data_parallel_rank(self): + return get_data_parallel_rank() + + @property + def is_data_parallel_master(self): + return get_model_parallel_src_rank() == 0 + + def clip_grad_norm(self, clip_norm): + def _aggregate_model_parallel_grad_norm(total_norm): + total_norm = total_norm ** 2 + distributed_utils.all_reduce(total_norm, group=get_model_parallel_group()) + total_norm = total_norm ** 0.5 + return total_norm + + return self.optimizer.clip_grad_norm( + clip_norm, + aggregate_norm_fn=_aggregate_model_parallel_grad_norm, + ) diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/__pycache__/__init__.cpython-310.pyc b/fairseq-0.10.2/fairseq/model_parallel/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a96064f015fd96c944d3673d95086abfe15dbbf Binary files /dev/null and b/fairseq-0.10.2/fairseq/model_parallel/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py b/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..117827c3e9c176477f33e3a6fd7fe19a922411a2 --- /dev/null +++ b/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .model import * # noqa diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/__pycache__/__init__.cpython-310.pyc b/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0bd428ac21f61980ee11d1fbe07c8e7e30fcb2ce Binary files /dev/null and b/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py b/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..eb81ded341257ba0a43c4d0867e8f3c83f276bc7 --- /dev/null +++ b/fairseq-0.10.2/fairseq/model_parallel/models/pipeline_parallel_transformer/layers.py @@ -0,0 +1,600 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import math
+from collections import namedtuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from fairseq import options, utils
+from fairseq.modules import (
+    AdaptiveSoftmax,
+    LayerNorm,
+    MultiheadAttention,
+    PositionalEmbedding,
+)
+
+
+EncoderOut = namedtuple(
+    "TransformerEncoderOut",
+    [
+        "encoder_out",  # T x B x C
+        "encoder_padding_mask",  # B x T
+        "encoder_embedding",  # B x T x C
+        "encoder_states",  # List[T x B x C]
+    ],
+)
+
+
+class TransformerEncoderEmbedding(nn.Module):
+    """ Encoder Embedding + Positional Embedding """
+
+    def __init__(self, args, embed_tokens):
+        super().__init__()
+        self.dropout = args.dropout
+        self.max_source_positions = args.max_source_positions
+        self.embed_tokens = embed_tokens
+        if isinstance(embed_tokens, nn.ModuleList):
+            self.padding_idx = embed_tokens[0].padding_idx
+            embed_dim = sum(e.embedding_dim for e in embed_tokens)
+        else:
+            self.padding_idx = embed_tokens.padding_idx
+            embed_dim = embed_tokens.embedding_dim
+        self.embed_scale = math.sqrt(embed_dim)
+        self.embed_positions = (
+            PositionalEmbedding(
+                args.max_source_positions,
+                embed_dim,
+                self.padding_idx,
+                learned=args.encoder_learned_pos,
+            )
+            if not args.no_token_positional_embeddings
+            else None
+        )
+        if getattr(args, "layernorm_embedding", False):
+            self.layernorm_embedding = LayerNorm(embed_dim)
+        else:
+            self.layernorm_embedding = None
+
+    def forward(self, input):
+        # embed tokens and positions
+        src_tokens = input[0]
+        prev_output_tokens = input[2]
+        if isinstance(self.embed_tokens, nn.ModuleList):
+            x_embed_list = []
+            for embed_tokens_part in self.embed_tokens:
+                x_embed_list.append(embed_tokens_part(src_tokens))
+
+            embedded = torch.cat(x_embed_list, dim=-1)
+        else:
+            embedded = self.embed_tokens(src_tokens)
+        x = embed = self.embed_scale * embedded
+        if self.embed_positions is not None:
+            x = embed + self.embed_positions(src_tokens)
+        if self.layernorm_embedding:
+            x = self.layernorm_embedding(x)
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        # B x T x C -> T x B x C
+        x = x.transpose(0, 1)
+
+        # compute padding mask
+        encoder_padding_mask = src_tokens.eq(self.padding_idx)
+        return (x, encoder_padding_mask, prev_output_tokens)
+
+
+class TransformerEncoderLayerNorm(nn.Module):
+    """
+    Layer norm at the end of all encoder layers if
+    args.encoder_normalize_before = True
+    """
+
+    def __init__(self, args, embed_dim):
+        super().__init__()
+        if args.encoder_normalize_before:
+            self.layer_norm = LayerNorm(embed_dim)
+        else:
+            self.layer_norm = None
+
+    def forward(self, input):
+        x = input[0]
+        encoder_padding_mask = input[1]
+        prev_output_tokens = input[2]
+        if self.layer_norm:
+            x = self.layer_norm(x)
+        # keeping track of the incremental_state is not supported yet
+        return (x, encoder_padding_mask, prev_output_tokens)
+
+
+class TransformerDecoderEmbedding(nn.Module):
+    """ Decoder Embedding + Positional Embedding """
+
+    def __init__(self, args, embed_tokens):
+        super().__init__()
+        self.dropout = args.dropout
+        self.share_input_output_embed = args.share_decoder_input_output_embed
+        input_embed_dim = (
+            sum(e.embedding_dim for e in embed_tokens)
+            if isinstance(embed_tokens, nn.ModuleList)
+            else embed_tokens.embedding_dim
+        )
+        embed_dim = args.decoder_embed_dim
+        self.output_embed_dim = args.decoder_output_dim
+
+        padding_idx = (
+            embed_tokens[0].padding_idx
+            if isinstance(embed_tokens,
nn.ModuleList) + else embed_tokens.padding_idx + ) + self.max_target_positions = args.max_target_positions + + self.embed_tokens = embed_tokens + self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim + + self.project_in_dim = ( + Linear(input_embed_dim, embed_dim, bias=False) + if embed_dim != input_embed_dim + else None + ) + + self.embed_positions = ( + PositionalEmbedding( + args.max_target_positions, + embed_dim, + padding_idx, + learned=args.decoder_learned_pos, + ) + if not args.no_token_positional_embeddings + else None + ) + + def forward(self, input): + mt_task = False + if isinstance(input, tuple): + if len(input) == 3: + encoder_out = input[0] + encoder_padding_mask = input[1] + prev_output_tokens = input[2] + incremental_state = None # Hardcoding to avoid passing of None objects + mt_task = True + else: + # HACK for now, need to fix (TODO sidgoyal) + prev_output_tokens = input[0] + # discard "src_lengths" + encoder_out = None + encoder_padding_mask = None + incremental_state = None + + else: + prev_output_tokens = input + encoder_out = None + encoder_padding_mask = None + incremental_state = None + + positions = ( + self.embed_positions( + prev_output_tokens, + incremental_state=incremental_state, + ) + if self.embed_positions is not None + else None + ) + + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + if positions is not None: + positions = positions[:, -1:] + + # embed tokens and positions + + if isinstance(self.embed_tokens, nn.ModuleList): + x_embed_list = [] + for embed_tokens_part in self.embed_tokens: + x_embed_list.append(embed_tokens_part(prev_output_tokens)) + + x = self.embed_scale * torch.cat(x_embed_list, dim=-1) + else: + x = self.embed_scale * self.embed_tokens(prev_output_tokens) + + if self.project_in_dim is not None: + x = self.project_in_dim(x) + + if positions is not None: + x += positions + x = F.dropout(x, p=self.dropout, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + if mt_task: + return (x, encoder_out, encoder_padding_mask) + return x + + +class TransformerDecoderOutputLayer(nn.Module): + def __init__(self, args, embed_tokens, dictionary): + super().__init__() + self.share_input_output_embed = args.share_decoder_input_output_embed + self.embed_tokens = embed_tokens + self.output_embed_dim = args.decoder_output_dim + embed_dim = args.decoder_embed_dim + + self.project_out_dim = ( + Linear(embed_dim, self.output_embed_dim, bias=False) + if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights + else None + ) + self.adaptive_softmax = None + if args.adaptive_softmax_cutoff is not None: + assert not isinstance(embed_tokens, nn.ModuleList) + self.adaptive_softmax = AdaptiveSoftmax( + len(dictionary), + self.output_embed_dim, + options.eval_str_list(args.adaptive_softmax_cutoff, type=int), + dropout=args.adaptive_softmax_dropout, + adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, + factor=args.adaptive_softmax_factor, + tie_proj=args.tie_adaptive_proj, + ) + elif not self.share_input_output_embed: + self.embed_tokens = nn.Parameter( + torch.Tensor(len(dictionary), self.output_embed_dim) + ) + nn.init.normal_( + self.embed_tokens, mean=0, std=self.output_embed_dim ** -0.5 + ) + + if args.decoder_normalize_before and not getattr( + args, "no_decoder_final_norm", False + ): + self.layer_norm = LayerNorm(embed_dim) + else: + self.layer_norm = None + + def forward(self, input, apply_final_proj=True): + if isinstance(input, tuple): + x = 
input[0] + else: + x = input + + if self.layer_norm: + x = self.layer_norm(x) + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + if self.project_out_dim is not None: + x = self.project_out_dim(x) + if apply_final_proj: + x = self.output_layer(x) + return x + + def output_layer(self, features, **kwargs): + """Project features to the vocabulary size.""" + if self.adaptive_softmax is None: + # project back to size of vocabulary + if self.share_input_output_embed: + if isinstance(self.embed_tokens, nn.ModuleList): + output = None + for i, emb in enumerate(self.embed_tokens): + sidx = i * emb.embedding_dim + eidx = (i + 1) * emb.embedding_dim + if output is None: + output = F.linear(features[:, :, sidx:eidx], emb.weight) + else: + output += F.linear(features[:, :, sidx:eidx], emb.weight) + + return output + else: + return F.linear(features, self.embed_tokens.weight) + else: + return F.linear(features, self.embed_tokens) + else: + return features + + +class TransformerEncoderLayer(nn.Module): + """Encoder layer block. + In the original paper each operation (multi-head attention or FFN) is + postprocessed with: `dropout -> add residual -> layernorm`. In the + tensor2tensor code they suggest that learning is more robust when + preprocessing each layer with layernorm and postprocessing with: + `dropout -> add residual`. We default to the approach in the paper, but the + tensor2tensor approach can be enabled by setting + *args.encoder_normalize_before* to ``True``. + + Args: + args (argparse.Namespace): parsed command-line arguments + """ + + def __init__(self, args): + super().__init__() + self.embed_dim = args.encoder_embed_dim + self.self_attn = MultiheadAttention( + self.embed_dim, + args.encoder_attention_heads, + dropout=args.attention_dropout, + self_attention=True, + ) + self.self_attn_layer_norm = LayerNorm(self.embed_dim) + self.dropout = args.dropout + self.activation_fn = utils.get_activation_fn( + activation=getattr(args, "activation_fn", "relu") + ) + self.activation_dropout = getattr(args, "activation_dropout", 0) + if self.activation_dropout == 0: + # for backwards compatibility with models that use args.relu_dropout + self.activation_dropout = getattr(args, "relu_dropout", 0) + self.normalize_before = args.encoder_normalize_before + self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim) + self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim) + self.final_layer_norm = LayerNorm(self.embed_dim) + + def upgrade_state_dict_named(self, state_dict, name): + """ + Rename layer norm states from `...layer_norms.0.weight` to + `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to + `...final_layer_norm.weight` + """ + layer_norm_map = {"0": "self_attn_layer_norm", "1": "final_layer_norm"} + for old, new in layer_norm_map.items(): + for m in ("weight", "bias"): + k = "{}.layer_norms.{}.{}".format(name, old, m) + if k in state_dict: + state_dict["{}.{}.{}".format(name, new, m)] = state_dict[k] + del state_dict[k] + + def forward(self, input): + """ + Args: + input (Tuple): + input[0] (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + input[1] (ByteTensor/FloatTensor): encoder padding mask - + binary ByteTensor of shape `(batch, src_len)` where padding elements + are indicated by ``1``. 
+                input[2] (LongTensor): previous decoder outputs of shape
+                    `(batch, tgt_len)`, for teacher forcing
+        Returns:
+            output (Tuple):
+                output[0] (Tensor): encoded output of shape `(batch, src_len, embed_dim)`
+                output[1] (ByteTensor/FloatTensor): encoder padding mask
+                output[2] (LongTensor): previous decoder outputs
+        """
+        x = input[0]
+        encoder_padding_mask = input[1]
+        prev_output_tokens = input[2]
+        residual = x
+        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
+        x, _ = self.self_attn(
+            query=x, key=x, value=x, key_padding_mask=encoder_padding_mask
+        )
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)
+
+        residual = x
+        x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
+        x = self.activation_fn(self.fc1(x))
+        x = F.dropout(x, p=self.activation_dropout, training=self.training)
+        x = self.fc2(x)
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = residual + x
+        x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
+        return (x, encoder_padding_mask, prev_output_tokens)
+
+    def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
+        assert before ^ after
+        if after ^ self.normalize_before:
+            return layer_norm(x)
+        else:
+            return x
+
+
+class TransformerDecoderLayer(nn.Module):
+    """Decoder layer block.
+
+    In the original paper each operation (multi-head attention, encoder
+    attention or FFN) is postprocessed with: `dropout -> add residual ->
+    layernorm`. In the tensor2tensor code they suggest that learning is more
+    robust when preprocessing each layer with layernorm and postprocessing with:
+    `dropout -> add residual`. We default to the approach in the paper, but the
+    tensor2tensor approach can be enabled by setting
+    *args.decoder_normalize_before* to ``True``.
+
+    Args:
+        args (argparse.Namespace): parsed command-line arguments
+        no_encoder_attn (bool, optional): whether to attend to encoder outputs
+            (default: False).
+    """
+
+    def __init__(
+        self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
+    ):
+        super().__init__()
+        self.embed_dim = args.decoder_embed_dim
+        self.self_attn = MultiheadAttention(
+            embed_dim=self.embed_dim,
+            num_heads=args.decoder_attention_heads,
+            dropout=args.attention_dropout,
+            add_bias_kv=add_bias_kv,
+            add_zero_attn=add_zero_attn,
+            self_attention=True,
+        )
+        self.dropout = args.dropout
+        self.activation_fn = utils.get_activation_fn(
+            activation=getattr(args, "activation_fn", "relu")
+        )
+        self.activation_dropout = getattr(args, "activation_dropout", 0)
+        if self.activation_dropout == 0:
+            # for backwards compatibility with models that use args.relu_dropout
+            self.activation_dropout = getattr(args, "relu_dropout", 0)
+        self.normalize_before = args.decoder_normalize_before
+
+        # use LayerNorm rather than FusedLayerNorm for exporting.
+        # char_inputs can be used to determine this.
+ # TODO remove this once we update apex with the fix + export = getattr(args, "char_inputs", False) + self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + + if no_encoder_attn: + self.encoder_attn = None + self.encoder_attn_layer_norm = None + else: + self.encoder_attn = MultiheadAttention( + self.embed_dim, + args.decoder_attention_heads, + kdim=getattr(args, "encoder_embed_dim", None), + vdim=getattr(args, "encoder_embed_dim", None), + dropout=args.attention_dropout, + encoder_decoder_attention=True, + ) + self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export) + + self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim) + self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim) + + self.final_layer_norm = LayerNorm(self.embed_dim, export=export) + self.need_attn = True + + self.onnx_trace = False + + def prepare_for_onnx_export_(self): + self.onnx_trace = True + + def forward(self, input): + """ + Args: + input (Tuple): + input[0] (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` + input[1] (Tensor): encoder output of shape `(batch, src_len, embed_dim)` + input[2] (ByteTensor/FloatTensor): encoder padding mask - + binary ByteTensor of shape `(batch, src_len)` where padding elements + are indicated by ``1``. + Returns: + output (Tuple): + output[0] (Tensor): encoded output of shape `(batch, src_len, embed_dim)` + output[1] (ByteTensor/FloatTensor): encoder padding mask + output[2] (LongTensor): previous decoder outputs + """ + # Note: incremental state is not yet supported + mt_task = False + if isinstance(input, tuple): + x = input[0] + encoder_out = input[1] + encoder_padding_mask = input[2] + incremental_state = None + mt_task = True + else: + x = input + encoder_out = None + encoder_padding_mask = None + incremental_state = None + + if incremental_state is None: + self_attn_mask = self.buffered_future_mask(x) + else: + self_attn_mask = None + + # TODO: add back prev_self_attn_state, prev_attn_state, + # self_attn_padding_mask + prev_self_attn_state = None + prev_attn_state = None + self_attn_padding_mask = None + + residual = x + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) + if prev_self_attn_state is not None: + if incremental_state is None: + incremental_state = {} + prev_key, prev_value = prev_self_attn_state + saved_state = {"prev_key": prev_key, "prev_value": prev_value} + self.self_attn._set_input_buffer(incremental_state, saved_state) + x, attn = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=self_attn_padding_mask, + incremental_state=incremental_state, + need_weights=False, + attn_mask=self_attn_mask, + ) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) + + if self.encoder_attn is not None: + residual = x + x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True) + if prev_attn_state is not None: + if incremental_state is None: + incremental_state = {} + prev_key, prev_value = prev_attn_state + saved_state = {"prev_key": prev_key, "prev_value": prev_value} + self.encoder_attn._set_input_buffer(incremental_state, saved_state) + x, attn = self.encoder_attn( + query=x, + key=encoder_out, + value=encoder_out, + key_padding_mask=encoder_padding_mask, + incremental_state=incremental_state, + static_kv=True, + need_weights=(not self.training and self.need_attn), + ) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + x = 
self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True) + + residual = x + x = self.maybe_layer_norm(self.final_layer_norm, x, before=True) + x = self.activation_fn(self.fc1(x)) + x = F.dropout(x, p=self.activation_dropout, training=self.training) + x = self.fc2(x) + x = F.dropout(x, p=self.dropout, training=self.training) + x = residual + x + x = self.maybe_layer_norm(self.final_layer_norm, x, after=True) + + if mt_task: + return (x, encoder_out, encoder_padding_mask) + return x + + def buffered_future_mask(self, tensor): + dim = tensor.size(0) + if ( + not hasattr(self, "_future_mask") + or self._future_mask is None + or self._future_mask.device != tensor.device + ): + self._future_mask = torch.triu( + utils.fill_with_neg_inf(tensor.new(dim, dim)), 1 + ) + if self._future_mask.size(0) < dim: + self._future_mask = torch.triu( + utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1 + ) + return self._future_mask[:dim, :dim] + + def maybe_layer_norm(self, layer_norm, x, before=False, after=False): + assert before ^ after + if after ^ self.normalize_before: + return layer_norm(x) + else: + return x + + def make_generation_fast_(self, need_attn=False, **kwargs): + self.need_attn = need_attn + + +def Embedding(num_embeddings, embedding_dim, padding_idx): + m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) + nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) + nn.init.constant_(m.weight[padding_idx], 0) + return m + + +def Linear(in_features, out_features, bias=True): + m = nn.Linear(in_features, out_features, bias) + nn.init.xavier_uniform_(m.weight) + if bias: + nn.init.constant_(m.bias, 0.0) + return m diff --git a/fairseq-0.10.2/fairseq/model_parallel/models/roberta/model.py b/fairseq-0.10.2/fairseq/model_parallel/models/roberta/model.py new file mode 100644 index 0000000000000000000000000000000000000000..68ad88d2a56516624e4a3a10cf4d5bec5f17d2e6 --- /dev/null +++ b/fairseq-0.10.2/fairseq/model_parallel/models/roberta/model.py @@ -0,0 +1,287 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +RoBERTa: A Robustly Optimized BERT Pretraining Approach. 
+""" + +import logging + +import torch +import torch.nn as nn +import torch.nn.functional as F +from fairseq import utils +from fairseq.model_parallel.modules import ModelParallelTransformerSentenceEncoder +from fairseq.models import FairseqEncoder, register_model, register_model_architecture +from fairseq.models.roberta import ( + RobertaClassificationHead, + RobertaEncoder, + RobertaLMHead, + RobertaModel, +) +from fairseq.modules import LayerNorm, TransformerSentenceEncoder +from fairseq.modules.transformer_sentence_encoder import init_bert_params + + +try: + from fairseq.model_parallel.megatron.mpu import ( + copy_to_model_parallel_region, + gather_from_model_parallel_region, + ColumnParallelLinear, + RowParallelLinear, + ) + + has_megatron_submodule = True +except (ImportError, ModuleNotFoundError): + has_megatron_submodule = False + +logger = logging.getLogger(__name__) + + +@register_model("model_parallel_roberta") +class ModelParallelRobertaModel(RobertaModel): + def __init__(self, args, encoder): + super().__init__(args, encoder) + + self.classification_heads = nn.ModuleDict() + + @staticmethod + def add_args(parser): + super(ModelParallelRobertaModel, ModelParallelRobertaModel).add_args(parser) + + @classmethod + def build_model(cls, args, task): + """Build a new model instance.""" + + # make sure all arguments are present + base_architecture(args) + + task.source_dictionary.pad_to_multiple_(args.model_parallel_size * 8) + task.target_dictionary.pad_to_multiple_(args.model_parallel_size * 8) + + if not hasattr(args, "max_positions"): + args.max_positions = args.tokens_per_sample + + if getattr(args, "untie_weights_roberta", False): + raise NotImplementedError( + "--untie-weights-roberta is not supported in model parallel mode" + ) + + encoder = ModelParallelRobertaEncoder(args, task.source_dictionary) + return cls(args, encoder) + + def forward( + self, + src_tokens, + features_only=False, + return_all_hiddens=False, + classification_head_name=None, + **kwargs + ): + if classification_head_name is not None: + features_only = True + + x, extra = self.encoder(src_tokens, features_only, return_all_hiddens, **kwargs) + + if classification_head_name is not None: + x = self.classification_heads[classification_head_name](x) + return x, extra + + def register_classification_head( + self, name, num_classes=None, inner_dim=None, **kwargs + ): + """Register a classification head.""" + if name in self.classification_heads: + prev_num_classes = self.classification_heads[name].out_proj.out_features + prev_inner_dim = self.classification_heads[name].dense.out_features + if num_classes != prev_num_classes or inner_dim != prev_inner_dim: + logger.warning( + 're-registering head "{}" with num_classes {} (prev: {}) ' + "and inner_dim {} (prev: {})".format( + name, num_classes, prev_num_classes, inner_dim, prev_inner_dim + ) + ) + self.classification_heads[name] = ModelParallelRobertaClassificationHead( + self.args.encoder_embed_dim, + inner_dim or self.args.encoder_embed_dim, + num_classes, + self.args.pooler_activation_fn, + self.args.pooler_dropout, + ) + + +class ModelParallelRobertaLMHead(nn.Module): + """Head for masked language modeling.""" + + def __init__(self, embed_dim, output_dim, activation_fn, weight=None): + super().__init__() + self.dense = ColumnParallelLinear(embed_dim, embed_dim, gather_output=True) + self.activation_fn = utils.get_activation_fn(activation_fn) + self.layer_norm = LayerNorm(embed_dim) + + if weight is None: + weight = nn.Linear(embed_dim, output_dim, 
bias=False).weight + self.weight = weight + self.bias = nn.Parameter(torch.zeros(output_dim)) + + def forward(self, features, masked_tokens=None, **kwargs): + # Only project the unmasked tokens while training, + # saves both memory and computation + if masked_tokens is not None: + features = features[masked_tokens, :] + + x = self.dense(features) + x = self.activation_fn(x) + x = self.layer_norm(x) + + x = copy_to_model_parallel_region(x) + # project back to size of vocabulary with bias + x = F.linear(x, self.weight) + x = gather_from_model_parallel_region(x).contiguous() + x = x + self.bias + return x + + +class ModelParallelRobertaClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, input_dim, inner_dim, num_classes, activation_fn, pooler_dropout + ): + super().__init__() + self.dense = ColumnParallelLinear(input_dim, inner_dim, gather_output=True) + self.activation_fn = utils.get_activation_fn(activation_fn) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = self.activation_fn(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +class ModelParallelRobertaEncoder(FairseqEncoder): + """RoBERTa encoder. + + Implements the :class:`~fairseq.models.FairseqDecoder` interface required + by :class:`~fairseq.models.FairseqLanguageModel`. + """ + + def __init__(self, args, dictionary): + super().__init__(dictionary) + self.args = args + + # RoBERTa is a sentence encoder model, so users will intuitively trim + # encoder layers. However, the implementation uses the fairseq decoder, + # so we fix here. + if args.encoder_layers_to_keep: + args.encoder_layers = len(args.encoder_layers_to_keep.split(",")) + args.decoder_layers_to_keep = args.encoder_layers_to_keep + args.encoder_layers_to_keep = None + + self.sentence_encoder = ModelParallelTransformerSentenceEncoder( + padding_idx=dictionary.pad(), + vocab_size=len(dictionary), + num_encoder_layers=args.encoder_layers, + embedding_dim=args.encoder_embed_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=args.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + layerdrop=args.encoder_layerdrop, + max_seq_len=args.max_positions, + num_segments=0, + encoder_normalize_before=False, + apply_bert_init=False, + activation_fn=args.activation_fn, + ) + self.lm_head = ModelParallelRobertaLMHead( + embed_dim=args.encoder_embed_dim, + output_dim=len(dictionary), + activation_fn=args.activation_fn, + weight=self.sentence_encoder.embed_tokens.weight, + ) + + def forward( + self, + src_tokens, + features_only=False, + return_all_hiddens=False, + masked_tokens=None, + **unused + ): + """ + Args: + src_tokens (LongTensor): input tokens of shape `(batch, src_len)` + features_only (bool, optional): skip LM head and just return + features. If True, the output will be of shape + `(batch, src_len, embed_dim)`. + return_all_hiddens (bool, optional): also return all of the + intermediate hidden states (default: False). + + Returns: + tuple: + - the LM output of shape `(batch, src_len, vocab)` + - a dictionary of additional data, where 'inner_states' + is a list of hidden states. Note that the hidden + states have shape `(src_len, batch, vocab)`. 
+ """ + x, extra = self.extract_features( + src_tokens, return_all_hiddens=return_all_hiddens + ) + if not features_only: + x = self.output_layer(x, masked_tokens=masked_tokens) + return x, extra + + def extract_features(self, src_tokens, return_all_hiddens=False, **unused): + inner_states, _ = self.sentence_encoder( + src_tokens, + last_state_only=not return_all_hiddens, + ) + features = inner_states[-1].transpose(0, 1) # T x B x C -> B x T x C + return features, {"inner_states": inner_states if return_all_hiddens else None} + + def output_layer(self, features, masked_tokens=None, **unused): + return self.lm_head(features, masked_tokens) + + def max_positions(self): + """Maximum output length supported by the encoder.""" + return self.args.max_positions + + +@register_model_architecture("model_parallel_roberta", "model_parallel_roberta") +def base_architecture(args): + args.encoder_layers = getattr(args, "encoder_layers", 12) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12) + + args.activation_fn = getattr(args, "activation_fn", "gelu") + args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh") + + args.dropout = getattr(args, "dropout", 0.1) + args.attention_dropout = getattr(args, "attention_dropout", 0.1) + args.activation_dropout = getattr(args, "activation_dropout", 0.0) + args.pooler_dropout = getattr(args, "pooler_dropout", 0.0) + args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None) + args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0) + + +@register_model_architecture("model_parallel_roberta", "model_parallel_roberta_base") +def roberta_base_architecture(args): + base_architecture(args) + + +@register_model_architecture("model_parallel_roberta", "model_parallel_roberta_large") +def roberta_large_architecture(args): + args.encoder_layers = getattr(args, "encoder_layers", 24) + args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) + args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096) + args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) + base_architecture(args) diff --git a/fairseq-0.10.2/fairseq/model_parallel/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc b/fairseq-0.10.2/fairseq/model_parallel/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f48752481594be870db1bb038756266ad975a668 Binary files /dev/null and b/fairseq-0.10.2/fairseq/model_parallel/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py b/fairseq-0.10.2/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..e10bf5233210b6ec2ac4fe4b0c59dc4bb6d8e4c8 --- /dev/null +++ b/fairseq-0.10.2/fairseq/model_parallel/modules/transformer_sentence_encoder_layer.py @@ -0,0 +1,77 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+
+import torch
+import torch.nn.functional as F
+from fairseq import utils
+from fairseq.model_parallel.modules import ModelParallelMultiheadAttention
+from fairseq.modules import TransformerSentenceEncoderLayer
+
+
+try:
+    from fairseq.model_parallel.megatron.mpu import (
+        ColumnParallelLinear,
+        RowParallelLinear,
+    )
+
+    has_megatron_submodule = True
+except (ImportError, ModuleNotFoundError):
+    has_megatron_submodule = False
+
+
+class ModelParallelTransformerSentenceEncoderLayer(TransformerSentenceEncoderLayer):
+    """
+    Implements a Model Parallel Transformer Encoder Layer used in
+    BERT/XLM style pre-trained models.
+    """
+
+    def build_fc1(self, input_dim, output_dim, **unused):
+        return ColumnParallelLinear(input_dim, output_dim, gather_output=False)
+
+    def build_fc2(self, input_dim, output_dim, **unused):
+        return RowParallelLinear(input_dim, output_dim, input_is_parallel=True)
+
+    def build_self_attention(
+        self,
+        embed_dim,
+        num_attention_heads,
+        dropout,
+        **kwargs,
+    ):
+        return ModelParallelMultiheadAttention(
+            embed_dim, num_attention_heads, dropout=dropout, self_attention=True
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        self_attn_mask: torch.Tensor = None,
+        self_attn_padding_mask: torch.Tensor = None,
+    ):
+        """
+        LayerNorm is applied either before or after the self-attention/ffn
+        modules, similar to the original Transformer implementation.
+        """
+        residual = x
+        x = self.self_attn_layer_norm(x)
+        x, attn = self.self_attn(
+            query=x,
+            key=x,
+            value=x,
+            key_padding_mask=self_attn_padding_mask,
+            need_weights=False,
+            attn_mask=self_attn_mask,
+        )
+        x = self.dropout_module(x)
+        x = residual + x
+
+        residual = x
+        x = self.final_layer_norm(x)
+        x = self.activation_fn(self.fc1(x))
+        x = self.activation_dropout_module(x)
+        x = self.fc2(x)
+        x = self.dropout_module(x)
+        x = residual + x
+        return x, None
diff --git a/fairseq-0.10.2/fairseq/modules/__init__.py b/fairseq-0.10.2/fairseq/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2326ac6e36d9264e74f64e580834685ed99b546
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/__init__.py
@@ -0,0 +1,76 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
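The forward pass of the model-parallel encoder layer above is the standard pre-LayerNorm residual pattern: normalize, transform, then add the residual. A small self-contained sketch of the same control flow in plain PyTorch (PreNormBlock and all dimensions are hypothetical):

    import torch
    import torch.nn as nn

    class PreNormBlock(nn.Module):
        def __init__(self, dim=8):
            super().__init__()
            self.norm = nn.LayerNorm(dim)
            self.ffn = nn.Linear(dim, dim)

        def forward(self, x):
            # LayerNorm first, then the transform, then the residual add,
            # mirroring both sub-blocks of the layer above
            return x + self.ffn(self.norm(x))

    y = PreNormBlock()(torch.randn(2, 4, 8))  # e.g. T x B x C input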
+"""isort:skip_file""" + +from .adaptive_input import AdaptiveInput +from .adaptive_softmax import AdaptiveSoftmax +from .beamable_mm import BeamableMM +from .character_token_embedder import CharacterTokenEmbedder +from .conv_tbc import ConvTBC +from .cross_entropy import cross_entropy +from .downsampled_multihead_attention import DownsampledMultiHeadAttention +from .dynamic_convolution import DynamicConv, DynamicConv1dTBC +from .dynamic_crf_layer import DynamicCRF +from .fairseq_dropout import FairseqDropout +from .fp32_group_norm import Fp32GroupNorm +from .gelu import gelu, gelu_accurate +from .grad_multiply import GradMultiply +from .gumbel_vector_quantizer import GumbelVectorQuantizer +from .kmeans_vector_quantizer import KmeansVectorQuantizer +from .layer_drop import LayerDropModuleList +from .layer_norm import Fp32LayerNorm, LayerNorm +from .learned_positional_embedding import LearnedPositionalEmbedding +from .lightweight_convolution import LightweightConv, LightweightConv1dTBC +from .linearized_convolution import LinearizedConvolution +from .multihead_attention import MultiheadAttention +from .positional_embedding import PositionalEmbedding +from .same_pad import SamePad +from .scalar_bias import ScalarBias +from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding +from .transformer_sentence_encoder_layer import TransformerSentenceEncoderLayer +from .transformer_sentence_encoder import TransformerSentenceEncoder +from .transpose_last import TransposeLast +from .unfold import unfold1d +from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer +from .vggblock import VGGBlock + +__all__ = [ + "AdaptiveInput", + "AdaptiveSoftmax", + "BeamableMM", + "CharacterTokenEmbedder", + "ConvTBC", + "cross_entropy", + "DownsampledMultiHeadAttention", + "DynamicConv1dTBC", + "DynamicConv", + "DynamicCRF", + "FairseqDropout", + "Fp32GroupNorm", + "Fp32LayerNorm", + "gelu", + "gelu_accurate", + "GradMultiply", + "GumbelVectorQuantizer", + "KmeansVectorQuantizer", + "LayerDropModuleList", + "LayerNorm", + "LearnedPositionalEmbedding", + "LightweightConv1dTBC", + "LightweightConv", + "LinearizedConvolution", + "MultiheadAttention", + "PositionalEmbedding", + "SamePad", + "ScalarBias", + "SinusoidalPositionalEmbedding", + "TransformerSentenceEncoderLayer", + "TransformerSentenceEncoder", + "TransformerDecoderLayer", + "TransformerEncoderLayer", + "TransposeLast", + "VGGBlock", + "unfold1d", +] diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/adaptive_softmax.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/adaptive_softmax.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ae55b409c8fd64e653f3a597992bfb3cf60bb0c Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/adaptive_softmax.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/dynamic_convolution.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/dynamic_convolution.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d55a7d487d1eb851241d926afbce0c00458ad61 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/dynamic_convolution.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/grad_multiply.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/grad_multiply.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86558e12ee9dafa50826414ba5d83040cfdbbdfc Binary files /dev/null and 
b/fairseq-0.10.2/fairseq/modules/__pycache__/grad_multiply.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/gumbel_vector_quantizer.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/gumbel_vector_quantizer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ea8ba5424e5fe5fe7ebe67548695e8e6191ab370 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/gumbel_vector_quantizer.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/kmeans_vector_quantizer.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/kmeans_vector_quantizer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f29fa80ae4a847973b0a506b0b99db9a32523fc1 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/kmeans_vector_quantizer.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/learned_positional_embedding.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/learned_positional_embedding.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7746805e2e3405d1b4282a9453b6f707093ff199 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/learned_positional_embedding.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/lightweight_convolution.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/lightweight_convolution.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44b68aaf976993f5c786b4e2b9e30228b1315913 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/lightweight_convolution.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ffad100e8634548da076cacd0f55925df03cb8a4 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/transformer_sentence_encoder_layer.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/__pycache__/vggblock.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/__pycache__/vggblock.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e0ef3697d1ae69de31c1c1bb27e2f64c4b16423 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/__pycache__/vggblock.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/adaptive_softmax.py b/fairseq-0.10.2/fairseq/modules/adaptive_softmax.py new file mode 100644 index 0000000000000000000000000000000000000000..ae0c77ba0f6ee98501306d66cbc4a948b4ade0f7 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/adaptive_softmax.py @@ -0,0 +1,268 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
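Since fairseq/modules/__init__.py above re-exports each module in __all__, downstream code imports from the package rather than from individual submodules; for example (assuming fairseq 0.10.2 is installed):

    from fairseq.modules import LayerNorm, MultiheadAttention

    attn = MultiheadAttention(embed_dim=16, num_heads=4, self_attention=True)
    norm = LayerNorm(16)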
+ +import functools +import operator + +import torch +import torch.nn.functional as F +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.quant_noise import quant_noise +from torch import nn + + +class TiedLinear(nn.Module): + def __init__(self, weight, transpose): + super().__init__() + self.weight = weight + self.transpose = transpose + + def forward(self, input): + return F.linear(input, self.weight.t() if self.transpose else self.weight) + + +class TiedHeadModule(nn.Module): + def __init__(self, weights, input_dim, num_classes, q_noise, qn_block_size): + super().__init__() + tied_emb, _ = weights + self.num_words, emb_dim = tied_emb.size() + + self.word_proj = quant_noise( + TiedLinear(tied_emb, transpose=False), q_noise, qn_block_size + ) + if input_dim != emb_dim: + self.word_proj = nn.Sequential( + quant_noise( + nn.Linear(input_dim, emb_dim, bias=False), q_noise, qn_block_size + ), + self.word_proj, + ) + + self.class_proj = quant_noise( + nn.Linear(input_dim, num_classes, bias=False), q_noise, qn_block_size + ) + self.out_dim = self.num_words + num_classes + + self.register_buffer("_float_tensor", torch.FloatTensor(1)) + + def forward(self, input): + inp_sz = functools.reduce(operator.mul, input.shape[:-1], 1) + out = self._float_tensor.new(inp_sz, self.out_dim) + out[:, : self.num_words] = self.word_proj(input.view(inp_sz, -1)) + out[:, self.num_words :] = self.class_proj(input.view(inp_sz, -1)) + return out + + +class AdaptiveSoftmax(nn.Module): + """ + This is an implementation of the efficient softmax approximation for + graphical processing units (GPU), described in the paper "Efficient softmax + approximation for GPUs" (http://arxiv.org/abs/1609.04309). + """ + + def __init__( + self, + vocab_size, + input_dim, + cutoff, + dropout, + factor=4.0, + adaptive_inputs=None, + tie_proj=False, + q_noise=0, + qn_block_size=8, + ): + super().__init__() + + if vocab_size > cutoff[-1]: + cutoff = cutoff + [vocab_size] + else: + assert ( + vocab_size == cutoff[-1] + ), "cannot specify cutoff larger than vocab size" + + output_dim = cutoff[0] + len(cutoff) - 1 + + self.vocab_size = vocab_size + self.cutoff = cutoff + self.dropout_module = FairseqDropout( + dropout, module_name=self.__class__.__name__ + ) + self.input_dim = input_dim + self.factor = factor + self.q_noise = q_noise + self.qn_block_size = qn_block_size + + self.lsm = nn.LogSoftmax(dim=1) + + if adaptive_inputs is not None: + self.head = TiedHeadModule( + adaptive_inputs.weights_for_band(0), + input_dim, + len(cutoff) - 1, + self.q_noise, + self.qn_block_size, + ) + else: + self.head = quant_noise( + nn.Linear(input_dim, output_dim, bias=False), + self.q_noise, + self.qn_block_size, + ) + + self._make_tail(adaptive_inputs, tie_proj) + + def init_weights(m): + if ( + hasattr(m, "weight") + and not isinstance(m, TiedLinear) + and not isinstance(m, TiedHeadModule) + ): + nn.init.xavier_uniform_(m.weight) + + self.apply(init_weights) + + self.register_buffer("version", torch.LongTensor([1])) + + def _make_tail(self, adaptive_inputs=None, tie_proj=False): + self.tail = nn.ModuleList() + for i in range(len(self.cutoff) - 1): + dim = int(self.input_dim // self.factor ** (i + 1)) + + tied_emb, tied_proj = ( + adaptive_inputs.weights_for_band(i + 1) + if adaptive_inputs is not None + else (None, None) + ) + + if tied_proj is not None: + if tie_proj: + proj = quant_noise( + TiedLinear(tied_proj, transpose=True), + self.q_noise, + self.qn_block_size, + ) + else: + proj = quant_noise( + 
nn.Linear(tied_proj.size(0), tied_proj.size(1), bias=False),
+                        self.q_noise,
+                        self.qn_block_size,
+                    )
+            else:
+                proj = quant_noise(
+                    nn.Linear(self.input_dim, dim, bias=False),
+                    self.q_noise,
+                    self.qn_block_size,
+                )
+
+            if tied_emb is None:
+                out_proj = nn.Linear(
+                    dim, self.cutoff[i + 1] - self.cutoff[i], bias=False
+                )
+            else:
+                out_proj = TiedLinear(tied_emb, transpose=False)
+
+            m = nn.Sequential(
+                proj,
+                nn.Dropout(self.dropout_module.p),
+                quant_noise(out_proj, self.q_noise, self.qn_block_size),
+            )
+
+            self.tail.append(m)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        version_name = name + ".version"
+        if version_name not in state_dict:
+            raise Exception("This version of the model is no longer supported")
+
+    def adapt_target(self, target):
+        """
+        In order to be efficient, the AdaptiveSoftmax does not compute the
+        scores for all the words of the vocabulary for all the examples. It is
+        thus necessary to call the method adapt_target of the AdaptiveSoftmax
+        layer inside each forward pass.
+        """
+
+        target = target.view(-1)
+        new_target = [target.clone()]
+        target_idxs = []
+
+        for i in range(len(self.cutoff) - 1):
+            mask = target.ge(self.cutoff[i]).mul(target.lt(self.cutoff[i + 1]))
+            new_target[0][mask] = self.cutoff[0] + i
+
+            if mask.any():
+                target_idxs.append(mask.nonzero(as_tuple=False).squeeze(1))
+                new_target.append(target[mask].add(-self.cutoff[i]))
+            else:
+                target_idxs.append(None)
+                new_target.append(None)
+
+        return new_target, target_idxs
+
+    def forward(self, input, target):
+        """
+        Args:
+            input: (b x t x d)
+            target: (b x t)
+        Returns:
+            2 lists: output for each cutoff section and new targets by cutoff
+        """
+
+        input = input.contiguous().view(-1, input.size(-1))
+        input = self.dropout_module(input)
+
+        new_target, target_idxs = self.adapt_target(target)
+        output = [self.head(input)]
+
+        for i in range(len(target_idxs)):
+            if target_idxs[i] is not None:
+                output.append(self.tail[i](input.index_select(0, target_idxs[i])))
+            else:
+                output.append(None)
+
+        return output, new_target
+
+    def get_log_prob(self, input, target):
+        """
+        Computes the log probabilities for all the words of the vocabulary,
+        given a 2D tensor of hidden vectors.
+ """ + + bsz, length, dim = input.size() + input = input.contiguous().view(-1, dim) + + if target is not None: + _, target_idxs = self.adapt_target(target) + else: + target_idxs = None + + head_y = self.head(input) + log_probs = head_y.new_zeros(input.size(0), self.vocab_size) + + head_sz = self.cutoff[0] + len(self.tail) + log_probs[:, :head_sz] = self.lsm(head_y) + tail_priors = log_probs[:, self.cutoff[0] : head_sz].clone() + + for i in range(len(self.tail)): + start = self.cutoff[i] + end = self.cutoff[i + 1] + + if target_idxs is None: + tail_out = log_probs[:, start:end] + tail_out.copy_(self.tail[i](input)) + log_probs[:, start:end] = self.lsm(tail_out).add_( + tail_priors[:, i, None] + ) + elif target_idxs[i] is not None: + idxs = target_idxs[i] + tail_out = log_probs[idxs, start:end] + tail_out.copy_(self.tail[i](input[idxs])) + log_probs[idxs, start:end] = self.lsm(tail_out).add_( + tail_priors[idxs, i, None] + ) + + log_probs = log_probs.view(bsz, length, -1) + return log_probs diff --git a/fairseq-0.10.2/fairseq/modules/beamable_mm.py b/fairseq-0.10.2/fairseq/modules/beamable_mm.py new file mode 100644 index 0000000000000000000000000000000000000000..eff1a4607f600c71210e6b914985dc48731aae86 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/beamable_mm.py @@ -0,0 +1,49 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + + +class BeamableMM(nn.Module): + """This module provides an optimized MM for beam decoding with attention. + + It leverage the fact that the source-side of the input is replicated beam + times and the target-side of the input is of width one. This layer speeds up + inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)} + with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}. + """ + + def __init__(self, beam_size=None): + super(BeamableMM, self).__init__() + self.beam_size = beam_size + + def forward(self, input1, input2): + if ( + not self.training + and self.beam_size is not None # test mode + and input1.dim() == 3 # beam size is set + and input1.size(1) # only support batched input + == 1 # single time step update + ): + bsz, beam = input1.size(0), self.beam_size + + # bsz x 1 x nhu --> bsz/beam x beam x nhu + input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1) + + # bsz x sz2 x nhu --> bsz/beam x sz2 x nhu + input2 = input2.unfold(0, beam, beam)[:, :, :, 0] + + # use non batched operation if bsz = beam + if input1.size(0) == 1: + output = torch.mm(input1[0, :, :], input2[0, :, :]) + else: + output = input1.bmm(input2) + return output.view(bsz, 1, -1) + else: + return input1.bmm(input2) + + def set_beam_size(self, beam_size): + self.beam_size = beam_size diff --git a/fairseq-0.10.2/fairseq/modules/character_token_embedder.py b/fairseq-0.10.2/fairseq/modules/character_token_embedder.py new file mode 100644 index 0000000000000000000000000000000000000000..181221b61b9f76453b67e3b848b198620dce912c --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/character_token_embedder.py @@ -0,0 +1,214 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import logging +from typing import List, Tuple + +import torch +import torch.nn.functional as F +from fairseq.data import Dictionary +from torch import nn + + +CHAR_PAD_IDX = 0 +CHAR_EOS_IDX = 257 + + +logger = logging.getLogger(__name__) + + +class CharacterTokenEmbedder(torch.nn.Module): + def __init__( + self, + vocab: Dictionary, + filters: List[Tuple[int, int]], + char_embed_dim: int, + word_embed_dim: int, + highway_layers: int, + max_char_len: int = 50, + char_inputs: bool = False, + ): + super(CharacterTokenEmbedder, self).__init__() + + self.onnx_trace = False + self.embedding_dim = word_embed_dim + self.max_char_len = max_char_len + self.char_embeddings = nn.Embedding(257, char_embed_dim, padding_idx=0) + self.symbol_embeddings = nn.Parameter(torch.FloatTensor(2, word_embed_dim)) + self.eos_idx, self.unk_idx = 0, 1 + self.char_inputs = char_inputs + + self.convolutions = nn.ModuleList() + for width, out_c in filters: + self.convolutions.append( + nn.Conv1d(char_embed_dim, out_c, kernel_size=width) + ) + + last_dim = sum(f[1] for f in filters) + + self.highway = Highway(last_dim, highway_layers) if highway_layers > 0 else None + + self.projection = nn.Linear(last_dim, word_embed_dim) + + assert ( + vocab is not None or char_inputs + ), "vocab must be set if not using char inputs" + self.vocab = None + if vocab is not None: + self.set_vocab(vocab, max_char_len) + + self.reset_parameters() + + def prepare_for_onnx_export_(self): + self.onnx_trace = True + + def set_vocab(self, vocab, max_char_len): + word_to_char = torch.LongTensor(len(vocab), max_char_len) + + truncated = 0 + for i in range(len(vocab)): + if i < vocab.nspecial: + char_idxs = [0] * max_char_len + else: + chars = vocab[i].encode() + # +1 for padding + char_idxs = [c + 1 for c in chars] + [0] * (max_char_len - len(chars)) + if len(char_idxs) > max_char_len: + truncated += 1 + char_idxs = char_idxs[:max_char_len] + word_to_char[i] = torch.LongTensor(char_idxs) + + if truncated > 0: + logger.info( + "truncated {} words longer than {} characters".format( + truncated, max_char_len + ) + ) + + self.vocab = vocab + self.word_to_char = word_to_char + + @property + def padding_idx(self): + return Dictionary().pad() if self.vocab is None else self.vocab.pad() + + def reset_parameters(self): + nn.init.xavier_normal_(self.char_embeddings.weight) + nn.init.xavier_normal_(self.symbol_embeddings) + nn.init.xavier_uniform_(self.projection.weight) + + nn.init.constant_( + self.char_embeddings.weight[self.char_embeddings.padding_idx], 0.0 + ) + nn.init.constant_(self.projection.bias, 0.0) + + def forward( + self, + input: torch.Tensor, + ): + if self.char_inputs: + chars = input.view(-1, self.max_char_len) + pads = chars[:, 0].eq(CHAR_PAD_IDX) + eos = chars[:, 0].eq(CHAR_EOS_IDX) + if eos.any(): + if self.onnx_trace: + chars = torch.where(eos.unsqueeze(1), chars.new_zeros(1), chars) + else: + chars[eos] = 0 + + unk = None + else: + flat_words = input.view(-1) + chars = self.word_to_char[flat_words.type_as(self.word_to_char)].type_as( + input + ) + pads = flat_words.eq(self.vocab.pad()) + eos = flat_words.eq(self.vocab.eos()) + unk = flat_words.eq(self.vocab.unk()) + + word_embs = self._convolve(chars) + if self.onnx_trace: + if pads.any(): + word_embs = torch.where( + pads.unsqueeze(1), word_embs.new_zeros(1), word_embs + ) + if eos.any(): + word_embs = torch.where( + eos.unsqueeze(1), self.symbol_embeddings[self.eos_idx], word_embs + ) + if unk is not None and unk.any(): + word_embs = torch.where( + unk.unsqueeze(1), 
self.symbol_embeddings[self.unk_idx], word_embs
+                )
+        else:
+            if pads.any():
+                word_embs[pads] = 0
+            if eos.any():
+                word_embs[eos] = self.symbol_embeddings[self.eos_idx]
+            if unk is not None and unk.any():
+                word_embs[unk] = self.symbol_embeddings[self.unk_idx]
+
+        return word_embs.view(input.size()[:2] + (-1,))
+
+    def _convolve(
+        self,
+        char_idxs: torch.Tensor,
+    ):
+        char_embs = self.char_embeddings(char_idxs)
+        char_embs = char_embs.transpose(1, 2)  # BTC -> BCT
+
+        conv_result = []
+
+        for conv in self.convolutions:
+            x = conv(char_embs)
+            x, _ = torch.max(x, -1)
+            x = F.relu(x)
+            conv_result.append(x)
+
+        x = torch.cat(conv_result, dim=-1)
+
+        if self.highway is not None:
+            x = self.highway(x)
+        x = self.projection(x)
+
+        return x
+
+
+class Highway(torch.nn.Module):
+    """
+    A Highway layer (Srivastava et al., 2015).
+    Adapted from the AllenNLP implementation.
+    """
+
+    def __init__(self, input_dim: int, num_layers: int = 1):
+        super(Highway, self).__init__()
+        self.input_dim = input_dim
+        self.layers = nn.ModuleList(
+            [nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)]
+        )
+        self.activation = nn.ReLU()
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        for layer in self.layers:
+            # As per comment in AllenNLP:
+            # We should bias the highway layer to just carry its input forward. We do that by
+            # setting the bias on `B(x)` to be positive, because that means `g` will be biased to
+            # be high, so we will carry the input forward. The bias on `B(x)` is the second half
+            # of the bias vector in each Linear layer.
+            nn.init.constant_(layer.bias[self.input_dim :], 1)
+
+            nn.init.constant_(layer.bias[: self.input_dim], 0)
+            nn.init.xavier_normal_(layer.weight)
+
+    def forward(self, x: torch.Tensor):
+        for layer in self.layers:
+            projection = layer(x)
+            proj_x, gate = projection.chunk(2, dim=-1)
+            proj_x = self.activation(proj_x)
+            gate = torch.sigmoid(gate)
+            x = gate * x + (gate.new_tensor([1]) - gate) * proj_x
+        return x
diff --git a/fairseq-0.10.2/fairseq/modules/cross_entropy.py b/fairseq-0.10.2/fairseq/modules/cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d2beb44bbf8522ea4002fa6108c41e6a4b5ccc6
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/cross_entropy.py
@@ -0,0 +1,59 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
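The Highway update above computes g * x + (1 - g) * relu(B(x)), with a single Linear producing both B(x) and the gate; a tiny standalone sketch of the same arithmetic (sizes hypothetical):

    import torch
    import torch.nn as nn

    dim = 4
    layer = nn.Linear(dim, dim * 2)           # packs B(x) and the gate, as above
    x = torch.randn(3, dim)
    proj_x, gate = layer(x).chunk(2, dim=-1)
    g = torch.sigmoid(gate)
    y = g * x + (1 - g) * torch.relu(proj_x)  # carry vs. transform mix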
+
+import logging
+
+import torch
+import torch.nn.functional as F
+
+
+logger = logging.getLogger(__name__)
+
+
+def _cross_entropy_pytorch(logits, target, ignore_index=None, reduction="mean"):
+    lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+    return F.nll_loss(
+        lprobs,
+        target,
+        ignore_index=ignore_index,
+        reduction=reduction,
+    )
+
+
+try:
+    import xentropy_cuda
+    from apex.contrib import xentropy
+
+    logger.info("using fused cross entropy")
+
+    def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
+        if logits.device == torch.device("cpu"):
+            return _cross_entropy_pytorch(logits, target, ignore_index, reduction)
+        else:
+            half_to_float = logits.dtype == torch.half
+            losses = xentropy.SoftmaxCrossEntropyLoss.apply(
+                logits,
+                target,
+                0.0,
+                ignore_index,
+                half_to_float,
+            )
+            if reduction == "sum":
+                return losses.sum()
+            elif reduction == "mean":
+                if ignore_index >= 0:
+                    return losses.sum() / target.ne(ignore_index).sum()
+                else:
+                    return losses.mean()
+            elif reduction == "none":
+                return losses
+            else:
+                raise NotImplementedError
+
+
+except ImportError:
+
+    def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
+        return _cross_entropy_pytorch(logits, target, ignore_index, reduction)
diff --git a/fairseq-0.10.2/fairseq/modules/fp32_group_norm.py b/fairseq-0.10.2/fairseq/modules/fp32_group_norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..d03aac022e30c8c14a600062d1d86429504ba003
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/fp32_group_norm.py
@@ -0,0 +1,25 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+Group norm done in fp32 (for fp16 training)
+"""
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class Fp32GroupNorm(nn.GroupNorm):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def forward(self, input):
+        output = F.group_norm(
+            input.float(),
+            self.num_groups,
+            self.weight.float() if self.weight is not None else None,
+            self.bias.float() if self.bias is not None else None,
+            self.eps,
+        )
+        return output.type_as(input)
diff --git a/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu b/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8ee83a56c89754c2abbe717b269d07ca9e64eef2
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/lightconv_layer/lightconv_cuda_kernel.cu
@@ -0,0 +1,375 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "lightconv_cuda.cuh"
+#include "lightconv_cuda_forward.cu"
+#include "lightconv_cuda_backward.cu"
+#include "../cuda_utils.cu"
+
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_forward_kernel(const scalar_t* input,
+                              const scalar_t* filters,
+                              int minibatch, int sequenceLength,
+                              int numFeatures, int numFiltersInBlock,
+                              scalar_t* output) {
+
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int featureIdx = blockIdx.y;
+  const int filterIdx = featureIdx / numFiltersInBlock;
+
+  const int IOOffset = numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength;
+  const scalar_t* inputFeature = &input[IOOffset];
+  scalar_t* outputFeature = &output[IOOffset];
+  const scalar_t* inputFilter = &filters[filterIdx * FS];
+
+  assert(blockDim.x == SB);
+
+  scalar_t filter[FS];
+  #pragma unroll
+  for (int i = 0; i < FS; ++i) {
+    filter[i] = inputFilter[i];
+  }
+
+  __shared__ scalar_t temp[SB + FS];
+  zeroSharedMem<FS, SB, padding_l>(temp);
+
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  for (int i = 0; i < numIterations; ++i) {
+    // Read input into shared memory
+    const int inputOffset = i * SB;
+
+    load_input_to_shared<scalar_t, FS, SB, padding_l>(
+        inputFeature, inputOffset, sequenceLength,
+        i, numIterations, (numIterations == 1), temp);
+
+    __syncthreads();
+
+    scalar_t out = 0;
+    #pragma unroll
+    for (int j = 0; j < FS; ++j) {
+      out += filter[j] * temp[tid + j];
+    }
+
+    // Write output
+    const int outputOffset = inputOffset;
+    if ((outputOffset + tid) < sequenceLength) {
+      outputFeature[outputOffset + tid] = out;
+    }
+
+    __syncthreads();
+  }
+}
+
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_grad_wrt_input_kernel(
+    const scalar_t* input,
+    const scalar_t* filters,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    scalar_t* output) {
+
+  // input grad kernel is similar to forward kernel
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int featureIdx = blockIdx.y;
+  const int filterIdx = featureIdx / numFiltersInBlock;
+
+  const int IOOffset = numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength;
+  const scalar_t* inputFeature = &input[IOOffset];
+  scalar_t* outputFeature = &output[IOOffset];
+  const scalar_t* inputFilter = &filters[filterIdx * FS];
+
+  assert(blockDim.x == SB);
+
+  scalar_t filter[FS];
+
+  // The only change is loading the filter in reverse
+  #pragma unroll
+  for (int i = 0; i < FS; ++i) {
+    filter[i] = inputFilter[FS - i - 1];
+  }
+
+  __shared__ scalar_t temp[SB + FS];
+  const int padding = FS - padding_l - 1;
+  zeroSharedMem<FS, SB, padding>(temp);
+
+  __syncthreads();
+
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  for (int i = 0; i < numIterations; ++i) {
+    // Read input into shared memory
+    const int inputOffset = i * SB;
+
+    load_input_to_shared<scalar_t, FS, SB, padding>(
+        inputFeature, inputOffset, sequenceLength,
+        i, numIterations, false, temp);
+
+    __syncthreads();
+
+    scalar_t out = 0;
+    #pragma unroll
+    for (int j = 0; j < FS; ++j) {
+      out += filter[j] * temp[tid + j];
+    }
+
+    // Write output
+    const int outputOffset = inputOffset;
+    if ((outputOffset + tid) < sequenceLength) {
+      outputFeature[outputOffset + tid] = out;
+    }
+
+    __syncthreads();
+  }
+}
+
+// This is by far the most expensive kernel in terms of time taken.
+// Can be 16x slower than the forward or grad_wrt_input when filter size is 31
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_firstpass_short_kernel(
+    const scalar_t* input,
+    const scalar_t* gradInput,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    int numHeads,
+    float* output) {
+
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int filterIdx = blockIdx.y;
+
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  float* tempOutputGradWeight = &output[filterIdx * FS * minibatch];
+
+  assert(blockDim.x == SB);
+
+  __shared__ scalar_t tempInput[SB + FS];
+  __shared__ scalar_t tempGradInput[SB + FS];
+
+  // local weight accumulation
+  float accumWeights[FS];
+
+  // Initialize memory
+  for (int i = 0; i < FS; ++i) {
+    accumWeights[i] = float(0.0);
+  }
+
+  // loop over each sequence within filterblock
+  for (int idxInFilterBlock = 0; idxInFilterBlock < numFiltersInBlock; ++idxInFilterBlock) {
+
+    const int featureOffset = batchIdx * numFeatures * sequenceLength + (filterIdx * numFiltersInBlock + idxInFilterBlock) * sequenceLength;
+    const scalar_t* inputFeature = &input[featureOffset];
+    const scalar_t* gradInputFeature = &gradInput[featureOffset];
+
+    zeroSharedMem<FS, SB, padding_l>(tempInput);
+    zeroSharedMem<FS, SB, padding_l>(tempGradInput);
+    __syncthreads();
+
+    for (int i = 0; i < numIterations; ++i) {
+
+      const int inputOffset = i * SB;
+
+      load_input_to_shared<scalar_t, FS, SB, padding_l>(
+          inputFeature, inputOffset, sequenceLength,
+          i, numIterations, false, tempInput);
+      load_input_to_shared<scalar_t, FS, SB, padding_l>(
+          gradInputFeature, inputOffset, sequenceLength,
+          i, numIterations, false, tempGradInput);
+
+      __syncthreads();
+
+      const int gradIndex = (FS/2) + tid;
+      scalar_t tempGrad = tempGradInput[gradIndex];
+
+      #pragma unroll
+      for (int j = 0; j < FS; j++) {
+        const int inputIndex = tid + j;
+        accumWeights[j] += tempInput[inputIndex] * tempGrad;
+      }
+
+      __syncthreads();
+
+    }
+
+  }
+
+  // Row-major sum
+  for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) {
+
+    float temp;
+    if (tid < sequenceLength) {
+      temp = accumWeights[filterWeightIdx];
+    } else {
+      temp = float(0.0);
+    }
+
+    const int outputOffset = filterWeightIdx * minibatch + batchIdx;
+
+    temp = blockReduce(temp);
+
+    if (tid == 0) {
+      tempOutputGradWeight[outputOffset] = temp;
+    }
+  }
+}
+
+template<int FS, int SB, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_secondpass_short_kernel(
+    const float* input,
+    const int minibatch,
+    const int numFiltersInBlock,
+    scalar_t* output) {
+
+  assert(blockDim.x == SB);
+
+  const int tid = threadIdx.x;
+
+  const int filterIdx = blockIdx.x;
+  const int filterWeightIdx = blockIdx.y;
+
+  const int inputOffset = filterIdx * FS * minibatch +
+                          filterWeightIdx * minibatch;
+  const float* tempInput = &input[inputOffset];
+
+  // read into shared memory for reduction
+  int readIndex = tid;
+
+  float sum = 0.0;
+  while (readIndex < minibatch) {
+    sum += tempInput[readIndex];
+    readIndex += SB;
+  }
+
+  float temp = blockReduce(sum);
+
+  if (tid == 0) {
+    output[blockIdx.x * FS + blockIdx.y] = temp;
+  }
+}
+
+// This is by far the most expensive kernel in terms of time taken.
+// Can be 16x slower than the forward or grad_wrt_input when filter size is 31
+template<int FS, int SB, int padding_l, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_firstpass_kernel(
+    const scalar_t* input,
+    const scalar_t* gradInput,
+    int minibatch,
+    int sequenceLength,
+    int numFeatures,
+    int numFiltersInBlock,
+    float* output) {
+
+  assert(blockDim.x == SB);
+
+  const int tid = threadIdx.x;
+  const int batchIdx = blockIdx.x;
+  const int featureIdx = blockIdx.y;
+  const int filterIdx = featureIdx / numFiltersInBlock;
+  const int idxInFilterBlock = featureIdx % numFiltersInBlock;
+
+  const int numIterations = divUp<int, int>(sequenceLength, SB);
+
+  float temp;
+
+  __shared__ scalar_t tempInput[SB + FS];
+  __shared__ scalar_t tempGradInput[SB + FS];
+  zeroSharedMem<FS, SB, padding_l>(tempInput);
+  zeroSharedMem<FS, SB, padding_l>(tempGradInput);
+  __syncthreads();
+
+  float accumWeights[FS];
+
+  for (int i = 0; i < FS; ++i) {
+    accumWeights[i] = float(0.0);
+  }
+
+  const int IOOffset = batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength;
+  const scalar_t* inputFeature = &input[IOOffset];
+  const scalar_t* gradInputFeature = &gradInput[IOOffset];
+  float* tempOutputGradWeight = &output[filterIdx * FS * minibatch * numFiltersInBlock];
+
+  for (int i = 0; i < numIterations; ++i) {
+    const int inputOffset = i * SB;
+
+    load_input_to_shared<scalar_t, FS, SB, padding_l>(
+        inputFeature, inputOffset, sequenceLength,
+        i, numIterations, false, tempInput);
+    load_input_to_shared<scalar_t, FS, SB, padding_l>(
+        gradInputFeature, inputOffset, sequenceLength,
+        i, numIterations, false, tempGradInput);
+    __syncthreads();
+
+    #pragma unroll
+    for (int j = 0; j < FS; ++j) {
+      accumWeights[j] += tempInput[tid + j] * tempGradInput[tid + (FS/2)];
+    }
+
+    __syncthreads();
+  }
+
+  // Row-major sum
+  for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) {
+
+    // Write to shared memory before reduction
+    if (tid < sequenceLength) {
+      temp = accumWeights[filterWeightIdx];
+    } else {
+      temp = float(0.0);
+    }
+
+    temp = blockReduce(temp);
+
+    const int outputOffset = filterWeightIdx * minibatch * numFiltersInBlock +
+                             batchIdx * numFiltersInBlock +
+                             idxInFilterBlock;
+
+    if (tid == 0) {
+      tempOutputGradWeight[outputOffset] = temp;
+    }
+  }
+}
+
+template<int FS, int SB, typename scalar_t>
+__global__
+void lightconv_grad_wrt_weights_secondpass_kernel(
+    const float* input,
+    const int minibatch,
+    const int numFiltersInBlock,
+    scalar_t* output) {
+
+  assert(blockDim.x == SB);
+  const int tid = threadIdx.x;
+
+  // What is the id within a minibatch
+  const int filterIdx = blockIdx.x;
+  const int filterWeightIdx = blockIdx.y;
+
+  const int inputOffset = filterIdx * FS * minibatch * numFiltersInBlock +
+                          filterWeightIdx * minibatch * numFiltersInBlock;
+  const float* tempInput = &input[inputOffset];
+
+  int readIndex = tid;
+
+  float sum = float(0.0);
+  while (readIndex < (minibatch * numFiltersInBlock)) {
+    sum += tempInput[readIndex];
+    readIndex += SB;
+  }
+
+  float temp = blockReduce(sum);
+
+  if (tid == 0) {
+    output[blockIdx.x * FS + blockIdx.y] = temp;
+  }
+}
diff --git a/fairseq-0.10.2/fairseq/modules/linearized_convolution.py b/fairseq-0.10.2/fairseq/modules/linearized_convolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..09a8f201c0218f461f44ca57b3352328e4efb936
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/linearized_convolution.py
@@ -0,0 +1,104 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
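In tensor form, the weight gradient that the two-pass kernels above accumulate is, for one feature channel and under a same-padding assumption, a sum of input-times-grad products over batch and time. A torch sketch of that formula (illustrative only, not the CUDA code; all sizes hypothetical):

    import torch
    import torch.nn.functional as F

    B, T, FS = 2, 6, 3                  # minibatch, sequence length, filter size
    x = torch.randn(B, T)               # input for one channel
    gy = torch.randn(B, T)              # gradient flowing into that channel
    pad = FS // 2
    xp = F.pad(x, (pad, pad))
    # grad wrt filter tap j sums x[t + j - pad] * gy[t] over batch and time
    gw = torch.stack([(xp[:, j:j + T] * gy).sum() for j in range(FS)])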
+ +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.incremental_decoding_utils import with_incremental_state + +from .conv_tbc import ConvTBC + + +@with_incremental_state +class LinearizedConvolution(ConvTBC): + """An optimized version of nn.Conv1d. + + At training time, this module uses ConvTBC, which is an optimized version + of Conv1d. At inference time, it optimizes incremental generation (i.e., + one time step at a time) by replacing the convolutions with linear layers. + Note that the input order changes from training to inference. + """ + + def __init__(self, in_channels, out_channels, kernel_size, **kwargs): + super().__init__(in_channels, out_channels, kernel_size, **kwargs) + self._linearized_weight = None + self.register_backward_hook(self._clear_linearized_weight) + + def state_dict(self, destination=None, prefix="", keep_vars=False): + state = ConvTBC.state_dict(self, destination, prefix, keep_vars=keep_vars) + # don't store redundant _linearized_weight in checkpoints + if prefix + "_linearized_weight" in state: + del state[prefix + "_linearized_weight"] + return state + + def upgrade_state_dict_named(self, state_dict, name): + prefix = name + "." if name != "" else "" + if prefix + "_linearized_weight" in state_dict: + del state_dict[prefix + "_linearized_weight"] + + def forward(self, input, incremental_state=None): + """ + Args: + incremental_state: Used to buffer signal; if not None, then input is + expected to contain a single frame. If the input order changes + between time steps, call reorder_incremental_state. + Input: + Time x Batch x Channel during training + Batch x Time x Channel during inference + """ + if incremental_state is None: + output = super().forward(input) + if self.kernel_size[0] > 1 and self.padding[0] > 0: + # remove future timesteps added by padding + output = output[: -self.padding[0], :, :] + return output + + # reshape weight + weight = self._get_linearized_weight() + kw = self.kernel_size[0] + + bsz = input.size(0) # input: bsz x len x dim + if kw > 1: + input = input.data + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is None: + input_buffer = input.new(bsz, kw, input.size(2)).zero_() + self._set_input_buffer(incremental_state, input_buffer) + else: + # shift buffer + input_buffer[:, :-1, :] = input_buffer[:, 1:, :].clone() + # append next input + input_buffer[:, -1, :] = input[:, -1, :] + input = input_buffer + with torch.no_grad(): + output = F.linear(input.view(bsz, -1), weight, self.bias) + return output.view(bsz, 1, -1) + + def reorder_incremental_state(self, incremental_state, new_order): + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is not None: + input_buffer = input_buffer.index_select(0, new_order) + self._set_input_buffer(incremental_state, input_buffer) + + def _get_input_buffer(self, incremental_state): + return utils.get_incremental_state(self, incremental_state, "input_buffer") + + def _set_input_buffer(self, incremental_state, new_buffer): + return utils.set_incremental_state( + self, incremental_state, "input_buffer", new_buffer + ) + + def _get_linearized_weight(self): + if self._linearized_weight is None: + kw = self.kernel_size[0] + weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() + assert weight.size() == (self.out_channels, kw, self.in_channels) + self._linearized_weight = torch.nn.Parameter( + weight.view(self.out_channels, -1) + ) + return self._linearized_weight + + def _clear_linearized_weight(self, 
*args): + self._linearized_weight = None diff --git a/fairseq-0.10.2/fairseq/modules/multihead_attention.py b/fairseq-0.10.2/fairseq/modules/multihead_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..99f95deb5f894c62e011ef466aa28eeca3881fd1 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/multihead_attention.py @@ -0,0 +1,488 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Dict, Optional, Tuple + +import torch +import torch.nn.functional as F +from fairseq import utils +from fairseq.incremental_decoding_utils import with_incremental_state +from fairseq.modules.fairseq_dropout import FairseqDropout +from fairseq.modules.quant_noise import quant_noise +from torch import Tensor, nn +from torch.nn import Parameter + + +@with_incremental_state +class MultiheadAttention(nn.Module): + """Multi-headed attention. + + See "Attention Is All You Need" for more details. + """ + + def __init__( + self, + embed_dim, + num_heads, + kdim=None, + vdim=None, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + self_attention=False, + encoder_decoder_attention=False, + q_noise=0.0, + qn_block_size=8, + ): + super().__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout_module = FairseqDropout( + dropout, module_name=self.__class__.__name__ + ) + + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim ** -0.5 + + self.self_attention = self_attention + self.encoder_decoder_attention = encoder_decoder_attention + + assert not self.self_attention or self.qkv_same_dim, ( + "Self-attention requires query, key and " "value to be of the same size" + ) + + self.k_proj = quant_noise( + nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size + ) + self.v_proj = quant_noise( + nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size + ) + self.q_proj = quant_noise( + nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size + ) + + self.out_proj = quant_noise( + nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size + ) + + if add_bias_kv: + self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim)) + self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim)) + else: + self.bias_k = self.bias_v = None + + self.add_zero_attn = add_zero_attn + + self.reset_parameters() + + self.onnx_trace = False + self.tpu = False + + def prepare_for_onnx_export_(self): + self.onnx_trace = True + + def prepare_for_tpu_(self, **kwargs): + self.tpu = True + + def reset_parameters(self): + if self.qkv_same_dim: + # Empirically observed the convergence to be much better with + # the scaled initialization + nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) + nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) + nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) + else: + nn.init.xavier_uniform_(self.k_proj.weight) + nn.init.xavier_uniform_(self.v_proj.weight) + nn.init.xavier_uniform_(self.q_proj.weight) + + nn.init.xavier_uniform_(self.out_proj.weight) + if self.out_proj.bias is not None: + 
nn.init.constant_(self.out_proj.bias, 0.0) + if self.bias_k is not None: + nn.init.xavier_normal_(self.bias_k) + if self.bias_v is not None: + nn.init.xavier_normal_(self.bias_v) + + def forward( + self, + query, + key: Optional[Tensor], + value: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + need_weights: bool = True, + static_kv: bool = False, + attn_mask: Optional[Tensor] = None, + before_softmax: bool = False, + need_head_weights: bool = False, + ) -> Tuple[Tensor, Optional[Tensor]]: + """Input shape: Time x Batch x Channel + + Args: + key_padding_mask (ByteTensor, optional): mask to exclude + keys that are pads, of shape `(batch, src_len)`, where + padding elements are indicated by 1s. + need_weights (bool, optional): return the attention weights, + averaged over heads (default: False). + attn_mask (ByteTensor, optional): typically used to + implement causal attention, where the mask prevents the + attention from looking forward in time (default: None). + before_softmax (bool, optional): return the raw attention + weights and values before the attention softmax. + need_head_weights (bool, optional): return the attention + weights for each head. Implies *need_weights*. Default: + return the average attention weights over all heads. + """ + if need_head_weights: + need_weights = True + + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == self.embed_dim + assert list(query.size()) == [tgt_len, bsz, embed_dim] + + if ( + not self.onnx_trace + and not self.tpu # don't use PyTorch version on TPUs + and incremental_state is None + and not static_kv + # A workaround for quantization to work. Otherwise JIT compilation + # treats bias in linear module as method. 
+ and not torch.jit.is_scripting() + ): + assert key is not None and value is not None + return F.multi_head_attention_forward( + query, + key, + value, + self.embed_dim, + self.num_heads, + torch.empty([0]), + torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), + self.bias_k, + self.bias_v, + self.add_zero_attn, + self.dropout_module.p, + self.out_proj.weight, + self.out_proj.bias, + self.training or self.dropout_module.apply_during_inference, + key_padding_mask, + need_weights, + attn_mask, + use_separate_proj_weight=True, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + ) + + if incremental_state is not None: + saved_state = self._get_input_buffer(incremental_state) + if saved_state is not None and "prev_key" in saved_state: + # previous time steps are cached - no need to recompute + # key and value if they are static + if static_kv: + assert self.encoder_decoder_attention and not self.self_attention + key = value = None + else: + saved_state = None + + if self.self_attention: + q = self.q_proj(query) + k = self.k_proj(query) + v = self.v_proj(query) + elif self.encoder_decoder_attention: + # encoder-decoder attention + q = self.q_proj(query) + if key is None: + assert value is None + k = v = None + else: + k = self.k_proj(key) + v = self.v_proj(key) + + else: + assert key is not None and value is not None + q = self.q_proj(query) + k = self.k_proj(key) + v = self.v_proj(value) + q *= self.scaling + + if self.bias_k is not None: + assert self.bias_v is not None + k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = torch.cat( + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 + ) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [ + key_padding_mask, + key_padding_mask.new_zeros(key_padding_mask.size(0), 1), + ], + dim=1, + ) + + q = ( + q.contiguous() + .view(tgt_len, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + if k is not None: + k = ( + k.contiguous() + .view(-1, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + if v is not None: + v = ( + v.contiguous() + .view(-1, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + + if saved_state is not None: + # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) + if "prev_key" in saved_state: + _prev_key = saved_state["prev_key"] + assert _prev_key is not None + prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim) + if static_kv: + k = prev_key + else: + assert k is not None + k = torch.cat([prev_key, k], dim=1) + if "prev_value" in saved_state: + _prev_value = saved_state["prev_value"] + assert _prev_value is not None + prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim) + if static_kv: + v = prev_value + else: + assert v is not None + v = torch.cat([prev_value, v], dim=1) + prev_key_padding_mask: Optional[Tensor] = None + if "prev_key_padding_mask" in saved_state: + prev_key_padding_mask = saved_state["prev_key_padding_mask"] + assert k is not None and v is not None + key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( + key_padding_mask=key_padding_mask, + prev_key_padding_mask=prev_key_padding_mask, + batch_size=bsz, + src_len=k.size(1), + static_kv=static_kv, + ) + + saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim) + saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim) + 
saved_state["prev_key_padding_mask"] = key_padding_mask + # In this branch incremental_state is never None + assert incremental_state is not None + incremental_state = self._set_input_buffer(incremental_state, saved_state) + assert k is not None + src_len = k.size(1) + + # This is part of a workaround to get around fork/join parallelism + # not supporting Optional types. + if key_padding_mask is not None and key_padding_mask.dim() == 0: + key_padding_mask = None + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if self.add_zero_attn: + assert v is not None + src_len += 1 + k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1) + v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1) + if attn_mask is not None: + attn_mask = torch.cat( + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 + ) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [ + key_padding_mask, + torch.zeros(key_padding_mask.size(0), 1).type_as( + key_padding_mask + ), + ], + dim=1, + ) + + attn_weights = torch.bmm(q, k.transpose(1, 2)) + attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) + + assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(0) + if self.onnx_trace: + attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1) + attn_weights += attn_mask + + if key_padding_mask is not None: + # don't attend to padding symbols + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + if not self.tpu: + attn_weights = attn_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), + float("-inf"), + ) + else: + attn_weights = attn_weights.transpose(0, 2) + attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf")) + attn_weights = attn_weights.transpose(0, 2) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if before_softmax: + return attn_weights, v + + attn_weights_float = utils.softmax( + attn_weights, dim=-1, onnx_trace=self.onnx_trace + ) + attn_weights = attn_weights_float.type_as(attn_weights) + attn_probs = self.dropout_module(attn_weights) + + assert v is not None + attn = torch.bmm(attn_probs, v) + assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] + if self.onnx_trace and attn.size(1) == 1: + # when ONNX tracing a single decoder step (sequence length == 1) + # the transpose is a no-op copy before view, thus unnecessary + attn = attn.contiguous().view(tgt_len, bsz, embed_dim) + else: + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) + attn = self.out_proj(attn) + attn_weights: Optional[Tensor] = None + if need_weights: + attn_weights = attn_weights_float.view( + bsz, self.num_heads, tgt_len, src_len + ).transpose(1, 0) + if not need_head_weights: + # average attention weights over heads + attn_weights = attn_weights.mean(dim=0) + + return attn, attn_weights + + @staticmethod + def _append_prev_key_padding_mask( + key_padding_mask: Optional[Tensor], + prev_key_padding_mask: Optional[Tensor], + batch_size: int, + src_len: int, + static_kv: bool, + ) -> Optional[Tensor]: + # saved key padding masks have shape (bsz, seq_len) + if prev_key_padding_mask is not None and static_kv: + new_key_padding_mask = prev_key_padding_mask + elif prev_key_padding_mask is not None and key_padding_mask is not None: + new_key_padding_mask = torch.cat( + 
[prev_key_padding_mask.float(), key_padding_mask.float()], dim=1 + ) + # During incremental decoding, as the padding token enters and + # leaves the frame, there will be a time when prev or current + # is None + elif prev_key_padding_mask is not None: + filler = torch.zeros( + (batch_size, src_len - prev_key_padding_mask.size(1)), + device=prev_key_padding_mask.device, + ) + new_key_padding_mask = torch.cat( + [prev_key_padding_mask.float(), filler.float()], dim=1 + ) + elif key_padding_mask is not None: + filler = torch.zeros( + (batch_size, src_len - key_padding_mask.size(1)), + device=key_padding_mask.device, + ) + new_key_padding_mask = torch.cat( + [filler.float(), key_padding_mask.float()], dim=1 + ) + else: + new_key_padding_mask = prev_key_padding_mask + return new_key_padding_mask + + @torch.jit.export + def reorder_incremental_state( + self, + incremental_state: Dict[str, Dict[str, Optional[Tensor]]], + new_order: Tensor, + ): + """Reorder buffered internal state (for incremental generation).""" + input_buffer = self._get_input_buffer(incremental_state) + if input_buffer is not None: + for k in input_buffer.keys(): + input_buffer_k = input_buffer[k] + if input_buffer_k is not None: + if self.encoder_decoder_attention and input_buffer_k.size( + 0 + ) == new_order.size(0): + break + input_buffer[k] = input_buffer_k.index_select(0, new_order) + incremental_state = self._set_input_buffer(incremental_state, input_buffer) + return incremental_state + + def _get_input_buffer( + self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] + ) -> Dict[str, Optional[Tensor]]: + result = self.get_incremental_state(incremental_state, "attn_state") + if result is not None: + return result + else: + empty_result: Dict[str, Optional[Tensor]] = {} + return empty_result + + def _set_input_buffer( + self, + incremental_state: Dict[str, Dict[str, Optional[Tensor]]], + buffer: Dict[str, Optional[Tensor]], + ): + return self.set_incremental_state(incremental_state, "attn_state", buffer) + + def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int): + return attn_weights + + def upgrade_state_dict_named(self, state_dict, name): + prefix = name + "." if name != "" else "" + items_to_add = {} + keys_to_remove = [] + for k in state_dict.keys(): + if k.endswith(prefix + "in_proj_weight"): + # in_proj_weight used to be q + k + v with same dimensions + dim = int(state_dict[k].shape[0] / 3) + items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim] + items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim] + items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :] + + keys_to_remove.append(k) + + k_bias = prefix + "in_proj_bias" + if k_bias in state_dict.keys(): + dim = int(state_dict[k].shape[0] / 3) + items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim] + items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][ + dim : 2 * dim + ] + items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :] + + keys_to_remove.append(prefix + "in_proj_bias") + + for k in keys_to_remove: + del state_dict[k] + + for key, value in items_to_add.items(): + state_dict[key] = value diff --git a/fairseq-0.10.2/fairseq/modules/positional_embedding.py b/fairseq-0.10.2/fairseq/modules/positional_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..8e94e35edb46bf9dea911fe74577d8ecbe9b5ff1 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/positional_embedding.py @@ -0,0 +1,35 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch.nn as nn + +from .learned_positional_embedding import LearnedPositionalEmbedding +from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding + + +def PositionalEmbedding( + num_embeddings: int, + embedding_dim: int, + padding_idx: int, + learned: bool = False, +): + if learned: + # if padding_idx is specified then offset the embedding ids by + # this index and adjust num_embeddings appropriately + # TODO: The right place for this offset would be inside + # LearnedPositionalEmbedding. Move this there for a cleaner implementation. + if padding_idx is not None: + num_embeddings = num_embeddings + padding_idx + 1 + m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx) + nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) + if padding_idx is not None: + nn.init.constant_(m.weight[padding_idx], 0) + else: + m = SinusoidalPositionalEmbedding( + embedding_dim, + padding_idx, + init_size=num_embeddings + padding_idx + 1, + ) + return m diff --git a/fairseq-0.10.2/fairseq/modules/quant_noise.py b/fairseq-0.10.2/fairseq/modules/quant_noise.py new file mode 100644 index 0000000000000000000000000000000000000000..d777dfbb6c1bf6a9b769dfdaec35d5ef084c8a8b --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/quant_noise.py @@ -0,0 +1,107 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + + +def quant_noise(module, p, block_size): + """ + Wraps modules and applies quantization noise to the weights for + subsequent quantization with Iterative Product Quantization as + described in "Training with Quantization Noise for Extreme Model Compression" + + Args: + - module: nn.Module + - p: amount of Quantization Noise + - block_size: size of the blocks for subsequent quantization with iPQ + + Remarks: + - Module weights must have the right sizes wrt the block size + - Only Linear, Embedding and Conv2d modules are supported for the moment + - For more detail on how to quantize by blocks with convolutional weights, + see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks" + - We implement the simplest form of noise here as stated in the paper + which consists in randomly dropping blocks + """ + + # if no quantization noise, don't register hook + if p <= 0: + return module + + # supported modules + assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)) + + # test whether module.weight has the right sizes wrt block_size + is_conv = module.weight.ndim == 4 + + # 2D matrix + if not is_conv: + assert ( + module.weight.size(1) % block_size == 0 + ), "Input features must be a multiple of block sizes" + + # 4D matrix + else: + # 1x1 convolutions + if module.kernel_size == (1, 1): + assert ( + module.in_channels % block_size == 0 + ), "Input channels must be a multiple of block sizes" + # regular convolutions + else: + k = module.kernel_size[0] * module.kernel_size[1] + assert k % block_size == 0, "Kernel size must be a multiple of block size" + + def _forward_pre_hook(mod, input): + # no noise for evaluation + if mod.training: + if not is_conv: + # gather weight and sizes + weight = mod.weight + in_features = weight.size(1) + out_features = weight.size(0) + + # split weight matrix into blocks and randomly drop selected 
blocks + mask = torch.zeros( + in_features // block_size * out_features, device=weight.device + ) + mask.bernoulli_(p) + mask = mask.repeat_interleave(block_size, -1).view(-1, in_features) + + else: + # gather weight and sizes + weight = mod.weight + in_channels = mod.in_channels + out_channels = mod.out_channels + + # split weight matrix into blocks and randomly drop selected blocks + if mod.kernel_size == (1, 1): + mask = torch.zeros( + int(in_channels // block_size * out_channels), + device=weight.device, + ) + mask.bernoulli_(p) + mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels) + else: + mask = torch.zeros( + weight.size(0), weight.size(1), device=weight.device + ) + mask.bernoulli_(p) + mask = ( + mask.unsqueeze(2) + .unsqueeze(3) + .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1]) + ) + + # scale weights and apply mask + mask = mask.to( + torch.bool + ) # x.bool() is not currently supported in TorchScript + s = 1 / (1 - p) + mod.weight.data = s * weight.masked_fill(mask, 0) + + module.register_forward_pre_hook(_forward_pre_hook) + return module diff --git a/fairseq-0.10.2/fairseq/modules/quantization/__pycache__/quantization_options.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/quantization/__pycache__/quantization_options.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b71ae1a1bb318efca62b0bf43d185a4130e82628 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/quantization/__pycache__/quantization_options.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/quantization/pq/__pycache__/__init__.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/quantization/pq/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..709c9cbb5e4bb3b1392a590d2b9d111273f9fc7a Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/quantization/pq/__pycache__/__init__.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/quantization/pq/__pycache__/pq.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/quantization/pq/__pycache__/pq.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62c66a78669e5a9f86dd79d16dad281b1543128a Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/quantization/pq/__pycache__/pq.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/quantization/pq/__pycache__/utils.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/quantization/pq/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2501a7ca056111f351a6252d96843156d8d3c285 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/quantization/pq/__pycache__/utils.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__init__.py b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b67c8e8ad691aa01e9e10e904d69d94595387668 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
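+
+# The classes exposed here are drop-in quantized counterparts of nn.Conv2d,
+# nn.Embedding and nn.Linear (see qconv.py, qemb.py and qlinear.py).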
+
+from .qconv import PQConv2d  # NOQA
+from .qemb import PQEmbedding  # NOQA
+from .qlinear import PQLinear  # NOQA
diff --git a/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__pycache__/__init__.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..448029b773ed1d3b2840b72f05ba40ac1c171f6a
Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__pycache__/__init__.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__pycache__/qconv.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__pycache__/qconv.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cd6f2f6431fa2a95682eab44140daf6a356d8dd
Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__pycache__/qconv.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__pycache__/qemb.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__pycache__/qemb.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07cb7dcbb23a7514ff39ee3a6257afe1b99727a8
Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/__pycache__/qemb.cpython-310.pyc differ
diff --git a/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/qemb.py b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/qemb.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a74ad3c4c7c9d3203d26e7885864ba578951bfe
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/qemb.py
@@ -0,0 +1,107 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class PQEmbedding(nn.Module):
+    """
+    Quantized counterpart of nn.Embedding module. Stores the centroids and
+    the assignments. The full weight is re-instantiated at each forward
+    pass.
+
+    Args:
+        - centroids: centroids of size n_centroids x block_size
+        - assignments: assignments of the centroids to the subvectors
+          of size self.num_embeddings x n_blocks
+
+    Remarks:
+        - We refer the reader to the official documentation of the nn.Embedding module
+          for the other arguments and the behavior of the module
+        - Performance tests on GPU show that this implementation is 10% slower than
+          the non-quantized nn.Embedding module for a standard training loop.
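+        - The snippet below is an illustrative sketch with toy sizes (chosen
+          only to satisfy the compatibility checks in __init__, not taken
+          from fairseq):
+
+              centroids = torch.randn(8, 4)      # n_centroids=8, block_size=4
+              n_blocks = 16 // 4                 # embedding_dim // block_size
+              assignments = torch.randint(8, (10 * n_blocks,))
+              emb = PQEmbedding(centroids, assignments,
+                                num_embeddings=10, embedding_dim=16)
+              emb.weight.shape                   # torch.Size([10, 16])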
+ """ + + def __init__( + self, + centroids, + assignments, + num_embeddings, + embedding_dim, + padding_idx=None, + max_norm=None, + norm_type=2.0, + scale_grad_by_freq=False, + sparse=False, + _weight=None, + ): + super(PQEmbedding, self).__init__() + self.block_size = centroids.size(1) + self.n_centroids = centroids.size(0) + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + if padding_idx is not None: + if padding_idx > 0: + assert ( + padding_idx < self.num_embeddings + ), "Padding_idx must be within num_embeddings" + elif padding_idx < 0: + assert ( + padding_idx >= -self.num_embeddings + ), "Padding_idx must be within num_embeddings" + padding_idx = self.num_embeddings + padding_idx + self.padding_idx = padding_idx + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + self.sparse = sparse + # check compatibility + if self.embedding_dim % self.block_size != 0: + raise ValueError("Wrong PQ sizes") + if len(assignments) % self.num_embeddings != 0: + raise ValueError("Wrong PQ sizes") + # define parameters + self.centroids = nn.Parameter(centroids, requires_grad=True) + self.register_buffer("assignments", assignments) + self.register_buffer("counts", torch.bincount(assignments).type_as(centroids)) + + @property + def weight(self): + return ( + self.centroids[self.assignments] + .reshape(-1, self.num_embeddings, self.block_size) + .permute(1, 0, 2) + .flatten(1, 2) + ) + + def forward(self, input): + return F.embedding( + input, + self.weight, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) + + def extra_repr(self): + s = "{num_embeddings}, {embedding_dim}" + if self.padding_idx is not None: + s += ", padding_idx={padding_idx}" + if self.max_norm is not None: + s += ", max_norm={max_norm}" + if self.norm_type != 2: + s += ", norm_type={norm_type}" + if self.scale_grad_by_freq is not False: + s += ", scale_grad_by_freq={scale_grad_by_freq}" + if self.sparse is not False: + s += ", sparse=True" + s += ", n_centroids={n_centroids}, block_size={block_size}" + + return s.format(**self.__dict__) diff --git a/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/qlinear.py b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/qlinear.py new file mode 100644 index 0000000000000000000000000000000000000000..9bdd25a8685bb7c7b32e1f02372aaeb26d8ba53a --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/quantization/pq/modules/qlinear.py @@ -0,0 +1,71 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class PQLinear(nn.Module): + """ + Quantized counterpart of nn.Linear module. Stores the centroid, the assignments + and the non-quantized biases. The full weight is re-instantiated at each forward + pass. + + Args: + - centroids: centroids of size n_centroids x block_size + - assignments: assignments of the centroids to the subvectors + of size self.out_features x n_blocks + - bias: the non-quantized bias + + Remarks: + - We refer the reader to the official documentation of the nn.Linear module + for the other arguments and the behavior of the module + - Performance tests on GPU show that this implementation is 15% slower than + the non-quantized nn.Linear module for a standard training loop. 
+ """ + + def __init__(self, centroids, assignments, bias, in_features, out_features): + super(PQLinear, self).__init__() + self.block_size = centroids.size(1) + self.n_centroids = centroids.size(0) + self.in_features = in_features + self.out_features = out_features + # check compatibility + if self.in_features % self.block_size != 0: + raise ValueError("Wrong PQ sizes") + if len(assignments) % self.out_features != 0: + raise ValueError("Wrong PQ sizes") + # define parameters + self.centroids = nn.Parameter(centroids, requires_grad=True) + self.register_buffer("assignments", assignments) + self.register_buffer("counts", torch.bincount(assignments).type_as(centroids)) + if bias is not None: + self.bias = nn.Parameter(bias) + else: + self.register_parameter("bias", None) + + @property + def weight(self): + return ( + self.centroids[self.assignments] + .reshape(-1, self.out_features, self.block_size) + .permute(1, 0, 2) + .flatten(1, 2) + ) + + def forward(self, x): + return F.linear( + x, + self.weight, + self.bias, + ) + + def extra_repr(self): + return f"in_features={self.in_features},\ + out_features={self.out_features},\ + n_centroids={self.n_centroids},\ + block_size={self.block_size},\ + bias={self.bias is not None}" diff --git a/fairseq-0.10.2/fairseq/modules/quantization/pq/pq.py b/fairseq-0.10.2/fairseq/modules/quantization/pq/pq.py new file mode 100644 index 0000000000000000000000000000000000000000..eddc2eb34602403f10979f54cd23a45bc2f104d5 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/quantization/pq/pq.py @@ -0,0 +1,128 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from .em import EM, EmptyClusterResolveError + + +class PQ(EM): + """ + Quantizes the layer weights W with the standard Product Quantization + technique. This learns a codebook of codewords or centroids of size + block_size from W. For further reference on using PQ to quantize + neural networks, see "And the Bit Goes Down: Revisiting the Quantization + of Neural Networks", Stock et al., ICLR 2020. + + PQ is performed in two steps: + (1) The matrix W (weights or fully-connected or convolutional layer) + is reshaped to (block_size, -1). + - If W is fully-connected (2D), its columns are split into + blocks of size block_size. + - If W is convolutional (4D), its filters are split along the + spatial dimension. + (2) We apply the standard EM/k-means algorithm to the resulting reshaped matrix. + + Args: + - W: weight matrix to quantize of size (in_features x out_features) + - block_size: size of the blocks (subvectors) + - n_centroids: number of centroids + - n_iter: number of k-means iterations + - eps: for cluster reassignment when an empty cluster is found + - max_tentatives for cluster reassignment when an empty cluster is found + - verbose: print information after each iteration + + Remarks: + - block_size be compatible with the shape of W + """ + + def __init__( + self, + W, + block_size, + n_centroids=256, + n_iter=20, + eps=1e-6, + max_tentatives=30, + verbose=True, + ): + self.block_size = block_size + W_reshaped = self._reshape(W) + super(PQ, self).__init__( + W_reshaped, + n_centroids=n_centroids, + n_iter=n_iter, + eps=eps, + max_tentatives=max_tentatives, + verbose=verbose, + ) + + def _reshape(self, W): + """ + Reshapes the matrix W as expained in step (1). 
+ """ + + # fully connected: by convention the weight has size out_features x in_features + if len(W.size()) == 2: + self.out_features, self.in_features = W.size() + assert ( + self.in_features % self.block_size == 0 + ), "Linear: n_blocks must be a multiple of in_features" + return ( + W.reshape(self.out_features, -1, self.block_size) + .permute(2, 1, 0) + .flatten(1, 2) + ) + + # convolutional: we reshape along the spatial dimension + elif len(W.size()) == 4: + self.out_channels, self.in_channels, self.k_h, self.k_w = W.size() + assert ( + self.in_channels * self.k_h * self.k_w + ) % self.block_size == 0, ( + "Conv2d: n_blocks must be a multiple of in_channels * k_h * k_w" + ) + return ( + W.reshape(self.out_channels, -1, self.block_size) + .permute(2, 1, 0) + .flatten(1, 2) + ) + # not implemented + else: + raise NotImplementedError(W.size()) + + def encode(self): + """ + Performs self.n_iter EM steps. + """ + + self.initialize_centroids() + for i in range(self.n_iter): + try: + self.step(i) + except EmptyClusterResolveError: + break + + def decode(self): + """ + Returns the encoded full weight matrix. Must be called after + the encode function. + """ + + # fully connected case + if "k_h" not in self.__dict__: + return ( + self.centroids[self.assignments] + .reshape(-1, self.out_features, self.block_size) + .permute(1, 0, 2) + .flatten(1, 2) + ) + + # convolutional case + else: + return ( + self.centroids[self.assignments] + .reshape(-1, self.out_channels, self.block_size) + .permute(1, 0, 2) + .reshape(self.out_channels, self.in_channels, self.k_h, self.k_w) + ) diff --git a/fairseq-0.10.2/fairseq/modules/quantization/quantization_options.py b/fairseq-0.10.2/fairseq/modules/quantization/quantization_options.py new file mode 100644 index 0000000000000000000000000000000000000000..b46d682c0edaeaaf2a230e51d50da2a32d4bda98 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/quantization/quantization_options.py @@ -0,0 +1,44 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +def parse_config_yaml(yaml_data): + # Initialize to default options. 
+ quantization_options = { + "n_centroids": { + "Linear": ["in_features", {"*": 256}], + "Embedding": ["embedding_dim", {"*": 256}], + }, + "block_sizes": { + "Linear": ["fuzzy_name", {"fc": 8, "attn": 4, "emb": 4}], + "Embedding": ["fuzzy_name", {"emb": 8}], + }, + "layers_to_quantize": [ + "decoder\\.layers\\.\\d+\\.fc[12]", + "decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01]", + "decoder\\.layers\\.\\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj)", + ], + } + + if "n_centroids" in yaml_data: + quantization_options["n_centroids"] = { + layer: convert_yaml_to_tuple(layer_data) + for layer, layer_data in yaml_data["n_centroids"].items() + } + if "block_sizes" in yaml_data: + quantization_options["block_sizes"] = { + layer: convert_yaml_to_tuple(layer_data) + for layer, layer_data in yaml_data["block_sizes"].items() + } + if "layers_to_quantize" in yaml_data: + quantization_options["layers_to_quantize"] = yaml_data["layers_to_quantize"] + + return quantization_options + + +def convert_yaml_to_tuple(yaml_dictionary): + """Converts a yaml dictionary with two keys: `key` and `value` into a two + argument tuple of those values.""" + return (yaml_dictionary["key"], yaml_dictionary["value"]) diff --git a/fairseq-0.10.2/fairseq/modules/quantization/scalar/__pycache__/ops.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/quantization/scalar/__pycache__/ops.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6eed23bd591e28dd1b8479ab96de4b05a80ed5cc Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/quantization/scalar/__pycache__/ops.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/quantization/scalar/modules/__pycache__/qemb.cpython-310.pyc b/fairseq-0.10.2/fairseq/modules/quantization/scalar/modules/__pycache__/qemb.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45903b5aad708ddf70fbad93e03fded1bf695964 Binary files /dev/null and b/fairseq-0.10.2/fairseq/modules/quantization/scalar/modules/__pycache__/qemb.cpython-310.pyc differ diff --git a/fairseq-0.10.2/fairseq/modules/quantization/scalar/modules/qemb.py b/fairseq-0.10.2/fairseq/modules/quantization/scalar/modules/qemb.py new file mode 100644 index 0000000000000000000000000000000000000000..d6cf06e5872cb86e5c2e726153c7a80c78db9d1e --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/quantization/scalar/modules/qemb.py @@ -0,0 +1,147 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..ops import emulate_int + + +class IntEmbedding(nn.Module): + """ + Quantized counterpart of the nn.Embedding module that applies QuantNoise during training. 
+ + Args: + - num_embeddings: number of tokens + - embedding_dim: embedding dimension + - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights) + - bits: number of bits + - method: choose among {"tensor", "histogram", "channel"} + - update_step: recompute scale and zero_point every update_steps iterations + + Remarks: + - We use the straight-through estimator so that the gradients + back-propagate nicely in the network, this is implemented with + the detach() trick + - Parameters scale and zero_point are recomputed every update_step + forward pass to reduce the overhead + - At test time, the weights are fully quantized + """ + + def __init__( + self, + num_embeddings, + embedding_dim, + padding_idx=None, + max_norm=None, + norm_type=2.0, + scale_grad_by_freq=False, + sparse=False, + _weight=None, + p=0, + update_step=1000, + bits=8, + method="histogram", + ): + super(IntEmbedding, self).__init__() + self.num_embeddings = num_embeddings + self.embedding_dim = embedding_dim + if padding_idx is not None: + if padding_idx > 0: + assert ( + padding_idx < self.num_embeddings + ), "Padding_idx must be within num_embeddings" + elif padding_idx < 0: + assert ( + padding_idx >= -self.num_embeddings + ), "Padding_idx must be within num_embeddings" + padding_idx = self.num_embeddings + padding_idx + self.padding_idx = padding_idx + self.max_norm = max_norm + self.norm_type = norm_type + self.scale_grad_by_freq = scale_grad_by_freq + if _weight is None: + self.weight = nn.Parameter(torch.Tensor(num_embeddings, embedding_dim)) + self.reset_parameters() + else: + assert list(_weight.shape) == [ + num_embeddings, + embedding_dim, + ], "Shape of weight does not match num_embeddings and embedding_dim" + self.weight = nn.Parameter(_weight) + self.sparse = sparse + + # quantization parameters + self.p = p + self.bits = bits + self.method = method + self.update_step = update_step + self.counter = 0 + + def reset_parameters(self): + nn.init.normal_(self.weight) + if self.padding_idx is not None: + with torch.no_grad(): + self.weight[self.padding_idx].fill_(0) + + def forward(self, input): + # train with QuantNoise and evaluate the fully quantized network + p = self.p if self.training else 1 + + # update parameters every 1000 iterations + if self.counter % self.update_step == 0: + self.scale = None + self.zero_point = None + self.counter += 1 + + # quantize weight + weight_quantized, self.scale, self.zero_point = emulate_int( + self.weight.detach(), + bits=self.bits, + method=self.method, + scale=self.scale, + zero_point=self.zero_point, + ) + + # mask to apply noise + mask = torch.zeros_like(self.weight) + mask.bernoulli_(1 - p) + noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0) + + # using straight-through estimator (STE) + clamp_low = -self.scale * self.zero_point + clamp_high = self.scale * (2 ** self.bits - 1 - self.zero_point) + weight = ( + torch.clamp(self.weight, clamp_low.item(), clamp_high.item()) + + noise.detach() + ) + + # return output + output = F.embedding( + input, + weight, + self.padding_idx, + self.max_norm, + self.norm_type, + self.scale_grad_by_freq, + self.sparse, + ) + return output + + def extra_repr(self): + s = "{num_embeddings}, {embedding_dim}" + if self.padding_idx is not None: + s += ", padding_idx={padding_idx}" + if self.max_norm is not None: + s += ", max_norm={max_norm}" + if self.norm_type != 2: + s += ", norm_type={norm_type}" + if self.scale_grad_by_freq is not False: + s += ", scale_grad_by_freq={scale_grad_by_freq}" + if 
self.sparse is not False: + s += ", sparse=True" + s += "quant_noise={p}, bits={bits}, method={method}" + return s.format(**self.__dict__) diff --git a/fairseq-0.10.2/fairseq/modules/sparse_transformer_sentence_encoder.py b/fairseq-0.10.2/fairseq/modules/sparse_transformer_sentence_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..f41ec09327fe80b50d20674e7482794ce45c531c --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/sparse_transformer_sentence_encoder.py @@ -0,0 +1,96 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch.nn as nn +from fairseq.modules import TransformerSentenceEncoder +from fairseq.modules.sparse_transformer_sentence_encoder_layer import ( + SparseTransformerSentenceEncoderLayer, +) + + +class SparseTransformerSentenceEncoder(TransformerSentenceEncoder): + """ + Sparse implementation of the TransformerSentenceEncoder + - see SparseMultiheadAttention + """ + + def __init__( + self, + padding_idx: int, + vocab_size: int, + num_encoder_layers: int = 6, + embedding_dim: int = 768, + ffn_embedding_dim: int = 3072, + num_attention_heads: int = 8, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + max_seq_len: int = 256, + num_segments: int = 2, + use_position_embeddings: bool = True, + offset_positions_by_padding: bool = True, + encoder_normalize_before: bool = False, + apply_bert_init: bool = False, + activation_fn: str = "relu", + learned_pos_embedding: bool = True, + embed_scale: float = None, + freeze_embeddings: bool = False, + n_trans_layers_to_freeze: int = 0, + export: bool = False, + is_bidirectional: bool = True, + stride: int = 32, + expressivity: int = 8, + ) -> None: + + super().__init__( + padding_idx, + vocab_size, + num_encoder_layers, + embedding_dim, + ffn_embedding_dim, + num_attention_heads, + dropout, + attention_dropout, + activation_dropout, + max_seq_len, + num_segments, + use_position_embeddings, + offset_positions_by_padding, + encoder_normalize_before, + apply_bert_init, + activation_fn, + learned_pos_embedding, + embed_scale, + freeze_embeddings, + n_trans_layers_to_freeze, + export, + ) + + self.layers = nn.ModuleList( + [ + SparseTransformerSentenceEncoderLayer( + embedding_dim=self.embedding_dim, + ffn_embedding_dim=ffn_embedding_dim, + num_attention_heads=num_attention_heads, + dropout=dropout, + attention_dropout=attention_dropout, + activation_dropout=activation_dropout, + activation_fn=activation_fn, + export=export, + is_bidirectional=is_bidirectional, + stride=stride, + expressivity=expressivity, + ) + for _ in range(num_encoder_layers) + ] + ) + + def freeze_module_params(m): + if m is not None: + for p in m.parameters(): + p.requires_grad = False + + for layer in range(n_trans_layers_to_freeze): + freeze_module_params(self.layers[layer]) diff --git a/fairseq-0.10.2/fairseq/modules/transformer_sentence_encoder.py b/fairseq-0.10.2/fairseq/modules/transformer_sentence_encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..208488f562b64e79360fd4deb9c627b6ff1e53ba --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/transformer_sentence_encoder.py @@ -0,0 +1,286 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+from fairseq.modules import (
+    FairseqDropout,
+    LayerDropModuleList,
+    LayerNorm,
+    MultiheadAttention,
+    PositionalEmbedding,
+    TransformerSentenceEncoderLayer,
+)
+from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
+
+
+def init_bert_params(module):
+    """
+    Initialize the weights specific to the BERT Model.
+    This overrides the default initializations depending on the specified arguments.
+    1. If normal_init_linear_weights is set then weights of linear
+       layer will be initialized using the normal distribution and
+       bias will be set to the specified value.
+    2. If normal_init_embed_weights is set then weights of embedding
+       layer will be initialized using the normal distribution.
+    3. If normal_init_proj_weights is set then weights of
+       in_proj_weight for MultiheadAttention are initialized using
+       the normal distribution (to be validated).
+    """
+
+    if isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=0.02)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    if isinstance(module, nn.Embedding):
+        module.weight.data.normal_(mean=0.0, std=0.02)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    if isinstance(module, MultiheadAttention):
+        module.q_proj.weight.data.normal_(mean=0.0, std=0.02)
+        module.k_proj.weight.data.normal_(mean=0.0, std=0.02)
+        module.v_proj.weight.data.normal_(mean=0.0, std=0.02)
+
+
+class TransformerSentenceEncoder(nn.Module):
+    """
+    Implementation for a Bi-directional Transformer based Sentence Encoder used
+    in BERT/XLM style pre-trained models.
+
+    This first computes the token embedding using the token embedding matrix,
+    position embeddings (if specified) and segment embeddings
+    (if specified). After applying the specified number of
+    TransformerEncoderLayers, it outputs all the internal states of the
+    encoder as well as the final representation associated with the first
+    token (usually CLS token).
+
+    Input:
+        - tokens: B x T matrix representing sentences
+        - segment_labels: B x T matrix representing segment label for tokens
+
+    Output:
+        - a tuple of the following:
+            - a list of internal model states used to compute the
+              predictions where each tensor has shape T x B x C
+            - sentence representation associated with first input token
+              in format B x C.
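+
+    Usage sketch (illustrative only; sizes follow the default
+    hyper-parameters of this class):
+
+        encoder = TransformerSentenceEncoder(padding_idx=1, vocab_size=1000)
+        tokens = torch.randint(2, 1000, (2, 16))   # B x T, no padding
+        inner_states, sentence_rep = encoder(tokens)
+        inner_states[-1].shape                     # T x B x C = (16, 2, 768)
+        sentence_rep.shape                         # B x C = (2, 768)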
+ """ + + def __init__( + self, + padding_idx: int, + vocab_size: int, + num_encoder_layers: int = 6, + embedding_dim: int = 768, + ffn_embedding_dim: int = 3072, + num_attention_heads: int = 8, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + layerdrop: float = 0.0, + max_seq_len: int = 256, + num_segments: int = 2, + use_position_embeddings: bool = True, + offset_positions_by_padding: bool = True, + encoder_normalize_before: bool = False, + apply_bert_init: bool = False, + activation_fn: str = "relu", + learned_pos_embedding: bool = True, + embed_scale: float = None, + freeze_embeddings: bool = False, + n_trans_layers_to_freeze: int = 0, + export: bool = False, + traceable: bool = False, + q_noise: float = 0.0, + qn_block_size: int = 8, + ) -> None: + + super().__init__() + self.padding_idx = padding_idx + self.vocab_size = vocab_size + self.dropout_module = FairseqDropout( + dropout, module_name=self.__class__.__name__ + ) + self.layerdrop = layerdrop + self.max_seq_len = max_seq_len + self.embedding_dim = embedding_dim + self.num_segments = num_segments + self.use_position_embeddings = use_position_embeddings + self.apply_bert_init = apply_bert_init + self.learned_pos_embedding = learned_pos_embedding + self.traceable = traceable + self.tpu = False # whether we're on TPU + + self.embed_tokens = self.build_embedding( + self.vocab_size, self.embedding_dim, self.padding_idx + ) + self.embed_scale = embed_scale + + if q_noise > 0: + self.quant_noise = apply_quant_noise_( + nn.Linear(self.embedding_dim, self.embedding_dim, bias=False), + q_noise, + qn_block_size, + ) + else: + self.quant_noise = None + + self.segment_embeddings = ( + nn.Embedding(self.num_segments, self.embedding_dim, padding_idx=None) + if self.num_segments > 0 + else None + ) + + self.embed_positions = ( + PositionalEmbedding( + self.max_seq_len, + self.embedding_dim, + padding_idx=(self.padding_idx if offset_positions_by_padding else None), + learned=self.learned_pos_embedding, + ) + if self.use_position_embeddings + else None + ) + + if self.layerdrop > 0.0: + self.layers = LayerDropModuleList(p=self.layerdrop) + else: + self.layers = nn.ModuleList([]) + self.layers.extend( + [ + self.build_transformer_sentence_encoder_layer( + embedding_dim=self.embedding_dim, + ffn_embedding_dim=ffn_embedding_dim, + num_attention_heads=num_attention_heads, + dropout=self.dropout_module.p, + attention_dropout=attention_dropout, + activation_dropout=activation_dropout, + activation_fn=activation_fn, + export=export, + q_noise=q_noise, + qn_block_size=qn_block_size, + ) + for _ in range(num_encoder_layers) + ] + ) + + if encoder_normalize_before: + self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export) + else: + self.emb_layer_norm = None + + # Apply initialization of model params after building the model + if self.apply_bert_init: + self.apply(init_bert_params) + + def freeze_module_params(m): + if m is not None: + for p in m.parameters(): + p.requires_grad = False + + if freeze_embeddings: + freeze_module_params(self.embed_tokens) + freeze_module_params(self.segment_embeddings) + freeze_module_params(self.embed_positions) + freeze_module_params(self.emb_layer_norm) + + for layer in range(n_trans_layers_to_freeze): + freeze_module_params(self.layers[layer]) + + def build_embedding(self, vocab_size, embedding_dim, padding_idx): + return nn.Embedding(vocab_size, embedding_dim, padding_idx) + + def build_transformer_sentence_encoder_layer( + self, + embedding_dim, + 
ffn_embedding_dim, + num_attention_heads, + dropout, + attention_dropout, + activation_dropout, + activation_fn, + export, + q_noise, + qn_block_size, + ): + return TransformerSentenceEncoderLayer( + embedding_dim=embedding_dim, + ffn_embedding_dim=ffn_embedding_dim, + num_attention_heads=num_attention_heads, + dropout=dropout, + attention_dropout=attention_dropout, + activation_dropout=activation_dropout, + activation_fn=activation_fn, + export=export, + q_noise=q_noise, + qn_block_size=qn_block_size, + ) + + def prepare_for_tpu_(self, **kwargs): + self.tpu = True + + def forward( + self, + tokens: torch.Tensor, + segment_labels: torch.Tensor = None, + last_state_only: bool = False, + positions: Optional[torch.Tensor] = None, + token_embeddings: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + + # compute padding mask. This is needed for multi-head attention + padding_mask = tokens.eq(self.padding_idx) + if not self.traceable and not self.tpu and not padding_mask.any(): + padding_mask = None + + if token_embeddings is not None: + x = token_embeddings + else: + x = self.embed_tokens(tokens) + + if self.embed_scale is not None: + x = x * self.embed_scale + + if self.embed_positions is not None: + x = x + self.embed_positions(tokens, positions=positions) + + if self.segment_embeddings is not None and segment_labels is not None: + x = x + self.segment_embeddings(segment_labels) + + if self.quant_noise is not None: + x = self.quant_noise(x) + + if self.emb_layer_norm is not None: + x = self.emb_layer_norm(x) + + x = self.dropout_module(x) + + # account for padding while computing the representation + if padding_mask is not None: + x = x * (1 - padding_mask.unsqueeze(-1).type_as(x)) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + inner_states = [] + if not last_state_only: + inner_states.append(x) + + for layer in self.layers: + x, _ = layer(x, self_attn_padding_mask=padding_mask) + if not last_state_only: + inner_states.append(x) + + sentence_rep = x[0, :, :] + + if last_state_only: + inner_states = [x] + + if self.traceable: + return torch.stack(inner_states), sentence_rep + else: + return inner_states, sentence_rep diff --git a/fairseq-0.10.2/fairseq/modules/vggblock.py b/fairseq-0.10.2/fairseq/modules/vggblock.py new file mode 100644 index 0000000000000000000000000000000000000000..ee5ee19a34816c7350c21fba7c4907fec8ca7a61 --- /dev/null +++ b/fairseq-0.10.2/fairseq/modules/vggblock.py @@ -0,0 +1,116 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
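+
+# Usage sketch (illustrative toy sizes, not from the original file):
+#
+#   block = VGGBlock(in_channels=1, out_channels=32, conv_kernel_size=3,
+#                    pooling_kernel_size=2, num_conv_layers=2, input_dim=40)
+#   y = block(torch.randn(8, 1, 100, 40))   # B x C x T x feat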
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+from collections.abc import Iterable
+from itertools import repeat
+
+import torch
+import torch.nn as nn
+
+
+def _pair(v):
+    if isinstance(v, Iterable):
+        assert len(v) == 2, "len(v) != 2"
+        return v
+    return tuple(repeat(v, 2))
+
+
+def infer_conv_output_dim(conv_op, input_dim, sample_inchannel):
+    sample_seq_len = 200
+    sample_bsz = 10
+    x = torch.randn(sample_bsz, sample_inchannel, sample_seq_len, input_dim)
+    # N x C x H x W
+    # N: sample_bsz, C: sample_inchannel, H: sample_seq_len, W: input_dim
+    x = conv_op(x)
+    # N x C x H x W
+    x = x.transpose(1, 2)
+    # N x H x C x W
+    bsz, seq = x.size()[:2]
+    per_channel_dim = x.size()[3]
+    # bsz: N, seq: H, CxW the rest
+    return x.contiguous().view(bsz, seq, -1).size(-1), per_channel_dim
+
+
+class VGGBlock(torch.nn.Module):
+    """
+    VGG-motivated CNN module (https://arxiv.org/pdf/1409.1556.pdf)
+
+    Args:
+        in_channels: (int) number of input channels (typically 1)
+        out_channels: (int) number of output channels
+        conv_kernel_size: convolution kernel size
+        pooling_kernel_size: the size of the pooling window to take a max over
+        num_conv_layers: (int) number of convolution layers
+        input_dim: (int) input dimension
+        conv_stride: the stride of the convolving kernel.
+            Can be a single number or a tuple (sH, sW)  Default: 1
+        padding: implicit paddings on both sides of the input.
+            Can be a single number or a tuple (padH, padW). Default: None
+        layer_norm: (bool) if layer norm is going to be applied. Default: False
+
+    Shape:
+        Input: BxCxTxfeat, i.e. (batch_size, input_size, timesteps, features)
+        Output: BxCxTxfeat, i.e. (batch_size, input_size, timesteps, features)
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        conv_kernel_size,
+        pooling_kernel_size,
+        num_conv_layers,
+        input_dim,
+        conv_stride=1,
+        padding=None,
+        layer_norm=False,
+    ):
+        assert (
+            input_dim is not None
+        ), "Need input_dim for LayerNorm and infer_conv_output_dim"
+        super(VGGBlock, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.conv_kernel_size = _pair(conv_kernel_size)
+        self.pooling_kernel_size = _pair(pooling_kernel_size)
+        self.num_conv_layers = num_conv_layers
+        self.padding = (
+            tuple(e // 2 for e in self.conv_kernel_size)
+            if padding is None
+            else _pair(padding)
+        )
+        self.conv_stride = _pair(conv_stride)
+
+        self.layers = nn.ModuleList()
+        for layer in range(num_conv_layers):
+            conv_op = nn.Conv2d(
+                in_channels if layer == 0 else out_channels,
+                out_channels,
+                self.conv_kernel_size,
+                stride=self.conv_stride,
+                padding=self.padding,
+            )
+            self.layers.append(conv_op)
+            if layer_norm:
+                conv_output_dim, per_channel_dim = infer_conv_output_dim(
+                    conv_op, input_dim, in_channels if layer == 0 else out_channels
+                )
+                self.layers.append(nn.LayerNorm(per_channel_dim))
+                input_dim = per_channel_dim
+            self.layers.append(nn.ReLU())
+
+        if self.pooling_kernel_size is not None:
+            pool_op = nn.MaxPool2d(kernel_size=self.pooling_kernel_size, ceil_mode=True)
+            self.layers.append(pool_op)
+            self.total_output_dim, self.output_dim = infer_conv_output_dim(
+                pool_op, input_dim, out_channels
+            )
+
+    def forward(self, x):
+        for i, _ in enumerate(self.layers):
+            x = self.layers[i](x)
+        return x
diff --git a/fairseq-0.10.2/fairseq/nan_detector.py b/fairseq-0.10.2/fairseq/nan_detector.py
new file mode 100644
index 0000000000000000000000000000000000000000..faa8031d4666c9ba9837919fe1c884dacf47ac3a
--- /dev/null
+++
b/fairseq-0.10.2/fairseq/nan_detector.py @@ -0,0 +1,108 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch + + +logger = logging.getLogger(__name__) + + +class NanDetector: + """ + Detects the first NaN or Inf in forward and/or backward pass and logs, together with the module name + """ + + def __init__(self, model, forward=True, backward=True): + self.bhooks = [] + self.fhooks = [] + self.forward = forward + self.backward = backward + self.named_parameters = list(model.named_parameters()) + self.reset() + + for name, mod in model.named_modules(): + mod.__module_name = name + self.add_hooks(mod) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + # Dump out all model gnorms to enable better debugging + norm = {} + gradients = {} + for name, param in self.named_parameters: + if param.grad is not None: + grad_norm = torch.norm(param.grad.data, p=2, dtype=torch.float32) + norm[name] = grad_norm.item() + if torch.isnan(grad_norm).any() or torch.isinf(grad_norm).any(): + gradients[name] = param.grad.data + if len(gradients) > 0: + logger.info("Detected nan/inf grad norm, dumping norms...") + logger.info(f"norms: {norm}") + logger.info(f"gradients: {gradients}") + + self.close() + + def add_hooks(self, module): + if self.forward: + self.fhooks.append(module.register_forward_hook(self.fhook_fn)) + if self.backward: + self.bhooks.append(module.register_backward_hook(self.bhook_fn)) + + def reset(self): + self.has_printed_f = False + self.has_printed_b = False + + def _detect(self, tensor, name, backward): + err = None + if ( + torch.is_floating_point(tensor) + # single value tensors (like the loss) will not provide much info + and tensor.numel() >= 2 + ): + with torch.no_grad(): + if torch.isnan(tensor).any(): + err = "NaN" + elif torch.isinf(tensor).any(): + err = "Inf" + if err is not None: + err = f"{err} detected in output of {name}, shape: {tensor.shape}, {'backward' if backward else 'forward'}" + return err + + def _apply(self, module, inp, x, backward): + if torch.is_tensor(x): + if isinstance(inp, tuple) and len(inp) > 0: + inp = inp[0] + err = self._detect(x, module.__module_name, backward) + if err is not None: + if torch.is_tensor(inp) and not backward: + err += ( + f" input max: {inp.max().item()}, input min: {inp.min().item()}" + ) + + has_printed_attr = "has_printed_b" if backward else "has_printed_f" + logger.warning(err) + setattr(self, has_printed_attr, True) + elif isinstance(x, dict): + for v in x.values(): + self._apply(module, inp, v, backward) + elif isinstance(x, list) or isinstance(x, tuple): + for v in x: + self._apply(module, inp, v, backward) + + def fhook_fn(self, module, inp, output): + if not self.has_printed_f: + self._apply(module, inp, output, backward=False) + + def bhook_fn(self, module, inp, output): + if not self.has_printed_b: + self._apply(module, inp, output, backward=True) + + def close(self): + for hook in self.fhooks + self.bhooks: + hook.remove() diff --git a/fairseq-0.10.2/fairseq/options.py b/fairseq-0.10.2/fairseq/options.py new file mode 100644 index 0000000000000000000000000000000000000000..1a24fccaec4af9d91a6d49ca6b03bce936a650cf --- /dev/null +++ b/fairseq-0.10.2/fairseq/options.py @@ -0,0 +1,441 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +from typing import Callable, List, Optional + +import torch +from fairseq import utils +from fairseq.data.indexed_dataset import get_available_dataset_impl +from fairseq.dataclass.data_class import ( + CheckpointParams, + CommonEvalParams, + CommonParams, + DatasetParams, + DistributedTrainingParams, + EvalLMParams, + OptimizationParams, +) +from fairseq.dataclass.utils import gen_parser_from_dataclass + +# this import is for backward compatibility +from fairseq.utils import csv_str_list, eval_bool, eval_str_dict, eval_str_list # noqa + + +def get_preprocessing_parser(default_task="translation"): + parser = get_parser("Preprocessing", default_task) + add_preprocess_args(parser) + return parser + + +def get_training_parser(default_task="translation"): + parser = get_parser("Trainer", default_task) + add_dataset_args(parser, train=True) + add_distributed_training_args(parser) + add_model_args(parser) + add_optimization_args(parser) + add_checkpoint_args(parser) + return parser + + +def get_generation_parser(interactive=False, default_task="translation"): + parser = get_parser("Generation", default_task) + add_dataset_args(parser, gen=True) + add_distributed_training_args(parser, default_world_size=1) + add_generation_args(parser) + if interactive: + add_interactive_args(parser) + return parser + + +def get_interactive_generation_parser(default_task="translation"): + return get_generation_parser(interactive=True, default_task=default_task) + + +def get_eval_lm_parser(default_task="language_modeling"): + parser = get_parser("Evaluate Language Model", default_task) + add_dataset_args(parser, gen=True) + add_distributed_training_args(parser, default_world_size=1) + add_eval_lm_args(parser) + return parser + + +def get_validation_parser(default_task=None): + parser = get_parser("Validation", default_task) + add_dataset_args(parser, train=True) + add_distributed_training_args(parser, default_world_size=1) + group = parser.add_argument_group("Evaluation") + gen_parser_from_dataclass(group, CommonEvalParams()) + return parser + + +def parse_args_and_arch( + parser: argparse.ArgumentParser, + input_args: List[str] = None, + parse_known: bool = False, + suppress_defaults: bool = False, + modify_parser: Optional[Callable[[argparse.ArgumentParser], None]] = None, +): + """ + Args: + parser (ArgumentParser): the parser + input_args (List[str]): strings to parse, defaults to sys.argv + parse_known (bool): only parse known arguments, similar to + `ArgumentParser.parse_known_args` + suppress_defaults (bool): parse while ignoring all default values + modify_parser (Optional[Callable[[ArgumentParser], None]]): + function to modify the parser, e.g., to set default values + """ + if suppress_defaults: + # Parse args without any default values. This requires us to parse + # twice, once to identify all the necessary task/model args, and a second + # time with all defaults set to None. 
+ args = parse_args_and_arch( + parser, + input_args=input_args, + parse_known=parse_known, + suppress_defaults=False, + ) + suppressed_parser = argparse.ArgumentParser(add_help=False, parents=[parser]) + suppressed_parser.set_defaults(**{k: None for k, v in vars(args).items()}) + args = suppressed_parser.parse_args(input_args) + return argparse.Namespace( + **{k: v for k, v in vars(args).items() if v is not None} + ) + + from fairseq.models import ARCH_MODEL_REGISTRY, ARCH_CONFIG_REGISTRY, MODEL_REGISTRY + + # Before creating the true parser, we need to import optional user module + # in order to eagerly import custom tasks, optimizers, architectures, etc. + usr_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False) + usr_parser.add_argument("--user-dir", default=None) + usr_args, _ = usr_parser.parse_known_args(input_args) + utils.import_user_module(usr_args) + + if modify_parser is not None: + modify_parser(parser) + + # The parser doesn't know about model/criterion/optimizer-specific args, so + # we parse twice. First we parse the model/criterion/optimizer, then we + # parse a second time after adding the *-specific arguments. + # If input_args is given, we will parse those args instead of sys.argv. + args, _ = parser.parse_known_args(input_args) + + # Add model-specific args to parser. + if hasattr(args, "arch"): + model_specific_group = parser.add_argument_group( + "Model-specific configuration", + # Only include attributes which are explicitly given as command-line + # arguments or which have default values. + argument_default=argparse.SUPPRESS, + ) + if args.arch in ARCH_MODEL_REGISTRY: + ARCH_MODEL_REGISTRY[args.arch].add_args(model_specific_group) + elif args.arch in MODEL_REGISTRY: + MODEL_REGISTRY[args.arch].add_args(model_specific_group) + else: + raise RuntimeError() + + # Add *-specific args to parser. + from fairseq.registry import REGISTRIES + + for registry_name, REGISTRY in REGISTRIES.items(): + choice = getattr(args, registry_name, None) + if choice is not None: + cls = REGISTRY["registry"][choice] + if hasattr(cls, "add_args"): + cls.add_args(parser) + if hasattr(args, "task"): + from fairseq.tasks import TASK_REGISTRY + + TASK_REGISTRY[args.task].add_args(parser) + if getattr(args, "use_bmuf", False): + # hack to support extra args for block distributed data parallelism + from fairseq.optim.bmuf import FairseqBMUF + + FairseqBMUF.add_args(parser) + + # Modify the parser a second time, since defaults may have been reset + if modify_parser is not None: + modify_parser(parser) + + # Parse a second time. + if parse_known: + args, extra = parser.parse_known_args(input_args) + else: + args = parser.parse_args(input_args) + extra = None + # Post-process args. 
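+    # (back-fill batch_size_valid / max_tokens_valid from their training
+    # counterparts, resolve the fp16 / bf16 / tpu interactions, and pick a
+    # default seed when none was provided)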
+ if ( + hasattr(args, "batch_size_valid") and args.batch_size_valid is None + ) or not hasattr(args, "batch_size_valid"): + args.batch_size_valid = args.batch_size + if hasattr(args, "max_tokens_valid") and args.max_tokens_valid is None: + args.max_tokens_valid = args.max_tokens + if getattr(args, "memory_efficient_fp16", False): + args.fp16 = True + if getattr(args, "memory_efficient_bf16", False): + args.bf16 = True + args.tpu = getattr(args, "tpu", False) + args.bf16 = getattr(args, "bf16", False) + if args.bf16: + args.tpu = True + if args.tpu and args.fp16: + raise ValueError("Cannot combine --fp16 and --tpu, use --bf16 on TPUs") + + if getattr(args, "seed", None) is None: + args.seed = 1 # default seed for training + args.no_seed_provided = True + else: + args.no_seed_provided = False + + # Apply architecture configuration. + if hasattr(args, "arch") and args.arch in ARCH_CONFIG_REGISTRY: + ARCH_CONFIG_REGISTRY[args.arch](args) + + if parse_known: + return args, extra + else: + return args + + +def get_parser(desc, default_task="translation"): + # Before creating the true parser, we need to import optional user module + # in order to eagerly import custom tasks, optimizers, architectures, etc. + usr_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False) + usr_parser.add_argument("--user-dir", default=None) + usr_args, _ = usr_parser.parse_known_args() + utils.import_user_module(usr_args) + + parser = argparse.ArgumentParser(allow_abbrev=False) + gen_parser_from_dataclass(parser, CommonParams()) + + from fairseq.registry import REGISTRIES + + for registry_name, REGISTRY in REGISTRIES.items(): + parser.add_argument( + "--" + registry_name.replace("_", "-"), + default=REGISTRY["default"], + choices=REGISTRY["registry"].keys(), + ) + + # Task definitions can be found under fairseq/tasks/ + from fairseq.tasks import TASK_REGISTRY + + parser.add_argument( + "--task", + metavar="TASK", + default=default_task, + choices=TASK_REGISTRY.keys(), + help="task", + ) + # fmt: on + return parser + + +def add_preprocess_args(parser): + group = parser.add_argument_group("Preprocessing") + # fmt: off + group.add_argument("-s", "--source-lang", default=None, metavar="SRC", + help="source language") + group.add_argument("-t", "--target-lang", default=None, metavar="TARGET", + help="target language") + group.add_argument("--trainpref", metavar="FP", default=None, + help="train file prefix") + group.add_argument("--validpref", metavar="FP", default=None, + help="comma separated, valid file prefixes") + group.add_argument("--testpref", metavar="FP", default=None, + help="comma separated, test file prefixes") + group.add_argument("--align-suffix", metavar="FP", default=None, + help="alignment file suffix") + group.add_argument("--destdir", metavar="DIR", default="data-bin", + help="destination dir") + group.add_argument("--thresholdtgt", metavar="N", default=0, type=int, + help="map words appearing less than threshold times to unknown") + group.add_argument("--thresholdsrc", metavar="N", default=0, type=int, + help="map words appearing less than threshold times to unknown") + group.add_argument("--tgtdict", metavar="FP", + help="reuse given target dictionary") + group.add_argument("--srcdict", metavar="FP", + help="reuse given source dictionary") + group.add_argument("--nwordstgt", metavar="N", default=-1, type=int, + help="number of target words to retain") + group.add_argument("--nwordssrc", metavar="N", default=-1, type=int, + help="number of source words to retain") + 
group.add_argument("--alignfile", metavar="ALIGN", default=None, + help="an alignment file (optional)") + parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap', + choices=get_available_dataset_impl(), + help='output dataset implementation') + group.add_argument("--joined-dictionary", action="store_true", + help="Generate joined dictionary") + group.add_argument("--only-source", action="store_true", + help="Only process the source language") + group.add_argument("--padding-factor", metavar="N", default=8, type=int, + help="Pad dictionary size to be multiple of N") + group.add_argument("--workers", metavar="N", default=1, type=int, + help="number of parallel workers") + # fmt: on + return parser + + +def add_dataset_args(parser, train=False, gen=False): + group = parser.add_argument_group("dataset_data_loading") + gen_parser_from_dataclass(group, DatasetParams()) + # fmt: on + return group + + +def add_distributed_training_args(parser, default_world_size=None): + group = parser.add_argument_group("distributed_training") + if default_world_size is None: + default_world_size = max(1, torch.cuda.device_count()) + gen_parser_from_dataclass( + group, DistributedTrainingParams(distributed_world_size=default_world_size) + ) + return group + + +def add_optimization_args(parser): + group = parser.add_argument_group("optimization") + # fmt: off + gen_parser_from_dataclass(group, OptimizationParams()) + # fmt: on + return group + + +def add_checkpoint_args(parser): + group = parser.add_argument_group("checkpoint") + # fmt: off + gen_parser_from_dataclass(group, CheckpointParams()) + # fmt: on + return group + + +def add_common_eval_args(group): + gen_parser_from_dataclass(group, CommonEvalParams()) + + +def add_eval_lm_args(parser): + group = parser.add_argument_group("LM Evaluation") + add_common_eval_args(group) + gen_parser_from_dataclass(group, EvalLMParams()) + + +def add_generation_args(parser): + group = parser.add_argument_group("Generation") + add_common_eval_args(group) + # fmt: off + group.add_argument('--beam', default=5, type=int, metavar='N', + help='beam size') + group.add_argument('--nbest', default=1, type=int, metavar='N', + help='number of hypotheses to output') + group.add_argument('--max-len-a', default=0, type=float, metavar='N', + help=('generate sequences of maximum length ax + b, ' + 'where x is the source length')) + group.add_argument('--max-len-b', default=200, type=int, metavar='N', + help=('generate sequences of maximum length ax + b, ' + 'where x is the source length')) + group.add_argument('--min-len', default=1, type=float, metavar='N', + help=('minimum generation length')) + group.add_argument('--match-source-len', default=False, action='store_true', + help=('generations should match the source length')) + group.add_argument('--no-early-stop', action='store_true', + help='deprecated') + group.add_argument('--unnormalized', action='store_true', + help='compare unnormalized hypothesis scores') + group.add_argument('--no-beamable-mm', action='store_true', + help='don\'t use BeamableMM in attention layers') + group.add_argument('--lenpen', default=1, type=float, + help='length penalty: <1.0 favors shorter, >1.0 favors longer sentences') + group.add_argument('--unkpen', default=0, type=float, + help='unknown word penalty: <0 produces more unks, >0 produces fewer') + group.add_argument('--replace-unk', nargs='?', const=True, default=None, + help='perform unknown replacement (optionally with alignment dictionary)') + group.add_argument('--sacrebleu', 
action='store_true',
+                       help='score with sacrebleu')
+    group.add_argument('--score-reference', action='store_true',
+                       help='just score the reference translation')
+    group.add_argument('--prefix-size', default=0, type=int, metavar='PS',
+                       help='initialize generation by target prefix of given length')
+    group.add_argument('--no-repeat-ngram-size', default=0, type=int, metavar='N',
+                       help='ngram blocking such that this size ngram cannot be repeated in the generation')
+    group.add_argument('--sampling', action='store_true',
+                       help='sample hypotheses instead of using beam search')
+    group.add_argument('--sampling-topk', default=-1, type=int, metavar='PS',
+                       help='sample from top K likely next words instead of all words')
+    group.add_argument('--sampling-topp', default=-1.0, type=float, metavar='PS',
+                       help='sample from the smallest set whose cumulative probability mass exceeds p for next words')
+    group.add_argument('--constraints', const="ordered", nargs="?", choices=["ordered", "unordered"],
+                       help='enables lexically constrained decoding')
+    group.add_argument('--temperature', default=1., type=float, metavar='N',
+                       help='temperature for generation')
+    group.add_argument('--diverse-beam-groups', default=-1, type=int, metavar='N',
+                       help='number of groups for Diverse Beam Search')
+    group.add_argument('--diverse-beam-strength', default=0.5, type=float, metavar='N',
+                       help='strength of diversity penalty for Diverse Beam Search')
+    group.add_argument('--diversity-rate', default=-1.0, type=float, metavar='N',
+                       help='strength of diversity penalty for Diverse Siblings Search')
+    group.add_argument('--print-alignment', action='store_true',
+                       help='if set, uses attention feedback to compute and print alignment to source tokens')
+    group.add_argument('--print-step', action='store_true')
+
+    group.add_argument('--lm-path', default=None, type=str, metavar='PATH',
+                       help='path to lm checkpoint for lm fusion')
+    group.add_argument('--lm-weight', default=0.0, type=float, metavar='N',
+                       help='weight for lm probs for lm fusion')
+
+    # arguments for iterative refinement generator
+    group.add_argument('--iter-decode-eos-penalty', default=0.0, type=float, metavar='N',
+                       help='if > 0.0, it penalizes early stopping in decoding.')
+    group.add_argument('--iter-decode-max-iter', default=10, type=int, metavar='N',
+                       help='maximum iterations for iterative refinement.')
+    group.add_argument('--iter-decode-force-max-iter', action='store_true',
+                       help='if set, run exactly the maximum number of iterations without early stopping')
+    group.add_argument('--iter-decode-with-beam', default=1, type=int, metavar='N',
+                       help='if > 1, the model will generate translations varying in length.')
+    group.add_argument('--iter-decode-with-external-reranker', action='store_true',
+                       help='if set, the last checkpoint is assumed to be a reranker to rescore the translations')
+    group.add_argument('--retain-iter-history', action='store_true',
+                       help='if set, decoding returns the whole history of iterative refinement')
+    group.add_argument('--retain-dropout', action='store_true',
+                       help='Use dropout at inference time')
+    group.add_argument('--retain-dropout-modules', default=None, nargs='+', type=str,
+                       help='if set, only retain dropout for the specified modules; '
+                            'if not set, then dropout will be retained for all modules')
+
+    # special decoding format for advanced decoding.
+ group.add_argument('--decoding-format', default=None, type=str, choices=['unigram', 'ensemble', 'vote', 'dp', 'bs']) + # fmt: on + return group + + +def add_interactive_args(parser): + group = parser.add_argument_group("Interactive") + # fmt: off + group.add_argument('--buffer-size', default=0, type=int, metavar='N', + help='read this many sentences into a buffer before processing them') + group.add_argument('--input', default='-', type=str, metavar='FILE', + help='file to read from; use - for stdin') + # fmt: on + + +def add_model_args(parser): + group = parser.add_argument_group("Model configuration") + # fmt: off + + # Model definitions can be found under fairseq/models/ + # + # The model architecture can be specified in several ways. + # In increasing order of priority: + # 1) model defaults (lowest priority) + # 2) --arch argument + # 3) --encoder/decoder-* arguments (highest priority) + from fairseq.models import ARCH_MODEL_REGISTRY + group.add_argument('--arch', '-a', metavar='ARCH', + choices=ARCH_MODEL_REGISTRY.keys(), + help='model architecture') + # fmt: on + return group diff --git a/fairseq-0.10.2/fairseq/pdb.py b/fairseq-0.10.2/fairseq/pdb.py new file mode 100644 index 0000000000000000000000000000000000000000..1ba6ef0d336b30717cfdde94e1b838cfe2bfeb20 --- /dev/null +++ b/fairseq-0.10.2/fairseq/pdb.py @@ -0,0 +1,47 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import multiprocessing +import os +import pdb +import sys + + +__all__ = ["set_trace"] + + +_stdin = [None] +_stdin_lock = multiprocessing.Lock() +try: + _stdin_fd = sys.stdin.fileno() +except Exception: + _stdin_fd = None + + +class MultiprocessingPdb(pdb.Pdb): + """A Pdb wrapper that works in a multiprocessing environment. + + Usage: `from fairseq import pdb; pdb.set_trace()` + """ + + def __init__(self): + pdb.Pdb.__init__(self, nosigint=True) + + def _cmdloop(self): + stdin_bak = sys.stdin + with _stdin_lock: + try: + if _stdin_fd is not None: + if not _stdin[0]: + _stdin[0] = os.fdopen(_stdin_fd) + sys.stdin = _stdin[0] + self.cmdloop() + finally: + sys.stdin = stdin_bak + + +def set_trace(): + pdb = MultiprocessingPdb() + pdb.set_trace(sys._getframe().f_back) diff --git a/fairseq-0.10.2/fairseq/registry.py b/fairseq-0.10.2/fairseq/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..382dec22a84e736dba93469aef8f7c11dc885339 --- /dev/null +++ b/fairseq-0.10.2/fairseq/registry.py @@ -0,0 +1,106 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
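+
+# Usage sketch (illustrative; FairseqOptimizer and "sgd" are placeholders,
+# not taken from this file):
+#
+#   build_optimizer, register_optimizer, OPTIMIZER_REGISTRY, OPTIMIZER_DATACLASS_REGISTRY = \
+#       setup_registry("--optimizer", base_class=FairseqOptimizer, default="sgd")
+#
+#   @register_optimizer("my_optimizer")
+#   class MyOptimizer(FairseqOptimizer):
+#       ...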
+
+import argparse
+from argparse import Namespace
+from typing import Union
+
+from fairseq.dataclass import FairseqDataclass
+from omegaconf import DictConfig
+
+
+REGISTRIES = {}
+
+
+def setup_registry(registry_name: str, base_class=None, default=None, required=False):
+    assert registry_name.startswith("--")
+    registry_name = registry_name[2:].replace("-", "_")
+
+    REGISTRY = {}
+    REGISTRY_CLASS_NAMES = set()
+    DATACLASS_REGISTRY = {}
+
+    # maintain a registry of all registries
+    if registry_name in REGISTRIES:
+        return  # registry already exists
+    REGISTRIES[registry_name] = {"registry": REGISTRY, "default": default}
+
+    def build_x(args: Union[DictConfig, Namespace], *extra_args, **extra_kwargs):
+        if isinstance(args, DictConfig):
+            if getattr(args, "_name", None) is not None:
+                choice = args._name
+            elif hasattr(args, registry_name):
+                choice = getattr(args, registry_name)
+            else:
+                raise RuntimeError(
+                    f"Neither _name nor {registry_name} in args, args = {args}"
+                )
+        else:
+            choice = getattr(args, registry_name, None)
+
+        if choice is None:
+            if required:
+                raise ValueError("--{} is required!".format(registry_name))
+            return None
+        cls = REGISTRY[choice]
+        if hasattr(cls, "build_" + registry_name):
+            builder = getattr(cls, "build_" + registry_name)
+        else:
+            builder = cls
+        if isinstance(args, Namespace):
+            set_defaults(args, cls)
+        return builder(args, *extra_args, **extra_kwargs)
+
+    def register_x(name, dataclass=None):
+        def register_x_cls(cls):
+            if name in REGISTRY:
+                raise ValueError(
+                    "Cannot register duplicate {} ({})".format(registry_name, name)
+                )
+            if cls.__name__ in REGISTRY_CLASS_NAMES:
+                raise ValueError(
+                    "Cannot register {} with duplicate class name ({})".format(
+                        registry_name, cls.__name__
+                    )
+                )
+            if base_class is not None and not issubclass(cls, base_class):
+                raise ValueError(
+                    "{} must extend {}".format(cls.__name__, base_class.__name__)
+                )
+
+            if dataclass is not None and not issubclass(dataclass, FairseqDataclass):
+                raise ValueError(
+                    "Dataclass {} must extend FairseqDataclass".format(dataclass)
+                )
+
+            cls.__dataclass = dataclass
+            REGISTRY[name] = cls
+            DATACLASS_REGISTRY[name] = cls.__dataclass
+            REGISTRY_CLASS_NAMES.add(cls.__name__)
+            return cls
+
+        return register_x_cls
+
+    return build_x, register_x, REGISTRY, DATACLASS_REGISTRY
+
+
+def set_defaults(args: Namespace, cls):
+    """Helper to set default arguments based on *add_args*."""
+    if not hasattr(cls, "add_args"):
+        return
+    parser = argparse.ArgumentParser(
+        argument_default=argparse.SUPPRESS, allow_abbrev=False
+    )
+    cls.add_args(parser)
+    # copied from argparse.py:
+    defaults = argparse.Namespace()
+    for action in parser._actions:
+        if action.dest is not argparse.SUPPRESS:
+            if not hasattr(defaults, action.dest):
+                if action.default is not argparse.SUPPRESS:
+                    setattr(defaults, action.dest, action.default)
+    for key, default_value in vars(defaults).items():
+        if not hasattr(args, key):
+            setattr(args, key, default_value)
diff --git a/fairseq-0.10.2/fairseq/sequence_generator.py b/fairseq-0.10.2/fairseq/sequence_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddfb67853f2e864e1e15965afb653723f1f6df5d
--- /dev/null
+++ b/fairseq-0.10.2/fairseq/sequence_generator.py
@@ -0,0 +1,1008 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
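+# SequenceGenerator is normally obtained via task.build_generator(); a minimal
+# direct-construction sketch (illustrative; `models` and `task` are assumed to
+# come from checkpoint_utils.load_model_ensemble_and_task):
+#
+#     generator = SequenceGenerator(models, task.target_dictionary, beam_size=5)
+#     hypos = generator.generate(models, sample)  # List[List[Dict[str, Tensor]]]
+#     best_tokens = hypos[0][0]["tokens"]         # top hypothesis, first sentence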
+ +import math +from typing import Dict, List, Optional + +import torch +import torch.nn as nn +from fairseq import search, utils +from fairseq.data import data_utils +from fairseq.models import FairseqIncrementalDecoder +from fairseq.models.fairseq_encoder import EncoderOut +from torch import Tensor + + +class SequenceGenerator(nn.Module): + def __init__( + self, + models, + tgt_dict, + beam_size=1, + max_len_a=0, + max_len_b=200, + min_len=1, + normalize_scores=True, + len_penalty=1.0, + unk_penalty=0.0, + temperature=1.0, + match_source_len=False, + no_repeat_ngram_size=0, + search_strategy=None, + eos=None, + symbols_to_strip_from_output=None, + lm_model=None, + lm_weight=1.0, + ): + """Generates translations of a given source sentence. + + Args: + models (List[~fairseq.models.FairseqModel]): ensemble of models, + currently support fairseq.models.TransformerModel for scripting + beam_size (int, optional): beam width (default: 1) + max_len_a/b (int, optional): generate sequences of maximum length + ax + b, where x is the source length + min_len (int, optional): the minimum length of the generated output + (not including end-of-sentence) + normalize_scores (bool, optional): normalize scores by the length + of the output (default: True) + len_penalty (float, optional): length penalty, where <1.0 favors + shorter, >1.0 favors longer sentences (default: 1.0) + unk_penalty (float, optional): unknown word penalty, where <0 + produces more unks, >0 produces fewer (default: 0.0) + temperature (float, optional): temperature, where values + >1.0 produce more uniform samples and values <1.0 produce + sharper samples (default: 1.0) + match_source_len (bool, optional): outputs should match the source + length (default: False) + """ + super().__init__() + if isinstance(models, EnsembleModel): + self.model = models + else: + self.model = EnsembleModel(models) + self.tgt_dict = tgt_dict + self.pad = tgt_dict.pad() + self.unk = tgt_dict.unk() + self.eos = tgt_dict.eos() if eos is None else eos + self.symbols_to_strip_from_output = ( + symbols_to_strip_from_output.union({self.eos}) + if symbols_to_strip_from_output is not None + else {self.eos} + ) + self.vocab_size = len(tgt_dict) + self.beam_size = beam_size + # the max beam size is the dictionary size - 1, since we never select pad + self.beam_size = min(beam_size, self.vocab_size - 1) + self.max_len_a = max_len_a + self.max_len_b = max_len_b + self.min_len = min_len + + self.normalize_scores = normalize_scores + self.len_penalty = len_penalty + self.unk_penalty = unk_penalty + self.temperature = temperature + self.match_source_len = match_source_len + self.no_repeat_ngram_size = no_repeat_ngram_size + assert temperature > 0, "--temperature must be greater than 0" + + self.search = ( + search.BeamSearch(tgt_dict) if search_strategy is None else search_strategy + ) + # We only need to set src_lengths in LengthConstrainedBeamSearch. + # As a module attribute, setting it would break in multithread + # settings when the model is shared. + self.should_set_src_lengths = ( + hasattr(self.search, "needs_src_lengths") and self.search.needs_src_lengths + ) + + self.model.eval() + + self.lm_model = lm_model + self.lm_weight = lm_weight + if self.lm_model is not None: + self.lm_model.eval() + + def cuda(self): + self.model.cuda() + return self + + @torch.no_grad() + def forward( + self, + sample: Dict[str, Dict[str, Tensor]], + prefix_tokens: Optional[Tensor] = None, + bos_token: Optional[int] = None, + ): + """Generate a batch of translations. 
+
+        Args:
+            sample (dict): batch
+            prefix_tokens (torch.LongTensor, optional): force decoder to begin
+                with these tokens
+            bos_token (int, optional): beginning of sentence token
+                (default: self.eos)
+        """
+        return self._generate(sample, prefix_tokens, bos_token=bos_token)
+
+    # TODO(myleott): unused, deprecate after pytorch-translate migration
+    def generate_batched_itr(self, data_itr, beam_size=None, cuda=False, timer=None):
+        """Iterate over a batched dataset and yield individual translations.
+        Args:
+            cuda (bool, optional): use GPU for generation
+            timer (StopwatchMeter, optional): time generations
+        """
+        for sample in data_itr:
+            s = utils.move_to_cuda(sample) if cuda else sample
+            if "net_input" not in s:
+                continue
+            input = s["net_input"]
+            # model.forward normally channels prev_output_tokens into the decoder
+            # separately, but SequenceGenerator directly calls model.encoder
+            encoder_input = {
+                k: v for k, v in input.items() if k != "prev_output_tokens"
+            }
+            if timer is not None:
+                timer.start()
+            with torch.no_grad():
+                hypos = self.generate(encoder_input)
+            if timer is not None:
+                timer.stop(sum(len(h[0]["tokens"]) for h in hypos))
+            for i, id in enumerate(s["id"].data):
+                # remove padding
+                src = utils.strip_pad(input["src_tokens"].data[i, :], self.pad)
+                ref = (
+                    utils.strip_pad(s["target"].data[i, :], self.pad)
+                    if s["target"] is not None
+                    else None
+                )
+                yield id, src, ref, hypos[i]
+
+    @torch.no_grad()
+    def generate(self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs):
+        """Generate translations. Match the API of other fairseq generators.
+
+        Args:
+            models (List[~fairseq.models.FairseqModel]): ensemble of models
+            sample (dict): batch
+            prefix_tokens (torch.LongTensor, optional): force decoder to begin
+                with these tokens
+            constraints (torch.LongTensor, optional): force decoder to include
+                the list of constraints
+            bos_token (int, optional): beginning of sentence token
+                (default: self.eos)
+        """
+        return self._generate(sample, **kwargs)
+
+    def _generate(
+        self,
+        sample: Dict[str, Dict[str, Tensor]],
+        prefix_tokens: Optional[Tensor] = None,
+        constraints: Optional[Tensor] = None,
+        bos_token: Optional[int] = None,
+    ):
+        incremental_states = torch.jit.annotate(
+            List[Dict[str, Dict[str, Optional[Tensor]]]],
+            [
+                torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {})
+                for i in range(self.model.models_size)
+            ],
+        )
+        net_input = sample["net_input"]
+
+        if "src_tokens" in net_input:
+            src_tokens = net_input["src_tokens"]
+            # length of the source text being the character length except EndOfSentence and pad
+            src_lengths = (
+                (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).long().sum(dim=1)
+            )
+        elif "source" in net_input:
+            src_tokens = net_input["source"]
+            src_lengths = (
+                net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1)
+                if net_input["padding_mask"] is not None
+                else torch.tensor(src_tokens.size(-1)).to(src_tokens)
+            )
+        else:
+            raise Exception("expected src_tokens or source in net input")
+
+        # bsz: total number of sentences in beam
+        # Note that src_tokens may have more than 2 dimensions (i.e.
audio features) + bsz, src_len = src_tokens.size()[:2] + beam_size = self.beam_size + + if constraints is not None and not self.search.supports_constraints: + raise NotImplementedError( + "Target-side constraints were provided, but search method doesn't support them" + ) + + # Initialize constraints, when active + self.search.init_constraints(constraints, beam_size) + + max_len: int = -1 + if self.match_source_len: + max_len = src_lengths.max().item() + else: + max_len = min( + int(self.max_len_a * src_len + self.max_len_b), + # exclude the EOS marker + self.model.max_decoder_positions() - 1, + ) + assert ( + self.min_len <= max_len + ), "min_len cannot be larger than max_len, please adjust these!" + # compute the encoder output for each beam + encoder_outs = self.model.forward_encoder(net_input) + + # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores + new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) + new_order = new_order.to(src_tokens.device).long() + encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order) + # ensure encoder_outs is a List. + assert encoder_outs is not None + + # initialize buffers + scores = ( + torch.zeros(bsz * beam_size, max_len + 1).to(src_tokens).float() + ) # +1 for eos; pad is never chosen for scoring + tokens = ( + torch.zeros(bsz * beam_size, max_len + 2) + .to(src_tokens) + .long() + .fill_(self.pad) + ) # +2 for eos and pad + tokens[:, 0] = self.eos if bos_token is None else bos_token + attn: Optional[Tensor] = None + + # A list that indicates candidates that should be ignored. + # For example, suppose we're sampling and have already finalized 2/5 + # samples. Then cands_to_ignore would mark 2 positions as being ignored, + # so that we only finalize the remaining 3 samples. 
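+        # E.g., with bsz=2 and beam_size=3 the mask starts as
+        #     [[False, False, False],
+        #      [False, False, False]]
+        # and entries flip to True as hypotheses are finalized.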
+        cands_to_ignore = (
+            torch.zeros(bsz, beam_size).to(src_tokens).eq(-1)
+        )  # forward and backward-compatible False mask
+
+        # list of completed sentences
+        finalized = torch.jit.annotate(
+            List[List[Dict[str, Tensor]]],
+            [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)],
+        )  # contains lists of dictionaries of information about the hypothesis being finalized at each step
+
+        finished = [
+            False for i in range(bsz)
+        ]  # a boolean array indicating if the sentence at the index is finished or not
+        num_remaining_sent = bsz  # number of sentences remaining
+
+        # number of candidate hypos per step
+        cand_size = 2 * beam_size  # 2 x beam size in case half are EOS
+
+        # offset arrays for converting between different indexing schemes
+        bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens)
+        cand_offsets = torch.arange(0, cand_size).type_as(tokens)
+
+        reorder_state: Optional[Tensor] = None
+        batch_idxs: Optional[Tensor] = None
+
+        original_batch_idxs: Optional[Tensor] = None
+        if "id" in sample and isinstance(sample["id"], Tensor):
+            original_batch_idxs = sample["id"]
+        else:
+            original_batch_idxs = torch.arange(0, bsz).type_as(tokens)
+
+        for step in range(max_len + 1):  # one extra step for EOS marker
+            # reorder decoder internal states based on the prev choice of beams
+            if reorder_state is not None:
+                if batch_idxs is not None:
+                    # update beam indices to take into account removed sentences
+                    corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(
+                        batch_idxs
+                    )
+                    reorder_state.view(-1, beam_size).add_(
+                        corr.unsqueeze(-1) * beam_size
+                    )
+                    original_batch_idxs = original_batch_idxs[batch_idxs]
+                self.model.reorder_incremental_state(incremental_states, reorder_state)
+                encoder_outs = self.model.reorder_encoder_out(
+                    encoder_outs, reorder_state
+                )
+
+            lprobs, avg_attn_scores = self.model.forward_decoder(
+                tokens[:, : step + 1],
+                encoder_outs,
+                incremental_states,
+                self.temperature,
+            )
+
+            if self.lm_model is not None:
+                lm_out = self.lm_model(tokens[:, : step + 1])
+                probs = self.lm_model.get_normalized_probs(
+                    lm_out, log_probs=True, sample=None
+                )
+                probs = probs[:, -1, :] * self.lm_weight
+                lprobs += probs
+
+            # replace any NaN log-probs with -inf
+            lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs)
+
+            lprobs[:, self.pad] = -math.inf  # never select pad
+            lprobs[:, self.unk] -= self.unk_penalty  # apply unk penalty
+
+            # handle max length constraint
+            if step >= max_len:
+                lprobs[:, : self.eos] = -math.inf
+                lprobs[:, self.eos + 1 :] = -math.inf
+
+            # handle prefix tokens (possibly with different lengths)
+            if (
+                prefix_tokens is not None
+                and step < prefix_tokens.size(1)
+                and step < max_len
+            ):
+                lprobs, tokens, scores = self._prefix_tokens(
+                    step, lprobs, scores, tokens, prefix_tokens, beam_size
+                )
+            elif step < self.min_len:
+                # minimum length constraint (does not apply if using prefix_tokens)
+                lprobs[:, self.eos] = -math.inf
+
+            # Record attention scores (only supported when avg_attn_scores is a Tensor)
+            if avg_attn_scores is not None:
+                if attn is None:
+                    attn = torch.empty(
+                        bsz * beam_size, avg_attn_scores.size(1), max_len + 2
+                    ).to(scores)
+                attn[:, :, step + 1].copy_(avg_attn_scores)
+
+            scores = scores.type_as(lprobs)
+            eos_bbsz_idx = torch.empty(0).to(
+                tokens
+            )  # indices of hypothesis ending with eos (finished sentences)
+            eos_scores = torch.empty(0).to(
+                scores
+            )  # scores of hypothesis ending with eos (finished sentences)
+
+            if self.should_set_src_lengths:
+                self.search.set_src_lengths(src_lengths)
+
+            if
self.no_repeat_ngram_size > 0: + lprobs = self._no_repeat_ngram(tokens, lprobs, bsz, beam_size, step) + + # Shape: (batch, cand_size) + cand_scores, cand_indices, cand_beams = self.search.step( + step, + lprobs.view(bsz, -1, self.vocab_size), + scores.view(bsz, beam_size, -1)[:, :, :step], + tokens[:, : step + 1], + original_batch_idxs, + ) + + # cand_bbsz_idx contains beam indices for the top candidate + # hypotheses, with a range of values: [0, bsz*beam_size), + # and dimensions: [bsz, cand_size] + cand_bbsz_idx = cand_beams.add(bbsz_offsets) + + # finalize hypotheses that end in eos + # Shape of eos_mask: (batch size, beam size) + eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf) + eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask) + + # only consider eos when it's among the top beam_size indices + # Now we know what beam item(s) to finish + # Shape: 1d list of absolute-numbered + eos_bbsz_idx = torch.masked_select( + cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size] + ) + + finalized_sents: List[int] = [] + if eos_bbsz_idx.numel() > 0: + eos_scores = torch.masked_select( + cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size] + ) + + finalized_sents = self.finalize_hypos( + step, + eos_bbsz_idx, + eos_scores, + tokens, + scores, + finalized, + finished, + beam_size, + attn, + src_lengths, + max_len, + ) + num_remaining_sent -= len(finalized_sents) + + assert num_remaining_sent >= 0 + if num_remaining_sent == 0: + break + if self.search.stop_on_max_len and step >= max_len: + break + assert step < max_len + + # Remove finalized sentences (ones for which {beam_size} + # finished hypotheses have been generated) from the batch. + if len(finalized_sents) > 0: + new_bsz = bsz - len(finalized_sents) + + # construct batch_idxs which holds indices of batches to keep for the next pass + batch_mask = torch.ones( + bsz, dtype=torch.bool, device=cand_indices.device + ) + batch_mask[finalized_sents] = False + # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it + batch_idxs = torch.arange( + bsz, device=cand_indices.device + ).masked_select(batch_mask) + + # Choose the subset of the hypothesized constraints that will continue + self.search.prune_sentences(batch_idxs) + + eos_mask = eos_mask[batch_idxs] + cand_beams = cand_beams[batch_idxs] + bbsz_offsets.resize_(new_bsz, 1) + cand_bbsz_idx = cand_beams.add(bbsz_offsets) + cand_scores = cand_scores[batch_idxs] + cand_indices = cand_indices[batch_idxs] + + if prefix_tokens is not None: + prefix_tokens = prefix_tokens[batch_idxs] + src_lengths = src_lengths[batch_idxs] + cands_to_ignore = cands_to_ignore[batch_idxs] + + scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) + tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) + if attn is not None: + attn = attn.view(bsz, -1)[batch_idxs].view( + new_bsz * beam_size, attn.size(1), -1 + ) + bsz = new_bsz + else: + batch_idxs = None + + # Set active_mask so that values > cand_size indicate eos hypos + # and values < cand_size indicate candidate active hypos. + # After, the min values per row are the top candidate active hypos + + # Rewrite the operator since the element wise or is not supported in torchscript. 
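+            # i.e. a | b is computed as ~(~a & ~b) (De Morgan), which
+            # TorchScript can compile; for example, for
+            #     a = tensor([True, False]) and b = tensor([False, False]),
+            #     ~((~a) & (~b)) gives tensor([True, False]).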
+ + eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size])) + active_mask = torch.add( + eos_mask.type_as(cand_offsets) * cand_size, + cand_offsets[: eos_mask.size(1)], + ) + + # get the top beam_size active hypotheses, which are just + # the hypos with the smallest values in active_mask. + # {active_hypos} indicates which {beam_size} hypotheses + # from the list of {2 * beam_size} candidates were + # selected. Shapes: (batch size, beam size) + new_cands_to_ignore, active_hypos = torch.topk( + active_mask, k=beam_size, dim=1, largest=False + ) + + # update cands_to_ignore to ignore any finalized hypos. + cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size] + # Make sure there is at least one active item for each sentence in the batch. + assert (~cands_to_ignore).any(dim=1).all() + + # update cands_to_ignore to ignore any finalized hypos + + # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam + # can be selected more than once). + active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos) + active_scores = torch.gather(cand_scores, dim=1, index=active_hypos) + + active_bbsz_idx = active_bbsz_idx.view(-1) + active_scores = active_scores.view(-1) + + # copy tokens and scores for active hypotheses + + # Set the tokens for each beam (can select the same row more than once) + tokens[:, : step + 1] = torch.index_select( + tokens[:, : step + 1], dim=0, index=active_bbsz_idx + ) + # Select the next token for each of them + tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather( + cand_indices, dim=1, index=active_hypos + ) + if step > 0: + scores[:, :step] = torch.index_select( + scores[:, :step], dim=0, index=active_bbsz_idx + ) + scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather( + cand_scores, dim=1, index=active_hypos + ) + + # Update constraints based on which candidates were selected for the next beam + self.search.update_constraints(active_hypos) + + # copy attention for active hypotheses + if attn is not None: + attn[:, :, : step + 2] = torch.index_select( + attn[:, :, : step + 2], dim=0, index=active_bbsz_idx + ) + + # reorder incremental state in decoder + reorder_state = active_bbsz_idx + + # sort by score descending + for sent in range(len(finalized)): + scores = torch.tensor( + [float(elem["score"].item()) for elem in finalized[sent]] + ) + _, sorted_scores_indices = torch.sort(scores, descending=True) + finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices] + finalized[sent] = torch.jit.annotate( + List[Dict[str, Tensor]], finalized[sent] + ) + return finalized + + def _prefix_tokens( + self, step: int, lprobs, scores, tokens, prefix_tokens, beam_size: int + ): + """Handle prefix tokens""" + prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1) + prefix_lprobs = lprobs.gather(-1, prefix_toks.unsqueeze(-1)) + prefix_mask = prefix_toks.ne(self.pad) + lprobs[prefix_mask] = torch.tensor(-math.inf).to(lprobs) + lprobs[prefix_mask] = lprobs[prefix_mask].scatter( + -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lprobs[prefix_mask] + ) + # if prefix includes eos, then we should make sure tokens and + # scores are the same across all beams + eos_mask = prefix_toks.eq(self.eos) + if eos_mask.any(): + # validate that the first beam matches the prefix + first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[ + :, 0, 1 : step + 1 + ] + eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0] + target_prefix = 
prefix_tokens[eos_mask_batch_dim][:, :step] + assert (first_beam == target_prefix).all() + + # copy tokens, scores and lprobs from the first beam to all beams + tokens = self.replicate_first_beam(tokens, eos_mask_batch_dim, beam_size) + scores = self.replicate_first_beam(scores, eos_mask_batch_dim, beam_size) + lprobs = self.replicate_first_beam(lprobs, eos_mask_batch_dim, beam_size) + return lprobs, tokens, scores + + def replicate_first_beam(self, tensor, mask, beam_size: int): + tensor = tensor.view(-1, beam_size, tensor.size(-1)) + tensor[mask] = tensor[mask][:, :1, :] + return tensor.view(-1, tensor.size(-1)) + + def finalize_hypos( + self, + step: int, + bbsz_idx, + eos_scores, + tokens, + scores, + finalized: List[List[Dict[str, Tensor]]], + finished: List[bool], + beam_size: int, + attn: Optional[Tensor], + src_lengths, + max_len: int, + ): + """Finalize hypothesis, store finalized information in `finalized`, and change `finished` accordingly. + A sentence is finalized when {beam_size} finished items have been collected for it. + + Returns number of sentences (not beam items) being finalized. + These will be removed from the batch and not processed further. + Args: + bbsz_idx (Tensor): + """ + assert bbsz_idx.numel() == eos_scores.numel() + + # clone relevant token and attention tensors. + # tokens is (batch * beam, max_len). So the index_select + # gets the newly EOS rows, then selects cols 1..{step + 2} + tokens_clone = tokens.index_select(0, bbsz_idx)[ + :, 1 : step + 2 + ] # skip the first index, which is EOS + + tokens_clone[:, step] = self.eos + attn_clone = ( + attn.index_select(0, bbsz_idx)[:, :, 1 : step + 2] + if attn is not None + else None + ) + + # compute scores per token position + pos_scores = scores.index_select(0, bbsz_idx)[:, : step + 1] + pos_scores[:, step] = eos_scores + # convert from cumulative to per-position scores + pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1] + + # normalize sentence-level scores + if self.normalize_scores: + eos_scores /= (step + 1) ** self.len_penalty + + # cum_unfin records which sentences in the batch are finished. + # It helps match indexing between (a) the original sentences + # in the batch and (b) the current, possibly-reduced set of + # sentences. + cum_unfin: List[int] = [] + prev = 0 + for f in finished: + if f: + prev += 1 + else: + cum_unfin.append(prev) + + # set() is not supported in script export + + # The keys here are of the form "{sent}_{unfin_idx}", where + # "unfin_idx" is the index in the current (possibly reduced) + # list of sentences, and "sent" is the index in the original, + # unreduced batch + sents_seen: Dict[str, Optional[Tensor]] = {} + + # For every finished beam item + for i in range(bbsz_idx.size()[0]): + idx = bbsz_idx[i] + score = eos_scores[i] + # sentence index in the current (possibly reduced) batch + unfin_idx = idx // beam_size + # sentence index in the original (unreduced) batch + sent = unfin_idx + cum_unfin[unfin_idx] + # print(f"{step} FINISHED {idx} {score} {sent}={unfin_idx} {cum_unfin}") + # Cannot create dict for key type '(int, int)' in torchscript. 
+ # The workaround is to cast int to string + seen = str(sent.item()) + "_" + str(unfin_idx.item()) + if seen not in sents_seen: + sents_seen[seen] = None + + if self.match_source_len and step > src_lengths[unfin_idx]: + score = torch.tensor(-math.inf).to(score) + + # An input sentence (among those in a batch) is finished when + # beam_size hypotheses have been collected for it + if len(finalized[sent]) < beam_size: + if attn_clone is not None: + # remove padding tokens from attn scores + hypo_attn = attn_clone[i] + else: + hypo_attn = torch.empty(0) + + finalized[sent].append( + { + "tokens": tokens_clone[i], + "score": score, + "attention": hypo_attn, # src_len x tgt_len + "alignment": torch.empty(0), + "positional_scores": pos_scores[i], + } + ) + + newly_finished: List[int] = [] + + for seen in sents_seen.keys(): + # check termination conditions for this sentence + sent: int = int(float(seen.split("_")[0])) + unfin_idx: int = int(float(seen.split("_")[1])) + + if not finished[sent] and self.is_finished( + step, unfin_idx, max_len, len(finalized[sent]), beam_size + ): + finished[sent] = True + newly_finished.append(unfin_idx) + + return newly_finished + + def is_finished( + self, + step: int, + unfin_idx: int, + max_len: int, + finalized_sent_len: int, + beam_size: int, + ): + """ + Check whether decoding for a sentence is finished, which + occurs when the list of finalized sentences has reached the + beam size, or when we reach the maximum length. + """ + assert finalized_sent_len <= beam_size + if finalized_sent_len == beam_size or step == max_len: + return True + return False + + def calculate_banned_tokens( + self, + tokens, + step: int, + gen_ngrams: List[Dict[str, List[int]]], + no_repeat_ngram_size: int, + bbsz_idx: int, + ): + tokens_list: List[int] = tokens[ + bbsz_idx, step + 2 - no_repeat_ngram_size : step + 1 + ].tolist() + # before decoding the next token, prevent decoding of ngrams that have already appeared + ngram_index = ",".join([str(x) for x in tokens_list]) + return gen_ngrams[bbsz_idx].get(ngram_index, torch.jit.annotate(List[int], [])) + + def transpose_list(self, l: List[List[int]]): + # GeneratorExp aren't supported in TS so ignoring the lint + min_len = min([len(x) for x in l]) # noqa + l2 = [[row[i] for row in l] for i in range(min_len)] + return l2 + + def _no_repeat_ngram(self, tokens, lprobs, bsz: int, beam_size: int, step: int): + # for each beam and batch sentence, generate a list of previous ngrams + gen_ngrams: List[Dict[str, List[int]]] = [ + torch.jit.annotate(Dict[str, List[int]], {}) + for bbsz_idx in range(bsz * beam_size) + ] + cpu_tokens = tokens.cpu() + for bbsz_idx in range(bsz * beam_size): + gen_tokens: List[int] = cpu_tokens[bbsz_idx].tolist() + for ngram in self.transpose_list( + [gen_tokens[i:] for i in range(self.no_repeat_ngram_size)] + ): + key = ",".join([str(x) for x in ngram[:-1]]) + gen_ngrams[bbsz_idx][key] = gen_ngrams[bbsz_idx].get( + key, torch.jit.annotate(List[int], []) + ) + [ngram[-1]] + + if step + 2 - self.no_repeat_ngram_size >= 0: + # no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + banned_tokens = [ + self.calculate_banned_tokens( + tokens, step, gen_ngrams, self.no_repeat_ngram_size, bbsz_idx + ) + for bbsz_idx in range(bsz * beam_size) + ] + else: + banned_tokens = [ + torch.jit.annotate(List[int], []) for bbsz_idx in range(bsz * beam_size) + ] + for bbsz_idx in range(bsz * beam_size): + lprobs[bbsz_idx][ + torch.tensor(banned_tokens[bbsz_idx]).long() + ] = torch.tensor(-math.inf).to(lprobs) + 
return lprobs + + +class EnsembleModel(nn.Module): + """A wrapper around an ensemble of models.""" + + def __init__(self, models): + super().__init__() + self.models_size = len(models) + # method '__len__' is not supported in ModuleList for torch script + self.single_model = models[0] + self.models = nn.ModuleList(models) + + self.has_incremental: bool = False + if all( + hasattr(m, "decoder") and isinstance(m.decoder, FairseqIncrementalDecoder) + for m in models + ): + self.has_incremental = True + + def forward(self): + pass + + def has_encoder(self): + return hasattr(self.single_model, "encoder") + + def has_incremental_states(self): + return self.has_incremental + + def max_decoder_positions(self): + return min([m.max_decoder_positions() for m in self.models]) + + @torch.jit.export + def forward_encoder(self, net_input: Dict[str, Tensor]): + if not self.has_encoder(): + return None + return [model.encoder.forward_torchscript(net_input) for model in self.models] + + @torch.jit.export + def forward_decoder( + self, + tokens, + encoder_outs: List[EncoderOut], + incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]], + temperature: float = 1.0, + ): + log_probs = [] + avg_attn: Optional[Tensor] = None + encoder_out: Optional[EncoderOut] = None + for i, model in enumerate(self.models): + if self.has_encoder(): + encoder_out = encoder_outs[i] + # decode each model + if self.has_incremental_states(): + decoder_out = model.decoder.forward( + tokens, + encoder_out=encoder_out, + incremental_state=incremental_states[i], + ) + else: + decoder_out = model.decoder.forward(tokens, encoder_out=encoder_out) + + attn: Optional[Tensor] = None + decoder_len = len(decoder_out) + if decoder_len > 1 and decoder_out[1] is not None: + if isinstance(decoder_out[1], Tensor): + attn = decoder_out[1] + else: + attn_holder = decoder_out[1]["attn"] + if isinstance(attn_holder, Tensor): + attn = attn_holder + elif attn_holder is not None: + attn = attn_holder[0] + if attn is not None: + attn = attn[:, -1, :] + + decoder_out_tuple = ( + decoder_out[0][:, -1:, :].div_(temperature), + None if decoder_len <= 1 else decoder_out[1], + ) + + probs = model.get_normalized_probs( + decoder_out_tuple, log_probs=True, sample=None + ) + probs = probs[:, -1, :] + if self.models_size == 1: + return probs, attn + + log_probs.append(probs) + if attn is not None: + if avg_attn is None: + avg_attn = attn + else: + avg_attn.add_(attn) + + avg_probs = torch.logsumexp(torch.stack(log_probs, dim=0), dim=0) - math.log( + self.models_size + ) + + if avg_attn is not None: + avg_attn.div_(self.models_size) + return avg_probs, avg_attn + + @torch.jit.export + def reorder_encoder_out(self, encoder_outs: Optional[List[EncoderOut]], new_order): + """ + Reorder encoder output according to *new_order*. 
+ + Args: + encoder_out: output from the ``forward()`` method + new_order (LongTensor): desired order + + Returns: + *encoder_out* rearranged according to *new_order* + """ + new_outs: List[EncoderOut] = [] + if not self.has_encoder(): + return new_outs + for i, model in enumerate(self.models): + assert encoder_outs is not None + new_outs.append( + model.encoder.reorder_encoder_out(encoder_outs[i], new_order) + ) + return new_outs + + @torch.jit.export + def reorder_incremental_state( + self, + incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]], + new_order, + ): + if not self.has_incremental_states(): + return + for i, model in enumerate(self.models): + model.decoder.reorder_incremental_state_scripting( + incremental_states[i], new_order + ) + + +class SequenceGeneratorWithAlignment(SequenceGenerator): + def __init__(self, models, tgt_dict, left_pad_target=False, **kwargs): + """Generates translations of a given source sentence. + + Produces alignments following "Jointly Learning to Align and + Translate with Transformer Models" (Garg et al., EMNLP 2019). + + Args: + left_pad_target (bool, optional): Whether or not the + hypothesis should be left padded or not when they are + teacher forced for generating alignments. + """ + super().__init__(EnsembleModelWithAlignment(models), tgt_dict, **kwargs) + self.left_pad_target = left_pad_target + + @torch.no_grad() + def generate(self, models, sample, **kwargs): + finalized = super()._generate(sample, **kwargs) + + src_tokens = sample["net_input"]["src_tokens"] + bsz = src_tokens.shape[0] + beam_size = self.beam_size + ( + src_tokens, + src_lengths, + prev_output_tokens, + tgt_tokens, + ) = self._prepare_batch_for_alignment(sample, finalized) + if any(getattr(m, "full_context_alignment", False) for m in self.model.models): + attn = self.model.forward_align(src_tokens, src_lengths, prev_output_tokens) + else: + attn = [ + finalized[i // beam_size][i % beam_size]["attention"].transpose(1, 0) + for i in range(bsz * beam_size) + ] + + if src_tokens.device != "cpu": + src_tokens = src_tokens.to("cpu") + tgt_tokens = tgt_tokens.to("cpu") + attn = [i.to("cpu") for i in attn] + + # Process the attn matrix to extract hard alignments. 
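+        # extract_hard_alignment derives a hard alignment by taking, for each
+        # target token, the argmax source position of its attention row,
+        # with pad/eos positions excluded.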
+ for i in range(bsz * beam_size): + alignment = utils.extract_hard_alignment( + attn[i], src_tokens[i], tgt_tokens[i], self.pad, self.eos + ) + finalized[i // beam_size][i % beam_size]["alignment"] = alignment + return finalized + + def _prepare_batch_for_alignment(self, sample, hypothesis): + src_tokens = sample["net_input"]["src_tokens"] + bsz = src_tokens.shape[0] + src_tokens = ( + src_tokens[:, None, :] + .expand(-1, self.beam_size, -1) + .contiguous() + .view(bsz * self.beam_size, -1) + ) + src_lengths = sample["net_input"]["src_lengths"] + src_lengths = ( + src_lengths[:, None] + .expand(-1, self.beam_size) + .contiguous() + .view(bsz * self.beam_size) + ) + prev_output_tokens = data_utils.collate_tokens( + [beam["tokens"] for example in hypothesis for beam in example], + self.pad, + self.eos, + self.left_pad_target, + move_eos_to_beginning=True, + ) + tgt_tokens = data_utils.collate_tokens( + [beam["tokens"] for example in hypothesis for beam in example], + self.pad, + self.eos, + self.left_pad_target, + move_eos_to_beginning=False, + ) + return src_tokens, src_lengths, prev_output_tokens, tgt_tokens + + +class EnsembleModelWithAlignment(EnsembleModel): + """A wrapper around an ensemble of models.""" + + def __init__(self, models): + super().__init__(models) + + def forward_align(self, src_tokens, src_lengths, prev_output_tokens): + avg_attn = None + for model in self.models: + decoder_out = model(src_tokens, src_lengths, prev_output_tokens) + attn = decoder_out[1]["attn"][0] + if avg_attn is None: + avg_attn = attn + else: + avg_attn.add_(attn) + if len(self.models) > 1: + avg_attn.div_(len(self.models)) + return avg_attn diff --git a/fairseq-0.10.2/fairseq/sequence_scorer.py b/fairseq-0.10.2/fairseq/sequence_scorer.py new file mode 100644 index 0000000000000000000000000000000000000000..411d4df4445ef8dd3f1907ad56f9de6943d1fed8 --- /dev/null +++ b/fairseq-0.10.2/fairseq/sequence_scorer.py @@ -0,0 +1,153 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import sys + +import torch +from fairseq import utils + + +class SequenceScorer(object): + """Scores the target for a given source sentence.""" + + def __init__( + self, + tgt_dict, + softmax_batch=None, + compute_alignment=False, + eos=None, + symbols_to_strip_from_output=None, + ): + self.pad = tgt_dict.pad() + self.eos = tgt_dict.eos() if eos is None else eos + self.softmax_batch = softmax_batch or sys.maxsize + assert self.softmax_batch > 0 + self.compute_alignment = compute_alignment + self.symbols_to_strip_from_output = ( + symbols_to_strip_from_output.union({self.eos}) + if symbols_to_strip_from_output is not None + else {self.eos} + ) + + @torch.no_grad() + def generate(self, models, sample, **kwargs): + """Score a batch of translations.""" + net_input = sample["net_input"] + + def batch_for_softmax(dec_out, target): + # assumes decoder_out[0] is the only thing needed (may not be correct for future models!) 
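+            # E.g., with bsz=32, tsz=50 and softmax_batch=1000, the (32, 50, V)
+            # logits are viewed as (1, 1600, V) and normalized in slices of at
+            # most 1000 positions to bound peak memory.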
+ first, rest = dec_out[0], dec_out[1:] + bsz, tsz, dim = first.shape + if bsz * tsz < self.softmax_batch: + yield dec_out, target, True + else: + flat = first.contiguous().view(1, -1, dim) + flat_tgt = target.contiguous().view(flat.shape[:-1]) + s = 0 + while s < flat.size(1): + e = s + self.softmax_batch + yield (flat[:, s:e],) + rest, flat_tgt[:, s:e], False + s = e + + def gather_target_probs(probs, target): + probs = probs.gather( + dim=2, + index=target.unsqueeze(-1), + ) + return probs + + orig_target = sample["target"] + + # compute scores for each model in the ensemble + avg_probs = None + avg_attn = None + for model in models: + model.eval() + decoder_out = model(**net_input) + attn = decoder_out[1] if len(decoder_out) > 1 else None + if type(attn) is dict: + attn = attn.get("attn", None) + + batched = batch_for_softmax(decoder_out, orig_target) + probs, idx = None, 0 + for bd, tgt, is_single in batched: + sample["target"] = tgt + curr_prob = model.get_normalized_probs( + bd, log_probs=len(models) == 1, sample=sample + ).data + if is_single: + probs = gather_target_probs(curr_prob, orig_target) + else: + if probs is None: + probs = curr_prob.new(orig_target.numel()) + step = curr_prob.size(0) * curr_prob.size(1) + end = step + idx + tgt_probs = gather_target_probs( + curr_prob.view(tgt.shape + (curr_prob.size(-1),)), tgt + ) + probs[idx:end] = tgt_probs.view(-1) + idx = end + sample["target"] = orig_target + + probs = probs.view(sample["target"].shape) + + if avg_probs is None: + avg_probs = probs + else: + avg_probs.add_(probs) + if attn is not None: + if torch.is_tensor(attn): + attn = attn.data + else: + attn = attn[0] + if avg_attn is None: + avg_attn = attn + else: + avg_attn.add_(attn) + if len(models) > 1: + avg_probs.div_(len(models)) + avg_probs.log_() + if avg_attn is not None: + avg_attn.div_(len(models)) + + bsz = avg_probs.size(0) + hypos = [] + start_idxs = sample["start_indices"] if "start_indices" in sample else [0] * bsz + for i in range(bsz): + # remove padding from ref + ref = ( + utils.strip_pad(sample["target"][i, start_idxs[i] :], self.pad) + if sample["target"] is not None + else None + ) + tgt_len = ref.numel() + avg_probs_i = avg_probs[i][start_idxs[i] : start_idxs[i] + tgt_len] + score_i = avg_probs_i.sum() / tgt_len + if avg_attn is not None: + avg_attn_i = avg_attn[i] + if self.compute_alignment: + alignment = utils.extract_hard_alignment( + avg_attn_i, + sample["net_input"]["src_tokens"][i], + sample["target"][i], + self.pad, + self.eos, + ) + else: + alignment = None + else: + avg_attn_i = alignment = None + hypos.append( + [ + { + "tokens": ref, + "score": score_i, + "attention": avg_attn_i, + "alignment": alignment, + "positional_scores": avg_probs_i, + } + ] + ) + return hypos diff --git a/fairseq-0.10.2/fairseq/trainer.py b/fairseq-0.10.2/fairseq/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..1a7fd8a2cce56cb94767c89fa1343d37357bf161 --- /dev/null +++ b/fairseq-0.10.2/fairseq/trainer.py @@ -0,0 +1,1140 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +Train a network across multiple GPUs. 
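+
+The Trainer below wraps a model/criterion pair and handles device placement,
+mixed precision, gradient accumulation and distributed gradient all-reduce;
+see train_step() for the per-update flow.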
+""" + +import contextlib +import logging +import sys +import time +from itertools import chain +from typing import Any, Dict, List + +import torch +from fairseq import checkpoint_utils, distributed_utils, models, optim, utils +from fairseq.file_io import PathManager +from fairseq.logging import meters, metrics +from fairseq.nan_detector import NanDetector +from fairseq.optim import lr_scheduler + + +logger = logging.getLogger(__name__) + + +class Trainer(object): + """Main class for data parallel training. + + This class supports synchronous distributed data parallel training, + where multiple workers each have a full model replica and gradients + are accumulated across workers before each update. We use + :class:`~torch.nn.parallel.DistributedDataParallel` to handle + communication of the gradients across workers. + """ + + def __init__(self, args, task, model, criterion, quantizer=None): + self.args = args + self.task = task + + # catalog shared parameters + shared_params = _catalog_shared_params(model) + + self.tpu = getattr(args, "tpu", False) + self.cuda = torch.cuda.is_available() and not args.cpu and not self.tpu + if self.cuda: + self.device = torch.device("cuda") + elif self.tpu: + self.device = utils.get_tpu_device(args) + else: + self.device = torch.device("cpu") + + # copy model and criterion to current device/dtype + self._criterion = criterion + self._model = model + if self.tpu: + import torch_xla.core.xla_model as xm + + self._model = xm.send_cpu_data_to_device(self._model, self.device) + if args.fp16: + self._criterion = self._criterion.half() + self._model = self._model.half() + elif args.bf16: + self._criterion = self._criterion.to(dtype=torch.bfloat16) + self._model = self._model.to(dtype=torch.bfloat16) + if not args.pipeline_model_parallel: + self._criterion = self._criterion.to(device=self.device) + self._model = self._model.to(device=self.device) + self.pipeline_model_parallel = args.pipeline_model_parallel + self.last_device = None + if self.cuda and self.pipeline_model_parallel: + self.last_device = torch.device(args.pipeline_devices[-1]) + + # check that shared parameters are preserved after device transfer + for shared_param in shared_params: + ref = _get_module_by_path(self._model, shared_param[0]) + for path in shared_param[1:]: + logger.info( + "detected shared parameter: {} <- {}".format(shared_param[0], path) + ) + _set_module_by_path(self._model, path, ref) + + self._dummy_batch = None # indicates we don't have a dummy batch at first + self._lr_scheduler = None + self._num_updates = 0 + self._num_xla_compiles = 0 # for TPUs + self._optim_history = None + self._optimizer = None + self._warn_once = set() + self._wrapped_criterion = None + self._wrapped_model = None + + # TODO(myleott): support tpu + if self.cuda and self.data_parallel_world_size > 1: + self._grad_norm_buf = torch.cuda.DoubleTensor(self.data_parallel_world_size) + else: + self._grad_norm_buf = None + + self.quantizer = quantizer + if self.quantizer is not None: + self.quantizer.set_trainer(self) + + # get detailed cuda environment + if self.cuda: + self.cuda_env = utils.CudaEnvironment() + if self.data_parallel_world_size > 1: + self.cuda_env_arr = distributed_utils.all_gather_list(self.cuda_env) + else: + self.cuda_env_arr = [self.cuda_env] + if self.data_parallel_rank == 0: + utils.CudaEnvironment.pretty_print_cuda_env_list(self.cuda_env_arr) + else: + self.cuda_env = None + self.cuda_env_arr = None + + metrics.log_start_time("wall", priority=790, round=0) + + self._start_time = 
time.time()
+        self._previous_training_time = 0
+        self._cumulative_training_time = None
+
+    def reinitialize(self):
+        """Reinitialize the Trainer, typically after model params change."""
+        self._lr_scheduler = None
+        self._optimizer = None
+        self._wrapped_criterion = None
+        self._wrapped_model = None
+
+    @property
+    def data_parallel_world_size(self):
+        return self.args.distributed_world_size
+
+    @property
+    def data_parallel_process_group(self):
+        if self.tpu:
+            return ("tpu", None)
+        else:
+            return None
+
+    @property
+    def data_parallel_rank(self):
+        return self.args.distributed_rank
+
+    @property
+    def is_data_parallel_master(self):
+        return distributed_utils.is_master(self.args)
+
+    @property
+    def criterion(self):
+        if self._wrapped_criterion is None:
+            if (
+                utils.has_parameters(self._criterion)
+                and self.data_parallel_world_size > 1
+                and not self.args.use_bmuf
+                and not self.tpu
+            ):
+                self._wrapped_criterion = models.DistributedFairseqModel(
+                    self.args,
+                    self._criterion,
+                    process_group=self.data_parallel_process_group,
+                )
+            else:
+                self._wrapped_criterion = self._criterion
+        return self._wrapped_criterion
+
+    @property
+    def model(self):
+        if self._wrapped_model is None:
+            if (
+                self.data_parallel_world_size > 1
+                and not self.args.use_bmuf
+                and not self.tpu
+            ):
+                self._wrapped_model = models.DistributedFairseqModel(
+                    self.args,
+                    self._model,
+                    process_group=self.data_parallel_process_group,
+                )
+            else:
+                self._wrapped_model = self._model
+        return self._wrapped_model
+
+    @property
+    def optimizer(self):
+        if self._optimizer is None:
+            self._build_optimizer()
+        return self._optimizer
+
+    @property
+    def lr_scheduler(self):
+        if self._lr_scheduler is None:
+            self._build_optimizer()  # this will initialize self._lr_scheduler
+        return self._lr_scheduler
+
+    def _build_optimizer(self):
+        params = list(
+            filter(
+                lambda p: p.requires_grad,
+                chain(self.model.parameters(), self.criterion.parameters()),
+            )
+        )
+
+        if self.args.fp16 or self.args.bf16:
+            if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
+                logger.info(
+                    "NOTE: your device does NOT support faster training with --fp16, "
+                    "please switch to FP32 which is likely to be faster"
+                )
+            if self.args.memory_efficient_fp16 or self.args.memory_efficient_bf16:
+                self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
+                    self.args, params
+                )
+            else:
+                self._optimizer = optim.FP16Optimizer.build_optimizer(self.args, params)
+        else:
+            if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
+                logger.info("NOTE: your device may support faster training with --fp16")
+            self._optimizer = optim.build_optimizer(self.args, params)
+
+        if self.args.use_bmuf:
+            self._optimizer = optim.FairseqBMUF(self.args, self._optimizer)
+
+        if self.args.zero_sharding == "os":
+            if (
+                self.args.fp16
+                and not self.args.memory_efficient_fp16
+                and not self.args.memory_efficient_bf16
+            ) and not self.args.fp16_no_flatten_grads:
+                raise ValueError(
+                    "ZeRO is incompatible with fp16 and flattened grads. "
+                    "Please use --fp16-no-flatten-grads"
+                )
+            else:
+                optim.shard_(
+                    self.args, self._optimizer, self.data_parallel_process_group
+                )
+
+        # We should initialize the learning rate scheduler immediately after
+        # building the optimizer, so that the initial learning rate is set.
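+        # E.g., the inverse_sqrt schedule warms up from args.warmup_init_lr at
+        # update 0; deferring step_update(0) would leave the optimizer at the
+        # wrong learning rate for the first batch.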
+ self._lr_scheduler = lr_scheduler.build_lr_scheduler(self.args, self.optimizer) + self._lr_scheduler.step_update(0) + + def consolidate_optimizer(self): + """For OSS, we need to consolidate the state dict.""" + if hasattr(self.optimizer.optimizer, "consolidate_state_dict"): + self.optimizer.optimizer.consolidate_state_dict() + + def save_checkpoint(self, filename, extra_state): + """Save all training state in a checkpoint file.""" + if self.is_data_parallel_master: # only save one checkpoint + extra_state["metrics"] = metrics.state_dict() + extra_state["previous_training_time"] = self.cumulative_training_time() + checkpoint_utils.save_state( + filename, + self.args, + self.get_model().state_dict(), + self.get_criterion(), + self.optimizer, + self.lr_scheduler, + self.get_num_updates(), + self._optim_history, + extra_state, + ) + + def load_checkpoint( + self, + filename, + reset_optimizer=False, + reset_lr_scheduler=False, + optimizer_overrides=None, + reset_meters=False, + ): + """Load all training state from a checkpoint file.""" + extra_state, self._optim_history, last_optim_state = None, [], None + + bexists = PathManager.isfile(filename) + if bexists: + state = checkpoint_utils.load_checkpoint_to_cpu(filename) + + # load model parameters + try: + self.get_model().load_state_dict( + state["model"], strict=True, args=self.args + ) + if utils.has_parameters(self.get_criterion()): + self.get_criterion().load_state_dict( + state["criterion"], strict=True + ) + except Exception: + raise Exception( + "Cannot load model parameters from checkpoint {}; " + "please ensure that the architectures match.".format(filename) + ) + + extra_state = state["extra_state"] + self._optim_history = state["optimizer_history"] + last_optim_state = state.get("last_optimizer_state", None) + + if last_optim_state is not None and not reset_optimizer: + # rebuild optimizer after loading model, since params may have changed + self._build_optimizer() + + # only reload optimizer and lr_scheduler if they match + last_optim = self._optim_history[-1] + assert ( + last_optim["criterion_name"] == self.get_criterion().__class__.__name__ + ), "Criterion does not match; please reset the optimizer (--reset-optimizer)." + assert ( + last_optim["optimizer_name"] == self.optimizer.__class__.__name__ + ), "Optimizer does not match; please reset the optimizer (--reset-optimizer)." 
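+            # optimizer_overrides sketch (illustrative): passing e.g.
+            #     optimizer_overrides={"lr": 0.0005}
+            # replaces the corresponding values in the loaded optimizer state,
+            # which is how --optimizer-overrides reaches this point.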
+ + if not reset_lr_scheduler: + self.lr_scheduler.load_state_dict(last_optim["lr_scheduler_state"]) + self.optimizer.load_state_dict(last_optim_state, optimizer_overrides) + + self.set_num_updates(last_optim["num_updates"]) + + if extra_state is not None: + epoch = extra_state["train_iterator"]["epoch"] + logger.info( + "loaded checkpoint {} (epoch {} @ {} updates)".format( + filename, epoch, self.get_num_updates() + ) + ) + + if "previous_training_time" in extra_state: + self._previous_training_time = extra_state["previous_training_time"] + self._start_time = time.time() + + self.lr_step(epoch) + + if "metrics" in extra_state and not reset_meters: + metrics.load_state_dict(extra_state["metrics"]) + + # reset TimeMeters, since their start times don't make sense anymore + for meter in metrics.get_meters("default"): + if isinstance(meter, meters.TimeMeter): + meter.reset() + else: + logger.info("no existing checkpoint found {}".format(filename)) + + return extra_state + + def get_train_iterator( + self, + epoch, + combine=True, + load_dataset=True, + data_selector=None, + shard_batch_itr=True, + disable_iterator_cache=False, + ): + """Return an EpochBatchIterator over the training set for a given epoch.""" + if load_dataset: + logger.info("loading train data for epoch {}".format(epoch)) + self.task.load_dataset( + self.args.train_subset, + epoch=epoch, + combine=combine, + data_selector=data_selector, + ) + batch_iterator = self.task.get_batch_iterator( + dataset=self.task.dataset(self.args.train_subset), + max_tokens=self.args.max_tokens, + max_sentences=self.args.batch_size, + max_positions=utils.resolve_max_positions( + self.task.max_positions(), + self.model.max_positions(), + self.args.max_tokens, + ), + ignore_invalid_inputs=True, + required_batch_size_multiple=self.args.required_batch_size_multiple, + seed=self.args.seed, + num_shards=self.data_parallel_world_size if shard_batch_itr else 1, + shard_id=self.data_parallel_rank if shard_batch_itr else 0, + num_workers=self.args.num_workers, + epoch=epoch, + data_buffer_size=self.args.data_buffer_size, + disable_iterator_cache=disable_iterator_cache, + ) + self.reset_dummy_batch(batch_iterator.first_batch) + return batch_iterator + + def get_valid_iterator( + self, + subset, + disable_iterator_cache=False, + ): + """Return an EpochBatchIterator over given validation subset for a given epoch.""" + batch_iterator = self.task.get_batch_iterator( + dataset=self.task.dataset(subset), + max_tokens=self.args.max_tokens_valid, + max_sentences=self.args.batch_size_valid, + max_positions=utils.resolve_max_positions( + self.task.max_positions(), + self.model.max_positions(), + ), + ignore_invalid_inputs=self.args.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=self.args.required_batch_size_multiple, + seed=self.args.seed, + num_shards=self.data_parallel_world_size, + shard_id=self.data_parallel_rank, + num_workers=self.args.num_workers, + data_buffer_size=self.args.data_buffer_size, + disable_iterator_cache=disable_iterator_cache, + ) + self.reset_dummy_batch(batch_iterator.first_batch) + return batch_iterator + + def begin_epoch(self, epoch): + """Called at the beginning of each epoch.""" + logger.info("begin training epoch {}".format(epoch)) + + self.lr_step_begin_epoch(epoch) + + if self.quantizer is not None: + self.quantizer.begin_epoch(epoch) + + # task specific setup per epoch + self.task.begin_epoch(epoch, self.get_model()) + + if self.tpu: + import torch_xla.core.xla_model as xm + + xm.rendezvous("begin_epoch") # wait 
for all workers + xm.mark_step() + + def begin_valid_epoch(self, epoch): + """Called at the beginning of each validation epoch.""" + + # task specific setup per validation epoch + self.task.begin_valid_epoch(epoch, self.get_model()) + + def reset_dummy_batch(self, batch): + self._dummy_batch = batch + + @metrics.aggregate("train") + def train_step(self, samples, raise_oom=False): + """Do forward, backward and parameter update.""" + self._set_seed() + self.model.train() + self.criterion.train() + self.zero_grad() + + metrics.log_start_time("train_wall", priority=800, round=0) + + # forward and backward pass + logging_outputs, sample_size, ooms = [], 0, 0 + for i, sample in enumerate(samples): + sample = self._prepare_sample(sample) + if sample is None: + # when sample is None, run forward/backward on a dummy batch + # and ignore the resulting gradients + sample = self._prepare_sample(self._dummy_batch) + is_dummy_batch = True + else: + if self._dummy_batch == "DUMMY": + self._dummy_batch = sample + is_dummy_batch = False + + def maybe_no_sync(): + """ + Whenever *samples* contains more than one mini-batch, we + want to accumulate gradients locally and only call + all-reduce in the last backwards pass. + """ + if ( + self.data_parallel_world_size > 1 + and hasattr(self.model, "no_sync") + and i < len(samples) - 1 + ): + return self.model.no_sync() + else: + return contextlib.ExitStack() # dummy contextmanager + + try: + with maybe_no_sync(): + # forward and backward + loss, sample_size_i, logging_output = self.task.train_step( + sample=sample, + model=self.model, + criterion=self.criterion, + optimizer=self.optimizer, + update_num=self.get_num_updates(), + ignore_grad=is_dummy_batch, + ) + del loss + + logging_outputs.append(logging_output) + sample_size += sample_size_i + + # emptying the CUDA cache after the first step can + # reduce the chance of OOM + if self.cuda and self.get_num_updates() == 0: + torch.cuda.empty_cache() + except RuntimeError as e: + if "out of memory" in str(e): + self._log_oom(e) + if raise_oom: + raise e + logger.warning( + "attempting to recover from OOM in forward/backward pass" + ) + ooms += 1 + self.zero_grad() + if self.cuda: + torch.cuda.empty_cache() + if self.args.distributed_world_size == 1: + return None + else: + raise e + + if self.tpu and i < len(samples) - 1: + # tpu-comment: every XLA operation before marking step is + # appended to the IR graph, and processing too many batches + # before marking step can lead to OOM errors. 
+ # To handle gradient accumulation use case, we explicitly + # mark step here for every forward pass without a backward pass + import torch_xla.core.xla_model as xm + + xm.mark_step() + + if is_dummy_batch: + if torch.is_tensor(sample_size): + sample_size.zero_() + else: + sample_size *= 0.0 + + if torch.is_tensor(sample_size): + sample_size = sample_size.float() + else: + sample_size = float(sample_size) + + # gather logging outputs from all replicas + if self._sync_stats(): + train_time = self._local_cumulative_training_time() + logging_outputs, ( + sample_size, + ooms, + total_train_time, + ) = self._aggregate_logging_outputs( + logging_outputs, + sample_size, + ooms, + train_time, + ignore=is_dummy_batch, + ) + self._cumulative_training_time = ( + total_train_time / self.data_parallel_world_size + ) + + if hasattr(self.model, "all_reduce"): + self.model.all_reduce() + + overflow = False + try: + if self.tpu and self.data_parallel_world_size > 1: + import torch_xla.core.xla_model as xm + + gradients = xm._fetch_gradients(self.optimizer.optimizer) + xm.all_reduce( + "sum", gradients, scale=1.0 / self.data_parallel_world_size + ) + + with torch.autograd.profiler.record_function("multiply-grads"): + # multiply gradients by (# GPUs / sample_size) since DDP + # already normalizes by the number of GPUs. Thus we get + # (sum_of_gradients / sample_size). + if not self.args.use_bmuf: + self.optimizer.multiply_grads( + self.data_parallel_world_size / sample_size + ) + elif sample_size > 0: # BMUF needs to check sample size + num = self.data_parallel_world_size if self._sync_stats() else 1 + self.optimizer.multiply_grads(num / sample_size) + + with torch.autograd.profiler.record_function("clip-grads"): + # clip grads + grad_norm = self.clip_grad_norm(self.args.clip_norm) + + # check that grad norms are consistent across workers + # on tpu check tensor is slow + if not self.tpu: + if ( + not self.args.use_bmuf + and self.args.distributed_wrapper != "SlowMo" + ): + self._check_grad_norms(grad_norm) + if not torch.isfinite(grad_norm).all(): + # check local gradnorm single GPU case, trigger NanDetector + raise FloatingPointError("gradients are Nan/Inf") + + with torch.autograd.profiler.record_function("optimizer"): + # take an optimization step + self.optimizer.step() + + except FloatingPointError: + # re-run the forward and backward pass with hooks attached to print + # out where it fails + with NanDetector(self.get_model()): + self.task.train_step( + sample, + self.model, + self.criterion, + self.optimizer, + self.get_num_updates(), + ignore_grad=False, + ) + raise + except OverflowError as e: + overflow = True + logger.info("NOTE: overflow detected, " + str(e)) + grad_norm = torch.tensor(0.0).cuda() + self.zero_grad() + except RuntimeError as e: + if "out of memory" in str(e): + self._log_oom(e) + logger.error("OOM during optimization, irrecoverable") + raise e + + # Some distributed wrappers (e.g., SlowMo) need access to the optimizer after the step + if hasattr(self.model, "perform_additional_optimizer_actions"): + if hasattr(self.optimizer, "fp32_params"): + self.model.perform_additional_optimizer_actions( + self.optimizer.optimizer, self.optimizer.fp32_params + ) + else: + self.model.perform_additional_optimizer_actions( + self.optimizer.optimizer + ) + + if not overflow or self.args.distributed_wrapper == "SlowMo": + self.set_num_updates(self.get_num_updates() + 1) + + if self.tpu: + # mark step on TPUs + import torch_xla.core.xla_model as xm + + xm.mark_step() + + # only log stats every 
log_interval steps + # this causes wps to be misreported when log_interval > 1 + logging_output = {} + if self.get_num_updates() % self.args.log_interval == 0: + # log memory usage + mem_info = xm.get_memory_info(self.device) + gb_free = mem_info["kb_free"] / 1024 / 1024 + gb_total = mem_info["kb_total"] / 1024 / 1024 + metrics.log_scalar( + "gb_free", + gb_free, + priority=1500, + round=1, + weight=0, + ) + metrics.log_scalar( + "gb_total", + gb_total, + priority=1600, + round=1, + weight=0, + ) + + logging_output = self._reduce_and_log_stats( + logging_outputs, + sample_size, + grad_norm, + ) + + # log whenever there's an XLA compilation, since these + # slow down training and may indicate opportunities for + # optimization + self._check_xla_compilation() + else: + # log stats + logging_output = self._reduce_and_log_stats( + logging_outputs, + sample_size, + grad_norm, + ) + + # clear CUDA cache to reduce memory fragmentation + if ( + self.cuda + and self.args.empty_cache_freq > 0 + and ( + (self.get_num_updates() + self.args.empty_cache_freq - 1) + % self.args.empty_cache_freq + ) + == 0 + ): + torch.cuda.empty_cache() + + if self.args.fp16: + metrics.log_scalar( + "loss_scale", + self.optimizer.scaler.loss_scale, + priority=700, + round=4, + weight=0, + ) + + metrics.log_stop_time("train_wall") + return logging_output + + @metrics.aggregate("valid") + def valid_step(self, sample, raise_oom=False): + """Do forward pass in evaluation mode.""" + if self.tpu: + import torch_xla.core.xla_model as xm + + xm.rendezvous("valid_step") # wait for all workers + xm.mark_step() + + with torch.no_grad(): + self.model.eval() + self.criterion.eval() + + sample = self._prepare_sample(sample) + if sample is None: + sample = self._prepare_sample(self._dummy_batch) + is_dummy_batch = True + else: + if self._dummy_batch == "DUMMY": + self._dummy_batch = sample + is_dummy_batch = False + + try: + _loss, sample_size, logging_output = self.task.valid_step( + sample, self.model, self.criterion + ) + except RuntimeError as e: + if "out of memory" in str(e): + self._log_oom(e) + if not raise_oom: + logger.warning( + "ran out of memory in validation step, retrying batch" + ) + for p in self.model.parameters(): + if p.grad is not None: + p.grad = None # free some memory + if self.cuda: + torch.cuda.empty_cache() + return self.valid_step(sample, raise_oom=True) + raise e + + logging_outputs = [logging_output] + if is_dummy_batch: + if torch.is_tensor(sample_size): + sample_size.zero_() + else: + sample_size *= 0.0 + + # gather logging outputs from all replicas + if self.data_parallel_world_size > 1: + logging_outputs, (sample_size,) = self._aggregate_logging_outputs( + logging_outputs, + sample_size, + ignore=is_dummy_batch, + ) + + # log validation stats + logging_output = self._reduce_and_log_stats(logging_outputs, sample_size) + + return logging_output + + def zero_grad(self): + self.optimizer.zero_grad() + + def lr_step_begin_epoch(self, epoch): + """Adjust the learning rate at the beginning of the epoch.""" + self.lr_scheduler.step_begin_epoch(epoch) + # prefer updating the LR based on the number of steps + return self.lr_step_update() + + def lr_step(self, epoch, val_loss=None): + """Adjust the learning rate at the end of the epoch.""" + self.lr_scheduler.step(epoch, val_loss) + # prefer updating the LR based on the number of steps + return self.lr_step_update() + + def lr_step_update(self): + """Update the learning rate after each update.""" + new_lr = self.lr_scheduler.step_update(self.get_num_updates()) + 
metrics.log_scalar("lr", new_lr, weight=0, priority=300) + return new_lr + + def get_lr(self): + """Get the current learning rate.""" + return self.optimizer.get_lr() + + def get_model(self): + """Get the (non-wrapped) model instance.""" + return self._model + + def get_criterion(self): + """Get the (non-wrapped) criterion instance.""" + return self._criterion + + def get_meter(self, name): + """[deprecated] Get a specific meter by name.""" + from fairseq import meters + + if "get_meter" not in self._warn_once: + self._warn_once.add("get_meter") + utils.deprecation_warning( + "Trainer.get_meter is deprecated. Please use fairseq.metrics instead." + ) + + train_meters = metrics.get_meters("train") + if train_meters is None: + train_meters = {} + + if name == "train_loss" and "loss" in train_meters: + return train_meters["loss"] + elif name == "train_nll_loss": + # support for legacy train.py, which assumed this meter is + # always initialized + m = train_meters.get("nll_loss", None) + return m or meters.AverageMeter() + elif name == "wall": + # support for legacy train.py, which assumed this meter is + # always initialized + m = metrics.get_meter("default", "wall") + return m or meters.TimeMeter() + elif name == "wps": + m = metrics.get_meter("train", "wps") + return m or meters.TimeMeter() + elif name in {"valid_loss", "valid_nll_loss"}: + # support for legacy train.py, which assumed these meters + # are always initialized + k = name[len("valid_") :] + m = metrics.get_meter("valid", k) + return m or meters.AverageMeter() + elif name == "oom": + return meters.AverageMeter() + elif name in train_meters: + return train_meters[name] + return None + + def get_num_updates(self): + """Get the number of parameters updates.""" + return self._num_updates + + def set_num_updates(self, num_updates): + """Set the number of parameters updates.""" + self._num_updates = num_updates + self.lr_step_update() + if self.quantizer: + self.quantizer.step_update(self._num_updates) + metrics.log_scalar("num_updates", self._num_updates, weight=0, priority=200) + + def clip_grad_norm(self, clip_norm): + return self.optimizer.clip_grad_norm(clip_norm, aggregate_norm_fn=None) + + def cumulative_training_time(self): + if self._cumulative_training_time is None: + # single GPU + return self._local_cumulative_training_time() + else: + return self._cumulative_training_time + + def _local_cumulative_training_time(self): + """Aggregate training time in seconds.""" + return time.time() - self._start_time + self._previous_training_time + + def _prepare_sample(self, sample): + if sample == "DUMMY": + raise Exception( + "Trying to use an uninitialized 'dummy' batch. This usually indicates " + "that the total number of batches is smaller than the number of " + "participating GPUs. Try reducing the batch size or using fewer GPUs." 
+ ) + + if sample is None or len(sample) == 0: + return None + + if self.cuda: + if self.pipeline_model_parallel: + if "target" in sample: + sample["target"] = utils.move_to_cuda( + sample["target"], device=self.last_device + ) + else: + sample = utils.move_to_cuda(sample) + + def apply_half(t): + if t.dtype is torch.float32: + return t.half() + return t + + def apply_bfloat16(t): + if t.dtype is torch.float32: + return t.to(dtype=torch.bfloat16) + return t + + if self.args.fp16: + sample = utils.apply_to_sample(apply_half, sample) + + if self.args.bf16: + sample = utils.apply_to_sample(apply_bfloat16, sample) + + return sample + + def _set_seed(self): + # Set seed based on args.seed and the update number so that we get + # reproducible results when resuming from checkpoints + seed = self.args.seed + self.get_num_updates() + utils.set_torch_seed(seed) + + def _sync_stats(self): + # Return True if it's using multiple GPUs and DDP or multiple GPUs with + # BMUF and it's a bmuf sync with warmup iterations completed before. + if self.data_parallel_world_size == 1: + return False + elif self.args.use_bmuf: + return (self.get_num_updates() + 1) % self.args.global_sync_iter == 0 and ( + self.get_num_updates() + 1 + ) > self.args.warmup_iterations + else: + return True + + def _log_oom(self, exc): + msg = "OOM: Ran out of memory with exception: {}".format(exc) + logger.warning(msg) + if torch.cuda.is_available() and hasattr(torch.cuda, "memory_summary"): + for device_idx in range(torch.cuda.device_count()): + logger.warning(torch.cuda.memory_summary(device=device_idx)) + sys.stderr.flush() + + def _aggregate_logging_outputs( + self, + logging_outputs: List[Dict[str, Any]], + *extra_stats_to_sum, + ignore=False, + ): + if self.task.__class__.logging_outputs_can_be_summed(self.get_criterion()): + return self._fast_stat_sync_sum( + logging_outputs, *extra_stats_to_sum, ignore=ignore + ) + else: + return self._all_gather_list_sync( + logging_outputs, *extra_stats_to_sum, ignore=ignore + ) + + def _all_gather_list_sync( + self, + logging_outputs: List[Dict[str, Any]], + *extra_stats_to_sum, + ignore=False, + ): + """ + Sync logging outputs across workers. all_gather_list_sync is + suitable when logging outputs are complex types. + """ + if self.tpu: + raise NotImplementedError + if ignore: + logging_outputs = [] + results = list( + zip( + *distributed_utils.all_gather_list( + [logging_outputs] + list(extra_stats_to_sum), + max_size=getattr(self.args, "all_gather_list_size", 16384), + group=self.data_parallel_process_group, + ) + ) + ) + logging_outputs, extra_stats_to_sum = results[0], results[1:] + logging_outputs = list(chain.from_iterable(logging_outputs)) + extra_stats_to_sum = [sum(s) for s in extra_stats_to_sum] + return logging_outputs, extra_stats_to_sum + + def _fast_stat_sync_sum( + self, + logging_outputs: List[Dict[str, Any]], + *extra_stats_to_sum, + ignore=False, + ): + """ + Sync logging outputs across workers. fast_stat_sync_sum is + faster than all_gather_list_sync, but is only suitable when + logging outputs are scalars and can be summed. Note that + *logging_outputs* cannot contain any nested dicts/lists. 
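+
+        For example (illustrative), outputs like ``{"loss": 1.5, "ntokens": 20}``
+        can be reduced with a single elementwise sum across workers, whereas
+        nested structures must fall back to _all_gather_list_sync.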
+ """ + data = {} + for i, stat in enumerate(extra_stats_to_sum): + data["extra_stats_" + str(i)] = stat + if len(logging_outputs) > 0: + log_keys = list(logging_outputs[0].keys()) + for k in log_keys: + if not ignore: + v = sum(log[k] for log in logging_outputs if k in log) + else: + v = logging_outputs[0][k] + v = torch.zeros_like(v) if torch.is_tensor(v) else 0 + data["logging_outputs_" + k] = v + else: + log_keys = None + + data = distributed_utils.all_reduce_dict( + data, device=self.device, group=self.data_parallel_process_group + ) + + extra_stats_to_sum = [ + data["extra_stats_" + str(i)] for i in range(len(extra_stats_to_sum)) + ] + if log_keys is not None: + logging_outputs = [{k: data["logging_outputs_" + k] for k in log_keys}] + else: + logging_outputs = [] + return logging_outputs, extra_stats_to_sum + + def _check_grad_norms(self, grad_norm): + """Check that grad norms are consistent across workers.""" + if self._grad_norm_buf is not None: + self._grad_norm_buf.zero_() + self._grad_norm_buf[self.data_parallel_rank] = grad_norm + distributed_utils.all_reduce( + self._grad_norm_buf, group=self.data_parallel_process_group + ) + + def is_consistent(tensor): + max_abs_diff = torch.max(torch.abs(tensor - tensor[0])) + return ( + torch.isfinite(tensor).all() + or (max_abs_diff / (tensor[0] + 1e-6) < 1e-6).all() + ) + + if not is_consistent(self._grad_norm_buf): + pretty_detail = "\n".join( + "rank {:3d} = {:.8f}".format(r, n) + for r, n in enumerate(self._grad_norm_buf.tolist()) + ) + error_detail = "grad_norm across the workers:\n{}\n".format( + pretty_detail + ) + # use FloatingPointError to trigger NanDetector + raise FloatingPointError( + "Fatal error: gradients are inconsistent between workers. " + "Try --ddp-backend=no_c10d. " + "Or are you mixing up different generation of GPUs in training?" 
+ + "\n" + + "-" * 80 + + "\n{}\n".format(error_detail) + + "-" * 80 + ) + + def _reduce_and_log_stats(self, logging_outputs, sample_size, grad_norm=None): + if grad_norm is not None: + metrics.log_speed("ups", 1.0, priority=100, round=2) + metrics.log_scalar("gnorm", grad_norm, priority=400, round=3) + if self.args.clip_norm > 0: + metrics.log_scalar( + "clip", + torch.where( + grad_norm > self.args.clip_norm, + grad_norm.new_tensor(100), + grad_norm.new_tensor(0), + ), + priority=500, + round=1, + ) + + with metrics.aggregate() as agg: + if logging_outputs is not None: + self.task.reduce_metrics(logging_outputs, self.get_criterion()) + del logging_outputs + + # extra warning for criterions that don't properly log a loss value + if "loss" not in agg: + if "loss" not in self._warn_once: + self._warn_once.add("loss") + logger.warning( + "Criterion.reduce_metrics did not log a 'loss' value, " + "which may break some functionality" + ) + metrics.log_scalar("loss", -1) + + # support legacy interface + if self.tpu: + logging_output = {} + else: + logging_output = agg.get_smoothed_values() + logging_output["sample_size"] = sample_size + for key_to_delete in ["ppl", "wps", "wpb", "bsz"]: + if key_to_delete in logging_output: + del logging_output[key_to_delete] + return logging_output + + def _check_xla_compilation(self): + import torch_xla.debug.metrics as met + + compile_stats = met.metric_data("CompileTime") + if compile_stats is None: + return + num_xla_compiles = compile_stats[0] + if num_xla_compiles > self._num_xla_compiles: + logger.warning( + "XLA compilation detected on device #{}; too many of these can lead " + "to slow training, but we expect a few in the beginning".format( + self.args.distributed_rank + ) + ) + self._num_xla_compiles = num_xla_compiles + + +def _catalog_shared_params(module, memo=None, prefix=""): + if memo is None: + first_call = True + memo = {} + else: + first_call = False + for name, param in module._parameters.items(): + param_prefix = prefix + ("." if prefix else "") + name + if param not in memo: + memo[param] = [] + memo[param].append(param_prefix) + for name, m in module._modules.items(): + if m is None: + continue + submodule_prefix = prefix + ("." if prefix else "") + name + _catalog_shared_params(m, memo, submodule_prefix) + if first_call: + return [x for x in memo.values() if len(x) > 1] + + +def _get_module_by_path(module, path): + path = path.split(".") + for name in path: + module = getattr(module, name) + return module + + +def _set_module_by_path(module, path, value): + path = path.split(".") + for name in path[:-1]: + module = getattr(module, name) + setattr(module, path[-1], value) diff --git a/fairseq-0.10.2/fairseq_cli/generate.py b/fairseq-0.10.2/fairseq_cli/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..8ddf981cc37dd0ec7e2464aefdda174726db8d7e --- /dev/null +++ b/fairseq-0.10.2/fairseq_cli/generate.py @@ -0,0 +1,383 @@ +#!/usr/bin/env python3 -u +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +""" +Translate pre-processed data with a trained model. 
+""" + +import ast +import logging +import math +import os +import sys +from itertools import chain + +import numpy as np +import torch +from fairseq import checkpoint_utils, options, scoring, tasks, utils +from fairseq.logging import progress_bar +from fairseq.logging.meters import StopwatchMeter, TimeMeter + + +def main(args): + assert args.path is not None, "--path required for generation!" + assert ( + not args.sampling or args.nbest == args.beam + ), "--sampling requires --nbest to be equal to --beam" + assert ( + args.replace_unk is None or args.dataset_impl == "raw" + ), "--replace-unk requires a raw text dataset (--dataset-impl=raw)" + + if args.results_path is not None: + os.makedirs(args.results_path, exist_ok=True) + output_path = os.path.join( + args.results_path, "generate-{}.txt".format(args.gen_subset) + ) + with open(output_path, "w", buffering=1, encoding="utf-8") as h: + return _main(args, h) + else: + return _main(args, sys.stdout) + + +def get_symbols_to_strip_from_output(generator): + if hasattr(generator, "symbols_to_strip_from_output"): + return generator.symbols_to_strip_from_output + else: + return {generator.eos} + + +def _main(args, output_file): + logging.basicConfig( + format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=os.environ.get("LOGLEVEL", "INFO").upper(), + stream=output_file, + ) + logger = logging.getLogger("fairseq_cli.generate") + + utils.import_user_module(args) + + if args.max_tokens is None and args.batch_size is None: + args.max_tokens = 12000 + logger.info(args) + + # Fix seed for stochastic decoding + if args.seed is not None and not args.no_seed_provided: + np.random.seed(args.seed) + utils.set_torch_seed(args.seed) + + use_cuda = torch.cuda.is_available() and not args.cpu + + # Load dataset splits + task = tasks.setup_task(args) + task.load_dataset(args.gen_subset) + + # Set dictionaries + try: + src_dict = getattr(task, "source_dictionary", None) + except NotImplementedError: + src_dict = None + tgt_dict = task.target_dictionary + + overrides = ast.literal_eval(args.model_overrides) + + # Load ensemble + logger.info("loading model(s) from {}".format(args.path)) + models, _model_args = checkpoint_utils.load_model_ensemble( + utils.split_paths(args.path), + arg_overrides=overrides, + task=task, + suffix=getattr(args, "checkpoint_suffix", ""), + strict=(args.checkpoint_shard_count == 1), + num_shards=args.checkpoint_shard_count, + ) + + if args.lm_path is not None: + overrides["data"] = args.data + + try: + lms, _ = checkpoint_utils.load_model_ensemble( + [args.lm_path], + arg_overrides=overrides, + task=None, + ) + except: + logger.warning( + f"Failed to load language model! 
Please make sure that the language model dict is the same " + f"as target dict and is located in the data dir ({args.data})" + ) + raise + + assert len(lms) == 1 + else: + lms = [None] + + # Optimize ensemble for generation + for model in chain(models, lms): + if model is None: + continue + if args.fp16: + model.half() + if use_cuda and not args.pipeline_model_parallel: + model.cuda() + model.prepare_for_inference_(args) + + # Load alignment dictionary for unknown word replacement + # (None if no unknown word replacement, empty if no path to align dictionary) + align_dict = utils.load_align_dict(args.replace_unk) + + # Load dataset (possibly sharded) + itr = task.get_batch_iterator( + dataset=task.dataset(args.gen_subset), + max_tokens=args.max_tokens, + max_sentences=args.batch_size, + max_positions=utils.resolve_max_positions( + task.max_positions(), *[model.max_positions() for model in models] + ), + ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, + required_batch_size_multiple=args.required_batch_size_multiple, + num_shards=args.num_shards, + shard_id=args.shard_id, + num_workers=args.num_workers, + data_buffer_size=args.data_buffer_size, + ).next_epoch_itr(shuffle=False) + progress = progress_bar.progress_bar( + itr, + log_format=args.log_format, + log_interval=args.log_interval, + default_log_format=("tqdm" if not args.no_progress_bar else "none"), + ) + + # Initialize generator + gen_timer = StopwatchMeter() + + extra_gen_cls_kwargs = {"lm_model": lms[0], "lm_weight": args.lm_weight} + generator = task.build_generator( + models, args, extra_gen_cls_kwargs=extra_gen_cls_kwargs + ) + + # Handle tokenization and BPE + tokenizer = task.build_tokenizer(args) + bpe = task.build_bpe(args) + + def decode_fn(x): + if bpe is not None: + x = bpe.decode(x) + if tokenizer is not None: + x = tokenizer.decode(x) + return x + + scorer = scoring.build_scorer(args, tgt_dict) + + num_sentences = 0 + has_target = True + wps_meter = TimeMeter() + for sample in progress: + sample = utils.move_to_cuda(sample) if use_cuda else sample + if "net_input" not in sample: + continue + + prefix_tokens = None + if args.prefix_size > 0: + prefix_tokens = sample["target"][:, : args.prefix_size] + + constraints = None + if "constraints" in sample: + constraints = sample["constraints"] + + gen_timer.start() + hypos = task.inference_step( + generator, + models, + sample, + prefix_tokens=prefix_tokens, + constraints=constraints, + ) + num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) + gen_timer.stop(num_generated_tokens) + + for i, sample_id in enumerate(sample["id"].tolist()): + has_target = sample["target"] is not None + + # Remove padding + if "src_tokens" in sample["net_input"]: + src_tokens = utils.strip_pad( + sample["net_input"]["src_tokens"][i, :], tgt_dict.pad() + ) + else: + src_tokens = None + + target_tokens = None + if has_target: + target_tokens = ( + utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu() + ) + + # Either retrieve the original sentences or regenerate them from tokens. 
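+            # (align_dict is only set when --replace-unk is given, which per the
+            # assertion in main() requires --dataset-impl=raw, so the original
+            # text can be read back; otherwise strings are rebuilt from token ids.)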
+ if align_dict is not None: + src_str = task.dataset(args.gen_subset).src.get_original_text(sample_id) + target_str = task.dataset(args.gen_subset).tgt.get_original_text( + sample_id + ) + else: + if src_dict is not None: + src_str = src_dict.string(src_tokens, args.remove_bpe) + else: + src_str = "" + if has_target: + target_str = tgt_dict.string( + target_tokens, + args.remove_bpe, + escape_unk=True, + extra_symbols_to_ignore=get_symbols_to_strip_from_output( + generator + ), + ) + + src_str = decode_fn(src_str) + if has_target: + target_str = decode_fn(target_str) + + if not args.quiet: + if src_dict is not None: + print("S-{}\t{}".format(sample_id, src_str), file=output_file) + if has_target: + print("T-{}\t{}".format(sample_id, target_str), file=output_file) + + # Process top predictions + for j, hypo in enumerate(hypos[i][: args.nbest]): + hypo_tokens, hypo_str, alignment = utils.post_process_prediction( + hypo_tokens=hypo["tokens"].int().cpu(), + src_str=src_str, + alignment=hypo["alignment"], + align_dict=align_dict, + tgt_dict=tgt_dict, + remove_bpe=args.remove_bpe, + extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator), + ) + detok_hypo_str = decode_fn(hypo_str) + if not args.quiet: + score = hypo["score"] / math.log(2) # convert to base 2 + # original hypothesis (after tokenization and BPE) + print( + "H-{}\t{}\t{}".format(sample_id, score, hypo_str), + file=output_file, + ) + # detokenized hypothesis + print( + "D-{}\t{}\t{}".format(sample_id, score, detok_hypo_str), + file=output_file, + ) + print( + "P-{}\t{}".format( + sample_id, + " ".join( + map( + lambda x: "{:.4f}".format(x), + # convert from base e to base 2 + hypo["positional_scores"] + .div_(math.log(2)) + .tolist(), + ) + ), + ), + file=output_file, + ) + + if args.print_alignment: + print( + "A-{}\t{}".format( + sample_id, + " ".join( + [ + "{}-{}".format(src_idx, tgt_idx) + for src_idx, tgt_idx in alignment + ] + ), + ), + file=output_file, + ) + + if args.print_step: + print( + "I-{}\t{}".format(sample_id, hypo["steps"]), + file=output_file, + ) + + if getattr(args, "retain_iter_history", False): + for step, h in enumerate(hypo["history"]): + _, h_str, _ = utils.post_process_prediction( + hypo_tokens=h["tokens"].int().cpu(), + src_str=src_str, + alignment=None, + align_dict=None, + tgt_dict=tgt_dict, + remove_bpe=None, + ) + print( + "E-{}_{}\t{}".format(sample_id, step, h_str), + file=output_file, + ) + + # Score only the top hypothesis + if has_target and j == 0: + if align_dict is not None or args.remove_bpe is not None: + # Convert back to tokens for evaluation with unk replacement and/or without BPE + target_tokens = tgt_dict.encode_line( + target_str, add_if_not_exist=True + ) + hypo_tokens = tgt_dict.encode_line( + detok_hypo_str, add_if_not_exist=True + ) + if hasattr(scorer, "add_string"): + scorer.add_string(target_str, detok_hypo_str) + else: + scorer.add(target_tokens, hypo_tokens) + + wps_meter.update(num_generated_tokens) + progress.log({"wps": round(wps_meter.avg)}) + num_sentences += ( + sample["nsentences"] if "nsentences" in sample else sample["id"].numel() + ) + + logger.info("NOTE: hypothesis and token scores are output in base 2") + logger.info( + "Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)".format( + num_sentences, + gen_timer.n, + gen_timer.sum, + num_sentences / gen_timer.sum, + 1.0 / gen_timer.avg, + ) + ) + if has_target: + if args.bpe and not args.sacrebleu: + if args.remove_bpe: + logger.warning( + "BLEU score is being computed 
by splitting detokenized string on spaces, this is probably not what you want. Use --sacrebleu for standard 13a BLEU tokenization" + ) + else: + logger.warning( + "If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words. Use --sacrebleu for standard 13a BLEU tokenization" + ) + # use print to be consistent with other main outputs: S-, H-, T-, D- and so on + print( + "Generate {} with beam={}: {}".format( + args.gen_subset, args.beam, scorer.result_string() + ), + file=output_file, + ) + + return scorer + + +def cli_main(): + parser = options.get_generation_parser() + args = options.parse_args_and_arch(parser) + main(args) + + +if __name__ == "__main__": + cli_main() diff --git a/fairseq-0.10.2/tests/gpu/__init__.py b/fairseq-0.10.2/tests/gpu/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/fairseq-0.10.2/tests/gpu/test_binaries_gpu.py b/fairseq-0.10.2/tests/gpu/test_binaries_gpu.py new file mode 100644 index 0000000000000000000000000000000000000000..2ac60a09341746f3155ed4e876f6a93bfb8337cf --- /dev/null +++ b/fairseq-0.10.2/tests/gpu/test_binaries_gpu.py @@ -0,0 +1,305 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import contextlib +import logging +import os +import tempfile +import unittest +from io import StringIO + +import torch +from fairseq import options +from fairseq_cli import train +from tests.utils import ( + create_dummy_data, + generate_main, + preprocess_lm_data, + preprocess_translation_data, + train_translation_model, +) + + +class TestTranslationGPU(unittest.TestCase): + def setUp(self): + logging.disable(logging.CRITICAL) + + def tearDown(self): + logging.disable(logging.NOTSET) + + @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") + def test_fp16(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_fp16") as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model(data_dir, "fconv_iwslt_de_en", ["--fp16"]) + generate_main(data_dir) + + @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") + def test_memory_efficient_fp16(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_memory_efficient_fp16") as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model( + data_dir, "fconv_iwslt_de_en", ["--memory-efficient-fp16"] + ) + generate_main(data_dir) + + @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") + def test_transformer_fp16(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_transformer") as data_dir: + create_dummy_data(data_dir) + preprocess_translation_data(data_dir) + train_translation_model( + data_dir, + "transformer_iwslt_de_en", + [ + "--encoder-layers", + "2", + "--decoder-layers", + "2", + "--encoder-embed-dim", + "64", + "--decoder-embed-dim", + "64", + "--fp16", + ], + run_validation=True, + ) + generate_main(data_dir) + + @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") + def test_levenshtein_transformer(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory( + "test_levenshtein_transformer" + ) as data_dir: + create_dummy_data(data_dir) + 
preprocess_translation_data(data_dir, ["--joined-dictionary"]) + train_translation_model( + data_dir, + "levenshtein_transformer", + [ + "--apply-bert-init", + "--early-exit", + "6,6,6", + "--criterion", + "nat_loss", + ], + task="translation_lev", + ) + generate_main( + data_dir, + [ + "--task", + "translation_lev", + "--iter-decode-max-iter", + "9", + "--iter-decode-eos-penalty", + "0", + "--print-step", + ], + ) + + +def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=False): + train_parser = options.get_training_parser() + train_args = options.parse_args_and_arch( + train_parser, + [ + "--task", + "language_modeling", + data_dir, + "--arch", + arch, + "--optimizer", + "adam", + "--lr", + "0.0001", + "--criterion", + "adaptive_loss", + "--adaptive-softmax-cutoff", + "5,10,15", + "--max-tokens", + "500", + "--tokens-per-sample", + "500", + "--save-dir", + data_dir, + "--max-epoch", + "1", + "--no-progress-bar", + "--distributed-world-size", + "1", + "--ddp-backend", + "no_c10d", + "--num-workers", + "0", + ] + + (extra_flags or []), + ) + train.main(train_args) + + # try scalar quantization + scalar_quant_train_parser = options.get_training_parser() + scalar_quant_train_args = options.parse_args_and_arch( + scalar_quant_train_parser, + [ + "--task", + "language_modeling", + data_dir, + "--arch", + arch, + "--optimizer", + "adam", + "--lr", + "0.0001", + "--criterion", + "adaptive_loss", + "--adaptive-softmax-cutoff", + "5,10,15", + "--max-tokens", + "500", + "--tokens-per-sample", + "500", + "--save-dir", + data_dir, + "--max-update", + "3", + "--no-progress-bar", + "--distributed-world-size", + "1", + "--ddp-backend", + "no_c10d", + "--num-workers", + "0", + "--quant-noise-scalar", + "0.5", + ] + + (extra_flags or []), + ) + train.main(scalar_quant_train_args) + + # try iterative PQ quantization + quantize_parser = options.get_training_parser() + quantize_args = options.parse_args_and_arch( + quantize_parser, + [ + "--task", + "language_modeling", + data_dir, + "--arch", + arch, + "--optimizer", + "adam", + "--lr", + "0.0001", + "--criterion", + "adaptive_loss", + "--adaptive-softmax-cutoff", + "5,10,15", + "--max-tokens", + "50", + "--tokens-per-sample", + "50", + "--max-update", + "6", + "--no-progress-bar", + "--distributed-world-size", + "1", + "--ddp-backend", + "no_c10d", + "--num-workers", + "0", + "--restore-file", + os.path.join(data_dir, "checkpoint_last.pt"), + "--reset-optimizer", + "--quantization-config-path", + os.path.join( + os.path.dirname(__file__), "transformer_quantization_config.yaml" + ), + ] + + (extra_flags or []), + ) + train.main(quantize_args) + + +class TestQuantization(unittest.TestCase): + def setUp(self): + logging.disable(logging.CRITICAL) + + def tearDown(self): + logging.disable(logging.NOTSET) + + @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") + def test_quantization(self): + with contextlib.redirect_stdout(StringIO()): + with tempfile.TemporaryDirectory("test_quantization") as data_dir: + create_dummy_data(data_dir) + preprocess_lm_data(data_dir) + # tests both scalar and iterative PQ quantization + _quantize_language_model(data_dir, "transformer_lm") + + +class TestOptimizersGPU(unittest.TestCase): + def setUp(self): + logging.disable(logging.CRITICAL) + + def tearDown(self): + logging.disable(logging.NOTSET) + + @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") + def test_flat_grads(self): + with contextlib.redirect_stdout(StringIO()): + with 
tempfile.TemporaryDirectory("test_flat_grads") as data_dir: + # Use just a bit of data and tiny model to keep this test runtime reasonable + create_dummy_data(data_dir, num_examples=10, maxlen=5) + preprocess_translation_data(data_dir) + with self.assertRaises(RuntimeError): + # adafactor isn't compatible with flat grads, which + # are used by default with --fp16 + train_translation_model( + data_dir, + "lstm", + [ + "--required-batch-size-multiple", + "1", + "--encoder-layers", + "1", + "--encoder-hidden-size", + "32", + "--decoder-layers", + "1", + "--optimizer", + "adafactor", + "--fp16", + ], + ) + # but it should pass once we set --fp16-no-flatten-grads + train_translation_model( + data_dir, + "lstm", + [ + "--required-batch-size-multiple", + "1", + "--encoder-layers", + "1", + "--encoder-hidden-size", + "32", + "--decoder-layers", + "1", + "--optimizer", + "adafactor", + "--fp16", + "--fp16-no-flatten-grads", + ], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq-0.10.2/tests/gpu/transformer_quantization_config.yaml b/fairseq-0.10.2/tests/gpu/transformer_quantization_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de31d8116ced675b81eb74119642217d768e7736 --- /dev/null +++ b/fairseq-0.10.2/tests/gpu/transformer_quantization_config.yaml @@ -0,0 +1,28 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# This file defines example configuration arguments for quantizing +# a transformer model with product quantization + +n_centroids: + Linear: + key: in_features + value: {"*": 8} + Embedding: + key: embedding_dim + value: {"*": 8} + +block_sizes: + Linear: + key: fuzzy_name + value: {fc: 8, attn: 4, emb: 4} + Embedding: + key: fuzzy_name + value: {emb: 8} + +layers_to_quantize: + - decoder\\.layers\\.\d+\\.fc[12] + - decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01] + - decoder\\.layers\\.\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj) diff --git a/fairseq-0.10.2/tests/speech_recognition/test_collaters.py b/fairseq-0.10.2/tests/speech_recognition/test_collaters.py new file mode 100644 index 0000000000000000000000000000000000000000..6a5029a48faea2426d7a0277655a2c7c08c1d16c --- /dev/null +++ b/fairseq-0.10.2/tests/speech_recognition/test_collaters.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
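+"""
+Unit tests for the speech recognition Seq2SeqCollater: verifies batching,
+padding, length-based sorting and EOS handling on two tiny samples (a brief
+summary of the cases below).
+"""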
+
+import unittest
+
+import numpy as np
+import torch
+from examples.speech_recognition.data.collaters import Seq2SeqCollater
+
+
+class TestSeq2SeqCollator(unittest.TestCase):
+    def test_collate(self):
+        eos_idx = 1
+        pad_idx = 0
+        collater = Seq2SeqCollater(
+            feature_index=0, label_index=1, pad_index=pad_idx, eos_index=eos_idx
+        )
+
+        # 2 frames in the first sample and 3 frames in the second one
+        frames1 = np.array([[7, 8], [9, 10]])
+        frames2 = np.array([[1, 2], [3, 4], [5, 6]])
+        target1 = np.array([4, 2, 3, eos_idx])
+        target2 = np.array([3, 2, eos_idx])
+        sample1 = {"id": 0, "data": [frames1, target1]}
+        sample2 = {"id": 1, "data": [frames2, target2]}
+        batch = collater.collate([sample1, sample2])
+
+        # the collater sorts inputs by frame length before creating the batch
+        self.assertTensorEqual(batch["id"], torch.tensor([1, 0]))
+        self.assertEqual(batch["ntokens"], 7)
+        self.assertTensorEqual(
+            batch["net_input"]["src_tokens"],
+            torch.tensor(
+                [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [pad_idx, pad_idx]]]
+            ),
+        )
+        self.assertTensorEqual(
+            batch["net_input"]["prev_output_tokens"],
+            torch.tensor([[eos_idx, 3, 2, pad_idx], [eos_idx, 4, 2, 3]]),
+        )
+        self.assertTensorEqual(batch["net_input"]["src_lengths"], torch.tensor([3, 2]))
+        self.assertTensorEqual(
+            batch["target"],
+            torch.tensor([[3, 2, eos_idx, pad_idx], [4, 2, 3, eos_idx]]),
+        )
+        self.assertEqual(batch["nsentences"], 2)
+
+    def assertTensorEqual(self, t1, t2):
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        self.assertEqual(t1.ne(t2).long().sum(), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq-0.10.2/tests/speech_recognition/test_cross_entropy.py b/fairseq-0.10.2/tests/speech_recognition/test_cross_entropy.py
new file mode 100644
index 0000000000000000000000000000000000000000..b05400ed95e22762c3e3e5e8fd3ebfa6caf1e325
--- /dev/null
+++ b/fairseq-0.10.2/tests/speech_recognition/test_cross_entropy.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
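+"""
+Unit tests for CrossEntropyWithAccCriterion: checks the accuracy-related
+logging outputs ("correct", "total", "sample_size", "ntokens") on
+all-correct and all-wrong samples.
+"""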
+
+from examples.speech_recognition.criterions.cross_entropy_acc import (
+    CrossEntropyWithAccCriterion,
+)
+
+from .asr_test_base import CrossEntropyCriterionTestBase
+
+
+class CrossEntropyWithAccCriterionTest(CrossEntropyCriterionTestBase):
+    def setUp(self):
+        self.criterion_cls = CrossEntropyWithAccCriterion
+        super().setUp()
+
+    def test_cross_entropy_all_correct(self):
+        sample = self.get_test_sample(correct=True, soft_target=False, aggregate=False)
+        loss, sample_size, logging_output = self.criterion(
+            self.model, sample, "sum", log_probs=True
+        )
+        assert logging_output["correct"] == 20
+        assert logging_output["total"] == 20
+        assert logging_output["sample_size"] == 20
+        assert logging_output["ntokens"] == 20
+
+    def test_cross_entropy_all_wrong(self):
+        sample = self.get_test_sample(correct=False, soft_target=False, aggregate=False)
+        loss, sample_size, logging_output = self.criterion(
+            self.model, sample, "sum", log_probs=True
+        )
+        assert logging_output["correct"] == 0
+        assert logging_output["total"] == 20
+        assert logging_output["sample_size"] == 20
+        assert logging_output["ntokens"] == 20
diff --git a/fairseq-0.10.2/tests/speech_recognition/test_vggtransformer.py b/fairseq-0.10.2/tests/speech_recognition/test_vggtransformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4dc73b8c7379970dc0bcc16fcb088a64a1bd7e3b
--- /dev/null
+++ b/fairseq-0.10.2/tests/speech_recognition/test_vggtransformer.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+
+# import models/encoder/decoder to be tested
+from examples.speech_recognition.models.vggtransformer import (
+    TransformerDecoder,
+    VGGTransformerEncoder,
+    VGGTransformerModel,
+    vggtransformer_1,
+    vggtransformer_2,
+    vggtransformer_base,
+)
+
+# import base test class
+from .asr_test_base import (
+    DEFAULT_TEST_VOCAB_SIZE,
+    TestFairseqDecoderBase,
+    TestFairseqEncoderBase,
+    TestFairseqEncoderDecoderModelBase,
+    get_dummy_dictionary,
+    get_dummy_encoder_output,
+    get_dummy_input,
+)
+
+
+class VGGTransformerModelTest_mid(TestFairseqEncoderDecoderModelBase):
+    def setUp(self):
+        def override_config(args):
+            """
+            vggtransformer_1 uses 14 transformer layers, which is too
+            expensive for testing purposes. For a fast turn-around
+            test, reduce the number of layers to 3.
+            """
+            args.transformer_enc_config = (
+                "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3"
+            )
+
+        super().setUp()
+        extra_args_setter = [vggtransformer_1, override_config]
+
+        self.setUpModel(VGGTransformerModel, extra_args_setter)
+        self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE))
+
+
+class VGGTransformerModelTest_big(TestFairseqEncoderDecoderModelBase):
+    def setUp(self):
+        def override_config(args):
+            """
+            vggtransformer_2 uses 16 transformer layers, which is too
+            expensive for testing purposes. For a fast turn-around
+            test, reduce the number of layers to 3.
+            """
+            args.transformer_enc_config = (
+                "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3"
+            )
+
+        super().setUp()
+        extra_args_setter = [vggtransformer_2, override_config]
+
+        self.setUpModel(VGGTransformerModel, extra_args_setter)
+        self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE))
+
+
+class VGGTransformerModelTest_base(TestFairseqEncoderDecoderModelBase):
+    def setUp(self):
+        def override_config(args):
+            """
+            vggtransformer_base uses 12 transformer layers, which is too
+            expensive for testing purposes. For a fast turn-around
+            test, reduce the number of layers to 3.
+ """ + args.transformer_enc_config = ( + "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 3" + ) + + super().setUp() + extra_args_setter = [vggtransformer_base, override_config] + + self.setUpModel(VGGTransformerModel, extra_args_setter) + self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) + + +class VGGTransformerEncoderTest(TestFairseqEncoderBase): + def setUp(self): + super().setUp() + + self.setUpInput(get_dummy_input(T=50, D=80, B=5)) + + def test_forward(self): + print("1. test standard vggtransformer") + self.setUpEncoder(VGGTransformerEncoder(input_feat_per_channel=80)) + super().test_forward() + print("2. test vggtransformer with limited right context") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, transformer_context=(-1, 5) + ) + ) + super().test_forward() + print("3. test vggtransformer with limited left context") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, transformer_context=(5, -1) + ) + ) + super().test_forward() + print("4. test vggtransformer with limited right context and sampling") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, + transformer_context=(-1, 12), + transformer_sampling=(2, 2), + ) + ) + super().test_forward() + print("5. test vggtransformer with windowed context and sampling") + self.setUpEncoder( + VGGTransformerEncoder( + input_feat_per_channel=80, + transformer_context=(12, 12), + transformer_sampling=(2, 2), + ) + ) + + +class TransformerDecoderTest(TestFairseqDecoderBase): + def setUp(self): + super().setUp() + + dict = get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE) + decoder = TransformerDecoder(dict) + dummy_encoder_output = get_dummy_encoder_output(encoder_out_shape=(50, 5, 256)) + + self.setUpDecoder(decoder) + self.setUpInput(dummy_encoder_output) + self.setUpPrevOutputTokens() diff --git a/fairseq-0.10.2/tests/test_average_checkpoints.py b/fairseq-0.10.2/tests/test_average_checkpoints.py new file mode 100644 index 0000000000000000000000000000000000000000..f348b56b869372d8434fe03f13324d78e9093fa2 --- /dev/null +++ b/fairseq-0.10.2/tests/test_average_checkpoints.py @@ -0,0 +1,134 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+
+import collections
+import os
+import shutil
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+from scripts.average_checkpoints import average_checkpoints
+from torch import nn
+
+
+class ModelWithSharedParameter(nn.Module):
+    def __init__(self):
+        super(ModelWithSharedParameter, self).__init__()
+        self.embedding = nn.Embedding(1000, 200)
+        self.FC1 = nn.Linear(200, 200)
+        self.FC2 = nn.Linear(200, 200)
+        # tie weight in FC2 to FC1
+        self.FC2.weight = nn.Parameter(self.FC1.weight)
+        self.FC2.bias = nn.Parameter(self.FC1.bias)
+
+        self.relu = nn.ReLU()
+
+    def forward(self, input):
+        # FC2 shares its weight and bias with FC1, so both branches below
+        # exercise the tied parameters
+        return self.FC2(self.relu(self.FC1(input))) + self.FC1(input)
+
+
+class TestAverageCheckpoints(unittest.TestCase):
+    def test_average_checkpoints(self):
+        params_0 = collections.OrderedDict(
+            [
+                ("a", torch.DoubleTensor([100.0])),
+                ("b", torch.FloatTensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])),
+                ("c", torch.IntTensor([7, 8, 9])),
+            ]
+        )
+        params_1 = collections.OrderedDict(
+            [
+                ("a", torch.DoubleTensor([1.0])),
+                ("b", torch.FloatTensor([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])),
+                ("c", torch.IntTensor([2, 2, 2])),
+            ]
+        )
+        params_avg = collections.OrderedDict(
+            [
+                ("a", torch.DoubleTensor([50.5])),
+                ("b", torch.FloatTensor([[1.0, 1.5, 2.0], [2.5, 3.0, 3.5]])),
+                # We expect truncation for integer division
+                ("c", torch.IntTensor([4, 5, 5])),
+            ]
+        )
+
+        fd_0, path_0 = tempfile.mkstemp()
+        fd_1, path_1 = tempfile.mkstemp()
+        torch.save(collections.OrderedDict([("model", params_0)]), path_0)
+        torch.save(collections.OrderedDict([("model", params_1)]), path_1)
+
+        output = average_checkpoints([path_0, path_1])["model"]
+
+        os.close(fd_0)
+        os.remove(path_0)
+        os.close(fd_1)
+        os.remove(path_1)
+
+        for (k_expected, v_expected), (k_out, v_out) in zip(
+            params_avg.items(), output.items()
+        ):
+            self.assertEqual(
+                k_expected,
+                k_out,
+                "Key mismatch - expected {} but found {}. 
" + "(Expected list of keys: {} vs actual list of keys: {})".format( + k_expected, k_out, params_avg.keys(), output.keys() + ), + ) + np.testing.assert_allclose( + v_expected.numpy(), + v_out.numpy(), + err_msg="Tensor value mismatch for key {}".format(k_expected), + ) + + def test_average_checkpoints_with_shared_parameters(self): + def _construct_model_with_shared_parameters(path, value): + m = ModelWithSharedParameter() + nn.init.constant_(m.FC1.weight, value) + torch.save({"model": m.state_dict()}, path) + return m + + tmpdir = tempfile.mkdtemp() + paths = [] + path = os.path.join(tmpdir, "m1.pt") + m1 = _construct_model_with_shared_parameters(path, 1.0) + paths.append(path) + + path = os.path.join(tmpdir, "m2.pt") + m2 = _construct_model_with_shared_parameters(path, 2.0) + paths.append(path) + + path = os.path.join(tmpdir, "m3.pt") + m3 = _construct_model_with_shared_parameters(path, 3.0) + paths.append(path) + + new_model = average_checkpoints(paths) + self.assertTrue( + torch.equal( + new_model["model"]["embedding.weight"], + (m1.embedding.weight + m2.embedding.weight + m3.embedding.weight) / 3.0, + ) + ) + + self.assertTrue( + torch.equal( + new_model["model"]["FC1.weight"], + (m1.FC1.weight + m2.FC1.weight + m3.FC1.weight) / 3.0, + ) + ) + + self.assertTrue( + torch.equal( + new_model["model"]["FC2.weight"], + (m1.FC2.weight + m2.FC2.weight + m3.FC2.weight) / 3.0, + ) + ) + shutil.rmtree(tmpdir) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq-0.10.2/tests/test_bmuf.py b/fairseq-0.10.2/tests/test_bmuf.py new file mode 100644 index 0000000000000000000000000000000000000000..0165b2955bdf64903717c2bd27580c83d3cf8e21 --- /dev/null +++ b/fairseq-0.10.2/tests/test_bmuf.py @@ -0,0 +1,172 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +import random +import unittest +from multiprocessing import Manager + +import torch +import torch.nn as nn +from fairseq import distributed_utils, optim + + +class Model(nn.Module): + def __init__(self, input_size, output_size): + super(Model, self).__init__() + self.fc = nn.Linear(input_size, output_size) + + def forward(self, input): + output = self.fc(input) + return output + + +def setup_model_loss_criterion(args, rank, is_cuda): + """ + setup model, criterion and optimizer based on input args + """ + args.distributed_rank = rank + if args.distributed_world_size > 1: + distributed_utils.distributed_init(args) + torch.manual_seed(1) + model = Model(args.input_size, args.nb_classes) + loss_fn = nn.CrossEntropyLoss() + if is_cuda: + model = model.cuda() + loss_fn = loss_fn.cuda() + + optimizer = optim.sgd.SGD(args, model.parameters()) + optimizer = optim.FairseqBMUF(args, optimizer) + + return model, loss_fn, optimizer + + +def train_step(input, target, model, loss_fn, optimizer, **unused): + """Do forward, backward and parameter update.""" + model.train() + output = model(input) + loss = loss_fn(output, target) + optimizer.backward(loss) + optimizer.step() + + +def single_gpu_training(args, rank, iterations, shared_results): + + is_cuda = torch.cuda.is_available() + if is_cuda: + torch.cuda.set_device(rank) + + model, loss_fn, optimizer = setup_model_loss_criterion(args, rank, is_cuda) + + for _ in range(iterations): + input = torch.randn(1, args.input_size) + target = torch.empty(args.batch_size, dtype=torch.long).random_(args.nb_classes) + + if is_cuda: + input = input.cuda() + target = target.cuda() + train_step(input, target, model, loss_fn, optimizer) + + results = [] + for param in model.parameters(): + if len(results) == 0: + results = param.flatten().cpu().data + else: + results = torch.cat((results, param.flatten().cpu().data), 0) + + shared_results[rank] = results + + +def setup_args(): + args = argparse.Namespace() + args.global_sync_iter = 20 + args.block_momentum = 0.875 + args.block_lr = 0.5 + args.input_size = 5 + args.nb_classes = 2 + args.batch_size = 1 + args.lr = [1e-3] + args.momentum = 0 + args.weight_decay = 0 + args.warmup_iterations = 0 + args.use_nbm = True + args.average_sync = True + args.global_sync_iter = 1 + args.model_parallel_size = 1 + args.distributed_backend = "gloo" + + args.distributed_world_size = 2 + port = random.randint(10000, 20000) + args.distributed_init_method = "tcp://localhost:{port}".format(port=port) + args.distributed_init_host = "localhost" + args.distributed_port = port + 1 + args.local_world_size = args.distributed_world_size + return args + + +@unittest.skipIf(torch.cuda.device_count() < 2, "test requires 2 GPUs") +class TestBMUF(unittest.TestCase): + def bmuf_process(self, args, iterations): + processes = [] + results = Manager().dict() + ctx = torch.multiprocessing.get_context("spawn") + for rank in range(args.distributed_world_size): + p = ctx.Process( + target=single_gpu_training, args=(args, rank, iterations, results) + ) + p.start() + processes.append(p) + + for p in processes: + p.join() + return results + + def test_bmuf_sync(self): + # Train model for 1 iteration and do bmuf sync without doing warmup + args = setup_args() + iterations = 1 + results = self.bmuf_process(args, iterations) + # Make sure params in both machines are same + assert len(results) == 2 + self.assertAlmostEqual(results[0], results[1]) + + def test_warmup_sync(self): + # Train model for 20 iteration and do warmup sync without doing 
bmuf sync + args = setup_args() + args.warmup_iterations = 20 + iterations = 20 + results = self.bmuf_process(args, iterations) + # Make sure params in both machines are same + assert len(results) == 2 + self.assertAlmostEqual(results[0], results[1]) + + def test_warmup_sync_bmuf_sync(self): + # Train model for 25 iteration and do warmup sync after 20 iteration + # and bmuf sync after 25 iteration + args = setup_args() + args.warmup_iterations = 20 + args.global_sync_iter = 5 + iterations = 25 + results = self.bmuf_process(args, iterations) + # Make sure params in both machines are same + assert len(results) == 2 + self.assertAlmostEqual(results[0], results[1]) + + def test_single_gpu_bmuf(self): + # Train model for 5 iterations and use GPU 1 + args = setup_args() + args.distributed_world_size = 1 + args.warmup_iterations = 5 + iterations = 20 + results = self.bmuf_process(args, iterations) + assert len(results) == 1 + + def assertAlmostEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertLess((t1 - t2).abs().max(), 1e-4) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq-0.10.2/tests/test_label_smoothing.py b/fairseq-0.10.2/tests/test_label_smoothing.py new file mode 100644 index 0000000000000000000000000000000000000000..04c0f974ac80f7606327f868e948712c3c18f1d0 --- /dev/null +++ b/fairseq-0.10.2/tests/test_label_smoothing.py @@ -0,0 +1,123 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import copy +import unittest + +import tests.utils as test_utils +import torch +from fairseq.criterions.cross_entropy import CrossEntropyCriterion +from fairseq.criterions.label_smoothed_cross_entropy import ( + LabelSmoothedCrossEntropyCriterion, +) + + +class TestLabelSmoothing(unittest.TestCase): + def setUp(self): + # build dictionary + self.d = test_utils.dummy_dictionary(3) + vocab = len(self.d) + self.assertEqual(vocab, 4 + 3) # 4 special + 3 tokens + self.assertEqual(self.d.pad(), 1) + self.assertEqual(self.d.eos(), 2) + self.assertEqual(self.d.unk(), 3) + pad, eos, unk, w1, w2, w3 = 1, 2, 3, 4, 5, 6 # noqa: F841 + + # build dataset + self.data = [ + # the first batch item has padding + { + "source": torch.LongTensor([w1, eos]), + "target": torch.LongTensor([w1, eos]), + }, + { + "source": torch.LongTensor([w1, eos]), + "target": torch.LongTensor([w1, w1, eos]), + }, + ] + self.sample = next(test_utils.dummy_dataloader(self.data)) + + # build model + self.args = argparse.Namespace() + self.args.sentence_avg = False + self.args.report_accuracy = False + self.args.probs = ( + torch.FloatTensor( + [ + # pad eos unk w1 w2 w3 + [0.05, 0.05, 0.1, 0.05, 0.3, 0.4, 0.05], + [0.05, 0.10, 0.2, 0.05, 0.2, 0.3, 0.10], + [0.05, 0.15, 0.3, 0.05, 0.1, 0.2, 0.15], + ] + ) + .unsqueeze(0) + .expand(2, 3, 7) + ) # add batch dimension + self.task = test_utils.TestTranslationTask.setup_task(self.args, self.d, self.d) + self.model = self.task.build_model(self.args) + + def test_nll_loss(self): + self.args.label_smoothing = 0.1 + nll_crit = CrossEntropyCriterion.build_criterion(self.args, self.task) + smooth_crit = LabelSmoothedCrossEntropyCriterion.build_criterion( + self.args, self.task + ) + nll_loss, nll_sample_size, nll_logging_output = nll_crit( + self.model, self.sample + ) + smooth_loss, smooth_sample_size, smooth_logging_output = smooth_crit( + self.model, self.sample + ) + 
        self.assertLess(abs(nll_loss - nll_logging_output["loss"]), 1e-6)
+        self.assertLess(abs(nll_loss - smooth_logging_output["nll_loss"]), 1e-6)
+
+    def test_padding(self):
+        self.args.label_smoothing = 0.1
+        crit = LabelSmoothedCrossEntropyCriterion.build_criterion(self.args, self.task)
+        loss, _, logging_output = crit(self.model, self.sample)
+
+        def get_one_no_padding(idx):
+            # create a new sample with just a single batch item so that there's
+            # no padding
+            sample1 = next(test_utils.dummy_dataloader([self.data[idx]]))
+            args1 = copy.copy(self.args)
+            args1.probs = args1.probs[idx, :, :].unsqueeze(0)
+            model1 = self.task.build_model(args1)
+            loss1, _, _ = crit(model1, sample1)
+            return loss1
+
+        loss1 = get_one_no_padding(0)
+        loss2 = get_one_no_padding(1)
+        self.assertAlmostEqual(loss, loss1 + loss2)
+
+    def test_reduction(self):
+        self.args.label_smoothing = 0.1
+        crit = LabelSmoothedCrossEntropyCriterion.build_criterion(self.args, self.task)
+        loss, _, logging_output = crit(self.model, self.sample, reduce=True)
+        unreduced_loss, _, _ = crit(self.model, self.sample, reduce=False)
+        self.assertAlmostEqual(loss, unreduced_loss.sum())
+
+    def test_zero_eps(self):
+        self.args.label_smoothing = 0.0
+        nll_crit = CrossEntropyCriterion.build_criterion(self.args, self.task)
+        smooth_crit = LabelSmoothedCrossEntropyCriterion.build_criterion(
+            self.args, self.task
+        )
+        nll_loss, nll_sample_size, nll_logging_output = nll_crit(
+            self.model, self.sample
+        )
+        smooth_loss, smooth_sample_size, smooth_logging_output = smooth_crit(
+            self.model, self.sample
+        )
+        self.assertAlmostEqual(nll_loss, smooth_loss)
+
+    def assertAlmostEqual(self, t1, t2):
+        self.assertEqual(t1.size(), t2.size(), "size mismatch")
+        self.assertLess((t1 - t2).abs().max(), 1e-6)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/fairseq-0.10.2/tests/test_lstm_jitable.py b/fairseq-0.10.2/tests/test_lstm_jitable.py
new file mode 100644
index 0000000000000000000000000000000000000000..38f79d17931c32447e96c0fbae2630ac397e1804
--- /dev/null
+++ b/fairseq-0.10.2/tests/test_lstm_jitable.py
@@ -0,0 +1,115 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import tempfile
+import unittest
+
+import torch
+from fairseq.data.dictionary import Dictionary
+from fairseq.models.lstm import LSTMModel
+from fairseq.tasks.fairseq_task import LegacyFairseqTask
+
+
+DEFAULT_TEST_VOCAB_SIZE = 100
+
+
+class DummyTask(LegacyFairseqTask):
+    def __init__(self, args):
+        super().__init__(args)
+        self.dictionary = get_dummy_dictionary()
+        if getattr(self.args, "ctc", False):
+            self.dictionary.add_symbol("<ctc_blank>")  # assumed CTC blank symbol
+        self.src_dict = self.dictionary
+        self.tgt_dict = self.dictionary
+
+    @property
+    def source_dictionary(self):
+        return self.src_dict
+
+    @property
+    def target_dictionary(self):
+        return self.dictionary
+
+
+def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE):
+    dummy_dict = Dictionary()
+    # add dummy symbol to satisfy vocab size
+    for id, _ in enumerate(range(vocab_size)):
+        dummy_dict.add_symbol("{}".format(id), 1000)
+    return dummy_dict
+
+
+def get_dummy_task_and_parser():
+    """
+    To build a fairseq model we need a dummy parser and task; this function
+    creates them to facilitate model/criterion tests.
+
+    Note: we use DummyTask as the dummy task.
You may want + to use other task by providing another function + """ + parser = argparse.ArgumentParser( + description="test_dummy_s2s_task", argument_default=argparse.SUPPRESS + ) + DummyTask.add_args(parser) + args = parser.parse_args([]) + task = DummyTask.setup_task(args) + return task, parser + + +class TestJitLSTMModel(unittest.TestCase): + def _test_save_and_load(self, scripted_module): + with tempfile.NamedTemporaryFile() as f: + scripted_module.save(f.name) + torch.jit.load(f.name) + + def assertTensorEqual(self, t1, t2): + t1 = t1[~torch.isnan(t1)] # can cause size mismatch errors if there are NaNs + t2 = t2[~torch.isnan(t2)] + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertEqual(t1.ne(t2).long().sum(), 0) + + def test_jit_and_export_lstm(self): + task, parser = get_dummy_task_and_parser() + LSTMModel.add_args(parser) + args = parser.parse_args([]) + args.criterion = "" + model = LSTMModel.build_model(args, task) + scripted_model = torch.jit.script(model) + self._test_save_and_load(scripted_model) + + def test_assert_jit_vs_nonjit_(self): + task, parser = get_dummy_task_and_parser() + LSTMModel.add_args(parser) + args = parser.parse_args([]) + args.criterion = "" + model = LSTMModel.build_model(args, task) + model.eval() + scripted_model = torch.jit.script(model) + scripted_model.eval() + idx = len(task.source_dictionary) + iter = 100 + # Inject random input and check output + seq_len_tensor = torch.randint(1, 10, (iter,)) + num_samples_tensor = torch.randint(1, 10, (iter,)) + for i in range(iter): + seq_len = seq_len_tensor[i] + num_samples = num_samples_tensor[i] + src_token = (torch.randint(0, idx, (num_samples, seq_len)),) + src_lengths = torch.randint(1, seq_len + 1, (num_samples,)) + src_lengths, _ = torch.sort(src_lengths, descending=True) + # Force the first sample to have seq_len + src_lengths[0] = seq_len + prev_output_token = (torch.randint(0, idx, (num_samples, 1)),) + result = model(src_token[0], src_lengths, prev_output_token[0], None) + scripted_result = scripted_model( + src_token[0], src_lengths, prev_output_token[0], None + ) + self.assertTensorEqual(result[0], scripted_result[0]) + self.assertTensorEqual(result[1], scripted_result[1]) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq-0.10.2/tests/test_noising.py b/fairseq-0.10.2/tests/test_noising.py new file mode 100644 index 0000000000000000000000000000000000000000..b3d0d123c42eaca6f79371aa268049e668fcfcce --- /dev/null +++ b/fairseq-0.10.2/tests/test_noising.py @@ -0,0 +1,530 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Dict, List + +import tests.utils as test_utils +import torch +from fairseq import utils +from fairseq.data import ( + Dictionary, + LanguagePairDataset, + TransformEosDataset, + data_utils, + noising, +) + + +class TestDataNoising(unittest.TestCase): + def _get_test_data_with_bpe_cont_marker(self, append_eos=True): + """ + Args: + append_eos: if True, each input sentence in the source tokens tensor + will have an EOS appended to the end. + + Returns: + vocabs: BPE vocab with continuation markers as suffixes to denote + non-end of word tokens. This is the standard BPE format used in + fairseq's preprocessing. + x: input tensor containing numberized source tokens, with EOS at the + end if append_eos is true + src_lengths: and source lengths. 
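+
+        (For instance, "hello" is represented here as the subwords
+        "he@@" + "llo"; the "@@" suffix marks a non-final subword.)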
+ """ + vocab = Dictionary() + vocab.add_symbol("he@@") + vocab.add_symbol("llo") + vocab.add_symbol("how") + vocab.add_symbol("are") + vocab.add_symbol("y@@") + vocab.add_symbol("ou") + vocab.add_symbol("n@@") + vocab.add_symbol("ew") + vocab.add_symbol("or@@") + vocab.add_symbol("k") + + src_tokens = [ + ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"], + ["how", "are", "y@@", "ou"], + ] + x, src_lengths = x, src_lengths = self._convert_src_tokens_to_tensor( + vocab=vocab, src_tokens=src_tokens, append_eos=append_eos + ) + return vocab, x, src_lengths + + def _get_test_data_with_bpe_end_marker(self, append_eos=True): + """ + Args: + append_eos: if True, each input sentence in the source tokens tensor + will have an EOS appended to the end. + + Returns: + vocabs: BPE vocab with end-of-word markers as suffixes to denote + tokens at the end of a word. This is an alternative to fairseq's + standard preprocessing framework and is not generally supported + within fairseq. + x: input tensor containing numberized source tokens, with EOS at the + end if append_eos is true + src_lengths: and source lengths. + """ + vocab = Dictionary() + vocab.add_symbol("he") + vocab.add_symbol("llo_EOW") + vocab.add_symbol("how_EOW") + vocab.add_symbol("are_EOW") + vocab.add_symbol("y") + vocab.add_symbol("ou_EOW") + vocab.add_symbol("n") + vocab.add_symbol("ew_EOW") + vocab.add_symbol("or") + vocab.add_symbol("k_EOW") + + src_tokens = [ + ["he", "llo_EOW", "n", "ew_EOW", "y", "or", "k_EOW"], + ["how_EOW", "are_EOW", "y", "ou_EOW"], + ] + x, src_lengths = x, src_lengths = self._convert_src_tokens_to_tensor( + vocab=vocab, src_tokens=src_tokens, append_eos=append_eos + ) + return vocab, x, src_lengths + + def _get_test_data_with_word_vocab(self, append_eos=True): + """ + Args: + append_eos: if True, each input sentence in the source tokens tensor + will have an EOS appended to the end. + + Returns: + vocabs: word vocab + x: input tensor containing numberized source tokens, with EOS at the + end if append_eos is true + src_lengths: and source lengths. 
+ """ + vocab = Dictionary() + + vocab.add_symbol("hello") + vocab.add_symbol("how") + vocab.add_symbol("are") + vocab.add_symbol("you") + vocab.add_symbol("new") + vocab.add_symbol("york") + src_tokens = [ + ["hello", "new", "york", "you"], + ["how", "are", "you", "new", "york"], + ] + x, src_lengths = self._convert_src_tokens_to_tensor( + vocab=vocab, src_tokens=src_tokens, append_eos=append_eos + ) + return vocab, x, src_lengths + + def _convert_src_tokens_to_tensor( + self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool + ): + src_len = [len(x) for x in src_tokens] + # If we have to append EOS, we include EOS in counting src length + if append_eos: + src_len = [length + 1 for length in src_len] + + x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad()) + for i in range(len(src_tokens)): + for j in range(len(src_tokens[i])): + x[i][j] = vocab.index(src_tokens[i][j]) + if append_eos: + x[i][j + 1] = vocab.eos() + + x = x.transpose(1, 0) + return x, torch.LongTensor(src_len) + + def assert_eos_at_end(self, x, x_len, eos): + """Asserts last token of every sentence in x is EOS """ + for i in range(len(x_len)): + self.assertEqual( + x[x_len[i] - 1][i], + eos, + ( + "Expected eos (token id {eos}) at the end of sentence {i} " + "but got {other} instead" + ).format(i=i, eos=eos, other=x[i][-1]), + ) + + def assert_word_dropout_correct(self, x, x_noised, x_len, l_noised): + # Expect only the first word (2 bpe tokens) of the first example + # was dropped out + self.assertEqual(x_len[0] - 2, l_noised[0]) + for i in range(l_noised[0]): + self.assertEqual(x_noised[i][0], x[i + 2][0]) + + def test_word_dropout_with_eos(self): + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True) + + with data_utils.numpy_seed(1234): + noising_gen = noising.WordDropout(vocab) + x_noised, l_noised = noising_gen.noising(x, x_len, 0.2) + self.assert_word_dropout_correct( + x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised + ) + self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) + + def assert_word_blanking_correct(self, x, x_noised, x_len, l_noised, unk): + # Expect only the first word (2 bpe tokens) of the first example + # was blanked out + self.assertEqual(x_len[0], l_noised[0]) + for i in range(l_noised[0]): + if i < 2: + self.assertEqual(x_noised[i][0], unk) + else: + self.assertEqual(x_noised[i][0], x[i][0]) + + def test_word_blank_with_eos(self): + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True) + + with data_utils.numpy_seed(1234): + noising_gen = noising.WordDropout(vocab) + x_noised, l_noised = noising_gen.noising(x, x_len, 0.2, vocab.unk()) + self.assert_word_blanking_correct( + x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised, unk=vocab.unk() + ) + self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) + + def generate_unchanged_shuffle_map(self, length): + return {i: i for i in range(length)} + + def assert_word_shuffle_matches_expected( + self, + x, + x_len, + max_shuffle_distance: int, + vocab: Dictionary, + expected_shufle_maps: List[Dict[int, int]], + expect_eos_at_end: bool, + bpe_end_marker=None, + ): + """ + This verifies that with a given x, x_len, max_shuffle_distance, and + vocab, we get the expected shuffle result. 
+ + Args: + x: Tensor of shape (T x B) = (sequence_length, batch_size) + x_len: Tensor of length B = batch_size + max_shuffle_distance: arg to pass to noising + expected_shuffle_maps: List[mapping] where mapping is a + Dict[old_index, new_index], mapping x's elements from their + old positions in x to their new positions in x. + expect_eos_at_end: if True, check the output to make sure there is + an EOS at the end. + bpe_end_marker: str denoting the BPE end token. If this is not None, we + set the BPE cont token to None in the noising classes. + """ + bpe_cont_marker = None + if bpe_end_marker is None: + bpe_cont_marker = "@@" + + with data_utils.numpy_seed(1234): + word_shuffle = noising.WordShuffle( + vocab, bpe_cont_marker=bpe_cont_marker, bpe_end_marker=bpe_end_marker + ) + x_noised, l_noised = word_shuffle.noising( + x, x_len, max_shuffle_distance=max_shuffle_distance + ) + + # For every example, we have a different expected shuffle map. We check + # that each example is shuffled as expected according to each + # corresponding shuffle map. + for i in range(len(expected_shufle_maps)): + shuffle_map = expected_shufle_maps[i] + for k, v in shuffle_map.items(): + self.assertEqual(x[k][i], x_noised[v][i]) + + # Shuffling should not affect the length of each example + for pre_shuffle_length, post_shuffle_length in zip(x_len, l_noised): + self.assertEqual(pre_shuffle_length, post_shuffle_length) + if expect_eos_at_end: + self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) + + def test_word_shuffle_with_eos(self): + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True) + + # Assert word shuffle with max shuffle distance 0 causes input to be + # unchanged + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + max_shuffle_distance=0, + vocab=vocab, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(example_len) + for example_len in x_len + ], + expect_eos_at_end=True, + ) + + # Assert word shuffle with max shuffle distance 3 matches our expected + # shuffle order + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + vocab=vocab, + max_shuffle_distance=3, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(x_len[0]), + {0: 0, 1: 3, 2: 1, 3: 2}, + ], + expect_eos_at_end=True, + ) + + def test_word_shuffle_with_eos_nonbpe(self): + """The purpose of this is to test shuffling logic with word vocabs""" + vocab, x, x_len = self._get_test_data_with_word_vocab(append_eos=True) + + # Assert word shuffle with max shuffle distance 0 causes input to be + # unchanged + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + max_shuffle_distance=0, + vocab=vocab, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(example_len) + for example_len in x_len + ], + expect_eos_at_end=True, + ) + + # Assert word shuffle with max shuffle distance 3 matches our expected + # shuffle order + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + vocab=vocab, + max_shuffle_distance=3, + expected_shufle_maps=[ + {0: 0, 1: 1, 2: 3, 3: 2}, + {0: 0, 1: 2, 2: 1, 3: 3, 4: 4}, + ], + expect_eos_at_end=True, + ) + + def test_word_shuffle_without_eos(self): + """Same result as word shuffle with eos except no EOS at end""" + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False) + + # Assert word shuffle with max shuffle distance 0 causes input to be + # unchanged + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + max_shuffle_distance=0, + vocab=vocab, + 
expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(example_len) + for example_len in x_len + ], + expect_eos_at_end=False, + ) + + # Assert word shuffle with max shuffle distance 3 matches our expected + # shuffle order + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + vocab=vocab, + max_shuffle_distance=3, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(x_len[0]), + {0: 0, 1: 3, 2: 1, 3: 2}, + ], + expect_eos_at_end=False, + ) + + def test_word_shuffle_without_eos_with_bpe_end_marker(self): + """Same result as word shuffle without eos except using BPE end token""" + vocab, x, x_len = self._get_test_data_with_bpe_end_marker(append_eos=False) + + # Assert word shuffle with max shuffle distance 0 causes input to be + # unchanged + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + max_shuffle_distance=0, + vocab=vocab, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(example_len) + for example_len in x_len + ], + expect_eos_at_end=False, + bpe_end_marker="_EOW", + ) + + # Assert word shuffle with max shuffle distance 3 matches our expected + # shuffle order + self.assert_word_shuffle_matches_expected( + x=x, + x_len=x_len, + vocab=vocab, + max_shuffle_distance=3, + expected_shufle_maps=[ + self.generate_unchanged_shuffle_map(x_len[0]), + {0: 0, 1: 3, 2: 1, 3: 2}, + ], + expect_eos_at_end=False, + bpe_end_marker="_EOW", + ) + + def assert_no_eos_at_end(self, x, x_len, eos): + """Asserts that the last token of each sentence in x is not EOS """ + for i in range(len(x_len)): + self.assertNotEqual( + x[x_len[i] - 1][i], + eos, + "Expected no eos (token id {eos}) at the end of sentence {i}.".format( + eos=eos, i=i + ), + ) + + def test_word_dropout_without_eos(self): + """Same result as word dropout with eos except no EOS at end""" + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False) + + with data_utils.numpy_seed(1234): + noising_gen = noising.WordDropout(vocab) + x_noised, l_noised = noising_gen.noising(x, x_len, 0.2) + self.assert_word_dropout_correct( + x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised + ) + self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) + + def test_word_blank_without_eos(self): + """Same result as word blank with eos except no EOS at end""" + vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False) + + with data_utils.numpy_seed(1234): + noising_gen = noising.WordDropout(vocab) + x_noised, l_noised = noising_gen.noising(x, x_len, 0.2, vocab.unk()) + self.assert_word_blanking_correct( + x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised, unk=vocab.unk() + ) + self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos()) + + def _get_noising_dataset_batch( + self, + src_tokens_no_pad, + src_dict, + append_eos_to_tgt=False, + ): + """ + Constructs a NoisingDataset and the corresponding + ``LanguagePairDataset(NoisingDataset(src), src)``. If + *append_eos_to_tgt* is True, wrap the source dataset in + :class:`TransformEosDataset` to append EOS to the clean source when + using it as the target. 
+ """ + src_dataset = test_utils.TestDataset(data=src_tokens_no_pad) + + noising_dataset = noising.NoisingDataset( + src_dataset=src_dataset, + src_dict=src_dict, + seed=1234, + max_word_shuffle_distance=3, + word_dropout_prob=0.2, + word_blanking_prob=0.2, + noising_class=noising.UnsupervisedMTNoising, + ) + tgt = src_dataset + language_pair_dataset = LanguagePairDataset( + src=noising_dataset, tgt=tgt, src_sizes=None, src_dict=src_dict + ) + language_pair_dataset = TransformEosDataset( + language_pair_dataset, + src_dict.eos(), + append_eos_to_tgt=append_eos_to_tgt, + ) + + dataloader = torch.utils.data.DataLoader( + dataset=language_pair_dataset, + batch_size=2, + collate_fn=language_pair_dataset.collater, + ) + denoising_batch_result = next(iter(dataloader)) + return denoising_batch_result + + def test_noising_dataset_with_eos(self): + src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker( + append_eos=True + ) + + # Format data for src_dataset + src_tokens = torch.t(src_tokens) + src_tokens_no_pad = [] + for src_sentence in src_tokens: + src_tokens_no_pad.append( + utils.strip_pad(tensor=src_sentence, pad=src_dict.pad()) + ) + denoising_batch_result = self._get_noising_dataset_batch( + src_tokens_no_pad=src_tokens_no_pad, src_dict=src_dict + ) + + eos, pad = src_dict.eos(), src_dict.pad() + + # Generated noisy source as source + expected_src = torch.LongTensor( + [[4, 5, 10, 11, 8, 12, 13, eos], [pad, pad, pad, 6, 8, 9, 7, eos]] + ) + # Original clean source as target (right-padded) + expected_tgt = torch.LongTensor( + [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]] + ) + generated_src = denoising_batch_result["net_input"]["src_tokens"] + tgt_tokens = denoising_batch_result["target"] + + self.assertTensorEqual(expected_src, generated_src) + self.assertTensorEqual(expected_tgt, tgt_tokens) + + def test_noising_dataset_without_eos(self): + """ + Similar to test noising dataset with eos except that we have to set + *append_eos_to_tgt* to ``True``. 
+ """ + + src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker( + append_eos=False + ) + + # Format data for src_dataset + src_tokens = torch.t(src_tokens) + src_tokens_no_pad = [] + for src_sentence in src_tokens: + src_tokens_no_pad.append( + utils.strip_pad(tensor=src_sentence, pad=src_dict.pad()) + ) + denoising_batch_result = self._get_noising_dataset_batch( + src_tokens_no_pad=src_tokens_no_pad, + src_dict=src_dict, + append_eos_to_tgt=True, + ) + + eos, pad = src_dict.eos(), src_dict.pad() + + # Generated noisy source as source + expected_src = torch.LongTensor( + [[4, 5, 10, 11, 8, 12, 13], [pad, pad, pad, 6, 8, 9, 7]] + ) + # Original clean source as target (right-padded) + expected_tgt = torch.LongTensor( + [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]] + ) + + generated_src = denoising_batch_result["net_input"]["src_tokens"] + tgt_tokens = denoising_batch_result["target"] + + self.assertTensorEqual(expected_src, generated_src) + self.assertTensorEqual(expected_tgt, tgt_tokens) + + def assertTensorEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertEqual(t1.ne(t2).long().sum(), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq-0.10.2/tests/test_sparse_multihead_attention.py b/fairseq-0.10.2/tests/test_sparse_multihead_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..3e32b25a7fb1e12295b84d0c65064f8e42b7bdd3 --- /dev/null +++ b/fairseq-0.10.2/tests/test_sparse_multihead_attention.py @@ -0,0 +1,114 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from fairseq.modules.sparse_multihead_attention import SparseMultiheadAttention + + +class TestSparseMultiheadAttention(unittest.TestCase): + def test_sparse_multihead_attention(self): + attn_weights = torch.randn(1, 8, 8) + bidirectional_sparse_mask = torch.tensor( + [ + [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0], + [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0], + [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0], + [0, 0, 0, 0, 0, float("-inf"), float("-inf"), 0], + [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0], + [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0], + [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0], + [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0], + ] + ) + + bidirectional_attention = SparseMultiheadAttention( + 16, 1, stride=4, expressivity=1, is_bidirectional=True + ) + bidirectional_attention_sparse_mask = ( + bidirectional_attention.buffered_sparse_mask(attn_weights, 8, 8) + ) + torch.all( + torch.eq(bidirectional_attention_sparse_mask, bidirectional_sparse_mask) + ) + + sparse_mask = torch.tensor( + [ + [ + 0, + float("-inf"), + float("-inf"), + float("-inf"), + float("-inf"), + float("-inf"), + float("-inf"), + float("-inf"), + ], + [ + 0, + 0, + float("-inf"), + float("-inf"), + float("-inf"), + float("-inf"), + float("-inf"), + float("-inf"), + ], + [ + 0, + 0, + 0, + float("-inf"), + float("-inf"), + float("-inf"), + float("-inf"), + float("-inf"), + ], + [ + 0, + 0, + 0, + 0, + float("-inf"), + float("-inf"), + float("-inf"), + float("-inf"), + ], + [0, 0, 0, 0, 0, float("-inf"), float("-inf"), float("-inf")], + [ + float("-inf"), + float("-inf"), + float("-inf"), + 0, + 0, + 0, + float("-inf"), + float("-inf"), + ], + [ + float("-inf"), + float("-inf"), + 
float("-inf"), + 0, + 0, + 0, + 0, + float("-inf"), + ], + [float("-inf"), float("-inf"), float("-inf"), 0, 0, 0, 0, 0], + ] + ) + + attention = SparseMultiheadAttention( + 16, 1, stride=4, expressivity=1, is_bidirectional=False + ) + attention_sparse_mask = attention.buffered_sparse_mask(attn_weights, 8, 8) + + torch.all(torch.eq(attention_sparse_mask, sparse_mask)) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq-0.10.2/tests/test_utils.py b/fairseq-0.10.2/tests/test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..79195903e0f34372a24fa50312a6e00170c14471 --- /dev/null +++ b/fairseq-0.10.2/tests/test_utils.py @@ -0,0 +1,114 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from fairseq import utils + + +class TestUtils(unittest.TestCase): + def test_convert_padding_direction(self): + pad = 1 + left_pad = torch.LongTensor( + [ + [2, 3, 4, 5, 6], + [1, 7, 8, 9, 10], + [1, 1, 1, 11, 12], + ] + ) + right_pad = torch.LongTensor( + [ + [2, 3, 4, 5, 6], + [7, 8, 9, 10, 1], + [11, 12, 1, 1, 1], + ] + ) + + self.assertAlmostEqual( + right_pad, + utils.convert_padding_direction( + left_pad, + pad, + left_to_right=True, + ), + ) + self.assertAlmostEqual( + left_pad, + utils.convert_padding_direction( + right_pad, + pad, + right_to_left=True, + ), + ) + + def test_make_positions(self): + pad = 1 + left_pad_input = torch.LongTensor( + [ + [9, 9, 9, 9, 9], + [1, 9, 9, 9, 9], + [1, 1, 1, 9, 9], + ] + ) + left_pad_output = torch.LongTensor( + [ + [2, 3, 4, 5, 6], + [1, 2, 3, 4, 5], + [1, 1, 1, 2, 3], + ] + ) + right_pad_input = torch.LongTensor( + [ + [9, 9, 9, 9, 9], + [9, 9, 9, 9, 1], + [9, 9, 1, 1, 1], + ] + ) + right_pad_output = torch.LongTensor( + [ + [2, 3, 4, 5, 6], + [2, 3, 4, 5, 1], + [2, 3, 1, 1, 1], + ] + ) + + self.assertAlmostEqual( + left_pad_output, + utils.make_positions(left_pad_input, pad), + ) + self.assertAlmostEqual( + right_pad_output, + utils.make_positions(right_pad_input, pad), + ) + + def test_clip_grad_norm_(self): + params = torch.nn.Parameter(torch.zeros(5)).requires_grad_(False) + grad_norm = utils.clip_grad_norm_(params, 1.0) + self.assertTrue(torch.is_tensor(grad_norm)) + self.assertEqual(grad_norm, 0.0) + + params = [torch.nn.Parameter(torch.zeros(5)) for i in range(3)] + for p in params: + p.grad = torch.full((5,), fill_value=2.0) + grad_norm = utils.clip_grad_norm_(params, 1.0) + exp_grad_norm = torch.full((15,), fill_value=2.0).norm() + self.assertTrue(torch.is_tensor(grad_norm)) + self.assertEqual(grad_norm, exp_grad_norm) + + grad_norm = utils.clip_grad_norm_(params, 1.0) + self.assertAlmostEqual(grad_norm, torch.tensor(1.0)) + + def test_resolve_max_positions_with_tuple(self): + resolved = utils.resolve_max_positions(None, (2000, 100, 2000), 12000) + self.assertEqual(resolved, (2000, 100, 2000)) + + def assertAlmostEqual(self, t1, t2): + self.assertEqual(t1.size(), t2.size(), "size mismatch") + self.assertLess(utils.item((t1 - t2).abs().max()), 1e-4) + + +if __name__ == "__main__": + unittest.main() diff --git a/fairseq-0.10.2/tests/utils.py b/fairseq-0.10.2/tests/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..91feca6b2acae86bfeb327709c8746792356da9f --- /dev/null +++ b/fairseq-0.10.2/tests/utils.py @@ -0,0 +1,608 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import os +import random +import sys +from io import StringIO + +import torch +import torch.nn.functional as F +from fairseq import options, utils +from fairseq.data import Dictionary +from fairseq.data.language_pair_dataset import collate +from fairseq.models import ( + FairseqEncoder, + FairseqEncoderDecoderModel, + FairseqIncrementalDecoder, +) +from fairseq.models.fairseq_encoder import EncoderOut +from fairseq.tasks import FairseqTask, LegacyFairseqTask +from fairseq_cli import generate, interactive, preprocess, train, validate + + +def dummy_dictionary(vocab_size, prefix="token_"): + d = Dictionary() + for i in range(vocab_size): + token = prefix + str(i) + d.add_symbol(token) + d.finalize(padding_factor=1) # don't add extra padding symbols + return d + + +def dummy_dataloader( + samples, + padding_idx=1, + eos_idx=2, + batch_size=None, +): + if batch_size is None: + batch_size = len(samples) + + # add any missing data to samples + for i, sample in enumerate(samples): + if "id" not in sample: + sample["id"] = i + + # create dataloader + dataset = TestDataset(samples) + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size, + collate_fn=(lambda samples: collate(samples, padding_idx, eos_idx)), + ) + return iter(dataloader) + + +def sequence_generator_setup(): + # construct dummy dictionary + d = dummy_dictionary(vocab_size=2) + + eos = d.eos() + w1 = 4 + w2 = 5 + + # construct source data + src_tokens = torch.LongTensor([[w1, w2, eos], [w1, w2, eos]]) + src_lengths = torch.LongTensor([2, 2]) + + args = argparse.Namespace() + unk = 0.0 + args.beam_probs = [ + # step 0: + torch.FloatTensor( + [ + # eos w1 w2 + # sentence 1: + [0.0, unk, 0.9, 0.1], # beam 1 + [0.0, unk, 0.9, 0.1], # beam 2 + # sentence 2: + [0.0, unk, 0.7, 0.3], + [0.0, unk, 0.7, 0.3], + ] + ), + # step 1: + torch.FloatTensor( + [ + # eos w1 w2 prefix + # sentence 1: + [1.0, unk, 0.0, 0.0], # w1: 0.9 (emit: w1 : 0.9*1.0) + [0.0, unk, 0.9, 0.1], # w2: 0.1 + # sentence 2: + [0.25, unk, 0.35, 0.4], # w1: 0.7 (don't emit: w1 : 0.7*0.25) + [0.00, unk, 0.10, 0.9], # w2: 0.3 + ] + ), + # step 2: + torch.FloatTensor( + [ + # eos w1 w2 prefix + # sentence 1: + [0.0, unk, 0.1, 0.9], # w2 w1: 0.1*0.9 + [ + 0.6, + unk, + 0.2, + 0.2, + ], # w2 w2: 0.1*0.1 (emit: w2 w2 : 0.1*0.1*0.6) + # sentence 2: + [ + 0.60, + unk, + 0.4, + 0.00, + ], # w1 w2: 0.7*0.4 (emit: w1 w2 : 0.7*0.4*0.6) + [0.01, unk, 0.0, 0.99], # w2 w2: 0.3*0.9 + ] + ), + # step 3: + torch.FloatTensor( + [ + # eos w1 w2 prefix + # sentence 1: + [ + 1.0, + unk, + 0.0, + 0.0, + ], # w2 w1 w2: 0.1*0.9*0.9 (emit: w2 w1 w2 : 0.1*0.9*0.9*1.0) + [ + 1.0, + unk, + 0.0, + 0.0, + ], # w2 w1 w1: 0.1*0.9*0.1 (emit: w2 w1 w1 : 0.1*0.9*0.1*1.0) + # sentence 2: + [ + 0.1, + unk, + 0.5, + 0.4, + ], # w2 w2 w2: 0.3*0.9*0.99 (emit: w2 w2 w2 : 0.3*0.9*0.99*0.1) + [ + 1.0, + unk, + 0.0, + 0.0, + ], # w1 w2 w1: 0.7*0.4*0.4 (emit: w1 w2 w1 : 0.7*0.4*0.4*1.0) + ] + ), + ] + + task = TestTranslationTask.setup_task(args, d, d) + model = task.build_model(args) + tgt_dict = task.target_dictionary + + return tgt_dict, w1, w2, src_tokens, src_lengths, model + + +def create_dummy_data(data_dir, num_examples=100, maxlen=20, alignment=False): + def _create_dummy_data(filename): + data = torch.rand(num_examples * maxlen) + data = 97 + torch.floor(26 * data).int() + with open(os.path.join(data_dir, filename), "w") as h: + offset = 0 + 
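+            # each example is a random-length run of lowercase letters
+            # ('a'..'z', i.e. ASCII codes 97-122) written as space-separated tokens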
for _ in range(num_examples): + ex_len = random.randint(1, maxlen) + ex_str = " ".join(map(chr, data[offset : offset + ex_len])) + print(ex_str, file=h) + offset += ex_len + + def _create_dummy_alignment_data(filename_src, filename_tgt, filename): + with open(os.path.join(data_dir, filename_src), "r") as src_f, open( + os.path.join(data_dir, filename_tgt), "r" + ) as tgt_f, open(os.path.join(data_dir, filename), "w") as h: + for src, tgt in zip(src_f, tgt_f): + src_len = len(src.split()) + tgt_len = len(tgt.split()) + avg_len = (src_len + tgt_len) // 2 + num_alignments = random.randint(avg_len // 2, 2 * avg_len) + src_indices = torch.floor(torch.rand(num_alignments) * src_len).int() + tgt_indices = torch.floor(torch.rand(num_alignments) * tgt_len).int() + ex_str = " ".join( + [ + "{}-{}".format(src, tgt) + for src, tgt in zip(src_indices, tgt_indices) + ] + ) + print(ex_str, file=h) + + _create_dummy_data("train.in") + _create_dummy_data("train.out") + _create_dummy_data("valid.in") + _create_dummy_data("valid.out") + _create_dummy_data("test.in") + _create_dummy_data("test.out") + + if alignment: + _create_dummy_alignment_data("train.in", "train.out", "train.align") + _create_dummy_alignment_data("valid.in", "valid.out", "valid.align") + _create_dummy_alignment_data("test.in", "test.out", "test.align") + + +def preprocess_lm_data(data_dir): + preprocess_parser = options.get_preprocessing_parser() + preprocess_args = preprocess_parser.parse_args( + [ + "--only-source", + "--trainpref", + os.path.join(data_dir, "train.out"), + "--validpref", + os.path.join(data_dir, "valid.out"), + "--testpref", + os.path.join(data_dir, "test.out"), + "--destdir", + data_dir, + ] + ) + preprocess.main(preprocess_args) + + +def preprocess_translation_data(data_dir, extra_flags=None): + preprocess_parser = options.get_preprocessing_parser() + preprocess_args = preprocess_parser.parse_args( + [ + "--source-lang", + "in", + "--target-lang", + "out", + "--trainpref", + os.path.join(data_dir, "train"), + "--validpref", + os.path.join(data_dir, "valid"), + "--testpref", + os.path.join(data_dir, "test"), + "--thresholdtgt", + "0", + "--thresholdsrc", + "0", + "--destdir", + data_dir, + ] + + (extra_flags or []), + ) + preprocess.main(preprocess_args) + + +def preprocess_summarization_data(data_dir, extra_flags=None): + preprocess_parser = options.get_preprocessing_parser() + preprocess_args = preprocess_parser.parse_args( + [ + "--source-lang", + "in", + "--target-lang", + "out", + "--trainpref", + os.path.join(data_dir, "train"), + "--validpref", + os.path.join(data_dir, "valid"), + "--testpref", + os.path.join(data_dir, "test"), + "--thresholdtgt", + "0", + "--thresholdsrc", + "0", + "--joined-dictionary", + "--destdir", + data_dir, + ] + + (extra_flags or []), + ) + preprocess.main(preprocess_args) + + +def train_translation_model( + data_dir, + arch, + extra_flags=None, + task="translation", + run_validation=False, + lang_flags=None, + extra_valid_flags=None, +): + if lang_flags is None: + lang_flags = [ + "--source-lang", + "in", + "--target-lang", + "out", + ] + train_parser = options.get_training_parser() + train_args = options.parse_args_and_arch( + train_parser, + [ + "--task", + task, + data_dir, + "--save-dir", + data_dir, + "--arch", + arch, + "--optimizer", + "nag", + "--lr", + "0.05", + "--max-tokens", + "500", + "--max-epoch", + "1", + "--no-progress-bar", + "--distributed-world-size", + "1", + "--num-workers", + "0", + ] + + lang_flags + + (extra_flags or []), + ) + train.main(train_args) + + if 
run_validation: + # test validation + validate_parser = options.get_validation_parser() + validate_args = options.parse_args_and_arch( + validate_parser, + [ + "--task", + task, + data_dir, + "--path", + os.path.join(data_dir, "checkpoint_last.pt"), + "--valid-subset", + "valid", + "--max-tokens", + "500", + "--no-progress-bar", + "--num-workers", + "0", + ] + + lang_flags + + (extra_valid_flags or []), + ) + validate.main(validate_args) + + +def generate_main(data_dir, extra_flags=None): + if extra_flags is None: + extra_flags = [ + "--print-alignment", + ] + generate_parser = options.get_generation_parser() + generate_args = options.parse_args_and_arch( + generate_parser, + [ + data_dir, + "--path", + os.path.join(data_dir, "checkpoint_last.pt"), + "--beam", + "3", + "--batch-size", + "64", + "--max-len-b", + "5", + "--gen-subset", + "valid", + "--no-progress-bar", + "--num-workers", + "0", + ] + + (extra_flags or []), + ) + + # evaluate model in batch mode + generate.main(generate_args) + + # evaluate model interactively + generate_args.buffer_size = 0 + generate_args.input = "-" + generate_args.batch_size = None + orig_stdin = sys.stdin + sys.stdin = StringIO("h e l l o\n") + interactive.main(generate_args) + sys.stdin = orig_stdin + + +class TestDataset(torch.utils.data.Dataset): + def __init__(self, data): + super().__init__() + self.data = data + self.sizes = None + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return len(self.data) + + +class TestTranslationTask(LegacyFairseqTask): + def __init__(self, args, src_dict, tgt_dict, model): + super().__init__(args) + self.src_dict = src_dict + self.tgt_dict = tgt_dict + self.model = model + + @classmethod + def setup_task(cls, args, src_dict=None, tgt_dict=None, model=None): + return cls(args, src_dict, tgt_dict, model) + + def build_model(self, args): + return TestModel.build_model(args, self) + + @property + def source_dictionary(self): + return self.src_dict + + @property + def target_dictionary(self): + return self.tgt_dict + + +class TestModel(FairseqEncoderDecoderModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @classmethod + def build_model(cls, args, task): + encoder = TestEncoder(args, task.source_dictionary) + decoder = TestIncrementalDecoder(args, task.target_dictionary) + return cls(encoder, decoder) + + +class TestEncoder(FairseqEncoder): + def __init__(self, args, dictionary): + super().__init__(dictionary) + self.args = args + + def forward(self, src_tokens, src_lengths=None, **kwargs): + return EncoderOut( + encoder_out=src_tokens, + encoder_padding_mask=None, + encoder_embedding=None, + encoder_states=None, + src_tokens=None, + src_lengths=None, + ) + + def reorder_encoder_out(self, encoder_out, new_order): + return EncoderOut( + encoder_out=encoder_out.encoder_out.index_select(0, new_order), + encoder_padding_mask=None, + encoder_embedding=None, + encoder_states=None, + src_tokens=None, + src_lengths=None, + ) + + +class TestIncrementalDecoder(FairseqIncrementalDecoder): + def __init__(self, args, dictionary): + super().__init__(dictionary) + assert hasattr(args, "beam_probs") or hasattr(args, "probs") + args.max_decoder_positions = getattr(args, "max_decoder_positions", 100) + self.args = args + + def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): + if incremental_state is not None: + prev_output_tokens = prev_output_tokens[:, -1:] + bbsz = prev_output_tokens.size(0) + vocab = len(self.dictionary) + src_len = 
encoder_out.encoder_out.size(1) + tgt_len = prev_output_tokens.size(1) + + # determine number of steps + if incremental_state is not None: + # cache step number + step = utils.get_incremental_state(self, incremental_state, "step") + if step is None: + step = 0 + utils.set_incremental_state(self, incremental_state, "step", step + 1) + steps = [step] + else: + steps = list(range(tgt_len)) + + # define output in terms of raw probs + if hasattr(self.args, "probs"): + assert ( + self.args.probs.dim() == 3 + ), "expected probs to have size bsz*steps*vocab" + probs = self.args.probs.index_select(1, torch.LongTensor(steps)) + else: + probs = torch.FloatTensor(bbsz, len(steps), vocab).zero_() + for i, step in enumerate(steps): + # args.beam_probs gives the probability for every vocab element, + # starting with eos, then unknown, and then the rest of the vocab + if step < len(self.args.beam_probs): + probs[:, i, self.dictionary.eos() :] = self.args.beam_probs[step] + else: + probs[:, i, self.dictionary.eos()] = 1.0 + + # random attention + attn = torch.rand(bbsz, tgt_len, src_len) + + dev = prev_output_tokens.device + return probs.to(dev), {"attn": [attn.to(dev)]} + + def get_normalized_probs(self, net_output, log_probs, _): + # the decoder returns probabilities directly + probs = net_output[0] + if log_probs: + return probs.log() + else: + return probs + + def max_positions(self): + return self.args.max_decoder_positions + + +class TestReshapingEncoder(FairseqEncoder): + def __init__(self, args, dictionary): + super().__init__(dictionary) + self.args = args + + def forward(self, src_tokens, src_lengths=None, **kwargs): + b_sz, t_sz = src_tokens.shape + padding_needed = t_sz % 2 + x = src_tokens + if padding_needed > 0: + padding_needed = 2 - padding_needed + x = F.pad(x, (0, padding_needed)) + + return EncoderOut( + encoder_out=x.view(b_sz, -1, 2), + encoder_padding_mask=None, + encoder_embedding=None, + encoder_states=None, + src_tokens=None, + src_lengths=None, + ) + + def reorder_encoder_out(self, encoder_out, new_order): + return EncoderOut( + encoder_out=encoder_out.encoder_out.index_select(0, new_order), + encoder_padding_mask=None, + encoder_embedding=None, + encoder_states=None, + src_tokens=None, + src_lengths=None, + ) + + +class TestReshapingModel(FairseqEncoderDecoderModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @classmethod + def build_model(cls, args, task): + encoder = TestReshapingEncoder(args, task.source_dictionary) + decoder = TestIncrementalDecoder(args, task.target_dictionary) + return cls(encoder, decoder) + + +class TestAdditionalInputEncoder(FairseqEncoder): + def __init__(self, args, dictionary): + super().__init__(dictionary) + self.args = args + + def forward(self, src_tokens, src_lengths=None, **kwargs): + assert "fancy_other_input" in kwargs + assert kwargs["fancy_other_input"] is not None + return EncoderOut( + encoder_out=src_tokens, + encoder_padding_mask=None, + encoder_embedding=None, + encoder_states=None, + src_tokens=None, + src_lengths=None, + ) + + def reorder_encoder_out(self, encoder_out, new_order): + return EncoderOut( + encoder_out=encoder_out.encoder_out.index_select(0, new_order), + encoder_padding_mask=None, + encoder_embedding=None, + encoder_states=None, + src_tokens=None, + src_lengths=None, + ) + + +class TestAdditionalInputModel(FairseqEncoderDecoderModel): + def __init__(self, encoder, decoder): + super().__init__(encoder, decoder) + + @classmethod + def build_model(cls, args, task): + encoder = 
TestAdditionalInputEncoder(args, task.source_dictionary) + decoder = TestIncrementalDecoder(args, task.target_dictionary) + return cls(encoder, decoder) + + def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): + encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) + decoder_out = self.decoder( + prev_output_tokens, encoder_out=encoder_out, **kwargs + ) + return decoder_out diff --git a/hf.py b/hf.py new file mode 100644 index 0000000000000000000000000000000000000000..d3b206200a895c2bacb6cfd25eb607b7f2173c33 --- /dev/null +++ b/hf.py @@ -0,0 +1,59 @@ +from huggingface_hub import HfApi +api = HfApi() +# api.create_repo(repo_id="sleepyhead111/vllm_fairseq", repo_type="model") + +api.upload_large_folder( + repo_id="sleepyhead111/vllm_fairseq", + repo_type="model", + folder_path="/data/luoyingfeng/xzq-threshold", +) + +# api.upload_folder( +# folder_path="/data/luoyingfeng/xzq-threshold/data", +# path_in_repo="vllm_fairseq/data", +# repo_id="sleepyhead111/vllm_fairseq", +# repo_type="model", +# ) +# api.upload_folder( +# folder_path="/data/luoyingfeng/xzq-threshold/fairseq-0.10.2", +# path_in_repo="vllm_fairseq/fairseq-0.10.2", +# repo_id="sleepyhead111/vllm_fairseq", +# repo_type="model", +# ) +# api.upload_folder( +# folder_path="/data/luoyingfeng/xzq-threshold/mosesdecoder", +# path_in_repo="vllm_fairseq/mosesdecoder", +# repo_id="sleepyhead111/vllm_fairseq", +# repo_type="model", +# ) +# api.upload_folder( +# folder_path="/data/luoyingfeng/xzq-threshold/subword-nmt", +# path_in_repo="vllm_fairseq/subword-nmt", +# repo_id="sleepyhead111/vllm_fairseq", +# repo_type="model", +# ) +# api.upload_folder( +# folder_path="/data/luoyingfeng/xzq-threshold/wmt22-comet-da", +# path_in_repo="vllm_fairseq/wmt22-comet-da", +# repo_id="sleepyhead111/vllm_fairseq", +# repo_type="model", +# ) +# api.upload_folder( +# folder_path="/data/luoyingfeng/xzq-threshold/scripts", +# path_in_repo="vllm_fairseq/scripts", +# repo_id="sleepyhead111/vllm_fairseq", +# repo_type="model", +# ) +# api.upload_folder( +# folder_path="/data/luoyingfeng/xzq-threshold/FacebookAI", +# path_in_repo="vllm_fairseq/FacebookAI", +# repo_id="sleepyhead111/vllm_fairseq", +# repo_type="model", +# ) + +# api.upload_file( +# path_or_fileobj="/data/luoyingfeng/xzq-threshold/scripts/run.sh", +# path_in_repo="vllm_fairseq/scripts/run.sh", +# repo_id="sleepyhead111/vllm_fairseq", +# repo_type="model", +# )
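+
+# The reverse direction is available via huggingface_hub's snapshot_download.
+# A minimal sketch, left commented out like the alternatives above; the
+# local_dir destination here is an assumed path, not part of the original
+# workflow:
+# from huggingface_hub import snapshot_download
+# snapshot_download(
+#     repo_id="sleepyhead111/vllm_fairseq",
+#     repo_type="model",
+#     local_dir="/data/luoyingfeng/xzq-threshold",
+# )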