sleepyhead111 committed on
Commit
fdc723d
·
verified ·
1 Parent(s): fbd4c5c

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. fairseq-0.10.2/examples/camembert/README.md +75 -0
  2. fairseq-0.10.2/examples/joint_alignment_translation/README.md +89 -0
  3. fairseq-0.10.2/examples/joint_alignment_translation/prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh +118 -0
  4. fairseq-0.10.2/examples/language_model/README.adaptive_inputs.md +39 -0
  5. fairseq-0.10.2/examples/language_model/README.conv.md +40 -0
  6. fairseq-0.10.2/examples/language_model/README.md +123 -0
  7. fairseq-0.10.2/examples/language_model/prepare-wikitext-103.sh +33 -0
  8. fairseq-0.10.2/examples/linformer/README.md +22 -0
  9. fairseq-0.10.2/examples/linformer/linformer_src/__init__.py +6 -0
  10. fairseq-0.10.2/examples/linformer/linformer_src/models/__init__.py +0 -0
  11. fairseq-0.10.2/examples/linformer/linformer_src/models/linformer_roberta.py +134 -0
  12. fairseq-0.10.2/examples/linformer/linformer_src/modules/__init__.py +0 -0
  13. fairseq-0.10.2/examples/linformer/linformer_src/modules/linformer_sentence_encoder.py +169 -0
  14. fairseq-0.10.2/examples/linformer/linformer_src/modules/linformer_sentence_encoder_layer.py +84 -0
  15. fairseq-0.10.2/examples/linformer/linformer_src/modules/multihead_linear_attention.py +485 -0
  16. fairseq-0.10.2/examples/multilingual/README.md +124 -0
  17. fairseq-0.10.2/examples/multilingual/finetune_multilingual_model.sh +27 -0
  18. fairseq-0.10.2/examples/multilingual/multilingual_fairseq_gen.sh +21 -0
  19. fairseq-0.10.2/examples/multilingual/train_multilingual_model.sh +23 -0
  20. fairseq-0.10.2/examples/nonautoregressive_translation/README.md +146 -0
  21. fairseq-0.10.2/examples/nonautoregressive_translation/scripts.md +179 -0
  22. fairseq-0.10.2/examples/pay_less_attention_paper/README.md +176 -0
  23. fairseq-0.10.2/examples/pointer_generator/README.md +82 -0
  24. fairseq-0.10.2/examples/pointer_generator/README.xsum.md +180 -0
  25. fairseq-0.10.2/examples/pointer_generator/pointer_generator_src/__init__.py +6 -0
  26. fairseq-0.10.2/examples/pointer_generator/pointer_generator_src/transformer_pg.py +468 -0
  27. fairseq-0.10.2/examples/pointer_generator/postprocess.py +96 -0
  28. fairseq-0.10.2/examples/pointer_generator/preprocess.py +102 -0
  29. fairseq-0.10.2/examples/roberta/README.custom_classification.md +168 -0
  30. fairseq-0.10.2/examples/roberta/commonsense_qa/README.md +99 -0
  31. fairseq-0.10.2/examples/roberta/commonsense_qa/__init__.py +6 -0
  32. fairseq-0.10.2/examples/roberta/commonsense_qa/commonsense_qa_task.py +190 -0
  33. fairseq-0.10.2/examples/roberta/commonsense_qa/download_cqa_data.sh +14 -0
  34. fairseq-0.10.2/examples/roberta/preprocess_RACE.py +102 -0
  35. fairseq-0.10.2/examples/roberta/wsc/README.md +125 -0
  36. fairseq-0.10.2/examples/roberta/wsc/__init__.py +7 -0
  37. fairseq-0.10.2/examples/roberta/wsc/wsc_criterion.py +167 -0
  38. fairseq-0.10.2/examples/roberta/wsc/wsc_task.py +401 -0
  39. fairseq-0.10.2/examples/roberta/wsc/wsc_utils.py +241 -0
  40. fairseq-0.10.2/examples/scaling_nmt/README.md +114 -0
  41. fairseq-0.10.2/examples/simultaneous_translation/criterions/__init__.py +15 -0
  42. fairseq-0.10.2/examples/simultaneous_translation/criterions/label_smoothed_cross_entropy_latency_augmented.py +73 -0
  43. fairseq-0.10.2/examples/simultaneous_translation/eval/agents/__init__.py +24 -0
  44. fairseq-0.10.2/examples/simultaneous_translation/eval/agents/agent.py +67 -0
  45. fairseq-0.10.2/examples/simultaneous_translation/eval/agents/simul_trans_agent.py +167 -0
  46. fairseq-0.10.2/examples/simultaneous_translation/eval/agents/simul_trans_text_agent.py +81 -0
  47. fairseq-0.10.2/examples/simultaneous_translation/eval/scorers/__init__.py +19 -0
  48. fairseq-0.10.2/examples/simultaneous_translation/eval/scorers/scorer.py +175 -0
  49. fairseq-0.10.2/examples/simultaneous_translation/eval/scorers/text_scorer.py +41 -0
  50. fairseq-0.10.2/examples/speech_recognition/criterions/ASG_loss.py +170 -0
fairseq-0.10.2/examples/camembert/README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CamemBERT: a Tasty French Language Model
2
+
3
+ ## Introduction
4
+
5
+ [CamemBERT](https://arxiv.org/abs/1911.03894) is a pretrained language model trained on 138GB of French text based on RoBERTa.
6
+
7
+ Also available in [github.com/huggingface/transformers](https://github.com/huggingface/transformers/).
8
+
9
+ ## Pre-trained models
10
+
11
+ | Model | #params | Download | Arch. | Training data |
12
+ |--------------------------------|---------|--------------------------------------------------------------------------------------------------------------------------|-------|-----------------------------------|
13
+ | `camembert` / `camembert-base` | 110M | [camembert-base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz) | Base | OSCAR (138 GB of text) |
14
+ | `camembert-large` | 335M | [camembert-large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-large.tar.gz) | Large | CCNet (135 GB of text) |
15
+ | `camembert-base-ccnet` | 110M | [camembert-base-ccnet.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet.tar.gz) | Base | CCNet (135 GB of text) |
16
+ | `camembert-base-wikipedia-4gb` | 110M | [camembert-base-wikipedia-4gb.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-wikipedia-4gb.tar.gz) | Base | Wikipedia (4 GB of text) |
17
+ | `camembert-base-oscar-4gb` | 110M | [camembert-base-oscar-4gb.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-oscar-4gb.tar.gz) | Base | Subsample of OSCAR (4 GB of text) |
18
+ | `camembert-base-ccnet-4gb` | 110M | [camembert-base-ccnet-4gb.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet-4gb.tar.gz) | Base | Subsample of CCNet (4 GB of text) |
19
+
20
+ ## Example usage
21
+
22
+ ### fairseq
23
+ ##### Load CamemBERT from torch.hub (PyTorch >= 1.1):
24
+ ```python
25
+ import torch
26
+ camembert = torch.hub.load('pytorch/fairseq', 'camembert')
27
+ camembert.eval() # disable dropout (or leave in train mode to finetune)
28
+ ```
29
+
30
+ ##### Load CamemBERT (for PyTorch 1.0 or custom models):
31
+ ```python
32
+ # Download camembert model
33
+ wget https://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz
34
+ tar -xzvf camembert-base.tar.gz
35
+
36
+ # Load the model in fairseq
37
+ from fairseq.models.roberta import CamembertModel
38
+ camembert = CamembertModel.from_pretrained('/path/to/camembert')
39
+ camembert.eval() # disable dropout (or leave in train mode to finetune)
40
+ ```
41
+
42
+ ##### Filling masks:
43
+ ```python
44
+ masked_line = 'Le camembert est <mask> :)'
45
+ camembert.fill_mask(masked_line, topk=3)
46
+ # [('Le camembert est délicieux :)', 0.4909118115901947, ' délicieux'),
47
+ # ('Le camembert est excellent :)', 0.10556942224502563, ' excellent'),
48
+ # ('Le camembert est succulent :)', 0.03453322499990463, ' succulent')]
49
+ ```
50
+
51
+ ##### Extract features from Camembert:
52
+ ```python
53
+ # Extract the last layer's features
54
+ line = "J'aime le camembert !"
55
+ tokens = camembert.encode(line)
56
+ last_layer_features = camembert.extract_features(tokens)
57
+ assert last_layer_features.size() == torch.Size([1, 10, 768])
58
+
59
+ # Extract all layer's features (layer 0 is the embedding layer)
60
+ all_layers = camembert.extract_features(tokens, return_all_hiddens=True)
61
+ assert len(all_layers) == 13
62
+ assert torch.all(all_layers[-1] == last_layer_features)
63
+ ```
64
+
65
+ ## Citation
66
+ If you use our work, please cite:
67
+
68
+ ```bibtex
69
+ @inproceedings{martin2020camembert,
70
+ title={CamemBERT: a Tasty French Language Model},
71
+ author={Martin, Louis and Muller, Benjamin and Su{\'a}rez, Pedro Javier Ortiz and Dupont, Yoann and Romary, Laurent and de la Clergerie, {\'E}ric Villemonte and Seddah, Djam{\'e} and Sagot, Beno{\^\i}t},
72
+ booktitle={Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
73
+ year={2020}
74
+ }
75
+ ```
fairseq-0.10.2/examples/joint_alignment_translation/README.md ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)
2
+
3
+ This page includes instructions for training models described in [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](https://arxiv.org/abs/1909.02074).
4
+
5
+ ## Training a joint alignment-translation model on WMT'18 En-De
6
+
7
+ ##### 1. Extract and preprocess the WMT'18 En-De data
8
+ ```bash
9
+ ./prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh
10
+ ```
11
+
12
+ ##### 2. Generate alignments from statistical alignment toolkits e.g. Giza++/FastAlign.
13
+ In this example, we use FastAlign.
14
+ ```bash
15
+ git clone git@github.com:clab/fast_align.git
16
+ pushd fast_align
17
+ mkdir build
18
+ cd build
19
+ cmake ..
20
+ make
21
+ popd
22
+ ALIGN=fast_align/build/fast_align
23
+ paste bpe.32k/train.en bpe.32k/train.de | awk -F '\t' '{print $1 " ||| " $2}' > bpe.32k/train.en-de
24
+ $ALIGN -i bpe.32k/train.en-de -d -o -v > bpe.32k/train.align
25
+ ```
26
+
27
+ ##### 3. Preprocess the dataset with the above generated alignments.
28
+ ```bash
29
+ fairseq-preprocess \
30
+ --source-lang en --target-lang de \
31
+ --trainpref bpe.32k/train \
32
+ --validpref bpe.32k/valid \
33
+ --testpref bpe.32k/test \
34
+ --align-suffix align \
35
+ --destdir binarized/ \
36
+ --joined-dictionary \
37
+ --workers 32
38
+ ```
39
+
40
+ ##### 4. Train a model
41
+ ```bash
42
+ fairseq-train \
43
+ binarized \
44
+ --arch transformer_wmt_en_de_big_align --share-all-embeddings \
45
+ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --activation-fn relu\
46
+ --lr 0.0002 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
47
+ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
48
+ --max-tokens 3500 --label-smoothing 0.1 \
49
+ --save-dir ./checkpoints --log-interval 1000 --max-update 60000 \
50
+ --keep-interval-updates -1 --save-interval-updates 0 \
51
+ --load-alignments --criterion label_smoothed_cross_entropy_with_alignment \
52
+ --fp16
53
+ ```
54
+
55
+ Note that the `--fp16` flag requires you have CUDA 9.1 or greater and a Volta GPU or newer.
56
+
57
+ If you want to train the above model with big batches (assuming your machine has 8 GPUs):
58
+ - add `--update-freq 8` to simulate training on 8x8=64 GPUs
59
+ - increase the learning rate; 0.0007 works well for big batches
60
+
61
+ ##### 5. Evaluate and generate the alignments (BPE level)
62
+ ```bash
63
+ fairseq-generate \
64
+ binarized --gen-subset test --print-alignment \
65
+ --source-lang en --target-lang de \
66
+ --path checkpoints/checkpoint_best.pt --beam 5 --nbest 1
67
+ ```
68
+
69
+ ##### 6. Other resources.
70
+ The code for:
71
+ 1. preparing alignment test sets
72
+ 2. converting BPE level alignments to token level alignments
73
+ 3. symmetrizing bidirectional alignments
74
+ 4. evaluating alignments using AER metric
75
+ can be found [here](https://github.com/lilt/alignment-scripts)
76
+
77
+ ## Citation
78
+
79
+ ```bibtex
80
+ @inproceedings{garg2019jointly,
81
+ title = {Jointly Learning to Align and Translate with Transformer Models},
82
+ author = {Garg, Sarthak and Peitz, Stephan and Nallasamy, Udhyakumar and Paulik, Matthias},
83
+ booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
84
+ address = {Hong Kong},
85
+ month = {November},
86
+ url = {https://arxiv.org/abs/1909.02074},
87
+ year = {2019},
88
+ }
89
+ ```
fairseq-0.10.2/examples/joint_alignment_translation/prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ echo 'Cloning Moses github repository (for tokenization scripts)...'
9
+ git clone https://github.com/moses-smt/mosesdecoder.git
10
+
11
+ SCRIPTS=mosesdecoder/scripts
12
+ TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
13
+ CLEAN=$SCRIPTS/training/clean-corpus-n.perl
14
+ REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
15
+
16
+ URLS=(
17
+ "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
18
+ "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
19
+ "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz"
20
+ "http://data.statmt.org/wmt18/translation-task/rapid2016.tgz"
21
+ "http://data.statmt.org/wmt17/translation-task/dev.tgz"
22
+ "http://statmt.org/wmt14/test-full.tgz"
23
+ )
24
+ CORPORA=(
25
+ "training/europarl-v7.de-en"
26
+ "commoncrawl.de-en"
27
+ "training-parallel-nc-v13/news-commentary-v13.de-en"
28
+ "rapid2016.de-en"
29
+ )
30
+
31
+ if [ ! -d "$SCRIPTS" ]; then
32
+ echo "Please set SCRIPTS variable correctly to point to Moses scripts."
33
+ exit
34
+ fi
35
+
36
+ src=en
37
+ tgt=de
38
+ lang=en-de
39
+ prep=wmt18_en_de
40
+ tmp=$prep/tmp
41
+ orig=orig
42
+ dev=dev/newstest2012
43
+ codes=32000
44
+ bpe=bpe.32k
45
+
46
+ mkdir -p $orig $tmp $prep $bpe
47
+
48
+ cd $orig
49
+
50
+ for ((i=0;i<${#URLS[@]};++i)); do
51
+ url=${URLS[i]}
52
+ file=$(basename $url)
53
+ if [ -f $file ]; then
54
+ echo "$file already exists, skipping download"
55
+ else
56
+ wget "$url"
57
+ if [ -f $file ]; then
58
+ echo "$url successfully downloaded."
59
+ else
60
+ echo "$url not successfully downloaded."
61
+ exit 1
62
+ fi
63
+ if [ ${file: -4} == ".tgz" ]; then
64
+ tar zxvf $file
65
+ elif [ ${file: -4} == ".tar" ]; then
66
+ tar xvf $file
67
+ fi
68
+ fi
69
+ done
70
+ cd ..
71
+
72
+ echo "pre-processing train data..."
73
+ for l in $src $tgt; do
74
+ rm -rf $tmp/train.tags.$lang.tok.$l
75
+ for f in "${CORPORA[@]}"; do
76
+ cat $orig/$f.$l | \
77
+ perl $REM_NON_PRINT_CHAR | \
78
+ perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/train.tags.$lang.tok.$l
79
+ done
80
+ done
81
+
82
+ echo "pre-processing test data..."
83
+ for l in $src $tgt; do
84
+ if [ "$l" == "$src" ]; then
85
+ t="src"
86
+ else
87
+ t="ref"
88
+ fi
89
+ grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \
90
+ sed -e 's/<seg id="[0-9]*">\s*//g' | \
91
+ sed -e 's/\s*<\/seg>\s*//g' | \
92
+ sed -e "s/\’/\'/g" | \
93
+ perl $TOKENIZER -threads 8 -l $l -no-escape > $tmp/test.$l
94
+ echo ""
95
+ done
96
+
97
+ # apply length filtering before BPE
98
+ perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train 1 100
99
+
100
+ # use newstest2012 for valid
101
+ echo "pre-processing valid data..."
102
+ for l in $src $tgt; do
103
+ rm -rf $tmp/valid.$l
104
+ cat $orig/$dev.$l | \
105
+ perl $REM_NON_PRINT_CHAR | \
106
+ perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/valid.$l
107
+ done
108
+
109
+ mkdir output
110
+ mv $tmp/{train,valid,test}.{$src,$tgt} output
111
+
112
+ #BPE
113
+ git clone https://github.com/glample/fastBPE.git
114
+ pushd fastBPE
115
+ g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast
116
+ popd
117
+ fastBPE/fast learnbpe $codes output/train.$src output/train.$tgt > $bpe/codes
118
+ for split in {train,valid,test}; do for lang in {en,de}; do fastBPE/fast applybpe $bpe/$split.$lang output/$split.$lang $bpe/codes; done; done
fairseq-0.10.2/examples/language_model/README.adaptive_inputs.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)
2
+
3
+ ## Pre-trained models
4
+
5
+ Description | Parameters | Dataset | Model and Test set(s)
6
+ ---|---:|---|---
7
+ Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 1026M | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
8
+ Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 247M | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
9
+
10
+ ## Training an LM with adaptive inputs
11
+
12
+ First, see the general [language modeling README](README.md) for instructions on
13
+ preprocessing the WikiText-103 data.
14
+
15
+ Then use the following training command to train a model with adaptive inputs
16
+ using the `transformer_lm_wiki103` model architecture:
17
+ ```bash
18
+ fairseq-train --task language_modeling \
19
+ data-bin/wikitext-103 \
20
+ --save-dir checkpoints/transformer_wikitext-103 \
21
+ --arch transformer_lm_wiki103 \
22
+ --max-update 286000 --max-lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \
23
+ --warmup-updates 16000 --warmup-init-lr 1e-07 --min-lr 1e-09 --optimizer nag --lr 0.0001 --clip-norm 0.1 \
24
+ --criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \
25
+ --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=no_c10d
26
+ ```
27
+
28
+ ## Citation
29
+
30
+ ```bibtex
31
+ @inproceedings{
32
+ baevski2018adaptive,
33
+ title={Adaptive Input Representations for Neural Language Modeling},
34
+ author={Alexei Baevski and Michael Auli},
35
+ booktitle={International Conference on Learning Representations},
36
+ year={2019},
37
+ url={https://openreview.net/forum?id=ByxZX20qFQ},
38
+ }
39
+ ```
fairseq-0.10.2/examples/language_model/README.conv.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)
2
+
3
+ ## Example usage
4
+
5
+ First download and preprocess the data following the main [language modeling README](README.md).
6
+
7
+ Then to train a convolutional LM using the `fconv_lm_dauphin_wikitext103`
8
+ architecture:
9
+ ```bash
10
+ fairseq-train --task language_modeling \
11
+ data-bin/wikitext-103 \
12
+ --save-dir checkpoints/fconv_wikitext-103 \
13
+ --arch fconv_lm_dauphin_wikitext103 \
14
+ --adaptive-softmax-cutoff 10000,20000,200000 \
15
+ --dropout 0.2 \
16
+ --criterion adaptive_loss \
17
+ --optimizer nag --clip-norm 0.1 --weight-decay 5e-06 \
18
+ --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \
19
+ --max-tokens 1024 --tokens-per-sample 1024 \
20
+ --ddp-backend no_c10d \
21
+ --max-epoch 35
22
+ ```
23
+
24
+ And evaluate with:
25
+ ```bash
26
+ fairseq-eval-lm data-bin/wikitext-103 --path checkpoints/fconv_wikitext-103/checkpoint_best.pt
27
+ ```
28
+
29
+ ## Citation
30
+
31
+ ```bibtex
32
+ @inproceedings{dauphin2017language,
33
+ title={Language Modeling with Gated Convolutional Networks},
34
+ author={Dauphin, Yann N and Fan, Angela and Auli, Michael and Grangier, David},
35
+ booktitle={Proceedings of the 34th International Conference on Machine Learning-Volume 70},
36
+ pages={933--941},
37
+ year={2017},
38
+ organization={JMLR}
39
+ }
40
+ ```
fairseq-0.10.2/examples/language_model/README.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Neural Language Modeling
2
+
3
+ ## Pre-trained models
4
+
5
+ Model | Description | Dataset | Download
6
+ ---|---|---|---
7
+ `transformer_lm.gbw.adaptive_huge` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 1026M params | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2)
8
+ `transformer_lm.wiki103.adaptive` | Adaptive Inputs <br> ([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) <br> 247M params | [WikiText-103](https://einstein.ai/research/the-wikitext-long-term-dependency-language-modeling-dataset) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2)
9
+ `transformer_lm.wmt19.en` | English LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz)
10
+ `transformer_lm.wmt19.de` | German LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz)
11
+ `transformer_lm.wmt19.ru` | Russian LM <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz)
12
+
13
+ ## Example usage
14
+
15
+ We require a few additional Python dependencies for preprocessing:
16
+ ```bash
17
+ pip install fastBPE sacremoses
18
+ ```
19
+
20
+ To sample from a language model using PyTorch Hub:
21
+ ```python
22
+ import torch
23
+
24
+ # List available models
25
+ torch.hub.list('pytorch/fairseq') # [..., 'transformer_lm.wmt19.en', ...]
26
+
27
+ # Load an English LM trained on WMT'19 News Crawl data
28
+ en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe')
29
+ en_lm.eval() # disable dropout
30
+
31
+ # Move model to GPU
32
+ en_lm.cuda()
33
+
34
+ # Sample from the language model
35
+ en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperature=0.8)
36
+ # "Barack Obama is coming to Sydney and New Zealand (...)"
37
+
38
+ # Compute perplexity for a sequence
39
+ en_lm.score('Barack Obama is coming to Sydney and New Zealand')['positional_scores'].mean().neg().exp()
40
+ # tensor(15.1474)
41
+
42
+ # The same interface can be used with custom models as well
43
+ from fairseq.models.transformer_lm import TransformerLanguageModel
44
+ custom_lm = TransformerLanguageModel.from_pretrained('/path/to/model/dir', 'checkpoint100.pt', tokenizer='moses', bpe='fastbpe')
45
+ custom_lm.sample('Barack Obama', beam=5)
46
+ # "Barack Obama (...)"
47
+ ```
48
+
49
+ ## Training a transformer language model with the CLI tools
50
+
51
+ ### 1) Preprocess the data
52
+
53
+ First download and prepare the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/):
54
+ ```bash
55
+ cd examples/language_model/
56
+ bash prepare-wikitext-103.sh
57
+ cd ../..
58
+ ```
59
+
60
+ Next preprocess/binarize the data:
61
+ ```bash
62
+ TEXT=examples/language_model/wikitext-103
63
+ fairseq-preprocess \
64
+ --only-source \
65
+ --trainpref $TEXT/wiki.train.tokens \
66
+ --validpref $TEXT/wiki.valid.tokens \
67
+ --testpref $TEXT/wiki.test.tokens \
68
+ --destdir data-bin/wikitext-103 \
69
+ --workers 20
70
+ ```
71
+
72
+ ### 2) Train a language model
73
+
74
+ Next we'll train a basic transformer language model on wikitext-103. For more
75
+ advanced usage, see the [adaptive inputs README](README.adaptive_inputs.md).
76
+
77
+ To train a basic LM (assumes 2 GPUs):
78
+ ```
79
+ $ fairseq-train --task language_modeling \
80
+ data-bin/wikitext-103 \
81
+ --save-dir checkpoints/transformer_wikitext-103 \
82
+ --arch transformer_lm --share-decoder-input-output-embed \
83
+ --dropout 0.1 \
84
+ --optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \
85
+ --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
86
+ --tokens-per-sample 512 --sample-break-mode none \
87
+ --max-tokens 2048 --update-freq 16 \
88
+ --fp16 \
89
+ --max-update 50000
90
+ ```
91
+
92
+ If you run out of memory, try reducing `--max-tokens` (max number of tokens per
93
+ batch) or `--tokens-per-sample` (max sequence length). You can also adjust
94
+ `--update-freq` to accumulate gradients and simulate training on a different
95
+ number of GPUs.
96
+
97
+ ### 3) Evaluate
98
+
99
+ ```bash
100
+ fairseq-eval-lm data-bin/wikitext-103 \
101
+ --path checkpoints/transformer_wiki103/checkpoint_best.pt \
102
+ --batch-size 2 \
103
+ --tokens-per-sample 512 \
104
+ --context-window 400
105
+ # | Evaluated 245569 tokens in 56.1s (4379.02 tokens/s)
106
+ # | Loss: 3.4164, Perplexity: 30.46
107
+ ```
108
+
109
+ *Note:* The `--context-window` option controls how much context is provided to
110
+ each token when computing perplexity. When the window size is 0, the dataset is
111
+ chunked into segments of length 512 and perplexity is computed over each segment
112
+ normally. However, this results in worse (higher) perplexity since tokens that
113
+ appear earlier in each segment have less conditioning. When the maximum window
114
+ size is used (511 in this case), then we compute perplexity for each token
115
+ fully conditioned on 511 tokens of context. This slows down evaluation
116
+ significantly, since we must run a separate forward pass for every token in the
117
+ dataset, but results in better (lower) perplexity.
118
+
119
+
120
+ ## Convolutional language models
121
+
122
+ Please see the [convolutional LM README](README.conv.md) for instructions on
123
+ training convolutional language models.
fairseq-0.10.2/examples/language_model/prepare-wikitext-103.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
3
+
4
+ URLS=(
5
+ "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip"
6
+ )
7
+ FILES=(
8
+ "wikitext-103-v1.zip"
9
+ )
10
+
11
+ for ((i=0;i<${#URLS[@]};++i)); do
12
+ file=${FILES[i]}
13
+ if [ -f $file ]; then
14
+ echo "$file already exists, skipping download"
15
+ else
16
+ url=${URLS[i]}
17
+ wget "$url"
18
+ if [ -f $file ]; then
19
+ echo "$url successfully downloaded."
20
+ else
21
+ echo "$url not successfully downloaded."
22
+ exit -1
23
+ fi
24
+ if [ ${file: -4} == ".tgz" ]; then
25
+ tar zxvf $file
26
+ elif [ ${file: -4} == ".tar" ]; then
27
+ tar xvf $file
28
+ elif [ ${file: -4} == ".zip" ]; then
29
+ unzip $file
30
+ fi
31
+ fi
32
+ done
33
+ cd ..
fairseq-0.10.2/examples/linformer/README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Linformer: Self-Attention with Linear Complexity (Wang et al., 2020)
2
+
3
+ This example contains code to train Linformer models as described in our paper
4
+ [Linformer: Self-Attention with Linear Complexity](https://arxiv.org/abs/2006.04768).
5
+
6
+ ## Training a new Linformer RoBERTa model
7
+
8
+ You can mostly follow the [RoBERTa pretraining README](/examples/roberta/README.pretraining.md),
9
+ updating your training command with `--user-dir examples/linformer/linformer_src --arch linformer_roberta_base`.
10
+
11
+ ## Citation
12
+
13
+ If you use our work, please cite:
14
+
15
+ ```bibtex
16
+ @article{wang2020linformer,
17
+ title={Linformer: Self-Attention with Linear Complexity},
18
+ author={Wang, Sinong and Li, Belinda and Khabsa, Madian and Fang, Han and Ma, Hao},
19
+ journal={arXiv preprint arXiv:2006.04768},
20
+ year={2020}
21
+ }
22
+ ```
fairseq-0.10.2/examples/linformer/linformer_src/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .models import linformer_roberta # noqa
fairseq-0.10.2/examples/linformer/linformer_src/models/__init__.py ADDED
File without changes
fairseq-0.10.2/examples/linformer/linformer_src/models/linformer_roberta.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ """
6
+ Linformer: Self-Attention with Linear Complexity
7
+ """
8
+
9
+ import logging
10
+
11
+ from fairseq.models import register_model, register_model_architecture
12
+ from fairseq.models.roberta import RobertaEncoder, RobertaModel
13
+
14
+ from ..modules.linformer_sentence_encoder import LinformerSentenceEncoder
15
+
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ @register_model("linformer_roberta")
21
+ class LinformerModel(RobertaModel):
22
+ @staticmethod
23
+ def add_args(parser):
24
+ RobertaModel.add_args(parser)
25
+
26
+ # add args for Linformer
27
+ parser.add_argument(
28
+ "--compressed", type=int, help="compressed ratio of sequence length"
29
+ )
30
+ parser.add_argument(
31
+ "--shared-kv-compressed",
32
+ type=int,
33
+ help="share compressed matrix between k and v, in each layer",
34
+ )
35
+ parser.add_argument(
36
+ "--shared-layer-kv-compressed",
37
+ type=int,
38
+ help="share compressed matrix between k and v and across all layers",
39
+ )
40
+ parser.add_argument(
41
+ "--freeze-compress",
42
+ type=int,
43
+ help="freeze the parameters in compressed layer",
44
+ )
45
+
46
+ @classmethod
47
+ def build_model(cls, args, task):
48
+ """Build a new model instance."""
49
+
50
+ # make sure all arguments are present
51
+ base_architecture(args)
52
+
53
+ if not hasattr(args, "max_positions"):
54
+ args.max_positions = args.tokens_per_sample
55
+
56
+ encoder = LinformerEncoder(args, task.source_dictionary)
57
+ return cls(args, encoder)
58
+
59
+
60
+ class LinformerEncoder(RobertaEncoder):
61
+ """Linformer encoder."""
62
+
63
+ def __init__(self, args, dictionary):
64
+ super().__init__(args, dictionary)
65
+
66
+ self.sentence_encoder = LinformerSentenceEncoder(
67
+ padding_idx=dictionary.pad(),
68
+ vocab_size=len(dictionary),
69
+ num_encoder_layers=args.encoder_layers,
70
+ embedding_dim=args.encoder_embed_dim,
71
+ ffn_embedding_dim=args.encoder_ffn_embed_dim,
72
+ num_attention_heads=args.encoder_attention_heads,
73
+ dropout=args.dropout,
74
+ attention_dropout=args.attention_dropout,
75
+ activation_dropout=args.activation_dropout,
76
+ layerdrop=args.encoder_layerdrop,
77
+ max_seq_len=args.max_positions,
78
+ num_segments=0,
79
+ encoder_normalize_before=True,
80
+ apply_bert_init=True,
81
+ activation_fn=args.activation_fn,
82
+ q_noise=args.quant_noise_pq,
83
+ qn_block_size=args.quant_noise_pq_block_size,
84
+ compressed=args.compressed,
85
+ shared_kv_compressed=args.shared_kv_compressed,
86
+ shared_layer_kv_compressed=args.shared_layer_kv_compressed,
87
+ freeze_compress=args.freeze_compress,
88
+ )
89
+
90
+
91
+ @register_model_architecture("linformer_roberta", "linformer_roberta")
92
+ def base_architecture(args):
93
+ args.encoder_layers = getattr(args, "encoder_layers", 12)
94
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
95
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 3072)
96
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12)
97
+
98
+ args.activation_fn = getattr(args, "activation_fn", "gelu")
99
+ args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh")
100
+
101
+ args.dropout = getattr(args, "dropout", 0.1)
102
+ args.attention_dropout = getattr(args, "attention_dropout", 0.1)
103
+ args.activation_dropout = getattr(args, "activation_dropout", 0.0)
104
+ args.pooler_dropout = getattr(args, "pooler_dropout", 0.0)
105
+ args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None)
106
+ args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.0)
107
+ args.compressed = getattr(args, "compressed", 4)
108
+ args.shared_kv_compressed = getattr(args, "shared_kv_compressed", 0)
109
+ args.shared_layer_kv_compressed = getattr(args, "shared_layer_kv_compressed", 0)
110
+ args.freeze_compress = getattr(args, "freeze_compress", 0)
111
+
112
+
113
+ @register_model_architecture("linformer_roberta", "linformer_roberta_base")
114
+ def linformer_roberta_base_architecture(args):
115
+ base_architecture(args)
116
+
117
+
118
+ @register_model_architecture("linformer_roberta", "linformer_roberta_large")
119
+ def linformer_roberta_large_architecture(args):
120
+ args.encoder_layers = getattr(args, "encoder_layers", 24)
121
+ args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
122
+ args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
123
+ args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
124
+
125
+ args.activation_fn = getattr(args, "activation_fn", "gelu")
126
+ args.pooler_activation_fn = getattr(args, "pooler_activation_fn", "tanh")
127
+
128
+ args.dropout = getattr(args, "dropout", 0.1)
129
+ args.attention_dropout = getattr(args, "attention_dropout", 0.1)
130
+ args.activation_dropout = getattr(args, "activation_dropout", 0.0)
131
+ args.pooler_dropout = getattr(args, "pooler_dropout", 0.0)
132
+ args.compressed = getattr(args, "compressed", 4)
133
+ args.shared_kv_compressed = getattr(args, "shared_kv_compressed", 0)
134
+ args.shared_layer_kv_compressed = getattr(args, "shared_layer_kv_compressed", 0)
fairseq-0.10.2/examples/linformer/linformer_src/modules/__init__.py ADDED
File without changes
fairseq-0.10.2/examples/linformer/linformer_src/modules/linformer_sentence_encoder.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ import torch.nn as nn
9
+ from fairseq.modules import TransformerSentenceEncoder
10
+
11
+ from .linformer_sentence_encoder_layer import LinformerSentenceEncoderLayer
12
+
13
+
14
class LinformerSentenceEncoder(TransformerSentenceEncoder):
    """
    Implementation for a Bi-directional Linformer based Sentence Encoder used
    in BERT/XLM style pre-trained models.

    This first computes the token embedding using the token embedding matrix,
    position embeddings (if specified) and segment embeddings
    (if specified). After applying the specified number of
    LinformerEncoderLayers, it outputs all the internal states of the
    encoder as well as the final representation associated with the first
    token (usually CLS token).

    Input:
        - tokens: B x T matrix representing sentences
        - segment_labels: B x T matrix representing segment label for tokens

    Output:
        - a tuple of the following:
            - a list of internal model states used to compute the
              predictions where each tensor has shape T x B x C
            - sentence representation associated with first input token
              in format B x C.

    Linformer-specific args:
        compressed: compression ratio for the sequence-length projection
            (max_seq_len -> max_seq_len // compressed).
        shared_kv_compressed: if 1, key and value share one compression
            projection within each attention module.
        shared_layer_kv_compressed: if 1, one compression projection is
            shared across all encoder layers.
        freeze_compress: if 1, the compression projection is not trained.
    """

    def __init__(
        self,
        padding_idx: int,
        vocab_size: int,
        num_encoder_layers: int = 6,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        layerdrop: float = 0.0,
        max_seq_len: int = 256,
        num_segments: int = 2,
        use_position_embeddings: bool = True,
        offset_positions_by_padding: bool = True,
        encoder_normalize_before: bool = False,
        apply_bert_init: bool = False,
        activation_fn: str = "relu",
        learned_pos_embedding: bool = True,
        embed_scale: float = None,
        freeze_embeddings: bool = False,
        n_trans_layers_to_freeze: int = 0,
        export: bool = False,
        traceable: bool = False,
        q_noise: float = 0.0,
        qn_block_size: int = 8,
        compressed: int = 4,
        shared_kv_compressed: int = 0,
        shared_layer_kv_compressed: int = 0,
        freeze_compress: int = 0,
    ) -> None:

        # Initialize linformer parameters.  These must be assigned *before*
        # super().__init__(), because the parent constructor builds the
        # encoder layers via build_transformer_sentence_encoder_layer(),
        # which reads them.
        self.compressed = compressed
        self.shared_kv_compressed = shared_kv_compressed
        self.shared_layer_kv_compressed = shared_layer_kv_compressed
        self.compress_layer = None
        self.freeze_compress = freeze_compress

        super().__init__(
            padding_idx=padding_idx,
            vocab_size=vocab_size,
            num_encoder_layers=num_encoder_layers,
            embedding_dim=embedding_dim,
            ffn_embedding_dim=ffn_embedding_dim,
            num_attention_heads=num_attention_heads,
            dropout=dropout,
            attention_dropout=attention_dropout,
            activation_dropout=activation_dropout,
            layerdrop=layerdrop,
            max_seq_len=max_seq_len,
            num_segments=num_segments,
            use_position_embeddings=use_position_embeddings,
            offset_positions_by_padding=offset_positions_by_padding,
            encoder_normalize_before=encoder_normalize_before,
            apply_bert_init=apply_bert_init,
            activation_fn=activation_fn,
            learned_pos_embedding=learned_pos_embedding,
            embed_scale=embed_scale,
            freeze_embeddings=freeze_embeddings,
            n_trans_layers_to_freeze=n_trans_layers_to_freeze,
            export=export,
            traceable=traceable,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )

    def build_transformer_sentence_encoder_layer(
        self,
        embedding_dim,
        ffn_embedding_dim,
        num_attention_heads,
        dropout,
        attention_dropout,
        activation_dropout,
        activation_fn,
        export,
        q_noise,
        qn_block_size,
    ):
        if self.shared_layer_kv_compressed == 1 and self.compress_layer is None:
            # Build the single sequence-compression projection
            # (max_seq_len -> max_seq_len // compressed) exactly once.
            # Fix: without the `self.compress_layer is None` guard, this
            # method (called once per encoder layer) replaced the projection
            # with a fresh nn.Linear on every call, so the layers never
            # actually shared it — contradicting shared_layer_kv_compressed
            # semantics and upgrade_state_dict_named below, which copies a
            # single compress_layer into every layer.
            compress_layer = nn.Linear(
                self.max_seq_len, self.max_seq_len // self.compressed
            )
            # initialize parameters for compressed layer
            nn.init.xavier_uniform_(compress_layer.weight, gain=1 / math.sqrt(2))
            if self.freeze_compress == 1:
                compress_layer.weight.requires_grad = False
            self.compress_layer = compress_layer

        return LinformerSentenceEncoderLayer(
            embedding_dim=embedding_dim,
            ffn_embedding_dim=ffn_embedding_dim,
            num_attention_heads=num_attention_heads,
            dropout=dropout,
            attention_dropout=attention_dropout,
            activation_dropout=activation_dropout,
            activation_fn=activation_fn,
            export=export,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
            compressed=self.compressed,
            max_seq_len=self.max_seq_len,
            shared_kv_compressed=self.shared_kv_compressed,
            shared_compress_layer=(
                None if self.shared_layer_kv_compressed == 0 else self.compress_layer
            ),
            freeze_compress=self.freeze_compress,
        )

    def upgrade_state_dict_named(self, state_dict, name):
        """Migrate old checkpoints: a top-level ``compress_layer`` entry is
        duplicated under ``layers.<i>.shared_compress_layer`` for every layer,
        matching the current parameter naming."""
        prefix = name + "." if name != "" else ""
        items_to_add = {}
        keys_to_remove = []

        # update key name for shared layer in new version of code
        for k in state_dict.keys():
            if k.startswith(prefix + "compress_layer"):
                if self.shared_layer_kv_compressed:
                    for layer_idx in range(len(self.layers)):
                        new_k = prefix + "layers.{0}.shared_compress_layer.{1}".format(
                            layer_idx,
                            k[len(prefix + "compress_layer.") :],
                        )
                        items_to_add[new_k] = state_dict[k]

        # NOTE(review): keys_to_remove is never populated, so the old
        # `compress_layer.*` entries are intentionally (?) left in place —
        # confirm against checkpoint-loading strictness before changing.
        for k in keys_to_remove:
            del state_dict[k]

        for key, value in items_to_add.items():
            state_dict[key] = value
fairseq-0.10.2/examples/linformer/linformer_src/modules/linformer_sentence_encoder_layer.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from typing import Callable
7
+
8
+ from fairseq.modules import TransformerSentenceEncoderLayer
9
+
10
+ from .multihead_linear_attention import MultiheadLinearAttention
11
+
12
+
13
class LinformerSentenceEncoderLayer(TransformerSentenceEncoderLayer):
    """
    Implements a Linformer Encoder Layer used in BERT/XLM style pre-trained
    models.

    Linformer-specific args:
        compressed: compression ratio for the sequence-length projection.
        max_seq_len: maximum sequence length the compression projection
            supports.
        shared_kv_compressed: if 1, key and value share one compression
            projection inside the attention module.
        shared_compress_layer: an nn.Linear shared across layers, or None to
            let the attention module build its own projection.
        freeze_compress: if 1, the compression projection is not trained.
    """

    def __init__(
        self,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = "relu",
        export: bool = False,
        q_noise: float = 0.0,
        qn_block_size: int = 8,
        init_fn: Callable = None,
        compressed: int = 1,
        max_seq_len: int = 256,
        shared_kv_compressed: int = 0,
        shared_compress_layer: any = None,
        freeze_compress: int = 0,
    ) -> None:

        # Initialize linformer parameters.  Assigned before super().__init__()
        # because the parent constructor calls build_self_attention(), which
        # reads them.
        self.compressed = compressed
        self.max_seq_len = max_seq_len
        self.shared_kv_compressed = shared_kv_compressed
        self.freeze_compress = freeze_compress

        def init_fn():
            # This needs to be set after nn.Module.__init__ is called
            # (assigning an nn.Module attribute earlier would fail), so the
            # parent constructor invokes this callback at the right moment.
            self.shared_compress_layer = shared_compress_layer

        super().__init__(
            embedding_dim=embedding_dim,
            ffn_embedding_dim=ffn_embedding_dim,
            num_attention_heads=num_attention_heads,
            dropout=dropout,
            attention_dropout=attention_dropout,
            activation_dropout=activation_dropout,
            activation_fn=activation_fn,
            export=export,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
            init_fn=init_fn,
        )

    def build_self_attention(
        self,
        embed_dim,
        num_attention_heads,
        dropout,
        self_attention,
        q_noise,
        qn_block_size,
    ):
        # Swap the standard multi-head attention for the Linformer variant,
        # forwarding the compression configuration captured in __init__.
        return MultiheadLinearAttention(
            embed_dim,
            num_attention_heads,
            dropout=dropout,
            self_attention=True,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
            compressed=self.compressed,
            max_seq_len=self.max_seq_len,
            shared_kv_compressed=self.shared_kv_compressed,
            shared_compress_layer=self.shared_compress_layer,
            freeze_compress=self.freeze_compress,
        )
fairseq-0.10.2/examples/linformer/linformer_src/modules/multihead_linear_attention.py ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ from typing import Dict, Optional, Tuple
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from fairseq import utils
12
+ from fairseq.incremental_decoding_utils import with_incremental_state
13
+ from fairseq.modules.quant_noise import quant_noise
14
+ from torch import Tensor, nn
15
+ from torch.nn import Parameter
16
+
17
+
18
+ @with_incremental_state
19
+ class MultiheadLinearAttention(nn.Module):
20
+ """Multi-headed linformer attention.
21
+
22
+ Projects the key and values down to the compressed dimension, before computing self-attention.
23
+
24
+ See "Linformer: Self-Attention with Linear Complexity" for more details.
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ embed_dim,
30
+ num_heads,
31
+ kdim=None,
32
+ vdim=None,
33
+ dropout=0.0,
34
+ bias=True,
35
+ add_bias_kv=False,
36
+ add_zero_attn=False,
37
+ self_attention=False,
38
+ encoder_decoder_attention=False,
39
+ q_noise=0.0,
40
+ qn_block_size=8,
41
+ compressed=1,
42
+ max_seq_len=256,
43
+ shared_kv_compressed=0,
44
+ shared_compress_layer=None,
45
+ freeze_compress=0,
46
+ ):
47
+ super().__init__()
48
+ self.embed_dim = embed_dim
49
+ self.kdim = kdim if kdim is not None else embed_dim
50
+ self.vdim = vdim if vdim is not None else embed_dim
51
+ self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
52
+
53
+ self.num_heads = num_heads
54
+ self.dropout = dropout
55
+ self.head_dim = embed_dim // num_heads
56
+ assert (
57
+ self.head_dim * num_heads == self.embed_dim
58
+ ), "embed_dim must be divisible by num_heads"
59
+ self.scaling = self.head_dim ** -0.5
60
+
61
+ self.self_attention = self_attention
62
+ self.encoder_decoder_attention = encoder_decoder_attention
63
+
64
+ assert not self.self_attention or self.qkv_same_dim, (
65
+ "Self-attention requires query, key and " "value to be of the same size"
66
+ )
67
+
68
+ self.k_proj = quant_noise(
69
+ nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
70
+ )
71
+ self.v_proj = quant_noise(
72
+ nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
73
+ )
74
+ self.q_proj = quant_noise(
75
+ nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
76
+ )
77
+
78
+ # used for compress sequence to subsequence
79
+ if shared_compress_layer is None:
80
+ self.compress_seq_len = max_seq_len // compressed
81
+ self.compress_k = nn.Linear(max_seq_len, self.compress_seq_len, bias=False)
82
+ if shared_kv_compressed == 0:
83
+ self.compress_v = nn.Linear(
84
+ max_seq_len, self.compress_seq_len, bias=False
85
+ )
86
+ self.layerwise_sharing = False
87
+ else:
88
+ self.compress_k = shared_compress_layer
89
+ if shared_kv_compressed == 0:
90
+ self.compress_v = shared_compress_layer
91
+ self.layerwise_sharing = True
92
+ self.shared_kv_compressed = shared_kv_compressed
93
+
94
+ self.out_proj = quant_noise(
95
+ nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
96
+ )
97
+
98
+ if add_bias_kv:
99
+ self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
100
+ self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
101
+ else:
102
+ self.bias_k = self.bias_v = None
103
+
104
+ self.add_zero_attn = add_zero_attn
105
+
106
+ self.reset_parameters()
107
+
108
+ if freeze_compress == 1:
109
+ self.compress_k.weight.requires_grad = False
110
+ if shared_kv_compressed == 0:
111
+ self.compress_v.weight.requires_grad = False
112
+
113
+ self.onnx_trace = False
114
+ self.tpu = False
115
+
116
+ def prepare_for_onnx_export_(self):
117
+ self.onnx_trace = True
118
+
119
+ def prepare_for_tpu_(self, **kwargs):
120
+ self.tpu = True
121
+
122
+ def reset_parameters(self):
123
+ if self.qkv_same_dim:
124
+ # Empirically observed the convergence to be much better with
125
+ # the scaled initialization
126
+ nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
127
+ nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
128
+ nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
129
+ if (
130
+ not self.layerwise_sharing
131
+ ): # otherwise, we already initialize the parameters
132
+ nn.init.xavier_uniform_(self.compress_k.weight, gain=1 / math.sqrt(2))
133
+ if self.shared_kv_compressed == 0:
134
+ nn.init.xavier_uniform_(
135
+ self.compress_v.weight, gain=1 / math.sqrt(2)
136
+ )
137
+ else:
138
+ nn.init.xavier_uniform_(self.k_proj.weight)
139
+ nn.init.xavier_uniform_(self.v_proj.weight)
140
+ nn.init.xavier_uniform_(self.q_proj.weight)
141
+ if (
142
+ not self.layerwise_sharing
143
+ ): # otherwise, we already initialize the parameters
144
+ nn.init.xavier_uniform_(self.compress_k.weight)
145
+ if self.shared_kv_compressed == 0:
146
+ nn.init.xavier_uniform_(self.compress_v.weight)
147
+
148
+ nn.init.xavier_uniform_(self.out_proj.weight)
149
+ if self.out_proj.bias is not None:
150
+ nn.init.constant_(self.out_proj.bias, 0.0)
151
+ if self.bias_k is not None:
152
+ nn.init.xavier_normal_(self.bias_k)
153
+ if self.bias_v is not None:
154
+ nn.init.xavier_normal_(self.bias_v)
155
+
156
+ def forward(
157
+ self,
158
+ query,
159
+ key: Optional[Tensor],
160
+ value: Optional[Tensor],
161
+ key_padding_mask: Optional[Tensor] = None,
162
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
163
+ need_weights: bool = True,
164
+ static_kv: bool = False,
165
+ attn_mask: Optional[Tensor] = None,
166
+ before_softmax: bool = False,
167
+ need_head_weights: bool = False,
168
+ ) -> Tuple[Tensor, Optional[Tensor]]:
169
+ """Input shape: Time x Batch x Channel
170
+
171
+ Args:
172
+ key_padding_mask (ByteTensor, optional): mask to exclude
173
+ keys that are pads, of shape `(batch, src_len)`, where
174
+ padding elements are indicated by 1s.
175
+ need_weights (bool, optional): return the attention weights,
176
+ averaged over heads (default: False).
177
+ attn_mask (ByteTensor, optional): typically used to
178
+ implement causal attention, where the mask prevents the
179
+ attention from looking forward in time (default: None).
180
+ before_softmax (bool, optional): return the raw attention
181
+ weights and values before the attention softmax.
182
+ need_head_weights (bool, optional): return the attention
183
+ weights for each head. Implies *need_weights*. Default:
184
+ return the average attention weights over all heads.
185
+ """
186
+ if need_head_weights:
187
+ need_weights = True
188
+
189
+ tgt_len, bsz, embed_dim = query.size()
190
+ assert embed_dim == self.embed_dim
191
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
192
+
193
+ if incremental_state is not None:
194
+ saved_state = self._get_input_buffer(incremental_state)
195
+ if saved_state is not None and "prev_key" in saved_state:
196
+ # previous time steps are cached - no need to recompute
197
+ # key and value if they are static
198
+ if static_kv:
199
+ assert self.encoder_decoder_attention and not self.self_attention
200
+ key = value = None
201
+ else:
202
+ saved_state = None
203
+
204
+ if self.self_attention:
205
+ q = self.q_proj(query)
206
+
207
+ k_input = query.permute(1, 2, 0).contiguous() # B * C * T
208
+ k_input = (
209
+ F.linear(k_input, self.compress_k.weight[:, 0:tgt_len])
210
+ .permute(2, 0, 1)
211
+ .contiguous()
212
+ )
213
+ k = self.k_proj(k_input)
214
+
215
+ v_input = query.permute(1, 2, 0).contiguous() # B * C * T
216
+ if self.shared_kv_compressed == 0:
217
+ v_input = (
218
+ F.linear(v_input, self.compress_v.weight[:, 0:tgt_len])
219
+ .permute(2, 0, 1)
220
+ .contiguous()
221
+ )
222
+ if self.shared_kv_compressed == 1: # use shared kv compressed linear layer
223
+ v_input = (
224
+ F.linear(v_input, self.compress_k.weight[:, 0:tgt_len])
225
+ .permute(2, 0, 1)
226
+ .contiguous()
227
+ )
228
+ v = self.v_proj(v_input)
229
+ elif self.encoder_decoder_attention:
230
+ # encoder-decoder attention
231
+ q = self.q_proj(query)
232
+ if key is None:
233
+ assert value is None
234
+ k = v = None
235
+ else:
236
+ k = self.k_proj(key)
237
+ v = self.v_proj(key)
238
+
239
+ else:
240
+ assert key is not None and value is not None
241
+ q = self.q_proj(query)
242
+ k = self.k_proj(key)
243
+ v = self.v_proj(value)
244
+ q *= self.scaling
245
+
246
+ if self.bias_k is not None:
247
+ assert self.bias_v is not None
248
+ k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
249
+ v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
250
+ if attn_mask is not None:
251
+ attn_mask = torch.cat(
252
+ [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
253
+ )
254
+ if key_padding_mask is not None:
255
+ key_padding_mask = torch.cat(
256
+ [
257
+ key_padding_mask,
258
+ key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
259
+ ],
260
+ dim=1,
261
+ )
262
+
263
+ q = (
264
+ q.contiguous()
265
+ .view(tgt_len, bsz * self.num_heads, self.head_dim)
266
+ .transpose(0, 1)
267
+ )
268
+ if k is not None:
269
+ k = (
270
+ k.contiguous()
271
+ .view(-1, bsz * self.num_heads, self.head_dim)
272
+ .transpose(0, 1)
273
+ )
274
+ if v is not None:
275
+ v = (
276
+ v.contiguous()
277
+ .view(-1, bsz * self.num_heads, self.head_dim)
278
+ .transpose(0, 1)
279
+ )
280
+
281
+ if saved_state is not None:
282
+ # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
283
+ if "prev_key" in saved_state:
284
+ _prev_key = saved_state["prev_key"]
285
+ assert _prev_key is not None
286
+ prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
287
+ if static_kv:
288
+ k = prev_key
289
+ else:
290
+ assert k is not None
291
+ k = torch.cat([prev_key, k], dim=1)
292
+ if "prev_value" in saved_state:
293
+ _prev_value = saved_state["prev_value"]
294
+ assert _prev_value is not None
295
+ prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
296
+ if static_kv:
297
+ v = prev_value
298
+ else:
299
+ assert v is not None
300
+ v = torch.cat([prev_value, v], dim=1)
301
+ prev_key_padding_mask: Optional[Tensor] = None
302
+ if "prev_key_padding_mask" in saved_state:
303
+ prev_key_padding_mask = saved_state["prev_key_padding_mask"]
304
+ assert k is not None and v is not None
305
+ key_padding_mask = MultiheadLinearAttention._append_prev_key_padding_mask(
306
+ key_padding_mask=key_padding_mask,
307
+ prev_key_padding_mask=prev_key_padding_mask,
308
+ batch_size=bsz,
309
+ src_len=k.size(1),
310
+ static_kv=static_kv,
311
+ )
312
+
313
+ saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
314
+ saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
315
+ saved_state["prev_key_padding_mask"] = key_padding_mask
316
+ # In this branch incremental_state is never None
317
+ assert incremental_state is not None
318
+ incremental_state = self._set_input_buffer(incremental_state, saved_state)
319
+ assert k is not None
320
+ src_len = k.size(1)
321
+
322
+ if self.add_zero_attn:
323
+ assert v is not None
324
+ src_len += 1
325
+ k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
326
+ v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
327
+ if attn_mask is not None:
328
+ attn_mask = torch.cat(
329
+ [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
330
+ )
331
+
332
+ attn_weights = torch.bmm(q, k.transpose(1, 2))
333
+ attn_weights = MultiheadLinearAttention.apply_sparse_mask(
334
+ attn_weights, tgt_len, src_len, bsz
335
+ )
336
+
337
+ assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]
338
+
339
+ if attn_mask is not None:
340
+ attn_mask = attn_mask.unsqueeze(0)
341
+ if self.onnx_trace:
342
+ attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
343
+ attn_weights += attn_mask
344
+
345
+ if before_softmax:
346
+ return attn_weights, v
347
+
348
+ attn_weights_float = utils.softmax(
349
+ attn_weights, dim=-1, onnx_trace=self.onnx_trace
350
+ )
351
+ attn_weights = attn_weights_float.type_as(attn_weights)
352
+ attn_probs = F.dropout(
353
+ attn_weights,
354
+ p=self.dropout,
355
+ training=self.training,
356
+ )
357
+ assert v is not None
358
+ attn = torch.bmm(attn_probs, v)
359
+ assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
360
+ if self.onnx_trace and attn.size(1) == 1:
361
+ # when ONNX tracing a single decoder step (sequence length == 1)
362
+ # the transpose is a no-op copy before view, thus unnecessary
363
+ attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
364
+ else:
365
+ attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
366
+ attn = self.out_proj(attn)
367
+ attn_weights: Optional[Tensor] = None
368
+ if need_weights:
369
+ attn_weights = attn_weights_float.view(
370
+ bsz, self.num_heads, tgt_len, src_len
371
+ ).transpose(1, 0)
372
+ if not need_head_weights:
373
+ # average attention weights over heads
374
+ attn_weights = attn_weights.mean(dim=0)
375
+
376
+ return attn, attn_weights
377
+
378
+ @staticmethod
379
+ def _append_prev_key_padding_mask(
380
+ key_padding_mask: Optional[Tensor],
381
+ prev_key_padding_mask: Optional[Tensor],
382
+ batch_size: int,
383
+ src_len: int,
384
+ static_kv: bool,
385
+ ) -> Optional[Tensor]:
386
+ # saved key padding masks have shape (bsz, seq_len)
387
+ if prev_key_padding_mask is not None and static_kv:
388
+ new_key_padding_mask = prev_key_padding_mask
389
+ elif prev_key_padding_mask is not None and key_padding_mask is not None:
390
+ new_key_padding_mask = torch.cat(
391
+ [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
392
+ )
393
+ # During incremental decoding, as the padding token enters and
394
+ # leaves the frame, there will be a time when prev or current
395
+ # is None
396
+ elif prev_key_padding_mask is not None:
397
+ filler = torch.zeros(
398
+ (batch_size, src_len - prev_key_padding_mask.size(1)),
399
+ device=prev_key_padding_mask.device,
400
+ )
401
+ new_key_padding_mask = torch.cat(
402
+ [prev_key_padding_mask.float(), filler.float()], dim=1
403
+ )
404
+ elif key_padding_mask is not None:
405
+ filler = torch.zeros(
406
+ (batch_size, src_len - key_padding_mask.size(1)),
407
+ device=key_padding_mask.device,
408
+ )
409
+ new_key_padding_mask = torch.cat(
410
+ [filler.float(), key_padding_mask.float()], dim=1
411
+ )
412
+ else:
413
+ new_key_padding_mask = prev_key_padding_mask
414
+ return new_key_padding_mask
415
+
416
+ @torch.jit.export
417
+ def reorder_incremental_state(
418
+ self,
419
+ incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
420
+ new_order: Tensor,
421
+ ):
422
+ """Reorder buffered internal state (for incremental generation)."""
423
+ input_buffer = self._get_input_buffer(incremental_state)
424
+ if input_buffer is not None:
425
+ for k in input_buffer.keys():
426
+ input_buffer_k = input_buffer[k]
427
+ if input_buffer_k is not None:
428
+ if self.encoder_decoder_attention and input_buffer_k.size(
429
+ 0
430
+ ) == new_order.size(0):
431
+ break
432
+ input_buffer[k] = input_buffer_k.index_select(0, new_order)
433
+ incremental_state = self._set_input_buffer(incremental_state, input_buffer)
434
+ return incremental_state
435
+
436
+ def _get_input_buffer(
437
+ self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
438
+ ) -> Dict[str, Optional[Tensor]]:
439
+ result = self.get_incremental_state(incremental_state, "attn_state")
440
+ if result is not None:
441
+ return result
442
+ else:
443
+ empty_result: Dict[str, Optional[Tensor]] = {}
444
+ return empty_result
445
+
446
+ def _set_input_buffer(
447
+ self,
448
+ incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
449
+ buffer: Dict[str, Optional[Tensor]],
450
+ ):
451
+ return self.set_incremental_state(incremental_state, "attn_state", buffer)
452
+
453
+ def apply_sparse_mask(attn_weights, tgt_len: int, src_len: int, bsz: int):
454
+ return attn_weights
455
+
456
+ def upgrade_state_dict_named(self, state_dict, name):
457
+ prefix = name + "." if name != "" else ""
458
+ items_to_add = {}
459
+ keys_to_remove = []
460
+ for k in state_dict.keys():
461
+ if k.endswith(prefix + "in_proj_weight"):
462
+ # in_proj_weight used to be q + k + v with same dimensions
463
+ dim = int(state_dict[k].shape[0] / 3)
464
+ items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
465
+ items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim]
466
+ items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :]
467
+
468
+ keys_to_remove.append(k)
469
+
470
+ k_bias = prefix + "in_proj_bias"
471
+ if k_bias in state_dict.keys():
472
+ dim = int(state_dict[k].shape[0] / 3)
473
+ items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim]
474
+ items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][
475
+ dim : 2 * dim
476
+ ]
477
+ items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :]
478
+
479
+ keys_to_remove.append(prefix + "in_proj_bias")
480
+
481
+ for k in keys_to_remove:
482
+ del state_dict[k]
483
+
484
+ for key, value in items_to_add.items():
485
+ state_dict[key] = value
fairseq-0.10.2/examples/multilingual/README.md ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Multilingual Translation
2
+
3
+ [[Multilingual Translation with Extensible Multilingual Pretraining and Finetuning, https://arxiv.org/abs/2008.00401]](https://arxiv.org/abs/2008.00401)
4
+
5
+ ## Introduction
6
+
7
+ This work is for training multilingual translation models with multiple bitext datasets. This multilingual translation framework supports (see [[training section]](#Training) and [[finetuning section]](#Finetuning) for examples)
8
+
9
+ * temperature-based sampling over unbalanced datasets of different translation directions
10
+ - --sampling-method with
11
+ choices=['uniform', 'temperature', 'concat']
12
+ - --sampling-temperature
13
+ * configurable to automatically add source and/or target language tokens to source/target sentences using data which are prepared in the same way as bilingual training
14
+ - --encoder-langtok with choices=['src', 'tgt', None] to specify whether to add source or target language tokens to the source sentences
15
+ - --decoder-langtok (binary option) to specify whether to add target language tokens to the target sentences or not
16
+ * finetuning mBART pretrained models for multilingual translation
17
+ - --finetune-from-model to specify the path from which to load the pretrained model
18
+
19
+ ## Preprocessing data
20
+ Multilingual training requires a joint BPE vocab. Please follow [mBART's preprocessing steps](https://github.com/pytorch/fairseq/tree/master/examples/mbart#bpe-data) to reuse our pretrained sentence-piece model.
21
+
22
+ You can also train a joint BPE model on your own dataset and then follow the steps in [[link]](https://github.com/pytorch/fairseq/tree/master/examples/translation#multilingual-translation).
23
+
24
+ ## Training
25
+
26
+
27
+ ```bash
28
+ lang_pairs=<language pairs to be trained, e.g. "en-cs,cs-en">
29
+ path_2_data=<set to data path>
30
+ lang_list=<a file which contains a list of languages separated by new lines>
31
+
32
+ fairseq-train $path_2_data \
33
+ --encoder-normalize-before --decoder-normalize-before \
34
+ --arch transformer --layernorm-embedding \
35
+ --task translation_multi_simple_epoch \
36
+ --sampling-method "temperature" \
37
+ --sampling-temperature 1.5 \
38
+ --encoder-langtok "src" \
39
+ --decoder-langtok \
40
+ --lang-dict "$lang_list" \
41
+ --lang-pairs "$lang_pairs" \
42
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
43
+ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
44
+ --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
45
+ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
46
+ --max-tokens 1024 --update-freq 2 \
47
+ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
48
+ --seed 222 --log-format simple --log-interval 2
49
+ ```
50
+
51
+ ## Finetuning
52
+ We can also finetune multilingual models from a monolingual pretrained model, e.g. [mBART](https://github.com/pytorch/fairseq/tree/master/examples/mbart).
53
+ ```bash
54
+ lang_pairs=<language pairs to be trained, e.g. "en-cs,cs-en">
55
+ path_2_data=<set to data path>
56
+ lang_list=<a file which contains a list of languages separated by new lines>
57
+ pretrained_model=<path to the pretrained model, e.g. mbart or another trained multilingual model>
58
+
59
+ fairseq-train $path_2_data \
60
+ --finetune-from-model $pretrained_model \
61
+ --encoder-normalize-before --decoder-normalize-before \
62
+ --arch transformer --layernorm-embedding \
63
+ --task translation_multi_simple_epoch \
64
+ --sampling-method "temperature" \
65
+ --sampling-temperature 1.5 \
66
+ --encoder-langtok "src" \
67
+ --decoder-langtok \
68
+ --lang-dict "$lang_list" \
69
+ --lang-pairs "$lang_pairs" \
70
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
71
+ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
72
+ --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
73
+ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
74
+ --max-tokens 1024 --update-freq 2 \
75
+ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
76
+ --seed 222 --log-format simple --log-interval 2
77
+ ```
78
+ ## Generate
79
+ The following command uses the multilingual task (translation_multi_simple_epoch) to generate translation from $source_lang to $target_lang on the test dataset. During generation, the source language tokens are added to source sentences and the target language tokens are added as the starting token to decode target sentences. Options --lang-dict and --lang-pairs are needed to tell the generation process the ordered list of languages and translation directions that the trained model is aware of; they will need to be consistent with the training.
80
+
81
+ ```bash
82
+ model=<multilingual model>
83
+ source_lang=<source language>
84
+ target_lang=<target language>
85
+
86
+ fairseq-generate $path_2_data \
87
+ --path $model \
88
+ --task translation_multi_simple_epoch \
89
+ --gen-subset test \
90
+ --source-lang $source_lang \
91
+ --target-lang $target_lang \
92
+ --sacrebleu --remove-bpe 'sentencepiece' \
93
+ --batch-size 32 \
94
+ --encoder-langtok "src" \
95
+ --decoder-langtok \
96
+ --lang-dict "$lang_list" \
97
+ --lang-pairs "$lang_pairs" > ${source_lang}_${target_lang}.txt
98
+ ```
99
+ Fairseq will generate the translation into a file ${source_lang}_${target_lang}.txt with the sacrebleu score at the end.
100
+
101
+ You can also use a customized tokenizer to compare the performance with the literature. For example, you can get a tokenizer [here](https://github.com/rsennrich/wmt16-scripts) and do the following:
102
+ ```bash
103
+ TOKENIZER=<path to a customized tokenizer for decoding evaluation>
104
+ TOK_CMD=<"$TOKENIZER $target_lang" or cat for sacrebleu>
105
+
106
+ cat ${source_lang}_${target_lang}.txt | grep -P "^H" |sort -V |cut -f 3- |$TOK_CMD > ${source_lang}_${target_lang}.hyp
107
+ cat ${source_lang}_${target_lang}.txt | grep -P "^T" |sort -V |cut -f 2- |$TOK_CMD > ${source_lang}_${target_lang}.ref
108
+ sacrebleu -tok 'none' -s 'none' ${source_lang}_${target_lang}.ref < ${source_lang}_${target_lang}.hyp
109
+ ```
110
+
111
+
112
+
113
+ ## Citation
114
+
115
+ ```bibtex
116
+ @article{tang2020multilingual,
117
+ title={Multilingual Translation with Extensible Multilingual Pretraining and Finetuning},
118
+ author={Yuqing Tang and Chau Tran and Xian Li and Peng-Jen Chen and Naman Goyal and Vishrav Chaudhary and Jiatao Gu and Angela Fan},
119
+ year={2020},
120
+ eprint={2008.00401},
121
+ archivePrefix={arXiv},
122
+ primaryClass={cs.CL}
123
+ }
124
+ ```
fairseq-0.10.2/examples/multilingual/finetune_multilingual_model.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ path_2_data=$1 # <path to data> which contains binarized data for each directions
4
+ lang_list=$2 # <path to a file which contains a list of languages separated by new lines>
5
+ lang_pairs=$3 # a list of language pairs to train multilingual models, e.g. "en-fr,en-cs,fr-en,cs-en"
6
+ # pretrained can be an mBART pretrained model as well
7
+ pretrained_model=$4 #<path to a pretrained model>
8
+
9
+
10
+ fairseq-train "$path_2_data" \
11
+ --encoder-normalize-before --decoder-normalize-before \
12
+ --arch transformer --layernorm-embedding \
13
+ --task translation_multi_simple_epoch \
14
+ --finetune-from-model "$pretrained_model" \
15
+ --sampling-method "temperature" \
16
+ --sampling-temperature "1.5" \
17
+ --encoder-langtok "src" \
18
+ --decoder-langtok \
19
+ --lang-dict "$lang_list" \
20
+ --lang-pairs "$lang_pairs" \
21
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
22
+ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
23
+ --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
24
+ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
25
+ --max-tokens 1024 --update-freq 2 \
26
+ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
27
+ --seed 222 --log-format simple --log-interval 2
fairseq-0.10.2/examples/multilingual/multilingual_fairseq_gen.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ lang_pairs="en-fr,en-cs,fr-en,cs-en"
4
+ path_2_data=$1 # <path to data>
5
+ lang_list=$2 # <path to a file which contains a list of languages separated by new lines>
6
+ model=$3 # <path to a trained model>
7
+ source_lang=cs
8
+ target_lang=en
9
+
10
+ fairseq-generate "$path_2_data" \
11
+ --path "$model" \
12
+ --task translation_multi_simple_epoch \
13
+ --gen-subset test \
14
+ --source-lang "$source_lang" \
15
+ --target-lang "$target_lang" \
16
+ --sacrebleu --remove-bpe 'sentencepiece' \
17
+ --batch-size 32 \
18
+ --encoder-langtok "src" \
19
+ --decoder-langtok \
20
+ --lang-dict "$lang_list" \
21
+ --lang-pairs "$lang_pairs"
fairseq-0.10.2/examples/multilingual/train_multilingual_model.sh ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ path_2_data=$1 # <path to data> which contains binarized data for each direction
4
+ lang_list=$2 # <path to a file which contains a list of languages separated by new lines>
5
+ lang_pairs=$3 # a list of language pairs to train multilingual models, e.g. "en-fr,en-cs,fr-en,cs-en"
6
+
7
+ fairseq-train "$path_2_data" \
8
+ --encoder-normalize-before --decoder-normalize-before \
9
+ --arch transformer --layernorm-embedding \
10
+ --task translation_multi_simple_epoch \
11
+ --sampling-method "temperature" \
12
+ --sampling-temperature 1.5 \
13
+ --encoder-langtok "src" \
14
+ --decoder-langtok \
15
+ --lang-dict "$lang_list" \
16
+ --lang-pairs "$lang_pairs" \
17
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
18
+ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
19
+ --lr-scheduler inverse_sqrt --lr 3e-05 --min-lr -1 --warmup-updates 2500 --max-update 40000 \
20
+ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
21
+ --max-tokens 1024 --update-freq 2 \
22
+ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
23
+ --seed 222 --log-format simple --log-interval 2
fairseq-0.10.2/examples/nonautoregressive_translation/README.md ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Non-autoregressive Neural Machine Translation (NAT)
2
+
3
+ This page mainly includes instructions for reproducing results from the following papers
4
+ * [Levenshtein Transformer (Gu et al., 2019)](https://arxiv.org/abs/1905.11006).
5
+ * [Understanding Knowledge Distillation in Non-autoregressive Machine Translation (Zhou et al., 2019)](https://arxiv.org/abs/1911.02727).
6
+
7
+ We also provided our own implementations for several popular non-autoregressive-based models as reference:<br>
8
+ * [Non-Autoregressive Neural Machine Translation (Gu et al., 2017)](https://arxiv.org/abs/1711.02281)<br>
9
+ * [Deterministic Non-Autoregressive Neural Sequence Modeling by Iterative Refinement (Lee et al., 2018)](https://arxiv.org/abs/1802.06901)<br>
10
+ * [Insertion Transformer: Flexible Sequence Generation via Insertion Operations (Stern et al., 2019)](https://arxiv.org/abs/1902.03249)<br>
11
+ * [Mask-Predict: Parallel Decoding of Conditional Masked Language Models (Ghazvininejad et al., 2019)](https://arxiv.org/abs/1904.09324v2)<br>
12
+ * [Fast Structured Decoding for Sequence Models (Sun et al., 2019)](https://arxiv.org/abs/1910.11555)
13
+
14
+ ## Dataset
15
+
16
+ First, follow the [instructions to download and preprocess the WMT'14 En-De dataset](../translation#wmt14-english-to-german-convolutional).
17
+ Make sure to learn a joint vocabulary by passing the `--joined-dictionary` option to `fairseq-preprocess`.
18
+
19
+ ### Knowledge Distillation
20
+ Following [Gu et al. 2019](https://arxiv.org/abs/1905.11006), [knowledge distillation](https://arxiv.org/abs/1606.07947) from an autoregressive model can effectively simplify the training data distribution, which is sometimes essential for NAT-based models to learn good translations.
21
+ The easiest way of performing distillation is to follow the [instructions of training a standard transformer model](../translation) on the same data, and then decode the training set to produce a distillation dataset for NAT.
22
+
23
+ ### Download
24
+ We also provided the preprocessed [original](http://dl.fbaipublicfiles.com/nat/original_dataset.zip) and [distillation](http://dl.fbaipublicfiles.com/nat/distill_dataset.zip) datasets. Please build the binarized dataset on your own.
25
+
26
+
27
+ ## Train a model
28
+
29
+ Then we can train a nonautoregressive model using the `translation_lev` task and a new criterion `nat_loss`.
30
+ Use the `--noise` flag to specify the input noise used on the target sentences.
31
+ In default, we run the task for *Levenshtein Transformer*, with `--noise='random_delete'`. Full scripts to run other models can also be found [here](./scripts.md).
32
+
33
+ The following command will train a *Levenshtein Transformer* on the binarized dataset.
34
+
35
+ ```bash
36
+ fairseq-train \
37
+ data-bin/wmt14_en_de_distill \
38
+ --save-dir checkpoints \
39
+ --ddp-backend=no_c10d \
40
+ --task translation_lev \
41
+ --criterion nat_loss \
42
+ --arch levenshtein_transformer \
43
+ --noise random_delete \
44
+ --share-all-embeddings \
45
+ --optimizer adam --adam-betas '(0.9,0.98)' \
46
+ --lr 0.0005 --lr-scheduler inverse_sqrt \
47
+ --min-lr '1e-09' --warmup-updates 10000 \
48
+ --warmup-init-lr '1e-07' --label-smoothing 0.1 \
49
+ --dropout 0.3 --weight-decay 0.01 \
50
+ --decoder-learned-pos \
51
+ --encoder-learned-pos \
52
+ --apply-bert-init \
53
+ --log-format 'simple' --log-interval 100 \
54
+ --fixed-validation-seed 7 \
55
+ --max-tokens 8000 \
56
+ --save-interval-updates 10000 \
57
+ --max-update 300000
58
+ ```
59
+
60
+ ## Translate
61
+
62
+ Once a model is trained, we can generate translations using an `iterative_refinement_generator`, which, starting from the model's initial output, will iteratively read and greedily refine the translation until (1) the model predicts the same translation for two consecutive iterations; or (2) the generator reaches the maximum number of iterations (`--iter-decode-max-iter`). Use `--print-step` to check the actual # of iterations for each sentence.
63
+
64
+ For *Levenshtein Transformer*, it sometimes helps to apply a `--iter-decode-eos-penalty` (typically, 0~3) to penalize the model finishing generation too early and generating too short translations.
65
+
66
+ For example, to generate with `--iter-decode-max-iter=9`:
67
+ ```bash
68
+ fairseq-generate \
69
+ data-bin/wmt14_en_de_distill \
70
+ --gen-subset test \
71
+ --task translation_lev \
72
+ --path checkpoints/checkpoint_best.pt \
73
+ --iter-decode-max-iter 9 \
74
+ --iter-decode-eos-penalty 0 \
75
+ --beam 1 --remove-bpe \
76
+ --print-step \
77
+ --batch-size 400
78
+ ```
79
+ At the end of generation, we can see the tokenized BLEU score for the translations.
80
+
81
+ ## Advanced Decoding Methods
82
+ ### Ensemble
83
+ The NAT models use special implementations of [ensembling](https://github.com/fairinternal/fairseq-py/blob/b98d88da52f2f21f1b169bab8c70c1c4ca19a768/fairseq/sequence_generator.py#L522) to support iterative refinement and a variety of parallel operations in different models, while it shares the same API as standard autoregressive models as follows:
84
+ ```bash
85
+ fairseq-generate \
86
+ data-bin/wmt14_en_de_distill \
87
+ --gen-subset test \
88
+ --task translation_lev \
89
+ --path checkpoint_1.pt:checkpoint_2.pt:checkpoint_3.pt \
90
+ --iter-decode-max-iter 9 \
91
+ --iter-decode-eos-penalty 0 \
92
+ --beam 1 --remove-bpe \
93
+ --print-step \
94
+ --batch-size 400
95
+ ```
96
+ We use ``:`` to split multiple models. Note that, not all NAT models support ensembling for now.
97
+
98
+
99
+ ### Length-beam
100
+ For models that predict lengths before decoding (e.g. the vanilla NAT, Mask-Predict, etc), it is possible to improve the translation quality by varying the target lengths around the predicted value, and translating the same example multiple times in parallel. We can select the best translation with the highest scores defined by your model's output.
101
+
102
+ Note that, not all models support length beams. For models which dynamically change the lengths (e.g. *Insertion Transformer*, *Levenshtein Transformer*), the same trick does not apply.
103
+
104
+ ### Re-ranking
105
+ If the model generates multiple translations with length beam, we can also introduce an autoregressive model to rerank the translations, considering that scoring with an autoregressive model is much faster than decoding from it.
106
+
107
+ For example, to generate translations with length beam and reranking,
108
+ ```bash
109
+ fairseq-generate \
110
+ data-bin/wmt14_en_de_distill \
111
+ --gen-subset test \
112
+ --task translation_lev \
113
+ --path checkpoints/checkpoint_best.pt:at_checkpoints/checkpoint_best.pt \
114
+ --iter-decode-max-iter 9 \
115
+ --iter-decode-eos-penalty 0 \
116
+ --iter-decode-with-beam 9 \
117
+ --iter-decode-with-external-reranker \
118
+ --beam 1 --remove-bpe \
119
+ --print-step \
120
+ --batch-size 100
121
+ ```
122
+ Note that we need to make sure the autoregressive model shares the same vocabulary as our target non-autoregressive model.
123
+
124
+
125
+ ## Citation
126
+
127
+ ```bibtex
128
+ @incollection{NIPS2019_9297,
129
+ title = {Levenshtein Transformer},
130
+ author = {Gu, Jiatao and Wang, Changhan and Zhao, Junbo},
131
+ booktitle = {Advances in Neural Information Processing Systems 32},
132
+ editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett},
133
+ pages = {11179--11189},
134
+ year = {2019},
135
+ publisher = {Curran Associates, Inc.},
136
+ url = {http://papers.nips.cc/paper/9297-levenshtein-transformer.pdf}
137
+ }
138
+ ```
139
+ ```bibtex
140
+ @article{zhou2019understanding,
141
+ title={Understanding Knowledge Distillation in Non-autoregressive Machine Translation},
142
+ author={Zhou, Chunting and Neubig, Graham and Gu, Jiatao},
143
+ journal={arXiv preprint arXiv:1911.02727},
144
+ year={2019}
145
+ }
146
+ ```
fairseq-0.10.2/examples/nonautoregressive_translation/scripts.md ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Examples of Training scripts for Non-autoregressive Machine Translation models
2
+
3
+ ### Non-autoregressive Transformer (NAT, Gu et al., 2017)
4
+ Note that we need to have an additional module to perform "length prediction" (`--length-loss-factor`) before generating the whole sequence.
5
+ ```bash
6
+ fairseq-train \
7
+ data-bin/wmt14_en_de_distill \
8
+ --save-dir checkpoints \
9
+ --ddp-backend=no_c10d \
10
+ --task translation_lev \
11
+ --criterion nat_loss \
12
+ --arch nonautoregressive_transformer \
13
+ --noise full_mask \
14
+ --share-all-embeddings \
15
+ --optimizer adam --adam-betas '(0.9,0.98)' \
16
+ --lr 0.0005 --lr-scheduler inverse_sqrt \
17
+ --min-lr '1e-09' --warmup-updates 10000 \
18
+ --warmup-init-lr '1e-07' --label-smoothing 0.1 \
19
+ --dropout 0.3 --weight-decay 0.01 \
20
+ --decoder-learned-pos \
21
+ --encoder-learned-pos \
22
+ --pred-length-offset \
23
+ --length-loss-factor 0.1 \
24
+ --apply-bert-init \
25
+ --log-format 'simple' --log-interval 100 \
26
+ --fixed-validation-seed 7 \
27
+ --max-tokens 8000 \
28
+ --save-interval-updates 10000 \
29
+ --max-update 300000
30
+ ```
31
+
32
+ ### Fast Structured Decoding for Sequence Models (NAT-CRF, Sun et al., 2019)
33
+ Note that we implemented a low-rank approximated CRF model by setting `--crf-lowrank-approx=32` and `--crf-beam-approx=64` as described in the original paper. All other settings are the same as the vanilla NAT model.
34
+ ```bash
35
+ fairseq-train \
36
+ data-bin/wmt14_en_de_distill \
37
+ --save-dir checkpoints \
38
+ --ddp-backend=no_c10d \
39
+ --task translation_lev \
40
+ --criterion nat_loss \
41
+ --arch nacrf_transformer \
42
+ --noise full_mask \
43
+ --share-all-embeddings \
44
+ --optimizer adam --adam-betas '(0.9,0.98)' \
45
+ --lr 0.0005 --lr-scheduler inverse_sqrt \
46
+ --min-lr '1e-09' --warmup-updates 10000 \
47
+ --warmup-init-lr '1e-07' --label-smoothing 0.1 \
48
+ --dropout 0.3 --weight-decay 0.01 \
49
+ --decoder-learned-pos \
50
+ --encoder-learned-pos \
51
+ --pred-length-offset \
52
+ --length-loss-factor 0.1 \
53
+ --word-ins-loss-factor 0.5 \
54
+ --crf-lowrank-approx 32 \
55
+ --crf-beam-approx 64 \
56
+ --apply-bert-init \
57
+ --log-format 'simple' --log-interval 100 \
58
+ --fixed-validation-seed 7 \
59
+ --max-tokens 8000 \
60
+ --save-interval-updates 10000 \
61
+ --max-update 300000
62
+ ```
63
+
64
+
65
+ ### Non-autoregressive Transformer with Iterative Refinement (iNAT, Lee et al., 2018)
66
+ Note that `--train-step` means how many iterations of refinement we used during training, and `--dae-ratio` controls the ratio of denoising auto-encoder training described in the original paper.
67
+ ```bash
68
+ fairseq-train \
69
+ data-bin/wmt14_en_de_distill \
70
+ --save-dir checkpoints \
71
+ --ddp-backend=no_c10d \
72
+ --task translation_lev \
73
+ --criterion nat_loss \
74
+ --arch iterative_nonautoregressive_transformer \
75
+ --noise full_mask \
76
+ --share-all-embeddings \
77
+ --optimizer adam --adam-betas '(0.9,0.98)' \
78
+ --lr 0.0005 --lr-scheduler inverse_sqrt \
79
+ --min-lr '1e-09' --warmup-updates 10000 \
80
+ --warmup-init-lr '1e-07' --label-smoothing 0.1 \
81
+ --dropout 0.3 --weight-decay 0.01 \
82
+ --decoder-learned-pos \
83
+ --encoder-learned-pos \
84
+ --pred-length-offset \
85
+ --length-loss-factor 0.1 \
86
+ --train-step 4 \
87
+ --dae-ratio 0.5 \
88
+ --stochastic-approx \
89
+ --apply-bert-init \
90
+ --log-format 'simple' --log-interval 100 \
91
+ --fixed-validation-seed 7 \
92
+ --max-tokens 8000 \
93
+ --save-interval-updates 10000 \
94
+ --max-update 300000
95
+ ```
96
+
97
+ ### Insertion Transformer (InsT, Stern et al., 2019)
98
+ Note that we need to specify the "slot-loss" (uniform or balanced tree) described in the original paper. Here we use `--label-tau` to control the temperature.
99
+
100
+ ```bash
101
+ fairseq-train \
102
+ data-bin/wmt14_en_de_distill \
103
+ --save-dir checkpoints \
104
+ --ddp-backend=no_c10d \
105
+ --task translation_lev \
106
+ --criterion nat_loss \
107
+ --arch insertion_transformer \
108
+ --noise random_delete \
109
+ --share-all-embeddings \
110
+ --optimizer adam --adam-betas '(0.9,0.98)' \
111
+ --lr 0.0005 --lr-scheduler inverse_sqrt \
112
+ --min-lr '1e-09' --warmup-updates 10000 \
113
+ --warmup-init-lr '1e-07' --label-smoothing 0.1 \
114
+ --dropout 0.3 --weight-decay 0.01 \
115
+ --decoder-learned-pos \
116
+ --encoder-learned-pos \
117
+ --apply-bert-init \
118
+ --log-format 'simple' --log-interval 100 \
119
+ --fixed-validation-seed 7 \
120
+ --max-tokens 8000 \
121
+ --save-interval-updates 10000 \
122
+ --max-update 300000
123
+ ```
124
+
125
+
126
+ ### Mask Predict (CMLM, Ghazvininejad et al., 2019)
127
+ ```bash
128
+ fairseq-train \
129
+ data-bin/wmt14_en_de_distill \
130
+ --save-dir checkpoints \
131
+ --ddp-backend=no_c10d \
132
+ --task translation_lev \
133
+ --criterion nat_loss \
134
+ --arch cmlm_transformer \
135
+ --noise random_mask \
136
+ --share-all-embeddings \
137
+ --optimizer adam --adam-betas '(0.9,0.98)' \
138
+ --lr 0.0005 --lr-scheduler inverse_sqrt \
139
+ --min-lr '1e-09' --warmup-updates 10000 \
140
+ --warmup-init-lr '1e-07' --label-smoothing 0.1 \
141
+ --dropout 0.3 --weight-decay 0.01 \
142
+ --decoder-learned-pos \
143
+ --encoder-learned-pos \
144
+ --apply-bert-init \
145
+ --log-format 'simple' --log-interval 100 \
146
+ --fixed-validation-seed 7 \
147
+ --max-tokens 8000 \
148
+ --save-interval-updates 10000 \
149
+ --max-update 300000
150
+ ```
151
+
152
+
153
+
154
+
155
+ ### Levenshtein Transformer (LevT, Gu et al., 2019)
156
+ ```bash
157
+ fairseq-train \
158
+ data-bin/wmt14_en_de_distill \
159
+ --save-dir checkpoints \
160
+ --ddp-backend=no_c10d \
161
+ --task translation_lev \
162
+ --criterion nat_loss \
163
+ --arch levenshtein_transformer \
164
+ --noise random_delete \
165
+ --share-all-embeddings \
166
+ --optimizer adam --adam-betas '(0.9,0.98)' \
167
+ --lr 0.0005 --lr-scheduler inverse_sqrt \
168
+ --min-lr '1e-09' --warmup-updates 10000 \
169
+ --warmup-init-lr '1e-07' --label-smoothing 0.1 \
170
+ --dropout 0.3 --weight-decay 0.01 \
171
+ --decoder-learned-pos \
172
+ --encoder-learned-pos \
173
+ --apply-bert-init \
174
+ --log-format 'simple' --log-interval 100 \
175
+ --fixed-validation-seed 7 \
176
+ --max-tokens 8000 \
177
+ --save-interval-updates 10000 \
178
+ --max-update 300000
179
+ ```
fairseq-0.10.2/examples/pay_less_attention_paper/README.md ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)
2
+
3
+ This page contains pointers to pre-trained models as well as instructions on how to train new models for [our paper](https://arxiv.org/abs/1901.10430).
4
+
5
+ ## Citation:
6
+ ```bibtex
7
+ @inproceedings{wu2018pay,
8
+ title = {Pay Less Attention with Lightweight and Dynamic Convolutions},
9
+ author = {Felix Wu and Angela Fan and Alexei Baevski and Yann Dauphin and Michael Auli},
10
+ booktitle = {International Conference on Learning Representations},
11
+ year = {2019},
12
+ url = {https://arxiv.org/abs/1901.10430},
13
+ }
14
+ ```
15
+
16
+ ## Translation
17
+
18
+ ### Pre-trained models
19
+ For some datasets we release models without GLUs which are faster at inference.
20
+
21
+ Model | Description | Dataset | Download
22
+ ---|---|---|---
23
+ `lightconv.no_glu.iwslt14.de-en` | LightConv (without GLUs) | [IWSLT14 German-English](https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.lightconv.tar.gz) <br> IWSLT14 test: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/iwslt14.de-en.test.tar.bz2)
24
+ `dynamicconv.no_glu.iwslt14.de-en` | DynamicConv (without GLUs) | [IWSLT14 German-English](https://wit3.fbk.eu/archive/2014-01/texts/de/en/de-en.tgz) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.dynamicconv.tar.gz) <br> IWSLT14 test: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/iwslt14.de-en.test.tar.bz2)
25
+ `lightconv.no_glu.wmt16.en-de` | LightConv (without GLUs) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
26
+ `dynamicconv.no_glu.wmt16.en-de` | DynamicConv (without GLUs) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
27
+ `lightconv.glu.wmt16.en-de` | LightConv | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
28
+ `dynamicconv.glu.wmt16.en-de` | DynamicConv | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
29
+ `lightconv.glu.wmt14.en-fr` | LightConv | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.lightconv-glu.tar.gz) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2)
30
+ `dynamicconv.glu.wmt14.en-fr` | DynamicConv | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.dynamicconv-glu.tar.gz) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2)
31
+ `lightconv.glu.wmt17.zh-en` | LightConv | [WMT17 Chinese-English](http://statmt.org/wmt17/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.lightconv-glu.tar.gz) <br> newstest2017: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.zh-en.newstest2017.tar.bz2)
32
+ `dynamicconv.glu.wmt17.zh-en` | DynamicConv | [WMT17 Chinese-English](http://statmt.org/wmt17/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.dynamicconv-glu.tar.gz) <br> newstest2017: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.zh-en.newstest2017.tar.bz2)
33
+
34
+ ### Memory-Efficient CUDA Kernels
35
+
36
+ Since the PyTorch implementations of Light/Dynamic conv are quite memory intensive, we have developed CUDA kernels that implement the light and dynamic convolution operator in a memory-efficient and performant manner. For large sequence lengths, these kernels save about 50% memory compared to the PyTorch equivalent.
37
+
38
+ To install the kernels, use the commands below. Once installed, they will automatically be used in place of the PyTorch implementations whenever a light or dynamic convolution is used.
39
+
40
+ ```sh
41
+ # to install lightconv
42
+ cd fairseq/modules/lightconv_layer
43
+ python cuda_function_gen.py
44
+ python setup.py install
45
+
46
+ # to install dynamicconv
47
+ cd fairseq/modules/dynamicconv_layer
48
+ python cuda_function_gen.py
49
+ python setup.py install
50
+ ```
51
+
52
+ ### Example usage (torch.hub)
53
+
54
+ We require a few additional Python dependencies for preprocessing:
55
+ ```bash
56
+ pip install sacremoses subword_nmt
57
+ ```
58
+
59
+ Interactive translation via PyTorch Hub:
60
+ ```python
61
+ import torch
62
+
63
+ # List available models
64
+ torch.hub.list('pytorch/fairseq') # [..., 'lightconv.glu.wmt17.zh-en', ... ]
65
+
66
+ # Load a transformer trained on WMT'16 En-De
67
+ zh2en = torch.hub.load('pytorch/fairseq', 'lightconv.glu.wmt17.zh-en', tokenizer='moses', bpe='subword_nmt')
68
+
69
+ # The underlying model is available under the *models* attribute
70
+ assert isinstance(zh2en.models[0], fairseq.models.lightconv.LightConvModel)
71
+
72
+ # Translate a sentence
73
+ zh2en.translate('你好 世界')
74
+ # 'Hello World'
75
+ ```
76
+
77
+ Loading custom models:
78
+ ```python
79
+ from fairseq.models.lightconv import LightConvModel
80
+ en2fr = LightConvModel.from_pretrained(
81
+ '/path/to/checkpoints',
82
+ checkpoint_file='checkpoint_best.pt',
83
+ data_name_or_path='data-bin/wmt14_en_fr',
84
+ bpe='subword_nmt',
85
+ bpe_codes='data-bin/wmt14_en_fr/en.code'
86
+ )
87
+ en2fr.translate('Hello world!')
88
+ # 'Bonjour le monde'
89
+ ```
90
+
91
+ ### Preprocessing the training datasets
92
+
93
+ Please follow the instructions in [`examples/translation/README.md`](../translation/README.md) to preprocess the data.
94
+
95
+ ### Training and evaluation options:
96
+ To use the model without GLU, please set `--encoder-glu 0 --decoder-glu 0`.
97
+ For LightConv, please use `--encoder-conv-type lightweight --decoder-conv-type lightweight`, otherwise the default is DynamicConv.
98
+ For best BLEU results, lenpen may need to be manually tuned.
99
+
100
+ To use the CUDA kernels, first install the PyTorch modules using the commands
101
+ above. Once the CUDA modules are installed, they will automatically be used
102
+ instead of the PyTorch modules.
103
+
104
+ ### IWSLT14 De-En
105
+ Training and evaluating DynamicConv (without GLU) on a GPU:
106
+ ```sh
107
+ # Training
108
+ SAVE="save/dynamic_conv_iwslt"
109
+ mkdir -p $SAVE
110
+ CUDA_VISIBLE_DEVICES=0 $(which fairseq-train) data-bin/iwslt14.tokenized.de-en \
111
+ --clip-norm 0 --optimizer adam --lr 0.0005 \
112
+ --source-lang de --target-lang en --max-tokens 4000 --no-progress-bar \
113
+ --log-interval 100 --min-lr '1e-09' --weight-decay 0.0001 \
114
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
115
+ --lr-scheduler inverse_sqrt \
116
+ --ddp-backend=no_c10d \
117
+ --max-update 50000 --warmup-updates 4000 --warmup-init-lr '1e-07' \
118
+ --adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \
119
+ -a lightconv_iwslt_de_en --save-dir $SAVE \
120
+ --dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \
121
+ --encoder-glu 0 --decoder-glu 0
122
+ python scripts/average_checkpoints.py --inputs $SAVE \
123
+ --num-epoch-checkpoints 10 --output "${SAVE}/checkpoint_last10_avg.pt"
124
+
125
+ # Evaluation
126
+ CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/iwslt14.tokenized.de-en --path "${SAVE}/checkpoint_last10_avg.pt" --batch-size 128 --beam 4 --remove-bpe --lenpen 1 --gen-subset test --quiet
127
+ ```
128
+
129
+ ### WMT16 En-De
130
+ Training and evaluating DynamicConv (with GLU) on WMT16 En-De using cosine scheduler on one machine with 8 V100 GPUs:
131
+ ```sh
132
+ # Training
133
+ SAVE="save/dynamic_conv_wmt16en2de"
134
+ mkdir -p $SAVE
135
+ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
136
+ data-bin/wmt16_en_de_bpe32k --fp16 --log-interval 100 --no-progress-bar \
137
+ --max-update 30000 --share-all-embeddings --optimizer adam \
138
+ --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
139
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
140
+ --min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
141
+ --ddp-backend=no_c10d --max-tokens 3584 \
142
+ --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
143
+ --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --min-lr 1e-9 --warmup-init-lr 1e-07 \
144
+ --t-mult 1 --lr-period-updates 20000 \
145
+ --arch lightconv_wmt_en_de_big --save-dir $SAVE \
146
+ --dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \
147
+ --encoder-glu 1 --decoder-glu 1
148
+
149
+ # Evaluation
150
+ CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/wmt16.en-de.joined-dict.newstest2014 --path "${SAVE}/checkpoint_best.pt" --batch-size 128 --beam 5 --remove-bpe --lenpen 0.5 --gen-subset test > wmt16_gen.txt
151
+ bash scripts/compound_split_bleu.sh wmt16_gen.txt
152
+ ```
153
+
154
+ ### WMT14 En-Fr
155
+ Training DynamicConv (with GLU) on WMT14 En-Fr using cosine scheduler on one machine with 8 V100 GPUs:
156
+ ```sh
157
+ # Training
158
+ SAVE="save/dynamic_conv_wmt14en2fr"
159
+ mkdir -p $SAVE
160
+ python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \
161
+ data-bin/wmt14_en_fr --fp16 --log-interval 100 --no-progress-bar \
162
+ --max-update 30000 --share-all-embeddings --optimizer adam \
163
+ --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \
164
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
165
+ --min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \
166
+ --ddp-backend=no_c10d --max-tokens 3584 \
167
+ --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \
168
+ --lr-shrink 1 --max-lr 0.001 --lr 1e-7 --min-lr 1e-9 --warmup-init-lr 1e-07 \
169
+ --t-mult 1 --lr-period-updates 70000 \
170
+ --arch lightconv_wmt_en_fr_big --save-dir $SAVE \
171
+ --dropout 0.1 --attention-dropout 0.1 --weight-dropout 0.1 \
172
+ --encoder-glu 1 --decoder-glu 1
173
+
174
+ # Evaluation
175
+ CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/wmt14.en-fr.joined-dict.newstest2014 --path "${SAVE}/checkpoint_best.pt" --batch-size 128 --beam 5 --remove-bpe --lenpen 0.9 --gen-subset test
176
+ ```
fairseq-0.10.2/examples/pointer_generator/README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Transformer with Pointer-Generator Network
2
+
3
+ This page describes the `transformer_pointer_generator` model that incorporates
4
+ a pointing mechanism in the Transformer model that facilitates copying of input
5
+ words to the output. This architecture is described in [Enarvi et al. (2020)](https://www.aclweb.org/anthology/2020.nlpmc-1.4/).
6
+
7
+ ## Background
8
+
9
+ The pointer-generator network was introduced in [See et al. (2017)](https://arxiv.org/abs/1704.04368)
10
+ for RNN encoder-decoder attention models. A similar mechanism can be
11
+ incorporated in a Transformer model by reusing one of the many attention
12
+ distributions for pointing. The attention distribution over the input words is
13
+ interpolated with the normal output distribution over the vocabulary words. This
14
+ allows the model to generate words that appear in the input, even if they don't
15
+ appear in the vocabulary, helping especially with small vocabularies.
16
+
17
+ ## Implementation
18
+
19
+ The mechanism for copying out-of-vocabulary words from the input has been
20
+ implemented differently to See et al. In their [implementation](https://github.com/abisee/pointer-generator)
21
+ they convey the word identities through the model in order to be able to produce
22
+ words that appear in the input sequence but not in the vocabulary. A different
23
+ approach was taken in the Fairseq implementation to keep it self-contained in
24
+ the model file, avoiding any changes to the rest of the code base. Copying
25
+ out-of-vocabulary words is possible by pre-processing the input and
26
+ post-processing the output. This is described in detail in the next section.
27
+
28
+ ## Usage
29
+
30
+ The training and evaluation procedure is outlined below. You can also find a
31
+ more detailed example for the XSum dataset on [this page](README.xsum.md).
32
+
33
+ ##### 1. Create a vocabulary and extend it with source position markers
34
+
35
+ The pointing mechanism is especially helpful with small vocabularies, if we are
36
+ able to recover the identities of any out-of-vocabulary words that are copied
37
+ from the input. For this purpose, the model allows extending the vocabulary with
38
+ special tokens that can be used in place of `<unk>` tokens to identify different
39
+ input positions. For example, the user may add `<unk-0>`, `<unk-1>`, `<unk-2>`,
40
+ etc. to the end of the vocabulary, after the normal words. Below is an example
41
+ of how to create a vocabulary of 10000 most common words and add 1000 input
42
+ position markers.
43
+
44
+ ```bash
45
+ vocab_size=10000
46
+ position_markers=1000
47
+ export LC_ALL=C
48
+ cat train.src train.tgt |
49
+ tr -s '[:space:]' '\n' |
50
+ sort |
51
+ uniq -c |
52
+ sort -k1,1bnr -k2 |
53
+ head -n "$((vocab_size - 4))" |
54
+ awk '{ print $2 " " $1 }' >dict.pg.txt
55
+ python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >>dict.pg.txt
56
+ ```
57
+
58
+ ##### 2. Preprocess the text data
59
+
60
+ The idea is that any `<unk>` tokens in the text are replaced with `<unk-0>` if
61
+ it appears in the first input position, `<unk-1>` if it appears in the second
62
+ input position, and so on. This can be achieved using the `preprocess.py` script
63
+ that is provided in this directory.
64
+
65
+ ##### 3. Train a model
66
+
67
+ The number of these special tokens is given to the model with the
68
+ `--source-position-markers` argument—the model simply maps all of these to the
69
+ same word embedding as `<unk>`.
70
+
71
+ The attention distribution that is used for pointing is selected using the
72
+ `--alignment-heads` and `--alignment-layer` command-line arguments in the same
73
+ way as with the `transformer_align` model.
74
+
75
+ ##### 4. Generate text and postprocess it
76
+
77
+ When using the model to generate text, you want to preprocess the input text in
78
+ the same way that training data was processed, replacing out-of-vocabulary words
79
+ with `<unk-N>` tokens. If any of these tokens are copied to the output, the
80
+ actual words can be retrieved from the unprocessed input text. Any `<unk-N>`
81
+ token should be replaced with the word at position N in the original input
82
+ sequence. This can be achieved using the `postprocess.py` script.
fairseq-0.10.2/examples/pointer_generator/README.xsum.md ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Training a pointer-generator model on the Extreme Summarization dataset
2
+
3
+ ##### 1. Download the Extreme Summarization data and preprocess it
4
+
5
+ Follow the instructions [here](https://github.com/EdinburghNLP/XSum) to obtain
6
+ the original Extreme Summarization dataset. You should have six files,
7
+ {train,validation,test}.{document,summary}.
8
+
9
+ ##### 2. Create a vocabulary and extend it with source position markers
10
+
11
+ ```bash
12
+ vocab_size=10000
13
+ position_markers=1000
14
+ export LC_ALL=C
15
+ cat train.document train.summary |
16
+ tr -s '[:space:]' '\n' |
17
+ sort |
18
+ uniq -c |
19
+ sort -k1,1bnr -k2 |
20
+ head -n "$((vocab_size - 4))" |
21
+ awk '{ print $2 " " $1 }' >dict.pg.txt
22
+ python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >>dict.pg.txt
23
+ ```
24
+
25
+ This creates the file dict.pg.txt that contains the 10k most frequent words,
26
+ followed by 1k source position markers:
27
+
28
+ ```
29
+ the 4954867
30
+ . 4157552
31
+ , 3439668
32
+ to 2212159
33
+ a 1916857
34
+ of 1916820
35
+ and 1823350
36
+ ...
37
+ <unk-0> 0
38
+ <unk-1> 0
39
+ <unk-2> 0
40
+ <unk-3> 0
41
+ <unk-4> 0
42
+ ...
43
+ ```
44
+
45
+ ##### 2. Preprocess the text data
46
+
47
+ ```bash
48
+ ./preprocess.py --source train.document --target train.summary --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out train.pg.src --target-out train.pg.tgt
49
+ ./preprocess.py --source validation.document --target validation.summary --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out valid.pg.src --target-out valid.pg.tgt
50
+ ./preprocess.py --source test.document --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out test.pg.src
51
+ ```
52
+
53
+ The data should now contain `<unk-N>` tokens in place of out-of-vocabulary words.
54
+
55
+ ##### 3. Binarize the dataset:
56
+
57
+ ```bash
58
+ fairseq-preprocess \
59
+ --source-lang src \
60
+ --target-lang tgt \
61
+ --trainpref train.pg \
62
+ --validpref valid.pg \
63
+ --destdir bin \
64
+ --workers 60 \
65
+ --srcdict dict.pg.txt \
66
+ --joined-dictionary
67
+ ```
68
+
69
+ ##### 3. Train a model
70
+
71
+ ```bash
72
+ total_updates=20000
73
+ warmup_updates=500
74
+ lr=0.001
75
+ max_tokens=4096
76
+ update_freq=4
77
+ pointer_layer=-2
78
+
79
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train bin \
80
+ --user-dir examples/pointer_generator/pointer_generator_src \
81
+ --max-tokens "$max_tokens" \
82
+ --task translation \
83
+ --source-lang src --target-lang tgt \
84
+ --truncate-source \
85
+ --layernorm-embedding \
86
+ --share-all-embeddings \
87
+ --encoder-normalize-before \
88
+ --decoder-normalize-before \
89
+ --required-batch-size-multiple 1 \
90
+ --arch transformer_pointer_generator \
91
+ --alignment-layer "$pointer_layer" \
92
+ --alignment-heads 1 \
93
+ --source-position-markers 1000 \
94
+ --criterion label_smoothed_cross_entropy \
95
+ --label-smoothing 0.1 \
96
+ --dropout 0.1 --attention-dropout 0.1 \
97
+ --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \
98
+ --clip-norm 0.1 \
99
+ --lr-scheduler inverse_sqrt --lr "$lr" --max-update "$total_updates" --warmup-updates "$warmup_updates" \
100
+ --update-freq "$update_freq" \
101
+ --skip-invalid-size-inputs-valid-test
102
+ ```
103
+
104
+ Above we specify that our dictionary contains 1000 source position markers, and
105
+ that we want to use one attention head from the penultimate decoder layer for
106
+ pointing. It should run in 5.5 hours on one node with eight 32GB V100 GPUs. The
107
+ logged messages confirm that dictionary indices above 10000 will be mapped to
108
+ the `<unk>` embedding:
109
+
110
+ ```
111
+ 2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | [src] dictionary: 11000 types
112
+ 2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | [tgt] dictionary: 11000 types
113
+ 2020-09-24 20:43:53 | INFO | fairseq.data.data_utils | loaded 11332 examples from: bin/valid.src-tgt.src
114
+ 2020-09-24 20:43:53 | INFO | fairseq.data.data_utils | loaded 11332 examples from: bin/valid.src-tgt.tgt
115
+ 2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | bin valid src-tgt 11332 examples
116
+ 2020-09-24 20:43:53 | INFO | fairseq.models.transformer_pg | dictionary indices from 10000 to 10999 will be mapped to 3
117
+ ```
118
+
119
+ ##### 4. Summarize the test sequences
120
+
121
+ ```bash
122
+ batch_size=32
123
+ beam_size=6
124
+ max_length=60
125
+ length_penalty=1.0
126
+
127
+ fairseq-interactive bin \
128
+ --user-dir examples/pointer_generator/pointer_generator_src \
129
+ --batch-size "$batch_size" \
130
+ --task translation \
131
+ --source-lang src --target-lang tgt \
132
+ --path checkpoints/checkpoint_last.pt \
133
+ --input test.pg.src \
134
+ --buffer-size 200 \
135
+ --max-len-a 0 \
136
+ --max-len-b "$max_length" \
137
+ --lenpen "$length_penalty" \
138
+ --beam "$beam_size" \
139
+ --skip-invalid-size-inputs-valid-test |
140
+ tee generate.out
141
+ grep ^H generate.out | cut -f 3- >generate.hyp
142
+ ```
143
+
144
+ Now you should have the generated sequences in `generate.hyp`. They contain
145
+ `<unk-N>` tokens that the model has copied from the source sequence. In order to
146
+ retrieve the original words, we need the unprocessed source sequences from
147
+ `test.document`.
148
+
149
+ ##### 5. Process the generated output
150
+
151
+ Since we skipped too long inputs when producing `generate.hyp`, we also have to
152
+ skip too long sequences now that we read `test.document`.
153
+
154
+ ```bash
155
+ ./postprocess.py \
156
+ --source <(awk 'NF<1024' test.document) \
157
+ --target generate.hyp \
158
+ --target-out generate.hyp.processed
159
+ ```
160
+
161
+ Now you'll find the final sequences from `generate.hyp.processed`, with
162
+ `<unk-N>` replaced with the original word from the source sequence.
163
+
164
+ ##### An example of a summarized sequence
165
+
166
+ The original source document in `test.document`:
167
+
168
+ > de roon moved to teesside in june 2016 for an initial # 8.8 m fee and played 33 premier league games last term . the netherlands international , 26 , scored five goals in 36 league and cup games during his spell at boro . meanwhile , manager garry monk confirmed the championship club 's interest in signing chelsea midfielder lewis baker . `` he 's a target and one of many that we 've had throughout the summer months , '' said monk . find all the latest football transfers on our dedicated page .
169
+
170
+ The preprocessed source document in `test.pg.src`:
171
+
172
+ > de \<unk-1> moved to \<unk-4> in june 2016 for an initial # \<unk-12> m fee and played 33 premier league games last term . the netherlands international , 26 , scored five goals in 36 league and cup games during his spell at boro . meanwhile , manager garry monk confirmed the championship club 's interest in signing chelsea midfielder lewis baker . `` he 's a target and one of many that we 've had throughout the summer months , '' said monk . find all the latest football transfers on our dedicated page .
173
+
174
+ The generated summary in `generate.hyp`:
175
+
176
+ > middlesbrough striker \<unk> de \<unk-1> has joined spanish side \<unk> on a season-long loan .
177
+
178
+ The generated summary after postprocessing in `generate.hyp.processed`:
179
+
180
+ > middlesbrough striker \<unk> de roon has joined spanish side \<unk> on a season-long loan .
fairseq-0.10.2/examples/pointer_generator/pointer_generator_src/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import transformer_pg # noqa
fairseq-0.10.2/examples/pointer_generator/pointer_generator_src/transformer_pg.py ADDED
@@ -0,0 +1,468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ from typing import Any, Dict, Optional
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from fairseq import metrics, utils
12
+ from fairseq.models import register_model, register_model_architecture
13
+ from fairseq.models.fairseq_encoder import EncoderOut
14
+ from fairseq.models.transformer import (
15
+ DEFAULT_MAX_SOURCE_POSITIONS,
16
+ DEFAULT_MAX_TARGET_POSITIONS,
17
+ TransformerDecoder,
18
+ TransformerEncoder,
19
+ TransformerModel,
20
+ base_architecture,
21
+ )
22
+ from torch import Tensor
23
+
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
@register_model("transformer_pointer_generator")
class TransformerPointerGeneratorModel(TransformerModel):
    """
    Transformer model from `"Attention Is All You Need" (Vaswani et al, 2017)
    <https://arxiv.org/abs/1706.03762>`_, augmented with a pointer-generator
    network from `"Get To The Point: Summarization with Pointer-Generator
    Networks" (See et al, 2017) <https://arxiv.org/abs/1704.04368>`_.

    Args:
        encoder (TransformerPointerGeneratorEncoder): the encoder
        decoder (TransformerPointerGeneratorDecoder): the decoder

    The Transformer pointer-generator model provides the following named
    architectures and command-line arguments:

    .. argparse::
        :ref: fairseq.models.transformer_pointer_generator_parser
        :prog:
    """

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        TransformerModel.add_args(parser)
        parser.add_argument('--alignment-heads', type=int, metavar='N',
                            help='number of attention heads to be used for '
                                 'pointing')
        parser.add_argument('--alignment-layer', type=int, metavar='I',
                            help='layer number to be used for pointing (0 '
                                 'corresponding to the bottommost layer)')
        parser.add_argument('--source-position-markers', type=int, metavar='N',
                            help='dictionary includes N additional items that '
                                 'represent an OOV token at a particular input '
                                 'position')
        parser.add_argument('--force-generation', type=float, metavar='P',
                            default=None,
                            help='set the vocabulary distribution weight to P, '
                                 'instead of predicting it from the input (1.0 '
                                 'corresponding to generation, 0.0 to pointing)')
        # fmt: on

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_architecture(args)

        if args.encoder_layers_to_keep:
            args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
        if args.decoder_layers_to_keep:
            args.decoder_layers = len(args.decoder_layers_to_keep.split(","))

        if getattr(args, "max_source_positions", None) is None:
            args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
        if getattr(args, "max_target_positions", None) is None:
            args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS
        if getattr(args, "source_position_markers", None) is None:
            # Default: assume one OOV position marker per possible source position.
            args.source_position_markers = args.max_source_positions

        # Pointing copies source ids straight into the output distribution, so
        # source and target must share one dictionary.
        src_dict, tgt_dict = task.source_dictionary, task.target_dictionary
        if src_dict != tgt_dict:
            raise ValueError("Pointer-generator requires a joined dictionary")

        def build_embedding(dictionary, embed_dim, path=None):
            # The dictionary may include additional items that can be used in
            # place of the normal OOV token and that all map to the same
            # embedding. Using a different token for each input position allows
            # one to restore the word identities from the original source text.
            num_embeddings = len(dictionary) - args.source_position_markers
            padding_idx = dictionary.pad()
            unk_idx = dictionary.unk()
            logger.info(
                "dictionary indices from {0} to {1} will be mapped to {2}".format(
                    num_embeddings, len(dictionary) - 1, unk_idx
                )
            )
            emb = Embedding(num_embeddings, embed_dim, padding_idx, unk_idx)
            # if provided, load from preloaded dictionaries
            if path:
                embed_dict = utils.parse_embedding(path)
                utils.load_embedding(embed_dict, dictionary, emb)
            return emb

        if args.share_all_embeddings:
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
                )
            if args.decoder_embed_path and (
                args.decoder_embed_path != args.encoder_embed_path
            ):
                raise ValueError(
                    "--share-all-embeddings not compatible with --decoder-embed-path"
                )
            encoder_embed_tokens = build_embedding(
                src_dict, args.encoder_embed_dim, args.encoder_embed_path
            )
            decoder_embed_tokens = encoder_embed_tokens
            args.share_decoder_input_output_embed = True
        else:
            encoder_embed_tokens = build_embedding(
                src_dict, args.encoder_embed_dim, args.encoder_embed_path
            )
            decoder_embed_tokens = build_embedding(
                tgt_dict, args.decoder_embed_dim, args.decoder_embed_path
            )

        encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
        decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
        return cls(args, encoder, decoder)

    @classmethod
    def build_encoder(cls, args, src_dict, embed_tokens):
        # Encoder variant that also carries src_tokens in its output tuple.
        return TransformerPointerGeneratorEncoder(args, src_dict, embed_tokens)

    @classmethod
    def build_decoder(cls, args, tgt_dict, embed_tokens):
        # Decoder variant that mixes attention weights into the output layer.
        return TransformerPointerGeneratorDecoder(args, tgt_dict, embed_tokens)
148
+
149
+
150
class TransformerPointerGeneratorEncoder(TransformerEncoder):
    """
    Transformer encoder consisting of *args.encoder_layers* layers. Each layer
    is a :class:`TransformerEncoderLayer`. The pointer-generator variant also
    carries the source token ids in the encoder output, because the decoder's
    pointing mechanism needs them and they are otherwise not passed along.
    """

    def forward(self, src_tokens, src_lengths, **kwargs):
        """
        Run the parent class's `forward()` and attach ``src_tokens`` to the
        resulting output tuple.

        Conveying the tokens through the encoder output (rather than through an
        extra decoder argument) avoids any changes to ``SequenceGenerator``.

        Args:
            src_tokens (torch.LongTensor): tokens in the source language of
                shape `(batch, src_len)`
            src_lengths (torch.LongTensor): lengths of each source sentence of
                shape `(batch)`

        Returns:
            namedtuple: the parent encoder's output, with **src_tokens** set to
            the input token ids of shape `(batch, src_len)` and *src_lengths*
            cleared; all other fields (*encoder_out*, *encoder_padding_mask*,
            *encoder_embedding*, *encoder_states*) are passed through unchanged.
        """
        parent_out = super().forward(src_tokens, src_lengths, **kwargs)
        # EncoderOut is a namedtuple, so _replace produces a copy with only the
        # two fields below overridden and every other field kept as-is.
        return parent_out._replace(src_tokens=src_tokens, src_lengths=None)
196
+
197
+
198
class TransformerPointerGeneratorDecoder(TransformerDecoder):
    """
    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`TransformerDecoderLayer`. The pointer-generator variant mixes
    the output probabilities with an attention distribution in the output layer.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
    """

    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(args, dictionary, embed_tokens, no_encoder_attn=False)

        # In the pointer-generator model these arguments define the decoder
        # layer and the number of attention heads that will be averaged to
        # create the alignment for pointing.
        self.alignment_heads = args.alignment_heads
        self.alignment_layer = args.alignment_layer

        input_embed_dim = embed_tokens.embedding_dim

        # Generation probabilities / interpolation coefficients are predicted
        # from the current decoder input embedding and the decoder output, which
        # is the size of output_embed_dim.
        p_gen_input_size = input_embed_dim + self.output_embed_dim
        self.project_p_gens = nn.Linear(p_gen_input_size, 1)
        nn.init.zeros_(self.project_p_gens.bias)

        # The dictionary may include a separate entry for an OOV token in each
        # input position, so that their identity can be restored from the
        # original source text.
        self.num_types = len(dictionary)          # full (extended) vocabulary size
        self.num_oov_types = args.source_position_markers  # <unk-N> entries
        self.num_embeddings = self.num_types - self.num_oov_types  # real words
        self.force_p_gen = args.force_generation  # fixed p_gen, or None to predict

    def forward(
        self,
        prev_output_tokens,
        encoder_out: Optional[EncoderOut] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        features_only: bool = False,
        alignment_layer: Optional[int] = 0,
        alignment_heads: Optional[int] = 1,
        src_lengths: Optional[Any] = None,
        return_all_hiddens: bool = False,
    ):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (EncoderOut, optional): output from the encoder, used
                for encoder-side attention
            incremental_state (dict, optional): dictionary used for storing
                state during :ref:`Incremental decoding`
            features_only (bool, optional): only return features without
                applying output layer (default: False)
            alignment_layer (int, optional): 0-based index of the layer to be
                used for pointing (default: 0)
            alignment_heads (int, optional): number of attention heads to be
                used for pointing (default: 1)

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        """
        # The normal Transformer model doesn't pass the alignment_layer and
        # alignment_heads parameters correctly. We use our local variables.
        x, extra = self.extract_features(
            prev_output_tokens,
            encoder_out=encoder_out,
            incremental_state=incremental_state,
            alignment_layer=self.alignment_layer,
            alignment_heads=self.alignment_heads,
        )
        if not features_only:
            # Embedding the tokens again for generation probability prediction,
            # so that we don't have to reimplement the whole extract_features()
            # method.
            if incremental_state is not None:
                # During incremental decoding only the last step is fed in.
                prev_output_tokens = prev_output_tokens[:, -1:]
            prev_output_embed = self.embed_tokens(prev_output_tokens)
            prev_output_embed *= self.embed_scale
            predictors = torch.cat((prev_output_embed, x), 2)
            p_gens = self.project_p_gens(predictors)
            # p_gens in (0, 1): per-position weight on generating from the
            # vocabulary (1.0) versus pointing at the source (0.0).
            p_gens = torch.sigmoid(p_gens)
            x = self.output_layer(x, extra["attn"][0], encoder_out.src_tokens, p_gens)
        return x, extra

    def output_layer(self, features, attn, src_tokens, p_gens, **kwargs):
        """
        Project features to the vocabulary size and mix with the attention
        distributions.
        """
        if self.force_p_gen is not None:
            # Override the predicted interpolation weight with a fixed value
            # (set by --force-generation).
            p_gens = self.force_p_gen

        # project back to size of vocabulary
        logits = super().output_layer(features, **kwargs)

        batch_size = logits.shape[0]
        output_length = logits.shape[1]
        assert logits.shape[2] == self.num_embeddings
        assert src_tokens.shape[0] == batch_size
        src_length = src_tokens.shape[1]

        # The final output distribution will be a mixture of the normal output
        # distribution (softmax of logits) and attention weights.
        gen_dists = super().get_normalized_probs(
            (logits, None), log_probs=False, sample=None
        )
        # p_gens has trailing dimension 1 and broadcasts over the vocabulary.
        gen_dists = torch.mul(gen_dists, p_gens)
        # Pad the generation distribution with zeros for the <unk-N> entries so
        # that it spans the full extended vocabulary.
        padding_size = (batch_size, output_length, self.num_oov_types)
        padding = gen_dists.new_zeros(padding_size)
        gen_dists = torch.cat((gen_dists, padding), 2)
        assert gen_dists.shape[2] == self.num_types

        # Scatter attention distributions to distributions over the extended
        # vocabulary in a tensor of shape [batch_size, output_length,
        # vocab_size]. Each attention weight will be written into a location
        # that is for other dimensions the same as in the index tensor, but for
        # the third dimension it's the value of the index tensor (the token ID).
        attn = torch.mul(attn, 1 - p_gens)
        index = src_tokens[:, None, :]
        index = index.expand(batch_size, output_length, src_length)
        attn_dists_size = (batch_size, output_length, self.num_types)
        attn_dists = attn.new_zeros(attn_dists_size)
        # scatter_add_ accumulates weights for repeated source token ids.
        attn_dists.scatter_add_(2, index, attn)

        # Final distributions, [batch_size, output_length, num_types].
        return gen_dists + attn_dists

    def get_normalized_probs(self, net_output, log_probs, sample):
        """
        Get normalized probabilities (or log probs) from a net's output.
        Pointer-generator network output is already normalized.
        """
        probs = net_output[0]
        # Make sure the probabilities are greater than zero when returning log
        # probabilities.
        return probs.clamp(1e-10, 1.0).log() if log_probs else probs
342
+
343
+
344
class Embedding(nn.Embedding):
    r"""A simple lookup table that stores embeddings of a fixed dictionary and size.
    This module is often used to store word embeddings and retrieve them using indices.
    The input to the module is a list of indices, and the output is the corresponding
    word embeddings. This subclass differs from the standard PyTorch Embedding class by
    allowing additional vocabulary entries that will be mapped to the unknown token
    embedding.
    Args:
        num_embeddings (int): size of the dictionary of embeddings
        embedding_dim (int): the size of each embedding vector
        padding_idx (int): Pads the output with the embedding vector at :attr:`padding_idx`
            (initialized to zeros) whenever it encounters the index.
        unk_idx (int): Maps all token indices that are greater than or equal to
            num_embeddings to this index.
    Attributes:
        weight (Tensor): the learnable weights of the module of shape (num_embeddings, embedding_dim)
            initialized from :math:`\mathcal{N}(0, 1)`
    Shape:
        - Input: :math:`(*)`, LongTensor of arbitrary shape containing the indices to extract
        - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`
    .. note::
        With :attr:`padding_idx` set, the embedding vector at
        :attr:`padding_idx` is initialized to all zeros. However, note that this
        vector can be modified afterwards, e.g., using a customized
        initialization method, and thus changing the vector used to pad the
        output. The gradient for this vector from :class:`~torch.nn.Embedding`
        is always zero.
    """
    __constants__ = ["unk_idx"]

    def __init__(self, num_embeddings, embedding_dim, padding_idx, unk_idx):
        super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)
        self.unk_idx = unk_idx
        # Fairseq-style initialization: N(0, embed_dim^-0.5), zeroed padding row.
        nn.init.normal_(self.weight, mean=0, std=embedding_dim ** -0.5)
        nn.init.constant_(self.weight[padding_idx], 0)

    def forward(self, input):
        # Map every extended-vocabulary index (>= num_embeddings, i.e. the
        # <unk-N> position markers) to the unknown-token index before lookup.
        # torch.full_like avoids the extra allocation and multiply of the
        # ``torch.ones_like(input) * self.unk_idx`` idiom.
        input = torch.where(
            input >= self.num_embeddings, torch.full_like(input, self.unk_idx), input
        )
        return super().forward(input)
389
+
390
+
391
@register_model_architecture(
    "transformer_pointer_generator", "transformer_pointer_generator"
)
def transformer_pointer_generator(args):
    # Base architecture: point with one attention head taken from the last
    # decoder layer (-1) unless the user says otherwise.
    args.alignment_heads = getattr(args, "alignment_heads", 1)
    args.alignment_layer = getattr(args, "alignment_layer", -1)
    base_architecture(args)
    if args.alignment_layer < 0:
        # Resolve a negative (Python-style) layer index to an absolute index,
        # now that base_architecture() has filled in args.decoder_layers.
        args.alignment_layer = args.decoder_layers + args.alignment_layer
400
+
401
+
402
@register_model_architecture(
    "transformer_pointer_generator", "transformer_pointer_generator_iwslt_de_en"
)
def transformer_pointer_generator_iwslt_de_en(args):
    """Small IWSLT De-En configuration: 512-dim embeddings, 1024-dim FFN,
    4 attention heads and 6 layers on both encoder and decoder."""
    defaults = {
        "encoder_embed_dim": 512,
        "encoder_ffn_embed_dim": 1024,
        "encoder_attention_heads": 4,
        "encoder_layers": 6,
        "decoder_embed_dim": 512,
        "decoder_ffn_embed_dim": 1024,
        "decoder_attention_heads": 4,
        "decoder_layers": 6,
    }
    # Keep any value the user already set; otherwise install the default.
    for name, value in defaults.items():
        setattr(args, name, getattr(args, name, value))
    transformer_pointer_generator(args)
415
+
416
+
417
@register_model_architecture(
    "transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de"
)
def transformer_pointer_generator_wmt_en_de(args):
    # WMT En-De: identical to the base pointer-generator architecture.
    transformer_pointer_generator(args)
422
+
423
+
424
# Transformer pointer-generator with the base Transformer parameters as used in
# the "Attention Is All You Need" paper (Vaswani et al., 2017)
@register_model_architecture(
    "transformer_pointer_generator",
    "transformer_pointer_generator_vaswani_wmt_en_de_big",
)
def transformer_pointer_generator_vaswani_wmt_en_de_big(args):
    """Big configuration: 1024-dim embeddings, 4096-dim FFN, 16 heads,
    post-norm encoder, dropout 0.3."""
    defaults = {
        "encoder_embed_dim": 1024,
        "encoder_ffn_embed_dim": 4096,
        "encoder_attention_heads": 16,
        "encoder_normalize_before": False,
        "decoder_embed_dim": 1024,
        "decoder_ffn_embed_dim": 4096,
        "decoder_attention_heads": 16,
        "dropout": 0.3,
    }
    # Keep any value the user already set; otherwise install the default.
    for name, value in defaults.items():
        setattr(args, name, getattr(args, name, value))
    transformer_pointer_generator(args)
440
+
441
+
442
@register_model_architecture(
    "transformer_pointer_generator",
    "transformer_pointer_generator_vaswani_wmt_en_fr_big",
)
def transformer_pointer_generator_vaswani_wmt_en_fr_big(args):
    """En-Fr big configuration: same as En-De big, but dropout defaults to 0.1."""
    if not hasattr(args, "dropout"):
        args.dropout = 0.1
    transformer_pointer_generator_vaswani_wmt_en_de_big(args)
449
+
450
+
451
@register_model_architecture(
    "transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big"
)
def transformer_pointer_generator_wmt_en_de_big(args):
    """Big En-De configuration with attention dropout defaulting to 0.1."""
    if not hasattr(args, "attention_dropout"):
        args.attention_dropout = 0.1
    transformer_pointer_generator_vaswani_wmt_en_de_big(args)
457
+
458
+
459
# default parameters used in tensor2tensor implementation
@register_model_architecture(
    "transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big_t2t"
)
def transformer_pointer_generator_wmt_en_de_big_t2t(args):
    """Big configuration matching the tensor2tensor defaults: pre-norm on both
    sides, attention and activation dropout 0.1."""
    defaults = {
        "encoder_normalize_before": True,
        "decoder_normalize_before": True,
        "attention_dropout": 0.1,
        "activation_dropout": 0.1,
    }
    # Keep any value the user already set; otherwise install the default.
    for name, value in defaults.items():
        setattr(args, name, getattr(args, name, value))
    transformer_pointer_generator_vaswani_wmt_en_de_big(args)
fairseq-0.10.2/examples/pointer_generator/postprocess.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ import re
9
+ import sys
10
+
11
+
12
class OOVIndexError(IndexError):
    """Raised when a target-side <unk-N> tag points past the end of the
    corresponding source sequence.

    Attributes:
        source_pos: the out-of-range position N from the <unk-N> tag
        source_seq: the offending source line
        target_seq: the offending target line
    """

    _MESSAGE = (
        "A <unk-N> tag in the target sequence refers to a position that is "
        "outside the source sequence. Most likely there was a mismatch in "
        "provided source and target sequences. Otherwise this would mean that "
        "the pointing mechanism somehow attended to a position that is past "
        "the actual sequence end."
    )

    def __init__(self, pos, source_seq, target_seq):
        super(OOVIndexError, self).__init__(self._MESSAGE)
        self.source_pos = pos
        self.source_seq = source_seq
        self.target_seq = target_seq
24
+
25
+
26
def replace_oovs(source_in, target_in, target_out):
    """Replaces <unk-N> tokens in the target text with the corresponding word in
    the source text.

    Args:
        source_in: iterable of source lines
        target_in: iterable of target lines, paired with ``source_in``
        target_out: writable file object for the resolved target lines

    Raises:
        OOVIndexError: if a <unk-N> tag points past the source sequence end.
    """
    unk_pattern = re.compile(r"^<unk-([0-9]+)>$")

    for source_seq, target_seq in zip(source_in, target_in):
        source_words = source_seq.strip().split()

        def resolve(token):
            # Non-tag tokens pass through untouched; tags are replaced by the
            # source word at the indicated position.
            match = unk_pattern.match(token)
            if match is None:
                return token
            position = int(match.group(1))
            if position >= len(source_words):
                raise OOVIndexError(position, source_seq, target_seq)
            return source_words[position]

        resolved = [resolve(token) for token in target_seq.strip().split()]
        target_out.write(" ".join(resolved) + "\n")
48
+
49
+
50
def main():
    """Command-line entry point: read --source and --target files and write
    --target-out with every <unk-N> tag replaced by the source word at
    position N."""
    parser = argparse.ArgumentParser(
        description="Replaces <unk-N> tokens in target sequences with words from "
        "the corresponding position in the source sequence."
    )
    parser.add_argument(
        "--source", type=str, help="text file with source sequences", required=True
    )
    parser.add_argument(
        "--target", type=str, help="text file with target sequences", required=True
    )
    parser.add_argument(
        "--target-out",
        type=str,
        help="where to write target sequences without <unk-N> entries",
        required=True,
    )
    args = parser.parse_args()

    # Both --target and --target-out are required, so open everything inside
    # one with-statement. The previous version also opened args.target and
    # args.target_out unconditionally just above this block and immediately
    # shadowed the handles, leaking two open files per run; those dead opens
    # are removed.
    with open(args.source, "r", encoding="utf-8") as source_in, open(
        args.target, "r", encoding="utf-8"
    ) as target_in, open(args.target_out, "w", encoding="utf-8") as target_out:
        replace_oovs(source_in, target_in, target_out)
81
+
82
+
83
if __name__ == "__main__":
    try:
        main()
    except OOVIndexError as e:
        # Report the mismatch on stderr and exit with a distinct status code
        # so calling scripts can detect this specific failure.
        print(e, file=sys.stderr)
        print("Source sequence:", e.source_seq.strip(), file=sys.stderr)
        print("Target sequence:", e.target_seq.strip(), file=sys.stderr)
        print(
            "Source sequence length:",
            len(e.source_seq.strip().split()),
            file=sys.stderr,
        )
        # Fix: this line previously printed to stdout while every other
        # diagnostic above went to stderr, corrupting piped output.
        print("The offending tag points to:", e.source_pos, file=sys.stderr)
        sys.exit(2)
fairseq-0.10.2/examples/pointer_generator/preprocess.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import argparse
8
+ from itertools import zip_longest
9
+
10
+
11
def replace_oovs(source_in, target_in, vocabulary, source_out, target_out):
    """Replaces out-of-vocabulary words in source and target text with <unk-N>,
    where N is the position of the word in the source sequence.

    Args:
        source_in: iterable of source lines (e.g. an open text file)
        target_in: iterable of target lines, or None when no targets exist
        vocabulary: container supporting ``in`` with the in-vocabulary words
        source_out: writable file object for rewritten source lines
        target_out: writable file object for rewritten target lines, or None
            to skip writing targets
    """

    def format_unk(pos):
        return "<unk-{}>".format(pos)

    if target_in is None:
        target_in = []

    # zip_longest keeps yielding source lines (with target_seq=None) when
    # there are fewer target lines than source lines. The previous
    # enumerate() wrapper produced an unused sequence counter and is removed.
    for source_seq, target_seq in zip_longest(source_in, target_in):
        source_seq_out = []
        target_seq_out = []

        # Map each OOV word to the position of its *first* occurrence, so
        # repeated OOVs in one sentence share a single <unk-N> tag.
        word_to_pos = dict()
        for position, token in enumerate(source_seq.strip().split()):
            if token in vocabulary:
                token_out = token
            else:
                if token in word_to_pos:
                    oov_pos = word_to_pos[token]
                else:
                    word_to_pos[token] = position
                    oov_pos = position
                token_out = format_unk(oov_pos)
            source_seq_out.append(token_out)
        source_out.write(" ".join(source_seq_out) + "\n")

        if target_seq is not None:
            for token in target_seq.strip().split():
                if token in word_to_pos:
                    token_out = format_unk(word_to_pos[token])
                else:
                    token_out = token
                target_seq_out.append(token_out)
            if target_out is not None:
                target_out.write(" ".join(target_seq_out) + "\n")
51
+
52
+
53
def main():
    """Command-line entry point: rewrite OOV words in --source (and optionally
    --target) as positional <unk-N> tokens, given a --vocab word list."""
    parser = argparse.ArgumentParser(
        description="Replaces out-of-vocabulary words in both source and target "
        "sequences with tokens that indicate the position of the word "
        "in the source sequence."
    )
    parser.add_argument(
        "--source", type=str, help="text file with source sequences", required=True
    )
    parser.add_argument(
        "--target", type=str, help="text file with target sequences", default=None
    )
    parser.add_argument("--vocab", type=str, help="vocabulary file", required=True)
    parser.add_argument(
        "--source-out",
        type=str,
        help="where to write source sequences with <unk-N> entries",
        required=True,
    )
    parser.add_argument(
        "--target-out",
        type=str,
        help="where to write target sequences with <unk-N> entries",
        default=None,
    )
    args = parser.parse_args()

    with open(args.vocab, encoding="utf-8") as vocab:
        # Use a set: membership is tested once per token of every sequence,
        # and the original list made each of those lookups O(|vocab|).
        vocabulary = set(vocab.read().splitlines())

    target_in = (
        open(args.target, "r", encoding="utf-8") if args.target is not None else None
    )
    target_out = (
        open(args.target_out, "w", encoding="utf-8")
        if args.target_out is not None
        else None
    )
    try:
        with open(args.source, "r", encoding="utf-8") as source_in, open(
            args.source_out, "w", encoding="utf-8"
        ) as source_out:
            replace_oovs(source_in, target_in, vocabulary, source_out, target_out)
    finally:
        # Close the optional handles even when replace_oovs raises; the
        # original closed them only on the success path.
        if target_in is not None:
            target_in.close()
        if target_out is not None:
            target_out.close()
99
+
100
+
101
if __name__ == "__main__":
    # Allow the module to be run directly as a preprocessing script.
    main()
fairseq-0.10.2/examples/roberta/README.custom_classification.md ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Finetuning RoBERTa on a custom classification task
2
+
3
+ This example shows how to finetune RoBERTa on the IMDB dataset, but should illustrate the process for most classification tasks.
4
+
5
+ ### 1) Get the data
6
+
7
+ ```bash
8
+ wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
9
+ tar zxvf aclImdb_v1.tar.gz
10
+ ```
11
+
12
+
13
+ ### 2) Format data
14
+
15
+ `IMDB` data has one data-sample per file; the Python snippet below converts it into a single file each for train and valid, for ease of processing.
16
+ ```python
17
+ import argparse
18
+ import os
19
+ import random
20
+ from glob import glob
21
+
22
+ random.seed(0)
23
+
24
+ def main(args):
25
+ for split in ['train', 'test']:
26
+ samples = []
27
+ for class_label in ['pos', 'neg']:
28
+ fnames = glob(os.path.join(args.datadir, split, class_label) + '/*.txt')
29
+ for fname in fnames:
30
+ with open(fname) as fin:
31
+ line = fin.readline()
32
+ samples.append((line, 1 if class_label == 'pos' else 0))
33
+ random.shuffle(samples)
34
+ out_fname = 'train' if split == 'train' else 'dev'
35
+ f1 = open(os.path.join(args.datadir, out_fname + '.input0'), 'w')
36
+ f2 = open(os.path.join(args.datadir, out_fname + '.label'), 'w')
37
+ for sample in samples:
38
+ f1.write(sample[0] + '\n')
39
+ f2.write(str(sample[1]) + '\n')
40
+ f1.close()
41
+ f2.close()
42
+
43
+ if __name__ == '__main__':
44
+ parser = argparse.ArgumentParser()
45
+ parser.add_argument('--datadir', default='aclImdb')
46
+ args = parser.parse_args()
47
+ main(args)
48
+ ```
49
+
50
+
51
+ ### 3) BPE encode
52
+
53
+ Run `multiprocessing_bpe_encoder`, you can also do this in previous step for each sample but that might be slower.
54
+ ```bash
55
+ # Download encoder.json and vocab.bpe
56
+ wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
57
+ wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
58
+
59
+ for SPLIT in train dev; do
60
+ python -m examples.roberta.multiprocessing_bpe_encoder \
61
+ --encoder-json encoder.json \
62
+ --vocab-bpe vocab.bpe \
63
+ --inputs "aclImdb/$SPLIT.input0" \
64
+ --outputs "aclImdb/$SPLIT.input0.bpe" \
65
+ --workers 60 \
66
+ --keep-empty
67
+ done
68
+ ```
69
+
70
+
71
+ ### 4) Preprocess data
72
+
73
+ ```bash
74
+ # Download fairseq dictionary.
75
+ wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
76
+
77
+ fairseq-preprocess \
78
+ --only-source \
79
+ --trainpref "aclImdb/train.input0.bpe" \
80
+ --validpref "aclImdb/dev.input0.bpe" \
81
+ --destdir "IMDB-bin/input0" \
82
+ --workers 60 \
83
+ --srcdict dict.txt
84
+
85
+ fairseq-preprocess \
86
+ --only-source \
87
+ --trainpref "aclImdb/train.label" \
88
+ --validpref "aclImdb/dev.label" \
89
+ --destdir "IMDB-bin/label" \
90
+ --workers 60
91
+
92
+ ```
93
+
94
+
95
+ ### 5) Run training
96
+
97
+ ```bash
98
+ TOTAL_NUM_UPDATES=7812 # 10 epochs through IMDB for bsz 32
99
+ WARMUP_UPDATES=469 # 6 percent of the number of updates
100
+ LR=1e-05 # Peak LR for polynomial LR scheduler.
101
+ HEAD_NAME=imdb_head # Custom name for the classification head.
102
+ NUM_CLASSES=2 # Number of classes for the classification task.
103
+ MAX_SENTENCES=8 # Batch size.
104
+ ROBERTA_PATH=/path/to/roberta.large/model.pt
105
+
106
+ CUDA_VISIBLE_DEVICES=0 fairseq-train IMDB-bin/ \
107
+ --restore-file $ROBERTA_PATH \
108
+ --max-positions 512 \
109
+ --batch-size $MAX_SENTENCES \
110
+ --max-tokens 4400 \
111
+ --task sentence_prediction \
112
+ --reset-optimizer --reset-dataloader --reset-meters \
113
+ --required-batch-size-multiple 1 \
114
+ --init-token 0 --separator-token 2 \
115
+ --arch roberta_large \
116
+ --criterion sentence_prediction \
117
+ --classification-head-name $HEAD_NAME \
118
+ --num-classes $NUM_CLASSES \
119
+ --dropout 0.1 --attention-dropout 0.1 \
120
+ --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \
121
+ --clip-norm 0.0 \
122
+ --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \
123
+ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \
124
+ --max-epoch 10 \
125
+ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
126
+ --shorten-method "truncate" \
127
+ --find-unused-parameters \
128
+ --update-freq 4
129
+ ```
130
+
131
+ The above command will finetune RoBERTa-large with an effective batch-size of 32
132
+ sentences (`--batch-size=8 --update-freq=4`). The expected
133
+ `best-validation-accuracy` after 10 epochs is ~96.5%.
134
+
135
+ If you run out of GPU memory, try decreasing `--batch-size` and increase
136
+ `--update-freq` to compensate.
137
+
138
+
139
+ ### 6) Load model using hub interface
140
+
141
+ Now we can load the trained model checkpoint using the RoBERTa hub interface.
142
+
143
+ Assuming your checkpoints are stored in `checkpoints/`:
144
+ ```python
145
+ from fairseq.models.roberta import RobertaModel
146
+ roberta = RobertaModel.from_pretrained(
147
+ 'checkpoints',
148
+ checkpoint_file='checkpoint_best.pt',
149
+ data_name_or_path='IMDB-bin'
150
+ )
151
+ roberta.eval() # disable dropout
152
+ ```
153
+
154
+ Finally you can make predictions using the `imdb_head` (or whatever you set
155
+ `--classification-head-name` to during training):
156
+ ```python
157
+ label_fn = lambda label: roberta.task.label_dictionary.string(
158
+ [label + roberta.task.label_dictionary.nspecial]
159
+ )
160
+
161
+ tokens = roberta.encode('Best movie this year')
162
+ pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item())
163
+ assert pred == '1' # positive
164
+
165
+ tokens = roberta.encode('Worst movie ever')
166
+ pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item())
167
+ assert pred == '0' # negative
168
+ ```
fairseq-0.10.2/examples/roberta/commonsense_qa/README.md ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Finetuning RoBERTa on Commonsense QA
2
+
3
+ We follow a similar approach to [finetuning RACE](../README.race.md). Specifically
4
+ for each question we construct five inputs, one for each of the five candidate
5
+ answer choices. Each input is constructed by concatenating the question and
6
+ candidate answer. We then encode each input and pass the resulting "[CLS]"
7
+ representations through a fully-connected layer to predict the correct answer.
8
+ We train with a standard cross-entropy loss.
9
+
10
+ We also found it helpful to prepend a prefix of `Q:` to the question and `A:` to
11
+ the answer. The complete input format is:
12
+ ```
13
+ <s> Q: Where would I not want a fox? </s> A: hen house </s>
14
+ ```
15
+
16
+ Our final submission is based on a hyperparameter search over the learning rate
17
+ (1e-5, 2e-5, 3e-5), batch size (8, 16), number of training steps (2000, 3000,
18
+ 4000) and random seed. We selected the model with the best performance on the
19
+ development set after 100 trials.
20
+
21
+ ### 1) Download data from the Commonsense QA website (https://www.tau-nlp.org/commonsenseqa)
22
+ ```bash
23
+ bash examples/roberta/commonsense_qa/download_cqa_data.sh
24
+ ```
25
+
26
+ ### 2) Finetune
27
+
28
+ ```bash
29
+ MAX_UPDATES=3000 # Number of training steps.
30
+ WARMUP_UPDATES=150 # Linearly increase LR over this many steps.
31
+ LR=1e-05 # Peak LR for polynomial LR scheduler.
32
+ MAX_SENTENCES=16 # Batch size.
33
+ SEED=1 # Random seed.
34
+ ROBERTA_PATH=/path/to/roberta/model.pt
35
+ DATA_DIR=data/CommonsenseQA
36
+
37
+ # we use the --user-dir option to load the task from
38
+ # the examples/roberta/commonsense_qa directory:
39
+ FAIRSEQ_PATH=/path/to/fairseq
40
+ FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa
41
+
42
+ CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=no_c10d \
43
+ $DATA_DIR \
44
+ --user-dir $FAIRSEQ_USER_DIR \
45
+ --restore-file $ROBERTA_PATH \
46
+ --reset-optimizer --reset-dataloader --reset-meters \
47
+ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
48
+ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
49
+ --task commonsense_qa --init-token 0 --bpe gpt2 \
50
+ --arch roberta_large --max-positions 512 \
51
+ --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
52
+ --criterion sentence_ranking --num-classes 5 \
53
+ --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 --clip-norm 0.0 \
54
+ --lr-scheduler polynomial_decay --lr $LR \
55
+ --warmup-updates $WARMUP_UPDATES --total-num-update $MAX_UPDATES \
56
+ --batch-size $MAX_SENTENCES \
57
+ --max-update $MAX_UPDATES \
58
+ --log-format simple --log-interval 25 \
59
+ --seed $SEED
60
+ ```
61
+
62
+ The above command assumes training on 1 GPU with 32GB of RAM. For GPUs with
63
+ less memory, decrease `--batch-size` and increase `--update-freq`
64
+ accordingly to compensate.
65
+
66
+ ### 3) Evaluate
67
+ ```python
68
+ import json
69
+ import torch
70
+ from fairseq.models.roberta import RobertaModel
71
+ from examples.roberta import commonsense_qa # load the Commonsense QA task
72
+ roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'data/CommonsenseQA')
73
+ roberta.eval() # disable dropout
74
+ roberta.cuda() # use the GPU (optional)
75
+ nsamples, ncorrect = 0, 0
76
+ with open('data/CommonsenseQA/valid.jsonl') as h:
77
+ for line in h:
78
+ example = json.loads(line)
79
+ scores = []
80
+ for choice in example['question']['choices']:
81
+ input = roberta.encode(
82
+ 'Q: ' + example['question']['stem'],
83
+ 'A: ' + choice['text'],
84
+ no_separator=True
85
+ )
86
+ score = roberta.predict('sentence_classification_head', input, return_logits=True)
87
+ scores.append(score)
88
+ pred = torch.cat(scores).argmax()
89
+ answer = ord(example['answerKey']) - ord('A')
90
+ nsamples += 1
91
+ if pred == answer:
92
+ ncorrect += 1
93
+
94
+ print('Accuracy: ' + str(ncorrect / float(nsamples)))
95
+ # Accuracy: 0.7846027846027847
96
+ ```
97
+
98
+ The above snippet is not batched, which makes it quite slow. See [instructions
99
+ for batched prediction with RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta#batched-prediction).
fairseq-0.10.2/examples/roberta/commonsense_qa/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import commonsense_qa_task # noqa
fairseq-0.10.2/examples/roberta/commonsense_qa/commonsense_qa_task.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+
9
+ import numpy as np
10
+ import torch
11
+ from fairseq.data import (
12
+ Dictionary,
13
+ IdDataset,
14
+ ListDataset,
15
+ NestedDictionaryDataset,
16
+ NumelDataset,
17
+ NumSamplesDataset,
18
+ RawLabelDataset,
19
+ RightPadDataset,
20
+ SortDataset,
21
+ data_utils,
22
+ encoders,
23
+ )
24
+ from fairseq.tasks import LegacyFairseqTask, register_task
25
+
26
+
27
@register_task("commonsense_qa")
class CommonsenseQATask(LegacyFairseqTask):
    """Task to finetune RoBERTa for Commonsense QA.

    Each question is expanded into ``num_classes`` candidate inputs of the
    form ``<s> Q: <question> </s> A: <choice> </s>``; the sentence_ranking
    criterion then picks the highest-scoring candidate.
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument(
            "data", metavar="DIR", help="path to data directory; we load <split>.jsonl"
        )
        parser.add_argument(
            "--init-token",
            type=int,
            default=None,
            help="add token at the beginning of each batch item",
        )
        parser.add_argument("--num-classes", type=int, default=5)

    def __init__(self, args, vocab):
        super().__init__(args)
        self.vocab = vocab
        # Register <mask> so the dictionary matches the one used during
        # RoBERTa pretraining.
        self.mask = vocab.add_symbol("<mask>")

        self.bpe = encoders.build_bpe(args)

    @classmethod
    def load_dictionary(cls, filename):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        dictionary = Dictionary.load(filename)
        dictionary.add_symbol("<mask>")
        return dictionary

    @classmethod
    def setup_task(cls, args, **kwargs):
        # The task scores each candidate separately, so it only makes sense
        # with the sentence-ranking criterion.
        assert (
            args.criterion == "sentence_ranking"
        ), "Must set --criterion=sentence_ranking"

        # load data and label dictionaries
        vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt"))
        print("| dictionary: {} types".format(len(vocab)))

        return cls(args, vocab)

    def load_dataset(
        self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs
    ):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """

        def binarize(s, append_bos=False):
            # BPE-encode (when configured), map to dictionary indices, and
            # optionally prepend --init-token (e.g. RoBERTa's <s>).
            if self.bpe is not None:
                s = self.bpe.encode(s)
            tokens = self.vocab.encode_line(
                s,
                append_eos=True,
                add_if_not_exist=False,
            ).long()
            if append_bos and self.args.init_token is not None:
                tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
            return tokens

        if data_path is None:
            data_path = os.path.join(self.args.data, split + ".jsonl")
        if not os.path.exists(data_path):
            raise FileNotFoundError("Cannot find data: {}".format(data_path))

        # src_tokens[i] collects the encoded (question + i-th choice) input
        # for every example; labels stays empty for unlabeled test data.
        src_tokens = [[] for i in range(self.args.num_classes)]
        src_lengths = [[] for i in range(self.args.num_classes)]
        labels = []

        with open(data_path) as h:
            for line in h:
                example = json.loads(line.strip())
                if "answerKey" in example:
                    label = ord(example["answerKey"]) - ord("A")
                    labels.append(label)
                question = example["question"]["stem"]
                assert len(example["question"]["choices"]) == self.args.num_classes
                # format: `<s> Q: Where would I not want a fox? </s> A: hen house </s>`
                question = "Q: " + question
                question_toks = binarize(question, append_bos=True)
                for i, choice in enumerate(example["question"]["choices"]):
                    src = "A: " + choice["text"]
                    src_bin = torch.cat([question_toks, binarize(src)])
                    src_tokens[i].append(src_bin)
                    src_lengths[i].append(len(src_bin))
        # Sanity checks: every choice column has one entry per example, and
        # labels (when present) line up with the examples.
        assert all(
            len(src_tokens[0]) == len(src_tokens[i])
            for i in range(self.args.num_classes)
        )
        assert len(src_tokens[0]) == len(src_lengths[0])
        assert len(labels) == 0 or len(labels) == len(src_tokens[0])

        for i in range(self.args.num_classes):
            src_lengths[i] = np.array(src_lengths[i])
            src_tokens[i] = ListDataset(src_tokens[i], src_lengths[i])
            src_lengths[i] = ListDataset(src_lengths[i])

        dataset = {
            "id": IdDataset(),
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens[0], reduce=True),
        }

        # One padded input stream per candidate: net_input1 .. net_inputN,
        # matching what the sentence_ranking criterion expects.
        for i in range(self.args.num_classes):
            dataset.update(
                {
                    "net_input{}".format(i + 1): {
                        "src_tokens": RightPadDataset(
                            src_tokens[i],
                            pad_idx=self.source_dictionary.pad(),
                        ),
                        "src_lengths": src_lengths[i],
                    }
                }
            )

        if len(labels) > 0:
            dataset.update({"target": RawLabelDataset(labels)})

        dataset = NestedDictionaryDataset(
            dataset,
            sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
        )

        with data_utils.numpy_seed(self.args.seed):
            dataset = SortDataset(
                dataset,
                # shuffle
                sort_order=[np.random.permutation(len(dataset))],
            )

        print("| Loaded {} with {} samples".format(split, len(dataset)))

        self.datasets[split] = dataset
        return self.datasets[split]

    def build_model(self, args):
        from fairseq import models

        model = models.build_model(args, self)

        # A single-output head scores each (question, choice) pair; the
        # sentence_ranking criterion compares the scores across choices.
        model.register_classification_head(
            "sentence_classification_head",
            num_classes=1,
        )

        return model

    @property
    def source_dictionary(self):
        return self.vocab

    @property
    def target_dictionary(self):
        return self.vocab
fairseq-0.10.2/examples/roberta/commonsense_qa/download_cqa_data.sh ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Download the Commonsense QA splits and the GPT-2 BPE dictionary into
# data/CommonsenseQA. Abort on the first failed command so a partial or
# failed download is not silently reported as success.
set -e

OUTDIR=data/CommonsenseQA

mkdir -p $OUTDIR

# NOTE: "commensenseqa" is the actual (misspelled) S3 bucket name used by the
# dataset hosts — do not "correct" the spelling or the downloads will 404.
wget -O $OUTDIR/train.jsonl https://s3.amazonaws.com/commensenseqa/train_rand_split.jsonl
wget -O $OUTDIR/valid.jsonl https://s3.amazonaws.com/commensenseqa/dev_rand_split.jsonl
wget -O $OUTDIR/test.jsonl https://s3.amazonaws.com/commensenseqa/test_rand_split_no_answers.jsonl
wget -O $OUTDIR/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
fairseq-0.10.2/examples/roberta/preprocess_RACE.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ import re
12
+
13
+
14
class InputExample:
    """Container for one RACE example.

    Attributes:
        paragraph: the reading passage text
        qa_list: four question+answer strings, one per candidate option
        label: index (0-3) of the correct option
    """

    def __init__(self, paragraph, qa_list, label):
        self.paragraph = paragraph
        self.qa_list = qa_list
        self.label = label
19
+
20
+
21
def get_examples(data_dir, set_type):
    """
    Extract paragraph and question-answer list from each json file

    `set_type` is one of "train", "dev", "test-middle", "test-high"; a
    "-level" suffix restricts extraction to that difficulty level, otherwise
    both "middle" and "high" are read.
    """
    examples = []

    levels = ["middle", "high"]
    set_type_c = set_type.split("-")
    if len(set_type_c) == 2:
        levels = [set_type_c[1]]
        set_type = set_type_c[0]
    for level in levels:
        cur_dir = os.path.join(data_dir, set_type, level)
        for filename in os.listdir(cur_dir):
            cur_path = os.path.join(cur_dir, filename)
            with open(cur_path, "r") as f:
                cur_data = json.load(f)
                answers = cur_data["answers"]
                options = cur_data["options"]
                questions = cur_data["questions"]
                # collapse newlines and whitespace runs into single spaces
                context = cur_data["article"].replace("\n", " ")
                context = re.sub(r"\s+", " ", context)
                for i in range(len(answers)):
                    label = ord(answers[i]) - ord("A")
                    qa_list = []
                    question = questions[i]
                    for j in range(4):
                        option = options[i][j]
                        # Cloze-style questions contain a "_" placeholder to
                        # fill with the option; otherwise just concatenate.
                        if "_" in question:
                            qa_cat = question.replace("_", option)
                        else:
                            qa_cat = " ".join([question, option])
                        qa_cat = re.sub(r"\s+", " ", qa_cat)
                        qa_list.append(qa_cat)
                    examples.append(InputExample(context, qa_list, label))

    return examples
58
+
59
+
60
def main():
    """
    Helper script to extract paragraphs, questions and answers from RACE
    datasets into parallel .input0 / .input1-4 / .label text files.
    """
    from contextlib import ExitStack

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-dir",
        help="input directory for downloaded RACE dataset",
    )
    parser.add_argument(
        "--output-dir",
        help="output directory for extracted data",
    )
    args = parser.parse_args()

    # makedirs(exist_ok=True) already tolerates an existing directory; the
    # previous os.path.exists pre-check was redundant (and racy).
    os.makedirs(args.output_dir, exist_ok=True)

    for set_type in ["train", "dev", "test-middle", "test-high"]:
        examples = get_examples(args.input_dir, set_type)
        qa_file_paths = [
            os.path.join(args.output_dir, set_type + ".input" + str(i + 1))
            for i in range(4)
        ]
        outf_context_path = os.path.join(args.output_dir, set_type + ".input0")
        outf_label_path = os.path.join(args.output_dir, set_type + ".label")
        # ExitStack closes every output file even if a malformed example
        # raises mid-loop; the original closed them only on success.
        with ExitStack() as stack:
            qa_files = [
                stack.enter_context(open(path, "w")) for path in qa_file_paths
            ]
            outf_context = stack.enter_context(open(outf_context_path, "w"))
            outf_label = stack.enter_context(open(outf_label_path, "w"))
            for example in examples:
                outf_context.write(example.paragraph + "\n")
                for i in range(4):
                    qa_files[i].write(example.qa_list[i] + "\n")
                outf_label.write(str(example.label) + "\n")
+
100
+
101
+ if __name__ == "__main__":
102
+ main()
fairseq-0.10.2/examples/roberta/wsc/README.md ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Finetuning RoBERTa on Winograd Schema Challenge (WSC) data
2
+
3
+ The following instructions can be used to finetune RoBERTa on the WSC training
4
+ data provided by [SuperGLUE](https://super.gluebenchmark.com/).
5
+
6
+ Note that there is high variance in the results. For our GLUE/SuperGLUE
7
+ submission we swept over the learning rate (1e-5, 2e-5, 3e-5), batch size (16,
8
+ 32, 64) and total number of updates (500, 1000, 2000, 3000), as well as the
9
+ random seed. Out of ~100 runs we chose the best 7 models and ensembled them.
10
+
11
+ **Approach:** The instructions below use a slightly different loss function than
12
+ what's described in the original RoBERTa arXiv paper. In particular,
13
+ [Kocijan et al. (2019)](https://arxiv.org/abs/1905.06290) introduce a margin
14
+ ranking loss between `(query, candidate)` pairs with tunable hyperparameters
15
+ alpha and beta. This is supported in our code as well with the `--wsc-alpha` and
16
+ `--wsc-beta` arguments. However, we achieved slightly better (and more robust)
17
+ results on the development set by instead using a single cross entropy loss term
18
+ over the log-probabilities for the query and all mined candidates. **The
19
+ candidates are mined using spaCy from each input sentence in isolation, so the
20
+ approach remains strictly pointwise.** This reduces the number of
21
+ hyperparameters and our best model achieved 92.3% development set accuracy,
22
+ compared to ~90% accuracy for the margin loss. Later versions of the RoBERTa
23
+ arXiv paper will describe this updated formulation.
24
+
25
+ ### 1) Download the WSC data from the SuperGLUE website:
26
+ ```bash
27
+ wget https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip
28
+ unzip WSC.zip
29
+
30
+ # we also need to copy the RoBERTa dictionary into the same directory
31
+ wget -O WSC/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
32
+ ```
33
+
34
+ ### 2) Finetune over the provided training data:
35
+ ```bash
36
+ TOTAL_NUM_UPDATES=2000 # Total number of training steps.
37
+ WARMUP_UPDATES=250 # Linearly increase LR over this many steps.
38
+ LR=2e-05 # Peak LR for polynomial LR scheduler.
39
+ MAX_SENTENCES=16 # Batch size per GPU.
40
+ SEED=1 # Random seed.
41
+ ROBERTA_PATH=/path/to/roberta/model.pt
42
+
43
+ # we use the --user-dir option to load the task and criterion
44
+ # from the examples/roberta/wsc directory:
45
+ FAIRSEQ_PATH=/path/to/fairseq
46
+ FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc
47
+
48
+ CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train WSC/ \
49
+ --restore-file $ROBERTA_PATH \
50
+ --reset-optimizer --reset-dataloader --reset-meters \
51
+ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
52
+ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
53
+ --valid-subset val \
54
+ --fp16 --ddp-backend no_c10d \
55
+ --user-dir $FAIRSEQ_USER_DIR \
56
+ --task wsc --criterion wsc --wsc-cross-entropy \
57
+ --arch roberta_large --bpe gpt2 --max-positions 512 \
58
+ --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
59
+ --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \
60
+ --lr-scheduler polynomial_decay --lr $LR \
61
+ --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \
62
+ --batch-size $MAX_SENTENCES \
63
+ --max-update $TOTAL_NUM_UPDATES \
64
+ --log-format simple --log-interval 100 \
65
+ --seed $SEED
66
+ ```
67
+
68
+ The above command assumes training on 4 GPUs, but you can achieve the same
69
+ results on a single GPU by adding `--update-freq=4`.
70
+
71
+ ### 3) Evaluate
72
+ ```python
73
+ from fairseq.models.roberta import RobertaModel
74
+ from examples.roberta.wsc import wsc_utils # also loads WSC task and criterion
75
+ roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'WSC/')
76
+ roberta.cuda()
77
+ nsamples, ncorrect = 0, 0
78
+ for sentence, label in wsc_utils.jsonl_iterator('WSC/val.jsonl', eval=True):
79
+ pred = roberta.disambiguate_pronoun(sentence)
80
+ nsamples += 1
81
+ if pred == label:
82
+ ncorrect += 1
83
+ print('Accuracy: ' + str(ncorrect / float(nsamples)))
84
+ # Accuracy: 0.9230769230769231
85
+ ```
86
+
87
+ ## RoBERTa training on WinoGrande dataset
88
+ We have also provided `winogrande` task and criterion for finetuning on the
89
+ [WinoGrande](https://mosaic.allenai.org/projects/winogrande) like datasets
90
+ where there are always two candidates and one is correct.
91
+ It is a more efficient implementation for such special cases.
92
+
93
+ ```bash
94
+ TOTAL_NUM_UPDATES=23750 # Total number of training steps.
95
+ WARMUP_UPDATES=2375 # Linearly increase LR over this many steps.
96
+ LR=1e-05 # Peak LR for polynomial LR scheduler.
97
+ MAX_SENTENCES=32 # Batch size per GPU.
98
+ SEED=1 # Random seed.
99
+ ROBERTA_PATH=/path/to/roberta/model.pt
100
+
101
+ # we use the --user-dir option to load the task and criterion
102
+ # from the examples/roberta/wsc directory:
103
+ FAIRSEQ_PATH=/path/to/fairseq
104
+ FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc
105
+
106
+ cd fairseq
107
+ CUDA_VISIBLE_DEVICES=0 fairseq-train winogrande_1.0/ \
108
+ --restore-file $ROBERTA_PATH \
109
+ --reset-optimizer --reset-dataloader --reset-meters \
110
+ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \
111
+ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \
112
+ --valid-subset val \
113
+ --fp16 --ddp-backend no_c10d \
114
+ --user-dir $FAIRSEQ_USER_DIR \
115
+ --task winogrande --criterion winogrande \
116
+ --wsc-margin-alpha 5.0 --wsc-margin-beta 0.4 \
117
+ --arch roberta_large --bpe gpt2 --max-positions 512 \
118
+ --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \
119
+ --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \
120
+ --lr-scheduler polynomial_decay --lr $LR \
121
+ --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \
122
+ --batch-size $MAX_SENTENCES \
123
+ --max-update $TOTAL_NUM_UPDATES \
124
+ --log-format simple --log-interval 100
125
+ ```
fairseq-0.10.2/examples/roberta/wsc/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import wsc_criterion # noqa
7
+ from . import wsc_task # noqa
fairseq-0.10.2/examples/roberta/wsc/wsc_criterion.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from fairseq import utils
11
+ from fairseq.criterions import LegacyFairseqCriterion, register_criterion
12
+ from fairseq.data import encoders
13
+
14
+
15
@register_criterion("wsc")
class WSCCriterion(LegacyFairseqCriterion):
    """Margin-ranking (or cross-entropy) criterion for finetuning RoBERTa on
    the Winograd Schema Challenge.

    Each example is scored by comparing the masked-LM log-probability of the
    correct ("query") filler span against the log-probabilities of candidate
    noun-phrase fillers for the same masked slot.
    """

    def __init__(self, args, task):
        super().__init__(args, task)
        # Optional sink for per-example predictions (--save-predictions).
        if self.args.save_predictions is not None:
            self.prediction_h = open(self.args.save_predictions, "w")
        else:
            self.prediction_h = None
        self.bpe = encoders.build_bpe(args)
        self.tokenizer = encoders.build_tokenizer(args)

    def __del__(self):
        # Best-effort close of the predictions file on garbage collection.
        if self.prediction_h is not None:
            self.prediction_h.close()

    @staticmethod
    def add_args(parser):
        """Add criterion-specific arguments to the parser."""
        parser.add_argument("--wsc-margin-alpha", type=float, metavar="A", default=1.0)
        parser.add_argument("--wsc-margin-beta", type=float, metavar="B", default=0.0)
        parser.add_argument(
            "--wsc-cross-entropy",
            action="store_true",
            help="use cross entropy formulation instead of margin loss",
        )
        parser.add_argument(
            "--save-predictions", metavar="FILE", help="file to save predictions to"
        )

    def get_masked_input(self, tokens, mask):
        # Replace the filler span with the task's <mask> symbol so the
        # masked LM scores the span positions.
        masked_tokens = tokens.clone()
        masked_tokens[mask] = self.task.mask
        return masked_tokens

    def get_lprobs(self, model, tokens, mask):
        """Return the mean masked-LM log-probability over the masked span,
        one scalar per sequence in the batch."""
        logits, _ = model(src_tokens=self.get_masked_input(tokens, mask))
        lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float)
        # Gather the log-prob of each original token at its own position.
        scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1)
        mask = mask.type_as(scores)
        # Average only over the masked (span) positions.
        scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1)
        return scores

    def get_loss(self, query_lprobs, cand_lprobs):
        """Contrast the query score against candidate scores: cross-entropy
        over the concatenated scores, or a hinge margin loss (default)."""
        if self.args.wsc_cross_entropy:
            # Index 0 of the concatenation is the query, i.e. the target.
            return F.cross_entropy(
                torch.cat([query_lprobs, cand_lprobs]).unsqueeze(0),
                query_lprobs.new([0]).long(),
            )
        else:
            return (
                -query_lprobs
                + self.args.wsc_margin_alpha
                * (cand_lprobs - query_lprobs + self.args.wsc_margin_beta).clamp(min=0)
            ).sum()

    def forward(self, model, sample, reduce=True):
        """Compute loss and accuracy for one batch.

        Examples are processed one at a time because each has a variable
        number of candidate spans.
        """
        loss, nloss = 0.0, 0
        ncorrect, nqueries = 0, 0

        for i, label in enumerate(sample["labels"]):
            query_lprobs = self.get_lprobs(
                model,
                sample["query_tokens"][i].unsqueeze(0),
                sample["query_masks"][i].unsqueeze(0),
            )
            cand_lprobs = self.get_lprobs(
                model,
                sample["candidate_tokens"][i],
                sample["candidate_masks"][i],
            )

            # Predict True iff the query outscores every candidate.
            pred = (query_lprobs >= cand_lprobs).all().item()

            if label is not None:
                label = 1 if label else 0
                ncorrect += 1 if pred == label else 0
                nqueries += 1

                if label:
                    # only compute a loss for positive instances
                    nloss += 1
                    loss += self.get_loss(query_lprobs, cand_lprobs)

            id = sample["id"][i].item()
            if self.prediction_h is not None:
                print("{}\t{}\t{}".format(id, pred, label), file=self.prediction_h)

        if nloss == 0:
            # No positive examples in the batch: emit a zero loss that still
            # participates in the autograd graph.
            loss = torch.tensor(0.0, requires_grad=True)

        sample_size = nqueries if nqueries > 0 else 1
        logging_output = {
            "loss": utils.item(loss.data) if reduce else loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample["nsentences"],
            "sample_size": sample_size,
            "ncorrect": ncorrect,
            "nqueries": nqueries,
        }
        return loss, sample_size, logging_output

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        """Aggregate logging outputs from data parallel training."""
        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)

        agg_output = {
            # Report loss per sample in base-2 (bits).
            "loss": loss_sum / sample_size / math.log(2),
            "ntokens": ntokens,
            "nsentences": nsentences,
            "sample_size": sample_size,
        }

        ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs)
        nqueries = sum(log.get("nqueries", 0) for log in logging_outputs)
        if nqueries > 0:
            agg_output["accuracy"] = ncorrect / float(nqueries)

        return agg_output
138
+
139
+
140
@register_criterion("winogrande")
class WinograndeCriterion(WSCCriterion):
    """WSC criterion specialized for WinoGrande-style data, where every
    example has exactly one correct (query) and one incorrect candidate,
    so the whole batch can be scored in two forward passes."""

    def forward(self, model, sample, reduce=True):
        # compute loss and accuracy
        query_lprobs = self.get_lprobs(
            model,
            sample["query_tokens"],
            sample["query_masks"],
        )
        cand_lprobs = self.get_lprobs(
            model,
            sample["candidate_tokens"],
            sample["candidate_masks"],
        )
        # Correct when the query outscores the single alternative.
        pred = query_lprobs >= cand_lprobs
        loss = self.get_loss(query_lprobs, cand_lprobs)

        sample_size = sample["query_tokens"].size(0)
        ncorrect = pred.sum().item()
        logging_output = {
            "loss": utils.item(loss.data) if reduce else loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample["nsentences"],
            "sample_size": sample_size,
            "ncorrect": ncorrect,
            "nqueries": sample_size,
        }
        return loss, sample_size, logging_output
fairseq-0.10.2/examples/roberta/wsc/wsc_task.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+ import tempfile
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn.functional as F
13
+ from fairseq import utils
14
+ from fairseq.data import (
15
+ Dictionary,
16
+ IdDataset,
17
+ ListDataset,
18
+ NestedDictionaryDataset,
19
+ NumelDataset,
20
+ NumSamplesDataset,
21
+ PadDataset,
22
+ SortDataset,
23
+ data_utils,
24
+ encoders,
25
+ )
26
+ from fairseq.tasks import LegacyFairseqTask, register_task
27
+
28
+ from . import wsc_utils
29
+
30
+
31
@register_task("wsc")
class WSCTask(LegacyFairseqTask):
    """Task to finetune RoBERTa for Winograd Schemas."""

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument(
            "data", metavar="DIR", help="path to data directory; we load <split>.jsonl"
        )
        parser.add_argument(
            "--init-token",
            type=int,
            default=None,
            help="add token at the beginning of each batch item",
        )

    def __init__(self, args, vocab):
        super().__init__(args)
        self.vocab = vocab
        # Index of the <mask> symbol used to blank out filler spans.
        self.mask = vocab.add_symbol("<mask>")

        self.bpe = encoders.build_bpe(args)
        self.tokenizer = encoders.build_tokenizer(args)

        # hack to handle GPT-2 BPE, which includes leading spaces
        if args.bpe == "gpt2":
            self.leading_space = True
            self.trailing_space = False
        else:
            self.leading_space = False
            self.trailing_space = True

    @classmethod
    def load_dictionary(cls, filename):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        dictionary = Dictionary.load(filename)
        dictionary.add_symbol("<mask>")
        return dictionary

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Build the task; requires the matching 'wsc' criterion."""
        assert args.criterion == "wsc", "Must set --criterion=wsc"

        # load data and label dictionaries
        vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt"))
        print("| dictionary: {} types".format(len(vocab)))

        return cls(args, vocab)

    def binarize(self, s: str, append_eos: bool = False):
        """Tokenize, BPE-encode and map a string to a LongTensor of ids."""
        if self.tokenizer is not None:
            s = self.tokenizer.encode(s)
        if self.bpe is not None:
            s = self.bpe.encode(s)
        tokens = self.vocab.encode_line(
            s,
            append_eos=append_eos,
            add_if_not_exist=False,
        ).long()
        if self.args.init_token is not None:
            # e.g. prepend <s> for RoBERTa-style inputs
            tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
        return tokens

    def binarize_with_mask(self, txt, prefix, suffix, leading_space, trailing_space):
        """Binarize ``prefix + txt + suffix`` and return the token ids plus a
        boolean mask marking the positions that belong to ``txt``."""
        toks = self.binarize(
            prefix + leading_space + txt + trailing_space + suffix,
            append_eos=True,
        )
        mask = torch.zeros_like(toks, dtype=torch.bool)
        # The span starts where the binarized prefix ends.
        mask_start = len(self.binarize(prefix))
        mask_size = len(self.binarize(leading_space + txt))
        mask[mask_start : mask_start + mask_size] = 1
        return toks, mask

    def load_dataset(
        self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs
    ):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        if data_path is None:
            data_path = os.path.join(self.args.data, split + ".jsonl")
        if not os.path.exists(data_path):
            raise FileNotFoundError("Cannot find data: {}".format(data_path))

        query_tokens = []
        query_masks = []
        query_lengths = []
        candidate_tokens = []
        candidate_masks = []
        candidate_lengths = []
        labels = []

        for sentence, pronoun_span, query, label in wsc_utils.jsonl_iterator(data_path):
            prefix = sentence[: pronoun_span.start].text
            suffix = sentence[pronoun_span.end :].text_with_ws

            # spaCy spans include trailing spaces, but we need to know about
            # leading spaces for the GPT-2 BPE
            leading_space = (
                " " if sentence[: pronoun_span.start].text_with_ws.endswith(" ") else ""
            )
            trailing_space = " " if pronoun_span.text_with_ws.endswith(" ") else ""

            # get noun phrases, excluding pronouns and anything overlapping with the query
            cand_spans = wsc_utils.filter_noun_chunks(
                wsc_utils.extended_noun_chunks(sentence),
                exclude_pronouns=True,
                exclude_query=query,
                exact_match=False,
            )

            if query is not None:
                query_toks, query_mask = self.binarize_with_mask(
                    query, prefix, suffix, leading_space, trailing_space
                )
                query_len = len(query_toks)
            else:
                query_toks, query_mask, query_len = None, None, 0

            query_tokens.append(query_toks)
            query_masks.append(query_mask)
            query_lengths.append(query_len)

            cand_toks, cand_masks = [], []
            for cand_span in cand_spans:
                toks, mask = self.binarize_with_mask(
                    cand_span.text,
                    prefix,
                    suffix,
                    leading_space,
                    trailing_space,
                )
                cand_toks.append(toks)
                cand_masks.append(mask)

            # collate candidates
            cand_toks = data_utils.collate_tokens(cand_toks, pad_idx=self.vocab.pad())
            cand_masks = data_utils.collate_tokens(cand_masks, pad_idx=0)
            assert cand_toks.size() == cand_masks.size()

            candidate_tokens.append(cand_toks)
            candidate_masks.append(cand_masks)
            candidate_lengths.append(cand_toks.size(1))

            labels.append(label)

        query_lengths = np.array(query_lengths)
        query_tokens = ListDataset(query_tokens, query_lengths)
        query_masks = ListDataset(query_masks, query_lengths)

        candidate_lengths = np.array(candidate_lengths)
        candidate_tokens = ListDataset(candidate_tokens, candidate_lengths)
        candidate_masks = ListDataset(candidate_masks, candidate_lengths)

        labels = ListDataset(labels, [1] * len(labels))

        dataset = {
            "id": IdDataset(),
            "query_tokens": query_tokens,
            "query_masks": query_masks,
            "candidate_tokens": candidate_tokens,
            "candidate_masks": candidate_masks,
            "labels": labels,
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(query_tokens, reduce=True),
        }

        nested_dataset = NestedDictionaryDataset(
            dataset,
            sizes=[query_lengths],
        )

        with data_utils.numpy_seed(self.args.seed):
            shuffle = np.random.permutation(len(query_tokens))
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

        if return_only:
            return dataset

        self.datasets[split] = dataset
        return self.datasets[split]

    def build_dataset_for_inference(self, sample_json):
        """Build a one-example dataset by round-tripping the sample through a
        temporary jsonl file and the regular loading path."""
        with tempfile.NamedTemporaryFile(buffering=0) as h:
            h.write((json.dumps(sample_json) + "\n").encode("utf-8"))
            dataset = self.load_dataset(
                "disambiguate_pronoun",
                data_path=h.name,
                return_only=True,
            )
        return dataset

    def disambiguate_pronoun(self, model, sentence, use_cuda=False):
        """Resolve the marked pronoun in ``sentence``.

        Returns a bool (query is/is not the referent) when the sentence marks
        a query with underscores, otherwise the best candidate string.
        """
        sample_json = wsc_utils.convert_sentence_to_json(sentence)
        dataset = self.build_dataset_for_inference(sample_json)
        sample = dataset.collater([dataset[0]])
        if use_cuda:
            sample = utils.move_to_cuda(sample)

        def get_masked_input(tokens, mask):
            masked_tokens = tokens.clone()
            masked_tokens[mask.bool()] = self.mask
            return masked_tokens

        def get_lprobs(tokens, mask):
            # Mean masked-LM log-prob over the masked span (see WSCCriterion).
            logits, _ = model(src_tokens=get_masked_input(tokens, mask))
            lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float)
            scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1)
            mask = mask.type_as(scores)
            scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1)
            return scores

        cand_lprobs = get_lprobs(
            sample["candidate_tokens"][0],
            sample["candidate_masks"][0],
        )
        if sample["query_tokens"][0] is not None:
            query_lprobs = get_lprobs(
                sample["query_tokens"][0].unsqueeze(0),
                sample["query_masks"][0].unsqueeze(0),
            )
            return (query_lprobs >= cand_lprobs).all().item() == 1
        else:
            # No query: return the text of the highest-scoring candidate.
            best_idx = cand_lprobs.argmax().item()
            full_cand = sample["candidate_tokens"][0][best_idx]
            mask = sample["candidate_masks"][0][best_idx]
            toks = full_cand[mask.bool()]
            return self.bpe.decode(self.source_dictionary.string(toks)).strip()

    @property
    def source_dictionary(self):
        return self.vocab

    @property
    def target_dictionary(self):
        # Same vocabulary on both sides (masked-LM scoring, not translation).
        return self.vocab
279
+
280
+
281
@register_task("winogrande")
class WinograndeTask(WSCTask):
    """
    Task for WinoGrande dataset. Efficient implementation for Winograd schema
    tasks with exactly two candidates, one of which is correct.
    """

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Build the task; requires the matching 'winogrande' criterion."""
        assert args.criterion == "winogrande", "Must set --criterion=winogrande"

        # load data and label dictionaries
        vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt"))
        print("| dictionary: {} types".format(len(vocab)))

        return cls(args, vocab)

    def load_dataset(
        self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs
    ):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        if data_path is None:
            data_path = os.path.join(self.args.data, split + ".jsonl")
        if not os.path.exists(data_path):
            raise FileNotFoundError("Cannot find data: {}".format(data_path))

        query_tokens = []
        query_masks = []
        query_lengths = []
        candidate_tokens = []
        candidate_masks = []
        candidate_lengths = []

        # On the test split the gold answer is hidden, so iterate in eval mode.
        itr = wsc_utils.winogrande_jsonl_iterator(data_path, eval=(split == "test"))

        for sample in itr:
            sentence, pronoun_span, query, cand_text = sample
            prefix = sentence[: pronoun_span[0]].rstrip()
            suffix = sentence[pronoun_span[1] :]

            leading_space = " " if sentence[: pronoun_span[0]].endswith(" ") else ""
            trailing_space = ""

            if query is not None:
                query_toks, query_mask = self.binarize_with_mask(
                    query,
                    prefix,
                    suffix,
                    leading_space,
                    trailing_space,
                )
                query_len = len(query_toks)
            else:
                query_toks, query_mask, query_len = None, None, 0

            query_tokens.append(query_toks)
            query_masks.append(query_mask)
            query_lengths.append(query_len)

            cand_toks, cand_mask = self.binarize_with_mask(
                cand_text,
                prefix,
                suffix,
                leading_space,
                trailing_space,
            )

            candidate_tokens.append(cand_toks)
            candidate_masks.append(cand_mask)
            candidate_lengths.append(cand_toks.size(0))

        query_lengths = np.array(query_lengths)

        def get_pad_dataset_fn(tokens, length, pad_idx):
            # Wrap raw tensors in a right-padding dataset.
            return PadDataset(
                ListDataset(tokens, length),
                pad_idx=pad_idx,
                left_pad=False,
            )

        query_tokens = get_pad_dataset_fn(query_tokens, query_lengths, self.vocab.pad())
        query_masks = get_pad_dataset_fn(query_masks, query_lengths, 0)

        candidate_lengths = np.array(candidate_lengths)
        candidate_tokens = get_pad_dataset_fn(
            candidate_tokens, candidate_lengths, self.vocab.pad()
        )
        candidate_masks = get_pad_dataset_fn(candidate_masks, candidate_lengths, 0)

        dataset = {
            "id": IdDataset(),
            "query_tokens": query_tokens,
            "query_masks": query_masks,
            "candidate_tokens": candidate_tokens,
            "candidate_masks": candidate_masks,
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(query_tokens, reduce=True),
        }

        nested_dataset = NestedDictionaryDataset(
            dataset,
            sizes=[query_lengths],
        )

        with data_utils.numpy_seed(self.args.seed):
            shuffle = np.random.permutation(len(query_tokens))
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

        if return_only:
            return dataset

        self.datasets[split] = dataset
        return self.datasets[split]
fairseq-0.10.2/examples/roberta/wsc/wsc_utils.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ from functools import lru_cache
8
+
9
+
10
def convert_sentence_to_json(sentence):
    """Convert a marked-up WSC sentence into SuperGLUE-style JSON.

    The query (candidate antecedent) is delimited by underscores and the
    pronoun by square brackets, e.g. ``"... _the suitcase_ because [it] ..."``.
    Span indices are word offsets obtained by splitting on single spaces.
    """
    query = query_index = None
    if "_" in sentence:
        before_query, remainder = sentence.split("_", 1)
        query, remainder = remainder.split("_", 1)
        query_index = len(before_query.rstrip().split(" "))

    before_pronoun, remainder = sentence.split("[", 1)
    pronoun, remainder = remainder.split("]", 1)
    pronoun_index = len(before_pronoun.rstrip().split(" "))

    # Strip the markers so "text" holds the plain sentence.
    plain_text = sentence.replace("_", "").replace("[", "").replace("]", "")

    return {
        "idx": 0,
        "text": plain_text,
        "target": {
            "span1_index": query_index,
            "span1_text": query,
            "span2_index": pronoun_index,
            "span2_text": pronoun,
        },
    }
34
+
35
+
36
def extended_noun_chunks(sentence):
    """Return spaCy noun chunks extended with every maximal run of
    contiguous NOUN/PROPN tokens, as sorted spans of ``sentence``."""
    spans = {(chunk.start, chunk.end) for chunk in sentence.noun_chunks}
    run_start, current = 0, "NONE"
    for idx, token in enumerate(sentence):
        kind = token.pos_ if token.pos_ in {"NOUN", "PROPN"} else "NONE"
        if kind != current:
            # A run just ended and/or a new run begins at this token.
            if current != "NONE":
                spans.add((run_start, idx))
            if kind != "NONE":
                run_start = idx
            current = kind
    if current != "NONE":
        # Close a run that extends to the end of the sentence.
        spans.add((run_start, len(sentence)))
    return [sentence[start:end] for start, end in sorted(spans)]
50
+
51
+
52
def find_token(sentence, start_pos):
    """Return the token whose character offset (``idx``) equals
    ``start_pos``, or ``None`` if no token starts there."""
    for token in sentence:
        if token.idx == start_pos:
            return token
    return None
59
+
60
+
61
def find_span(sentence, search_text, start=0):
    """Return the token span of ``sentence`` (searching from token index
    ``start``) whose text equals ``search_text`` case-insensitively, or
    ``None`` if no such span is found."""
    target = search_text.lower()
    for tok in sentence[start:]:
        # Candidate start token: the rest of the sentence must begin with
        # the search text.
        if not sentence[tok.i :].text.lower().startswith(target):
            continue
        span_start = tok.idx
        for end_tok in sentence[tok.i :]:
            # Extend until the covered characters exactly match the target.
            if end_tok.idx + len(end_tok.text) - span_start == len(target):
                return sentence[tok.i : end_tok.i + 1]
    return None
74
+
75
+
76
@lru_cache(maxsize=1)
def get_detokenizer():
    """Lazily construct (and cache) an English Moses detokenizer."""
    from sacremoses import MosesDetokenizer

    return MosesDetokenizer(lang="en")
82
+
83
+
84
@lru_cache(maxsize=1)
def get_spacy_nlp():
    """Lazily load (and cache) the large English spaCy pipeline."""
    import en_core_web_lg

    return en_core_web_lg.load()
90
+
91
+
92
def jsonl_iterator(input_fname, positive_only=False, ngram_order=3, eval=False):
    """Iterate over a WSC/SuperGLUE-format jsonl file.

    In eval mode, yields ``(sentence, label)`` where the sentence is a plain
    string with the query re-marked as ``_query_`` and the pronoun as
    ``[pronoun]``. Otherwise yields
    ``(spacy_sentence, pronoun_span, query, label)``.

    NOTE(review): ``ngram_order`` is accepted but never used in this body.
    """
    detok = get_detokenizer()
    nlp = get_spacy_nlp()

    with open(input_fname) as fin:
        for line in fin:
            sample = json.loads(line.strip())

            if positive_only and "label" in sample and not sample["label"]:
                # only consider examples where the query is correct
                continue

            target = sample["target"]

            # clean up the query
            query = target["span1_text"]
            if query is not None:
                if "\n" in query:
                    # multi-line queries are skipped entirely
                    continue
                if query.endswith(".") or query.endswith(","):
                    query = query[:-1]

            # split tokens
            tokens = sample["text"].split(" ")

            def strip_pronoun(x):
                # Drop trailing punctuation/quotes when matching the pronoun.
                return x.rstrip('.,"')

            # find the pronoun
            pronoun_idx = target["span2_index"]
            pronoun = strip_pronoun(target["span2_text"])
            if strip_pronoun(tokens[pronoun_idx]) != pronoun:
                # hack: sometimes the index is misaligned
                if strip_pronoun(tokens[pronoun_idx + 1]) == pronoun:
                    pronoun_idx += 1
                else:
                    raise Exception("Misaligned pronoun!")
            assert strip_pronoun(tokens[pronoun_idx]) == pronoun

            # split tokens before and after the pronoun
            before = tokens[:pronoun_idx]
            after = tokens[pronoun_idx + 1 :]

            # the GPT BPE attaches leading spaces to tokens, so we keep track
            # of whether we need spaces before or after the pronoun
            leading_space = " " if pronoun_idx > 0 else ""
            trailing_space = " " if len(after) > 0 else ""

            # detokenize
            before = detok.detokenize(before, return_str=True)
            pronoun = detok.detokenize([pronoun], return_str=True)
            after = detok.detokenize(after, return_str=True)

            # hack: when the pronoun ends in a period (or comma), move the
            # punctuation to the "after" part
            if pronoun.endswith(".") or pronoun.endswith(","):
                after = pronoun[-1] + trailing_space + after
                pronoun = pronoun[:-1]

            # hack: when the "after" part begins with a comma or period, remove
            # the trailing space
            if after.startswith(".") or after.startswith(","):
                trailing_space = ""

            # parse sentence with spacy
            sentence = nlp(before + leading_space + pronoun + trailing_space + after)

            # find pronoun span
            start = len(before + leading_space)
            first_pronoun_tok = find_token(sentence, start_pos=start)
            pronoun_span = find_span(sentence, pronoun, start=first_pronoun_tok.i)
            assert pronoun_span.text == pronoun

            if eval:
                # convert to format where pronoun is surrounded by "[]" and
                # query is surrounded by "_"
                query_span = find_span(sentence, query)
                query_with_ws = "_{}_{}".format(
                    query_span.text,
                    (" " if query_span.text_with_ws.endswith(" ") else ""),
                )
                pronoun_with_ws = "[{}]{}".format(
                    pronoun_span.text,
                    (" " if pronoun_span.text_with_ws.endswith(" ") else ""),
                )
                # Splice the two marked spans back in, in document order.
                if query_span.start < pronoun_span.start:
                    first = (query_span, query_with_ws)
                    second = (pronoun_span, pronoun_with_ws)
                else:
                    first = (pronoun_span, pronoun_with_ws)
                    second = (query_span, query_with_ws)
                sentence = (
                    sentence[: first[0].start].text_with_ws
                    + first[1]
                    + sentence[first[0].end : second[0].start].text_with_ws
                    + second[1]
                    + sentence[second[0].end :].text
                )
                yield sentence, sample.get("label", None)
            else:
                yield sentence, pronoun_span, query, sample.get("label", None)
193
+
194
+
195
def winogrande_jsonl_iterator(input_fname, eval=False):
    """Yield ``(sentence, pronoun_span, query, candidate)`` tuples from a
    WinoGrande jsonl file.

    In eval mode the two options are yielded in file order (option1 as
    query); otherwise the correct option (per ``answer``) is the query and
    the other is the candidate.
    """
    with open(input_fname) as fin:
        for line in fin:
            sample = json.loads(line.strip())
            sentence = sample["sentence"]
            option1 = sample["option1"]
            option2 = sample["option2"]

            # The blank to fill is marked with a single underscore.
            blank = sentence.index("_")
            pronoun_span = (blank, blank + 1)

            if eval:
                query, cand = option1, option2
            else:
                answer_is_first = sample["answer"] == "1"
                query = option1 if answer_is_first else option2
                cand = option2 if answer_is_first else option1
            yield sentence, pronoun_span, query, cand
213
+
214
+
215
def filter_noun_chunks(
    chunks, exclude_pronouns=False, exclude_query=None, exact_match=False
):
    """Filter candidate noun chunks.

    Optionally drops chunks that are (entirely) pronouns, and chunks that
    match ``exclude_query`` — by case-insensitive substring containment in
    either direction unless ``exact_match`` is set, in which case only an
    exact (lowercased) match is excluded.
    """
    if exclude_pronouns:
        chunks = [
            chunk
            for chunk in chunks
            if chunk.lemma_ != "-PRON-"
            and not all(tok.pos_ == "PRON" for tok in chunk)
        ]

    if exclude_query is not None:
        queries = [exclude_query.lower()]
        kept = []
        for chunk in chunks:
            text = chunk.text.lower()
            matches_query = any(
                ((not exact_match and (text in q or q in text)) or text == q)
                for q in queries
            )
            if not matches_query:
                kept.append(chunk)
        chunks = kept

    return chunks
fairseq-0.10.2/examples/scaling_nmt/README.md ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scaling Neural Machine Translation (Ott et al., 2018)
2
+
3
+ This page includes instructions for reproducing results from the paper [Scaling Neural Machine Translation (Ott et al., 2018)](https://arxiv.org/abs/1806.00187).
4
+
5
+ ## Pre-trained models
6
+
7
+ Model | Description | Dataset | Download
8
+ ---|---|---|---
9
+ `transformer.wmt14.en-fr` | Transformer <br> ([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2)
10
+ `transformer.wmt16.en-de` | Transformer <br> ([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2)
11
+
12
+ ## Training a new model on WMT'16 En-De
13
+
14
+ First download the [preprocessed WMT'16 En-De data provided by Google](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8).
15
+
16
+ Then:
17
+
18
+ ##### 1. Extract the WMT'16 En-De data
19
+ ```bash
20
+ TEXT=wmt16_en_de_bpe32k
21
+ mkdir -p $TEXT
22
+ tar -xzvf wmt16_en_de.tar.gz -C $TEXT
23
+ ```
24
+
25
+ ##### 2. Preprocess the dataset with a joined dictionary
26
+ ```bash
27
+ fairseq-preprocess \
28
+ --source-lang en --target-lang de \
29
+ --trainpref $TEXT/train.tok.clean.bpe.32000 \
30
+ --validpref $TEXT/newstest2013.tok.bpe.32000 \
31
+ --testpref $TEXT/newstest2014.tok.bpe.32000 \
32
+ --destdir data-bin/wmt16_en_de_bpe32k \
33
+ --nwordssrc 32768 --nwordstgt 32768 \
34
+ --joined-dictionary \
35
+ --workers 20
36
+ ```
37
+
38
+ ##### 3. Train a model
39
+ ```bash
40
+ fairseq-train \
41
+ data-bin/wmt16_en_de_bpe32k \
42
+ --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \
43
+ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
44
+ --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \
45
+ --dropout 0.3 --weight-decay 0.0 \
46
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
47
+ --max-tokens 3584 \
48
+ --fp16
49
+ ```
50
+
51
+ Note that the `--fp16` flag requires you have CUDA 9.1 or greater and a Volta GPU or newer.
52
+
53
+ ***IMPORTANT:*** You will get better performance by training with big batches and
54
+ increasing the learning rate. If you want to train the above model with big batches
55
+ (assuming your machine has 8 GPUs):
56
+ - add `--update-freq 16` to simulate training on 8x16=128 GPUs
57
+ - increase the learning rate; 0.001 works well for big batches
58
+
59
+ ##### 4. Evaluate
60
+
61
+ Now we can evaluate our trained model.
62
+
63
+ Note that the original [Attention Is All You Need](https://arxiv.org/abs/1706.03762)
64
+ paper used a couple tricks to achieve better BLEU scores. We use these same tricks in
65
+ the Scaling NMT paper, so it's important to apply them when reproducing our results.
66
+
67
+ First, use the [average_checkpoints.py](/scripts/average_checkpoints.py) script to
68
+ average the last few checkpoints. Averaging the last 5-10 checkpoints is usually
69
+ good, but you may need to adjust this depending on how long you've trained:
70
+ ```bash
71
+ python scripts/average_checkpoints.py \
72
+ --inputs /path/to/checkpoints \
73
+ --num-epoch-checkpoints 10 \
74
+ --output checkpoint.avg10.pt
75
+ ```
76
+
77
+ Next, generate translations using a beam width of 4 and length penalty of 0.6:
78
+ ```bash
79
+ fairseq-generate \
80
+ data-bin/wmt16_en_de_bpe32k \
81
+ --path checkpoint.avg10.pt \
82
+ --beam 4 --lenpen 0.6 --remove-bpe > gen.out
83
+ ```
84
+
85
+ Finally, we apply the ["compound splitting" script](/scripts/compound_split_bleu.sh) to
86
+ add spaces around dashes. For example "Café-Liebhaber" would become three tokens:
87
+ "Café - Liebhaber". This typically results in larger BLEU scores, but it is not
88
+ appropriate to compare these inflated scores to work which does not include this trick.
89
+ This trick was used in the [original AIAYN code](https://github.com/tensorflow/tensor2tensor/blob/fc9335c0203685cbbfe2b30c92db4352d8f60779/tensor2tensor/utils/get_ende_bleu.sh),
90
+ so we used it in the Scaling NMT paper as well. That said, it's strongly advised to
91
+ report [sacrebleu](https://github.com/mjpost/sacrebleu) scores instead.
92
+
93
+ To compute "compound split" tokenized BLEU (not recommended!):
94
+ ```bash
95
+ bash scripts/compound_split_bleu.sh gen.out
96
+ # BLEU4 = 29.29, 60.3/35.0/22.8/15.3 (BP=1.000, ratio=1.004, syslen=64763, reflen=64496)
97
+ ```
98
+
99
+ To compute detokenized BLEU with sacrebleu (preferred):
100
+ ```bash
101
+ bash scripts/sacrebleu.sh wmt14/full en de gen.out
102
+ # BLEU+case.mixed+lang.en-de+numrefs.1+smooth.exp+test.wmt14/full+tok.13a+version.1.4.3 = 28.6 59.3/34.3/22.1/14.9 (BP = 1.000 ratio = 1.016 hyp_len = 63666 ref_len = 62688)
103
+ ```
104
+
105
+ ## Citation
106
+
107
+ ```bibtex
108
+ @inproceedings{ott2018scaling,
109
+ title = {Scaling Neural Machine Translation},
110
+ author = {Ott, Myle and Edunov, Sergey and Grangier, David and Auli, Michael},
111
+ booktitle = {Proceedings of the Third Conference on Machine Translation (WMT)},
112
+ year = 2018,
113
+ }
114
+ ```
fairseq-0.10.2/examples/simultaneous_translation/criterions/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import importlib
7
+ import os
8
+
9
+
10
+ for file in os.listdir(os.path.dirname(__file__)):
11
+ if file.endswith(".py") and not file.startswith("_"):
12
+ criterion_name = file[: file.find(".py")]
13
+ importlib.import_module(
14
+ "examples.simultaneous_translation.criterions." + criterion_name
15
+ )
fairseq-0.10.2/examples/simultaneous_translation/criterions/label_smoothed_cross_entropy_latency_augmented.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from examples.simultaneous_translation.utils.latency import LatencyTraining
7
+ from fairseq.criterions import register_criterion
8
+ from fairseq.criterions.label_smoothed_cross_entropy import (
9
+ LabelSmoothedCrossEntropyCriterion,
10
+ )
11
+
12
+
13
+ @register_criterion("latency_augmented_label_smoothed_cross_entropy")
14
+ class LatencyAugmentedLabelSmoothedCrossEntropyCriterion(
15
+ LabelSmoothedCrossEntropyCriterion
16
+ ):
17
+ def __init__(self, args, task):
18
+ super().__init__(args, task)
19
+ self.eps = args.label_smoothing
20
+ self.latency_weight_avg = args.latency_weight_avg
21
+ self.latency_weight_avg_type = args.latency_weight_avg_type
22
+ self.latency_weight_var = args.latency_weight_var
23
+ self.latency_weight_var_type = args.latency_weight_var_type
24
+ self.mass_preservation = args.mass_preservation
25
+ self.average_method = args.average_method
26
+ self.latency_train = LatencyTraining(
27
+ self.latency_weight_avg,
28
+ self.latency_weight_var,
29
+ self.latency_weight_avg_type,
30
+ self.latency_weight_var_type,
31
+ self.mass_preservation,
32
+ self.average_method,
33
+ )
34
+
35
+ @staticmethod
36
+ def add_args(parser):
37
+ super(
38
+ LatencyAugmentedLabelSmoothedCrossEntropyCriterion,
39
+ LatencyAugmentedLabelSmoothedCrossEntropyCriterion,
40
+ ).add_args(parser)
41
+ """Add criterion-specific arguments to the parser."""
42
+ # fmt: off
43
+ parser.add_argument("--latency-weight-avg", default=0., type=float, metavar='D',
44
+ help="Average loss weight")
45
+ parser.add_argument("--latency-weight-var", default=0., type=float, metavar='D',
46
+ help="Variance loss weight")
47
+ parser.add_argument("--latency-weight-avg-type", default="differentiable_average_lagging",
48
+ help="Statistics for Average loss type")
49
+ parser.add_argument("--latency-weight-var-type", default="variance_delay",
50
+ help="Statistics for variance loss type")
51
+ parser.add_argument("--average-method", default="weighted_average",
52
+ help="Average loss type")
53
+ # fmt: on
54
+
55
+ def compute_loss(self, model, net_output, sample, reduce=True):
56
+ # Compute cross entropy loss first
57
+ loss, nll_loss = super().compute_loss(model, net_output, sample, reduce)
58
+
59
+ # Obtain the expected alignment
60
+ attn_list = [item["alpha"] for item in net_output[-1]["attn_list"]]
61
+
62
+ target_padding_mask = model.get_targets(sample, net_output).eq(self.padding_idx)
63
+
64
+ source_padding_mask = net_output[-1].get("encoder_padding_mask", None)
65
+
66
+ # Get latency loss
67
+ latency_loss = self.latency_train.loss(
68
+ attn_list, source_padding_mask, target_padding_mask
69
+ )
70
+
71
+ loss += latency_loss
72
+
73
+ return loss, nll_loss
fairseq-0.10.2/examples/simultaneous_translation/eval/agents/__init__.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import importlib
7
+ import os
8
+
9
+ from fairseq import registry
10
+
11
+
12
+ build_agent, register_agent, MONOTONIC_AGENT, _ = registry.setup_registry(
13
+ "--agent-type"
14
+ )
15
+
16
+
17
+ DEFAULT_EOS = "</s>"
18
+ GET = 0
19
+ SEND = 1
20
+
21
+ for file in os.listdir(os.path.dirname(__file__)):
22
+ if file.endswith(".py") and not file.startswith("_"):
23
+ module = file[: file.find(".py")]
24
+ importlib.import_module("agents." + module)
fairseq-0.10.2/examples/simultaneous_translation/eval/agents/agent.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import time
7
+ from functools import partial
8
+ from multiprocessing.pool import ThreadPool as Pool
9
+
10
+ from . import DEFAULT_EOS, GET, SEND
11
+
12
+
13
+ class Agent(object):
14
+ "an agent needs to follow this pattern"
15
+
16
+ def __init__(self, *args, **kwargs):
17
+ pass
18
+
19
+ def init_states(self, *args, **kwargs):
20
+ raise NotImplementedError
21
+
22
+ def update_states(self, states, new_state):
23
+ raise NotImplementedError
24
+
25
+ def finish_eval(self, states, new_state):
26
+ raise NotImplementedError
27
+
28
+ def policy(self, state):
29
+ raise NotImplementedError
30
+
31
+ def reset(self):
32
+ raise NotImplementedError
33
+
34
+ def decode(self, session, low=0, high=100000, num_thread=10):
35
+ corpus_info = session.corpus_info()
36
+ high = min(corpus_info["num_sentences"] - 1, high)
37
+ if low >= high:
38
+ return
39
+
40
+ t0 = time.time()
41
+ if num_thread > 1:
42
+ with Pool(10) as p:
43
+ p.map(
44
+ partial(self._decode_one, session),
45
+ [sent_id for sent_id in range(low, high + 1)],
46
+ )
47
+ else:
48
+ for sent_id in range(low, high + 1):
49
+ self._decode_one(session, sent_id)
50
+
51
+ print(f"Finished {low} to {high} in {time.time() - t0}s")
52
+
53
+ def _decode_one(self, session, sent_id):
54
+ action = {}
55
+ self.reset()
56
+ states = self.init_states()
57
+ while action.get("value", None) != DEFAULT_EOS:
58
+ # take an action
59
+ action = self.policy(states)
60
+
61
+ if action["key"] == GET:
62
+ new_states = session.get_src(sent_id, action["value"])
63
+ states = self.update_states(states, new_states)
64
+
65
+ elif action["key"] == SEND:
66
+ session.send_hypo(sent_id, action["value"])
67
+ print(" ".join(states["tokens"]["tgt"]))
fairseq-0.10.2/examples/simultaneous_translation/eval/agents/simul_trans_agent.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+
9
+ from fairseq import checkpoint_utils, tasks, utils
10
+
11
+ from . import DEFAULT_EOS, GET, SEND
12
+ from .agent import Agent
13
+
14
+
15
+ class SimulTransAgent(Agent):
16
+ def __init__(self, args):
17
+ # Load Model
18
+ self.load_model(args)
19
+
20
+ # build word spliter
21
+ self.build_word_splitter(args)
22
+
23
+ self.max_len = args.max_len
24
+
25
+ self.eos = DEFAULT_EOS
26
+
27
+ @staticmethod
28
+ def add_args(parser):
29
+ # fmt: off
30
+ parser.add_argument('--model-path', type=str, required=True,
31
+ help='path to your pretrained model.')
32
+ parser.add_argument("--data-bin", type=str, required=True,
33
+ help="Path of data binary")
34
+ parser.add_argument("--user-dir", type=str, default="example/simultaneous_translation",
35
+ help="User directory for simultaneous translation")
36
+ parser.add_argument("--src-splitter-type", type=str, default=None,
37
+ help="Subword splitter type for source text")
38
+ parser.add_argument("--tgt-splitter-type", type=str, default=None,
39
+ help="Subword splitter type for target text")
40
+ parser.add_argument("--src-splitter-path", type=str, default=None,
41
+ help="Subword splitter model path for source text")
42
+ parser.add_argument("--tgt-splitter-path", type=str, default=None,
43
+ help="Subword splitter model path for target text")
44
+ parser.add_argument("--max-len", type=int, default=150,
45
+ help="Maximum length difference between source and target prediction")
46
+ parser.add_argument('--model-overrides', default="{}", type=str, metavar='DICT',
47
+ help='A dictionary used to override model args at generation '
48
+ 'that were used during model training')
49
+ # fmt: on
50
+ return parser
51
+
52
+ def load_dictionary(self, task):
53
+ raise NotImplementedError
54
+
55
+ def load_model(self, args):
56
+ args.user_dir = os.path.join(os.path.dirname(__file__), "..", "..")
57
+ utils.import_user_module(args)
58
+ filename = args.model_path
59
+ if not os.path.exists(filename):
60
+ raise IOError("Model file not found: {}".format(filename))
61
+
62
+ state = checkpoint_utils.load_checkpoint_to_cpu(
63
+ filename, json.loads(args.model_overrides)
64
+ )
65
+
66
+ saved_args = state["args"]
67
+ saved_args.data = args.data_bin
68
+
69
+ task = tasks.setup_task(saved_args)
70
+
71
+ # build model for ensemble
72
+ self.model = task.build_model(saved_args)
73
+ self.model.load_state_dict(state["model"], strict=True)
74
+
75
+ # Set dictionary
76
+ self.load_dictionary(task)
77
+
78
+ def init_states(self):
79
+ return {
80
+ "indices": {"src": [], "tgt": []},
81
+ "tokens": {"src": [], "tgt": []},
82
+ "segments": {"src": [], "tgt": []},
83
+ "steps": {"src": 0, "tgt": 0},
84
+ "finished": False,
85
+ "finish_read": False,
86
+ "model_states": {},
87
+ }
88
+
89
+ def update_states(self, states, new_state):
90
+ raise NotImplementedError
91
+
92
+ def policy(self, states):
93
+ # Read and Write policy
94
+ action = None
95
+
96
+ while action is None:
97
+ if states["finished"]:
98
+ # Finish the hypo by sending eos to server
99
+ return self.finish_action()
100
+
101
+ # Model make decision given current states
102
+ decision = self.model.decision_from_states(states)
103
+
104
+ if decision == 0 and not self.finish_read(states):
105
+ # READ
106
+ action = self.read_action(states)
107
+ else:
108
+ # WRITE
109
+ action = self.write_action(states)
110
+
111
+ # None means we make decision again but not sending server anything
112
+ # This happened when read a bufffered token
113
+ # Or predict a subword
114
+ return action
115
+
116
+ def finish_read(self, states):
117
+ raise NotImplementedError
118
+
119
+ def write_action(self, states):
120
+ token, index = self.model.predict_from_states(states)
121
+
122
+ if (
123
+ index == self.dict["tgt"].eos()
124
+ or len(states["tokens"]["tgt"]) > self.max_len
125
+ ):
126
+ # Finish this sentence is predict EOS
127
+ states["finished"] = True
128
+ end_idx_last_full_word = self._target_length(states)
129
+
130
+ else:
131
+ states["tokens"]["tgt"] += [token]
132
+ end_idx_last_full_word = self.word_splitter["tgt"].end_idx_last_full_word(
133
+ states["tokens"]["tgt"]
134
+ )
135
+ self._append_indices(states, [index], "tgt")
136
+
137
+ if end_idx_last_full_word > states["steps"]["tgt"]:
138
+ # Only sent detokenized full words to the server
139
+ word = self.word_splitter["tgt"].merge(
140
+ states["tokens"]["tgt"][states["steps"]["tgt"] : end_idx_last_full_word]
141
+ )
142
+ states["steps"]["tgt"] = end_idx_last_full_word
143
+ states["segments"]["tgt"] += [word]
144
+
145
+ return {"key": SEND, "value": word}
146
+ else:
147
+ return None
148
+
149
+ def read_action(self, states):
150
+ return {"key": GET, "value": None}
151
+
152
+ def finish_action(self):
153
+ return {"key": SEND, "value": DEFAULT_EOS}
154
+
155
+ def reset(self):
156
+ pass
157
+
158
+ def finish_eval(self, states, new_state):
159
+ if len(new_state) == 0 and len(states["indices"]["src"]) == 0:
160
+ return True
161
+ return False
162
+
163
+ def _append_indices(self, states, new_indices, key):
164
+ states["indices"][key] += new_indices
165
+
166
+ def _target_length(self, states):
167
+ return len(states["tokens"]["tgt"])
fairseq-0.10.2/examples/simultaneous_translation/eval/agents/simul_trans_text_agent.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import DEFAULT_EOS, GET, register_agent
7
+ from .simul_trans_agent import SimulTransAgent
8
+ from .word_splitter import SPLITTER_DICT
9
+
10
+
11
+ @register_agent("simul_trans_text")
12
+ class SimulTransTextAgent(SimulTransAgent):
13
+ def build_word_splitter(self, args):
14
+ self.word_splitter = {}
15
+
16
+ self.word_splitter["src"] = SPLITTER_DICT[args.src_splitter_type](
17
+ getattr(args, f"src_splitter_path")
18
+ )
19
+ self.word_splitter["tgt"] = SPLITTER_DICT[args.tgt_splitter_type](
20
+ getattr(args, f"tgt_splitter_path")
21
+ )
22
+
23
+ def load_dictionary(self, task):
24
+ self.dict = {}
25
+ self.dict["tgt"] = task.target_dictionary
26
+ self.dict["src"] = task.source_dictionary
27
+
28
+ def update_states(self, states, new_state):
29
+ if states["finish_read"]:
30
+ return states
31
+
32
+ new_word = new_state["segment"]
33
+
34
+ # Split words and index the token
35
+ if new_word not in [DEFAULT_EOS]:
36
+ tokens = self.word_splitter["src"].split(new_word)
37
+ # Get indices from dictionary
38
+ # You can change to you own dictionary
39
+ indices = (
40
+ self.dict["src"]
41
+ .encode_line(
42
+ tokens,
43
+ line_tokenizer=lambda x: x,
44
+ add_if_not_exist=False,
45
+ append_eos=False,
46
+ )
47
+ .tolist()
48
+ )
49
+ else:
50
+ tokens = [new_word]
51
+ indices = [self.dict["src"].eos()]
52
+ states["finish_read"] = True
53
+
54
+ # Update states
55
+ states["segments"]["src"] += [new_word]
56
+ states["tokens"]["src"] += tokens
57
+ self._append_indices(states, indices, "src")
58
+
59
+ return states
60
+
61
+ def read_action(self, states):
62
+ # Increase source step by one
63
+ states["steps"]["src"] += 1
64
+
65
+ # At leat one word is read
66
+ if len(states["tokens"]["src"]) == 0:
67
+ return {"key": GET, "value": None}
68
+
69
+ # Only request new word if there is no buffered tokens
70
+ if len(states["tokens"]["src"]) <= states["steps"]["src"]:
71
+ return {"key": GET, "value": None}
72
+
73
+ return None
74
+
75
+ def finish_read(self, states):
76
+ # The first means all segments (full words) has been read from server
77
+ # The second means all tokens (subwords) has been read locally
78
+ return (
79
+ states["finish_read"]
80
+ and len(states["tokens"]["src"]) == states["steps"]["src"]
81
+ )
fairseq-0.10.2/examples/simultaneous_translation/eval/scorers/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import importlib
7
+ import os
8
+
9
+ from fairseq import registry
10
+
11
+
12
+ (build_scorer, register_scorer, SCORER_REGISTRIES, _) = registry.setup_registry(
13
+ "--scorer-type"
14
+ )
15
+
16
+ for file in os.listdir(os.path.dirname(__file__)):
17
+ if file.endswith(".py") and not file.startswith("_"):
18
+ module = file[: file.find(".py")]
19
+ importlib.import_module("scorers." + module)
fairseq-0.10.2/examples/simultaneous_translation/eval/scorers/scorer.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+ from collections import defaultdict
9
+
10
+ from examples.simultaneous_translation.eval.eval_latency import LatencyScorer
11
+ from vizseq.scorers.bleu import BLEUScorer
12
+ from vizseq.scorers.meteor import METEORScorer
13
+ from vizseq.scorers.ter import TERScorer
14
+
15
+
16
+ DEFAULT_EOS = "</s>"
17
+
18
+
19
+ class SimulScorer(object):
20
+ def __init__(self, args):
21
+ self.tokenizer = args.tokenizer
22
+ self.output_dir = args.output
23
+ if args.output is not None:
24
+ self.output_files = {
25
+ "text": os.path.join(args.output, "text"),
26
+ "delay": os.path.join(args.output, "delay"),
27
+ "scores": os.path.join(args.output, "scores"),
28
+ }
29
+ else:
30
+ self.output_files = None
31
+ self.eos = DEFAULT_EOS
32
+ self.data = {"tgt": []}
33
+ self.reset()
34
+
35
+ def get_info(self):
36
+ return {"num_sentences": len(self)}
37
+
38
+ @staticmethod
39
+ def add_args(parser):
40
+ # fmt: off
41
+ parser.add_argument('--src-file', type=str, required=True,
42
+ help='Source input file')
43
+ parser.add_argument('--tgt-file', type=str, required=True,
44
+ help='Target reference file')
45
+ parser.add_argument('--tokenizer', default="13a", choices=["none", "13a"],
46
+ help='Tokenizer used for sacrebleu')
47
+ parser.add_argument('--output', type=str, default=None,
48
+ help='Path for output directory')
49
+ # fmt: on
50
+
51
+ def send_src(self, sent_id, *args):
52
+ raise NotImplementedError
53
+
54
+ def recv_hyp(self, sent_id, list_of_tokens):
55
+ for token in list_of_tokens:
56
+ self.translations[sent_id].append((token, self.steps[sent_id]))
57
+
58
+ def reset(self):
59
+ self.steps = defaultdict(int)
60
+ self.translations = defaultdict(list)
61
+
62
+ def src_lengths(self):
63
+ raise NotImplementedError
64
+
65
+ def score(self):
66
+ translations = []
67
+ delays = []
68
+ for i in range(1 + max(self.translations.keys())):
69
+ translations += [" ".join(t[0] for t in self.translations[i][:-1])]
70
+ delays += [[t[1] for t in self.translations[i]]]
71
+
72
+ bleu_score = BLEUScorer(
73
+ sent_level=False,
74
+ corpus_level=True,
75
+ extra_args={"bleu_tokenizer": self.tokenizer},
76
+ ).score(translations, [self.data["tgt"]])
77
+
78
+ ter_score = TERScorer(sent_level=False, corpus_level=True).score(
79
+ translations, [self.data["tgt"]]
80
+ )
81
+ meteor_score = METEORScorer(sent_level=False, corpus_level=True).score(
82
+ translations, [self.data["tgt"]]
83
+ )
84
+
85
+ latency_score = LatencyScorer().score(
86
+ [
87
+ {"src_len": src_len, "delays": delay}
88
+ for src_len, delay in zip(self.src_lengths(), delays)
89
+ ],
90
+ start_from_zero=False,
91
+ )
92
+
93
+ scores = {
94
+ "BLEU": bleu_score[0],
95
+ "TER": ter_score[0],
96
+ "METEOR": meteor_score[0],
97
+ "DAL": latency_score["differentiable_average_lagging"],
98
+ "AL": latency_score["average_lagging"],
99
+ "AP": latency_score["average_proportion"],
100
+ }
101
+
102
+ if self.output_files is not None:
103
+ try:
104
+ os.makedirs(self.output_dir, exist_ok=True)
105
+ self.write_results_to_file(translations, delays, scores)
106
+ except BaseException as be:
107
+ print(f"Failed to write results to {self.output_dir}.")
108
+ print(be)
109
+ print("Skip writing predictions")
110
+
111
+ return scores
112
+
113
+ def write_results_to_file(self, translations, delays, scores):
114
+ if self.output_files["text"] is not None:
115
+ with open(self.output_files["text"], "w") as f:
116
+ for line in translations:
117
+ f.write(line + "\n")
118
+
119
+ if self.output_files["delay"] is not None:
120
+ with open(self.output_files["delay"], "w") as f:
121
+ for i, delay in enumerate(delays):
122
+ f.write(
123
+ json.dumps({"src_len": self.src_lengths()[i], "delays": delay})
124
+ + "\n"
125
+ )
126
+
127
+ with open(self.output_files["scores"], "w") as f:
128
+ for key, value in scores.items():
129
+ f.write(f"{key}, {value}\n")
130
+
131
+ @classmethod
132
+ def _load_text_file(cls, file, split=False):
133
+ with open(file) as f:
134
+ if split:
135
+ return [r.strip().split() for r in f]
136
+ else:
137
+ return [r.strip() for r in f]
138
+
139
+ @classmethod
140
+ def _load_text_from_json(cls, file):
141
+ list_to_return = []
142
+ with open(file) as f:
143
+ content = json.load(f)
144
+ for item in content["utts"].values():
145
+ list_to_return.append(item["output"]["text"].strip())
146
+ return list_to_return
147
+
148
+ @classmethod
149
+ def _load_wav_info_from_json(cls, file):
150
+ list_to_return = []
151
+ with open(file) as f:
152
+ content = json.load(f)
153
+ for item in content["utts"].values():
154
+ list_to_return.append(
155
+ {
156
+ "path": item["input"]["path"].strip(),
157
+ "length": item["input"]["length_ms"],
158
+ }
159
+ )
160
+ return list_to_return
161
+
162
+ @classmethod
163
+ def _load_wav_info_from_list(cls, file):
164
+ list_to_return = []
165
+ with open(file) as f:
166
+ for line in f:
167
+ list_to_return.append(
168
+ {
169
+ "path": line.strip(),
170
+ }
171
+ )
172
+ return list_to_return
173
+
174
+ def __len__(self):
175
+ return len(self.data["tgt"])
fairseq-0.10.2/examples/simultaneous_translation/eval/scorers/text_scorer.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from . import register_scorer
7
+ from .scorer import SimulScorer
8
+
9
+
10
+ @register_scorer("text")
11
+ class SimulTextScorer(SimulScorer):
12
+ def __init__(self, args):
13
+ super().__init__(args)
14
+ self.data = {
15
+ "src": self._load_text_file(args.src_file, split=True),
16
+ "tgt": self._load_text_file(args.tgt_file, split=False),
17
+ }
18
+
19
+ def send_src(self, sent_id, *args):
20
+ if self.steps[sent_id] >= len(self.data["src"][sent_id]):
21
+ dict_to_return = {
22
+ "sent_id": sent_id,
23
+ "segment_id": self.steps[sent_id],
24
+ "segment": self.eos,
25
+ }
26
+ # Consider EOS
27
+ self.steps[sent_id] = len(self.data["src"][sent_id]) + 1
28
+ else:
29
+ dict_to_return = {
30
+ "sent_id": sent_id,
31
+ "segment_id": self.steps[sent_id],
32
+ "segment": self.data["src"][sent_id][self.steps[sent_id]],
33
+ }
34
+
35
+ self.steps[sent_id] += 1
36
+
37
+ return dict_to_return
38
+
39
+ def src_lengths(self):
40
+ # +1 for eos
41
+ return [len(sent) + 1 for sent in self.data["src"]]
fairseq-0.10.2/examples/speech_recognition/criterions/ASG_loss.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ import torch
9
+ from examples.speech_recognition.data.replabels import pack_replabels
10
+ from fairseq import utils
11
+ from fairseq.criterions import FairseqCriterion, register_criterion
12
+
13
+
14
+ @register_criterion("asg_loss")
15
+ class ASGCriterion(FairseqCriterion):
16
+ @staticmethod
17
+ def add_args(parser):
18
+ group = parser.add_argument_group("ASG Loss")
19
+ group.add_argument(
20
+ "--asg-transitions-init",
21
+ help="initial diagonal value of transition matrix",
22
+ type=float,
23
+ default=0.0,
24
+ )
25
+ group.add_argument(
26
+ "--max-replabel", help="maximum # of replabels", type=int, default=2
27
+ )
28
+ group.add_argument(
29
+ "--linseg-updates",
30
+ help="# of training updates to use LinSeg initialization",
31
+ type=int,
32
+ default=0,
33
+ )
34
+ group.add_argument(
35
+ "--hide-linseg-messages",
36
+ help="hide messages about LinSeg initialization",
37
+ action="store_true",
38
+ )
39
+
40
+ def __init__(
41
+ self,
42
+ task,
43
+ silence_token,
44
+ asg_transitions_init,
45
+ max_replabel,
46
+ linseg_updates,
47
+ hide_linseg_messages,
48
+ ):
49
+ from wav2letter.criterion import ASGLoss, CriterionScaleMode
50
+
51
+ super().__init__(task)
52
+ self.tgt_dict = task.target_dictionary
53
+ self.eos = self.tgt_dict.eos()
54
+ self.silence = (
55
+ self.tgt_dict.index(silence_token)
56
+ if silence_token in self.tgt_dict
57
+ else None
58
+ )
59
+ self.max_replabel = max_replabel
60
+
61
+ num_labels = len(self.tgt_dict)
62
+ self.asg = ASGLoss(num_labels, scale_mode=CriterionScaleMode.TARGET_SZ_SQRT)
63
+ self.asg.trans = torch.nn.Parameter(
64
+ asg_transitions_init * torch.eye(num_labels), requires_grad=True
65
+ )
66
+
67
+ self.linseg_progress = torch.nn.Parameter(
68
+ torch.tensor([0], dtype=torch.int), requires_grad=False
69
+ )
70
+ self.linseg_maximum = linseg_updates
71
+ self.linseg_message_state = "none" if hide_linseg_messages else "start"
72
+
73
+ @classmethod
74
+ def build_criterion(cls, args, task):
75
+ return cls(
76
+ task,
77
+ args.silence_token,
78
+ args.asg_transitions_init,
79
+ args.max_replabel,
80
+ args.linseg_updates,
81
+ args.hide_linseg_messages,
82
+ )
83
+
84
+ def linseg_step(self):
85
+ if not self.training:
86
+ return False
87
+ if self.linseg_progress.item() < self.linseg_maximum:
88
+ if self.linseg_message_state == "start":
89
+ print("| using LinSeg to initialize ASG")
90
+ self.linseg_message_state = "finish"
91
+ self.linseg_progress.add_(1)
92
+ return True
93
+ elif self.linseg_message_state == "finish":
94
+ print("| finished LinSeg initialization")
95
+ self.linseg_message_state = "none"
96
+ return False
97
+
98
+ def replace_eos_with_silence(self, tgt):
99
+ if tgt[-1] != self.eos:
100
+ return tgt
101
+ elif self.silence is None or (len(tgt) > 1 and tgt[-2] == self.silence):
102
+ return tgt[:-1]
103
+ else:
104
+ return tgt[:-1] + [self.silence]
105
+
106
+ def forward(self, model, sample, reduce=True):
107
+ """Compute the loss for the given sample.
108
+
109
+ Returns a tuple with three elements:
110
+ 1) the loss
111
+ 2) the sample size, which is used as the denominator for the gradient
112
+ 3) logging outputs to display while training
113
+ """
114
+
115
+ net_output = model(**sample["net_input"])
116
+ emissions = net_output["encoder_out"].transpose(0, 1).contiguous()
117
+ B = emissions.size(0)
118
+ T = emissions.size(1)
119
+ device = emissions.device
120
+
121
+ target = torch.IntTensor(B, T)
122
+ target_size = torch.IntTensor(B)
123
+ using_linseg = self.linseg_step()
124
+
125
+ for b in range(B):
126
+ initial_target_size = sample["target_lengths"][b].item()
127
+ if initial_target_size == 0:
128
+ raise ValueError("target size cannot be zero")
129
+
130
+ tgt = sample["target"][b, :initial_target_size].tolist()
131
+ tgt = self.replace_eos_with_silence(tgt)
132
+ tgt = pack_replabels(tgt, self.tgt_dict, self.max_replabel)
133
+ tgt = tgt[:T]
134
+
135
+ if using_linseg:
136
+ tgt = [tgt[t * len(tgt) // T] for t in range(T)]
137
+
138
+ target[b][: len(tgt)] = torch.IntTensor(tgt)
139
+ target_size[b] = len(tgt)
140
+
141
+ loss = self.asg.forward(emissions, target.to(device), target_size.to(device))
142
+
143
+ if reduce:
144
+ loss = torch.sum(loss)
145
+
146
+ sample_size = (
147
+ sample["target"].size(0) if self.args.sentence_avg else sample["ntokens"]
148
+ )
149
+ logging_output = {
150
+ "loss": utils.item(loss.data) if reduce else loss.data,
151
+ "ntokens": sample["ntokens"],
152
+ "nsentences": sample["target"].size(0),
153
+ "sample_size": sample_size,
154
+ }
155
+ return loss, sample_size, logging_output
156
+
157
+ @staticmethod
158
+ def aggregate_logging_outputs(logging_outputs):
159
+ """Aggregate logging outputs from data parallel training."""
160
+ loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
161
+ ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
162
+ nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
163
+ sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
164
+ agg_output = {
165
+ "loss": loss_sum / nsentences,
166
+ "ntokens": ntokens,
167
+ "nsentences": nsentences,
168
+ "sample_size": sample_size,
169
+ }
170
+ return agg_output