diff --git a/.gitattributes b/.gitattributes index 330615f4245edb126f9e19a7dc7fbb1eddce9b12..f2a2c76de831fff2251aff6720ca43af603fd329 100644 --- a/.gitattributes +++ b/.gitattributes @@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text fairseq/examples/MMPT/vlm.png filter=lfs diff=lfs merge=lfs -text fairseq/examples/MMPT/videoclip.png filter=lfs diff=lfs merge=lfs -text +fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text +fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text diff --git a/fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so b/fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..36137b1458a791b2673d8456cc9ebdc17c482b67 --- /dev/null +++ b/fairseq/alignment_train_cpu_binding.cpython-310-x86_64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c38fe0fe1fc34d8ef940b7d2c8bb7d81f4658444e18e1d6beb0ab981a3a9de75 +size 146280 diff --git a/fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so b/fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so new file mode 100644 index 0000000000000000000000000000000000000000..dc00d7b866c7f5eb878153bb5db91fd157115ad6 --- /dev/null +++ b/fairseq/alignment_train_cuda_binding.cpython-310-x86_64-linux-gnu.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b026c2052231c4d2995e584dcdfd0a31c3509884e0568f2dff7448004f87773 +size 1226768 diff --git a/fairseq/examples/backtranslation/prepare-wmt18en2de.sh b/fairseq/examples/backtranslation/prepare-wmt18en2de.sh new file mode 100644 index 0000000000000000000000000000000000000000..f6fd275307db50ca84c299440ae02dce49064030 --- /dev/null +++ b/fairseq/examples/backtranslation/prepare-wmt18en2de.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# 
Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh + +echo 'Cloning Moses github repository (for tokenization scripts)...' +git clone https://github.com/moses-smt/mosesdecoder.git + +echo 'Cloning Subword NMT repository (for BPE pre-processing)...' +git clone https://github.com/rsennrich/subword-nmt.git + +SCRIPTS=mosesdecoder/scripts +TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl +CLEAN=$SCRIPTS/training/clean-corpus-n.perl +NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl +REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl +BPEROOT=subword-nmt/subword_nmt +BPE_TOKENS=32000 + +URLS=( + "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz" + "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz" + "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz" + "http://data.statmt.org/wmt18/translation-task/rapid2016.tgz" + "http://data.statmt.org/wmt17/translation-task/dev.tgz" + "http://statmt.org/wmt14/test-full.tgz" +) +FILES=( + "training-parallel-europarl-v7.tgz" + "training-parallel-commoncrawl.tgz" + "training-parallel-nc-v13.tgz" + "rapid2016.tgz" + "dev.tgz" + "test-full.tgz" +) +CORPORA=( + "training/europarl-v7.de-en" + "commoncrawl.de-en" + "training-parallel-nc-v13/news-commentary-v13.de-en" + "rapid2016.de-en" +) + +if [ ! -d "$SCRIPTS" ]; then + echo "Please set SCRIPTS variable correctly to point to Moses scripts." + exit 1 +fi + +OUTDIR=wmt18_en_de + +src=en +tgt=de +lang=en-de +prep=$OUTDIR +tmp=$prep/tmp +orig=orig + +mkdir -p $orig $tmp $prep + +cd $orig + +for ((i=0;i<${#URLS[@]};++i)); do + file=${FILES[i]} + if [ -f $file ]; then + echo "$file already exists, skipping download" + else + url=${URLS[i]} + wget "$url" + if [ -f $file ]; then + echo "$url successfully downloaded." + else + echo "$url not successfully downloaded." 
+ exit 1 + fi + if [ ${file: -4} == ".tgz" ]; then + tar zxvf $file + elif [ ${file: -4} == ".tar" ]; then + tar xvf $file + fi + fi +done +cd .. + +echo "pre-processing train data..." +for l in $src $tgt; do + rm $tmp/train.tags.$lang.tok.$l + for f in "${CORPORA[@]}"; do + cat $orig/$f.$l | \ + perl $NORM_PUNC $l | \ + perl $REM_NON_PRINT_CHAR | \ + perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l + done +done + +echo "pre-processing test data..." +for l in $src $tgt; do + if [ "$l" == "$src" ]; then + t="src" + else + t="ref" + fi + grep '\s*//g' | \ + sed -e 's/\s*<\/seg>\s*//g' | \ + sed -e "s/\’/\'/g" | \ + perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l + echo "" +done + +echo "splitting train and valid..." +for l in $src $tgt; do + awk '{if (NR%100 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l + awk '{if (NR%100 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l +done + +TRAIN=$tmp/train.de-en +BPE_CODE=$prep/code +rm -f $TRAIN +for l in $src $tgt; do + cat $tmp/train.$l >> $TRAIN +done + +echo "learn_bpe.py on ${TRAIN}..." +python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE + +for L in $src $tgt; do + for f in train.$L valid.$L test.$L; do + echo "apply_bpe.py to ${f}..." 
+ python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f + done +done + +perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250 +perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250 + +for L in $src $tgt; do + cp $tmp/bpe.test.$L $prep/test.$L +done diff --git a/fairseq/examples/backtranslation/sacrebleu.sh b/fairseq/examples/backtranslation/sacrebleu.sh new file mode 100644 index 0000000000000000000000000000000000000000..a70da23f48e2699297799611412783d4560dc45a --- /dev/null +++ b/fairseq/examples/backtranslation/sacrebleu.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +if [ $# -ne 5 ]; then + echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]" + exit +fi + + +DATASET=$1 +LANGPAIR=$2 +DATABIN=$3 +BPECODE=$4 +MODEL=$5 + +SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1) +TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2) + + +BPEROOT=examples/backtranslation/subword-nmt/subword_nmt +if [ ! -e $BPEROOT ]; then + BPEROOT=subword-nmt/subword_nmt + if [ ! -e $BPEROOT ]; then + echo 'Cloning Subword NMT repository (for BPE pre-processing)...' 
+ git clone https://github.com/rsennrich/subword-nmt.git + fi +fi + + +sacrebleu -t $DATASET -l $LANGPAIR --echo src \ +| sacremoses tokenize -a -l $SRCLANG -q \ +| python $BPEROOT/apply_bpe.py -c $BPECODE \ +| fairseq-interactive $DATABIN --path $MODEL \ + -s $SRCLANG -t $TGTLANG \ + --beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \ +| grep ^H- | cut -f 3- \ +| sacremoses detokenize -l $TGTLANG -q \ +| sacrebleu -t $DATASET -l $LANGPAIR diff --git a/fairseq/examples/backtranslation/tokenized_bleu.sh b/fairseq/examples/backtranslation/tokenized_bleu.sh new file mode 100644 index 0000000000000000000000000000000000000000..c6d6aaa193f6059299bc98909324fe4b9b060372 --- /dev/null +++ b/fairseq/examples/backtranslation/tokenized_bleu.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +if [ $# -ne 5 ]; then + echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]" + exit +fi + + +DATASET=$1 +LANGPAIR=$2 +DATABIN=$3 +BPECODE=$4 +MODEL=$5 + +SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1) +TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2) + + +BPEROOT=examples/backtranslation/subword-nmt/subword_nmt +if [ ! -e $BPEROOT ]; then + BPEROOT=subword-nmt/subword_nmt + if [ ! -e $BPEROOT ]; then + echo 'Cloning Subword NMT repository (for BPE pre-processing)...' 
+ git clone https://github.com/rsennrich/subword-nmt.git + fi +fi + + +TMP_REF=$(mktemp) + +sacrebleu -t $DATASET -l $LANGPAIR --echo ref -q \ +| sacremoses normalize -l $TGTLANG -q \ +| sacremoses tokenize -a -l $TGTLANG -q \ +> $TMP_REF + +sacrebleu -t $DATASET -l $LANGPAIR --echo src -q \ +| sacremoses normalize -l $SRCLANG -q \ +| sacremoses tokenize -a -l $SRCLANG -q \ +| python $BPEROOT/apply_bpe.py -c $BPECODE \ +| fairseq-interactive $DATABIN --path $MODEL \ + -s $SRCLANG -t $TGTLANG \ + --beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \ +| grep ^H- | cut -f 3- \ +| fairseq-score --ref $TMP_REF + +rm -f $TMP_REF diff --git a/fairseq/examples/bart/README.glue.md b/fairseq/examples/bart/README.glue.md new file mode 100644 index 0000000000000000000000000000000000000000..a010934e1e6dec491eb1c704ec02ba7405760510 --- /dev/null +++ b/fairseq/examples/bart/README.glue.md @@ -0,0 +1,99 @@ +# Fine-tuning BART on GLUE tasks + +### 1) Download the data from the GLUE website (https://gluebenchmark.com/tasks) using the following commands: +```bash +wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py +python download_glue_data.py --data_dir glue_data --tasks all +``` + +### 2) Preprocess GLUE task data (same as RoBERTa): +```bash +./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name> +``` +`glue_task_name` is one of the following: +`{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}` +Use `ALL` for preprocessing all the glue tasks. + +### 3) Fine-tuning on GLUE task: +Example fine-tuning cmd for `RTE` task +```bash +TOTAL_NUM_UPDATES=2036 # 10 epochs through RTE for bsz 16 +WARMUP_UPDATES=61 # 6 percent of the number of updates +LR=1e-05 # Peak LR for polynomial LR scheduler. +NUM_CLASSES=2 +MAX_SENTENCES=16 # Batch size. 
+BART_PATH=/path/to/bart/model.pt + +CUDA_VISIBLE_DEVICES=0,1 fairseq-train RTE-bin/ \ + --restore-file $BART_PATH \ + --batch-size $MAX_SENTENCES \ + --max-tokens 4400 \ + --task sentence_prediction \ + --add-prev-output-tokens \ + --layernorm-embedding \ + --share-all-embeddings \ + --share-decoder-input-output-embed \ + --reset-optimizer --reset-dataloader --reset-meters \ + --required-batch-size-multiple 1 \ + --init-token 0 \ + --arch bart_large \ + --criterion sentence_prediction \ + --num-classes $NUM_CLASSES \ + --dropout 0.1 --attention-dropout 0.1 \ + --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-08 \ + --clip-norm 0.0 \ + --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ + --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ + --max-epoch 10 \ + --find-unused-parameters \ + --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; +``` + +For each of the GLUE tasks, you will need to use the following cmd-line arguments: + +Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B +---|---|---|---|---|---|---|---|--- +`--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1 +`--lr` | 5e-6 | 1e-5 | 1e-5 | 1e-5 | 5e-6 | 2e-5 | 2e-5 | 2e-5 +`bsz` | 128 | 32 | 32 | 32 | 128 | 64 | 64 | 32 +`--total-num-update` | 30968 | 33112 | 113272 | 1018 | 5233 | 1148 | 1334 | 1799 +`--warmup-updates` | 1858 | 1986 | 6796 | 61 | 314 | 68 | 80 | 107 + +For `STS-B` additionally add `--regression-target --best-checkpoint-metric loss` and remove `--maximize-best-checkpoint-metric`. + +**Note:** + +a) `--total-num-update` is used by the `polynomial_decay` LR scheduler and is calculated for `--max-epoch=10` and `--batch-size=32/64/128` depending on the task. + +b) Above cmd-args and hyperparams are tested on Nvidia `V100` GPU with `32gb` of memory for each task. 
Depending on the GPU memory resources available to you, you can increase `--update-freq` and reduce `--batch-size`. + +### Inference on GLUE task +After training the model as mentioned in the previous step, you can perform inference with checkpoints in the `checkpoints/` directory using the following Python code snippet: + +```python +from fairseq.models.bart import BARTModel + +bart = BARTModel.from_pretrained( + 'checkpoints/', + checkpoint_file='checkpoint_best.pt', + data_name_or_path='RTE-bin' +) + +label_fn = lambda label: bart.task.label_dictionary.string( + [label + bart.task.label_dictionary.nspecial] +) +ncorrect, nsamples = 0, 0 +bart.cuda() +bart.eval() +with open('glue_data/RTE/dev.tsv') as fin: + fin.readline() + for index, line in enumerate(fin): + tokens = line.strip().split('\t') + sent1, sent2, target = tokens[1], tokens[2], tokens[3] + tokens = bart.encode(sent1, sent2) + prediction = bart.predict('sentence_classification_head', tokens).argmax().item() + prediction_label = label_fn(prediction) + ncorrect += int(prediction_label == target) + nsamples += 1 +print('| Accuracy: ', float(ncorrect)/float(nsamples)) +``` diff --git a/fairseq/examples/bart/README.md b/fairseq/examples/bart/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4050a724ee6a2f20c9998a95df48c58b64764ab1 --- /dev/null +++ b/fairseq/examples/bart/README.md @@ -0,0 +1,228 @@ +# BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension + +[https://arxiv.org/abs/1910.13461](https://arxiv.org/abs/1910.13461) + +## Introduction + +BART is a sequence-to-sequence model trained with denoising as the pretraining objective. We show that this pretraining objective is more generic and show that we can match [RoBERTa](../roberta) results on SQuAD and GLUE and gain state-of-the-art results on summarization (XSum, CNN dataset), long form generative question answering (ELI5) and dialog response generation (ConvAI2). 
See the associated paper for more details. + +## Pre-trained models + +Model | Description | # params | Download +---|---|---|--- +`bart.base` | BART model with 6 encoder and decoder layers | 140M | [bart.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz) +`bart.large` | BART model with 12 encoder and decoder layers | 400M | [bart.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz) +`bart.large.mnli` | `bart.large` finetuned on `MNLI` | 400M | [bart.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.mnli.tar.gz) +`bart.large.cnn` | `bart.large` finetuned on `CNN-DM` | 400M | [bart.large.cnn.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz) +`bart.large.xsum` | `bart.large` finetuned on `Xsum` | 400M | [bart.large.xsum.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.xsum.tar.gz) + +## Results + +**[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)** +_(dev set, single model, single-task finetuning)_ + +Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B +---|---|---|---|---|---|---|---|--- +`roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4 +`bart.large` | 89.9 | 94.9 | 92.5 | 87.0 | 96.6 | 90.4 | 62.8 | 91.2 + +**[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)** +_(dev set, no additional data used)_ + +Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1 +---|---|--- +`roberta.large` | 88.9/94.6 | 86.5/89.4 +`bart.large` | 88.8/94.6 | 86.1/89.2 + +**[CNN/Daily Mail](http://nlpprogress.com/english/summarization.html)** +_(test set, no additional data used)_ + +Model | R1 | R2 | RL +---|---|---|--- +`BERTSUMEXTABS` | 42.13 | 19.60 | 39.18 +`bart.large` | 44.16 | 21.28 | 40.90 + +## Example usage + +##### Load BART from torch.hub (PyTorch >= 1.1): +```python +import torch +bart = torch.hub.load('pytorch/fairseq', 'bart.large') +bart.eval() # disable dropout (or leave in train mode to finetune) 
+``` + +##### Load BART (for PyTorch 1.0 or custom models): +```python +# Download bart.large model +wget https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz +tar -xzvf bart.large.tar.gz + +# Load the model in fairseq +from fairseq.models.bart import BARTModel +bart = BARTModel.from_pretrained('/path/to/bart.large', checkpoint_file='model.pt') +bart.eval() # disable dropout (or leave in train mode to finetune) +``` + +##### Apply Byte-Pair Encoding (BPE) to input text: +```python +tokens = bart.encode('Hello world!') +assert tokens.tolist() == [0, 31414, 232, 328, 2] +bart.decode(tokens) # 'Hello world!' +``` + +##### Extract features from BART: +```python +# Extract the last layer's features +last_layer_features = bart.extract_features(tokens) +assert last_layer_features.size() == torch.Size([1, 5, 1024]) + +# Extract all layer's features from decoder (layer 0 is the embedding layer) +all_layers = bart.extract_features(tokens, return_all_hiddens=True) +assert len(all_layers) == 13 +assert torch.all(all_layers[-1] == last_layer_features) +``` + +##### Use BART for sentence-pair classification tasks: +```python +# Download BART already finetuned for MNLI +bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli') +bart.eval() # disable dropout for evaluation + +# Encode a pair of sentences and make a prediction +tokens = bart.encode('BART is a seq2seq model.', 'BART is not sequence to sequence.') +bart.predict('mnli', tokens).argmax() # 0: contradiction + +# Encode another pair of sentences +tokens = bart.encode('BART is denoising autoencoder.', 'BART is version of autoencoder.') +bart.predict('mnli', tokens).argmax() # 2: entailment +``` + +##### Register a new (randomly initialized) classification head: +```python +bart.register_classification_head('new_task', num_classes=3) +logprobs = bart.predict('new_task', tokens) +``` + +##### Batched prediction: +```python +import torch +from fairseq.data.data_utils import collate_tokens + +bart = 
torch.hub.load('pytorch/fairseq', 'bart.large.mnli') +bart.eval() + +batch_of_pairs = [ + ['BART is a seq2seq model.', 'BART is not sequence to sequence.'], + ['BART is denoising autoencoder.', 'BART is version of autoencoder.'], +] + +batch = collate_tokens( + [bart.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1 +) + +logprobs = bart.predict('mnli', batch) +print(logprobs.argmax(dim=1)) +# tensor([0, 2]) +``` + +##### Using the GPU: +```python +bart.cuda() +bart.predict('new_task', tokens) +``` + +#### Filling masks: + +BART can be used to fill multiple `<mask>` tokens in the input. +```python +bart = torch.hub.load('pytorch/fairseq', 'bart.base') +bart.eval() +bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10) +# [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))]] +``` + +Note that by default we enforce the output length to match the input length. +This can be disabled by setting ``match_source_len=False``: +``` +bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10, match_source_len=False) +# [[('The cat was on the ground.', tensor(-0.6185)), ('The cat was asleep on the couch.', tensor(-0.6276)), ('The cat was on the floor.', tensor(-0.6800))]] +``` + +Example code to fill masks for a batch of sentences using GPU +``` +bart.cuda() +bart.fill_mask(['The cat <mask> on the <mask>.', 'The dog <mask> on the <mask>.'], topk=3, beam=10) +# [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))], [('The dog was on the ground.', tensor(-0.6190)), ('The dog lay on the ground.', tensor(-0.6711)), +('The dog was asleep on the couch', tensor(-0.6796))]] +``` + +#### Evaluating the `bart.large.mnli` model: + +Example python code snippet to evaluate accuracy on the MNLI `dev_matched` set. 
+```python +label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'} +ncorrect, nsamples = 0, 0 +bart.cuda() +bart.eval() +with open('glue_data/MNLI/dev_matched.tsv') as fin: + fin.readline() + for index, line in enumerate(fin): + tokens = line.strip().split('\t') + sent1, sent2, target = tokens[8], tokens[9], tokens[-1] + tokens = bart.encode(sent1, sent2) + prediction = bart.predict('mnli', tokens).argmax().item() + prediction_label = label_map[prediction] + ncorrect += int(prediction_label == target) + nsamples += 1 + print('| Accuracy: ', float(ncorrect)/float(nsamples)) +# Expected output: 0.9010 +``` + +#### Evaluating the `bart.large.cnn` model: +- Follow instructions [here](https://github.com/abisee/cnn-dailymail) to download and process into data-files such that `test.source` and `test.target` has one line for each non-tokenized sample. +- For simpler preprocessing, you can also `wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz`, although there is no guarantee of identical scores +- `huggingface/transformers` has a simpler interface that supports [single-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_eval.py) and [multi-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_distributed_eval.py) beam search. + In `huggingface/transformers`, the BART models' paths are `facebook/bart-large-cnn` and `facebook/bart-large-xsum`. + +In `fairseq`, summaries can be generated using: + +```bash +cp data-bin/cnn_dm/dict.source.txt checkpoints/ +python examples/bart/summarize.py \ + --model-dir pytorch/fairseq \ + --model-file bart.large.cnn \ + --src cnn_dm/test.source \ + --out cnn_dm/test.hypo +``` + +For calculating rouge, install `files2rouge` from [here](https://github.com/pltrdy/files2rouge). + +```bash +export CLASSPATH=/path/to/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar + +# Tokenize hypothesis and target files. 
+cat test.hypo | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.tokenized +cat test.target | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.target +files2rouge test.hypo.tokenized test.hypo.target +# Expected output: (ROUGE-2 Average_F: 0.21238) +``` + + +## Finetuning + +- [Finetuning on GLUE](README.glue.md) +- [Finetuning on CNN-DM](README.summarization.md) + +## Citation + +```bibtex +@article{lewis2019bart, + title = {BART: Denoising Sequence-to-Sequence Pre-training for Natural +Language Generation, Translation, and Comprehension}, + author = {Mike Lewis and Yinhan Liu and Naman Goyal and Marjan Ghazvininejad and + Abdelrahman Mohamed and Omer Levy and Veselin Stoyanov + and Luke Zettlemoyer }, + journal={arXiv preprint arXiv:1910.13461}, + year = {2019}, +} +``` diff --git a/fairseq/examples/bart/README.summarization.md b/fairseq/examples/bart/README.summarization.md new file mode 100644 index 0000000000000000000000000000000000000000..8727584f2b2bdd880c6cd3abbf39b75dfbf4a67c --- /dev/null +++ b/fairseq/examples/bart/README.summarization.md @@ -0,0 +1,102 @@ +# Fine-tuning BART on CNN-Dailymail summarization task + +### 1) Download the CNN and Daily Mail data and preprocess it into data files with non-tokenized cased samples. + +Follow the instructions [here](https://github.com/abisee/cnn-dailymail) to download the original CNN and Daily Mail datasets. To preprocess the data, refer to the pointers in [this issue](https://github.com/pytorch/fairseq/issues/1391) or check out the code [here](https://github.com/artmatsak/cnn-dailymail). + +Follow the instructions [here](https://github.com/EdinburghNLP/XSum) to download the original Extreme Summarization datasets, or check out the code [here](https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset). Please keep the raw dataset and make sure that neither tokenization nor BPE has been applied to it. 
+ +### 2) BPE preprocess: + +```bash +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' +wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' + +TASK=cnn_dm +for SPLIT in train val +do + for LANG in source target + do + python -m examples.roberta.multiprocessing_bpe_encoder \ + --encoder-json encoder.json \ + --vocab-bpe vocab.bpe \ + --inputs "$TASK/$SPLIT.$LANG" \ + --outputs "$TASK/$SPLIT.bpe.$LANG" \ + --workers 60 \ + --keep-empty; + done +done +``` + +### 3) Binarize dataset: +```bash +fairseq-preprocess \ + --source-lang "source" \ + --target-lang "target" \ + --trainpref "${TASK}/train.bpe" \ + --validpref "${TASK}/val.bpe" \ + --destdir "${TASK}-bin/" \ + --workers 60 \ + --srcdict dict.txt \ + --tgtdict dict.txt; +``` + +### 4) Fine-tuning on CNN-DM summarization task: +Example fine-tuning CNN-DM +```bash +TOTAL_NUM_UPDATES=20000 +WARMUP_UPDATES=500 +LR=3e-05 +MAX_TOKENS=2048 +UPDATE_FREQ=4 +BART_PATH=/path/to/bart/model.pt + +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train cnn_dm-bin \ + --restore-file $BART_PATH \ + --max-tokens $MAX_TOKENS \ + --task translation \ + --source-lang source --target-lang target \ + --truncate-source \ + --layernorm-embedding \ + --share-all-embeddings \ + --share-decoder-input-output-embed \ + --reset-optimizer --reset-dataloader --reset-meters \ + --required-batch-size-multiple 1 \ + --arch bart_large \ + --criterion label_smoothed_cross_entropy \ + --label-smoothing 0.1 \ + --dropout 0.1 --attention-dropout 0.1 \ + --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \ + --clip-norm 0.1 \ + --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ + --fp16 --update-freq $UPDATE_FREQ \ + --skip-invalid-size-inputs-valid-test \ + --find-unused-parameters; +``` +Above is expected to run on `1` node with `8 32gb-V100`. 
+Expected training time is about `5 hours`. Training time can be reduced with distributed training on `4` nodes and `--update-freq 1`. + +Use TOTAL_NUM_UPDATES=15000 UPDATE_FREQ=2 for the Xsum task. + +### Inference for CNN-DM test data using above trained checkpoint. +After training the model as mentioned in the previous step, you can perform inference with checkpoints in the `checkpoints/` directory using `examples/bart/summarize.py`, for example: + +```bash +cp data-bin/cnn_dm/dict.source.txt checkpoints/ +python examples/bart/summarize.py \ + --model-dir checkpoints \ + --model-file checkpoint_best.pt \ + --src cnn_dm/test.source \ + --out cnn_dm/test.hypo +``` +For XSUM, which uses beam=6, lenpen=1.0, max_len_b=60, min_len=10: +```bash +cp data-bin/cnn_dm/dict.source.txt checkpoints/ +python examples/bart/summarize.py \ + --model-dir checkpoints \ + --model-file checkpoint_best.pt \ + --src cnn_dm/test.source \ + --out cnn_dm/test.hypo \ + --xsum-kwargs +``` diff --git a/fairseq/examples/bart/summarize.py b/fairseq/examples/bart/summarize.py new file mode 100644 index 0000000000000000000000000000000000000000..04435f80e39c2d9d894696dae7cba5b381e13da9 --- /dev/null +++ b/fairseq/examples/bart/summarize.py @@ -0,0 +1,100 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +from fairseq.models.bart import BARTModel +import argparse + +XSUM_KWARGS = dict(beam=6, lenpen=1.0, max_len_b=60, min_len=10, no_repeat_ngram_size=3) +CNN_KWARGS = dict(beam=4, lenpen=2.0, max_len_b=140, min_len=55, no_repeat_ngram_size=3) + + +@torch.no_grad() +def generate(bart, infile, outfile="bart_hypo.txt", bsz=32, n_obs=None, **eval_kwargs): + count = 1 + + # if n_obs is not None: bsz = min(bsz, n_obs) + + with open(infile) as source, open(outfile, "w") as fout: + sline = source.readline().strip() + slines = [sline] + for sline in source: + if n_obs is not None and count > n_obs: + break + if count % bsz == 0: + hypotheses_batch = bart.sample(slines, **eval_kwargs) + for hypothesis in hypotheses_batch: + fout.write(hypothesis + "\n") + fout.flush() + slines = [] + + slines.append(sline.strip()) + count += 1 + + if slines != []: + hypotheses_batch = bart.sample(slines, **eval_kwargs) + for hypothesis in hypotheses_batch: + fout.write(hypothesis + "\n") + fout.flush() + + +def main(): + """ + Usage:: + + python examples/bart/summarize.py \ + --model-dir $HOME/bart.large.cnn \ + --model-file model.pt \ + --src $HOME/data-bin/cnn_dm/test.source + """ + parser = argparse.ArgumentParser() + parser.add_argument( + "--model-dir", + required=True, + type=str, + default="bart.large.cnn/", + help="path containing model file and src_dict.txt", + ) + parser.add_argument( + "--model-file", + default="checkpoint_best.pt", + help="where in model_dir are weights saved", + ) + parser.add_argument( + "--src", default="test.source", help="text to summarize", type=str + ) + parser.add_argument( + "--out", default="test.hypo", help="where to save summaries", type=str + ) + parser.add_argument("--bsz", default=32, help="where to save summaries", type=int) + parser.add_argument( + "--n", default=None, help="how many examples to summarize", type=int + ) + parser.add_argument( + "--xsum-kwargs", + action="store_true", + default=False, + help="if true use XSUM_KWARGS 
else CNN_KWARGS", + ) + args = parser.parse_args() + eval_kwargs = XSUM_KWARGS if args.xsum_kwargs else CNN_KWARGS + if args.model_dir == "pytorch/fairseq": + bart = torch.hub.load("pytorch/fairseq", args.model_file) + else: + bart = BARTModel.from_pretrained( + args.model_dir, + checkpoint_file=args.model_file, + data_name_or_path=args.model_dir, + ) + bart = bart.eval() + if torch.cuda.is_available(): + bart = bart.cuda().half() + generate( + bart, args.src, bsz=args.bsz, n_obs=args.n, outfile=args.out, **eval_kwargs + ) + + +if __name__ == "__main__": + main() diff --git a/fairseq/examples/byte_level_bpe/README.md b/fairseq/examples/byte_level_bpe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..657092660eae42d20f67647417623b8b8cb7b66c --- /dev/null +++ b/fairseq/examples/byte_level_bpe/README.md @@ -0,0 +1,88 @@ +# Neural Machine Translation with Byte-Level Subwords + +https://arxiv.org/abs/1909.03341 + +We provide an implementation of byte-level byte-pair encoding (BBPE), taking IWSLT 2017 Fr-En translation as +example. 
+ +## Data +Get data and generate fairseq binary dataset: +```bash +bash ./get_data.sh +``` + +## Model Training +Train Transformer model with Bi-GRU embedding contextualization (implemented in `gru_transformer.py`): +```bash +# VOCAB=bytes +# VOCAB=chars +VOCAB=bbpe2048 +# VOCAB=bpe2048 +# VOCAB=bbpe4096 +# VOCAB=bpe4096 +# VOCAB=bpe16384 +``` +```bash +fairseq-train "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \ + --arch gru_transformer --encoder-layers 2 --decoder-layers 2 --dropout 0.3 --share-all-embeddings \ + --optimizer adam --adam-betas '(0.9, 0.98)' \ + --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ + --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ + --log-format 'simple' --log-interval 100 --save-dir "checkpoints/${VOCAB}" \ + --batch-size 100 --max-update 100000 --update-freq 2 +``` + +## Generation +`fairseq-generate` requires bytes (BBPE) decoder to convert byte-level representation back to characters: +```bash +# BPE=--bpe bytes +# BPE=--bpe characters +BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe2048.model +# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe2048.model +# BPE=--bpe byte_bpe --sentencepiece-model-path data/spm_bbpe4096.model +# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe4096.model +# BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe16384.model +``` + +```bash +fairseq-generate "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \ + --source-lang fr --gen-subset test --sacrebleu --path "checkpoints/${VOCAB}/checkpoint_last.pt" \ + --tokenizer moses --moses-target-lang en ${BPE} +``` +When using `fairseq-interactive`, bytes (BBPE) encoder/decoder is required to tokenize input data and detokenize model predictions: +```bash +fairseq-interactive "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \ + --path 
"checkpoints/${VOCAB}/checkpoint_last.pt" --input data/test.fr --tokenizer moses --moses-source-lang fr \ + --moses-target-lang en ${BPE} --buffer-size 1000 --max-tokens 10000 +``` + +## Results +| Vocabulary | Model | BLEU | +|:-------------:|:-------------:|:-------------:| +| Joint BPE 16k ([Kudo, 2018](https://arxiv.org/abs/1804.10959)) | 512d LSTM 2+2 | 33.81 | +| Joint BPE 16k | Transformer base 2+2 (w/ GRU) | 36.64 (36.72) | +| Joint BPE 4k | Transformer base 2+2 (w/ GRU) | 35.49 (36.10) | +| Joint BBPE 4k | Transformer base 2+2 (w/ GRU) | 35.61 (35.82) | +| Joint BPE 2k | Transformer base 2+2 (w/ GRU) | 34.87 (36.13) | +| Joint BBPE 2k | Transformer base 2+2 (w/ GRU) | 34.98 (35.43) | +| Characters | Transformer base 2+2 (w/ GRU) | 31.78 (33.30) | +| Bytes | Transformer base 2+2 (w/ GRU) | 31.57 (33.62) | + + +## Citation +``` +@misc{wang2019neural, + title={Neural Machine Translation with Byte-Level Subwords}, + author={Changhan Wang and Kyunghyun Cho and Jiatao Gu}, + year={2019}, + eprint={1909.03341}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + + +## Contact +Changhan Wang ([changhan@fb.com](mailto:changhan@fb.com)), +Kyunghyun Cho ([kyunghyuncho@fb.com](mailto:kyunghyuncho@fb.com)), +Jiatao Gu ([jgu@fb.com](mailto:jgu@fb.com)) diff --git a/fairseq/examples/byte_level_bpe/get_bitext.py b/fairseq/examples/byte_level_bpe/get_bitext.py new file mode 100644 index 0000000000000000000000000000000000000000..6ac1eeec1e6167ec6bafd76b37173ee6987cae7e --- /dev/null +++ b/fairseq/examples/byte_level_bpe/get_bitext.py @@ -0,0 +1,254 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import argparse +import os +import os.path as op +from collections import namedtuple +from multiprocessing import cpu_count +from typing import List, Optional + +import sentencepiece as sp +from fairseq.data.encoders.byte_bpe import ByteBPE +from fairseq.data.encoders.byte_utils import byte_encode +from fairseq.data.encoders.bytes import Bytes +from fairseq.data.encoders.characters import Characters +from fairseq.data.encoders.moses_tokenizer import MosesTokenizer +from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE + + +SPLITS = ["train", "valid", "test"] + + +def _convert_xml(in_path: str, out_path: str): + with open(in_path) as f, open(out_path, "w") as f_o: + for s in f: + ss = s.strip() + if not ss.startswith("<seg"): + continue + ss = ss.replace("</seg>", "").split('">') + assert len(ss) == 2 + f_o.write(ss[1].strip() + "\n") + + +def _convert_train(in_path: str, out_path: str): + with open(in_path) as f, open(out_path, "w") as f_o: + for s in f: + ss = s.strip() + if ss.startswith("<"): + continue + f_o.write(ss.strip() + "\n") + + +def _get_bytes(in_path: str, out_path: str): + with open(in_path) as f, open(out_path, "w") as f_o: + for s in f: + f_o.write(Bytes.encode(s.strip()) + "\n") + + +def _get_chars(in_path: str, out_path: str): + with open(in_path) as f, open(out_path, "w") as f_o: + for s in f: + f_o.write(Characters.encode(s.strip()) + "\n") + + +def pretokenize(in_path: str, out_path: str, src: str, tgt: str): + Args = namedtuple( + "Args", + [ + "moses_source_lang", + "moses_target_lang", + "moses_no_dash_splits", + "moses_no_escape", + ], + ) + args = Args( + moses_source_lang=src, + moses_target_lang=tgt, + moses_no_dash_splits=False, + moses_no_escape=False, + ) + pretokenizer = MosesTokenizer(args) + with open(in_path) as f, open(out_path, "w") as f_o: + for s in f: + f_o.write(pretokenizer.encode(s.strip()) + "\n") + + +def _convert_to_bchar(in_path_prefix: str, src: str, tgt: str, out_path: str): + with open(out_path, "w") as f_o: + for lang in [src, tgt]: +
with open(f"{in_path_prefix}.{lang}") as f: + for s in f: + f_o.write(byte_encode(s.strip()) + "\n") + + +def _get_bpe(in_path: str, model_prefix: str, vocab_size: int): + arguments = [ + f"--input={in_path}", + f"--model_prefix={model_prefix}", + f"--model_type=bpe", + f"--vocab_size={vocab_size}", + "--character_coverage=1.0", + "--normalization_rule_name=identity", + f"--num_threads={cpu_count()}", + ] + sp.SentencePieceTrainer.Train(" ".join(arguments)) + + +def _apply_bbpe(model_path: str, in_path: str, out_path: str): + Args = namedtuple("Args", ["sentencepiece_model_path"]) + args = Args(sentencepiece_model_path=model_path) + tokenizer = ByteBPE(args) + with open(in_path) as f, open(out_path, "w") as f_o: + for s in f: + f_o.write(tokenizer.encode(s.strip()) + "\n") + + +def _apply_bpe(model_path: str, in_path: str, out_path: str): + Args = namedtuple("Args", ["sentencepiece_model"]) + args = Args(sentencepiece_model=model_path) + tokenizer = SentencepieceBPE(args) + with open(in_path) as f, open(out_path, "w") as f_o: + for s in f: + f_o.write(tokenizer.encode(s.strip()) + "\n") + + +def _concat_files(in_paths: List[str], out_path: str): + with open(out_path, "w") as f_o: + for p in in_paths: + with open(p) as f: + for r in f: + f_o.write(r) + + +def preprocess_iwslt17( + root: str, + src: str, + tgt: str, + bpe_size: Optional[int], + need_chars: bool, + bbpe_size: Optional[int], + need_bytes: bool, +): + # extract bitext + in_root = op.join(root, f"{src}-{tgt}") + for lang in [src, tgt]: + _convert_train( + op.join(in_root, f"train.tags.{src}-{tgt}.{lang}"), + op.join(root, f"train.{lang}"), + ) + _convert_xml( + op.join(in_root, f"IWSLT17.TED.dev2010.{src}-{tgt}.{lang}.xml"), + op.join(root, f"valid.{lang}"), + ) + _convert_xml( + op.join(in_root, f"IWSLT17.TED.tst2015.{src}-{tgt}.{lang}.xml"), + op.join(root, f"test.{lang}"), + ) + # pre-tokenize + for lang in [src, tgt]: + for split in SPLITS: + pretokenize( + op.join(root, f"{split}.{lang}"), + 
op.join(root, f"{split}.moses.{lang}"), + src, + tgt, + ) + # tokenize with BPE vocabulary + if bpe_size is not None: + # learn vocabulary + concated_train_path = op.join(root, "train.all") + _concat_files( + [op.join(root, f"train.moses.{src}"), op.join(root, f"train.moses.{tgt}")], + concated_train_path, + ) + bpe_model_prefix = op.join(root, f"spm_bpe{bpe_size}") + _get_bpe(concated_train_path, bpe_model_prefix, bpe_size) + os.remove(concated_train_path) + # apply + for lang in [src, tgt]: + for split in SPLITS: + _apply_bpe( + bpe_model_prefix + ".model", + op.join(root, f"{split}.moses.{lang}"), + op.join(root, f"{split}.moses.bpe{bpe_size}.{lang}"), + ) + # tokenize with bytes vocabulary + if need_bytes: + for lang in [src, tgt]: + for split in SPLITS: + _get_bytes( + op.join(root, f"{split}.moses.{lang}"), + op.join(root, f"{split}.moses.bytes.{lang}"), + ) + # tokenize with characters vocabulary + if need_chars: + for lang in [src, tgt]: + for split in SPLITS: + _get_chars( + op.join(root, f"{split}.moses.{lang}"), + op.join(root, f"{split}.moses.chars.{lang}"), + ) + # tokenize with byte-level BPE vocabulary + if bbpe_size is not None: + # learn vocabulary + bchar_path = op.join(root, "train.bchar") + _convert_to_bchar(op.join(root, "train.moses"), src, tgt, bchar_path) + bbpe_model_prefix = op.join(root, f"spm_bbpe{bbpe_size}") + _get_bpe(bchar_path, bbpe_model_prefix, bbpe_size) + os.remove(bchar_path) + # apply + for lang in [src, tgt]: + for split in SPLITS: + _apply_bbpe( + bbpe_model_prefix + ".model", + op.join(root, f"{split}.moses.{lang}"), + op.join(root, f"{split}.moses.bbpe{bbpe_size}.{lang}"), + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--root", type=str, default="data") + parser.add_argument( + "--bpe-vocab", + default=None, + type=int, + help="Generate tokenized bitext with BPE of size K."
+ "Default to None (disabled).", + ) + parser.add_argument( + "--bbpe-vocab", + default=None, + type=int, + help="Generate tokenized bitext with BBPE of size K." + "Default to None (disabled).", + ) + parser.add_argument( + "--byte-vocab", + action="store_true", + help="Generate tokenized bitext with bytes vocabulary", + ) + parser.add_argument( + "--char-vocab", + action="store_true", + help="Generate tokenized bitext with chars vocabulary", + ) + args = parser.parse_args() + + preprocess_iwslt17( + args.root, + "fr", + "en", + args.bpe_vocab, + args.char_vocab, + args.bbpe_vocab, + args.byte_vocab, + ) + + +if __name__ == "__main__": + main() diff --git a/fairseq/examples/byte_level_bpe/get_data.sh b/fairseq/examples/byte_level_bpe/get_data.sh new file mode 100644 index 0000000000000000000000000000000000000000..c3d55d4925a6e6e23d12d293f093c1ae14acf76e --- /dev/null +++ b/fairseq/examples/byte_level_bpe/get_data.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +PY_BIN_ROOT= + +# PyPI dependency +${PY_BIN_ROOT}pip install sentencepiece sacremoses + +# Get data +if [ ! -d "data" ]; then + mkdir data +fi + +if [ ! 
-f "data/fr-en.tgz" ]; then + wget https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz -P data + tar xvf data/fr-en.tgz -C data +fi +${PY_BIN_ROOT}python get_bitext.py --bpe-vocab 16384 --byte-vocab --char-vocab +for VOCAB_SIZE in 2048 4096; do + ${PY_BIN_ROOT}python get_bitext.py --bpe-vocab ${VOCAB_SIZE} --bbpe-vocab ${VOCAB_SIZE} +done +rm -r data/fr-en data/fr-en.tgz + +# Generate binary dataset +${PY_BIN_ROOT}fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bpe16384 --joined-dictionary \ + --workers "$(nproc)" --trainpref data/train.moses.bpe16384 --validpref data/valid.moses.bpe16384 \ + --testpref data/test.moses.bpe16384 + +${PY_BIN_ROOT}fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bytes --joined-dictionary \ + --workers "$(nproc)" --trainpref data/train.moses.bytes --validpref data/valid.moses.bytes \ + --testpref data/test.moses.bytes + +${PY_BIN_ROOT}fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_chars --joined-dictionary \ + --workers "$(nproc)" --trainpref data/train.moses.chars --validpref data/valid.moses.chars \ + --testpref data/test.moses.chars + +for VOCAB_SIZE in 2048 4096; do + for TYPE in bbpe bpe; do + ${PY_BIN_ROOT}fairseq-preprocess --source-lang fr --target-lang en --destdir "data/bin_${TYPE}${VOCAB_SIZE}" \ + --joined-dictionary --workers "$(nproc)" --trainpref "data/train.moses.${TYPE}${VOCAB_SIZE}" \ + --validpref "data/valid.moses.${TYPE}${VOCAB_SIZE}" --testpref "data/test.moses.${TYPE}${VOCAB_SIZE}" + done +done diff --git a/fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml b/fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..881a1583f871d272b43c6a2d13ef6768f0923bb2 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1.yaml @@ -0,0 +1,35 @@ +# @package _global_ + +hydra: + job: +
config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml b/fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml new file mode 100644 index 0000000000000000000000000000000000000000..de7894d9cf0ac85b8d5a4282c1c9612f14f2d2b3 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml @@ -0,0 +1,35 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 1 + tasks_per_node: 1 + mem_gb: 100 + 
nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml b/fairseq/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b016cac9b5e4a5f978d83124697a9c01ec885164 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/classification/run_config/slurm_2.yaml @@ -0,0 +1,35 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/audioset.yaml b/fairseq/examples/data2vec/config/audio/pretraining/audioset.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dd30fbedd5818debf49ef1c87c84af4b6c58fb2d --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/audioset.yaml @@ -0,0 +1,91 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + user_dir: /private/home/abaevski/fairseq-py/examples/data2vec + 
+checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: /private/home/abaevski/data/audioset + max_sample_size: 320000 + min_sample_size: 32000 + normalize: true + +dataset: + num_workers: 6 + max_tokens: 3400000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 24 + ddp_backend: legacy_ddp + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var +# - avg_self_attn +# - weights + +optimization: + max_update: 200000 + lr: [0.0005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: cosine + warmup_updates: 10000 + +model: + _name: data2vec_audio + extractor_mode: layer_norm + encoder_layerdrop: 0.05 + dropout_input: 0.0 + dropout_features: 0.0 + feature_grad_mult: 1.0 + encoder_embed_dim: 768 + + mask_prob: 0.65 + mask_length: 10 + + loss_beta: 0 + loss_scale: null + + instance_norm_target_layer: true + layer_norm_targets: true + average_top_k_layers: 12 + + self_attn_norm_type: deepnorm + final_norm_type: deepnorm + + pos_conv_depth: 5 + conv_pos: 95 + + ema_decay: 0.999 + ema_end_decay: 0.9999 + ema_anneal_end_step: 30000 + ema_transformer_only: true + ema_layers_only: false + + require_same_masks: true + mask_dropout: 0 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/base_librispeech.yaml b/fairseq/examples/data2vec/config/audio/pretraining/base_librispeech.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c332c5a3f8e5e48150a3c0c3649d81234521bc16 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/base_librispeech.yaml @@ -0,0 +1,83 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + +checkpoint: + save_interval: 5 + 
save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: ??? + max_sample_size: 320000 + min_sample_size: 32000 + normalize: true + +dataset: + num_workers: 6 + max_tokens: 3800000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: legacy_ddp + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + +optimization: + max_update: 400000 + lr: [0.0005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: tri_stage + phase_ratio: [0.03,0.9,0.07] + +model: + _name: data2vec_audio + extractor_mode: layer_norm + encoder_layerdrop: 0.05 + dropout_input: 0.0 + dropout_features: 0.0 + feature_grad_mult: 1.0 + encoder_embed_dim: 768 + + mask_prob: 0.65 + mask_length: 10 + + loss_beta: 0 + loss_scale: null + + instance_norm_target_layer: true + average_top_k_layers: 8 + + pos_conv_depth: 5 + conv_pos: 95 + + ema_decay: 0.999 + ema_end_decay: 0.9999 + ema_anneal_end_step: 30000 + ema_transformer_only: true + ema_layers_only: true + + require_same_masks: true + mask_dropout: 0 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/run_config/local.yaml b/fairseq/examples/data2vec/config/audio/pretraining/run_config/local.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45595f9eea7c369a1200d802bfa1883c1bdfe573 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml 
b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..732f0188993a14c847ab5d5e8addb24b0b536311 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2bab5675a561a6a01b5db06530b119a282bbf4a --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - 
checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec53dc2a984babdf7e61930350868e38f0d769c3 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml 
b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..70cc8cbb5b8078fb4dbebd66964892efca6607ac --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - task.post_save_script + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..14b47d14e6282d5a846cc7c41c6b2080c5ed8c62 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + 
- checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 3 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c54d735fb2dc0b0af2d467caa7f64405af18ea10 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml 
b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0231b2690d63f758d13eff9c3f5c5826cefba050 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - task.post_save_script + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9a4e43a987e8810b42a4601af5c51072c19fe6b7 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - 
checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 6 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..78c9f57aeb2111f6a3e7d99005f25fbd359c7a72 --- /dev/null +++ b/fairseq/examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 8 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/text/pretraining/base.yaml b/fairseq/examples/data2vec/config/text/pretraining/base.yaml 
new file mode 100644 index 0000000000000000000000000000000000000000..c6b07c4052aaf64d5b73bd983e395fa983b54cd5 --- /dev/null +++ b/fairseq/examples/data2vec/config/text/pretraining/base.yaml @@ -0,0 +1,77 @@ +# @package _group_ +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + +checkpoint: + no_epoch_checkpoints: true + save_interval_updates: 50000 + keep_interval_updates: 1 + +distributed_training: + distributed_world_size: 16 + ddp_backend: legacy_ddp + +task: + _name: masked_lm + data: ??? + sample_break_mode: complete_doc + tokens_per_sample: 512 + include_target_tokens: true + random_token_prob: 0 + leave_unmasked_prob: 0 + mask_prob: 0.35 + mask_multiple_length: 4 + +criterion: model + +dataset: + max_tokens: 8192 + ignore_unused_valid_subsets: true + skip_invalid_size_inputs_valid_test: true + +optimizer: + _name: adam + weight_decay: 0.01 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: cosine + warmup_updates: 10000 + +optimization: + clip_norm: 5 + lr: [0.0002] + max_update: 1000000 + update_freq: [1] + +model: + _name: data2vec_text + head_layers: 2 + average_top_k_layers: 10 + layer_norm_target_layer: true + loss_scale: 1 + ema_decay: 0.999 + ema_end_decay: 0.9999 + ema_anneal_end_step: 300000 + loss_beta: 4 + ema_transformer_layers_only: true + + transformer: + dropout: 0.1 + attention_dropout: 0.1 + layernorm_embedding: true + activation_fn: gelu + no_scale_embedding: true + max_source_positions: 512 + encoder: + embed_dim: 768 + ffn_embed_dim: 3072 + layers: 12 + attention_heads: 12 + normalize_before: false + learned_pos: true + layerdrop: 0 diff --git a/fairseq/examples/data2vec/config/text/pretraining/run_config/local.yaml b/fairseq/examples/data2vec/config/text/pretraining/run_config/local.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45595f9eea7c369a1200d802bfa1883c1bdfe573 --- /dev/null +++ 
b/fairseq/examples/data2vec/config/text/pretraining/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4bac45a58de960ea347167f6da6d8d73910da4e1 --- /dev/null +++ b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 + exclude: a100-st-p4d24xlarge-471 diff --git a/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..006a0f2116db5a85da558cada65873f1150eb717 --- /dev/null +++ 
b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4292198b4ef9608b6550652fd72d55a9acf85cc0 --- /dev/null +++ b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: 
/fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 + exclude: a100-st-p4d24xlarge-471 diff --git a/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0e1555d20f9380b9d82d5852be6799d35fa3d078 --- /dev/null +++ b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 3 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..c54d735fb2dc0b0af2d467caa7f64405af18ea10 --- /dev/null +++ b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5df84cd6da99d06fc32f3596cc1d0f8d5e62ee1f --- /dev/null +++ b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml @@ -0,0 +1,41 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + 
sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 + exclude: a100-st-p4d24xlarge-471 + +distributed_training: + distributed_world_size: 32 + ddp_backend: legacy_ddp diff --git a/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5b32c23a665f9374d1c919c52dcbf19665c47f4f --- /dev/null +++ b/fairseq/examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml @@ -0,0 +1,41 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: '_' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir}/submitit + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 8 + name: pt + partition: wav2vec + max_num_timeout: 30 + exclude: a100-st-p4d24xlarge-471 + +distributed_training: + distributed_world_size: 64 + ddp_backend: legacy_ddp diff --git a/fairseq/examples/data2vec/config/v2/base_audio_only_task.yaml 
b/fairseq/examples/data2vec/config/v2/base_audio_only_task.yaml new file mode 100644 index 0000000000000000000000000000000000000000..65a9ab3e738a8361b229b6c42fc11c7c0480f264 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/base_audio_only_task.yaml @@ -0,0 +1,113 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: false + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: /private/home/abaevski/data/librispeech/full + max_sample_size: 320000 + min_sample_size: 32000 + normalize: true + precompute_mask_config: {} + +dataset: + num_workers: 6 + max_tokens: 1000000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 8 + ddp_backend: legacy_ddp + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 400000 + lr: [0.00075] + debug_param_names: true + +optimizer: + _name: adam + adam_betas: [ 0.9,0.98 ] + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: cosine + warmup_updates: 8000 + +model: + _name: data2vec_multi + + loss_beta: 0 + loss_scale: null + + depth: 12 + embed_dim: 768 + clone_batch: 8 + + ema_decay: 0.999 + ema_end_decay: 0.99999 + ema_anneal_end_step: 75000 + ema_encoder_only: false + + average_top_k_layers: 8 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: false + + layerdrop: 0.05 + norm_eps: 1e-5 + + supported_modality: AUDIO + + modalities: + audio: + feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]' + conv_pos_depth: 5 + conv_pos_width: 95 + conv_pos_groups: 16 + 
prenet_depth: 0 + mask_prob: 0.5 + mask_prob_adjust: 0.05 + inverse_mask: false + mask_length: 5 + mask_noise_std: 0.01 + mask_dropout: 0 + add_masks: false + ema_local_encoder: false + use_alibi_encoder: true + prenet_layerdrop: 0.05 + prenet_dropout: 0.1 + learned_alibi_scale: true + learned_alibi_scale_per_head: true + decoder: + input_dropout: 0.1 + decoder_dim: 384 + decoder_groups: 16 + decoder_kernel: 7 + decoder_layers: 4 diff --git a/fairseq/examples/data2vec/config/v2/base_images_only_task.yaml b/fairseq/examples/data2vec/config/v2/base_images_only_task.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ff0c247b1387f10945145788ed465177ac5d554e --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/base_images_only_task.yaml @@ -0,0 +1,116 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: mae_image_pretraining + data: /datasets01/imagenet_full_size/061417/ + rebuild_batches: true + local_cache_path: /scratch/cache_abaevski/imagenet + key: source + precompute_mask_config: {} + +dataset: + num_workers: 10 + batch_size: 16 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 375300 + lr: [ 0.001 ] + debug_param_names: true + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 1e-3 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + 
+lr_scheduler: pass_through + +model: + _name: data2vec_multi + + ema_decay: 0.9998 + ema_end_decay: 0.99999 + ema_anneal_end_step: 100000 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: true + end_of_block_targets: false + + depth: 10 + average_top_k_layers: 10 + clone_batch: 16 + + norm_eps: 1e-6 + + min_target_var: 0 + min_pred_var: 0 + + encoder_dropout: 0 + post_mlp_drop: 0 + attention_dropout: 0 + activation_dropout: 0 + + supported_modality: IMAGE + cls_loss: 0.01 + + ema_encoder_only: false + + modalities: + image: + inverse_mask: true + mask_prob: 0.8 + mask_prob_adjust: 0.07 + mask_length: 3 + mask_noise_std: 0.01 + prenet_depth: 2 + ema_local_encoder: true + num_extra_tokens: 1 + init_extra_token_zero: false + use_alibi_encoder: false + decoder: + decoder_dim: 768 + decoder_groups: 16 + decoder_kernel: 3 + decoder_layers: 6 + input_dropout: 0 \ No newline at end of file diff --git a/fairseq/examples/data2vec/config/v2/base_text_only_task.yaml b/fairseq/examples/data2vec/config/v2/base_text_only_task.yaml new file mode 100644 index 0000000000000000000000000000000000000000..62f22eb0fe50c1b8d8febfb01529b32c14dece93 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/base_text_only_task.yaml @@ -0,0 +1,112 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + no_epoch_checkpoints: true + save_interval_updates: 50000 + keep_interval_updates: 1 + +distributed_training: + distributed_world_size: 16 + ddp_backend: legacy_ddp + +task: + _name: masked_lm + data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin + sample_break_mode: none + tokens_per_sample: 512 + include_target_tokens: true + random_token_prob: 0 + leave_unmasked_prob: 0 + include_index: True + skip_masking: True + d2v2_multi: True + +criterion: + _name: model + log_keys: + - ema_decay + - target_var 
+ - pred_var + - model_norm + - ema_norm + - masked_pct + +dataset: + batch_size: 4 + ignore_unused_valid_subsets: true + skip_invalid_size_inputs_valid_test: true + disable_validation: true + +optimization: + clip_norm: 1 + lr: [0.0002] + max_update: 1000000 + update_freq: [1] + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.0002 + optimizer: + _name: adam + adam_betas: [0.9,0.98] + adam_eps: 1e-06 + weight_decay: 0.01 + lr_scheduler: + _name: cosine + warmup_updates: 4000 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + loss_beta: 0 + loss_scale: 1 + + depth: 12 + embed_dim: 768 + clone_batch: 8 + + ema_decay: 0.9999 + ema_end_decay: 0.99999 + ema_anneal_end_step: 100000 + ema_encoder_only: true + + average_top_k_layers: 12 + layer_norm_target_layer: false + instance_norm_target_layer: true + batch_norm_target_layer: false + instance_norm_targets: false + layer_norm_targets: false + + layerdrop: 0 + norm_eps: 1e-5 + + supported_modality: TEXT + + modalities: + text: + mask_prob: 0.48 + mask_length: 1 + mask_noise_std: 0.01 + prenet_depth: 0 + decoder: + input_dropout: 0.1 + decoder_dim: 768 + decoder_groups: 1 + decoder_kernel: 9 + decoder_layers: 5 + decoder_residual: false + projection_layers: 2 + projection_ratio: 2.0 diff --git a/fairseq/examples/data2vec/config/v2/huge_images14_only_task.yaml b/fairseq/examples/data2vec/config/v2/huge_images14_only_task.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a8a15253f2df07198bf541a76259f9784d43f276 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/huge_images14_only_task.yaml @@ -0,0 +1,122 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + 
_name: mae_image_pretraining + data: /datasets01/imagenet_full_size/061417/ + rebuild_batches: true + local_cache_path: /scratch/cache_abaevski/imagenet + key: source + precompute_mask_config: {} + +dataset: + num_workers: 10 + batch_size: 8 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 32 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 500000 + lr: [ 0.0004 ] + debug_param_names: true + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 4e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + ema_decay: 0.9998 + ema_end_decay: 1 + ema_anneal_end_step: 300000 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: true + end_of_block_targets: false + + depth: 32 + embed_dim: 1280 + num_heads: 16 + + average_top_k_layers: 24 + clone_batch: 16 + + norm_eps: 1e-6 + + min_target_var: 0 + min_pred_var: 0 + + encoder_dropout: 0 + post_mlp_drop: 0 + attention_dropout: 0 + activation_dropout: 0 + + supported_modality: IMAGE + cls_loss: 0.01 + + ema_encoder_only: false + + modalities: + image: + patch_size: 14 + inverse_mask: true + mask_prob: 0.75 + mask_prob_adjust: 0.1 + mask_length: 3 + mask_noise_std: 0.01 + prenet_depth: 0 + ema_local_encoder: true + num_extra_tokens: 1 + init_extra_token_zero: false + use_alibi_encoder: false + embed_dim: 1280 + decoder: + decoder_dim: 1024 + decoder_groups: 16 + decoder_kernel: 5 + decoder_layers: 3 + final_layer_norm: false + input_dropout: 0 \ No newline at end of file diff --git a/fairseq/examples/data2vec/config/v2/huge_images_only_task.yaml 
b/fairseq/examples/data2vec/config/v2/huge_images_only_task.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a352ac3c741a70d958074cabd5d57021d7e9753 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/huge_images_only_task.yaml @@ -0,0 +1,120 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: mae_image_pretraining + data: /datasets01/imagenet_full_size/061417/ + rebuild_batches: true + local_cache_path: /scratch/cache_abaevski/imagenet + key: source + precompute_mask_config: {} + +dataset: + num_workers: 10 + batch_size: 8 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 375300 + lr: [ 0.0004 ] + debug_param_names: true + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 4e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + ema_decay: 0.9998 + ema_end_decay: 0.99995 + ema_anneal_end_step: 150000 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: true + end_of_block_targets: false + + depth: 32 + embed_dim: 1280 + num_heads: 16 + + average_top_k_layers: 24 + clone_batch: 16 + + norm_eps: 1e-6 + + min_target_var: 0 + min_pred_var: 0 + + encoder_dropout: 0 + post_mlp_drop: 0 + attention_dropout: 0 + activation_dropout: 0 + + 
supported_modality: IMAGE + cls_loss: 0.01 + + ema_encoder_only: false + + modalities: + image: + inverse_mask: true + mask_prob: 0.75 + mask_prob_adjust: 0.1 + mask_length: 3 + mask_noise_std: 0.01 + prenet_depth: 0 + ema_local_encoder: true + num_extra_tokens: 1 + init_extra_token_zero: false + use_alibi_encoder: false + embed_dim: 1280 + decoder: + decoder_dim: 1024 + decoder_groups: 16 + decoder_kernel: 5 + decoder_layers: 3 + input_dropout: 0 \ No newline at end of file diff --git a/fairseq/examples/data2vec/config/v2/large_audio_only_task.yaml b/fairseq/examples/data2vec/config/v2/large_audio_only_task.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f615897215b8c7cee45caab00d69b3342c7ba20 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/large_audio_only_task.yaml @@ -0,0 +1,122 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: audio_pretraining + data: /fsx-wav2vec/abaevski/data/librivox/no_silence + max_sample_size: 320000 + min_sample_size: 32000 + normalize: true + precompute_mask_config: {} + +dataset: + num_workers: 8 + max_tokens: 320000 + skip_invalid_size_inputs_valid_test: true + validate_interval: 5 + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 48 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 600000 + debug_param_names: true + clip_norm: 1 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.0004 + optimizer: + _name: adam + adam_betas: [0.9,0.98] + adam_eps: 1e-06 + weight_decay: 0.01 + 
lr_scheduler: + _name: cosine + warmup_updates: 10000 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + loss_beta: 0 + loss_scale: null + + depth: 16 + embed_dim: 1024 + num_heads: 16 + + clone_batch: 12 + + ema_decay: 0.9997 + ema_end_decay: 1 + ema_anneal_end_step: 300000 + ema_encoder_only: false + + average_top_k_layers: 16 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: false + + layerdrop: 0 + norm_eps: 1e-5 + + supported_modality: AUDIO + + modalities: + audio: + feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]' + conv_pos_depth: 5 + conv_pos_width: 95 + conv_pos_groups: 16 + prenet_depth: 8 + mask_prob: 0.55 + mask_prob_adjust: 0.1 + inverse_mask: false + mask_length: 5 + mask_noise_std: 0.01 + mask_dropout: 0 + add_masks: false + ema_local_encoder: false + use_alibi_encoder: true + prenet_layerdrop: 0 + prenet_dropout: 0.1 + learned_alibi_scale: true + learned_alibi_scale_per_head: true + decoder: + input_dropout: 0.1 + decoder_dim: 768 + decoder_groups: 16 + decoder_kernel: 7 + decoder_layers: 4 diff --git a/fairseq/examples/data2vec/config/v2/large_images_only_task.yaml b/fairseq/examples/data2vec/config/v2/large_images_only_task.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b957fc129deee8cae03a20a4f336716123a45ef --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/large_images_only_task.yaml @@ -0,0 +1,120 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: mae_image_pretraining + data: /datasets01/imagenet_full_size/061417/ + rebuild_batches: true + local_cache_path: /scratch/cache_abaevski/imagenet + key: source + precompute_mask_config: 
{} + +dataset: + num_workers: 10 + batch_size: 8 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 375300 + lr: [ 0.0004 ] + debug_param_names: true + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 4e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + ema_decay: 0.9998 + ema_end_decay: 0.99999 + ema_anneal_end_step: 150000 + instance_norm_target_layer: true + layer_norm_target_layer: false + layer_norm_targets: true + end_of_block_targets: false + + depth: 24 + embed_dim: 1024 + num_heads: 16 + + average_top_k_layers: 18 + clone_batch: 16 + + norm_eps: 1e-6 + + min_target_var: 0 + min_pred_var: 0 + + encoder_dropout: 0 + post_mlp_drop: 0 + attention_dropout: 0 + activation_dropout: 0 + + supported_modality: IMAGE + cls_loss: 0.01 + + ema_encoder_only: false + + modalities: + image: + inverse_mask: true + mask_prob: 0.75 + mask_prob_adjust: 0.1 + mask_length: 3 + mask_noise_std: 0.01 + prenet_depth: 0 + ema_local_encoder: true + num_extra_tokens: 1 + init_extra_token_zero: false + use_alibi_encoder: false + embed_dim: 1024 + decoder: + decoder_dim: 1024 + decoder_groups: 16 + decoder_kernel: 5 + decoder_layers: 3 + input_dropout: 0 \ No newline at end of file diff --git a/fairseq/examples/data2vec/config/v2/large_text_only_task.yaml b/fairseq/examples/data2vec/config/v2/large_text_only_task.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd69048e77d6febb0eee6e7fbe459069900f202c --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/large_text_only_task.yaml @@ 
-0,0 +1,112 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + min_loss_scale: 1e-6 + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + save_interval_updates: 50000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: masked_lm + data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin + sample_break_mode: none + tokens_per_sample: 512 + include_target_tokens: true + random_token_prob: 0 + leave_unmasked_prob: 0 + include_index: True + skip_masking: True + d2v2_multi: True + +dataset: + batch_size: 2 + ignore_unused_valid_subsets: true + skip_invalid_size_inputs_valid_test: true + disable_validation: true + +distributed_training: + distributed_world_size: 32 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +optimization: + max_update: 600000 + clip_norm: 1 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.0001 + optimizer: + _name: adam + adam_betas: [0.9,0.98] + adam_eps: 1e-06 + weight_decay: 0.01 + lr_scheduler: + _name: cosine + warmup_updates: 4000 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + loss_beta: 0 + loss_scale: 1 + + depth: 24 + num_heads: 16 + embed_dim: 1024 + clone_batch: 8 + + ema_decay: 0.9999 + ema_end_decay: 0.99999 + ema_anneal_end_step: 100000 + ema_encoder_only: true + + average_top_k_layers: 24 + layer_norm_target_layer: true + instance_norm_target_layer: false + batch_norm_target_layer: false + instance_norm_targets: true + layer_norm_targets: false + + layerdrop: 0 + norm_eps: 1e-5 + + supported_modality: TEXT + + modalities: + text: + mask_prob: 0.5 + mask_length: 1 + mask_noise_std: 0.01 + prenet_depth: 0 + decoder: + input_dropout: 0.1 + decoder_dim: 768 + decoder_groups: 1 + decoder_kernel: 9 + decoder_layers: 5 + decoder_residual: false + 
projection_layers: 2 + projection_ratio: 2.0 diff --git a/fairseq/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml b/fairseq/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml new file mode 100644 index 0000000000000000000000000000000000000000..739e6f6724eefef493d6b46ad1b815d88431def9 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml @@ -0,0 +1,123 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + user_dir: ${env:PWD}/examples/data2vec + +checkpoint: + no_epoch_checkpoints: true + save_interval_updates: 50000 + keep_interval_updates: 1 + +distributed_training: + distributed_world_size: 32 + ddp_backend: legacy_ddp + +task: + _name: masked_lm + data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin + sample_break_mode: none + tokens_per_sample: 512 + include_target_tokens: true + random_token_prob: 0 + leave_unmasked_prob: 0 + include_index: True + skip_masking: True + d2v2_multi: True + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + - model_norm + - ema_norm + - masked_pct + +dataset: + batch_size: 2 + ignore_unused_valid_subsets: true + skip_invalid_size_inputs_valid_test: true + disable_validation: true + +optimization: + clip_norm: 1 + lr: [3e-4] + max_update: 1000000 + update_freq: [1] + +optimizer: + _name: composite + groups: + default: + lr_float: 1e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.98] + adam_eps: 1e-06 + weight_decay: 0.01 + lr_scheduler: + _name: cosine + warmup_updates: 4000 + decoder: + lr_float: 1e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.98] + adam_eps: 1e-06 + weight_decay: 0.01 + lr_scheduler: + _name: cosine + warmup_updates: 4000 + +lr_scheduler: pass_through + +model: + _name: data2vec_multi + + loss_beta: 4 + loss_scale: 1 + + depth: 24 + num_heads: 16 + embed_dim: 1024 + clone_batch: 8 + + ema_decay: 0.9999 + 
ema_end_decay: 0.99999 + ema_anneal_end_step: 100000 + ema_encoder_only: true + + average_top_k_layers: 24 + layer_norm_target_layer: true + instance_norm_target_layer: false + batch_norm_target_layer: false + instance_norm_targets: true + layer_norm_targets: false + + layerdrop: 0 + norm_eps: 1e-5 + + supported_modality: TEXT + decoder_group: true + + modalities: + text: + mask_prob: 0.5 + mask_length: 1 + mask_noise_std: 0.01 + prenet_depth: 0 + decoder: + input_dropout: 0.1 + decoder_dim: 768 + decoder_groups: 1 + decoder_kernel: 9 + decoder_layers: 5 + decoder_residual: false + projection_layers: 2 + projection_ratio: 2.0 diff --git a/fairseq/examples/data2vec/config/v2/run_config/local.yaml b/fairseq/examples/data2vec/config/v2/run_config/local.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45595f9eea7c369a1200d802bfa1883c1bdfe573 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/fairseq/examples/data2vec/config/v2/run_config/slurm_1.yaml b/fairseq/examples/data2vec/config/v2/run_config/slurm_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..732f0188993a14c847ab5d5e8addb24b0b536311 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/run_config/slurm_1.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - 
common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml b/fairseq/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2184f8cfa252f34ec71b965728d286ccd930eeb --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/run_config/slurm_1_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.local_cache_path + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/v2/run_config/slurm_2.yaml b/fairseq/examples/data2vec/config/v2/run_config/slurm_2.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..ec53dc2a984babdf7e61930350868e38f0d769c3 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/run_config/slurm_2.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/v2/run_config/slurm_2_aws.yaml b/fairseq/examples/data2vec/config/v2/run_config/slurm_2_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..553765597ff5437e9044d1bc0a5f1087438594be --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/run_config/slurm_2_aws.yaml @@ -0,0 +1,39 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.local_cache_path + - task.data + - task.post_save_script + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - 
common.user_dir + - model.model_path + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 12 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/v2/run_config/slurm_3.yaml b/fairseq/examples/data2vec/config/v2/run_config/slurm_3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..14b47d14e6282d5a846cc7c41c6b2080c5ed8c62 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/run_config/slurm_3.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 3 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/v2/run_config/slurm_4.yaml b/fairseq/examples/data2vec/config/v2/run_config/slurm_4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c54d735fb2dc0b0af2d467caa7f64405af18ea10 --- /dev/null +++ 
b/fairseq/examples/data2vec/config/v2/run_config/slurm_4.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/v2/run_config/slurm_4_aws.yaml b/fairseq/examples/data2vec/config/v2/run_config/slurm_4_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a77f62aece140d09373c225504e2025c799e2ea1 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/run_config/slurm_4_aws.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - task.post_save_script + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: 
/fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 12 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/v2/run_config/slurm_6_aws.yaml b/fairseq/examples/data2vec/config/v2/run_config/slurm_6_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20e06582be5673de5b5c19fb996172c9ddac15b2 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/run_config/slurm_6_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 12 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 6 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/v2/run_config/slurm_8.yaml b/fairseq/examples/data2vec/config/v2/run_config/slurm_8.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3ec2c28475076f0c9a1e77c7affa00ff6ed4008 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/run_config/slurm_8.yaml @@ -0,0 +1,37 @@ +# @package 
_global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 8 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/v2/run_config/slurm_8_aws.yaml b/fairseq/examples/data2vec/config/v2/run_config/slurm_8_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9dce876cc9f5c26da77d8c8f35508cf3652c8d3 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/run_config/slurm_8_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + 
cpus_per_task: 12 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 0 + nodes: 8 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/v2/text_finetuning/cola.yaml b/fairseq/examples/data2vec/config/v2/text_finetuning/cola.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4ac4ec8b83107d280b2eb7d6b7ad718dc19803a --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/text_finetuning/cola.yaml @@ -0,0 +1,60 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: mcc + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + report_mcc: True + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 320 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 5336 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? 
diff --git a/fairseq/examples/data2vec/config/v2/text_finetuning/mnli.yaml b/fairseq/examples/data2vec/config/v2/text_finetuning/mnli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a9d6e52f072890273bb50b7b7ba9b6286c27f14 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/text_finetuning/mnli.yaml @@ -0,0 +1,60 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 3 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + valid_subset: valid,valid1 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 7432 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 123873 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? 
diff --git a/fairseq/examples/data2vec/config/v2/text_finetuning/mrpc.yaml b/fairseq/examples/data2vec/config/v2/text_finetuning/mrpc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f93d9d9ea66fe6e341d3cd25749286bcc91e828 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/text_finetuning/mrpc.yaml @@ -0,0 +1,60 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: acc_and_f1 + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + report_acc_and_f1: True + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 137 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 2296 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? 
diff --git a/fairseq/examples/data2vec/config/v2/text_finetuning/qnli.yaml b/fairseq/examples/data2vec/config/v2/text_finetuning/qnli.yaml new file mode 100644 index 0000000000000000000000000000000000000000..739fb53b694287e02f6521e2fe330e5e0c4f455e --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/text_finetuning/qnli.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1986 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 33112 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/fairseq/examples/data2vec/config/v2/text_finetuning/qqp.yaml b/fairseq/examples/data2vec/config/v2/text_finetuning/qqp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9accbaa5210bc7105d7eb58f5982ae6cb5826e1c --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/text_finetuning/qqp.yaml @@ -0,0 +1,60 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? 
+ init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: acc_and_f1 + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + report_acc_and_f1: True + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 28318 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 113272 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/fairseq/examples/data2vec/config/v2/text_finetuning/rte.yaml b/fairseq/examples/data2vec/config/v2/text_finetuning/rte.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea07764d9835c9dd22bde49ee7152a4a78fa87ba --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/text_finetuning/rte.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? 
+ init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 122 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 2036 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/fairseq/examples/data2vec/config/v2/text_finetuning/run_config/local.yaml b/fairseq/examples/data2vec/config/v2/text_finetuning/run_config/local.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45595f9eea7c369a1200d802bfa1883c1bdfe573 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/text_finetuning/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/fairseq/examples/data2vec/config/v2/text_finetuning/sst_2.yaml b/fairseq/examples/data2vec/config/v2/text_finetuning/sst_2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a273e5b9434cabd732d8a37b9fc8063ac8399d8f --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/text_finetuning/sst_2.yaml @@ -0,0 +1,59 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: 
??? + init_token: 0 + separator_token: 2 + num_classes: 2 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + +dataset: + batch_size: 32 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 1256 + +optimization: + clip_norm: 0.0 + lr: [2e-05] + max_update: 20935 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/fairseq/examples/data2vec/config/v2/text_finetuning/sts_b.yaml b/fairseq/examples/data2vec/config/v2/text_finetuning/sts_b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fb009ab95b53fd88342ea75b3e5e1eab344b0679 --- /dev/null +++ b/fairseq/examples/data2vec/config/v2/text_finetuning/sts_b.yaml @@ -0,0 +1,61 @@ +# @package _group_ + +common: + fp16: true + fp16_init_scale: 4 + threshold_loss_scale: 1 + fp16_scale_window: 128 + log_format: json + log_interval: 200 + user_dir: ${env:PWD}/examples/data2vec + +task: + _name: sentence_prediction + data: ??? 
+ init_token: 0 + separator_token: 2 + num_classes: 1 + max_positions: 512 + d2v2_multi: True + +checkpoint: + best_checkpoint_metric: pearson_and_spearman + maximize_best_checkpoint_metric: true + no_epoch_checkpoints: true + +distributed_training: + find_unused_parameters: true + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +criterion: + _name: sentence_prediction + regression_target: true + report_pearson_and_spearman: True + +dataset: + batch_size: 16 + required_batch_size_multiple: 1 + max_tokens: 4400 + num_workers: 1 + +optimizer: + _name: adam + weight_decay: 0.1 + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + +lr_scheduler: + _name: polynomial_decay + warmup_updates: 214 + +optimization: + clip_norm: 0.0 + lr: [4e-05] + max_update: 3598 + max_epoch: 10 + +model: + _name: data2vec_text_classification + model_path: ??? diff --git a/fairseq/examples/data2vec/config/vision/finetuning/imagenet.yaml b/fairseq/examples/data2vec/config/vision/finetuning/imagenet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6d4864cca01098e331c6eeb8c174851bdd8bc6c --- /dev/null +++ b/fairseq/examples/data2vec/config/vision/finetuning/imagenet.yaml @@ -0,0 +1,52 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: accuracy + +task: + _name: image_classification + data: /datasets01/imagenet_full_size/061417 + +dataset: + num_workers: 6 + batch_size: 64 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + valid_subset: val + +distributed_training: + distributed_world_size: 8 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - correct + +optimization: + max_update: 100000 + lr: [0.0005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: 
+ _name: cosine + warmup_updates: 10000 + +model: + _name: data2vec_image_classification + model_path: ??? diff --git a/fairseq/examples/data2vec/config/vision/finetuning/mae_imagenet_clean.yaml b/fairseq/examples/data2vec/config/vision/finetuning/mae_imagenet_clean.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17d4c0a8f57140ee2e24555c46aff875039b5d39 --- /dev/null +++ b/fairseq/examples/data2vec/config/vision/finetuning/mae_imagenet_clean.yaml @@ -0,0 +1,65 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + +task: + _name: mae_image_classification + data: /datasets01/imagenet_full_size/061417 + +dataset: + num_workers: 6 + batch_size: 32 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 2 + valid_subset: val + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - correct + +optimization: + max_update: 250200 + lr: [0.001] + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.001 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 16000 + min_lr: 1e-6 + + +lr_scheduler: pass_through + +model: + _name: mae_image_classification + mixup: 0.7 + mixup_prob: 0.9 + + model_path: ??? 
diff --git a/fairseq/examples/data2vec/config/vision/finetuning/mae_imagenet_huge_clean.yaml b/fairseq/examples/data2vec/config/vision/finetuning/mae_imagenet_huge_clean.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d2eb57bac1a89bc705f21dfa9768cfbfb14e26c --- /dev/null +++ b/fairseq/examples/data2vec/config/vision/finetuning/mae_imagenet_huge_clean.yaml @@ -0,0 +1,68 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + +task: + _name: mae_image_classification + data: /datasets01/imagenet_full_size/061417 + +dataset: + num_workers: 6 + batch_size: 32 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 2 + valid_subset: val + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - correct + +optimization: + max_update: 125200 + lr: [0.0005] + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.0005 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 16000 + min_lr: 1e-20 + + +lr_scheduler: pass_through + +model: + _name: mae_image_classification + mixup: 0.7 + mixup_prob: 0.9 + layer_decay: 0.75 + drop_path_rate: 0.2 + + model_path: ??? 
diff --git a/fairseq/examples/data2vec/config/vision/finetuning/mae_imagenet_large_clean.yaml b/fairseq/examples/data2vec/config/vision/finetuning/mae_imagenet_large_clean.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a9413cef6a41e09b541c7f88b5bbd785bba2289 --- /dev/null +++ b/fairseq/examples/data2vec/config/vision/finetuning/mae_imagenet_large_clean.yaml @@ -0,0 +1,68 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + +checkpoint: + save_interval: 1 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + best_checkpoint_metric: accuracy + maximize_best_checkpoint_metric: true + +task: + _name: mae_image_classification + data: /datasets01/imagenet_full_size/061417 + +dataset: + num_workers: 6 + batch_size: 32 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 2 + valid_subset: val + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - correct + +optimization: + max_update: 125200 + lr: [0.0005] + clip_norm: 4 + +optimizer: + _name: composite + dynamic_groups: true + groups: + default: + lr_float: 0.0005 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 16000 + min_lr: 1e-7 + + +lr_scheduler: pass_through + +model: + _name: mae_image_classification + mixup: 0.7 + mixup_prob: 0.9 + layer_decay: 0.75 + drop_path_rate: 0.2 + + model_path: ??? 
diff --git a/fairseq/examples/data2vec/config/vision/finetuning/run_config/local.yaml b/fairseq/examples/data2vec/config/vision/finetuning/run_config/local.yaml new file mode 100644 index 0000000000000000000000000000000000000000..45595f9eea7c369a1200d802bfa1883c1bdfe573 --- /dev/null +++ b/fairseq/examples/data2vec/config/vision/finetuning/run_config/local.yaml @@ -0,0 +1,15 @@ +# @package _global_ +hydra: + sweep: + dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} + +distributed_training: + distributed_world_size: 1 + nprocs_per_node: 1 + distributed_port: -1 + +common: + log_interval: 1 + +dataset: + num_workers: 0 diff --git a/fairseq/examples/data2vec/config/vision/finetuning/run_config/slurm_1.yaml b/fairseq/examples/data2vec/config/vision/finetuning/run_config/slurm_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..732f0188993a14c847ab5d5e8addb24b0b536311 --- /dev/null +++ b/fairseq/examples/data2vec/config/vision/finetuning/run_config/slurm_1.yaml @@ -0,0 +1,37 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 450 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git 
a/fairseq/examples/data2vec/config/vision/finetuning/run_config/slurm_4.yaml b/fairseq/examples/data2vec/config/vision/finetuning/run_config/slurm_4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c54d735fb2dc0b0af2d467caa7f64405af18ea10 --- /dev/null +++ b/fairseq/examples/data2vec/config/vision/finetuning/run_config/slurm_4.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 4 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/vision/pretraining/base_imagenet.yaml b/fairseq/examples/data2vec/config/vision/pretraining/base_imagenet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9bfc0f32b68a740909bcfd7515a55bdbae69edaa --- /dev/null +++ b/fairseq/examples/data2vec/config/vision/pretraining/base_imagenet.yaml @@ -0,0 +1,52 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: image_pretraining + data: /datasets01/imagenet_full_size/061417/ + +dataset: + 
num_workers: 6 + batch_size: 64 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + log_keys: + - ema_decay + - target_var + - pred_var + +optimization: + max_update: 400000 + lr: [0.0005] + +optimizer: + _name: adam + adam_betas: (0.9,0.98) + adam_eps: 1e-06 + weight_decay: 0.01 + +lr_scheduler: + _name: cosine + warmup_updates: 10000 + +model: + _name: data2vec_vision diff --git a/fairseq/examples/data2vec/config/vision/pretraining/base_mae_imagenet.yaml b/fairseq/examples/data2vec/config/vision/pretraining/base_mae_imagenet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d7872b5e0445cd909e71a3f7ab36732aa40c5b69 --- /dev/null +++ b/fairseq/examples/data2vec/config/vision/pretraining/base_mae_imagenet.yaml @@ -0,0 +1,64 @@ +# @package _group_ + +common: + fp16: true + log_format: json + log_interval: 200 + tensorboard_logdir: tb + fp16_no_flatten_grads: true + +checkpoint: + save_interval: 5 + save_interval_updates: 25000 + keep_interval_updates: 1 + no_epoch_checkpoints: true + +task: + _name: mae_image_pretraining + data: /datasets01/imagenet_full_size/061417/ + rebuild_batches: true + +dataset: + num_workers: 6 + batch_size: 64 + skip_invalid_size_inputs_valid_test: true + required_batch_size_multiple: 1 + disable_validation: true + +distributed_training: + distributed_world_size: 16 + ddp_backend: c10d + +criterion: + _name: model + +optimization: + max_update: 375300 + lr: [0.0006] + +optimizer: + _name: composite + groups: + with_decay: + lr_float: 6e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0.05 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + no_decay: + lr_float: 6e-4 + optimizer: + _name: adam + adam_betas: [0.9,0.95] + weight_decay: 0 + lr_scheduler: + _name: cosine + warmup_updates: 50040 + +lr_scheduler: pass_through + +model: + 
_name: mae diff --git a/fairseq/examples/data2vec/config/vision/pretraining/run_config/slurm_1_aws.yaml b/fairseq/examples/data2vec/config/vision/pretraining/run_config/slurm_1_aws.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e2bab5675a561a6a01b5db06530b119a282bbf4a --- /dev/null +++ b/fairseq/examples/data2vec/config/vision/pretraining/run_config/slurm_1_aws.yaml @@ -0,0 +1,36 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 80 + gpus_per_node: 8 + tasks_per_node: 1 + mem_gb: 0 + nodes: 1 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: wav2vec,learnlab,learnfair + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/config/vision/pretraining/run_config/slurm_2.yaml b/fairseq/examples/data2vec/config/vision/pretraining/run_config/slurm_2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8b0f02a9b7795d900804c557cae6576153bbaf0 --- /dev/null +++ b/fairseq/examples/data2vec/config/vision/pretraining/run_config/slurm_2.yaml @@ -0,0 +1,38 @@ +# @package _global_ + +hydra: + job: + config: + override_dirname: + kv_sep: ':' + item_sep: '/' + exclude_keys: + - run_config + - distributed_training.distributed_port + - distributed_training.distributed_world_size + - model.pretrained_model_path + - model.target_network_path + - 
next_script + - task.cache_in_scratch + - task.data + - checkpoint.save_interval_updates + - checkpoint.keep_interval_updates + - checkpoint.save_on_overflow + - common.log_interval + - common.user_dir + - task.local_cache_path + sweep: + dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} + subdir: '' + launcher: + submitit_folder: ${hydra.sweep.dir} + timeout_min: 4320 + cpus_per_task: 10 + gpus_per_node: 8 + tasks_per_node: 8 + mem_gb: 450 + nodes: 2 + name: ${env:PREFIX}_${hydra.job.config_name} + partition: devlab,learnlab,learnfair,scavenge + constraint: volta32gb,ib4 + max_num_timeout: 30 diff --git a/fairseq/examples/data2vec/fb_convert_beit_cp.py b/fairseq/examples/data2vec/fb_convert_beit_cp.py new file mode 100644 index 0000000000000000000000000000000000000000..cf42ace762d12353d98ebdcd77f649d9b7025dca --- /dev/null +++ b/fairseq/examples/data2vec/fb_convert_beit_cp.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
import argparse
import torch

from omegaconf import OmegaConf

from fairseq.criterions.model_criterion import ModelCriterionConfig
from fairseq.dataclass.configs import FairseqConfig

from tasks import ImageClassificationConfig, ImagePretrainingConfig
from models.data2vec_image_classification import (
    Data2VecImageClassificationConfig,
    Data2VecImageClassificationModel,
)
from models.data2vec_vision import Data2VecVisionConfig, Data2VecVisionModel


def get_parser():
    """Return the argparse parser describing this script's command line.

    Arguments: a positional ``checkpoint`` path, a required ``--output`` path,
    ``--type`` (``vision`` or ``image_classification``, the latter being the
    default) and the boolean flag ``--inception_norms``.
    """
    cli = argparse.ArgumentParser(
        description="convert beit checkpoint into data2vec - vision checkpoint"
    )
    # fmt: off
    cli.add_argument('checkpoint', help='checkpoint to convert')
    cli.add_argument('--output', required=True, metavar='PATH', help='where to output converted checkpoint')
    cli.add_argument('--type', type=str, choices=['vision', 'image_classification'], default='image_classification', help='type of model to upgrade')
    cli.add_argument('--inception_norms', action='store_true', default=False)
    # fmt: on

    return cli


def update_checkpoint(model_dict, prefix, is_nested):
    """Rename the keys of ``model_dict`` in place and return the same dict.

    Renaming rules, applied per key (a later rule overrides an earlier one):

    * exact names ``cls_token``, ``patch_embed`` and ``mask_token`` have fixed
      targets (the first two gain a ``model.`` prefix when ``is_nested``);
    * keys starting with ``patch_embed.proj``, ``lm_head``, ``fc_norm`` or
      ``head`` get that text substituted, without ``prefix`` being added;
    * keys containing ``mlp.fc1`` / ``mlp.fc2`` get ``mlp.0`` / ``mlp.2``
      substituted and ``prefix`` prepended;
    * when ``prefix`` is not the empty string, every key not matched above is
      simply prepended with ``prefix``.

    Args:
        model_dict: mapping from parameter name to value; mutated in place.
        prefix: string prepended to renamed keys as described above.
        is_nested: whether the fixed/``patch_embed.proj`` targets live under
            an extra ``model.`` level.

    Returns:
        ``model_dict`` itself, with old names removed wherever the new name
        differs from the old one.
    """
    nest = "model." if is_nested else ""

    # old name -> new name; seeded with the exact-match renames.
    renames = {
        "cls_token": nest + "cls_emb",
        "patch_embed": nest + "patch_embed",
        "mask_token": "mask_emb",
    }

    # (leading text, replacement) pairs; checked in this order.
    prefix_rules = (
        ("patch_embed.proj", nest + "patch_embed.conv"),
        ("lm_head", "final_proj"),
        ("fc_norm", "fc_norm"),
        ("head", "head"),
    )

    # (substring, replacement) pairs; these win over the prefix rules.
    substring_rules = (
        ("mlp.fc1", "mlp.0"),
        ("mlp.fc2", "mlp.2"),
    )

    # Snapshot of the names before anything is renamed.
    original_keys = tuple(model_dict.keys())

    for key in original_keys:
        for leading, replacement in prefix_rules:
            if key.startswith(leading):
                renames[key] = key.replace(leading, replacement)
        for fragment, replacement in substring_rules:
            if fragment in key:
                renames[key] = prefix + key.replace(fragment, replacement)

    if prefix != "":
        # Everything without an explicit rule just moves under the prefix.
        unmapped = [key for key in original_keys if key not in renames]
        for key in unmapped:
            renames[key] = prefix + key

    for key in original_keys:
        if key not in renames:
            continue
        target = renames[key]
        model_dict[target] = model_dict[key]
        if target != key:
            del model_dict[key]

    return model_dict


def _use_inception_norms(task_cfg):
    """Set both normalization mean and std of ``task_cfg`` to 0.5 per channel."""
    task_cfg.normalization_mean = [0.5] * 3
    task_cfg.normalization_std = [0.5] * 3


def _new_state(cfg, checkpoint, weights_key):
    """Assemble the dict that gets saved: resolved config plus raw weights."""
    return {
        "cfg": OmegaConf.to_container(cfg, resolve=True, enum_to_str=True),
        "model": checkpoint[weights_key],
        "best_loss": None,
        "optimizer": None,
        "extra_state": {},
    }


def main():
    """Convert the checkpoint named on the command line and save the result.

    Loads the input checkpoint on CPU, builds a config for the requested model
    type, renames the weights with ``update_checkpoint``, verifies them by
    loading into a freshly built model with ``strict=True``, prints the config
    and state keys, and writes the state to ``--output``.
    """
    args = get_parser().parse_args()

    checkpoint = torch.load(args.checkpoint, map_location="cpu")

    cfg = FairseqConfig(
        criterion=ModelCriterionConfig(_name="model", log_keys=["correct"]),
    )

    kind = args.type
    if kind == "image_classification":
        cfg.task = ImageClassificationConfig(
            _name="image_classification",
            data=".",
        )
        if args.inception_norms:
            _use_inception_norms(cfg.task)

        cfg.model = Data2VecImageClassificationConfig(
            _name="data2vec_image_classification",
        )
        pretrain_model_cfg = Data2VecVisionConfig(
            _name="data2vec_vision", shared_rel_pos_bias=False
        )
        pretrain_task_cfg = ImagePretrainingConfig(
            _name="image_pretraining",
        )
        cfg.model.pretrained_model_args = FairseqConfig(
            model=pretrain_model_cfg,
            task=pretrain_task_cfg,
        )

        cfg = OmegaConf.create(cfg)

        # NOTE(review): this branch reads weights from "module" while the
        # "vision" branch reads "model" -- presumably matching how each kind
        # of source checkpoint was written; confirm.
        state = _new_state(cfg, checkpoint, "module")

        model = Data2VecImageClassificationModel(cfg.model)
        converted = update_checkpoint(
            state["model"], prefix="model.encoder.", is_nested=True
        )
        model.load_state_dict(converted, strict=True)
    elif kind == "vision":
        cfg.task = ImagePretrainingConfig(
            _name="image_pretraining",
            data=".",
        )
        if args.inception_norms:
            _use_inception_norms(cfg.task)

        cfg.model = Data2VecVisionConfig(
            _name="data2vec_vision",
        )
        cfg = OmegaConf.create(cfg)

        state = _new_state(cfg, checkpoint, "model")

        model = Data2VecVisionModel(cfg.model)
        converted = update_checkpoint(
            state["model"], prefix="encoder.", is_nested=False
        )
        model.load_state_dict(converted, strict=True)
    else:
        raise Exception("unsupported type " + args.type)

    print(state["cfg"], state.keys())
    torch.save(state, args.output)


if __name__ == "__main__":
    main()