| | #!/bin/bash |
| | |
| | |
| | |
| | |
| |
|
| |
|
| | |
| | if [[ $# -ne 2 ]]; then |
| | echo "Run as following:" |
| | echo "./examples/roberta/preprocess_GLUE_tasks.sh <glud_data_folder> <task_name>" |
| | exit 1 |
| | fi |
| |
|
| | GLUE_DATA_FOLDER=$1 |
| |
|
| | |
| | wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' |
| | wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' |
| | wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' |
| |
|
| | TASKS=$2 |
| |
|
| | if [ "$TASKS" = "ALL" ] |
| | then |
| | TASKS="QQP MNLI QNLI MRPC RTE STS-B SST-2 CoLA" |
| | fi |
| |
|
| | for TASK in $TASKS |
| | do |
| | echo "Preprocessing $TASK" |
| |
|
| | TASK_DATA_FOLDER="$GLUE_DATA_FOLDER/$TASK" |
| | echo "Raw data as downloaded from glue website: $TASK_DATA_FOLDER" |
| |
|
| | SPLITS="train dev test" |
| | INPUT_COUNT=2 |
| | if [ "$TASK" = "QQP" ] |
| | then |
| | INPUT_COLUMNS=( 4 5 ) |
| | TEST_INPUT_COLUMNS=( 2 3 ) |
| | LABEL_COLUMN=6 |
| | elif [ "$TASK" = "MNLI" ] |
| | then |
| | SPLITS="train dev_matched dev_mismatched test_matched test_mismatched" |
| | INPUT_COLUMNS=( 9 10 ) |
| | TEST_INPUT_COLUMNS=( 9 10 ) |
| | DEV_LABEL_COLUMN=16 |
| | LABEL_COLUMN=12 |
| | elif [ "$TASK" = "QNLI" ] |
| | then |
| | INPUT_COLUMNS=( 2 3 ) |
| | TEST_INPUT_COLUMNS=( 2 3 ) |
| | LABEL_COLUMN=4 |
| | elif [ "$TASK" = "MRPC" ] |
| | then |
| | INPUT_COLUMNS=( 4 5 ) |
| | TEST_INPUT_COLUMNS=( 4 5 ) |
| | LABEL_COLUMN=1 |
| | elif [ "$TASK" = "RTE" ] |
| | then |
| | INPUT_COLUMNS=( 2 3 ) |
| | TEST_INPUT_COLUMNS=( 2 3 ) |
| | LABEL_COLUMN=4 |
| | elif [ "$TASK" = "STS-B" ] |
| | then |
| | INPUT_COLUMNS=( 8 9 ) |
| | TEST_INPUT_COLUMNS=( 8 9 ) |
| | LABEL_COLUMN=10 |
| | |
| | elif [ "$TASK" = "SST-2" ] |
| | then |
| | INPUT_COLUMNS=( 1 ) |
| | TEST_INPUT_COLUMNS=( 2 ) |
| | LABEL_COLUMN=2 |
| | INPUT_COUNT=1 |
| | elif [ "$TASK" = "CoLA" ] |
| | then |
| | INPUT_COLUMNS=( 4 ) |
| | TEST_INPUT_COLUMNS=( 2 ) |
| | LABEL_COLUMN=2 |
| | INPUT_COUNT=1 |
| | fi |
| |
|
| | |
| | rm -rf "$TASK_DATA_FOLDER/processed" |
| | mkdir -p "$TASK_DATA_FOLDER/processed" |
| | for SPLIT in $SPLITS |
| | do |
| | |
| | if [[ ( "$TASK" = "CoLA") && ( "$SPLIT" != "test" ) ]] |
| | then |
| | cp "$TASK_DATA_FOLDER/$SPLIT.tsv" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; |
| | else |
| | tail -n +2 "$TASK_DATA_FOLDER/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; |
| | fi |
| |
|
| | |
| | if [[ ( "$TASK" = "QQP") && ( "$SPLIT" != "test" ) ]] |
| | then |
| | awk -F '\t' -v NUM_FIELDS=6 'NF==NUM_FIELDS{print}{}' "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv"; |
| | else |
| | cp "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv"; |
| | fi |
| | rm "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; |
| | done |
| |
|
| | |
| | for SPLIT in $SPLITS |
| | do |
| | for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) |
| | do |
| | if [[ "$SPLIT" != test* ]] |
| | then |
| | COLUMN_NUMBER=${INPUT_COLUMNS[$INPUT_TYPE]} |
| | else |
| | COLUMN_NUMBER=${TEST_INPUT_COLUMNS[$INPUT_TYPE]} |
| | fi |
| | cut -f"$COLUMN_NUMBER" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.raw.input$INPUT_TYPE"; |
| | done |
| |
|
| | if [[ "$SPLIT" != test* ]] |
| | then |
| | if [ "$TASK" = "MNLI" ] && [ "$SPLIT" != "train" ] |
| | then |
| | cut -f"$DEV_LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label"; |
| | else |
| | cut -f"$LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label"; |
| | fi |
| | fi |
| |
|
| | |
| | for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) |
| | do |
| | LANG="input$INPUT_TYPE" |
| | echo "BPE encoding $SPLIT/$LANG" |
| | python -m examples.roberta.multiprocessing_bpe_encoder \ |
| | --encoder-json encoder.json \ |
| | --vocab-bpe vocab.bpe \ |
| | --inputs "$TASK_DATA_FOLDER/processed/$SPLIT.raw.$LANG" \ |
| | --outputs "$TASK_DATA_FOLDER/processed/$SPLIT.$LANG" \ |
| | --workers 60 \ |
| | --keep-empty; |
| | done |
| | done |
| |
|
| | |
| | rm -rf "$TASK-bin" |
| |
|
| | DEVPREF="$TASK_DATA_FOLDER/processed/dev.LANG" |
| | TESTPREF="$TASK_DATA_FOLDER/processed/test.LANG" |
| | if [ "$TASK" = "MNLI" ] |
| | then |
| | DEVPREF="$TASK_DATA_FOLDER/processed/dev_matched.LANG,$TASK_DATA_FOLDER/processed/dev_mismatched.LANG" |
| | TESTPREF="$TASK_DATA_FOLDER/processed/test_matched.LANG,$TASK_DATA_FOLDER/processed/test_mismatched.LANG" |
| | fi |
| |
|
| | |
| | for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) |
| | do |
| | LANG="input$INPUT_TYPE" |
| | fairseq-preprocess \ |
| | --only-source \ |
| | --trainpref "$TASK_DATA_FOLDER/processed/train.$LANG" \ |
| | --validpref "${DEVPREF//LANG/$LANG}" \ |
| | --testpref "${TESTPREF//LANG/$LANG}" \ |
| | --destdir "$TASK-bin/$LANG" \ |
| | --workers 60 \ |
| | --srcdict dict.txt; |
| | done |
| | if [[ "$TASK" != "STS-B" ]] |
| | then |
| | fairseq-preprocess \ |
| | --only-source \ |
| | --trainpref "$TASK_DATA_FOLDER/processed/train.label" \ |
| | --validpref "${DEVPREF//LANG/label}" \ |
| | --destdir "$TASK-bin/label" \ |
| | --workers 60; |
| | else |
| | |
| | mkdir -p "$TASK-bin/label" |
| | awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/train.label" > "$TASK-bin/label/train.label" |
| | awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/dev.label" > "$TASK-bin/label/valid.label" |
| | fi |
| | done |
| |
|