#!/usr/bin/env bash
#
# Derives per-split text files from two TSV manifests and binarizes them with
# fairseq's preprocess.py.
#
# Usage: <script> DEST_DIR TRAIN_SPLIT VALID_SPLIT FAIRSEQ_ROOT [WORKERS]
#   DEST_DIR      output directory (created if missing)
#   TRAIN_SPLIT   path of the training manifest WITHOUT the ".tsv" suffix
#   VALID_SPLIT   path of the validation manifest WITHOUT the ".tsv" suffix
#   FAIRSEQ_ROOT  directory containing fairseq_cli/preprocess.py
#   WORKERS       optional worker count passed to preprocess.py (default: 60)
#
# For each split (train, valid) the following files are written to DEST_DIR:
#   <split>.root        the first line of the manifest
#   <split>_fnames.txt  column 1 of every remaining line, with a space
#                       inserted after each character
#   <split>.lengths     column 2 of every remaining line
# preprocess.py is then run on train_fnames.txt / valid_fnames.txt with
# DEST_DIR as its --destdir.
#
# NOTE(review): the first manifest line is presumably the audio root directory
# and column 2 the sample count, as in fairseq wav2vec manifests -- confirm.

set -euo pipefail

# Prints a message to stderr and aborts with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Prints the usage line to stderr and aborts with status 2.
usage() {
  printf 'Usage: %s DEST_DIR TRAIN_SPLIT VALID_SPLIT FAIRSEQ_ROOT [WORKERS]\n' \
    "${0##*/}" >&2
  exit 2
}

#######################################
# Splits one manifest into its .root, _fnames.txt and .lengths files.
# Arguments:
#   $1 - path to the manifest (.tsv) file
#   $2 - output prefix, e.g. "$dest_dir/train"
# Returns:
#   non-zero (via set -e / pipefail) if any stage fails
#######################################
extract_split() {
  local manifest=$1
  local out_prefix=$2

  [[ -f "$manifest" ]] || die "manifest not found: $manifest"

  # Header line goes to <prefix>.root.
  head -n 1 < "$manifest" > "${out_prefix}.root"

  # Skipping the header up front (tail -n +2) yields the same bytes as cutting
  # first and deleting line 1 afterwards, without relying on GNU-only `sed -i`.
  # The sed expression appends a space after every character of column 1.
  tail -n +2 < "$manifest" \
    | cut -f1 \
    | sed -e 's/\(.\)/\1 /g' > "${out_prefix}_fnames.txt"

  tail -n +2 < "$manifest" \
    | cut -f2 > "${out_prefix}.lengths"
}

main() {
  # Without this guard, empty arguments would make the redirections below
  # target paths such as "/train_fnames.txt".
  (( $# >= 4 )) || usage

  local dest_dir=$1
  local train_split=$2
  local valid_split=$3
  local fairseq_root=$4
  local workers=${5:-60}

  [[ -n "$dest_dir" ]] || die "DEST_DIR must not be empty"
  [[ "$workers" =~ ^[1-9][0-9]*$ ]] \
    || die "WORKERS must be a positive integer, got: $workers"

  # Fail before writing anything if the binarizer is not where we expect it.
  local preprocess="$fairseq_root/fairseq_cli/preprocess.py"
  [[ -f "$preprocess" ]] || die "preprocess.py not found: $preprocess"

  mkdir -p -- "$dest_dir"

  extract_split "${train_split}.tsv" "$dest_dir/train"
  extract_split "${valid_split}.tsv" "$dest_dir/valid"

  PYTHONPATH="$fairseq_root" python "$preprocess" \
    --dataset-impl mmap \
    --trainpref "$dest_dir/train_fnames.txt" \
    --validpref "$dest_dir/valid_fnames.txt" \
    --workers "$workers" \
    --only-source \
    --destdir "$dest_dir"
}

main "$@"