#!/usr/bin/env bash
#
# Data preparation for BPE-based language-model training on the PTB text
# files. The stages implemented in this script are:
#   -1  download ptb.{train,valid,test}.txt into $dl_dir
#    0  train one BPE model per entry of vocab_sizes
#    1  turn the three text files into LM training archives
#    2  sort those archives and write per-archive statistics

# Select the pure-Python protobuf backend for every Python process started
# from this script.
# NOTE(review): the reason for this override is not recorded in this file;
# confirm before removing it.
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# Abort on a failing command (-e), on expansion of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Defaults. A stage N runs when stage <= N <= stop_stage.
# NOTE(review): nj is not referenced anywhere in the visible stages; it is
# kept because the option parser sourced below may expect it -- confirm.
nj=15
stage=-1
stop_stage=100

# Directory that receives the downloaded corpus files.
dl_dir=$PWD/download

# NOTE(review): presumably lets command-line flags override the defaults
# declared above; verify against shared/parse_options.sh.
. shared/parse_options.sh || exit 1
|
|
| |
| |
| |
# Vocabulary sizes to process. Every later stage loops over this list and
# derives its directory names (data/lang_bpe_<size>,
# data/lm_training_bpe_<size>) from each entry.
vocab_sizes=(
  500
)

# Create the output root and the download directory up front. dl_dir is
# quoted because its default is derived from $PWD, which may contain spaces.
mkdir -p data
mkdir -p "$dl_dir"
|
|
#######################################
# Print a timestamped message tagged with the caller's location.
# Arguments: $* - message words, joined by the first character of IFS.
#            Backslash escapes in the message are interpreted, because the
#            line is printed with `echo -e`.
# Outputs:   one line on stdout of the form
#            "YYYY-MM-DD HH:MM:SS (file:line:function) message"
#######################################
log() {
  # BASH_SOURCE[1] and FUNCNAME[1] describe the caller of log, and
  # BASH_LINENO[0] is the line the call was made from. The ":-" defaults
  # keep `set -u` from aborting when no caller frame exists.
  local fname=${BASH_SOURCE[1]:-}
  fname=${fname##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]:-}:${FUNCNAME[1]:-}) $*"
}
|
|
log "dl_dir: $dl_dir"

if [ "$stage" -le -1 ] && [ "$stop_stage" -ge -1 ]; then
  log "Stage -1: Download data"

  # $dl_dir/.complete is written only after all three files have been
  # fetched, so a rerun skips the download once it has fully succeeded.
  if [ ! -f "$dl_dir/.complete" ]; then
    url=https://raw.githubusercontent.com/townie/PTB-dataset-from-Tomas-Mikolov-s-webpage/master/data
    for split in train valid test; do
      # Download under a temporary name and rename on success. With
      # --directory-prefix alone, wget never overwrites: a retry after an
      # interrupted run would save ptb.<split>.txt.1 and leave the truncated
      # ptb.<split>.txt in place for the later stages to read.
      wget --output-document "$dl_dir/ptb.$split.txt.part" "$url/ptb.$split.txt"
      mv -- "$dl_dir/ptb.$split.txt.part" "$dl_dir/ptb.$split.txt"
    done
    touch "$dl_dir/.complete"
  fi
fi
|
|
if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Train BPE model"

  # One language directory per vocabulary size; each model is trained on
  # the downloaded PTB training text.
  for vocab_size in "${vocab_sizes[@]}"; do
    lang_dir=data/lang_bpe_${vocab_size}
    mkdir -p "$lang_dir"
    ./local/train_bpe_model.py \
      --lang-dir "$lang_dir" \
      --vocab-size "$vocab_size" \
      --transcript "$dl_dir/ptb.train.txt"
  done
fi
|
|
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Generate LM training data"

  for vocab_size in "${vocab_sizes[@]}"; do
    # NOTE(review): bpe.model is presumably written into lang_dir by
    # stage 0 -- confirm against local/train_bpe_model.py.
    lang_dir=data/lang_bpe_${vocab_size}
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p "$out_dir"

    for split in train valid test; do
      # The training archive is named lm_data.pt; the other two carry
      # their split as a suffix (lm_data-valid.pt, lm_data-test.pt).
      suffix=
      if [ "$split" != train ]; then
        suffix=-$split
      fi
      ./local/prepare_lm_training_data.py \
        --bpe-model "$lang_dir/bpe.model" \
        --lm-data "$dl_dir/ptb.$split.txt" \
        --lm-archive "$out_dir/lm_data${suffix}.pt"
    done
  done
fi
|
|
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Sort LM training data"

  for vocab_size in "${vocab_sizes[@]}"; do
    out_dir=data/lm_training_bpe_${vocab_size}
    mkdir -p "$out_dir"

    # An empty suffix selects the training archive (lm_data.pt); -valid and
    # -test select the other two. Each run reads lm_data<suffix>.pt and
    # writes sorted_lm_data<suffix>.pt plus statistics<suffix>.txt.
    for suffix in "" -valid -test; do
      ./local/sort_lm_training_data.py \
        --in-lm-data "$out_dir/lm_data${suffix}.pt" \
        --out-lm-data "$out_dir/sorted_lm_data${suffix}.pt" \
        --out-statistics "$out_dir/statistics${suffix}.txt"
    done
  done
fi
|
|