#!/bin/bash
#
# Apply SentencePiece tokenization to the source and target side of one
# data split.
#
# Usage:
#   <script> exp_dir data_dir bpe_dir src_lang tgt_lang split [parallel_installed]
#
# Arguments:
#   exp_dir             directory containing vocab/model.SRC and vocab/model.TGT
#   data_dir            directory containing the input files <split>.<lang>
#   bpe_dir             directory receiving the encoded files <split>.<lang>
#   src_lang, tgt_lang  file-name suffixes of the source / target side
#   split               file-name stem shared by both sides
#   parallel_installed  "true" to pipe the input through GNU parallel
#                       (default: false)
#
# Exit status: 0 on success, 2 on a usage error, non-zero if encoding fails.

# Print a diagnostic on stderr.
err() {
  printf '%s\n' "$*" >&2
}

# Print the command-line synopsis on stderr.
usage() {
  err "Usage: ${0##*/} exp_dir data_dir bpe_dir src_lang tgt_lang split [parallel_installed]"
}

#######################################
# Encode one text file with spm_encode (--output_format=piece).
# Arguments:
#   $1 - SentencePiece model file
#   $2 - input text file (fed to spm_encode on stdin)
#   $3 - output file (receives spm_encode's stdout)
#   $4 - "true" to run spm_encode under `parallel --pipe --keep-order`
# Returns:
#   0 on success, non-zero if a precondition or the encoder fails.
#######################################
spm_encode_file() {
  local model=$1
  local input=$2
  local output=$3
  local use_parallel=$4

  # Validate before redirecting so a bad path never truncates the output.
  if [[ ! -f "$model" ]]; then
    err "error: model file not found: $model"
    return 1
  fi
  if [[ ! -r "$input" ]]; then
    err "error: input file not readable: $input"
    return 1
  fi
  mkdir -p -- "$(dirname -- "$output")" || return 1

  # Build the command as an array so paths with spaces stay single words.
  local -a cmd=(spm_encode "--model=$model" --output_format=piece)
  if [[ "$use_parallel" == "true" ]]; then
    cmd=(parallel --pipe --keep-order "${cmd[@]}")
  fi

  "${cmd[@]}" < "$input" > "$output"
}

#######################################
# Entry point: tokenize the SRC corpus, then the TGT corpus.
# Arguments: see the usage block at the top of the file.
# Returns:   0 on success, 2 on usage error, encoder status otherwise.
#######################################
main() {
  if (( $# < 6 )); then
    usage
    return 2
  fi

  local exp_dir=$1
  local data_dir=$2
  local bpe_dir=$3
  local src_lang=$4
  local tgt_lang=$5
  local split=$6
  local parallel_installed=${7:-false}

  date

  # The flag is compared as a string; it is never executed as a command.
  if [[ "$parallel_installed" == "true" ]] \
    && ! command -v parallel > /dev/null 2>&1; then
    err "warning: 'parallel' not found; falling back to serial encoding"
    parallel_installed=false
  fi

  local in_split_dir=$data_dir/$split
  local out_split_dir=$bpe_dir/$split

  echo "Apply Sentence Piece tokenization to SRC corpus"
  spm_encode_file "$exp_dir/vocab/model.SRC" \
    "$in_split_dir.$src_lang" "$out_split_dir.$src_lang" \
    "$parallel_installed" || return

  echo "Apply Sentence Piece tokenization to TGT corpus"
  spm_encode_file "$exp_dir/vocab/model.TGT" \
    "$in_split_dir.$tgt_lang" "$out_split_dir.$tgt_lang" \
    "$parallel_installed"
}

main "$@"