# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Tokenize text data in various languages
# Usage: e.g. cat wiki.ar | tokenize.sh ar $TOOLS_PATH
set -e

# Number of threads for the Moses tokenizer.
N_THREADS=8

lg=$1
TOOLS_PATH=$2

# Both positional arguments are required; fail loudly instead of running
# with empty tool paths.
if [ -z "$lg" ] || [ -z "$TOOLS_PATH" ]; then
  echo "Usage: cat input | $0 <lang> <tools_path>" >&2
  exit 1
fi

# Moses preprocessing / tokenization scripts.
MOSES=$TOOLS_PATH/mosesdecoder
REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl
NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl
TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl

if [ "$lg" = "zh" ]; then
  # Chinese: Stanford segmenter (pku model) first, then Moses cleanup.
  # NOTE: the stanford-segmenter-* glob must stay unquoted to match the
  # versioned install directory.
  "$TOOLS_PATH"/stanford-segmenter-*/segment.sh pku /dev/stdin UTF-8 0 \
    | "$REPLACE_UNICODE_PUNCT" | "$NORM_PUNC" -l "$lg" | "$REM_NON_PRINT_CHAR"
elif [ "$lg" = "th" ]; then
  # Thai: Moses cleanup, then dedicated Python word segmenter.
  "$REPLACE_UNICODE_PUNCT" | "$NORM_PUNC" -l "$lg" | "$REM_NON_PRINT_CHAR" \
    | python "$TOOLS_PATH/segment_th.py"
elif [ "$lg" = "ja" ]; then
  # Japanese: Moses cleanup, then KyTea segmenter (tags stripped).
  "$REPLACE_UNICODE_PUNCT" | "$NORM_PUNC" -l "$lg" | "$REM_NON_PRINT_CHAR" \
    | kytea -notags
else
  # All other languages: Moses cleanup + Moses tokenizer.
  "$REPLACE_UNICODE_PUNCT" | "$NORM_PUNC" -l "$lg" | "$REM_NON_PRINT_CHAR" \
    | "$TOKENIZER" -no-escape -threads "$N_THREADS" -l "$lg"
fi