# Copyright (c) 2019-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
# Tokenize text data in various languages
# Usage: e.g. cat wiki.ar | tokenize.sh ar $TOOLS_PATH
set -e

# Number of threads for the Moses tokenizer.
N_THREADS=8

lg=$1
TOOLS_PATH=$2

# Both positional arguments are required; fail loudly instead of running
# with empty tool paths.
if [ -z "$lg" ] || [ -z "$TOOLS_PATH" ]; then
  echo "Usage: cat input | $0 <lang> <tools_path>" >&2
  exit 1
fi

# Moses preprocessing / tokenization scripts.
MOSES=$TOOLS_PATH/mosesdecoder
REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl
NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl
TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl

if [ "$lg" = "zh" ]; then
  # Chinese: Stanford segmenter (pku model) first, then Moses cleanup.
  # NOTE: the stanford-segmenter-* glob must stay unquoted to match the
  # versioned install directory.
  "$TOOLS_PATH"/stanford-segmenter-*/segment.sh pku /dev/stdin UTF-8 0 \
    | "$REPLACE_UNICODE_PUNCT" | "$NORM_PUNC" -l "$lg" | "$REM_NON_PRINT_CHAR"
elif [ "$lg" = "th" ]; then
  # Thai: Moses cleanup, then dedicated Python word segmenter.
  "$REPLACE_UNICODE_PUNCT" | "$NORM_PUNC" -l "$lg" | "$REM_NON_PRINT_CHAR" \
    | python "$TOOLS_PATH/segment_th.py"
elif [ "$lg" = "ja" ]; then
  # Japanese: Moses cleanup, then KyTea segmenter (tags stripped).
  "$REPLACE_UNICODE_PUNCT" | "$NORM_PUNC" -l "$lg" | "$REM_NON_PRINT_CHAR" \
    | kytea -notags
else
  # All other languages: Moses cleanup + Moses tokenizer.
  "$REPLACE_UNICODE_PUNCT" | "$NORM_PUNC" -l "$lg" | "$REM_NON_PRINT_CHAR" \
    | "$TOKENIZER" -no-escape -threads "$N_THREADS" -l "$lg"
fi