Spaces:

suricodes
/

hindi-sindhi-docker

Paused

App Files Files Community

hindi-sindhi-docker / mosesdecoder /scripts /recaser /train-recaser.perl

suricodes

Upload folder using huggingface_hub

fd49381 verified about 1 year ago

raw

history blame contribute delete

7.93 kB

	#!/usr/bin/env perl
	#
	# This file is part of moses. Its use is licensed under the GNU Lesser General
	# Public License version 2.1 or, at your option, any later version.

	# $Id$
	use warnings;
	use strict;
	use FindBin qw($Bin);
	use Getopt::Long "GetOptions";

	# Standard input and output carry UTF-8 text.
	binmode(STDIN, ":utf8");
	binmode(STDOUT, ":utf8");

	# apply switches
	# Settings without a default; they stay undef unless given on the command line.
	my $DIR;
	my $CORPUS;
	my $SCRIPTS_ROOT_DIR;    # accepted as an option; not read in the visible part of this file
	my $CONFIG;
	my $HELP;
	my $ERROR;

	# Language-model toolkit selection and the external programs this script runs.
	my $LM                 = "KENLM"; # KENLM is default.
	my $BUILD_LM           = "build-lm.sh";
	my $BUILD_KENLM        = "$Bin/../../bin/lmplz";
	my $BUILD_BINARY       = "$Bin/../../bin/build_binary";
	my $EXTRACT            = "$Bin/../../bin/extract";
	my $SCORE              = "$Bin/../../bin/score";
	my $CONSOLIDATE_DIRECT = "$Bin/../../bin/consolidate-direct";
	my $NGRAM_COUNT        = "ngram-count";
	my $TRAIN_SCRIPT       = "$Bin/../training/train-model.perl";

	# Maximum phrase length and the range of steps to execute.
	my $MAX_LEN    = 1;
	my $FIRST_STEP = 1;
	my $LAST_STEP  = 11;

	# Option specification: switch name/type => variable that receives the value.
	my @option_spec = (
	    'first-step=i'       => \$FIRST_STEP,
	    'last-step=i'        => \$LAST_STEP,
	    'corpus=s'           => \$CORPUS,
	    'config=s'           => \$CONFIG,
	    'dir=s'              => \$DIR,
	    'ngram-count=s'      => \$NGRAM_COUNT,
	    'build-lm=s'         => \$BUILD_LM,
	    'build-kenlm=s'      => \$BUILD_KENLM,
	    'lm=s'               => \$LM,
	    'train-script=s'     => \$TRAIN_SCRIPT,
	    'scripts-root-dir=s' => \$SCRIPTS_ROOT_DIR,
	    'max-len=i'          => \$MAX_LEN,
	    'help'               => \$HELP,
	);

	# A parse failure is only recorded here; the usage/exit handling follows below.
	if (!GetOptions(@option_spec)) {
	    $ERROR = "training Aborted.";
	}

	# check and set default to unset parameters
	# --dir is always required. --corpus is demanded only when the requested
	# step range satisfies first-step <= 2 and last-step >= 1. Passing --help
	# suppresses both complaints.
	$ERROR = "please specify working dir --dir" unless defined($DIR) || defined($HELP);
	$ERROR = "please specify --corpus" if !defined($CORPUS) && !defined($HELP)
	    && $FIRST_STEP <= 2 && $LAST_STEP >= 1;

	# Print the usage text on --help or on any error, then terminate:
	# exit status 1 when an error was recorded, 0 for a plain --help.
	if ($HELP || $ERROR) {
	    if ($ERROR) {
	        print STDERR "ERROR: " . $ERROR . "\n";
	    }
	    print STDERR "Usage: $0 --dir /output/recaser --corpus /Cased/corpus/files [options ...]";

	    # The documented defaults below mirror the variable initialisers at
	    # the top of this script.
	    print STDERR "\n\nOptions:
	== MANDATORY ==
	--dir=dir ... outputted recaser directory.
	--corpus=file ... inputted cased corpus.

	== OPTIONAL ==
	= Recaser Training configuration =
	--train-script=file ... path to the train script (default: ../training/train-model.perl relative to this script).
	--config=config ... training script configuration.
	--scripts-root-dir=dir ... scripts directory.
	--max-len=int ... max phrase length (default: 1).

	= Language Model Training configuration =
	--lm=[IRSTLM,SRILM,KENLM] ... language model (default: KENLM).
	--build-lm=file ... path to build-lm.sh if not in \$PATH (used only with --lm=IRSTLM).
	--ngram-count=file ... path to ngram-count if not in \$PATH (used only with --lm=SRILM).
	--build-kenlm=file ... path to lmplz (default: ../../bin/lmplz relative to this script; used only with --lm=KENLM).

	= Steps this script will perform =
	(1) Truecasing;
	(2) Language Model Training;
	(3) Data Preparation
	(4-10) Recaser Model Training;
	(11) Cleanup.
	--first-step=[1-11] ... step where script starts (default: 1).
	--last-step=[1-11] ... step where script ends (default: 11).

	--help ... this usage output.\n";
	    exit($ERROR ? 1 : 0);
	}

	# main loop
	# Create the working directory first; every step below writes into it.
	system("mkdir -p $DIR") == 0
	    or die("Could not create working directory $DIR (error " . ($? >> 8) . ")\n");

	# (1) Truecasing only runs when starting from the very first step.
	truecase() if $FIRST_STEP == 1;

	# If a truecased corpus exists (from this run or an earlier one), all
	# later steps use it in place of the corpus given on the command line.
	$CORPUS = "$DIR/aligned.truecased" if (-e "$DIR/aligned.truecased");

	# Each remaining step runs only if it lies inside [first-step, last-step],
	# so that --last-step really stops the script where the usage text says.
	train_lm()           if $FIRST_STEP <= 2  && $LAST_STEP >= 2;   # (2)
	prepare_data()       if $FIRST_STEP <= 3  && $LAST_STEP >= 3;   # (3)
	train_recase_model() if $FIRST_STEP <= 10 && $LAST_STEP >= 4;   # (4-10)
	cleanup()            if $LAST_STEP == 11;                       # (11)

	exit(0);

	### subs ###

	# Step 1: learn a truecasing model from the corpus and apply it.
	#
	# Reads the globals $Bin, $DIR and $CORPUS. Runs train-truecaser.perl to
	# write $DIR/truecaser_model, then truecase.perl to write
	# $DIR/aligned.truecased. Takes no arguments; dies if either command exits
	# with a non-zero status.
	sub truecase {
	    print STDERR "(1) Truecase data @ ".`date`;
	    print STDERR "(1) To build model without truecasing, use --first-step 2, and make sure $DIR/aligned.truecased does not exist\n";

	    # Train the truecasing model.
	    my $cmd = "$Bin/train-truecaser.perl --model $DIR/truecaser_model --corpus $CORPUS";
	    print STDERR $cmd."\n";
	    system($cmd) == 0
	        or die("Training truecaser died with error " . ($? >> 8) . "\n");

	    # Apply it to the corpus; the result replaces $CORPUS for later steps
	    # (the caller switches over once the output file exists).
	    $cmd = "$Bin/truecase.perl --model $DIR/truecaser_model < $CORPUS > $DIR/aligned.truecased";
	    print STDERR $cmd."\n";
	    system($cmd) == 0
	        or die("Applying truecaser died with error " . ($? >> 8) . "\n");
	}

	# Step 2: train a 3-gram language model on the cased corpus.
	#
	# Reads the globals $LM, $CORPUS, $DIR and the tool paths $BUILD_LM,
	# $NGRAM_COUNT, $BUILD_KENLM, $BUILD_BINARY. $LM is matched
	# case-insensitively and rewritten to its canonical upper-case name; any
	# value other than IRSTLM or SRILM selects KENLM.
	#
	# Output file by toolkit:
	#   IRSTLM -> $DIR/cased.irstlm.gz
	#   SRILM  -> $DIR/cased.srilm.gz
	#   KENLM  -> $DIR/cased.kenlm (binarised; the intermediate ARPA file is removed)
	#
	# Dies if LM training or KenLM binarisation fails.
	sub train_lm {
	    print STDERR "(2) Train language model on cased data @ ".`date`;
	    my $cmd = "";
	    if (uc $LM eq "IRSTLM") {
	        $LM = "IRSTLM";
	        $cmd = "$BUILD_LM -t /tmp -i $CORPUS -n 3 -o $DIR/cased.irstlm.gz";
	    }
	    elsif (uc $LM eq "SRILM") {
	        $LM = "SRILM";
	        $cmd = "$NGRAM_COUNT -text $CORPUS -lm $DIR/cased.srilm.gz -interpolate -kndiscount";
	    }
	    else {
	        $LM = "KENLM";
	        $cmd = "$BUILD_KENLM --prune 0 0 1 -S 5% -T $DIR/lmtmp --order 3 --text $CORPUS --arpa $DIR/cased.kenlm.arpa.gz";
	    }
	    print STDERR " Using $LM " . "\n";
	    print STDERR $cmd."\n";
	    system($cmd) == 0
	        or die("Language model training failed with error " . ($? >> 8) . "\n");

	    if ($LM eq "KENLM") {
	        my $arpa = "$DIR/cased.kenlm.arpa.gz";
	        # Binarise first and only discard the ARPA file once that worked;
	        # a failed binarisation would otherwise go unnoticed until the
	        # recaser training step tries to load $DIR/cased.kenlm.
	        system("$BUILD_BINARY $arpa $DIR/cased.kenlm") == 0
	            or die("Binarizing KenLM language model failed with error " . ($? >> 8) . "\n");
	        system("rm $arpa");
	    }
	}

	# Step 3: derive the parallel training files for the recasing model.
	#
	# Reads $CORPUS line by line and writes three line-aligned files in $DIR:
	#   aligned.cased       the cleaned corpus line
	#   aligned.lowercased  the same line lower-cased
	#   aligned.a           a monotone word alignment "0-0 1-1 ..." per line
	#
	# Lines longer than 2000 characters and lines that are empty after
	# cleaning are skipped. Takes no arguments; dies if a file cannot be
	# opened or a written file cannot be closed.
	sub prepare_data {
	    print STDERR "\n(3) Preparing data for training recasing model @ ".`date`;

	    open(my $corpus_fh, '<:utf8', $CORPUS)
	        or die("Cannot open $CORPUS for reading: $!\n");
	    open(my $cased_fh, '>:utf8', "$DIR/aligned.cased")
	        or die("Cannot open $DIR/aligned.cased for writing: $!\n");
	    print "$DIR/aligned.lowercased\n";
	    open(my $lowercased_fh, '>:utf8', "$DIR/aligned.lowercased")
	        or die("Cannot open $DIR/aligned.lowercased for writing: $!\n");
	    # The alignment file holds only digits, '-' and spaces, so no encoding layer.
	    open(my $alignment_fh, '>', "$DIR/aligned.a")
	        or die("Cannot open $DIR/aligned.a for writing: $!\n");

	    while (my $line = <$corpus_fh>) {
	        next if length($line) > 2000;
	        $line =~ s/\x{0}//g;          # drop NUL characters
	        # drop pipe characters (presumably because '|' is special to the
	        # downstream training tools -- TODO confirm)
	        $line =~ s/\|//g;
	        $line =~ s/ +/ /g;            # collapse runs of spaces
	        $line =~ s/^ //;              # no leading space
	        $line =~ s/ [\r\n]*$/\n/;     # no trailing space before the line end
	        next if $line =~ /^$/;

	        print {$cased_fh} $line;
	        print {$lowercased_fh} lc($line);

	        # Cased and lowercased lines have the same tokens, so token i
	        # aligns to token i.
	        my @tokens = split(' ', $line);
	        print {$alignment_fh} join("", map { "$_-$_ " } 0 .. $#tokens), "\n";
	    }

	    close($corpus_fh);
	    # Buffered write errors only surface at close, so check the outputs.
	    close($cased_fh)      or die("Error writing $DIR/aligned.cased: $!\n");
	    close($lowercased_fh) or die("Error writing $DIR/aligned.lowercased: $!\n");
	    close($alignment_fh)  or die("Error writing $DIR/aligned.a: $!\n");
	}

	# Steps 4-10: train the recasing model (lowercased -> cased).
	#
	# Reads the globals $FIRST_STEP, $MAX_LEN, $DIR, $LM, $CONFIG and the tool
	# paths $EXTRACT, $SCORE, $CONSOLIDATE_DIRECT, $TRAIN_SCRIPT.
	#
	# With --max-len 1 the phrase table is built right here (extract, sort,
	# score, consolidate-direct, gzip) and the training script is then started
	# at its step 9. Otherwise the training script is started at
	# max($FIRST_STEP, 4). Takes no arguments; dies if any of the essential
	# external commands fails.
	sub train_recase_model {
	    print STDERR "\n(4) Training recasing model @ ".`date`;
	    my $first = $FIRST_STEP;
	    $first = 4 if $first < 4;

	    if ($MAX_LEN == 1) {
	        my $cmd = "$EXTRACT $DIR/aligned.cased $DIR/aligned.lowercased $DIR/aligned.a $DIR/extract 1";
	        system($cmd) == 0
	            or die("ERROR: extract (special case max-len 1) failed: $cmd");
	        $cmd = "sort -S 2G $DIR/extract > $DIR/extract.sorted";
	        system($cmd) == 0
	            or die("ERROR: sort extract (special case max-len 1) failed: $cmd");
	        $cmd = "$SCORE $DIR/extract.sorted /dev/null $DIR/phrase-table-half --NoLex";
	        system($cmd) == 0
	            or die("ERROR: score (special case max-len 1) failed: $cmd");
	        $cmd = "$CONSOLIDATE_DIRECT $DIR/phrase-table-half $DIR/phrase-table";
	        system($cmd) == 0
	            or die("ERROR: consolidate-direct (special case max-len 1) failed: $cmd");

	        # Removing the intermediate table is best effort.
	        system("rm $DIR/phrase-table-half");

	        # -f lets a rerun replace an existing phrase-table.gz; without it
	        # gzip refuses to overwrite and a stale table would be used.
	        $cmd = "gzip -f $DIR/phrase-table";
	        system($cmd) == 0
	            or die("ERROR: gzip phrase table (special case max-len 1) failed: $cmd");

	        # The phrase table already exists, so skip ahead in the training script.
	        $first = 9;
	    }

	    my $cmd = "$TRAIN_SCRIPT --root-dir $DIR --model-dir $DIR --first-step $first --alignment a --corpus $DIR/aligned --f lowercased --e cased --max-phrase-length $MAX_LEN";
	    if ($MAX_LEN == 1) {
	        $cmd .= " --score-options='--NoLex --OnlyDirect'";
	    }
	    else {
	        $cmd .= " --score-options='--OnlyDirect'";
	    }

	    # Point the training script at the LM file written by train_lm().
	    if (uc $LM eq "IRSTLM") {
	        $cmd .= " --lm 0:3:$DIR/cased.irstlm.gz:1";
	    }
	    elsif (uc $LM eq "SRILM") {
	        $cmd .= " --lm 0:3:$DIR/cased.srilm.gz:8";
	    }
	    else {
	        $cmd .= " --lm 0:3:$DIR/cased.kenlm:8";
	    }
	    $cmd .= " -config $CONFIG" if $CONFIG;

	    print STDERR $cmd."\n";
	    system($cmd) == 0
	        or die("Recaser model training failed with error " . ($? >> 8) . "\n");
	}

	# Step 11: delete the intermediate files left in $DIR by the earlier steps.
	#
	# Runs one `rm -f` per file pattern and adds up the resulting $? values;
	# a non-zero total only produces a notice on STDERR, it is not fatal.
	sub cleanup {
	    print STDERR "\n(11) Cleaning up @ ".`date`;

	    my $status_total = 0;
	    for my $leftover ("extract*", "aligned*", "lex*", "truecaser_model") {
	        `rm -f $DIR/$leftover`;
	        $status_total += $?;
	    }

	    if ($status_total != 0) {
	        print STDERR "Training successful but some files could not be cleaned.\n";
	    }
	}