Add files using upload-large-folder tool

88117f8 verified about 1 year ago

5.9 kB

	################################################
	### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
	################################################

	[GENERAL]

	### directory in which the experiment is run
	# NOTE: absolute, machine-specific path - adjust for the local environment
	#
	working-dir = /home/hieu/workspace/experiment/data/issues/toy

	# specification of the language pair: the extensions are used below both
	# as file suffixes and as the language argument (-l) of the tokenizers
	input-extension = fr
	output-extension = en
	pair-extension = fr-en

	### directories that contain tools and data
	#
	# moses source checkout (machine-specific path)
	moses-src-dir = /home/hieu/workspace/github/mosesdecoder
	#
	# moses binaries
	moses-bin-dir = $moses-src-dir/bin
	#
	# moses scripts
	moses-script-dir = $moses-src-dir/scripts
	#
	# directory where the GIZA++/MGIZA programs reside
	external-bin-dir = /home/hieu/workspace/bin/training-tools
	#
	# srilm
	# NOTE(review): the i686-m64 subdirectory looks platform-specific -
	# confirm it matches the local SRILM build
	srilm-dir = $moses-src-dir/srilm/bin/i686-m64
	#
	# irstlm (only referenced by the commented-out irstlm settings in [LM])
	irstlm-dir = $moses-src-dir/irstlm/bin

	# data directory: holds the training corpus and the tuning/test sets
	# that the sections below reference via $wmt12-data
	wmt12-data = $working-dir/data

	### basic tools
	#
	# moses decoder (chart decoder, used together with
	# hierarchical-rule-set = true in [TRAINING])
	decoder = $moses-bin-dir/moses_chart

	# conversion of phrase table into binary on-disk format
	# (disabled here; the rule-table binarizer below is used instead)
	#ttable-binarizer = $moses-bin-dir/processPhraseTable

	# conversion of rule table into binary on-disk format
	# NOTE(review): the numeric arguments are presumably source factors,
	# target factors, number of scores, table limit and sort-score index -
	# confirm against the CreateOnDiskPt usage message
	ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"

	# tokenizers - comment out if all your data is already tokenized
	input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
	output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"

	# truecasers - comment out if you do not use the truecaser
	input-truecaser = $moses-script-dir/recaser/truecase.perl
	output-truecaser = $moses-script-dir/recaser/truecase.perl
	detruecaser = $moses-script-dir/recaser/detruecase.perl


	### multi-core settings
	# number of cores to use when the generic parallelizer is used
	# (also referenced as $cores by the commented-out irstlm settings in [LM])
	cores = 8

	#################################################################
	# PARALLEL CORPUS PREPARATION:
	# create a tokenized, sentence-aligned corpus, ready for training

	[CORPUS]

	### long sentences are filtered out, since they slow down GIZA++
	# and are a less reliable source of data. set here the maximum
	# length of a sentence; longer sentences are dropped
	# (presumably counted in tokens - confirm)
	#
	max-sentence-length = 80

	[CORPUS:nc]
	# file stem of the raw parallel corpus "nc"; presumably the input and
	# output extensions are appended to it to locate the two language sides
	# (the [LM:nc] section uses the same stem plus .$output-extension)
	raw-stem = $wmt12-data/nc-5k

	#################################################################
	# LANGUAGE MODEL TRAINING

	[LM]

	### tool to be used for language model training
	# srilm (active): ngram-count with the options given in "settings"
	# NOTE(review): -interpolate -kndiscount -unk presumably select
	# interpolated Kneser-Ney discounting with an unknown-word token -
	# confirm against the ngram-count manual
	lm-training = $srilm-dir/ngram-count
	settings = "-interpolate -kndiscount -unk"

	# irstlm training (alternative, disabled; uncomment both lines and
	# comment out the srilm lines above to switch)
	# msb = modified kneser ney; p=0 no singleton pruning
	#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
	#settings = "-s msb -p 0"

	# order of the language model (n-gram length)
	order = 5

	# kenlm: binarize the trained model with build_binary;
	# when the binarizer is used, type must also be set to 8
	lm-binarizer = $moses-bin-dir/build_binary
	type = 8

	### each language model to be used has its own section here

	[LM:nc]
	# raw text for this language model: the output-language (target) side
	# of the nc-5k corpus
	raw-corpus = $wmt12-data/nc-5k.$output-extension

	#################################################################
	# TRANSLATION MODEL TRAINING

	[TRAINING]

	### training script to be used: either a legacy script or
	# current moses training script (default)
	#
	script = $moses-script-dir/training/train-model.perl

	### parallelization of data preparation step
	# the two directions of the data preparation can be run in parallel
	# comment out if not needed
	#
	parallel = yes

	### symmetrization method to obtain word alignments from giza output
	# (commonly used: grow-diag-final-and)
	#
	alignment-symmetrization-method = grow-diag-final-and

	### hierarchical rule set
	# train a hierarchical rule table instead of a phrase table; this goes
	# together with the moses_chart decoder and the CreateOnDiskPt rule-table
	# binarizer configured in [GENERAL]
	#
	hierarchical-rule-set = true

	### settings for rule scoring
	# NOTE(review): --GoodTuring presumably enables Good-Turing discounting
	# of the rule counts - confirm against the scorer's options
	#
	score-settings = "--GoodTuring"

	#####################################################
	### TUNING: finding good weights for model components

	[TUNING]

	### tuning script to be used
	# mert-moses.pl, with -mertdir pointing at the moses binaries directory
	#
	tuning-script = $moses-script-dir/training/mert-moses.pl
	tuning-settings = "-mertdir $moses-bin-dir"

	### specify the corpus used for tuning
	# it should contain thousands of sentences
	# NOTE(review): the same two files are also used as the test set in
	# [EVALUATION:newstest2011], so scores from this setup are not held-out
	# results - acceptable for a toy/smoke-test run only
	#
	input-sgm = $wmt12-data/test-src.$input-extension.sgm
	#raw-input =
	#tokenized-input =
	#factorized-input =
	#input =
	#
	reference-sgm = $wmt12-data/test-ref.$output-extension.sgm
	#raw-reference =
	#tokenized-reference =
	#factorized-reference =
	#reference =

	### size of n-best list used (typically 100)
	#
	nbest = 100

	#######################################################
	## TRUECASER: train model to truecase corpora and input

	[TRUECASER]

	### script to train truecaser models
	# (the truecase/detruecase scripts that apply the model are set in [GENERAL])
	#
	trainer = $moses-script-dir/recaser/train-truecaser.perl

	######################################################################
	## EVALUATION: translating a test set using the tuned system and scoring it

	[EVALUATION]

	### prepare system output for scoring
	# this may include detokenization and wrapping output in sgm
	# (needed for nist-bleu, ter, meteor)
	# recasing is disabled here; the detruecaser from [GENERAL] is configured
	#
	detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
	#recaser = $moses-script-dir/recaser/recase.perl
	wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension"
	#output-sgm =

	### BLEU
	# nist-bleu and nist-bleu-c run the same mteval script; the -c variant
	# is presumably the case-sensitive score - confirm against mteval usage
	#
	nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
	nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
	#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
	#ibm-bleu =

	### Analysis: carry out various forms of analysis on the output
	#
	analysis = $moses-script-dir/ems/support/analysis.perl
	#
	# also report on input coverage
	analyze-coverage = yes
	#
	# also report on phrase mappings used
	report-segmentation = yes

	[EVALUATION:newstest2011]
	# NOTE(review): despite the section name, the files below are the toy
	# test-src/test-ref set, which is also the tuning set in [TUNING]

	### input data
	#
	input-sgm = $wmt12-data/test-src.$input-extension.sgm

	### reference data
	#
	reference-sgm = $wmt12-data/test-ref.$output-extension.sgm

	### wrapping frame
	# for nist-bleu and other scoring scripts, the output needs to be wrapped
	# in sgm markup (typically like the input sgm); here the input sgm of
	# this test set is reused as the frame
	#
	wrapping-frame = $input-sgm

	##########################################
	### REPORTING: summarize evaluation scores

	[REPORTING]

	### currently no parameters for reporting section
	# (the empty section header is kept on purpose)