################################################
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
################################################
|
|
[GENERAL]


### directory in which experiment is run
#
working-dir = WORKDIR/ems_workdir


# Giza and friends
external-bin-dir = WORKDIR/giza-pp/bin/


# specification of the language pair
input-extension = fr
output-extension = en
pair-extension = fr-en


### directories that contain tools and data
#
# moses
moses-src-dir = WORKDIR
#
# moses scripts
moses-script-dir = WORKDIR/scripts
#
# srilm
srilm-dir = SRILMDIR/bin/MACHINE_TYPE
#
# data
toy-data = $moses-script-dir/ems/example/data


### basic tools
#
# moses decoder
decoder = $moses-src-dir/bin/moses


# conversion of phrase table into binary on-disk format
ttable-binarizer = $moses-src-dir/bin/processPhraseTable


# conversion of rule table into binary on-disk format
#ttable-binarizer = "$moses-src-dir/CreateOnDisk/src/CreateOnDiskPt 1 1 5 100 2"


# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"


# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl
|
|
### generic parallelizer for cluster and multi-core machines
# you may specify a script that allows the parallel execution
# of parallelizable steps (see meta file). you also need to
# specify the number of jobs (cluster) or cores (multicore)
#
#generic-parallelizer = $moses-script-dir/ems/support/generic-parallelizer.perl
#generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl
|
|
### cluster settings (if run on a cluster machine)
# number of jobs to be submitted in parallel
#
#jobs = 10


# arguments to qsub when scheduling a job
#qsub-settings = ""
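# a hypothetical example for a Sun Grid Engine setup (the flags are
# illustrative only; adjust to your scheduler):
#qsub-settings = "-l h_vmem=4G -l h_rt=48:00:00"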
|
|
# project for privileges and usage accounting
#qsub-project = iccs_smt
|
|
# memory and time
#qsub-memory = 4
#qsub-hours = 48
|
|
### multi-core settings
# when the generic parallelizer is used, it runs with the
# number of cores specified here
cores = 8
|
|
#################################################################
# PARALLEL CORPUS PREPARATION:
# create a tokenized, sentence-aligned corpus, ready for training


[CORPUS]


### long sentences are filtered out, since they slow down GIZA++
# and are a less reliable source of data. set here the maximum
# length of a sentence
#
max-sentence-length = 80
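# (note: sentence pairs in which either side exceeds this limit are dropped)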
|
|
[CORPUS:toy]


### command to run to get raw corpus files
#
# get-corpus-script =


### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $toy-data/nc-5k


### tokenized corpus files (may contain long sentences)
#
#tokenized-stem =


### if sentence filtering should be skipped,
# point to the clean training data
#
#clean-stem =


### if corpus preparation should be skipped,
# point to the prepared training data
#
#lowercased-stem =


#################################################################
# LANGUAGE MODEL TRAINING


[LM]


### tool to be used for language model training
# for instance: ngram-count (SRILM), train-lm-on-disk.perl (Edinburgh)
#
lm-training = $srilm-dir/ngram-count
settings = "-interpolate -kndiscount -unk"
order = 5
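# for reference, with these settings the underlying SRILM call corresponds
# roughly to the following (file names are purely illustrative):
#   ngram-count -order 5 -interpolate -kndiscount -unk -text corpus.tok.en -lm toy.lm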
|
|
### tool to be used for training a randomized language model from scratch
# (more commonly, an SRILM model is trained)
#
#rlm-training = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"


### script to use for converting into binary table format (irstlm or kenlm)
# (default: no binarization)
|
|
# irstlm
#lm-binarizer = $moses-src-dir/irstlm/bin/compile-lm


# kenlm, also set type to 8
#lm-binarizer = $moses-src-dir/kenlm/build_binary
type = 8
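# (the type value is written into the generated moses.ini; 8 identifies
# a kenlm language model)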
|
|
### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $moses-src-dir/irstlm/bin/quantize-lm


### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$moses-src-dir/randlm/bin/buildlm -falsepos 8 -values 8"


### each language model to be used has its own section here


[LM:toy]


### command to run to get raw corpus files
#
#get-corpus-script = ""


type = 8


### raw corpus (untokenized)
#
raw-corpus = $toy-data/nc-5k.$output-extension


### tokenized corpus files (may contain long sentences)
#
#tokenized-corpus =


### if corpus preparation should be skipped,
# point to the prepared language model
#
#lm =


[TRAINING]
|
|
### training script to be used: either a legacy script or
# the current moses training script (default)
#
script = $moses-script-dir/training/train-model.perl


### general options
#
#training-options = ""


### factored training: specify here which factors are used
# if none specified, single-factor training is assumed
# (one translation step, surface to surface)
#
#input-factors = word lemma pos morph
#output-factors = word lemma pos
#alignment-factors = "word -> word"
#translation-factors = "word -> word"
#reordering-factors = "word -> word"
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"
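# to illustrate the commented setup above: decoding would first apply
# translation step t0 (word -> word), then generation step g0 (word -> pos);
# the value "t0, g0" encodes exactly this sequence of steps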
|
|
### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
run-giza-in-parts = 5


### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
alignment-symmetrization-method = grow-diag-final-and
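# other heuristics supported by the training script include intersect,
# union, grow-diag, and grow-diag-final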
|
|
### use of berkeley aligner for word alignment
#
#use-berkeley = true
#alignment-symmetrization-method = berkeley
#berkeley-train = $moses-script-dir/ems/support/berkeley-train.sh
#berkeley-process = $moses-script-dir/ems/support/berkeley-process.sh
#berkeley-jar = /your/path/to/berkeleyaligner-1.1/berkeleyaligner.jar
#berkeley-java-options = "-server -mx30000m -ea"
#berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 8"
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5


### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1


### create a bilingual concordancer for the model
#
#biconcor = $moses-script-dir/ems/biconcor/biconcor


### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
lexicalized-reordering = msd-bidirectional-fe
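# other orientation types can be configured here as well, e.g. (same
# naming scheme): mslr-bidirectional-fe or monotonicity-bidirectional-fe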
|
|
### hierarchical rule set
#
#hierarchical-rule-set = true


### settings for rule extraction
#
#extract-settings = ""


### unknown word labels (target syntax only)
# enables use of unknown word labels during decoding
# label file is generated during rule extraction
#
#use-unknown-word-labels = true


### if phrase extraction should be skipped,
# point to stem for extract files
#
# extracted-phrases =


### settings for rule scoring
#
score-settings = "--GoodTuring"
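# a possible alternative to Good-Turing discounting is Kneser-Ney
# smoothing of the phrase counts:
#score-settings = "--KneserNey"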
|
|
### include word alignment in phrase table
#
#include-word-alignment-in-rules = yes


### if phrase table training should be skipped,
# point to phrase translation table
#
# phrase-translation-table =


### if reordering table training should be skipped,
# point to reordering table
#
# reordering-table =


### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
#
#config =


#####################################################
### TUNING: finding good weights for model components


[TUNING]
|
|
### instead of tuning, old weights may be recycled with this setting
# specify here an old configuration file with matching weights
#
weight-config = $toy-data/weight.ini
|
|
### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-src-dir/mert"


### specify the corpus used for tuning
# it should contain thousands of sentences
#
#input-sgm =
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
#reference-sgm =
#raw-reference =
#tokenized-reference =
#factorized-reference =
#reference =


### size of n-best list used (typically 100)
#
nbest = 100


### ranges for weights for random initialization
# if not specified, the tuning script will use generic ranges
# it is not clear if this matters
#
# lambda =


### additional flags for the filter script
#
filter-settings = ""


### additional flags for the decoder
#
decoder-settings = ""
|
|
### if tuning should be skipped, specify this here
# and also point to a configuration file that contains
# pointers to all relevant model files
#
#config =


#########################################################
## RECASER: restore case, this part only trains the model


[RECASING]


#decoder = $moses-src-dir/moses-cmd/src/moses.1521.srilm
|
|
### training data
# raw input still needs to be tokenized;
# alternatively, tokenized input may be specified
#
#tokenized = [LM:europarl:tokenized-corpus]
|
|
# recase-config =


#lm-training = $srilm-dir/ngram-count


#######################################################
## TRUECASER: train model to truecase corpora and input


[TRUECASER]


### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl


### training data
# data on which truecaser is trained
# if no training data is specified, parallel corpus is used
#
# raw-stem =
# tokenized-stem =


### trained model
#
# truecase-model =
|
|
######################################################################
## EVALUATION: translate a test set using the tuned system and score it
|
|
[EVALUATION]


### additional flags for the filter script
#
#filter-settings = ""


### additional decoder settings
# switches for the Moses decoder
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000"
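# (-search-algorithm 1 selects cube pruning, -cube-pruning-pop-limit caps
# the number of hypotheses popped per stack, and -s sets the stack size)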
|
|
### specify size of n-best list, if produced
#
#nbest = 100


### multiple reference translations
#
#multiref = yes


### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
#recaser = $moses-script-dir/recaser/recase.perl
wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension"
#output-sgm =


### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v12.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v12.pl -c"
#multi-bleu = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =


### TER: translation error rate (BBN metric) based on edit distance
# not yet integrated
#
# ter =
|
|
### METEOR: gives credit to stem / wordnet synonym matches
# not yet integrated
#
# meteor =
|
|
### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
#
# also report on input coverage
analyze-coverage = yes
#
# also report on phrase mappings used
report-segmentation = yes
#
# report precision of translations for each input word, broken down by
# count of input word in corpus and model
#report-precision-by-coverage = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos


[EVALUATION:test]


### input data
#
input-sgm = $toy-data/test-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
# input =


### reference data
#
reference-sgm = $toy-data/test-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =


### analysis settings
# may contain any of the general evaluation analysis settings
# specific setting: base coverage statistics on earlier run
#
#precision-by-coverage-base = $working-dir/evaluation/test.analysis.5


### wrapping frame
# for nist-bleu and other scoring scripts, the output needs to be wrapped
# in sgm markup (typically like the input sgm)
#
wrapping-frame = $input-sgm


##########################################
### REPORTING: summarize evaluation scores


[REPORTING]
|
|
### currently no parameters for the reporting section
|
|
|
|