sleepyhead111 commited on
Commit
de68f2b
·
verified ·
1 Parent(s): 61f1661

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. mosesdecoder/contrib/m4m/examples/giza-vs-fast.m4m +99 -0
  2. mosesdecoder/contrib/m4m/modules/obsolete/Makefile +64 -0
  3. mosesdecoder/contrib/m4m/modules/obsolete/baseline-system.m4m +48 -0
  4. mosesdecoder/contrib/m4m/modules/obsolete/directory-structure.m4m +7 -0
  5. mosesdecoder/contrib/m4m/modules/obsolete/model-filtering.m4m +37 -0
  6. mosesdecoder/contrib/m4m/modules/obsolete/phrase-table.make.scratch +124 -0
  7. mosesdecoder/contrib/m4m/modules/obsolete/reporting.m4m +95 -0
  8. mosesdecoder/contrib/m4m/modules/obsolete/run-moses.m4m +37 -0
  9. mosesdecoder/contrib/m4m/modules/obsolete/setup-experiments.m4m +121 -0
  10. mosesdecoder/contrib/m4m/modules/obsolete/skip-steps.mak +19 -0
  11. mosesdecoder/contrib/m4m/modules/obsolete/system.m4m +38 -0
  12. mosesdecoder/contrib/m4m/modules/obsolete/template.m4m +66 -0
  13. mosesdecoder/contrib/m4m/modules/obsolete/tune.m4m +45 -0
  14. mosesdecoder/contrib/m4m/scripts/fast-align2bal.py +31 -0
  15. mosesdecoder/contrib/m4m/scripts/giza.txt2snt.sh +41 -0
  16. mosesdecoder/contrib/m4m/scripts/moses.extract-phrases.sh +63 -0
  17. mosesdecoder/contrib/m4m/scripts/moses.make-lex.py +86 -0
  18. mosesdecoder/contrib/m4m/scripts/moses.phrase-extract.sh +110 -0
  19. mosesdecoder/contrib/m4m/scripts/moses.score-phrases.sh +41 -0
  20. mosesdecoder/contrib/m4m/scripts/moses.transfer-weights.py +61 -0
  21. mosesdecoder/contrib/m4m/util/Jamfile +12 -0
  22. mosesdecoder/contrib/memscore/Makefile.in +581 -0
  23. mosesdecoder/contrib/memscore/configure.ac +84 -0
  24. mosesdecoder/contrib/memscore/lexdecom.h +41 -0
  25. mosesdecoder/contrib/memscore/memscore.cpp +85 -0
  26. mosesdecoder/contrib/memscore/memscore.h +57 -0
  27. mosesdecoder/contrib/memscore/missing +360 -0
  28. mosesdecoder/contrib/memscore/phraselm.h +45 -0
  29. mosesdecoder/contrib/memscore/phrasetable.cpp +348 -0
  30. mosesdecoder/contrib/memscore/scorer.h +71 -0
  31. mosesdecoder/contrib/memscore/timestamp.h +29 -0
  32. mosesdecoder/contrib/mira/Main.cpp +1849 -0
  33. mosesdecoder/contrib/mira/Perceptron.cpp +53 -0
  34. mosesdecoder/contrib/mira/mira.xcodeproj/project.pbxproj +401 -0
  35. mosesdecoder/contrib/moses-speedtest/README.md +146 -0
  36. mosesdecoder/contrib/moses-speedtest/check_for_regression.py +63 -0
  37. mosesdecoder/contrib/moses-speedtest/cronjob +7 -0
  38. mosesdecoder/contrib/moses-speedtest/runtests.py +439 -0
  39. mosesdecoder/contrib/moses-speedtest/sys_drop_caches.py +22 -0
  40. mosesdecoder/contrib/moses-speedtest/test_config +3 -0
  41. mosesdecoder/contrib/moses-speedtest/testsuite_config +5 -0
  42. mosesdecoder/contrib/picaro/README +62 -0
  43. mosesdecoder/contrib/picaro/es/README +4 -0
  44. mosesdecoder/contrib/picaro/es/sample.aln +1 -0
  45. mosesdecoder/contrib/picaro/es/sample.e +1 -0
  46. mosesdecoder/contrib/picaro/es/sample.f +1 -0
  47. mosesdecoder/contrib/picaro/picaro.py +250 -0
  48. mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1.binphr.idx +0 -0
  49. mosesdecoder/contrib/promix/test_data/esen.nc.model.filtered/phrase-table.0-0.1.1.binphr.idx +0 -0
  50. mosesdecoder/contrib/promix/test_data/esen.nc.model.filtered/phrase-table.0-0.1.1.binphr.srctree.wa +0 -0
mosesdecoder/contrib/m4m/examples/giza-vs-fast.m4m ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # some variables need to be set before m4m modules are included
4
+ .SECONDARY:
5
+
6
+ MOSES_ROOT = ${HOME}/code/moses/master/mosesdecoder
7
+ MGIZA_ROOT = ${HOME}/tools/mgiza
8
+ fast_align = ${HOME}/bin/fast_align
9
+
10
+ # L1: source language; L2: target language
11
+ L1 = de
12
+ L2 = en
13
+ WDIR = $(CURDIR)
14
+
15
+ include ${MOSES_ROOT}/contrib/m4m/modules/m4m.m4m
16
+
17
+ # both systems use the same language model
18
+ L2raw := $(wildcard ${WDIR}/crp/trn/*/raw/*.${L2}.gz)
19
+ L2data := $(subst /raw/,/cased/,${L2raw})
20
+ lm.order = 5
21
+ lm.factor = 0
22
+ lm.lazy = 1
23
+ lm.file = ${WDIR}/lm/${L2}.5-grams.kenlm
24
+ ${lm.file}: | $(L2data)
25
+ $(eval $(call add_kenlm,${lm.file},${lm.order},${lm.factor},${lm.lazy}))
26
+ .INTERMEDIATE: ${L2data}
27
+
28
+ dmodels = wbe-mslr-bidirectional-fe-allff
29
+ mysystem = systems/${word-alignment}-aligned
30
+ myptable = model/tm/${aligner}.${L1}-${L2}
31
+ mydtable = model/dm/${aligner}.${L1}-${L2}
32
+
33
+ wa ?= $(error wa not specified on command line)
34
+ SYSTEMS :=
35
+ aligner :=
36
+ $(foreach a,${wa},\
37
+ $(eval aligner:=${a});\
38
+ $(eval $(clear-ptables));\
39
+ $(eval $(clear-dtables));\
40
+ $(eval SYSTEMS+=systems/${a}-aligned);\
41
+ $(eval $(call add_binary_phrase_table,0,0,4,$${myptable}));\
42
+ $(eval $(call add_binary_reordering_table,0,0,8,\
43
+ ${dmodels},$${mydtable},$${myptable}));\
44
+ $(eval $(call create_moses_ini,$${mysystem})))
45
+
46
+ aln: $(foreach a,${wa},${WDIR}/crp/trn/aln/$a/${L1}-${L2}.symal.gz)
47
+ info:
48
+ dtable: ${DTABLES}
49
+ ptable: ${PTABLES}
50
+ system: $(addsuffix /moses.ini.0,${SYSTEMS})
51
+ eval: ${EVALUATIONS}
52
+
53
+
54
+ ifdef tune.runs
55
+
56
+ TUNED_SYSTEMS :=
57
+ EVALUATIONS :=
58
+ $(eval $(tune_all_systems))
59
+ $(eval $(bleu_score_all_systems))
60
+ tune: ${TUNED_SYSTEMS}
61
+ echo TUNED ${TUNED_SYSTEMS}
62
+ all: ${EVALUATIONS}
63
+
64
+ else
65
+
66
+ tune: all
67
+
68
+ # The recursive calls below make sure that tuning runs happen sequentially
69
+ # (moses runs multi-threaded anyway). The reason is that we may want to have
70
+ # first results as soon as possible.
71
+ tune.runs := 1 1
72
+ $(info TUNE RUNS ${tune.runs})
73
+ all:
74
+ $(foreach n,$(shell seq ${tune.runs}),\
75
+ ${MAKE} -f $(word 1, ${MAKEFILE_LIST}) \
76
+ tune.runs="$n $n" ${MAKECMDGOALS} -${MAKEFLAGS})
77
+
78
+ endif
79
+
80
+ .PHONY: $(addprefix reset-,lm tm dm all aln tune eval systems)
81
+ reset-aln: reset-mm
82
+ -rm -rf $(foreach a,${wa},crp/trn/aln/${a})
83
+ reset-mm: reset-dm reset-tm
84
+ -rm -rf $(foreach a,${wa},crp/trn/mm/${a})
85
+ reset-dm: reset-systems
86
+ -rm -rf $(foreach a,${wa},model/dm/${a}.*)
87
+ reset-tm: reset-systems
88
+ -rm -rf $(foreach a,${wa},model/tm/${a}.*)
89
+ reset-systems:
90
+ -rm -rf ${SYSTEMS}
91
+ reset-tune:
92
+ -rm -rf $(foreach s,${SYSTEMS}/$s/tune)
93
+ reset-eval:
94
+ -rm -rf $(foreach s,${SYSTEMS},$s/eval)
95
+ reset-lm:
96
+ -rm -rf lm
97
+ reset-all: reset-lm reset-aln
98
+ -rm -rf $(wildcard crp/trn/*/[ct]* crp/dev/[ct]* crp/tst/[ct]*)
99
+ -rm -rf auxiliary
mosesdecoder/contrib/m4m/modules/obsolete/Makefile ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+ # Mandatory at the beginning of the file, before m4m inclusions
3
+
4
+ # L1,L2: tags that identify translation source (L1)
5
+ # and translation target (L2) language
6
+ L1 ?= de
7
+ L2 ?= en
8
+
9
+ LL = $(word 1, $(sort ${L1} ${L2}))-$(word 2, $(sort ${L1} ${L2}))
10
+ # a name for this experiment
11
+ experiment = dynsa-vs-std-phrase-table
12
+
13
+ # the working directry
14
+ WDIR = $(CURDIR)
15
+ MOSES_ROOT = ${HOME}/code/moses/master/mosesdecoder
16
+
17
+ # include m4m boilerplate
18
+ include ${MOSES_ROOT}/contrib/m4m/modules/m4m.m4m
19
+
20
+ $(info M4MDIR=${m4mdir})
21
+
22
+ #include ${m4mdir}/baseline-system.make
23
+ #include ${m4mdir}dynsa-system.make
24
+ #$(info ${MY_EXPERIMENT})
25
+
26
+ tune.sets = $(subst /raw/,/cased/,$(wildcard crp/dev/raw/*.${L1}.gz))
27
+
28
+ all:
29
+ .PHONY: all
30
+
31
+ ifdef tune.runs
32
+ $(foreach tuneset, $(word 1,${tune.sets:.${L1}.gz=}),\
33
+ $(foreach run,$(shell seq ${tune.runs}),\
34
+ $(eval $(call tune_system,baseline/moses.ini.0,\
35
+ baseline/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
36
+ ${tuneset}.${L1},${tuneset}.${L2},0));\
37
+ $(if ,$(info $(call tune_system,baseline/moses.ini.0,\
38
+ baseline/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
39
+ ${tuneset}.${L1},${tuneset}.${L2},0));)\
40
+ $(eval $(call copy_weights,dynsa/moses.ini.0,\
41
+ baseline/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
42
+ dynsa/tuned/$(notdir ${tuneset})/${run}/moses.ini));\
43
+ $(if ,$(info $(call copy_weights,dynsa/moses.ini.0,\
44
+ baseline/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
45
+ dynsa/tuned/$(notdir ${tuneset})/${run}/moses.ini));)\
46
+ $(foreach evalset,$(word 2,${tune.sets:.${L1}.gz=}),\
47
+ $(foreach system,baseline dynsa,\
48
+ $(eval evaltarget:=${system}/eval/$(notdir ${tuneset})/${run}/$(notdir ${evalset}));\
49
+ $(eval $(call bleu_eval,${evaltarget},\
50
+ ${system}/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
51
+ ${evalset}.${L1},${moses.inputtype.plaintext},${evalset}.${L2}));\
52
+ $(if ,$(info $(call bleu_eval,${evaltarget},\
53
+ ${system}/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
54
+ ${evalset}.${L1},${moses.inputtype.plaintext},${evalset}.${L2}));)\
55
+ ));\
56
+ ))
57
+
58
+ all: ${EVALUATIONS}
59
+ echo EVALS ${EVALUATIONS}
60
+ else
61
+ all:
62
+ $(foreach n,$(shell seq 1 1),${MAKE} tune.runs="$n $n";)
63
+ endif
64
+
mosesdecoder/contrib/m4m/modules/obsolete/baseline-system.m4m ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # This module defines a simple phrase-based baseline system
4
+ # - a single corpus
5
+ # - no factors
6
+ # - single ttable
7
+ # - single distortion model
8
+
9
+ # chose a name for the system
10
+ # ${system}/moses.ini.0 then defines the system
11
+ system = baseline
12
+ SYSTEMS += ${system}
13
+ .PHONY: ${system}
14
+ ${system}: ${system}/moses.ini.0
15
+
16
+ #################################################################################
17
+ #
18
+ # Create phrase table(s) and distortion model(s) that you want to use in this
19
+ # system. If you already have binary or text version of all tables, you don't
20
+ # need to specify pll.{txt1,txt2,aln}.
21
+ pll.txt1 = ${WDIR}/crp/trn/aln/fast/${L1}.txt.gz
22
+ pll.txt2 = ${WDIR}/crp/trn/aln/fast/${L2}.txt.gz
23
+ pll.aln = ${WDIR}/crp/trn/aln/fast/${L1}-${L2}.symal.gz
24
+ ptable = ${WDIR}/model/tm/ptable.${L1}-${L2}
25
+ dtable = ${WDIR}/model/dm/dtable.${L1}-${L2}
26
+ ptable.max-phrase-length = 7
27
+ # ptable.smoothing = --GoodTuring
28
+ # dmodels = wbe-mslr-bidirectional-fe-allff
29
+
30
+ LMODEL_ENTRIES = KENLM;name=KENLM0;order=5;factor=0;num-features=1;lazyken=0;path=$(abspath lm/europarl-v7.en.kenlm)
31
+ LMODELS = lm/europarl-v7.en.kenlm
32
+
33
+ MY_EXPERIMENT += $(call add_binary_phrase_table,0,0,5,${ptable})
34
+ $(eval $(call add_binary_phrase_table,0,0,5,${ptable}))
35
+
36
+ if 0
37
+ MY_EXPERIMENT += $(call add_binary_reordering_table,0,0,8,\
38
+ wbe-mslr-bidirectional-fe-allff,${dtable},${ptable})
39
+ $(eval $(call add_binary_reordering_table,0,0,8,\
40
+ wbe-mslr-bidirectional-fe-allff,${dtable},${ptable}))
41
+ endif
42
+
43
+ MY_EXPERIMENT += $(call create_moses_ini,${system})
44
+ $(eval $(call create_moses_ini,${system}))
45
+
46
+ #################################################################################
47
+
48
+
mosesdecoder/contrib/m4m/modules/obsolete/directory-structure.m4m ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # STANDARD LOCATIONS
4
+ basedir ?= $(CURDIR)
5
+ tune.dir ?= ${basedir}/tune
6
+ eval.dir ?= ${basedir}/eval
7
+ input.dir ?= ${basedir}/input
mosesdecoder/contrib/m4m/modules/obsolete/model-filtering.m4m ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+ #
3
+ # This module deals with model filtering (if necessary).
4
+ # It produces the moses.ini files for filtered models for
5
+ # tuning and evaluation.
6
+
7
+ ifndef ${moses_ini_for_tuning}
8
+ moses_ini_for_tuning = # WHAT'S THE DEFAULT LOCATION FOR THIS IN EMS?
9
+ endif
10
+
11
+ ifndef ${moses_ini_for_eval}
12
+ moses_ini_for_eval = # WHAT'S THE DEFAULT LOCATION FOR THIS IN EMS?
13
+ endif
14
+
15
+ # filter models if suggested by set-up
16
+ ifneq (${moses_ini_for_tuning}, ${untuned_moses_ini})
17
+ ${moses_ini_for_tuning}: | ${untuned_moses_ini}
18
+ ${moses_ini_for_tuning}: | ${tuning_input_ready}
19
+
20
+ # phrase table in text format?
21
+ ifeq ($(shell grep -v '^ *\#' ${untuned_moses_ini} \
22
+ | grep -A1 '\[ttable-file\]' | tail -n +2 \
23
+ | head -n1 | awk '{print $$1}'),0)
24
+ # ADD PHRASE TABLE FILTERING COMMAND HERE
25
+ endif
26
+
27
+ # how does moses know if a lexicalized distortion table is binary or not?
28
+ # ADD LEXICAL DISTORTION TABLE FILTERING COMMAND HERE
29
+
30
+ ifneq (${moses_ini_for_eval),$(tuned_moses_ini))
31
+ # add code for model filtering for eval here
32
+ endif
33
+
34
+
35
+
36
+
37
+
mosesdecoder/contrib/m4m/modules/obsolete/phrase-table.make.scratch ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # .PHONY: $1
2
+ # $1: $1.binphr.idx
3
+ # $1.txt.gz: | L1text = $4
4
+ # $1.txt.gz: | L2text = $5
5
+ # $1.txt.gz: | symal = $6
6
+ # ${moses.ini}: $1
7
+ # PTABLES += 1;$2;$3;5;$1
8
+ # endef
9
+
10
+
11
+ # ${target}.tmp/fwd/scored.gz: | ${target}/phrase-extraction.DONE
12
+ # | ${L1File} ${L2File} ${symal}
13
+
14
+
15
+ # # convert phrase table from text file to binary format
16
+ # %.binphr.idx: | %.txt.gz ${MOSES_BIN}/processPhraseTable
17
+ # $(lock)
18
+ # zcat -f $*.txt.gz | ${MOSES_BIN}/processPhraseTable \
19
+ # -ttable ${L1factors} ${L2factors} - -nscores 5 -out ${@D}/_${@F} \
20
+ # && mv ${@D}/_${@F} $@
21
+ # $(unlock)
22
+
23
+
24
+ # # directory definitions
25
+ # mo_mdl = model
26
+ # mo_tmp = model/tmp
27
+ # wrdaln = ${fstaln}/out
28
+ # # wrdaln should be set elsewhere!
29
+
30
+ # # milestone files created during phrase table construction
31
+ # ptable_bin = ${mo_mdl}/ptable.${L1}-${L2}
32
+ # ptable = ${mo_mdl}/ptable.${L1}-${L2}.txt.gz
33
+ # lex1given2 = ${mo_mdl}/${L1}-given-${L2}.lex.gz
34
+ # lex2given1 = ${mo_mdl}/${L2}-given-${L1}.lex.gz
35
+ # mosesinifile = ${mo_mdl}/moses.ini.0
36
+
37
+ # .PHONY: lex ptable
38
+ # lex: ${lex1given2} ${lex2given1}
39
+ # ptable: ${ptable_bin}
40
+
41
+ # # steps taken in this module
42
+
43
+ # # -------------------------------------------------------------------------------
44
+ # # --- STEP 1a: extract raw phrases from word-aligned corpus ---------------------
45
+ # # -------------------------------------------------------------------------------
46
+ # # Note: the script ${moses.extract-phrases} takes care of initial sorting
47
+ # ${mo_tmp}/phrase-extraction.DONE: | ${moses.extract-phrases}
48
+ # ${mo_tmp}/phrase-extraction.DONE: | ${moses.extract}
49
+ # ${mo_tmp}/phrase-extraction.DONE: | ${wrdaln}/${L1}.txt.gz
50
+ # ${mo_tmp}/phrase-extraction.DONE: | ${wrdaln}/${L2}.txt.gz
51
+ # ${mo_tmp}/phrase-extraction.DONE: | ${wrdaln}/${L1}-${L2}.symal.gz
52
+ # ${mo_tmp}/phrase-extraction.DONE:
53
+ # $(lock)
54
+ # ${moses.extract-phrases} \
55
+ # ${moses.extract} \
56
+ # ${wrdaln}/${L1}.txt.gz \
57
+ # ${wrdaln}/${L2}.txt.gz \
58
+ # ${wrdaln}/${L1}-${L2}.symal.gz \
59
+ # ${mo_tmp} ${max_phrase_length} \
60
+ # ${dmodel.type}-${dmodel.orientation} \
61
+ # && touch $@
62
+ # $(unlock)
63
+
64
+ # # -------------------------------------------------------------------------------
65
+ # # --- STEP 1a: extract word translation lexica from word-aligned corpus ---------
66
+ # # --- (for lexical phrase scoring) ---------
67
+ # # -------------------------------------------------------------------------------
68
+ # $(lex2given1): $(lex1given2)
69
+ # $(lex1given2): | ${wrdaln}/${L1}.txt.gz
70
+ # $(lex1given2): | ${wrdaln}/${L2}.txt.gz
71
+ # $(lex1given2): | ${wrdaln}/${L1}-${L2}.symal.gz
72
+ # $(lock)
73
+ # $(moses.make-lex) \
74
+ # ${wrdaln}/${L1}.txt.gz \
75
+ # ${wrdaln}/${L2}.txt.gz \
76
+ # ${wrdaln}/${L1}-${L2}.symal.gz \
77
+ # $(lex1given2) \
78
+ # $(lex2given1)
79
+ # $(unlock)
80
+
81
+ # # -------------------------------------------------------------------------------
82
+ # # --- STEP 2: score extracted phrase pairs --------------------------------------
83
+ # # -------------------------------------------------------------------------------
84
+ # ptfwdhalf = ${mo_tmp}/fwd/phrases.fwd.scored.gz
85
+ # ptbwdhalf = ${mo_tmp}/bwd/phrase-scoring.DONE
86
+
87
+ # # -------------------------------------------------------------------------------
88
+ # # --- STEP 2a: score phrases in the 'forward' direction -------------------------
89
+ # # -------------------------------------------------------------------------------
90
+ # $(ptfwdhalf): | ${mo_tmp}/phrase-extraction.DONE
91
+ # $(ptfwdhalf): | ${lex1given2}
92
+ # $(lock)
93
+ # $(merge-sorted) ${mo_tmp}/fwd/part.*.gz \
94
+ # | ${moses.score-phrases} ${MOSES_BIN}/score - ${lex1given2} ${@:.scored.gz=} \
95
+ # $(ptable.smoothing) && mv $@_ $@
96
+ # $(unlock)
97
+
98
+ # # -------------------------------------------------------------------------------
99
+ # # --- STEP 2b: score phrases in the 'backward' direction -------------------------
100
+ # # -------------------------------------------------------------------------------
101
+ # # Note: ${moses.score-phrases} re-sorts the scored backward phrases
102
+ # $(ptbwdhalf): | ${mo_tmp}/phrase-extraction.DONE
103
+ # $(ptbwdhalf): | ${lex2given1}
104
+ # $(lock)
105
+ # $(merge-sorted) ${mo_tmp}/bwd/part.*.gz \
106
+ # | ${moses.score-phrases} ${MOSES_BIN}/score - ${lex2given1} ${@D}/scored \
107
+ # "$(ptable.smoothing)" --Inverse && touch $@
108
+ # $(unlock)
109
+
110
+ # # -------------------------------------------------------------------------------
111
+ # # --- STEP 3: put the two phrase table halves together --------------------------
112
+ # # -------------------------------------------------------------------------------
113
+ # # ptfwdhalf is a single .gz file, ptbwdhalf is a collection .gz files
114
+ # $(ptable): | ${MOSES_BIN}/consolidate
115
+ # $(ptable): | $(ptfwdhalf) $(ptbwdhalf)
116
+ # $(lock)
117
+ # ${MOSES_BIN}/consolidate \
118
+ # <(zcat ${ptfwdhalf}) \
119
+ # <(${merge-sorted} ${mo_tmp}/bwd/scored.*.gz) /dev/stdout \
120
+ # $(if $(ptable.smoothing), \
121
+ # $(ptable.smoothing) $(ptfwdhalf:.sorted.gz=.coc)) \
122
+ # | gzip > $@_ && mv $@_ $@
123
+ # $(unlock)
124
+
mosesdecoder/contrib/m4m/modules/obsolete/reporting.m4m ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ rset = set=$2,type=$3,file=evaluation/$1/$2.$3
4
+ analyses = $(foreach e, ${eval-sets}, \
5
+ $(call rset,$1,$e,analysis-precision) \
6
+ $(call rset,$1,$e,analysis-coverage))
7
+ eval-scores = $(foreach e, ${eval-sets}, \
8
+ $(foreach m, ${eval-metrics}, \
9
+ $(call rset,$1,$e,$m)))
10
+ eval-results = $(foreach e, ${eval-sets}, \
11
+ $(foreach m, ${eval-metrics}, \
12
+ evaluation/$1/$e.$m))
13
+
14
+
15
+ .SECONDEXPANSION:
16
+ # NOTA BENE: setup-experiments.make adds additional dependencies for
17
+ # evaluation/%/report in the file experiments.make!
18
+ evaluation/%/report: sets = $(call eval-scores,$*)
19
+ #evaluation/%/report: sets += $(call analyses,$*)
20
+ #evaluation/%/report: tuned_moses_ini := $(if ${have_tuned_moses_ini},${have_tuned_moses_ini},tuning/$*/moses.tuned.ini)
21
+ evaluation/%/report: prereqs = $(call eval-results,$*)
22
+ evaluation/%/report: $$(prereqs)
23
+ echo $(foreach s, ${sets}, $s) $^
24
+ mkdir $@.lock
25
+ echo $(call lockline) > $@.lock/owner
26
+ ${report} ${sets} > $@_
27
+ mv $@_ $@
28
+ rm $@.lock/owner
29
+ rmdir $@.lock
30
+
31
+ %.analysis: params1 = -input ${$(notdir $*)-src}
32
+ %.analysis: params1 += -input-corpus ${crp_train}.${L1}
33
+ %.analysis: params1 += -ttable ${ttable} -dir $@
34
+ %.analysis: params2 = -precision-by-coverage
35
+ %.analysis: params2 += -reference ${$(notdir $*)-ref}
36
+ %.analysis: params2 += -system $*.truecased
37
+ %.analysis: params2 += -segmentation $*.output
38
+ %.analysis: params2 += -system-alignment $*.output.wa
39
+ %.analysis: params2 += -coverage $@
40
+ %.analysis: | ${ttable} ${crp_train}.${L1}
41
+ %.analysis: %.output.wa %.output %.truecased
42
+ @echo ANALYSING $^
43
+ @mkdir $@.lock
44
+ @echo $(call lockline) > $@.lock/owner
45
+ ${analyze} ${params1}
46
+ ${analyze} ${params1} ${params2}
47
+ @rm$@.lock/owner
48
+ @rmdir $@.lock
49
+
50
+ %.multi-bleu: %.cleaned
51
+ $(info )
52
+ $(info RUNNING MULTI-BLEU on $^)
53
+ @mkdir $@.lock
54
+ @echo $(call lockline) > $@.lock/owner
55
+ ${multi-bleu} ${$(notdir $*)-ref} < $< > $@_
56
+ @mv $@_ $@
57
+ @rm $@.lock/owner
58
+ @rmdir $@.lock
59
+
60
+ %.truecased: %.cleaned
61
+ mkdir $@.lock
62
+ $(detruecase) < $< > $@_
63
+ mv $@_ $@
64
+ rmdir $@.lock
65
+
66
+ %.cleaned: %.output
67
+ $(info )
68
+ $(info CLEANING UP DECODER OUTPUT: $<)
69
+ $(info )
70
+ mkdir $@.lock
71
+ echo $(call lockline) > $@.lock/owner
72
+ $(clean-decoder-output) < $< > $@_
73
+ mv $@_ $@
74
+ rm $@.lock/owner
75
+ rmdir $@.lock
76
+
77
+ %.output.wa: %.output
78
+ evaluation/%.output: decoder_flags += -threads ${moses.threads} -v 0
79
+ evaluation/%.output: decoder_flags += -inputtype ${input-type}
80
+ evaluation/%.output: decoder_flags += -alignment-output-file $@.wa
81
+ evaluation/%.output: decoder_flags += -t -text-type "test"
82
+ evaluation/%.output: decoder_flags += -f ${moses_ini}
83
+ evaluation/%.output: input = ${$(notdir $*)-src}
84
+ evaluation/%.output:
85
+ echo MOSES_INI = ${moses_ini}
86
+ @mkdir -p $(@D)
87
+ @mkdir $@.lock
88
+ @echo $(call lockline) > $@.lock/owner
89
+ ${decode} ${decoder_flags} < ${input} > $@_
90
+ @mv $@_ $@
91
+ @rm $@.lock/owner
92
+ @rmdir $@.lock
93
+
94
+ .SECONDARY:
95
+
mosesdecoder/contrib/m4m/modules/obsolete/run-moses.m4m ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # This make module deals with running the moses decoder.
4
+ # It sets default parameters and checks that parameters that
5
+ # need to be set elsewhere are actually set.
6
+
7
+ # The following parameters are translation-job specific and need to be set
8
+ # explicitly for each job.
9
+
10
+ moses.threads ?= 4
11
+ moses.flags += -threads ${moses.threads}
12
+ moses.flags += -v 0 -t -text-type "test"
13
+
14
+ %.moses-out.wa: moses.flags += -alignment-output-file $*.output.wa
15
+ %.moses-out.wa: %.moses-out
16
+
17
+
18
+ .SECONDEXPANSION:
19
+ %.moses-out:
20
+ echo MOSES $^
21
+ $(checkvar,moses.input)
22
+ $(checkvar,moses.ini)
23
+ $(lock)
24
+ ${moses} -i ${moses.input} -inputtype ${moses.inputtype} \
25
+ -f ${moses.ini} ${moses.flags} > $@_ && mv $@_ $@
26
+ $(unlock)
27
+
28
+ %.cleaned: %.moses-out
29
+ $(lock)
30
+ $(clean-decoder-output) < $< > $@_ && mv $@_ $@
31
+ $(unlock)
32
+
33
+ %.natcased: %.cleaned
34
+ $(eval $(call lock))
35
+ $(detruecase) < $*.cleaned > $@_ && mv $@_ $@
36
+ $(eval $(call unlock))
37
+
mosesdecoder/contrib/m4m/modules/obsolete/setup-experiments.m4m ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # This make module sets up the actual experiments
4
+
5
+ L1 = fr
6
+ L2 = en
7
+ tune-ref-ready = /fs/sif0/bhaddow/experiments/accept/symantec-baseline/tuning/reference.tc.18
8
+ eval-ref-ready = /fs/saxnot5/germann/accept/homophones/exp.new/evaluation/201201_devtest_b.reference.tok.1
9
+ crp_train = /fs/sif0/bhaddow/experiments/accept/symantec-baseline/training/corpus.19
10
+ ttable = /fs/sif0/bhaddow/experiments/accept/symantec-baseline/model/phrase-table.10
11
+
12
+ untuned_moses_ini = model/moses.ini.0
13
+ fixed-iweight = --activate-feature d_0,d_1,d_2,d_3,d_4,d_5,d_6,lm_0,w_0,tm_0,tm_1,tm_2,tm_3,tm_4
14
+
15
+ # list the evaluation metrics to be used for evaluation
16
+ # TO DO: list available metrics
17
+ eval-metrics = multi-bleu
18
+ moses-threads = 20
19
+ tuning-runs = $(shell seq 25)
20
+
21
+ # experiments.make: WSCHEMES = uniform unigram bigram bigram2
22
+ # experiments.make: DATASETS = tune eval
23
+ # experiments.make: PREPROC = baseline uniq multi
24
+ # experiments.make: CSETS = unfiltered filtered edited
25
+ experiments.make: WSCHEMES = bigram2
26
+ experiments.make: DATASETS = tune eval
27
+ experiments.make: PREPROC = baseline
28
+ experiments.make: CSETS = filtered
29
+ # remake experiments.make if this file changes
30
+ experiments.make: $(word $(words ${MAKEFILE_LIST}), ${MAKEFILE_LIST})
31
+ experiments.make:
32
+ mkdir $@.lock
33
+ echo $(call lockline) > $@.lock/owner
34
+ echo '# -*- Makefile -*-' > $@_
35
+ echo '# This file was automatically generated by setup-experiments.make.' >> $@_
36
+ echo 'experiments := ' >> $@_;
37
+ $(foreach p, ${PREPROC}, \
38
+ echo '# NEW EXPERIMENT #####################################' >> $@_; \
39
+ echo 'experiments += $p' >> $@_; \
40
+ echo 'ctr = $$(words $${experiments})' >> $@_; \
41
+ echo '$p: input-type = 0' >> $@_; \
42
+ echo '$p: eval-sets = $p.eval' >> $@_; \
43
+ echo '$p: tune-src = input/$p.tune.tc' >> $@_; \
44
+ echo '$p: tune-ref = ${tune-ref-ready}' >> $@_; \
45
+ echo '$p: $p.eval-src = input/$p.eval.tc' >> $@_; \
46
+ echo '$p: $p.eval-ref = ${eval-ref-ready}' >> $@_; \
47
+ echo '$p: evaluation/$${ctr}/report' >> $@_; \
48
+ echo >> $@_; \
49
+ echo 'evaluation/$p/%/$p.eval.output: input = input/$p.eval.tc' >> $@_; \
50
+ echo 'evaluation/$p/%/$p.eval.output: input/$p.eval.tc' >> $@_; \
51
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
52
+ 'evaluation/$${ctr}/$p.eval.output: ${tuned_moses_ini}', \
53
+ 'evaluation/$${ctr}/$p.eval.output: tuning/$${ctr}/moses.tuned.ini') >> $@_; \
54
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
55
+ 'evaluation/$${ctr}/$p.eval.output: moses_ini := ${tuned_moses_ini}', \
56
+ 'evaluation/$${ctr}/$p.eval.output: moses_ini := tuning/$${ctr}/moses.tuned.ini') >> $@_; \
57
+ echo 'evaluation/$${ctr}/$p.eval.multi-bleu: $${$p.eval-ref}' >> $@_; \
58
+ echo >> $@_;)
59
+ $(foreach c, ${CSETS}, \
60
+ $(foreach p, ${PREPROC}, \
61
+ $(foreach w, ${WSCHEMES}, \
62
+ echo '# NEW EXPERIMENT #####################################' >> $@_; \
63
+ echo 'experiments += $w-$c-$p' >> $@_; \
64
+ echo 'ctr = $$(words $${experiments})' >> $@_; \
65
+ echo '$w-$c-$p: input-type = 1' >> $@_; \
66
+ echo '$w-$c-$p: eval-sets = $w-$c-$p.eval' >> $@_; \
67
+ echo '$w-$c-$p: tune-src = input/$w-$c-$p.tune.cfn' >> $@_; \
68
+ echo '$w-$c-$p: tune-ref = ${tune-ref-ready}' >> $@_; \
69
+ echo '$w-$c-$p: $w-$c-$p.eval-src = input/$w-$c-$p.eval.cfn' >> $@_; \
70
+ echo '$w-$c-$p: $w-$c-$p.eval-ref = ${eval-ref-ready}' >> $@_; \
71
+ echo '$w-$c-$p: evaluation/$${ctr}/report' >> $@_; \
72
+ echo >> $@_; \
73
+ echo 'evaluation/$${ctr}/$w-$c-$p.eval.output: input = input/$w-$c-$p.eval.cfn' >> $@_; \
74
+ echo 'evaluation/$${ctr}/$w-$c-$p.eval.output: input/$w-$c-$p.eval.cfn' >> $@_; \
75
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
76
+ 'evaluation/$${ctr}/$w-$c-$p.eval.output: ${tuned_moses_ini}', \
77
+ 'evaluation/$${ctr}/$w-$c-$p.eval.output: tuning/$${ctr}/moses.tuned.ini') >> $@_; \
78
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
79
+ 'evaluation/$${ctr}/$w-$c-$p.eval.output: moses_ini := ${tuned_moses_ini}', \
80
+ 'evaluation/$${ctr}/$w-$c-$p.eval.output: moses_ini := tuning/$${ctr}/moses.tuned.ini') >> $@_; \
81
+ echo 'evaluation/$${ctr}/$w-$c-$p.eval.multi-bleu: $${$w-$c-$p.eval-ref}' >> $@_; \
82
+ echo >> $@_;\
83
+ $(foreach d, tune eval, \
84
+ echo 'cfn-targets += input/$w-$c-$p.$d.cfn' >> $@_; \
85
+ echo 'input/$w-$c-$p.$d.cfn: input/$p.$d.tc' >> $@_; \
86
+ printf '\t@mkdir $$@.lock\n\t@echo $$(call lockline) > $$@.lock/owner\n' >> $@_; \
87
+ printf '\tcreate-confusion-network.01.exe -q -w $w -s csets/csets.$c.txt -c ../mm/fr < $$< > $$@_ && mv $$@_ $$@\n' >> $@_;\
88
+ printf '\t@rm $$@.lock/owner\n\t@rmdir $$@.lock\n' >> $@_;))))
89
+ echo '.PHONY += $$(experiments) cfn' >> $@_
90
+ echo 'cfns: $${cfn-targets}' >> $@_
91
+ @mv $@_ $@
92
+ @rm $@.lock/owner
93
+ @rmdir $@.lock
94
+
95
+
96
+
97
+ # # echo 'ctr = $$(words $${experiments})' >> $@_; \
98
+ # echo 'eval-sets = $w-$c-$p.eval' >> $@_; \
99
+ # echo 'rx := $$(call report-prereqs,$${ctr},$${eval-sets})' >> $@_; \
100
+ # echo '$w-$c-$p: run-id := $${ctr}' >> $@_; \
101
+ # echo '$w-$c-$p: tune-input = input/$w-$c-$p.tune.cfn' >> $@_; \
102
+ # echo '$w-$c-$p: tune-src = input/$w-$c-$p.tune.cfn' >> $@_; \
103
+ # echo '$w-$c-$p: tune-ref = ${tune-ref-ready}' >> $@_; \
104
+ # echo '$w-$c-$p: $w-$c-$p.eval-src = input/$w-$c-$p.eval.cfn' >> $@_; \
105
+ # echo '$w-$c-$p: $w-$c-$p.eval-ref = ${eval-ref-ready}' >> $@_; \
106
+ # echo '$w-$c-$p: input-type = 1' >> $@_; \
107
+ # echo '$w-$c-$p: mert.options += $$(if $$(findstring uniform,$w),${fixed-iweight})' >> $@_; \
108
+ # echo '$w-$c-$p: evaluation/report.$${ctr}' >> $@_; \
109
+ # echo >> $@_; \
110
+ # echo 'evaluation/$w-$c-$p.eval.output.$${ctr}: input = input/$w-$c-$p.eval.cfn' >> $@_; \
111
+ # echo >> $@_; \
112
+ # $(foreach d, tune eval, \
113
+ # ofile=input/$w-$c-$p.$d.cfn; \
114
+ # ifile=input/$p.$d.tc; \
115
+ # echo "$$ofile: $$ifile" >> $@_ ; \
116
+ # printf '\t create-confusion-network.01.exe -w $w -s csets/cset.$c.txt -c ../mm/fr < $$< > $$@_ && mv $$@_ $$@\n' >> $@_ ; \
117
+ # echo >> $@_; ))))
118
+ # echo '.PHONY += $$(experiments)' >> $@_
119
+ # @mv $@_ $@
120
+ # @rm $@.lock/owner
121
+ # @rmdir $@.lock
mosesdecoder/contrib/m4m/modules/obsolete/skip-steps.mak ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # Specify in this file resources that you already have
4
+ run_id ?= 0
5
+
6
+ untuned_moses_ini := model/moses.ini.0
7
+ moses_ini_for_tuning = ${untuned_moses_ini}
8
+ moses_ini_for_eval = ${tuned_moses_ini}
9
+
10
+ # Notes:
11
+ #
12
+ # - if ${moses_ini_for_tuning} is different from ${untuned_mose_ini}, the phrase table and the
13
+ # lexical distortion table will be filtered for tuning (see tune.make)
14
+ # - if ${moses_ini_for_eval} is different from ${tuned_mose_ini}, the phrase table and the
15
+ # lexical distortion table will be filtered for evaluation (see eval.make)
16
+
17
+
18
+ all:
19
+ echo ";$(foo);"
mosesdecoder/contrib/m4m/modules/obsolete/system.m4m ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # This module defines the actual system
4
+
5
+ # Choose names for translation and distortion model
6
+ ptable = model/tm/ptable.${L1}-${L2}
7
+ dtable = model/dm/dtable.${L1}-${L2}
8
+
9
+ # specify the underlying corpus
10
+ pll.txt1 ?= crp/trn/aln/${word-alignment}/${L1}.txt.gz
11
+ pll.txt2 ?= crp/trn/aln/${word-alignment}/${L2}.txt.gz
12
+ pll.aln ?= crp/trn/aln/${word-alignment}/${L1}-${L2}.symal.gz
13
+
14
+ # specify the distortion model parameters; we bunch them
15
+ # all together in one string
16
+ ${ptable}: dmodels = wbe-mslr-bidirectional-fe-allff
17
+
18
+ # phrase table parameters: maximum phrase length and smoothing
19
+ ptable.max-phrase-length = 7
20
+ ptable.smoothing = --GoodTuring
21
+
22
+ #$(info $(call add_binary_phrase_table,0,0,5,${ptable},info))
23
+ $(eval $(call add_binary_phrase_table,0,0,5,${ptable}))
24
+
25
+ $(eval $(call add_binary_reordering_table,\
26
+ 0-0,wbe-mslr-bidirectional-fe-allff,6,${dtable},${ptable}))
27
+
28
+ $(info $(call add_binary_reordering_table,\
29
+ 0-0,wbe-mslr-bidirectional-fe-allff,6,${dtable},${ptable},info))
30
+
31
+ # below: moses.ini.0 is the moses ini file PRE-TUNING!
32
+ define build_system
33
+ $1/moses.ini.0
34
+
35
+
36
+ makefile:
37
+ $(info $(call add_binary_phrase_table,0,0,5,${ptable},info))
38
+
mosesdecoder/contrib/m4m/modules/obsolete/template.m4m ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ define setup =
4
+ echo 'experiments := ' >> $@_; \
5
+ $(foreach p, ${PREPROC}, \
6
+ echo '# NEW EXPERIMENT #####################################' >> $@_; \
7
+ echo 'experiments += ${tag}' >> $@_; \
8
+ echo 'ctr = $$(words $${experiments})' >> $@_; \
9
+ echo '$: input-type = $(2)' >> $@_; \
10
+ echo '${tag}: eval-sets = ${tag}.eval' >> $@_; \
11
+ echo '${tag}: tune-src = input/${tag}.tune.tc' >> $@_; \
12
+ echo '${tag}: tune-ref = ${tune-ref-ready}' >> $@_; \
13
+ echo '${tag}: ${tag}.eval-src = input/${tag}.eval.$(if $(findstring 1,$(2),cfn,tc))' >> $@_; \
14
+ echo '${tag}: ${tag}.eval-ref = ${eval-ref-ready}' >> $@_; \
15
+ echo '${tag}: evaluation/$${ctr}/report' >> $@_; \
16
+ $(foreach e, ${tag}.eval, \
17
+ $(foreach m, ${eval-metrics}, \
18
+ echo 'evaluation/$${ctr}/report: evaluation/$${ctr}/$e.$m' >> $@_;) \
19
+ echo 'evaluation/$${ctr}/report: evaluation/$${ctr}/$e.analysis' >> $@_;) \
20
+ echo >> $@_; \
21
+ echo 'evaluation/$${ctr}/${tag}.eval.output: input = input/${tag}.eval.tc' >> $@_; \
22
+ echo 'evaluation/$${ctr}/${tag}.eval.output: input/${tag}.eval.tc' >> $@_; \
23
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
24
+ 'evaluation/$${ctr}/${tag}.eval.output: ${tuned_moses_ini}', \
25
+ 'evaluation/$${ctr}/${tag}.eval.output: tuning/$${ctr}/moses.tuned.ini') >> $@_; \
26
+ echo 'evaluation/$${ctr}/${tag}.eval.multi-bleu: $${${tag}.eval-ref}' >> $@_; \
27
+ echo >> $@_;)
28
+ echo '.PHONY += $$(experiments)' >> $@_
29
+ @mv $@_ $@
30
+ @rm $@.lock/owner
31
+ @rmdir $@.lock
32
+
33
+
34
+ # $(1): system / input processing
35
+ # $(2): input type (cfn or text)
36
+ define setup_experiment =
37
+ echo 'experiments := ' >> $@_; \
38
+ $(foreach p, ${PREPROC}, \
39
+ echo '# NEW EXPERIMENT #####################################' >> $@_; \
40
+ echo 'experiments += $(1)' >> $@_; \
41
+ echo 'ctr = $$(words $${experiments})' >> $@_; \
42
+ echo '$(1): input-type = $(2)' >> $@_; \
43
+ echo '$(1): eval-sets = $(1).eval' >> $@_; \
44
+ echo '$(1): tune-src = input/$(1).tune.tc' >> $@_; \
45
+ echo '$(1): tune-ref = ${tune-ref-ready}' >> $@_; \
46
+ echo '$(1): $(1).eval-src = input/$(1).eval.$(if $(findstring 1,$(2),cfn,tc))' >> $@_; \
47
+ echo '$(1): $(1).eval-ref = ${eval-ref-ready}' >> $@_; \
48
+ echo '$(1): evaluation/$${ctr}/report' >> $@_; \
49
+ $(foreach e, $(1).eval, \
50
+ $(foreach m, ${eval-metrics}, \
51
+ echo 'evaluation/$${ctr}/report: evaluation/$${ctr}/$e.$m' >> $@_;) \
52
+ echo 'evaluation/$${ctr}/report: evaluation/$${ctr}/$e.analysis' >> $@_;) \
53
+ echo >> $@_; \
54
+ echo 'evaluation/$${ctr}/$(1).eval.output: input = input/$(1).eval.tc' >> $@_; \
55
+ echo 'evaluation/$${ctr}/$(1).eval.output: input/$(1).eval.tc' >> $@_; \
56
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
57
+ 'evaluation/$${ctr}/$(1).eval.output: ${tuned_moses_ini}', \
58
+ 'evaluation/$${ctr}/$(1).eval.output: tuning/$${ctr}/moses.tuned.ini') >> $@_; \
59
+ echo 'evaluation/$${ctr}/$(1).eval.multi-bleu: $${$(1).eval-ref}' >> $@_; \
60
+ echo >> $@_;)
61
+ echo '.PHONY += $$(experiments)' >> $@_
62
+ @mv $@_ $@
63
+ @rm $@.lock/owner
64
+ @rmdir $@.lock
65
+
66
+ endef
mosesdecoder/contrib/m4m/modules/obsolete/tune.m4m ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+ # make module for tuning a system
3
+
4
+ #tune.input ?= $(error missing
5
+ #tuned_moses_ini ?= tuning/moses.ini.${run_id}
6
+ #$(tuned_moses_ini): | ${untuned_moses_ini}
7
+ #$(tuned_moses_ini): | ${untuned_moses_ini}
8
+
9
+ # make sure that all necessary variables are set
10
+ untuned_moses_ini ?= $(error Fatal error: the required variable untuned_moses_ini is not set)
11
+ tuning_input ?= $(error Fatal error: the required variable tuning_input is not set)
12
+ tuning_reference ?= $(error Fatal error: the required variable tuning_reference is not set)
13
+ tuning_itype ?= $(error Fatal error: the required variable tuning_itype is not set)
14
+ tuning_wdir ?= $(error Fatal error: the required variable tuning_wdir is not set)
15
+
16
+ $tuning_root_dir ?= ${MOSES_ROOT}
17
+
18
+
19
+
20
+ # default tuning parameters
21
+ mert.nbest ?= 100
22
+ mert.decoder-threads ?= 4
23
+ tuning/%/tmp/moses.ini: mertcmd =
24
+ tuning/%/tmp/moses.ini: mert_flags += --working-dir $(CURDIR)/tuning/$*/tmp
25
+ tuning/%/tmp/moses.ini: mert_flags += --decoder-flags "${mert.decoder_flags} -inputtype ${input-type}"
26
+ tuning/%/tmp/moses.ini: mert_flags += --rootdir ${MOSES_ROOT}/scripts
27
+ tuning/%/tmp/moses.ini: mert_flags += --mertdir ${MOSES_BIN}
28
+ tuning/%/tmp/moses.ini: mert_flags += ${mert.options}
29
+ tuning/%/tmp/moses.ini: ${untuned_moses_ini}
30
+ $(info TUNING: ${tune} ${tune-src} ${tune-ref} ${decode} ${untuned_moses_ini} ${mert_flags})
31
+ @mkdir -p $(@D)
32
+ @mkdir $@.lock
33
+ @echo $(call lockline) > $@.lock/owner
34
+ ${tune} ${mert_flags} ${tune-src} ${tune-ref} ${decode} ${untuned_moses_ini}
35
+ @rm $@.lock/owner
36
+ @rmdir $@.lock
37
+
38
+ tuning/%/moses.tuned.ini: tuning/%/tmp/moses.ini
39
+ @mkdir -p $(@D)
40
+ @mkdir $@.lock
41
+ @echo $(call lockline) > $@.lock/owner
42
+ ${apply-weights} tuning/$*/tmp/moses.ini < ${untuned_moses_ini} > $@_
43
+ @mv $@_ $@
44
+ @rm $@.lock/owner
45
+ @rmdir $@.lock
mosesdecoder/contrib/m4m/scripts/fast-align2bal.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # Auxiliary script to convert fast_align output to the "bal" input format
3
+ # that symal requires.
4
+ # Script by Ulrich Germann.
5
+
6
+ # command line args:
7
+ # <L1 plain text> <L2 plain text> <L1-L2 alignments> <L2-L1 alignments>
8
+ #
9
+ # TO DO: - proper argument parsing with getopt
10
+ # - help text
11
+
12
+ import sys,os
13
+
14
+ (T1,T2,fwd,bwd) = [open(x) for x in sys.argv[1:]]
15
+
16
+ def alnvec(slen,alinks,mode):
17
+ d = dict([[int(x[mode]),int(x[(mode+1)%2])+1] for x
18
+ in [y.split('-') for y in alinks]])
19
+ return [d.get(i,0) for i in xrange(slen)]
20
+
21
+ ctr = 0
22
+ for t1 in T1:
23
+ t1 = t1.strip().split()
24
+ t2 = T2.readline().strip().split()
25
+ a1 = alnvec(len(t1),bwd.readline().split(),0)
26
+ a2 = alnvec(len(t2),fwd.readline().split(),1)
27
+ print 1
28
+ print len(t2), " ".join(t2), '#', " ".join(["%d"%x for x in a2])
29
+ print len(t1), " ".join(t1), '#', " ".join(["%d"%x for x in a1])
30
+ ctr += 1
31
+ pass
mosesdecoder/contrib/m4m/scripts/giza.txt2snt.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Wrapper script around plain2snt that allows us to generate the numberized
3
+ # files from gzipped text files via named pipes. (c) 2011-2012 Ulrich Germann
4
+
5
+ fail()
6
+ {
7
+ echo $@
8
+ exit 1
9
+ }
10
+
11
+ on_term()
12
+ {
13
+ rm $odir/${L1}
14
+ rm $odir/${L2}
15
+ }
16
+
17
+ trap 'on_term' TERM EXIT QUIT INT 0
18
+
19
+ if [ $# -lt 4 ]; then
20
+ fail "usage: $0 <txtdir> <L1> <L2> <odir>"
21
+ fi
22
+
23
+ txtdir=$1
24
+ L1=$2
25
+ L2=$3
26
+ odir=$4
27
+
28
+ mkdir -p $odir
29
+ mkfifo $odir/${L1} || exit 1
30
+ mkfifo $odir/${L2} || exit 1
31
+
32
+ find -L ${txtdir} -name "*.${L1}" -or -name "*.${L1}.gz" | sort | xargs zcat -f > $odir/${L1} &
33
+ find -L ${txtdir} -name "*.${L2}" -or -name "*.${L2}.gz" | sort | xargs zcat -f > $odir/${L2} &
34
+
35
+ pushd $odir
36
+ plain2snt ${L1} ${L2}
37
+ wait
38
+ mv ${L1}_${L2}.snt ${L1}-${L2}.snt
39
+ mv ${L2}_${L1}.snt ${L2}-${L1}.snt
40
+ wait
41
+ popd
mosesdecoder/contrib/m4m/scripts/moses.extract-phrases.sh ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # helper script for phrase extraction
3
+ # (c) 2011-2012 Ulrich Germann
4
+ # txtdir - directory with gzipped plain text files
5
+ # sntdir - directory with files in Giza's .snt format, also including the .OK files
6
+ # produced by giza.txt2snt.sh
7
+ # gizdir - directory where aligned corpus resides
8
+ # L1,L2 - language tags for L1,L2
9
+ # plmax - max phrase length to be extraced
10
+
11
+ extractor=$1
12
+ L1_text=$2
13
+ L2_text=$3
14
+ aln=$4
15
+ odir=$5
16
+ max_plen=$6
17
+ dmodel=$7
18
+
19
+
20
+ echo $#
21
+ if [ $# -lt 6 ] ; then
22
+ echo <<EOF \
23
+ "usage: $0 <moses-extract-command> <L1 text> <L2 text> <alignment file> <output dir> <max phrase length> <distortion-model>"
24
+ EOF
25
+ exit 1
26
+ fi
27
+
28
+ fifo=$odir/fifo.$$
29
+
30
+ cleanup()
31
+ {
32
+ if [ -e $fifo ] ; then rm $fifo; fi
33
+ if [ -e $fifo.inv ] ; then rm $fifo.inv; fi
34
+ if [ -e $fifo.o ] ; then rm $fifo.o; fi
35
+ }
36
+
37
+ trap 'cleanup' 0
38
+ export LC_ALL=C
39
+ mkdir -p $odir/fwd $odir/bwd $odir/dst
40
+ mkfifo $fifo
41
+ parallel < $fifo -j6 --pipe --blocksize 250M "sort -S 5G | gzip > $odir/fwd/part.{#}.gz" &
42
+ mkfifo $fifo.inv
43
+ parallel < $fifo.inv -j6 --pipe --blocksize 250M "sort -S 5G | gzip > $odir/bwd/part.{#}.gz" &
44
+ if [ "$dmodel" != "" ] ; then
45
+ mkfifo $fifo.o
46
+ parallel < $fifo.o -j6 --pipe --blocksize 250M "sort -S 5G | gzip > $odir/dst/part.{#}.gz" &
47
+ dmodel="orientation --model $dmodel"
48
+ fi
49
+ #echo "($extractor <(zcat -f $L2_text) <(zcat -f $L1_text) <(zcat -f $aln) $fifo $max_plen $dmodel) || exit 1"
50
+ ($extractor <(zcat -f $L2_text) <(zcat -f $L1_text) <(zcat -f $aln) $fifo $max_plen $dmodel) || exit 1
51
+
52
+ wait
53
+
54
+ # for part in fwd bwd dst; do
55
+ # echo -n '' > $odir/${part}/sort.batch
56
+ # for f in $odir/${part}/part.[0-9][0-9][0-9][0-9].gz; do
57
+ # g=`echo $f | sed 's/.gz$//'`
58
+ # # echo "f=$g; if [ -e \$f.gz ] ; then zcat \$f.gz | LC_ALL=C sort | gzip > \$f.gz_ && mv \$f.gz_ \$f.sorted.gz && rm \$f.gz; fi" \
59
+ # echo "f=$g; if [ -e \$f.gz ] ; then zcat \$f.gz | LC_ALL=C sort | gzip > \$f.gz_ && mv \$f.gz_ \$f.sorted.gz; fi" \
60
+ # >> $odir/${part}/sort.batch
61
+ # done
62
+ # done
63
+
mosesdecoder/contrib/m4m/scripts/moses.make-lex.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Quick hack to extract lexica from Giza-Aligned corpus
4
+ # (c) 2011 Ulrich Germann
5
+
6
+ import sys, os
7
+
8
+ D = os.popen("zcat %s" % sys.argv[1])
9
+ E = os.popen("zcat %s" % sys.argv[2])
10
+ A = os.popen("zcat %s" % sys.argv[3])
11
+ d_given_e = sys.argv[4]
12
+ e_given_d = sys.argv[5]
13
+
14
+ try:
15
+ os.makedirs(os.path.dirname(d_given_e))
16
+ os.makedirs(os.path.dirname(e_given_d))
17
+ except:
18
+ pass
19
+
20
+ WD = ["NULL","UNK"]
21
+ WE = ["NULL","UNK"]
22
+ VD = {}
23
+ VE = {}
24
+ JJ = []
25
+ MD = []
26
+ ME = []
27
+
28
+ def id(V,W,x):
29
+ i = V.setdefault(x,len(W))
30
+ if i == len(W): W.append(x)
31
+ return i
32
+
33
+ ctr = 0
34
+ for dline in D:
35
+ ctr += 1
36
+ #if ctr % 1000 == 0: sys.stderr.write('.')
37
+ eline = E.readline()
38
+ aline = A.readline()
39
+ d = [id(VD,WD,w) for w in dline.strip().split()]
40
+ e = [id(VE,WE,w) for w in eline.strip().split()]
41
+ a = [[int(y) for y in x.split('-')] for x in aline.split()]
42
+
43
+ while len(MD) <= len(VD) + 2:
44
+ MD.append(0)
45
+ JJ.append({})
46
+ pass
47
+
48
+ while len(ME) <= len(VE) + 2:
49
+ ME.append(0)
50
+ pass
51
+
52
+ fd = [0 for i in xrange(len(d))]
53
+ fe = [0 for i in xrange(len(e))]
54
+ for x,y in a:
55
+ fd[x] += 1
56
+ fe[y] += 1
57
+ MD[d[x]] += 1
58
+ ME[e[y]] += 1
59
+ JJ[d[x]][e[y]] = JJ[d[x]].setdefault(e[y],0) + 1
60
+ # print WD[d[x]],WE[e[y]],JJ[d[x]][e[y]]
61
+ pass
62
+ for i in [d[k] for k in xrange(len(d)) if fd[k] == 0]:
63
+ ME[0] += 1
64
+ MD[i] += 1
65
+ JJ[i][0] = JJ[i].setdefault(0,0) + 1
66
+ pass
67
+ for i in [e[k] for k in xrange(len(e)) if fe[k] == 0]:
68
+ ME[i] += 1
69
+ MD[0] += 1
70
+ JJ[0][i] = JJ[0].setdefault(i,0) + 1
71
+ pass
72
+ pass
73
+
74
+ ED = os.popen("gzip > %s" % e_given_d, 'w')
75
+ DE = os.popen("gzip > %s" % d_given_e, 'w')
76
+
77
+ for d in xrange(len(JJ)):
78
+ T = JJ[d]
79
+ for e,jj in T.items():
80
+ print >>ED, WE[e], WD[d], float(jj)/MD[d]
81
+ print >>DE, WD[d], WE[e], float(jj)/ME[e]
82
+ pass
83
+ pass
84
+
85
+ ED.close()
86
+ DE.close()
mosesdecoder/contrib/m4m/scripts/moses.phrase-extract.sh ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Helper script for phrase extraction from a single corpus shard.
3
+ # Written by Ulrich Germann.
4
+
5
+ # to be added: built-in factor filtering for factored models
6
+
7
+ cleanup()
8
+ {
9
+ if [ -e $fifo ] ; then rm $fifo; fi
10
+ if [ -e $fifo.inv ] ; then rm $fifo.inv; fi
11
+ if [ -e $fifo.o ] ; then rm $fifo.o; fi
12
+ }
13
+
14
+ usage()
15
+ {
16
+ echo
17
+ echo "$0: wrapper script to extract phrases from word-aligned corpus"
18
+ echo -e "usage:\n $0 <extractor> <ibase> <L1tag> <L2tag> [-x] "
19
+ echo "options:"
20
+ echo "-l: maximum phrase length ($plen)"
21
+ echo "-m: distortion model specification"
22
+ echo "-o: base name for output files .fwd.gz .bwd.gz [.<dmodel>.dst.gz]"
23
+ echo "-x: (no argument) don't create .fwd.gz and .bwd.gz"
24
+ echo
25
+ echo "required input files: <ibase>.<L1tag>.gz ibase.<L2tag>.gz ibase.<aln>.gz"
26
+ }
27
+
28
+ plen=7
29
+ nottable=
30
+ dmodel=
31
+ dspec=
32
+ pargs=
33
+ sfactors=
34
+ tfactors=
35
+ while [ $# -gt 0 ]; do
36
+ case $1 in
37
+ -l*) plen=${1#-l}
38
+ plen=${plen#=}
39
+ if [ -z $plen ] ; then
40
+ shift
41
+ plen=$1
42
+ fi
43
+ ;;
44
+ -m*) dmodel=${1#-m}
45
+ dmodel=${dmodel#=}
46
+ if [ -z $dmodel ] ; then
47
+ shift
48
+ dmodel="$1"
49
+ fi
50
+ ;;
51
+ -o*) obase=${1#-o}
52
+ obase=${obase#=}
53
+ if [ -z $obase ] ; then
54
+ shift
55
+ obase=$1
56
+ fi
57
+ ;;
58
+ -s*) sfactors=${1#-s}
59
+ sfactors=${sfactors#=}
60
+ if [ -z $sfactors ] ; then
61
+ shift
62
+ sfactors = $1
63
+ fi
64
+ ;;
65
+ -t*) tfactors=${1#-t}
66
+ tfactors=${tfactors#=}
67
+ if [ -z $tfactors ] ; then
68
+ shift
69
+ sfactors = $1
70
+ fi
71
+ ;;
72
+ -x) nottable=1;;
73
+ -h) usage; exit 0;;
74
+ *) pargs=(${pargs[*]} $1);;
75
+ esac
76
+ shift
77
+ done
78
+
79
+ if [ -n "$sfactors" ] || [ -n "$tfactors" ] ; then
80
+ echo "Factor filtering is not implemented yet!"
81
+ exit 2
82
+ fi
83
+
84
+ extract=${pargs[0]}
85
+ ibase=${pargs[1]}
86
+ L1tag=${pargs[2]}
87
+ L2tag=${pargs[3]}
88
+ obase=${obase:=$ibase}
89
+
90
+ fifo=$obase.$$
91
+ trap 'cleanup' 0
92
+
93
+ export LC_ALL=C
94
+ if [ -z "$nottable" ] ; then
95
+ mkfifo $fifo; sort -S 5G < $fifo | gzip > $obase.fwd.gz &
96
+ mkfifo $fifo.inv; sort -S 5G < $fifo.inv | gzip > $obase.bwd.gz &
97
+ fi
98
+ if [ -n "$dmodel" ] ; then
99
+ mkfifo $fifo.o
100
+ sort -S 5G < $fifo.o | gzip > $obase.dst.gz &
101
+ dspec="orientation --model "
102
+ dspec+=`echo $dmodel | perl -pe 's/((hier|phrase|wbe)-(msd|msrl|mono)).*/$1/;'`
103
+ fi
104
+
105
+ txt1=${ibase}.${L1tag}.gz
106
+ txt2=${ibase}.${L2tag}.gz
107
+ aln=${ibase}.aln.gz
108
+ echo "($extract <(zcat -f $txt1) <(zcat -f $txt2) <(zcat -f $aln) $fifo $plen $dspec) || exit 1"
109
+ ($extract <(zcat -f $txt2) <(zcat -f $txt1) <(zcat -f $aln) $fifo $plen $dspec) || exit 1
110
+ wait
mosesdecoder/contrib/m4m/scripts/moses.score-phrases.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Wrapper script around the moses phrase scoring utility.
3
+ # Script by Ulrich Germann. Called from within M4M.
4
+ #
5
+ # lexicon given should be
6
+ # de-given-en for fwd
7
+ # en-given-de for bwd
8
+
9
+ binary=$1
10
+ phrases=$2
11
+ lex=$3
12
+ obase=$4
13
+ smoothing=$5
14
+ inv=$6
15
+
16
+ cleanup()
17
+ {
18
+ if [ -e $obase.$$ ] ; then rm $obase.$$; fi
19
+ if [ -e $obase.$$.coc ] ; then mv $obase.$$.coc $obase.coc; fi
20
+ }
21
+
22
+ mkfifo $obase.$$ || exit 1
23
+
24
+ trap 'cleanup' 0
25
+
26
+ export LC_ALL=C
27
+ if [[ "$inv" == "--Inverse" ]] ; then
28
+ parallel --gnu < $obase.$$ -j10 --pipe --blocksize 250M "sort -S 10G | gzip > $obase.{#}.gz" &
29
+ else
30
+ gzip < $obase.$$ > $obase.scored.gz_ &
31
+ fi
32
+
33
+ if [[ $phrases != "-" && $phrases != "/dev/stdin" ]] ; then
34
+ $binary $phrases <(zcat -f $lex) $obase.$$ $smoothing $inv || exit 1
35
+ else
36
+ $binary /dev/stdin <(zcat -f $lex) $obase.$$ $smoothing $inv || exit 1
37
+ fi
38
+
39
+ if [ $? ] ; then exit $?; fi
40
+ wait
41
+ exit $?;
mosesdecoder/contrib/m4m/scripts/moses.transfer-weights.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Combines the system definition from one .ini file with the weights contained
4
+ # in another. Works for the new moses.ini format with fully named feature
5
+ # functions. Writes the new .ini file to stdout
6
+ # Script by Ulrich Germann.
7
+
8
+ import re,sys,os
9
+ from optparse import OptionParser
10
+
11
+ SectionHeaderPattern = re.compile(r'^\[(.*)\]\s*$')
12
+ def read_ini(filename):
13
+ '''
14
+ Reads a moses.ini file and returns a dictionary mapping
15
+ from section names to a list of lines contained in that section.
16
+ '''
17
+ AllSections = {}
18
+ CurSection = AllSections.setdefault('',[])
19
+ for line in open(filename):
20
+ line = line.strip()
21
+ m = SectionHeaderPattern.match(line)
22
+ if m:
23
+ CurSection = AllSections.setdefault(m.group(1),[])
24
+ elif len(line):
25
+ CurSection.append(line)
26
+ pass
27
+ pass
28
+ return AllSections
29
+
30
+ parser = OptionParser()
31
+ parser.add_option("-s", "--system", dest = "system",
32
+ help = "moses.ini file defining the system")
33
+ parser.add_option("-w", "--weights", dest = "weight",
34
+ help = "moses.ini file defining the system")
35
+
36
+ opts,args = parser.parse_args()
37
+
38
+ system = read_ini(opts.system)
39
+ weight = read_ini(opts.weight)
40
+
41
+ for s in system:
42
+ if len(s) == 0 or s[0:6] == 'weight': continue
43
+ print "[%s]"%s
44
+ print "\n".join(system[s])
45
+ print
46
+ pass
47
+
48
+ if 'weight' in weight:
49
+ print '[weight]'
50
+ print "\n".join(weight['weight'])
51
+ else:
52
+ for s in weight:
53
+ if s[0:6] != 'weight': continue
54
+ print "[%s]"%s
55
+ print "\n".join(system[s])
56
+ print
57
+ pass
58
+ pass
59
+
60
+
61
+
mosesdecoder/contrib/m4m/util/Jamfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ external-lib bzip2 ;
2
+ external-lib zlib ;
3
+
4
+ exe merge-sorted :
5
+ merge-sorted.cc
6
+ $(TOP)/moses/TranslationModel/UG/mm//mm
7
+ $(TOP)/moses/TranslationModel/UG/generic//generic
8
+ $(TOP)//boost_iostreams
9
+ $(TOP)//boost_program_options
10
+ ;
11
+
12
+
mosesdecoder/contrib/memscore/Makefile.in ADDED
@@ -0,0 +1,581 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Makefile.in generated by automake 1.9.6 from Makefile.am.
2
+ # @configure_input@
3
+
4
+ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
5
+ # 2003, 2004, 2005 Free Software Foundation, Inc.
6
+ # This Makefile.in is free software; the Free Software Foundation
7
+ # gives unlimited permission to copy and/or distribute it,
8
+ # with or without modifications, as long as this notice is preserved.
9
+
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY, to the extent permitted by law; without
12
+ # even the implied warranty of MERCHANTABILITY or FITNESS FOR A
13
+ # PARTICULAR PURPOSE.
14
+
15
+ @SET_MAKE@
16
+
17
+ # memscore - in-memory phrase scoring for Statistical Machine Translation
18
+ # Christian Hardmeier, FBK-irst, Trento, 2010
19
+ # $Id$
20
+
21
+ srcdir = @srcdir@
22
+ top_srcdir = @top_srcdir@
23
+ VPATH = @srcdir@
24
+ pkgdatadir = $(datadir)/@PACKAGE@
25
+ pkglibdir = $(libdir)/@PACKAGE@
26
+ pkgincludedir = $(includedir)/@PACKAGE@
27
+ top_builddir = .
28
+ am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
29
+ INSTALL = @INSTALL@
30
+ install_sh_DATA = $(install_sh) -c -m 644
31
+ install_sh_PROGRAM = $(install_sh) -c
32
+ install_sh_SCRIPT = $(install_sh) -c
33
+ INSTALL_HEADER = $(INSTALL_DATA)
34
+ transform = $(program_transform_name)
35
+ NORMAL_INSTALL = :
36
+ PRE_INSTALL = :
37
+ POST_INSTALL = :
38
+ NORMAL_UNINSTALL = :
39
+ PRE_UNINSTALL = :
40
+ POST_UNINSTALL = :
41
+ bin_PROGRAMS = memscore$(EXEEXT)
42
+ @IRSTLM_TRUE@am__append_1 = phraselm.cpp phraselm.h
43
+ @CHANNEL_SCORER_TRUE@am__append_2 = channel-scorer.cpp channel-scorer.h
44
+ subdir = .
45
+ DIST_COMMON = $(am__configure_deps) $(srcdir)/Makefile.am \
46
+ $(srcdir)/Makefile.in $(srcdir)/config.h.in \
47
+ $(top_srcdir)/configure depcomp install-sh missing
48
+ ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
49
+ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
50
+ $(top_srcdir)/configure.ac
51
+ am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
52
+ $(ACLOCAL_M4)
53
+ am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
54
+ configure.lineno configure.status.lineno
55
+ mkinstalldirs = $(install_sh) -d
56
+ CONFIG_HEADER = config.h
57
+ CONFIG_CLEAN_FILES =
58
+ am__installdirs = "$(DESTDIR)$(bindir)"
59
+ binPROGRAMS_INSTALL = $(INSTALL_PROGRAM)
60
+ PROGRAMS = $(bin_PROGRAMS)
61
+ am__memscore_SOURCES_DIST = datastorage.h memscore.h phrasetable.h \
62
+ scorer.h scorer-impl.h statistic.h timestamp.h phrasetable.cpp \
63
+ memscore.cpp scorer.cpp lexdecom.cpp lexdecom.h phraselm.cpp \
64
+ phraselm.h channel-scorer.cpp channel-scorer.h
65
+ @IRSTLM_TRUE@am__objects_1 = phraselm.$(OBJEXT)
66
+ @CHANNEL_SCORER_TRUE@am__objects_2 = channel-scorer.$(OBJEXT)
67
+ am_memscore_OBJECTS = phrasetable.$(OBJEXT) memscore.$(OBJEXT) \
68
+ scorer.$(OBJEXT) lexdecom.$(OBJEXT) $(am__objects_1) \
69
+ $(am__objects_2)
70
+ memscore_OBJECTS = $(am_memscore_OBJECTS)
71
+ memscore_DEPENDENCIES =
72
+ DEFAULT_INCLUDES = -I. -I$(srcdir) -I.
73
+ depcomp = $(SHELL) $(top_srcdir)/depcomp
74
+ am__depfiles_maybe = depfiles
75
+ CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
76
+ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
77
+ CXXLD = $(CXX)
78
+ CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
79
+ -o $@
80
+ COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
81
+ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
82
+ CCLD = $(CC)
83
+ LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
84
+ SOURCES = $(memscore_SOURCES)
85
+ DIST_SOURCES = $(am__memscore_SOURCES_DIST)
86
+ ETAGS = etags
87
+ CTAGS = ctags
88
+ DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
89
+ distdir = $(PACKAGE)-$(VERSION)
90
+ top_distdir = $(distdir)
91
+ am__remove_distdir = \
92
+ { test ! -d $(distdir) \
93
+ || { find $(distdir) -type d ! -perm -200 -exec chmod u+w {} ';' \
94
+ && rm -fr $(distdir); }; }
95
+ DIST_ARCHIVES = $(distdir).tar.gz
96
+ GZIP_ENV = --best
97
+ distuninstallcheck_listfiles = find . -type f -print
98
+ distcleancheck_listfiles = find . -type f -print
99
+ ACLOCAL = @ACLOCAL@
100
+ AMDEP_FALSE = @AMDEP_FALSE@
101
+ AMDEP_TRUE = @AMDEP_TRUE@
102
+ AMTAR = @AMTAR@
103
+ AUTOCONF = @AUTOCONF@
104
+ AUTOHEADER = @AUTOHEADER@
105
+ AUTOMAKE = @AUTOMAKE@
106
+ AWK = @AWK@
107
+ BOOST_CPPFLAGS = @BOOST_CPPFLAGS@
108
+ BOOST_LDFLAGS = @BOOST_LDFLAGS@
109
+ CC = @CC@
110
+ CCDEPMODE = @CCDEPMODE@
111
+ CFLAGS = @CFLAGS@
112
+ CHANNEL_SCORER_FALSE = @CHANNEL_SCORER_FALSE@
113
+ CHANNEL_SCORER_TRUE = @CHANNEL_SCORER_TRUE@
114
+ CPPFLAGS = @CPPFLAGS@
115
+ CXX = @CXX@
116
+ CXXCPP = @CXXCPP@
117
+ CXXDEPMODE = @CXXDEPMODE@
118
+ CXXFLAGS = @CXXFLAGS@
119
+ CYGPATH_W = @CYGPATH_W@
120
+ DEFS = @DEFS@
121
+ DEPDIR = @DEPDIR@
122
+ ECHO_C = @ECHO_C@
123
+ ECHO_N = @ECHO_N@
124
+ ECHO_T = @ECHO_T@
125
+ EGREP = @EGREP@
126
+ EXEEXT = @EXEEXT@
127
+ GREP = @GREP@
128
+ INSTALL_DATA = @INSTALL_DATA@
129
+ INSTALL_PROGRAM = @INSTALL_PROGRAM@
130
+ INSTALL_SCRIPT = @INSTALL_SCRIPT@
131
+ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
132
+ IRSTLM_FALSE = @IRSTLM_FALSE@
133
+ IRSTLM_TRUE = @IRSTLM_TRUE@
134
+ LDFLAGS = @LDFLAGS@
135
+ LIBOBJS = @LIBOBJS@
136
+ LIBS = @LIBS@
137
+ LTLIBOBJS = @LTLIBOBJS@
138
+ MAKEINFO = @MAKEINFO@
139
+ OBJEXT = @OBJEXT@
140
+ PACKAGE = @PACKAGE@
141
+ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
142
+ PACKAGE_NAME = @PACKAGE_NAME@
143
+ PACKAGE_STRING = @PACKAGE_STRING@
144
+ PACKAGE_TARNAME = @PACKAGE_TARNAME@
145
+ PACKAGE_VERSION = @PACKAGE_VERSION@
146
+ PATH_SEPARATOR = @PATH_SEPARATOR@
147
+ SET_MAKE = @SET_MAKE@
148
+ SHELL = @SHELL@
149
+ STRIP = @STRIP@
150
+ VERSION = @VERSION@
151
+ ac_ct_CC = @ac_ct_CC@
152
+ ac_ct_CXX = @ac_ct_CXX@
153
+ am__fastdepCC_FALSE = @am__fastdepCC_FALSE@
154
+ am__fastdepCC_TRUE = @am__fastdepCC_TRUE@
155
+ am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@
156
+ am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@
157
+ am__include = @am__include@
158
+ am__leading_dot = @am__leading_dot@
159
+ am__quote = @am__quote@
160
+ am__tar = @am__tar@
161
+ am__untar = @am__untar@
162
+ bindir = @bindir@
163
+ build_alias = @build_alias@
164
+ datadir = @datadir@
165
+ datarootdir = @datarootdir@
166
+ docdir = @docdir@
167
+ dvidir = @dvidir@
168
+ exec_prefix = @exec_prefix@
169
+ host_alias = @host_alias@
170
+ htmldir = @htmldir@
171
+ includedir = @includedir@
172
+ infodir = @infodir@
173
+ install_sh = @install_sh@
174
+ libdir = @libdir@
175
+ libexecdir = @libexecdir@
176
+ localedir = @localedir@
177
+ localstatedir = @localstatedir@
178
+ mandir = @mandir@
179
+ mkdir_p = @mkdir_p@
180
+ oldincludedir = @oldincludedir@
181
+ pdfdir = @pdfdir@
182
+ prefix = @prefix@
183
+ program_transform_name = @program_transform_name@
184
+ psdir = @psdir@
185
+ sbindir = @sbindir@
186
+ sharedstatedir = @sharedstatedir@
187
+ sysconfdir = @sysconfdir@
188
+ target_alias = @target_alias@
189
+ ACLOCAL_AMFLAGS = -I m4
190
+ AUTOMAKE_OPTIONS = foreign
191
+ AM_CXXFLAGS = $(BOOST_CPPFLAGS) -Wall -ffast-math -ftrapping-math -fomit-frame-pointer
192
+ memscore_SOURCES = datastorage.h memscore.h phrasetable.h scorer.h \
193
+ scorer-impl.h statistic.h timestamp.h phrasetable.cpp \
194
+ memscore.cpp scorer.cpp lexdecom.cpp lexdecom.h \
195
+ $(am__append_1) $(am__append_2)
196
+ memscore_LDADD = $(IRSTLM_LIBS) $(GSL_LIBS)
197
+ all: config.h
198
+ $(MAKE) $(AM_MAKEFLAGS) all-am
199
+
200
+ .SUFFIXES:
201
+ .SUFFIXES: .cpp .o .obj
202
+ am--refresh:
203
+ @:
204
+ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
205
+ @for dep in $?; do \
206
+ case '$(am__configure_deps)' in \
207
+ *$$dep*) \
208
+ echo ' cd $(srcdir) && $(AUTOMAKE) --foreign '; \
209
+ cd $(srcdir) && $(AUTOMAKE) --foreign \
210
+ && exit 0; \
211
+ exit 1;; \
212
+ esac; \
213
+ done; \
214
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign Makefile'; \
215
+ cd $(top_srcdir) && \
216
+ $(AUTOMAKE) --foreign Makefile
217
+ .PRECIOUS: Makefile
218
+ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
219
+ @case '$?' in \
220
+ *config.status*) \
221
+ echo ' $(SHELL) ./config.status'; \
222
+ $(SHELL) ./config.status;; \
223
+ *) \
224
+ echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \
225
+ cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \
226
+ esac;
227
+
228
+ $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
229
+ $(SHELL) ./config.status --recheck
230
+
231
+ $(top_srcdir)/configure: $(am__configure_deps)
232
+ cd $(srcdir) && $(AUTOCONF)
233
+ $(ACLOCAL_M4): $(am__aclocal_m4_deps)
234
+ cd $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
235
+
236
+ config.h: stamp-h1
237
+ @if test ! -f $@; then \
238
+ rm -f stamp-h1; \
239
+ $(MAKE) stamp-h1; \
240
+ else :; fi
241
+
242
+ stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
243
+ @rm -f stamp-h1
244
+ cd $(top_builddir) && $(SHELL) ./config.status config.h
245
+ $(srcdir)/config.h.in: $(am__configure_deps)
246
+ cd $(top_srcdir) && $(AUTOHEADER)
247
+ rm -f stamp-h1
248
+ touch $@
249
+
250
+ distclean-hdr:
251
+ -rm -f config.h stamp-h1
252
+ install-binPROGRAMS: $(bin_PROGRAMS)
253
+ @$(NORMAL_INSTALL)
254
+ test -z "$(bindir)" || $(mkdir_p) "$(DESTDIR)$(bindir)"
255
+ @list='$(bin_PROGRAMS)'; for p in $$list; do \
256
+ p1=`echo $$p|sed 's/$(EXEEXT)$$//'`; \
257
+ if test -f $$p \
258
+ ; then \
259
+ f=`echo "$$p1" | sed 's,^.*/,,;$(transform);s/$$/$(EXEEXT)/'`; \
260
+ echo " $(INSTALL_PROGRAM_ENV) $(binPROGRAMS_INSTALL) '$$p' '$(DESTDIR)$(bindir)/$$f'"; \
261
+ $(INSTALL_PROGRAM_ENV) $(binPROGRAMS_INSTALL) "$$p" "$(DESTDIR)$(bindir)/$$f" || exit 1; \
262
+ else :; fi; \
263
+ done
264
+
265
+ uninstall-binPROGRAMS:
266
+ @$(NORMAL_UNINSTALL)
267
+ @list='$(bin_PROGRAMS)'; for p in $$list; do \
268
+ f=`echo "$$p" | sed 's,^.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/'`; \
269
+ echo " rm -f '$(DESTDIR)$(bindir)/$$f'"; \
270
+ rm -f "$(DESTDIR)$(bindir)/$$f"; \
271
+ done
272
+
273
+ clean-binPROGRAMS:
274
+ -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS)
275
+ memscore$(EXEEXT): $(memscore_OBJECTS) $(memscore_DEPENDENCIES)
276
+ @rm -f memscore$(EXEEXT)
277
+ $(CXXLINK) $(memscore_LDFLAGS) $(memscore_OBJECTS) $(memscore_LDADD) $(LIBS)
278
+
279
+ mostlyclean-compile:
280
+ -rm -f *.$(OBJEXT)
281
+
282
+ distclean-compile:
283
+ -rm -f *.tab.c
284
+
285
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/channel-scorer.Po@am__quote@
286
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lexdecom.Po@am__quote@
287
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memscore.Po@am__quote@
288
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phraselm.Po@am__quote@
289
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phrasetable.Po@am__quote@
290
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scorer.Po@am__quote@
291
+
292
+ .cpp.o:
293
+ @am__fastdepCXX_TRUE@ if $(CXXCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \
294
+ @am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi
295
+ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
296
+ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
297
+ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ $<
298
+
299
+ .cpp.obj:
300
+ @am__fastdepCXX_TRUE@ if $(CXXCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ `$(CYGPATH_W) '$<'`; \
301
+ @am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi
302
+ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
303
+ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
304
+ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
305
+ uninstall-info-am:
306
+
307
+ ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
308
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
309
+ unique=`for i in $$list; do \
310
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
311
+ done | \
312
+ $(AWK) ' { files[$$0] = 1; } \
313
+ END { for (i in files) print i; }'`; \
314
+ mkid -fID $$unique
315
+ tags: TAGS
316
+
317
+ TAGS: $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
318
+ $(TAGS_FILES) $(LISP)
319
+ tags=; \
320
+ here=`pwd`; \
321
+ list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
322
+ unique=`for i in $$list; do \
323
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
324
+ done | \
325
+ $(AWK) ' { files[$$0] = 1; } \
326
+ END { for (i in files) print i; }'`; \
327
+ if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
328
+ test -n "$$unique" || unique=$$empty_fix; \
329
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
330
+ $$tags $$unique; \
331
+ fi
332
+ ctags: CTAGS
333
+ CTAGS: $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
334
+ $(TAGS_FILES) $(LISP)
335
+ tags=; \
336
+ here=`pwd`; \
337
+ list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
338
+ unique=`for i in $$list; do \
339
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
340
+ done | \
341
+ $(AWK) ' { files[$$0] = 1; } \
342
+ END { for (i in files) print i; }'`; \
343
+ test -z "$(CTAGS_ARGS)$$tags$$unique" \
344
+ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
345
+ $$tags $$unique
346
+
347
+ GTAGS:
348
+ here=`$(am__cd) $(top_builddir) && pwd` \
349
+ && cd $(top_srcdir) \
350
+ && gtags -i $(GTAGS_ARGS) $$here
351
+
352
+ distclean-tags:
353
+ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
354
+
355
+ distdir: $(DISTFILES)
356
+ $(am__remove_distdir)
357
+ mkdir $(distdir)
358
+ $(mkdir_p) $(distdir)/m4
359
+ @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \
360
+ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \
361
+ list='$(DISTFILES)'; for file in $$list; do \
362
+ case $$file in \
363
+ $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \
364
+ $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \
365
+ esac; \
366
+ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
367
+ dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \
368
+ if test "$$dir" != "$$file" && test "$$dir" != "."; then \
369
+ dir="/$$dir"; \
370
+ $(mkdir_p) "$(distdir)$$dir"; \
371
+ else \
372
+ dir=''; \
373
+ fi; \
374
+ if test -d $$d/$$file; then \
375
+ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
376
+ cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
377
+ fi; \
378
+ cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
379
+ else \
380
+ test -f $(distdir)/$$file \
381
+ || cp -p $$d/$$file $(distdir)/$$file \
382
+ || exit 1; \
383
+ fi; \
384
+ done
385
+ -find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \
386
+ ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
387
+ ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
388
+ ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \
389
+ || chmod -R a+r $(distdir)
390
+ dist-gzip: distdir
391
+ tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
392
+ $(am__remove_distdir)
393
+
394
+ dist-bzip2: distdir
395
+ tardir=$(distdir) && $(am__tar) | bzip2 -9 -c >$(distdir).tar.bz2
396
+ $(am__remove_distdir)
397
+
398
+ dist-tarZ: distdir
399
+ tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
400
+ $(am__remove_distdir)
401
+
402
+ dist-shar: distdir
403
+ shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
404
+ $(am__remove_distdir)
405
+
406
+ dist-zip: distdir
407
+ -rm -f $(distdir).zip
408
+ zip -rq $(distdir).zip $(distdir)
409
+ $(am__remove_distdir)
410
+
411
+ dist dist-all: distdir
412
+ tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
413
+ $(am__remove_distdir)
414
+
415
+ # This target untars the dist file and tries a VPATH configuration. Then
416
+ # it guarantees that the distribution is self-contained by making another
417
+ # tarfile.
418
+ distcheck: dist
419
+ case '$(DIST_ARCHIVES)' in \
420
+ *.tar.gz*) \
421
+ GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(am__untar) ;;\
422
+ *.tar.bz2*) \
423
+ bunzip2 -c $(distdir).tar.bz2 | $(am__untar) ;;\
424
+ *.tar.Z*) \
425
+ uncompress -c $(distdir).tar.Z | $(am__untar) ;;\
426
+ *.shar.gz*) \
427
+ GZIP=$(GZIP_ENV) gunzip -c $(distdir).shar.gz | unshar ;;\
428
+ *.zip*) \
429
+ unzip $(distdir).zip ;;\
430
+ esac
431
+ chmod -R a-w $(distdir); chmod a+w $(distdir)
432
+ mkdir $(distdir)/_build
433
+ mkdir $(distdir)/_inst
434
+ chmod a-w $(distdir)
435
+ dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
436
+ && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
437
+ && cd $(distdir)/_build \
438
+ && ../configure --srcdir=.. --prefix="$$dc_install_base" \
439
+ $(DISTCHECK_CONFIGURE_FLAGS) \
440
+ && $(MAKE) $(AM_MAKEFLAGS) \
441
+ && $(MAKE) $(AM_MAKEFLAGS) dvi \
442
+ && $(MAKE) $(AM_MAKEFLAGS) check \
443
+ && $(MAKE) $(AM_MAKEFLAGS) install \
444
+ && $(MAKE) $(AM_MAKEFLAGS) installcheck \
445
+ && $(MAKE) $(AM_MAKEFLAGS) uninstall \
446
+ && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \
447
+ distuninstallcheck \
448
+ && chmod -R a-w "$$dc_install_base" \
449
+ && ({ \
450
+ (cd ../.. && umask 077 && mkdir "$$dc_destdir") \
451
+ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \
452
+ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \
453
+ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \
454
+ distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \
455
+ } || { rm -rf "$$dc_destdir"; exit 1; }) \
456
+ && rm -rf "$$dc_destdir" \
457
+ && $(MAKE) $(AM_MAKEFLAGS) dist \
458
+ && rm -rf $(DIST_ARCHIVES) \
459
+ && $(MAKE) $(AM_MAKEFLAGS) distcleancheck
460
+ $(am__remove_distdir)
461
+ @(echo "$(distdir) archives ready for distribution: "; \
462
+ list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \
463
+ sed -e '1{h;s/./=/g;p;x;}' -e '$${p;x;}'
464
+ distuninstallcheck:
465
+ @cd $(distuninstallcheck_dir) \
466
+ && test `$(distuninstallcheck_listfiles) | wc -l` -le 1 \
467
+ || { echo "ERROR: files left after uninstall:" ; \
468
+ if test -n "$(DESTDIR)"; then \
469
+ echo " (check DESTDIR support)"; \
470
+ fi ; \
471
+ $(distuninstallcheck_listfiles) ; \
472
+ exit 1; } >&2
473
+ distcleancheck: distclean
474
+ @if test '$(srcdir)' = . ; then \
475
+ echo "ERROR: distcleancheck can only run from a VPATH build" ; \
476
+ exit 1 ; \
477
+ fi
478
+ @test `$(distcleancheck_listfiles) | wc -l` -eq 0 \
479
+ || { echo "ERROR: files left in build directory after distclean:" ; \
480
+ $(distcleancheck_listfiles) ; \
481
+ exit 1; } >&2
482
+ check-am: all-am
483
+ check: check-am
484
+ all-am: Makefile $(PROGRAMS) config.h
485
+ installdirs:
486
+ for dir in "$(DESTDIR)$(bindir)"; do \
487
+ test -z "$$dir" || $(mkdir_p) "$$dir"; \
488
+ done
489
+ install: install-am
490
+ install-exec: install-exec-am
491
+ install-data: install-data-am
492
+ uninstall: uninstall-am
493
+
494
+ install-am: all-am
495
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
496
+
497
+ installcheck: installcheck-am
498
+ install-strip:
499
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
500
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
501
+ `test -z '$(STRIP)' || \
502
+ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
503
+ mostlyclean-generic:
504
+
505
+ clean-generic:
506
+
507
+ distclean-generic:
508
+ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
509
+
510
+ maintainer-clean-generic:
511
+ @echo "This command is intended for maintainers to use"
512
+ @echo "it deletes files that may require special tools to rebuild."
513
+ clean: clean-am
514
+
515
+ clean-am: clean-binPROGRAMS clean-generic mostlyclean-am
516
+
517
+ distclean: distclean-am
518
+ -rm -f $(am__CONFIG_DISTCLEAN_FILES)
519
+ -rm -rf ./$(DEPDIR)
520
+ -rm -f Makefile
521
+ distclean-am: clean-am distclean-compile distclean-generic \
522
+ distclean-hdr distclean-tags
523
+
524
+ dvi: dvi-am
525
+
526
+ dvi-am:
527
+
528
+ html: html-am
529
+
530
+ info: info-am
531
+
532
+ info-am:
533
+
534
+ install-data-am:
535
+
536
+ install-exec-am: install-binPROGRAMS
537
+
538
+ install-info: install-info-am
539
+
540
+ install-man:
541
+
542
+ installcheck-am:
543
+
544
+ maintainer-clean: maintainer-clean-am
545
+ -rm -f $(am__CONFIG_DISTCLEAN_FILES)
546
+ -rm -rf $(top_srcdir)/autom4te.cache
547
+ -rm -rf ./$(DEPDIR)
548
+ -rm -f Makefile
549
+ maintainer-clean-am: distclean-am maintainer-clean-generic
550
+
551
+ mostlyclean: mostlyclean-am
552
+
553
+ mostlyclean-am: mostlyclean-compile mostlyclean-generic
554
+
555
+ pdf: pdf-am
556
+
557
+ pdf-am:
558
+
559
+ ps: ps-am
560
+
561
+ ps-am:
562
+
563
+ uninstall-am: uninstall-binPROGRAMS uninstall-info-am
564
+
565
+ .PHONY: CTAGS GTAGS all all-am am--refresh check check-am clean \
566
+ clean-binPROGRAMS clean-generic ctags dist dist-all dist-bzip2 \
567
+ dist-gzip dist-shar dist-tarZ dist-zip distcheck distclean \
568
+ distclean-compile distclean-generic distclean-hdr \
569
+ distclean-tags distcleancheck distdir distuninstallcheck dvi \
570
+ dvi-am html html-am info info-am install install-am \
571
+ install-binPROGRAMS install-data install-data-am install-exec \
572
+ install-exec-am install-info install-info-am install-man \
573
+ install-strip installcheck installcheck-am installdirs \
574
+ maintainer-clean maintainer-clean-generic mostlyclean \
575
+ mostlyclean-compile mostlyclean-generic pdf pdf-am ps ps-am \
576
+ tags uninstall uninstall-am uninstall-binPROGRAMS \
577
+ uninstall-info-am
578
+
579
+ # Tell versions [3.59,3.63) of GNU make to not export all variables.
580
+ # Otherwise a system limit (for SysV at least) may be exceeded.
581
+ .NOEXPORT:
mosesdecoder/contrib/memscore/configure.ac ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ # Christian Hardmeier, FBK-irst, Trento, 2010
3
+ # $Id$
4
+
5
+ # Process this file with autoconf to produce a configure script.
6
+
7
+ AC_INIT([memscore], [1.0], [hardmeier at fbk.eu])
8
+ AM_INIT_AUTOMAKE
9
+ AC_LANG([C++])
10
+
11
+ AC_ARG_WITH(irstlm,
12
+ [AC_HELP_STRING([--with-irstlm=PATH], [(optional) path to the IRSTLM toolkit])],
13
+ [with_irstlm=$withval],
14
+ [with_irstlm=check])
15
+
16
+ AC_ARG_WITH([gsl],
17
+ [AC_HELP_STRING([--with-gsl=PATH], [path to the GSL library])],
18
+ [with_gsl=$withval
19
+ CPPFLAGS="$CPPFLAGS -I$with_gsl/include"
20
+ LDFLAGS="$LDFLAGS -L$with_gsl/lib"],
21
+ [with_gsl=check])
22
+
23
+ AC_ARG_ENABLE([channel],
24
+ [AC_HELP_STRING([--enable-channel], [feature not yet publicly available])],
25
+ [AC_DEFINE(ENABLE_CHANNEL_SCORER, [], [Define to enable channel scorer])],
26
+ [enable_channel=no])
27
+
28
+ AC_PREREQ([2.63])
29
+ AC_CONFIG_SRCDIR([memscore.cpp])
30
+ AC_CONFIG_HEADERS([config.h])
31
+
32
+ # Checks for programs.
33
+ AC_PROG_CXX
34
+ AC_PROG_CC
35
+
36
+ # Checks for libraries.
37
+ AX_BOOST_BASE([1.35.0])
38
+
39
+ AC_CHECK_LIB([m], [cos])
40
+ AC_CHECK_LIB([z], [gzopen])
41
+
42
+ have_gsl=yes
43
+ AC_CHECK_LIB([gslcblas],[cblas_dgemm], [], [have_gsl=no])
44
+ AC_CHECK_LIB([gsl],[gsl_blas_dgemm], [], [have_gsl=no])
45
+
46
+ AS_IF([test x$with_irstlm = xcheck],
47
+ [AC_CHECK_HEADER([n_gram.h],
48
+ [AC_DEFINE([HAVE_IRSTLM], [], [flag for IRSTLM])],
49
+ [with_irstlm=no])]
50
+ ,
51
+ [SAVE_CPPFLAGS="$CPPFLAGS"
52
+ CPPFLAGS="$CPPFLAGS -I${with_irstlm}/include"
53
+
54
+ AC_CHECK_HEADER(n_gram.h,
55
+ [AC_DEFINE([HAVE_IRSTLM], [], [flag for IRSTLM])],
56
+ [AC_MSG_ERROR([Cannot find IRSTLM!])])
57
+
58
+ MY_ARCH=`uname -m`
59
+ LIB_IRSTLM="-lirstlm"
60
+ LDFLAGS="$LDFLAGS -L${with_irstlm}/lib/${MY_ARCH}"
61
+ LIBS="$LIBS $LIB_IRSTLM"
62
+ FMTLIBS="$FMTLIBS libirstlm.a"]
63
+ )
64
+ AM_CONDITIONAL([IRSTLM], [test x$with_irstlm != xno])
65
+
66
+ AS_IF([test x$enable_channel = xyes],
67
+ [AS_IF([test x$with_irstlm = xno || test x$have_gsl = xno],
68
+ [AC_MSG_ERROR([The channel scorer needs both GSL and irstlm.])])])
69
+
70
+ # Checks for header files.
71
+ #AC_CHECK_HEADERS([fenv.h sys/time.h])
72
+
73
+ # Checks for typedefs, structures, and compiler characteristics.
74
+ AC_TYPE_SIZE_T
75
+ AC_CHECK_TYPES([ptrdiff_t])
76
+
77
+ # Checks for library functions.
78
+ #AC_FUNC_MALLOC
79
+ #AC_CHECK_FUNCS([getpagesize gettimeofday])
80
+
81
+ AM_CONDITIONAL(CHANNEL_SCORER, test x$enable_channel = xyes)
82
+
83
+ AC_CONFIG_FILES([Makefile])
84
+ AC_OUTPUT
mosesdecoder/contrib/memscore/lexdecom.h ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * File: lexdecom.h
3
+ * Author: Felipe Sánchez-Martínez, Universitat d'Alacant <fsanchez@dlsi.ua.es>
4
+ *
5
+ * Created on 2010/01/27
6
+ */
7
+
8
+ #ifndef _LEXDECOM_H
9
+ #define _LEXDECOM_H
10
+
11
+ #include "phrasetable.h"
12
+ #include "scorer.h"
13
+
14
+ class LexicalDecompositionPhraseScorer : public PhraseScorer
15
+ {
16
+ private:
17
+ explicit LexicalDecompositionPhraseScorer(PhraseTable &pd, bool reverse, const String &lwfile,
18
+ const char *argv[], int &argp, const PhraseScorerFactory &ptf);
19
+
20
+ virtual void do_score_phrases();
21
+ virtual Score do_get_score(const PhraseTable::const_iterator &it);
22
+
23
+ Score get_weight(const String &s_src, const String &s_tgt) const;
24
+ Score get_weight(Count src, Count tgt) const;
25
+
26
+ typedef std::map<std::pair<Count,Count>, Score> WeightMapType_;
27
+
28
+ WeightMapType_ weight_map_;
29
+
30
+ // p(J|I) = probability of source-length J given target-length I
31
+ std::map<unsigned, std::map<unsigned, Score> > prob_srclen_tgtlen_;
32
+
33
+ Score get_noisy_or_combination(Count src_word, PhraseInfo &tgt_phrase);
34
+
35
+ PhraseScorer* black_box_scorer;
36
+
37
+ public:
38
+ static PhraseScorer *create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf);
39
+ };
40
+
41
+ #endif /* _LEXDECOM_H */
mosesdecoder/contrib/memscore/memscore.cpp ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #include <iostream>
6
+ #include <vector>
7
+
8
+ #include "phrasetable.h"
9
+ #include "scorer.h"
10
+
11
+ const char *progname;
12
+
13
+ typedef PhrasePairInfo::AlignmentVector::value_type VP;
14
+
15
+ bool cmp_counts(const VP &a1, const VP &a2);
16
+ int main(int argc, const char *argv[]);
17
+
18
+ bool cmp_counts(const VP &a1, const VP &a2)
19
+ {
20
+ return a1.second < a2.second;
21
+ }
22
+
23
+ int main(int argc, const char *argv[])
24
+ {
25
+ progname = argv[0];
26
+
27
+ if(argc == 1) {
28
+ std::cerr << "No scorers specified." << std::endl;
29
+ usage();
30
+ }
31
+
32
+ MemoryPhraseTable pt;
33
+ PhraseScorerFactory psf(pt);
34
+
35
+ typedef std::vector<PhraseScorer *> ScorerList;
36
+ ScorerList scorers;
37
+
38
+ for(int argp = 1; argp < argc; ) {
39
+ bool reverse;
40
+ if(!strcmp(argv[argp], "-s"))
41
+ reverse = false;
42
+ else if(!strcmp(argv[argp], "-r"))
43
+ reverse = true;
44
+ else
45
+ usage();
46
+
47
+ scorers.push_back(psf.create_scorer(argv, ++argp, reverse));
48
+ }
49
+
50
+ pt.load_data(std::cin);
51
+ pt.compute_phrase_statistics();
52
+
53
+ for(ScorerList::iterator s = scorers.begin(); s != scorers.end(); ++s)
54
+ (*s)->score_phrases();
55
+
56
+ for(PhrasePairCounts::const_iterator it = pt.raw_begin(); it != pt.raw_end(); ++it) {
57
+ PhrasePairInfo ppi(it);
58
+ Phrase src = ppi.get_src();
59
+ Phrase tgt = ppi.get_tgt();
60
+ const PhrasePairInfo::AlignmentVector av = ppi.get_alignments();
61
+
62
+ PhraseAlignment alig = std::max_element(av.begin(), av.end(), cmp_counts)->first;
63
+
64
+ std::cout << pt.get_src_phrase(src) << " ||| " << pt.get_tgt_phrase(tgt) << " ||| " << alig << " |||";
65
+
66
+ for(ScorerList::iterator s = scorers.begin(); s != scorers.end(); ++s)
67
+ std::cout << ' ' << (*s)->get_score(it);
68
+ std::cout << '\n'; // don't use std::endl to avoid flushing
69
+ }
70
+ }
71
+
72
+ void usage()
73
+ {
74
+ std::cerr << "Usage: " << progname << " <scorer1> <scorer2> ..." << std::endl <<
75
+ " where each scorer is specified as" << std::endl <<
76
+ " -s <scorer> <args> to estimate p(s|t)" << std::endl <<
77
+ " -r <scorer> <args> to estimate p(t|s)" << std::endl << std::endl;
78
+
79
+ std::cerr << "Implemented scorers:" << std::endl;
80
+
81
+ const std::vector<String> &v = PhraseScorerFactory::scorer_list();
82
+ std::copy(v.begin(), v.end(), std::ostream_iterator<std::string>(std::cerr, "\n"));
83
+
84
+ exit(1);
85
+ }
mosesdecoder/contrib/memscore/memscore.h ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #ifndef MEMSCORE_H
6
+ #define MEMSCORE_H
7
+
8
+ #include <sstream>
9
+ #include <string>
10
+ #include <utility>
11
+
12
+ #include "config.h"
13
+
14
+ #ifndef HAVE_PTRDIFF_T
15
+ typedef long ptrdiff_t;
16
+ #endif
17
+
18
+ #ifdef __GNUC__
19
+ #define NORETURN __attribute__ ((noreturn))
20
+ #else
21
+ #define NORETURN
22
+ #endif
23
+
24
+ void usage() NORETURN;
25
+
26
+ typedef double Score;
27
+ typedef unsigned int Count;
28
+ typedef unsigned int Phrase;
29
+ typedef ptrdiff_t DataIndex;
30
+ typedef std::pair<Phrase,Phrase> PhrasePair;
31
+ typedef char *PhrasePairData;
32
+ typedef std::string String;
33
+ typedef std::istringstream IStringStream;
34
+
35
+ /* phrasetable.h */
36
+
37
+ class PhraseText;
38
+ class PhraseInfo;
39
+ class PhraseInfoList;
40
+ class PhraseAlignment;
41
+ class PhrasePairInfo;
42
+ class PhraseTable;
43
+
44
+ /* scorer.h */
45
+
46
+ class PhraseScorer;
47
+
48
+ /* statistic.h */
49
+
50
+ class PhraseStatistic;
51
+
52
+ /* IRSTLM */
53
+
54
+ class lmtable;
55
+ class ngram;
56
+
57
+ #endif
mosesdecoder/contrib/memscore/missing ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /bin/sh
2
+ # Common stub for a few missing GNU programs while installing.
3
+
4
+ scriptversion=2005-06-08.21
5
+
6
+ # Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005
7
+ # Free Software Foundation, Inc.
8
+ # Originally by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
9
+
10
+ # This program is free software; you can redistribute it and/or modify
11
+ # it under the terms of the GNU General Public License as published by
12
+ # the Free Software Foundation; either version 2, or (at your option)
13
+ # any later version.
14
+
15
+ # This program is distributed in the hope that it will be useful,
16
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
+ # GNU General Public License for more details.
19
+
20
+ # You should have received a copy of the GNU General Public License
21
+ # along with this program; if not, write to the Free Software
22
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23
+ # 02110-1301, USA.
24
+
25
+ # As a special exception to the GNU General Public License, if you
26
+ # distribute this file as part of a program that contains a
27
+ # configuration script generated by Autoconf, you may include it under
28
+ # the same distribution terms that you use for the rest of that program.
29
+
30
+ if test $# -eq 0; then
31
+ echo 1>&2 "Try \`$0 --help' for more information"
32
+ exit 1
33
+ fi
34
+
35
+ run=:
36
+
37
+ # In the cases where this matters, `missing' is being run in the
38
+ # srcdir already.
39
+ if test -f configure.ac; then
40
+ configure_ac=configure.ac
41
+ else
42
+ configure_ac=configure.in
43
+ fi
44
+
45
+ msg="missing on your system"
46
+
47
+ case "$1" in
48
+ --run)
49
+ # Try to run requested program, and just exit if it succeeds.
50
+ run=
51
+ shift
52
+ "$@" && exit 0
53
+ # Exit code 63 means version mismatch. This often happens
54
+ # when the user try to use an ancient version of a tool on
55
+ # a file that requires a minimum version. In this case we
56
+ # we should proceed has if the program had been absent, or
57
+ # if --run hadn't been passed.
58
+ if test $? = 63; then
59
+ run=:
60
+ msg="probably too old"
61
+ fi
62
+ ;;
63
+
64
+ -h|--h|--he|--hel|--help)
65
+ echo "\
66
+ $0 [OPTION]... PROGRAM [ARGUMENT]...
67
+
68
+ Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an
69
+ error status if there is no known handling for PROGRAM.
70
+
71
+ Options:
72
+ -h, --help display this help and exit
73
+ -v, --version output version information and exit
74
+ --run try to run the given command, and emulate it if it fails
75
+
76
+ Supported PROGRAM values:
77
+ aclocal touch file \`aclocal.m4'
78
+ autoconf touch file \`configure'
79
+ autoheader touch file \`config.h.in'
80
+ automake touch all \`Makefile.in' files
81
+ bison create \`y.tab.[ch]', if possible, from existing .[ch]
82
+ flex create \`lex.yy.c', if possible, from existing .c
83
+ help2man touch the output file
84
+ lex create \`lex.yy.c', if possible, from existing .c
85
+ makeinfo touch the output file
86
+ tar try tar, gnutar, gtar, then tar without non-portable flags
87
+ yacc create \`y.tab.[ch]', if possible, from existing .[ch]
88
+
89
+ Send bug reports to <bug-automake@gnu.org>."
90
+ exit $?
91
+ ;;
92
+
93
+ -v|--v|--ve|--ver|--vers|--versi|--versio|--version)
94
+ echo "missing $scriptversion (GNU Automake)"
95
+ exit $?
96
+ ;;
97
+
98
+ -*)
99
+ echo 1>&2 "$0: Unknown \`$1' option"
100
+ echo 1>&2 "Try \`$0 --help' for more information"
101
+ exit 1
102
+ ;;
103
+
104
+ esac
105
+
106
+ # Now exit if we have it, but it failed. Also exit now if we
107
+ # don't have it and --version was passed (most likely to detect
108
+ # the program).
109
+ case "$1" in
110
+ lex|yacc)
111
+ # Not GNU programs, they don't have --version.
112
+ ;;
113
+
114
+ tar)
115
+ if test -n "$run"; then
116
+ echo 1>&2 "ERROR: \`tar' requires --run"
117
+ exit 1
118
+ elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
119
+ exit 1
120
+ fi
121
+ ;;
122
+
123
+ *)
124
+ if test -z "$run" && ($1 --version) > /dev/null 2>&1; then
125
+ # We have it, but it failed.
126
+ exit 1
127
+ elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
128
+ # Could not run --version or --help. This is probably someone
129
+ # running `$TOOL --version' or `$TOOL --help' to check whether
130
+ # $TOOL exists and not knowing $TOOL uses missing.
131
+ exit 1
132
+ fi
133
+ ;;
134
+ esac
135
+
136
+ # If it does not exist, or fails to run (possibly an outdated version),
137
+ # try to emulate it.
138
+ case "$1" in
139
+ aclocal*)
140
+ echo 1>&2 "\
141
+ WARNING: \`$1' is $msg. You should only need it if
142
+ you modified \`acinclude.m4' or \`${configure_ac}'. You might want
143
+ to install the \`Automake' and \`Perl' packages. Grab them from
144
+ any GNU archive site."
145
+ touch aclocal.m4
146
+ ;;
147
+
148
+ autoconf)
149
+ echo 1>&2 "\
150
+ WARNING: \`$1' is $msg. You should only need it if
151
+ you modified \`${configure_ac}'. You might want to install the
152
+ \`Autoconf' and \`GNU m4' packages. Grab them from any GNU
153
+ archive site."
154
+ touch configure
155
+ ;;
156
+
157
+ autoheader)
158
+ echo 1>&2 "\
159
+ WARNING: \`$1' is $msg. You should only need it if
160
+ you modified \`acconfig.h' or \`${configure_ac}'. You might want
161
+ to install the \`Autoconf' and \`GNU m4' packages. Grab them
162
+ from any GNU archive site."
163
+ files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}`
164
+ test -z "$files" && files="config.h"
165
+ touch_files=
166
+ for f in $files; do
167
+ case "$f" in
168
+ *:*) touch_files="$touch_files "`echo "$f" |
169
+ sed -e 's/^[^:]*://' -e 's/:.*//'`;;
170
+ *) touch_files="$touch_files $f.in";;
171
+ esac
172
+ done
173
+ touch $touch_files
174
+ ;;
175
+
176
+ automake*)
177
+ echo 1>&2 "\
178
+ WARNING: \`$1' is $msg. You should only need it if
179
+ you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'.
180
+ You might want to install the \`Automake' and \`Perl' packages.
181
+ Grab them from any GNU archive site."
182
+ find . -type f -name Makefile.am -print |
183
+ sed 's/\.am$/.in/' |
184
+ while read f; do touch "$f"; done
185
+ ;;
186
+
187
+ autom4te)
188
+ echo 1>&2 "\
189
+ WARNING: \`$1' is needed, but is $msg.
190
+ You might have modified some files without having the
191
+ proper tools for further handling them.
192
+ You can get \`$1' as part of \`Autoconf' from any GNU
193
+ archive site."
194
+
195
+ file=`echo "$*" | sed -n 's/.*--output[ =]*\([^ ]*\).*/\1/p'`
196
+ test -z "$file" && file=`echo "$*" | sed -n 's/.*-o[ ]*\([^ ]*\).*/\1/p'`
197
+ if test -f "$file"; then
198
+ touch $file
199
+ else
200
+ test -z "$file" || exec >$file
201
+ echo "#! /bin/sh"
202
+ echo "# Created by GNU Automake missing as a replacement of"
203
+ echo "# $ $@"
204
+ echo "exit 0"
205
+ chmod +x $file
206
+ exit 1
207
+ fi
208
+ ;;
209
+
210
+ bison|yacc)
211
+ echo 1>&2 "\
212
+ WARNING: \`$1' $msg. You should only need it if
213
+ you modified a \`.y' file. You may need the \`Bison' package
214
+ in order for those modifications to take effect. You can get
215
+ \`Bison' from any GNU archive site."
216
+ rm -f y.tab.c y.tab.h
217
+ if [ $# -ne 1 ]; then
218
+ eval LASTARG="\${$#}"
219
+ case "$LASTARG" in
220
+ *.y)
221
+ SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
222
+ if [ -f "$SRCFILE" ]; then
223
+ cp "$SRCFILE" y.tab.c
224
+ fi
225
+ SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'`
226
+ if [ -f "$SRCFILE" ]; then
227
+ cp "$SRCFILE" y.tab.h
228
+ fi
229
+ ;;
230
+ esac
231
+ fi
232
+ if [ ! -f y.tab.h ]; then
233
+ echo >y.tab.h
234
+ fi
235
+ if [ ! -f y.tab.c ]; then
236
+ echo 'main() { return 0; }' >y.tab.c
237
+ fi
238
+ ;;
239
+
240
+ lex|flex)
241
+ echo 1>&2 "\
242
+ WARNING: \`$1' is $msg. You should only need it if
243
+ you modified a \`.l' file. You may need the \`Flex' package
244
+ in order for those modifications to take effect. You can get
245
+ \`Flex' from any GNU archive site."
246
+ rm -f lex.yy.c
247
+ if [ $# -ne 1 ]; then
248
+ eval LASTARG="\${$#}"
249
+ case "$LASTARG" in
250
+ *.l)
251
+ SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
252
+ if [ -f "$SRCFILE" ]; then
253
+ cp "$SRCFILE" lex.yy.c
254
+ fi
255
+ ;;
256
+ esac
257
+ fi
258
+ if [ ! -f lex.yy.c ]; then
259
+ echo 'main() { return 0; }' >lex.yy.c
260
+ fi
261
+ ;;
262
+
263
+ help2man)
264
+ echo 1>&2 "\
265
+ WARNING: \`$1' is $msg. You should only need it if
266
+ you modified a dependency of a manual page. You may need the
267
+ \`Help2man' package in order for those modifications to take
268
+ effect. You can get \`Help2man' from any GNU archive site."
269
+
270
+ file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'`
271
+ if test -z "$file"; then
272
+ file=`echo "$*" | sed -n 's/.*--output=\([^ ]*\).*/\1/p'`
273
+ fi
274
+ if [ -f "$file" ]; then
275
+ touch $file
276
+ else
277
+ test -z "$file" || exec >$file
278
+ echo ".ab help2man is required to generate this page"
279
+ exit 1
280
+ fi
281
+ ;;
282
+
283
+ makeinfo)
284
+ echo 1>&2 "\
285
+ WARNING: \`$1' is $msg. You should only need it if
286
+ you modified a \`.texi' or \`.texinfo' file, or any other file
287
+ indirectly affecting the aspect of the manual. The spurious
288
+ call might also be the consequence of using a buggy \`make' (AIX,
289
+ DU, IRIX). You might want to install the \`Texinfo' package or
290
+ the \`GNU make' package. Grab either from any GNU archive site."
291
+ # The file to touch is that specified with -o ...
292
+ file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'`
293
+ if test -z "$file"; then
294
+ # ... or it is the one specified with @setfilename ...
295
+ infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'`
296
+ file=`sed -n '/^@setfilename/ { s/.* \([^ ]*\) *$/\1/; p; q; }' $infile`
297
+ # ... or it is derived from the source name (dir/f.texi becomes f.info)
298
+ test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info
299
+ fi
300
+ # If the file does not exist, the user really needs makeinfo;
301
+ # let's fail without touching anything.
302
+ test -f $file || exit 1
303
+ touch $file
304
+ ;;
305
+
306
+ tar)
307
+ shift
308
+
309
+ # We have already tried tar in the generic part.
310
+ # Look for gnutar/gtar before invocation to avoid ugly error
311
+ # messages.
312
+ if (gnutar --version > /dev/null 2>&1); then
313
+ gnutar "$@" && exit 0
314
+ fi
315
+ if (gtar --version > /dev/null 2>&1); then
316
+ gtar "$@" && exit 0
317
+ fi
318
+ firstarg="$1"
319
+ if shift; then
320
+ case "$firstarg" in
321
+ *o*)
322
+ firstarg=`echo "$firstarg" | sed s/o//`
323
+ tar "$firstarg" "$@" && exit 0
324
+ ;;
325
+ esac
326
+ case "$firstarg" in
327
+ *h*)
328
+ firstarg=`echo "$firstarg" | sed s/h//`
329
+ tar "$firstarg" "$@" && exit 0
330
+ ;;
331
+ esac
332
+ fi
333
+
334
+ echo 1>&2 "\
335
+ WARNING: I can't seem to be able to run \`tar' with the given arguments.
336
+ You may want to install GNU tar or Free paxutils, or check the
337
+ command line arguments."
338
+ exit 1
339
+ ;;
340
+
341
+ *)
342
+ echo 1>&2 "\
343
+ WARNING: \`$1' is needed, and is $msg.
344
+ You might have modified some files without having the
345
+ proper tools for further handling them. Check the \`README' file,
346
+ it often tells you about the needed prerequisites for installing
347
+ this package. You may also peek at any GNU archive site, in case
348
+ some other package would contain this missing \`$1' program."
349
+ exit 1
350
+ ;;
351
+ esac
352
+
353
+ exit 0
354
+
355
+ # Local variables:
356
+ # eval: (add-hook 'write-file-hooks 'time-stamp)
357
+ # time-stamp-start: "scriptversion="
358
+ # time-stamp-format: "%:y-%02m-%02d.%02H"
359
+ # time-stamp-end: "$"
360
+ # End:
mosesdecoder/contrib/memscore/phraselm.h ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #ifndef PHRASELM_H
6
+ #define PHRASELM_H
7
+
8
+ #include <cassert>
9
+
10
+ #include "memscore.h"
11
+ #include "phrasetable.h"
12
+ #include "statistic.h"
13
+
14
+ class lmtable;
15
+
16
+ class PhraseLanguageModel : public PhraseStatistic
17
+ {
18
+ protected:
19
+ String lmfile_;
20
+ Count score_idx_;
21
+
22
+ PhraseInfoList *phrase_info_list_;
23
+
24
+ void compute_lmscores(PhraseInfoList &phrase_info_list, bool closed_world);
25
+
26
+ public:
27
+ PhraseLanguageModel(String lmfile) : lmfile_(lmfile) {}
28
+
29
+ virtual void attach(PhraseInfoList &pilist);
30
+ virtual void compute_statistic();
31
+
32
+ virtual Score get_score(PhraseInfo &pi) {
33
+ assert(computation_done_);
34
+ return pi.data(score_idx_);
35
+ }
36
+ };
37
+
38
+ class ClosedPhraseLanguageModel : public PhraseLanguageModel
39
+ {
40
+ public:
41
+ ClosedPhraseLanguageModel(String lmfile) : PhraseLanguageModel(lmfile) {}
42
+ virtual void compute_statistic();
43
+ };
44
+
45
+ #endif
mosesdecoder/contrib/memscore/phrasetable.cpp ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #include "phrasetable.h"
6
+ #include "statistic.h"
7
+ #include "timestamp.h"
8
+
9
+ #include <iostream>
10
+ #include <limits>
11
+ #include <sstream>
12
+ #include <string>
13
+
14
+ /* PhraseText */
15
+
16
+ PhraseText::DictionaryType_ PhraseText::dictionary_;
17
+ Count PhraseText::last_id_ = 1;
18
+
19
+ PhraseText::PhraseText(const String &s)
20
+ {
21
+ IStringStream is(s);
22
+ while(is.good()) {
23
+ String w;
24
+ getline(is, w, ' ');
25
+ Count *id = boost::fast_pool_allocator<Count>::allocate(1);
26
+ *id = index_word(w);
27
+ word_list_.push_back(id);
28
+ }
29
+ }
30
+
31
+ std::ostream &operator<<(std::ostream &os, const PhraseText &pt)
32
+ {
33
+ bool print_space = false;
34
+ for(PhraseText::const_string_iterator it = pt.string_begin(); it != pt.string_end(); it++) {
35
+ if(print_space)
36
+ os << ' ';
37
+ else
38
+ print_space = true;
39
+
40
+ os << *it;
41
+ }
42
+
43
+ return os;
44
+ }
45
+
46
+ /* PhraseAlignment */
47
+
48
+ PhraseAlignment::Alignment::AlignmentMapType_ PhraseAlignment::Alignment::alignment_map_;
49
+ PhraseAlignment::Alignment::AlignmentVectorType_ PhraseAlignment::Alignment::alignment_vector_;
50
+
51
+ PhraseAlignment::Alignment::Alignment(Count slen, Count tlen, const String &alignment) :
52
+ slen_(slen), tlen_(tlen), matrix_(slen * tlen, false)
53
+ {
54
+ assert(slen_ > 0 && slen_ < 10);
55
+ IStringStream is(alignment);
56
+ while(is.good()) {
57
+ String a;
58
+ getline(is, a, ' ');
59
+ IStringStream ap(a);
60
+ Count s, t;
61
+ char dash;
62
+ ap >> s >> dash >> t;
63
+ assert(s < slen && t < tlen);
64
+ assert(dash == '-');
65
+ matrix_[t * slen + s] = true;
66
+ }
67
+ }
68
+
69
+ Count PhraseAlignment::Alignment::index_alignment(Count slen, Count tlen, const String &alignment)
70
+ {
71
+ AlignmentTuple_ tup = boost::make_tuple(slen, tlen, alignment);
72
+ AlignmentMapType_::const_iterator it = alignment_map_.find(tup);
73
+
74
+ if(it == alignment_map_.end()) {
75
+ const Alignment *pa = new Alignment(slen, tlen, alignment);
76
+ Count index = alignment_vector_.size();
77
+ alignment_map_.insert(std::make_pair(tup, index));
78
+ alignment_vector_.push_back(pa);
79
+ return index;
80
+ } else
81
+ return it->second;
82
+ }
83
+
84
+ std::ostream &operator<<(std::ostream &os, const PhraseAlignment::Alignment &pa)
85
+ {
86
+ bool print_space = false;
87
+ for(Count i = 0; i < pa.matrix_.size(); i++) {
88
+ if(print_space)
89
+ os << ' ';
90
+ else
91
+ print_space = true;
92
+
93
+ os << (i / pa.slen_) << '-' << (i % pa.slen_);
94
+ }
95
+
96
+ return os;
97
+ }
98
+
99
+ std::ostream &operator<<(std::ostream &os, const PhraseAlignment &pa)
100
+ {
101
+ for(Count s = 0; s < pa.get_source_length(); s++) {
102
+ os << '(';
103
+ bool print_comma = false;
104
+ for(Count t = 0; t < pa.get_target_length(); t++) {
105
+ if(pa.is_aligned(s, t)) {
106
+ if(print_comma)
107
+ os << ',';
108
+ else
109
+ print_comma = true;
110
+
111
+ os << t;
112
+ }
113
+ }
114
+ os << ") ";
115
+ }
116
+
117
+ os << "|||";
118
+
119
+ for(Count t = 0; t < pa.get_target_length(); t++) {
120
+ os << " (";
121
+ bool print_comma = false;
122
+ for(Count s = 0; s < pa.get_source_length(); s++) {
123
+ if(pa.is_aligned(s, t)) {
124
+ if(print_comma)
125
+ os << ',';
126
+ else
127
+ print_comma = true;
128
+
129
+ os << s;
130
+ }
131
+ }
132
+ os << ')';
133
+ }
134
+
135
+ return os;
136
+ }
137
+
138
+ /* PhrasePairInfo */
139
+
140
+ bool PhrasePairInfo::init_phase_ = true;
141
+ Count PhrasePairInfo::data_ncounts_ = COUNT_FREE_IDX;
142
+ Count PhrasePairInfo::data_nscores_ = SCORE_FREE_IDX;
143
+ const Count PhrasePairInfo::CONTINUATION_BIT = 1 << (std::numeric_limits<Count>::digits - 1);
144
+
145
+ PhrasePairInfo::PhrasePairInfo(Count src, Count tgt, Count alignment, Count count) :
146
+ src_(src), tgt_(tgt), data_(NULL), reverse_(false)
147
+ {
148
+ init_phase_ = false;
149
+ realloc_data(1);
150
+ count_data(COUNT_COUNT_IDX) = count;
151
+ Count *aligd = alignment_data(0);
152
+ aligd[0] = alignment;
153
+ aligd[1] = count;
154
+ }
155
+
156
+ DataIndex PhrasePairInfo::register_score_data(Count size)
157
+ {
158
+ assert(init_phase_);
159
+
160
+ Count start = data_nscores_;
161
+ data_nscores_ += size;
162
+ return start;
163
+ }
164
+
165
+ DataIndex PhrasePairInfo::register_count_data(Count size)
166
+ {
167
+ assert(init_phase_);
168
+
169
+ Count start = data_ncounts_;
170
+ data_ncounts_ += size;
171
+ return start;
172
+ }
173
+
174
+ PhrasePairInfo::AlignmentVector PhrasePairInfo::get_alignments() const
175
+ {
176
+ PhrasePairInfo::AlignmentVector vec;
177
+
178
+ Count i = 0;
179
+ bool last;
180
+ do {
181
+ const Count *aligd = alignment_data(i++);
182
+ last = !(aligd[0] & CONTINUATION_BIT);
183
+ Count alig = aligd[0] & ~CONTINUATION_BIT;
184
+ vec.push_back(std::make_pair(PhraseAlignment(alig, reverse_), aligd[1]));
185
+ } while(!last);
186
+
187
+ return vec;
188
+ }
189
+
190
+ void PhrasePairInfo::add_alignment(Count new_alignment)
191
+ {
192
+ Count i = 0;
193
+ bool last;
194
+ do {
195
+ Count *aligd = alignment_data(i++);
196
+ last = !(aligd[0] & CONTINUATION_BIT);
197
+ Count alig = aligd[0] & ~CONTINUATION_BIT;
198
+ if(alig == new_alignment) {
199
+ aligd[1]++;
200
+ return;
201
+ }
202
+ } while(!last);
203
+
204
+ realloc_data(i + 1);
205
+
206
+ Count *last_aligd = alignment_data(i - 1);
207
+ last_aligd[0] |= CONTINUATION_BIT;
208
+
209
+ Count *this_aligd = alignment_data(i);
210
+ this_aligd[0] = new_alignment;
211
+ this_aligd[1] = 1;
212
+ }
213
+
214
+ void PhrasePairInfo::realloc_data(Count nalignments)
215
+ {
216
+ static boost::pool<> *pool[3] = { NULL, NULL, NULL };
217
+
218
+ size_t fixed_size = data_nscores_ * sizeof(Score) + data_ncounts_ * sizeof(Count);
219
+ size_t new_data_size = fixed_size + COUNTS_PER_ALIGNMENT * nalignments * sizeof(Count);
220
+
221
+ PhrasePairData new_data;
222
+ if(nalignments <= 3) {
223
+ if(!pool[nalignments - 1])
224
+ pool[nalignments - 1] = new boost::pool<>(new_data_size);
225
+
226
+ new_data = reinterpret_cast<PhrasePairData>(pool[nalignments - 1]->malloc());
227
+ } else
228
+ new_data = new char[new_data_size];
229
+
230
+ if(data_) {
231
+ memcpy(new_data, data_, fixed_size);
232
+ Count i = 0;
233
+ Count *old_aligd, *new_aligd;
234
+ do {
235
+ assert(i < nalignments);
236
+ old_aligd = alignment_data(data_, i);
237
+ new_aligd = alignment_data(new_data, i);
238
+ new_aligd[0] = old_aligd[0];
239
+ new_aligd[1] = old_aligd[1];
240
+ i++;
241
+ } while(old_aligd[0] & CONTINUATION_BIT);
242
+ if(nalignments <= 4)
243
+ pool[nalignments - 2]->free(data_);
244
+ else
245
+ delete[] data_;
246
+ }
247
+
248
+ data_ = new_data;
249
+ }
250
+
251
+ /* PhraseInfoList */
252
+
253
+ Phrase PhraseInfoList::index_phrase(const String &s_phr)
254
+ {
255
+ IDMapType_::const_iterator it = idmap_.find(s_phr);
256
+ if(it != idmap_.end())
257
+ return it->second;
258
+
259
+ PhraseInfo *pi = phrase_info_pool_.construct(data_size_, s_phr);
260
+
261
+ list_.push_back(pi);
262
+ idmap_[s_phr] = list_.size() - 1;
263
+ return idmap_[s_phr];
264
+ }
265
+
266
+ DataIndex PhraseInfoList::register_data(Count size)
267
+ {
268
+ DataIndex start = data_size_;
269
+ data_size_ += size;
270
+ return start;
271
+ }
272
+
273
+ void PhraseInfoList::attach_statistic(PhraseStatistic &s)
274
+ {
275
+ statistics_.push_back(&s);
276
+ s.attach(*this);
277
+ }
278
+
279
+ void PhraseInfoList::compute_statistics()
280
+ {
281
+ while(!statistics_.empty()) {
282
+ statistics_.front()->compute_statistic();
283
+ statistics_.pop_front();
284
+ }
285
+ }
286
+
287
+ /* PhraseTable */
288
+
289
+ void MemoryPhraseTable::load_data(std::istream &instream)
290
+ {
291
+ Count total_count = 0;
292
+
293
+ Timestamp t_load;
294
+ Count nlines = 1;
295
+ String line;
296
+ while(getline(instream, line)) {
297
+ size_t sep1 = line.find(" ||| ");
298
+ if(sep1 == line.npos) {
299
+ std::cerr << "Phrase separator not found in: " << line << std::endl;
300
+ abort();
301
+ }
302
+ size_t sep2 = line.find(" ||| ", sep1 + 1);
303
+ String s_src(line, 0, sep1);
304
+ String s_tgt(line, sep1 + 5, sep2 - sep1 - 5);
305
+ String s_alignment(line, sep2 + 5);
306
+
307
+ Phrase src = src_info_.index_phrase(s_src);
308
+ Phrase tgt = tgt_info_.index_phrase(s_tgt);
309
+ Count alignment = PhraseAlignment::index_alignment(src_info_[src].get_phrase().size(), tgt_info_[tgt].get_phrase().size(), s_alignment);
310
+
311
+ src_info_[src].inc_count();
312
+ tgt_info_[tgt].inc_count();
313
+ total_count++;
314
+
315
+ PhrasePair stpair(src, tgt);
316
+ PhrasePairCounts::iterator it = joint_counts_.find(stpair);
317
+
318
+ if(it == joint_counts_.end()) {
319
+ src_info_[src].inc_distinct();
320
+ tgt_info_[tgt].inc_distinct();
321
+ joint_counts_.insert(std::make_pair(stpair, PhrasePairInfo(src, tgt, alignment, 1).get_phrase_pair_data()));
322
+ } else {
323
+ PhrasePairInfo pi(src, tgt, it->second);
324
+ pi.inc_count();
325
+ pi.add_alignment(alignment);
326
+ it->second = pi.get_phrase_pair_data(); // may have changed by adding the alignment
327
+ }
328
+ if(nlines % 50000 == 0)
329
+ std:: cerr << "Read " << nlines << " lines in " << (t_load.elapsed_time() / 1000) << " ms." << std::endl;
330
+ nlines++;
331
+ }
332
+ }
333
+
334
+ void MemoryPhraseTable::attach_src_statistic(PhraseStatistic &s)
335
+ {
336
+ src_info_.attach_statistic(s);
337
+ }
338
+
339
+ void MemoryPhraseTable::attach_tgt_statistic(PhraseStatistic &s)
340
+ {
341
+ tgt_info_.attach_statistic(s);
342
+ }
343
+
344
+ void MemoryPhraseTable::compute_phrase_statistics()
345
+ {
346
+ src_info_.compute_statistics();
347
+ tgt_info_.compute_statistics();
348
+ }
mosesdecoder/contrib/memscore/scorer.h ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #ifndef SCORER_H
6
+ #define SCORER_H
7
+
8
+ #include "memscore.h"
9
+
10
+ class PhraseScorerFactory
11
+ {
12
+ private:
13
+ PhraseTable &phrase_table_;
14
+
15
+ public:
16
+ explicit PhraseScorerFactory(PhraseTable &phrase_table) :
17
+ phrase_table_(phrase_table) {}
18
+
19
+ PhraseScorer *create_scorer(const char *argv[], int &argp, bool reverse);
20
+
21
+ PhraseTable &get_phrase_table() const {
22
+ return phrase_table_;
23
+ }
24
+
25
+ static const std::vector<String> &scorer_list();
26
+ };
27
+
28
+ class PhraseScorer
29
+ {
30
+ protected:
31
+ PhraseTable &phrase_table_;
32
+ bool reverse_;
33
+
34
+ explicit PhraseScorer(PhraseTable &pt, bool reverse) :
35
+ phrase_table_(!reverse ? pt : pt.reverse()), reverse_(reverse) {}
36
+
37
+ PhraseTable::iterator get_pair(Phrase src, Phrase tgt) {
38
+ PhraseTable::iterator it = phrase_table_.find(std::make_pair(src, tgt));
39
+ assert(it != phrase_table_.end());
40
+ return it;
41
+ }
42
+
43
+ private:
44
+ virtual void do_score_phrases() {}
45
+
46
+ virtual Score do_get_score(const PhraseTable::const_iterator &it) = 0;
47
+
48
+ public:
49
+ virtual ~PhraseScorer() {}
50
+
51
+ virtual Score get_discount() {}
52
+
53
+ void score_phrases() {
54
+ do_score_phrases();
55
+ }
56
+
57
+ Score get_score(const PhrasePairCounts::const_iterator &it) {
58
+ return do_get_score(phrase_table_.find(it));
59
+ }
60
+
61
+ Score get_score(const PhraseTable::const_iterator &it) {
62
+ return do_get_score(it);
63
+ }
64
+
65
+ Score get_score(Phrase src, Phrase tgt) {
66
+ PhraseTable::const_iterator it = get_pair(src, tgt);
67
+ return do_get_score(it);
68
+ }
69
+ };
70
+
71
+ #endif
mosesdecoder/contrib/memscore/timestamp.h ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #ifndef TIMESTAMP_H
6
+ #define TIMESTAMP_H
7
+
8
+ #include <sys/time.h>
9
+
10
+ class Timestamp
11
+ {
12
+ private:
13
+ struct timeval tv_;
14
+
15
+ public:
16
+ typedef double time_difference;
17
+
18
+ Timestamp() {
19
+ gettimeofday(&tv_, NULL);
20
+ }
21
+
22
+ time_difference elapsed_time() const {
23
+ struct timeval tv2;
24
+ gettimeofday(&tv2, NULL);
25
+ return (tv2.tv_sec - tv_.tv_sec) * 1e6 + (tv2.tv_usec - tv_.tv_usec);
26
+ }
27
+ };
28
+
29
+ #endif
mosesdecoder/contrib/mira/Main.cpp ADDED
@@ -0,0 +1,1849 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <algorithm>
21
+ #include <cstdlib>
22
+ #include <ctime>
23
+ #include <string>
24
+ #include <vector>
25
+ #include <map>
26
+
27
+ #include <boost/program_options.hpp>
28
+ #include <boost/algorithm/string.hpp>
29
+
30
+ #ifdef MPI_ENABLE
31
+ #include <boost/mpi.hpp>
32
+ namespace mpi = boost::mpi;
33
+ #endif
34
+
35
+ #include "Main.h"
36
+ #include "Optimiser.h"
37
+ #include "Hildreth.h"
38
+ #include "HypothesisQueue.h"
39
+ #include "moses/StaticData.h"
40
+ #include "moses/ScoreComponentCollection.h"
41
+ #include "moses/ThreadPool.h"
42
+ #include "mert/BleuScorer.h"
43
+ #include "moses/FeatureVector.h"
44
+
45
+ #include "moses/FF/WordTranslationFeature.h"
46
+ #include "moses/FF/PhrasePairFeature.h"
47
+ #include "moses/FF/WordPenaltyProducer.h"
48
+ #include "moses/LM/Base.h"
49
+ #include "util/random.hh"
50
+
51
+ using namespace Mira;
52
+ using namespace std;
53
+ using namespace Moses;
54
+ namespace po = boost::program_options;
55
+
56
+ int main(int argc, char** argv)
57
+ {
58
+ util::rand_init();
59
+ size_t rank = 0;
60
+ size_t size = 1;
61
+ #ifdef MPI_ENABLE
62
+ mpi::environment env(argc,argv);
63
+ mpi::communicator world;
64
+ rank = world.rank();
65
+ size = world.size();
66
+ #endif
67
+
68
+ bool help;
69
+ int verbosity;
70
+ string mosesConfigFile;
71
+ string inputFile;
72
+ vector<string> referenceFiles;
73
+ vector<string> mosesConfigFilesFolds, inputFilesFolds, referenceFilesFolds;
74
+ // string coreWeightFile, startWeightFile;
75
+ size_t epochs;
76
+ string learner;
77
+ bool shuffle;
78
+ size_t mixingFrequency;
79
+ size_t weightDumpFrequency;
80
+ string weightDumpStem;
81
+ bool scale_margin;
82
+ bool scale_update;
83
+ size_t n;
84
+ size_t batchSize;
85
+ bool distinctNbest;
86
+ bool accumulateWeights;
87
+ float historySmoothing;
88
+ bool scaleByInputLength, scaleByAvgInputLength;
89
+ bool scaleByInverseLength, scaleByAvgInverseLength;
90
+ float scaleByX;
91
+ float slack;
92
+ bool averageWeights;
93
+ bool weightConvergence;
94
+ float learning_rate;
95
+ float mira_learning_rate;
96
+ float perceptron_learning_rate;
97
+ string decoder_settings;
98
+ float min_weight_change;
99
+ bool normaliseWeights, normaliseMargin;
100
+ bool print_feature_values;
101
+ bool historyBleu ;
102
+ bool sentenceBleu;
103
+ bool perceptron_update;
104
+ bool hope_fear;
105
+ bool model_hope_fear;
106
+ size_t hope_n, fear_n;
107
+ size_t bleu_smoothing_scheme;
108
+ float min_oracle_bleu;
109
+ float minBleuRatio, maxBleuRatio;
110
+ bool boost;
111
+ bool decode_hope, decode_fear, decode_model;
112
+ string decode_filename;
113
+ bool batchEqualsShard;
114
+ bool sparseAverage, dumpMixedWeights, sparseNoAverage;
115
+ int featureCutoff;
116
+ bool pruneZeroWeights;
117
+ bool printFeatureCounts, printNbestWithFeatures;
118
+ bool avgRefLength;
119
+ bool print_weights, print_core_weights, debug_model, scale_lm, scale_wp;
120
+ float scale_lm_factor, scale_wp_factor;
121
+ bool kbest;
122
+ string moses_src;
123
+ float sigmoidParam;
124
+ float bleuWeight, bleuWeight_hope, bleuWeight_fear;
125
+ bool bleu_weight_lm;
126
+ float bleu_weight_lm_factor;
127
+ bool l1_regularize, l2_regularize, l1_reg_sparse, l2_reg_sparse;
128
+ float l1_lambda, l2_lambda;
129
+ bool most_violated, most_violated_reg, all_violated, max_bleu_diff;
130
+ bool feature_confidence, signed_counts;
131
+ float decay_core, decay_sparse, core_r0, sparse_r0;
132
+ float bleu_weight_fear_factor;
133
+ bool hildreth;
134
+ float add2lm;
135
+
136
+ // compute real sentence Bleu scores on complete translations, disable Bleu feature
137
+ bool realBleu, disableBleuFeature;
138
+ bool rescaleSlack;
139
+ bool makePairs;
140
+ bool debug;
141
+ bool reg_on_every_mix;
142
+ size_t continue_epoch;
143
+ bool modelPlusBleu, simpleHistoryBleu;
144
+ po::options_description desc("Allowed options");
145
+ desc.add_options()
146
+ ("continue-epoch", po::value<size_t>(&continue_epoch)->default_value(0), "Continue an interrupted experiment from this epoch on")
147
+ ("freq-reg", po::value<bool>(&reg_on_every_mix)->default_value(false), "Regularize after every weight mixing")
148
+ ("l1sparse", po::value<bool>(&l1_reg_sparse)->default_value(true), "L1-regularization for sparse weights only")
149
+ ("l2sparse", po::value<bool>(&l2_reg_sparse)->default_value(true), "L2-regularization for sparse weights only")
150
+ ("mv-reg", po::value<bool>(&most_violated_reg)->default_value(false), "Regularize most violated constraint")
151
+ ("most-violated", po::value<bool>(&most_violated)->default_value(false), "Add most violated constraint")
152
+ ("all-violated", po::value<bool>(&all_violated)->default_value(false), "Add all violated constraints")
153
+ ("feature-confidence", po::value<bool>(&feature_confidence)->default_value(false), "Confidence-weighted learning")
154
+ ("signed-counts", po::value<bool>(&signed_counts)->default_value(false), "Use signed feature counts for CWL")
155
+ ("dbg", po::value<bool>(&debug)->default_value(true), "More debug output")
156
+ ("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack")
157
+ ("debug", po::value<bool>(&debug)->default_value(true), "More debug output")
158
+ ("rescale-slack", po::value<bool>(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation")
159
+ ("add2lm", po::value<float>(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights")
160
+ ("hildreth", po::value<bool>(&hildreth)->default_value(false), "Prefer Hildreth over analytical update")
161
+ ("model-plus-bleu", po::value<bool>(&modelPlusBleu)->default_value(false), "Use the sum of model score and +/- bleu to select hope and fear translations")
162
+ ("simple-history-bleu", po::value<bool>(&simpleHistoryBleu)->default_value(false), "Simple history Bleu")
163
+
164
+ ("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
165
+ ("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
166
+ ("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")
167
+
168
+ ("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features")
169
+ ("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features")
170
+ ("decay-core", po::value<float>(&decay_core)->default_value(0.01), "Decay for core feature learning rate")
171
+ ("decay-sparse", po::value<float>(&decay_sparse)->default_value(0.01), "Decay for sparse feature learning rate")
172
+
173
+ ("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(true), "Make bleu weight depend on lm weight")
174
+ ("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
175
+ ("bw-factor-fear", po::value<float>(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor")
176
+ ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
177
+ ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
178
+ ("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature")
179
+ ("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)")
180
+ ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
181
+ ("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
182
+ ("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates")
183
+ ("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
184
+ ("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
185
+ ("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
186
+ ("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
187
+ ("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
188
+ ("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective")
189
+ ("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations")
190
+ ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
191
+ ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step")
192
+ ("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights")
193
+ ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
194
+ ("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
195
+ ("fear-n", po::value<size_t>(&fear_n)->default_value(1), "Number of fear translations used")
196
+ ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
197
+ ("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
198
+ ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
199
+ ("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
200
+ ("hope-n", po::value<size_t>(&hope_n)->default_value(2), "Number of hope translations used")
201
+ ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
202
+ ("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
203
+ ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
204
+ ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)")
205
+ ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))")
206
+ ("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization")
207
+ ("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization")
208
+ ("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
209
+ ("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear")
210
+ ("max-bleu-diff", po::value<bool>(&max_bleu_diff)->default_value(true), "Select hope/fear with maximum Bleu difference")
211
+ ("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
212
+ ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.0001), "Set minimum weight change for stopping criterion")
213
+ ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
214
+ ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(10), "How often per epoch to mix weights, when using mpi")
215
+ ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
216
+ ("moses-src", po::value<string>(&moses_src)->default_value(""), "Moses source directory")
217
+ ("nbest,n", po::value<size_t>(&n)->default_value(30), "Number of translations in n-best list")
218
+ ("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
219
+ ("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1")
220
+ ("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
221
+ ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
222
+ ("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
223
+ ("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
224
+ ("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights")
225
+ ("print-core-weights", po::value<bool>(&print_core_weights)->default_value(true), "Print out current core weights")
226
+ ("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights")
227
+ ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
228
+ ("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
229
+ ("kbest", po::value<bool>(&kbest)->default_value(true), "Select hope/fear pairs from a list of nbest translations")
230
+
231
+ ("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length")
232
+ ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale BLEU by (history of) input length")
233
+ ("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length")
234
+ ("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length")
235
+ ("scale-by-x", po::value<float>(&scaleByX)->default_value(0.1), "Scale the BLEU score by value x")
236
+ ("scale-lm", po::value<bool>(&scale_lm)->default_value(true), "Scale the language model feature")
237
+ ("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(0.5), "Scale the language model feature by this factor")
238
+ ("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature")
239
+ ("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor")
240
+ ("scale-margin", po::value<bool>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
241
+ ("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentences level Bleu scoring function")
242
+ ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
243
+ ("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches")
244
+ ("slack", po::value<float>(&slack)->default_value(0.05), "Use slack in optimiser")
245
+ ("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
246
+ ("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
247
+ ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
248
+ ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
249
+ ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(2), "How often per epoch to dump weights (mpi)")
250
+ ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
251
+
252
+ po::options_description cmdline_options;
253
+ cmdline_options.add(desc);
254
+ po::variables_map vm;
255
+ po::store(po::command_line_parser(argc, argv). options(cmdline_options).run(), vm);
256
+ po::notify(vm);
257
+
258
+ if (help) {
259
+ std::cout << "Usage: " + string(argv[0])
260
+ + " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl;
261
+ std::cout << desc << std::endl;
262
+ return 0;
263
+ }
264
+
265
+ const StaticData &staticData = StaticData::Instance();
266
+
267
+ bool trainWithMultipleFolds = false;
268
+ if (mosesConfigFilesFolds.size() > 0 || inputFilesFolds.size() > 0 || referenceFilesFolds.size() > 0) {
269
+ if (rank == 0)
270
+ cerr << "Training with " << mosesConfigFilesFolds.size() << " folds" << endl;
271
+ trainWithMultipleFolds = true;
272
+ }
273
+
274
+ if (dumpMixedWeights && (mixingFrequency != weightDumpFrequency)) {
275
+ cerr << "Set mixing frequency = weight dump frequency for dumping mixed weights!" << endl;
276
+ exit(1);
277
+ }
278
+
279
+ if ((sparseAverage || sparseNoAverage) && averageWeights) {
280
+ cerr << "Parameters --sparse-average 1/--sparse-no-average 1 and --average-weights 1 are incompatible (not implemented)" << endl;
281
+ exit(1);
282
+ }
283
+
284
+ if (trainWithMultipleFolds) {
285
+ if (!mosesConfigFilesFolds.size()) {
286
+ cerr << "Error: No moses ini files specified for training with folds" << endl;
287
+ exit(1);
288
+ }
289
+
290
+ if (!inputFilesFolds.size()) {
291
+ cerr << "Error: No input files specified for training with folds" << endl;
292
+ exit(1);
293
+ }
294
+
295
+ if (!referenceFilesFolds.size()) {
296
+ cerr << "Error: No reference files specified for training with folds" << endl;
297
+ exit(1);
298
+ }
299
+ } else {
300
+ if (mosesConfigFile.empty()) {
301
+ cerr << "Error: No moses ini file specified" << endl;
302
+ return 1;
303
+ }
304
+
305
+ if (inputFile.empty()) {
306
+ cerr << "Error: No input file specified" << endl;
307
+ return 1;
308
+ }
309
+
310
+ if (!referenceFiles.size()) {
311
+ cerr << "Error: No reference files specified" << endl;
312
+ return 1;
313
+ }
314
+ }
315
+
316
+ // load input and references
317
+ vector<string> inputSentences;
318
+ size_t inputSize = trainWithMultipleFolds? inputFilesFolds.size(): 0;
319
+ size_t refSize = trainWithMultipleFolds? referenceFilesFolds.size(): referenceFiles.size();
320
+ vector<vector<string> > inputSentencesFolds(inputSize);
321
+ vector<vector<string> > referenceSentences(refSize);
322
+
323
+ // number of cores for each fold
324
+ size_t coresPerFold = 0, myFold = 0;
325
+ if (trainWithMultipleFolds) {
326
+ if (mosesConfigFilesFolds.size() > size) {
327
+ cerr << "Number of cores has to be a multiple of the number of folds" << endl;
328
+ exit(1);
329
+ }
330
+ coresPerFold = size/mosesConfigFilesFolds.size();
331
+ if (size % coresPerFold > 0) {
332
+ cerr << "Number of cores has to be a multiple of the number of folds" << endl;
333
+ exit(1);
334
+ }
335
+
336
+ if (rank == 0)
337
+ cerr << "Number of cores per fold: " << coresPerFold << endl;
338
+ myFold = rank/coresPerFold;
339
+ cerr << "Rank " << rank << ", my fold: " << myFold << endl;
340
+ }
341
+
342
+ // NOTE: we do not actually need the references here, because we are reading them in from StaticData
343
+ if (trainWithMultipleFolds) {
344
+ if (!loadSentences(inputFilesFolds[myFold], inputSentencesFolds[myFold])) {
345
+ cerr << "Error: Failed to load input sentences from " << inputFilesFolds[myFold] << endl;
346
+ exit(1);
347
+ }
348
+ VERBOSE(1, "Rank " << rank << " reading inputs from " << inputFilesFolds[myFold] << endl);
349
+
350
+ if (!loadSentences(referenceFilesFolds[myFold], referenceSentences[myFold])) {
351
+ cerr << "Error: Failed to load reference sentences from " << referenceFilesFolds[myFold] << endl;
352
+ exit(1);
353
+ }
354
+ if (referenceSentences[myFold].size() != inputSentencesFolds[myFold].size()) {
355
+ cerr << "Error: Input file length (" << inputSentencesFolds[myFold].size() << ") != ("
356
+ << referenceSentences[myFold].size() << ") reference file length (rank " << rank << ")" << endl;
357
+ exit(1);
358
+ }
359
+ VERBOSE(1, "Rank " << rank << " reading references from " << referenceFilesFolds[myFold] << endl);
360
+ } else {
361
+ if (!loadSentences(inputFile, inputSentences)) {
362
+ cerr << "Error: Failed to load input sentences from " << inputFile << endl;
363
+ return 1;
364
+ }
365
+
366
+ for (size_t i = 0; i < referenceFiles.size(); ++i) {
367
+ if (!loadSentences(referenceFiles[i], referenceSentences[i])) {
368
+ cerr << "Error: Failed to load reference sentences from "
369
+ << referenceFiles[i] << endl;
370
+ return 1;
371
+ }
372
+ if (referenceSentences[i].size() != inputSentences.size()) {
373
+ cerr << "Error: Input file length (" << inputSentences.size() << ") != ("
374
+ << referenceSentences[i].size() << ") length of reference file " << i
375
+ << endl;
376
+ return 1;
377
+ }
378
+ }
379
+ }
380
+
381
+ if (scaleByAvgInputLength || scaleByInverseLength || scaleByAvgInverseLength)
382
+ scaleByInputLength = false;
383
+
384
+ if (historyBleu || simpleHistoryBleu) {
385
+ sentenceBleu = false;
386
+ cerr << "Using history Bleu. " << endl;
387
+ }
388
+
389
+ if (kbest) {
390
+ realBleu = true;
391
+ disableBleuFeature = true;
392
+ cerr << "Use kbest lists and real Bleu scores, disable Bleu feature.." << endl;
393
+ }
394
+
395
+ // initialise Moses
396
+ // add references to initialize Bleu feature
397
+ boost::trim(decoder_settings);
398
+ decoder_settings += " -mira -n-best-list - " + boost::lexical_cast<string>(n) + " distinct";
399
+
400
+ vector<string> decoder_params;
401
+ boost::split(decoder_params, decoder_settings, boost::is_any_of("\t "));
402
+
403
+ // bleu feature
404
+ decoder_params.push_back("-feature-add");
405
+
406
+ decoder_settings = "BleuScoreFeature tuneable=false references=";
407
+ if (trainWithMultipleFolds) {
408
+ decoder_settings += referenceFilesFolds[myFold];
409
+ } else {
410
+ decoder_settings += referenceFiles[0];
411
+ for (size_t i=1; i < referenceFiles.size(); ++i) {
412
+ decoder_settings += ",";
413
+ decoder_settings += referenceFiles[i];
414
+ }
415
+ }
416
+ decoder_params.push_back(decoder_settings);
417
+
418
+ string configFile = trainWithMultipleFolds? mosesConfigFilesFolds[myFold] : mosesConfigFile;
419
+ VERBOSE(1, "Rank " << rank << " reading config file from " << configFile << endl);
420
+ MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
421
+ decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
422
+ scaleByInverseLength, scaleByAvgInverseLength,
423
+ scaleByX, historySmoothing, bleu_smoothing_scheme, simpleHistoryBleu);
424
+ bool chartDecoding = staticData.IsChart();
425
+
426
+ // Optionally shuffle the sentences
427
+ vector<size_t> order;
428
+ if (trainWithMultipleFolds) {
429
+ for (size_t i = 0; i < inputSentencesFolds[myFold].size(); ++i) {
430
+ order.push_back(i);
431
+ }
432
+ } else {
433
+ if (rank == 0) {
434
+ for (size_t i = 0; i < inputSentences.size(); ++i) {
435
+ order.push_back(i);
436
+ }
437
+ }
438
+ }
439
+
440
+ // initialise optimizer
441
+ Optimiser* optimiser = NULL;
442
+ if (learner == "mira") {
443
+ if (rank == 0) {
444
+ cerr << "Optimising using Mira" << endl;
445
+ cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl;
446
+ if (normaliseMargin)
447
+ cerr << "sigmoid parameter: " << sigmoidParam << endl;
448
+ }
449
+ optimiser = new MiraOptimiser(slack, scale_margin, scale_update, boost, normaliseMargin, sigmoidParam);
450
+ learning_rate = mira_learning_rate;
451
+ perceptron_update = false;
452
+ } else if (learner == "perceptron") {
453
+ if (rank == 0) {
454
+ cerr << "Optimising using Perceptron" << endl;
455
+ }
456
+ optimiser = new Perceptron();
457
+ learning_rate = perceptron_learning_rate;
458
+ perceptron_update = true;
459
+ model_hope_fear = false; // mira only
460
+ hope_fear = false; // mira only
461
+ n = 1;
462
+ hope_n = 1;
463
+ fear_n = 1;
464
+ } else {
465
+ cerr << "Error: Unknown optimiser: " << learner << endl;
466
+ return 1;
467
+ }
468
+
469
+ // resolve parameter dependencies
470
+ if (batchSize > 1 && perceptron_update) {
471
+ batchSize = 1;
472
+ cerr << "Info: Setting batch size to 1 for perceptron update" << endl;
473
+ }
474
+
475
+ if (hope_n == 0)
476
+ hope_n = n;
477
+ if (fear_n == 0)
478
+ fear_n = n;
479
+
480
+ if (model_hope_fear || kbest)
481
+ hope_fear = false; // is true by default
482
+ if (learner == "mira" && !(hope_fear || model_hope_fear || kbest)) {
483
+ cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear/--kbest for mira update." << endl;
484
+ return 1;
485
+ }
486
+
487
+ #ifdef MPI_ENABLE
488
+ if (!trainWithMultipleFolds)
489
+ mpi::broadcast(world, order, 0);
490
+ #endif
491
+
492
+ // Create shards according to the number of processes used
493
+ vector<size_t> shard;
494
+ if (trainWithMultipleFolds) {
495
+ size_t shardSize = order.size()/coresPerFold;
496
+ size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
497
+ size_t shardEnd = shardStart + shardSize;
498
+ if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold
499
+ shardEnd = order.size();
500
+ shardSize = shardEnd - shardStart;
501
+ }
502
+ VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
503
+ VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
504
+ shard.resize(shardSize);
505
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
506
+ batchSize = 1;
507
+ } else {
508
+ size_t shardSize = order.size() / size;
509
+ size_t shardStart = (size_t) (shardSize * rank);
510
+ size_t shardEnd = (size_t) (shardSize * (rank + 1));
511
+ if (rank == size - 1) {
512
+ shardEnd = order.size();
513
+ shardSize = shardEnd - shardStart;
514
+ }
515
+ VERBOSE(1, "Rank: " << rank << " Shard size: " << shardSize << endl);
516
+ VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
517
+ shard.resize(shardSize);
518
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
519
+ if (batchEqualsShard)
520
+ batchSize = shardSize;
521
+ }
522
+
523
+ // get reference to feature functions
524
+ // const vector<FeatureFunction*> &featureFunctions = FeatureFunction::GetFeatureFunctions();
525
+ ScoreComponentCollection initialWeights = decoder->getWeights();
526
+
527
+ if (add2lm != 0) {
528
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
529
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
530
+ const StatefulFeatureFunction *ff = statefulFFs[i];
531
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
532
+
533
+ if (lm) {
534
+ float lmWeight = initialWeights.GetScoreForProducer(lm) + add2lm;
535
+ initialWeights.Assign(lm, lmWeight);
536
+ cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl;
537
+ }
538
+ }
539
+ }
540
+
541
+ if (normaliseWeights) {
542
+ initialWeights.L1Normalise();
543
+ cerr << "Rank " << rank << ", normalised initial weights: " << initialWeights << endl;
544
+ }
545
+
546
+ decoder->setWeights(initialWeights);
547
+
548
+ // set bleu weight to twice the size of the language model weight(s)
549
+ if (bleu_weight_lm) {
550
+ float lmSum = 0;
551
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
552
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
553
+ const StatefulFeatureFunction *ff = statefulFFs[i];
554
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
555
+
556
+ if (lm) {
557
+ lmSum += abs(initialWeights.GetScoreForProducer(lm));
558
+ }
559
+ }
560
+
561
+ bleuWeight = lmSum * bleu_weight_lm_factor;
562
+ if (!kbest) cerr << "Set bleu weight to lm weight * " << bleu_weight_lm_factor << endl;
563
+ }
564
+
565
+ // bleu weights can be set separately for hope and fear; otherwise they are both set to 'lm weight * bleu_weight_lm_factor'
566
+ if (bleuWeight_hope == -1) {
567
+ bleuWeight_hope = bleuWeight;
568
+ }
569
+ if (bleuWeight_fear == -1) {
570
+ bleuWeight_fear = bleuWeight;
571
+ }
572
+ bleuWeight_fear *= bleu_weight_fear_factor;
573
+ if (!kbest) {
574
+ cerr << "Bleu weight: " << bleuWeight << endl;
575
+ cerr << "Bleu weight fear: " << bleuWeight_fear << endl;
576
+ }
577
+
578
+ if (decode_hope || decode_fear || decode_model) {
579
+ size_t decode = 1;
580
+ if (decode_fear) decode = 2;
581
+ if (decode_model) decode = 3;
582
+ decodeHopeOrFear(rank, size, decode, decode_filename, inputSentences, decoder, n, bleuWeight);
583
+ }
584
+
585
+ //Main loop:
586
+ ScoreComponentCollection cumulativeWeights; // collect weights per epoch to produce an average
587
+ ScoreComponentCollection cumulativeWeightsBinary;
588
+ size_t numberOfUpdates = 0;
589
+ size_t numberOfUpdatesThisEpoch = 0;
590
+
591
+ time_t now;
592
+ time(&now);
593
+ cerr << "Rank " << rank << ", " << ctime(&now);
594
+
595
+ float avgInputLength = 0;
596
+ float sumOfInputs = 0;
597
+ size_t numberOfInputs = 0;
598
+
599
+ ScoreComponentCollection mixedWeights;
600
+ ScoreComponentCollection mixedWeightsPrevious;
601
+ ScoreComponentCollection mixedWeightsBeforePrevious;
602
+ ScoreComponentCollection mixedAverageWeights;
603
+ ScoreComponentCollection mixedAverageWeightsPrevious;
604
+ ScoreComponentCollection mixedAverageWeightsBeforePrevious;
605
+
606
+ bool stop = false;
607
+ // int sumStillViolatedConstraints;
608
+ float epsilon = 0.0001;
609
+
610
+ // Variables for feature confidence
611
+ ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates;
612
+ featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); //initialise core learning rates
613
+ cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
614
+
615
+ for (size_t epoch = continue_epoch; epoch < epochs && !stop; ++epoch) {
616
+ if (shuffle) {
617
+ if (trainWithMultipleFolds || rank == 0) {
618
+ cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." << endl;
619
+ RandomIndex rindex;
620
+ random_shuffle(order.begin(), order.end(), rindex);
621
+ }
622
+
623
+ #ifdef MPI_ENABLE
624
+ if (!trainWithMultipleFolds)
625
+ mpi::broadcast(world, order, 0);
626
+ #endif
627
+
628
+ // redo shards
629
+ if (trainWithMultipleFolds) {
630
+ size_t shardSize = order.size()/coresPerFold;
631
+ size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
632
+ size_t shardEnd = shardStart + shardSize;
633
+ if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold
634
+ shardEnd = order.size();
635
+ shardSize = shardEnd - shardStart;
636
+ }
637
+ VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
638
+ VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
639
+ shard.resize(shardSize);
640
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
641
+ batchSize = 1;
642
+ } else {
643
+ size_t shardSize = order.size()/size;
644
+ size_t shardStart = (size_t) (shardSize * rank);
645
+ size_t shardEnd = (size_t) (shardSize * (rank + 1));
646
+ if (rank == size - 1) {
647
+ shardEnd = order.size();
648
+ shardSize = shardEnd - shardStart;
649
+ }
650
+ VERBOSE(1, "Shard size: " << shardSize << endl);
651
+ VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
652
+ shard.resize(shardSize);
653
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
654
+ if (batchEqualsShard)
655
+ batchSize = shardSize;
656
+ }
657
+ }
658
+
659
+ // sum of violated constraints in an epoch
660
+ // sumStillViolatedConstraints = 0;
661
+
662
+ numberOfUpdatesThisEpoch = 0;
663
+ // Sum up weights over one epoch, final average uses weights from last epoch
664
+ if (!accumulateWeights) {
665
+ cumulativeWeights.ZeroAll();
666
+ cumulativeWeightsBinary.ZeroAll();
667
+ }
668
+
669
+ // number of weight dumps this epoch
670
+ size_t weightMixingThisEpoch = 0;
671
+ size_t weightEpochDump = 0;
672
+
673
+ size_t shardPosition = 0;
674
+ vector<size_t>::const_iterator sid = shard.begin();
675
+ while (sid != shard.end()) {
676
+ // feature values for hypotheses i,j (matrix: batchSize x 3*n x featureValues)
677
+ vector<vector<ScoreComponentCollection> > featureValues;
678
+ vector<vector<float> > bleuScores;
679
+ vector<vector<float> > modelScores;
680
+
681
+ // variables for hope-fear/perceptron setting
682
+ vector<vector<ScoreComponentCollection> > featureValuesHope;
683
+ vector<vector<ScoreComponentCollection> > featureValuesFear;
684
+ vector<vector<float> > bleuScoresHope;
685
+ vector<vector<float> > bleuScoresFear;
686
+ vector<vector<float> > modelScoresHope;
687
+ vector<vector<float> > modelScoresFear;
688
+
689
+ // get moses weights
690
+ ScoreComponentCollection mosesWeights = decoder->getWeights();
691
+ VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", weights: " << mosesWeights << endl);
692
+
693
+ if (historyBleu || simpleHistoryBleu) {
694
+ decoder->printBleuFeatureHistory(cerr);
695
+ }
696
+
697
+ // BATCHING: produce nbest lists for all input sentences in batch
698
+ vector<float> oracleBleuScores;
699
+ vector<float> oracleModelScores;
700
+ vector<vector<const Word*> > oneBests;
701
+ vector<ScoreComponentCollection> oracleFeatureValues;
702
+ vector<size_t> inputLengths;
703
+ vector<size_t> ref_ids;
704
+ size_t actualBatchSize = 0;
705
+
706
+ size_t examples_in_batch = 0;
707
+ bool skip_example = false;
708
+ for (size_t batchPosition = 0; batchPosition < batchSize && sid
709
+ != shard.end(); ++batchPosition) {
710
+ string input;
711
+ if (trainWithMultipleFolds)
712
+ input = inputSentencesFolds[myFold][*sid];
713
+ else
714
+ input = inputSentences[*sid];
715
+
716
+ Moses::Sentence *sentence = new Sentence();
717
+ stringstream in(input + "\n");
718
+ const vector<FactorType> inputFactorOrder = staticData.GetInputFactorOrder();
719
+ sentence->Read(in,inputFactorOrder);
720
+ cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \"";
721
+ sentence->Print(cerr);
722
+ cerr << "\"" << " (batch pos " << batchPosition << ")" << endl;
723
+ size_t current_input_length = (*sentence).GetSize();
724
+
725
+ if (epoch == 0 && (scaleByAvgInputLength || scaleByAvgInverseLength)) {
726
+ sumOfInputs += current_input_length;
727
+ ++numberOfInputs;
728
+ avgInputLength = sumOfInputs/numberOfInputs;
729
+ decoder->setAvgInputLength(avgInputLength);
730
+ cerr << "Rank " << rank << ", epoch 0, average input length: " << avgInputLength << endl;
731
+ }
732
+
733
+ vector<ScoreComponentCollection> newFeatureValues;
734
+ vector<float> newScores;
735
+ if (model_hope_fear) {
736
+ featureValues.push_back(newFeatureValues);
737
+ bleuScores.push_back(newScores);
738
+ modelScores.push_back(newScores);
739
+ }
740
+ if (hope_fear || perceptron_update) {
741
+ featureValuesHope.push_back(newFeatureValues);
742
+ featureValuesFear.push_back(newFeatureValues);
743
+ bleuScoresHope.push_back(newScores);
744
+ bleuScoresFear.push_back(newScores);
745
+ modelScoresHope.push_back(newScores);
746
+ modelScoresFear.push_back(newScores);
747
+ if (historyBleu || simpleHistoryBleu || debug_model) {
748
+ featureValues.push_back(newFeatureValues);
749
+ bleuScores.push_back(newScores);
750
+ modelScores.push_back(newScores);
751
+ }
752
+ }
753
+ if (kbest) {
754
+ // for decoding
755
+ featureValues.push_back(newFeatureValues);
756
+ bleuScores.push_back(newScores);
757
+ modelScores.push_back(newScores);
758
+
759
+ // for storing selected examples
760
+ featureValuesHope.push_back(newFeatureValues);
761
+ featureValuesFear.push_back(newFeatureValues);
762
+ bleuScoresHope.push_back(newScores);
763
+ bleuScoresFear.push_back(newScores);
764
+ modelScoresHope.push_back(newScores);
765
+ modelScoresFear.push_back(newScores);
766
+ }
767
+
768
+ size_t ref_length;
769
+ float avg_ref_length;
770
+
771
+ if (print_weights)
772
+ cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl;
773
+ if (print_core_weights) {
774
+ cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: ";
775
+ mosesWeights.PrintCoreFeatures();
776
+ cerr << endl;
777
+ }
778
+
779
+ // check LM weight
780
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
781
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
782
+ const StatefulFeatureFunction *ff = statefulFFs[i];
783
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
784
+
785
+ if (lm) {
786
+ float lmWeight = mosesWeights.GetScoreForProducer(lm);
787
+ cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " << lmWeight << endl;
788
+ if (lmWeight <= 0) {
789
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: language model weight should never be <= 0." << endl;
790
+ mosesWeights.Assign(lm, 0.1);
791
+ cerr << "Rank " << rank << ", epoch " << epoch << ", assign lm weights of 0.1" << endl;
792
+ }
793
+ }
794
+ }
795
+
796
+ // select inference scheme
797
+ cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? " << realBleu << endl;
798
+ if (hope_fear || perceptron_update) {
799
+ // HOPE
800
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n <<
801
+ "best hope translations" << endl;
802
+ vector< vector<const Word*> > outputHope = decoder->getNBest(input, *sid, hope_n, 1.0, bleuWeight_hope,
803
+ featureValuesHope[batchPosition], bleuScoresHope[batchPosition], modelScoresHope[batchPosition],
804
+ 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
805
+ vector<const Word*> oracle = outputHope[0];
806
+ decoder->cleanup(chartDecoding);
807
+ ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
808
+ avg_ref_length = ref_length;
809
+ float hope_length_ratio = (float)oracle.size()/ref_length;
810
+ cerr << endl;
811
+
812
+ // count sparse features occurring in hope translation
813
+ featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures();
814
+
815
+ vector<const Word*> bestModel;
816
+ if (debug_model || historyBleu || simpleHistoryBleu) {
817
+ // MODEL (for updating the history only, using dummy vectors)
818
+ cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score (debug or history)" << endl;
819
+ vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
820
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
821
+ 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
822
+ bestModel = outputModel[0];
823
+ decoder->cleanup(chartDecoding);
824
+ cerr << endl;
825
+ ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
826
+ }
827
+
828
+ // FEAR
829
+ //float fear_length_ratio = 0;
830
+ float bleuRatioHopeFear = 0;
831
+ //int fearSize = 0;
832
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << "best fear translations" << endl;
833
+ vector< vector<const Word*> > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear,
834
+ featureValuesFear[batchPosition], bleuScoresFear[batchPosition], modelScoresFear[batchPosition],
835
+ 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
836
+ vector<const Word*> fear = outputFear[0];
837
+ decoder->cleanup(chartDecoding);
838
+ ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
839
+ avg_ref_length += ref_length;
840
+ avg_ref_length /= 2;
841
+ //fear_length_ratio = (float)fear.size()/ref_length;
842
+ //fearSize = (int)fear.size();
843
+ cerr << endl;
844
+ for (size_t i = 0; i < fear.size(); ++i)
845
+ delete fear[i];
846
+
847
+ // count sparse features occurring in fear translation
848
+ featureValuesFear[batchPosition][0].IncrementSparseFearFeatures();
849
+
850
+ // Bleu-related example selection
851
+ bool skip = false;
852
+ bleuRatioHopeFear = bleuScoresHope[batchPosition][0] / bleuScoresFear[batchPosition][0];
853
+ if (minBleuRatio != -1 && bleuRatioHopeFear < minBleuRatio)
854
+ skip = true;
855
+ if(maxBleuRatio != -1 && bleuRatioHopeFear > maxBleuRatio)
856
+ skip = true;
857
+
858
+ // sanity check
859
+ if (historyBleu || simpleHistoryBleu) {
860
+ if (bleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] &&
861
+ modelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) {
862
+ if (abs(bleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon &&
863
+ abs(modelScores[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
864
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: MODEL translation better than HOPE translation." << endl;
865
+ skip = true;
866
+ }
867
+ }
868
+ if (bleuScoresFear[batchPosition][0] > bleuScores[batchPosition][0] &&
869
+ modelScoresFear[batchPosition][0] > modelScores[batchPosition][0]) {
870
+ if (abs(bleuScoresFear[batchPosition][0] - bleuScores[batchPosition][0]) > epsilon &&
871
+ abs(modelScoresFear[batchPosition][0] - modelScores[batchPosition][0]) > epsilon) {
872
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than MODEL translation." << endl;
873
+ skip = true;
874
+ }
875
+ }
876
+ }
877
+ if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) {
878
+ if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) {
879
+ // check if it's an error or a warning
880
+ skip = true;
881
+ if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
882
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
883
+ } else {
884
+ cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: FEAR translation has better Bleu than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
885
+ }
886
+ }
887
+ }
888
+
889
+ if (skip) {
890
+ cerr << "Rank " << rank << ", epoch " << epoch << ", skip example (" << hope_length_ratio << ", " << bleuRatioHopeFear << ").. " << endl;
891
+ featureValuesHope[batchPosition].clear();
892
+ featureValuesFear[batchPosition].clear();
893
+ bleuScoresHope[batchPosition].clear();
894
+ bleuScoresFear[batchPosition].clear();
895
+ if (historyBleu || simpleHistoryBleu || debug_model) {
896
+ featureValues[batchPosition].clear();
897
+ bleuScores[batchPosition].clear();
898
+ }
899
+ } else {
900
+ examples_in_batch++;
901
+
902
+ // needed for history
903
+ if (historyBleu || simpleHistoryBleu) {
904
+ inputLengths.push_back(current_input_length);
905
+ ref_ids.push_back(*sid);
906
+ oneBests.push_back(bestModel);
907
+ }
908
+ }
909
+ }
910
+ if (model_hope_fear) {
911
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best hope translations" << endl;
912
+ size_t oraclePos = featureValues[batchPosition].size();
913
+ decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope,
914
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
915
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
916
+ //vector<const Word*> oracle = outputHope[0];
917
+ // needed for history
918
+ inputLengths.push_back(current_input_length);
919
+ ref_ids.push_back(*sid);
920
+ decoder->cleanup(chartDecoding);
921
+ //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
922
+ //float hope_length_ratio = (float)oracle.size()/ref_length;
923
+ cerr << endl;
924
+
925
+ oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
926
+ oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
927
+ oracleModelScores.push_back(modelScores[batchPosition][oraclePos]);
928
+
929
+ // MODEL
930
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
931
+ if (historyBleu || simpleHistoryBleu) {
932
+ vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
933
+ bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
934
+ modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
935
+ vector<const Word*> bestModel = outputModel[0];
936
+ oneBests.push_back(bestModel);
937
+ inputLengths.push_back(current_input_length);
938
+ ref_ids.push_back(*sid);
939
+ } else {
940
+ decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
941
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
942
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
943
+ }
944
+ decoder->cleanup(chartDecoding);
945
+ //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
946
+ //float model_length_ratio = (float)bestModel.size()/ref_length;
947
+ cerr << endl;
948
+
949
+ // FEAR
950
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl;
951
+ decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear,
952
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
953
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
954
+ decoder->cleanup(chartDecoding);
955
+ //ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
956
+ //float fear_length_ratio = (float)fear.size()/ref_length;
957
+
958
+ examples_in_batch++;
959
+ }
960
+ if (kbest) {
961
+ // MODEL
962
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
963
+ if (historyBleu || simpleHistoryBleu) {
964
+ vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
965
+ bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
966
+ modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
967
+ vector<const Word*> bestModel = outputModel[0];
968
+ oneBests.push_back(bestModel);
969
+ inputLengths.push_back(current_input_length);
970
+ ref_ids.push_back(*sid);
971
+ } else {
972
+ decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
973
+ featureValues[batchPosition], bleuScores[batchPosition],
974
+ modelScores[batchPosition], 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
975
+ }
976
+ decoder->cleanup(chartDecoding);
977
+ //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
978
+ //float model_length_ratio = (float)bestModel.size()/ref_length;
979
+ cerr << endl;
980
+
981
+ examples_in_batch++;
982
+
983
+ HypothesisQueue queueHope(hope_n);
984
+ HypothesisQueue queueFear(fear_n);
985
+ cerr << endl;
986
+ if (most_violated || all_violated) {
987
+ float bleuHope = -1000;
988
+ float bleuFear = 1000;
989
+ int indexHope = -1;
990
+ int indexFear = -1;
991
+
992
+ vector<float> bleuHopeList;
993
+ vector<float> bleuFearList;
994
+ vector<float> indexHopeList;
995
+ vector<float> indexFearList;
996
+
997
+ if (most_violated)
998
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl;
999
+ else if (all_violated)
1000
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints";
1001
+ else
1002
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope";
1003
+
1004
+ // find best hope, then find fear that violates our constraint most
1005
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
1006
+ if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores
1007
+ if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
1008
+ if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
1009
+ // better model score
1010
+ bleuHope = bleuScores[batchPosition][i];
1011
+ indexHope = i;
1012
+ }
1013
+ }
1014
+ } else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best
1015
+ bleuHope = bleuScores[batchPosition][i];
1016
+ indexHope = i;
1017
+ }
1018
+ }
1019
+
1020
+ float currentViolation = 0;
1021
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
1022
+ float bleuDiff = bleuHope - bleuScores[batchPosition][i];
1023
+ float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i];
1024
+ if ((bleuDiff > epsilon) && (modelDiff < bleuDiff)) {
1025
+ float diff = bleuDiff - modelDiff;
1026
+ if (diff > epsilon) {
1027
+ if (all_violated) {
1028
+ cerr << ".. adding pair";
1029
+ bleuHopeList.push_back(bleuHope);
1030
+ bleuFearList.push_back(bleuScores[batchPosition][i]);
1031
+ indexHopeList.push_back(indexHope);
1032
+ indexFearList.push_back(i);
1033
+ } else if (most_violated && diff > currentViolation) {
1034
+ currentViolation = diff;
1035
+ bleuFear = bleuScores[batchPosition][i];
1036
+ indexFear = i;
1037
+ cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl;
1038
+ }
1039
+ }
1040
+ }
1041
+ }
1042
+
1043
+ if (most_violated) {
1044
+ if (currentViolation > 0) {
1045
+ cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl;
1046
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
1047
+ bleuScoresHope[batchPosition].push_back(bleuHope);
1048
+ bleuScoresFear[batchPosition].push_back(bleuFear);
1049
+ featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
1050
+ featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
1051
+ float modelScoreHope = modelScores[batchPosition][indexHope];
1052
+ float modelScoreFear = modelScores[batchPosition][indexFear];
1053
+ if (most_violated_reg) {
1054
+ // reduce model score difference by factor ~0.5
1055
+ float reg = currentViolation/4;
1056
+ modelScoreHope += abs(reg);
1057
+ modelScoreFear -= abs(reg);
1058
+ float newViolation = (bleuHope - bleuFear) - (modelScoreHope - modelScoreFear);
1059
+ cerr << "Rank " << rank << ", epoch " << epoch << ", regularized violation: " << newViolation << endl;
1060
+ }
1061
+ modelScoresHope[batchPosition].push_back(modelScoreHope);
1062
+ modelScoresFear[batchPosition].push_back(modelScoreFear);
1063
+
1064
+ featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
1065
+ featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
1066
+ } else {
1067
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." << endl;
1068
+ skip_example = 1;
1069
+ }
1070
+ } else cerr << endl;
1071
+ }
1072
+ if (max_bleu_diff) {
1073
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl;
1074
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
1075
+ float hopeScore = bleuScores[batchPosition][i];
1076
+ if (modelPlusBleu) hopeScore += modelScores[batchPosition][i];
1077
+ BleuIndexPair hope(hopeScore, i);
1078
+ queueHope.Push(hope);
1079
+
1080
+ float fearScore = -1*(bleuScores[batchPosition][i]);
1081
+ if (modelPlusBleu) fearScore += modelScores[batchPosition][i];
1082
+ BleuIndexPair fear(fearScore, i);
1083
+ queueFear.Push(fear);
1084
+ }
1085
+ skip_example = 0;
1086
+ }
1087
+ cerr << endl;
1088
+
1089
+ vector<BleuIndexPair> hopeList, fearList;
1090
+ for (size_t i=0; i<hope_n && !queueHope.Empty(); ++i) hopeList.push_back(queueHope.Pop());
1091
+ for (size_t i=0; i<fear_n && !queueFear.Empty(); ++i) fearList.push_back(queueFear.Pop());
1092
+ for (size_t i=0; i<hopeList.size(); ++i) {
1093
+ //float bleuHope = hopeList[i].first;
1094
+ size_t indexHope = hopeList[i].second;
1095
+ float bleuHope = bleuScores[batchPosition][indexHope];
1096
+ for (size_t j=0; j<fearList.size(); ++j) {
1097
+ //float bleuFear = -1*(fearList[j].first);
1098
+ size_t indexFear = fearList[j].second;
1099
+ float bleuFear = bleuScores[batchPosition][indexFear];
1100
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
1101
+ bleuScoresHope[batchPosition].push_back(bleuHope);
1102
+ bleuScoresFear[batchPosition].push_back(bleuFear);
1103
+ featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
1104
+ featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
1105
+ float modelScoreHope = modelScores[batchPosition][indexHope];
1106
+ float modelScoreFear = modelScores[batchPosition][indexFear];
1107
+
1108
+ modelScoresHope[batchPosition].push_back(modelScoreHope);
1109
+ modelScoresFear[batchPosition].push_back(modelScoreFear);
1110
+
1111
+ featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
1112
+ featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
1113
+ }
1114
+ }
1115
+ if (!makePairs)
1116
+ cerr << "Rank " << rank << ", epoch " << epoch << "summing up hope and fear vectors, no pairs" << endl;
1117
+ }
1118
+
1119
+ // next input sentence
1120
+ ++sid;
1121
+ ++actualBatchSize;
1122
+ ++shardPosition;
1123
+ } // end of batch loop
1124
+
1125
+ if (examples_in_batch == 0 || (kbest && skip_example)) {
1126
+ cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl;
1127
+ } else {
1128
+ vector<vector<float> > losses(actualBatchSize);
1129
+ if (model_hope_fear) {
1130
+ // Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
1131
+ for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
1132
+ for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
1133
+ losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]);
1134
+ }
1135
+ }
1136
+ }
1137
+
1138
+ // set weight for bleu feature to 0 before optimizing
1139
+ vector<FeatureFunction*>::const_iterator iter;
1140
+ const vector<FeatureFunction*> &featureFunctions2 = FeatureFunction::GetFeatureFunctions();
1141
+ for (iter = featureFunctions2.begin(); iter != featureFunctions2.end(); ++iter) {
1142
+ if ((*iter)->GetScoreProducerDescription() == "BleuScoreFeature") {
1143
+ mosesWeights.Assign(*iter, 0);
1144
+ break;
1145
+ }
1146
+ }
1147
+
1148
+ // scale LM feature (to avoid rapid changes)
1149
+ if (scale_lm) {
1150
+ cerr << "scale lm" << endl;
1151
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
1152
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
1153
+ const StatefulFeatureFunction *ff = statefulFFs[i];
1154
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
1155
+
1156
+ if (lm) {
1157
+ // scale down score
1158
+ if (model_hope_fear) {
1159
+ scaleFeatureScore(lm, scale_lm_factor, featureValues, rank, epoch);
1160
+ } else {
1161
+ scaleFeatureScore(lm, scale_lm_factor, featureValuesHope, rank, epoch);
1162
+ scaleFeatureScore(lm, scale_lm_factor, featureValuesFear, rank, epoch);
1163
+ }
1164
+ }
1165
+ }
1166
+ }
1167
+
1168
+ // scale WP
1169
+ if (scale_wp) {
1170
+ // scale up weight
1171
+ WordPenaltyProducer &wp = WordPenaltyProducer::InstanceNonConst();
1172
+
1173
+ // scale down score
1174
+ if (model_hope_fear) {
1175
+ scaleFeatureScore(&wp, scale_wp_factor, featureValues, rank, epoch);
1176
+ } else {
1177
+ scaleFeatureScore(&wp, scale_wp_factor, featureValuesHope, rank, epoch);
1178
+ scaleFeatureScore(&wp, scale_wp_factor, featureValuesFear, rank, epoch);
1179
+ }
1180
+ }
1181
+
1182
+ // print out the feature values
1183
+ if (print_feature_values) {
1184
+ cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
1185
+ if (model_hope_fear) printFeatureValues(featureValues);
1186
+ else {
1187
+ cerr << "hope: " << endl;
1188
+ printFeatureValues(featureValuesHope);
1189
+ cerr << "fear: " << endl;
1190
+ printFeatureValues(featureValuesFear);
1191
+ }
1192
+ }
1193
+
1194
+ // apply learning rates to feature vectors before optimization
1195
+ if (feature_confidence) {
1196
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl;
1197
+ if (model_hope_fear) {
1198
+ applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0);
1199
+ } else {
1200
+ applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0);
1201
+ applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0);
1202
+ }
1203
+ } else {
1204
+ // apply fixed learning rates
1205
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
1206
+ if (core_r0 != 1.0 || sparse_r0 != 1.0) {
1207
+ if (model_hope_fear) {
1208
+ applyLearningRates(featureValues, core_r0, sparse_r0);
1209
+ } else {
1210
+ applyLearningRates(featureValuesHope, core_r0, sparse_r0);
1211
+ applyLearningRates(featureValuesFear, core_r0, sparse_r0);
1212
+ }
1213
+ }
1214
+ }
1215
+
1216
+ // Run optimiser on batch:
1217
+ VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
1218
+ size_t update_status = 1;
1219
+ ScoreComponentCollection weightUpdate;
1220
+ if (perceptron_update) {
1221
+ vector<vector<float> > dummy1;
1222
+ update_status = optimiser->updateWeightsHopeFear( weightUpdate, featureValuesHope,
1223
+ featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch);
1224
+ } else if (hope_fear) {
1225
+ if (bleuScoresHope[0][0] >= min_oracle_bleu) {
1226
+ if (hope_n == 1 && fear_n ==1 && batchSize == 1 && !hildreth) {
1227
+ update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate,
1228
+ featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0],
1229
+ bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch);
1230
+ } else
1231
+ update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
1232
+ featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
1233
+ modelScoresFear, learning_rate, rank, epoch);
1234
+ } else
1235
+ update_status = 1;
1236
+ } else if (kbest) {
1237
+ if (batchSize == 1 && featureValuesHope[0].size() == 1 && !hildreth) {
1238
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
1239
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
1240
+ update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(
1241
+ weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0],
1242
+ bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0],
1243
+ modelScoresFear[0][0], learning_rate, rank, epoch);
1244
+ } else {
1245
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
1246
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
1247
+ update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
1248
+ featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
1249
+ modelScoresFear, learning_rate, rank, epoch);
1250
+ }
1251
+ } else {
1252
+ // model_hope_fear
1253
+ update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate,
1254
+ featureValues, losses, bleuScores, modelScores, oracleFeatureValues,
1255
+ oracleBleuScores, oracleModelScores, learning_rate, rank, epoch);
1256
+ }
1257
+
1258
+ // sumStillViolatedConstraints += update_status;
1259
+
1260
+ if (update_status == 0) { // if weights were updated
1261
+ // apply weight update
1262
+ if (debug)
1263
+ cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl;
1264
+
1265
+ if (feature_confidence) {
1266
+ // update confidence counts based on weight update
1267
+ confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts);
1268
+
1269
+ // update feature learning rates
1270
+ featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0);
1271
+ }
1272
+
1273
+ // apply weight update to Moses weights
1274
+ mosesWeights.PlusEquals(weightUpdate);
1275
+
1276
+ if (normaliseWeights)
1277
+ mosesWeights.L1Normalise();
1278
+
1279
+ cumulativeWeights.PlusEquals(mosesWeights);
1280
+ if (sparseAverage) {
1281
+ ScoreComponentCollection binary;
1282
+ binary.SetToBinaryOf(mosesWeights);
1283
+ cumulativeWeightsBinary.PlusEquals(binary);
1284
+ }
1285
+
1286
+ ++numberOfUpdates;
1287
+ ++numberOfUpdatesThisEpoch;
1288
+ if (averageWeights) {
1289
+ ScoreComponentCollection averageWeights(cumulativeWeights);
1290
+ if (accumulateWeights) {
1291
+ averageWeights.DivideEquals(numberOfUpdates);
1292
+ } else {
1293
+ averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
1294
+ }
1295
+
1296
+ mosesWeights = averageWeights;
1297
+ }
1298
+
1299
+ // set new Moses weights
1300
+ decoder->setWeights(mosesWeights);
1301
+ //cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl;
1302
+ }
1303
+
1304
+ // update history (for approximate document Bleu)
1305
+ if (historyBleu || simpleHistoryBleu) {
1306
+ for (size_t i = 0; i < oneBests.size(); ++i)
1307
+ cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1best length: " << oneBests[i].size() << " ";
1308
+ decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch);
1309
+ deleteTranslations(oneBests);
1310
+ }
1311
+ } // END TRANSLATE AND UPDATE BATCH
1312
+
1313
+ // size of all shards except for the last one
1314
+ size_t generalShardSize;
1315
+ if (trainWithMultipleFolds)
1316
+ generalShardSize = order.size()/coresPerFold;
1317
+ else
1318
+ generalShardSize = order.size()/size;
1319
+
1320
+ size_t mixing_base = mixingFrequency == 0 ? 0 : generalShardSize / mixingFrequency;
1321
+ size_t dumping_base = weightDumpFrequency == 0 ? 0 : generalShardSize / weightDumpFrequency;
1322
+ bool mix = evaluateModulo(shardPosition, mixing_base, actualBatchSize);
1323
+
1324
+ // mix weights?
1325
+ if (mix) {
1326
+ #ifdef MPI_ENABLE
1327
+ cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl;
1328
+ // collect all weights in mixedWeights and divide by number of processes
1329
+ mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0);
1330
+
1331
+ // mix confidence counts
1332
+ //mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0);
1333
+ ScoreComponentCollection totalBinary;
1334
+ if (sparseAverage) {
1335
+ ScoreComponentCollection binary;
1336
+ binary.SetToBinaryOf(mosesWeights);
1337
+ mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
1338
+ }
1339
+ if (rank == 0) {
1340
+ // divide by number of processes
1341
+ if (sparseNoAverage)
1342
+ mixedWeights.CoreDivideEquals(size); // average only core weights
1343
+ else if (sparseAverage)
1344
+ mixedWeights.DivideEquals(totalBinary);
1345
+ else
1346
+ mixedWeights.DivideEquals(size);
1347
+
1348
+ // divide confidence counts
1349
+ //mixedConfidenceCounts.DivideEquals(size);
1350
+
1351
+ // normalise weights after averaging
1352
+ if (normaliseWeights) {
1353
+ mixedWeights.L1Normalise();
1354
+ }
1355
+
1356
+ ++weightMixingThisEpoch;
1357
+
1358
+ if (pruneZeroWeights) {
1359
+ size_t pruned = mixedWeights.PruneZeroWeightFeatures();
1360
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1361
+ << pruned << " zero-weighted features pruned from mixedWeights." << endl;
1362
+
1363
+ pruned = cumulativeWeights.PruneZeroWeightFeatures();
1364
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1365
+ << pruned << " zero-weighted features pruned from cumulativeWeights." << endl;
1366
+ }
1367
+
1368
+ if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) {
1369
+ size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff);
1370
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1371
+ << pruned << " features pruned from mixedWeights." << endl;
1372
+
1373
+ pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff);
1374
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1375
+ << pruned << " features pruned from cumulativeWeights." << endl;
1376
+ }
1377
+
1378
+ if (weightMixingThisEpoch == mixingFrequency || reg_on_every_mix) {
1379
+ if (l1_regularize) {
1380
+ size_t pruned;
1381
+ if (l1_reg_sparse)
1382
+ pruned = mixedWeights.SparseL1Regularize(l1_lambda);
1383
+ else
1384
+ pruned = mixedWeights.L1Regularize(l1_lambda);
1385
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1386
+ << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
1387
+ }
1388
+ if (l2_regularize) {
1389
+ if (l2_reg_sparse)
1390
+ mixedWeights.SparseL2Regularize(l2_lambda);
1391
+ else
1392
+ mixedWeights.L2Regularize(l2_lambda);
1393
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1394
+ << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
1395
+ }
1396
+ }
1397
+ }
1398
+
1399
+ // broadcast average weights from process 0
1400
+ mpi::broadcast(world, mixedWeights, 0);
1401
+ decoder->setWeights(mixedWeights);
1402
+ mosesWeights = mixedWeights;
1403
+
1404
+ // broadcast summed confidence counts
1405
+ //mpi::broadcast(world, mixedConfidenceCounts, 0);
1406
+ //confidenceCounts = mixedConfidenceCounts;
1407
+ #endif
1408
+ #ifndef MPI_ENABLE
1409
+ //cerr << "\nRank " << rank << ", no mixing, weights: " << mosesWeights << endl;
1410
+ mixedWeights = mosesWeights;
1411
+ #endif
1412
+ } // end mixing
1413
+
1414
+ // Dump weights?
1415
+ if (trainWithMultipleFolds || weightEpochDump == weightDumpFrequency) {
1416
+ // dump mixed weights at end of every epoch to enable continuing a crashed experiment
1417
+ // (for jackknife every time the weights are mixed)
1418
+ ostringstream filename;
1419
+ if (epoch < 10)
1420
+ filename << weightDumpStem << "_mixed_0" << epoch;
1421
+ else
1422
+ filename << weightDumpStem << "_mixed_" << epoch;
1423
+
1424
+ if (weightDumpFrequency > 1)
1425
+ filename << "_" << weightEpochDump;
1426
+
1427
+ mixedWeights.Save(filename.str());
1428
+ cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
1429
+ }
1430
+ if (dumpMixedWeights) {
1431
+ if (mix && rank == 0 && !weightDumpStem.empty()) {
1432
+ // dump mixed weights instead of average weights
1433
+ ostringstream filename;
1434
+ if (epoch < 10)
1435
+ filename << weightDumpStem << "_0" << epoch;
1436
+ else
1437
+ filename << weightDumpStem << "_" << epoch;
1438
+
1439
+ if (weightDumpFrequency > 1)
1440
+ filename << "_" << weightEpochDump;
1441
+
1442
+ cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
1443
+ mixedWeights.Save(filename.str());
1444
+ ++weightEpochDump;
1445
+ }
1446
+ } else {
1447
+ if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
1448
+ cerr << "Rank " << rank << ", epoch " << epoch << ", dump weights.. (pos: " << shardPosition << ", base: " << dumping_base << ")" << endl;
1449
+ ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
1450
+ bool proceed = false;
1451
+ if (accumulateWeights) {
1452
+ if (numberOfUpdates > 0) {
1453
+ tmpAverageWeights.DivideEquals(numberOfUpdates);
1454
+ proceed = true;
1455
+ }
1456
+ } else {
1457
+ if (numberOfUpdatesThisEpoch > 0) {
1458
+ if (sparseNoAverage) // average only core weights
1459
+ tmpAverageWeights.CoreDivideEquals(numberOfUpdatesThisEpoch);
1460
+ else if (sparseAverage)
1461
+ tmpAverageWeights.DivideEquals(cumulativeWeightsBinary);
1462
+ else
1463
+ tmpAverageWeights.DivideEquals(numberOfUpdatesThisEpoch);
1464
+ proceed = true;
1465
+ }
1466
+ }
1467
+
1468
+ if (proceed) {
1469
+ #ifdef MPI_ENABLE
1470
+ // average across processes
1471
+ mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
1472
+ ScoreComponentCollection totalBinary;
1473
+ if (sparseAverage) {
1474
+ ScoreComponentCollection binary;
1475
+ binary.SetToBinaryOf(mosesWeights);
1476
+ mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
1477
+ }
1478
+ #endif
1479
+ #ifndef MPI_ENABLE
1480
+ mixedAverageWeights = tmpAverageWeights;
1481
+ //FIXME: What do to for non-mpi version
1482
+ ScoreComponentCollection totalBinary;
1483
+ #endif
1484
+ if (rank == 0 && !weightDumpStem.empty()) {
1485
+ // divide by number of processes
1486
+ if (sparseNoAverage)
1487
+ mixedAverageWeights.CoreDivideEquals(size); // average only core weights
1488
+ else if (sparseAverage)
1489
+ mixedAverageWeights.DivideEquals(totalBinary);
1490
+ else
1491
+ mixedAverageWeights.DivideEquals(size);
1492
+
1493
+ // normalise weights after averaging
1494
+ if (normaliseWeights) {
1495
+ mixedAverageWeights.L1Normalise();
1496
+ }
1497
+
1498
+ // dump final average weights
1499
+ ostringstream filename;
1500
+ if (epoch < 10) {
1501
+ filename << weightDumpStem << "_0" << epoch;
1502
+ } else {
1503
+ filename << weightDumpStem << "_" << epoch;
1504
+ }
1505
+
1506
+ if (weightDumpFrequency > 1) {
1507
+ filename << "_" << weightEpochDump;
1508
+ }
1509
+
1510
+ /*if (accumulateWeights) {
1511
+ cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl;
1512
+ } else {
1513
+ cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
1514
+ }*/
1515
+
1516
+ cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
1517
+ mixedAverageWeights.Save(filename.str());
1518
+ ++weightEpochDump;
1519
+
1520
+ if (weightEpochDump == weightDumpFrequency) {
1521
+ if (l1_regularize) {
1522
+ size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda);
1523
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1524
+ << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
1525
+
1526
+ }
1527
+ if (l2_regularize) {
1528
+ mixedAverageWeights.SparseL2Regularize(l2_lambda);
1529
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1530
+ << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl;
1531
+ }
1532
+
1533
+ if (l1_regularize || l2_regularize) {
1534
+ filename << "_reg";
1535
+ cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
1536
+ mixedAverageWeights.Save(filename.str());
1537
+ }
1538
+ }
1539
+
1540
+ if (weightEpochDump == weightDumpFrequency && printFeatureCounts) {
1541
+ // print out all features with counts
1542
+ stringstream s1, s2;
1543
+ s1 << "sparse_feature_hope_counts" << "_" << epoch;
1544
+ s2 << "sparse_feature_fear_counts" << "_" << epoch;
1545
+ ofstream sparseFeatureCountsHope(s1.str().c_str());
1546
+ ofstream sparseFeatureCountsFear(s2.str().c_str());
1547
+
1548
+ mixedAverageWeights.PrintSparseHopeFeatureCounts(sparseFeatureCountsHope);
1549
+ mixedAverageWeights.PrintSparseFearFeatureCounts(sparseFeatureCountsFear);
1550
+ sparseFeatureCountsHope.close();
1551
+ sparseFeatureCountsFear.close();
1552
+ }
1553
+ }
1554
+ }
1555
+ }// end dumping
1556
+ } // end if dump
1557
+ } // end of shard loop, end of this epoch
1558
+ cerr << "Rank " << rank << ", epoch " << epoch << ", end of epoch.." << endl;
1559
+
1560
+ if (historyBleu || simpleHistoryBleu) {
1561
+ cerr << "Bleu feature history after epoch " << epoch << endl;
1562
+ decoder->printBleuFeatureHistory(cerr);
1563
+ }
1564
+ // cerr << "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << endl;
1565
+
1566
+ // Check whether there were any weight updates during this epoch
1567
+ size_t sumUpdates;
1568
+ size_t *sendbuf_uint, *recvbuf_uint;
1569
+ sendbuf_uint = (size_t *) malloc(sizeof(size_t));
1570
+ recvbuf_uint = (size_t *) malloc(sizeof(size_t));
1571
+ #ifdef MPI_ENABLE
1572
+ sendbuf_uint[0] = numberOfUpdatesThisEpoch;
1573
+ recvbuf_uint[0] = 0;
1574
+ MPI_Reduce(sendbuf_uint, recvbuf_uint, 1, MPI_UNSIGNED, MPI_SUM, 0, world);
1575
+ sumUpdates = recvbuf_uint[0];
1576
+ #endif
1577
+ #ifndef MPI_ENABLE
1578
+ sumUpdates = numberOfUpdatesThisEpoch;
1579
+ #endif
1580
+ if (rank == 0 && sumUpdates == 0) {
1581
+ cerr << "\nNo weight updates during this epoch.. stopping." << endl;
1582
+ stop = true;
1583
+ #ifdef MPI_ENABLE
1584
+ mpi::broadcast(world, stop, 0);
1585
+ #endif
1586
+ }
1587
+
1588
+ if (!stop) {
1589
+ // Test if weights have converged
1590
+ if (weightConvergence) {
1591
+ bool reached = true;
1592
+ if (rank == 0 && (epoch >= 2)) {
1593
+ ScoreComponentCollection firstDiff, secondDiff;
1594
+ if (dumpMixedWeights) {
1595
+ firstDiff = mixedWeights;
1596
+ firstDiff.MinusEquals(mixedWeightsPrevious);
1597
+ secondDiff = mixedWeights;
1598
+ secondDiff.MinusEquals(mixedWeightsBeforePrevious);
1599
+ } else {
1600
+ firstDiff = mixedAverageWeights;
1601
+ firstDiff.MinusEquals(mixedAverageWeightsPrevious);
1602
+ secondDiff = mixedAverageWeights;
1603
+ secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious);
1604
+ }
1605
+ VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << " (max: " << firstDiff.GetLInfNorm() << ")" << endl);
1606
+ VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << " (max: " << secondDiff.GetLInfNorm() << ")" << endl << endl);
1607
+
1608
+ // check whether stopping criterion has been reached
1609
+ // (both difference vectors must have all weight changes smaller than min_weight_change)
1610
+ if (firstDiff.GetLInfNorm() >= min_weight_change)
1611
+ reached = false;
1612
+ if (secondDiff.GetLInfNorm() >= min_weight_change)
1613
+ reached = false;
1614
+ if (reached) {
1615
+ // stop MIRA
1616
+ stop = true;
1617
+ cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." << endl;
1618
+ ScoreComponentCollection dummy;
1619
+ ostringstream endfilename;
1620
+ endfilename << "stopping";
1621
+ dummy.Save(endfilename.str());
1622
+ }
1623
+ }
1624
+
1625
+ mixedWeightsBeforePrevious = mixedWeightsPrevious;
1626
+ mixedWeightsPrevious = mixedWeights;
1627
+ mixedAverageWeightsBeforePrevious = mixedAverageWeightsPrevious;
1628
+ mixedAverageWeightsPrevious = mixedAverageWeights;
1629
+ #ifdef MPI_ENABLE
1630
+ mpi::broadcast(world, stop, 0);
1631
+ #endif
1632
+ } //end if (weightConvergence)
1633
+ }
1634
+ } // end of epoch loop
1635
+
1636
+ #ifdef MPI_ENABLE
1637
+ MPI_Finalize();
1638
+ #endif
1639
+
1640
+ time(&now);
1641
+ cerr << "Rank " << rank << ", " << ctime(&now);
1642
+
1643
+ if (rank == 0) {
1644
+ ScoreComponentCollection dummy;
1645
+ ostringstream endfilename;
1646
+ endfilename << "finished";
1647
+ dummy.Save(endfilename.str());
1648
+ }
1649
+
1650
+ delete decoder;
1651
+ exit(0);
1652
+ }
1653
+
1654
// Read every line of `filename` into `sentences` (one element per line,
// newline stripped by getline). Returns false if the file cannot be opened,
// true otherwise; previously loaded contents of `sentences` are kept.
bool loadSentences(const string& filename, vector<string>& sentences)
{
  ifstream in(filename.c_str());
  if (!in)
    return false;
  // Append each line until EOF; the stream closes automatically on scope exit.
  for (string line; getline(in, line); )
    sentences.push_back(line);
  return true;
}
1664
+
1665
// Decide whether to mix or dump weights at this point in the shard.
// Returns true when any of the `actual_batch_size` positions just processed
// (shard_position, shard_position-1, ...) falls on a multiple of
// `mix_or_dump_base`; a base of 0 disables the check entirely.
bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size)
{
  if (mix_or_dump_base == 0)
    return false;

  // Single-sentence batches: a plain modulo test suffices.
  if (actual_batch_size <= 1)
    return (shard_position % mix_or_dump_base) == 0;

  // Batched case: the batch covers several shard positions, so test each
  // position the batch advanced over.
  for (size_t remaining = actual_batch_size; remaining > 0; --remaining) {
    if (shard_position % mix_or_dump_base == 0)
      return true;
    --shard_position;
  }
  return false;
}
1684
+
1685
+ void printFeatureValues(vector<vector<ScoreComponentCollection> > &featureValues)
1686
+ {
1687
+ for (size_t i = 0; i < featureValues.size(); ++i) {
1688
+ for (size_t j = 0; j < featureValues[i].size(); ++j) {
1689
+ cerr << featureValues[i][j] << endl;
1690
+ }
1691
+ }
1692
+ cerr << endl;
1693
+ }
1694
+
1695
+ void deleteTranslations(vector<vector<const Word*> > &translations)
1696
+ {
1697
+ for (size_t i = 0; i < translations.size(); ++i) {
1698
+ for (size_t j = 0; j < translations[i].size(); ++j) {
1699
+ delete translations[i][j];
1700
+ }
1701
+ }
1702
+ }
1703
+
1704
+ void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, vector<string> &inputSentences, MosesDecoder* decoder, size_t n, float bleuWeight)
1705
+ {
1706
+ if (decode == 1)
1707
+ cerr << "Rank " << rank << ", decoding dev input set according to hope objective.. " << endl;
1708
+ else if (decode == 2)
1709
+ cerr << "Rank " << rank << ", decoding dev input set according to fear objective.. " << endl;
1710
+ else
1711
+ cerr << "Rank " << rank << ", decoding dev input set according to normal objective.. " << endl;
1712
+
1713
+ // Create shards according to the number of processes used
1714
+ vector<size_t> order;
1715
+ for (size_t i = 0; i < inputSentences.size(); ++i)
1716
+ order.push_back(i);
1717
+
1718
+ vector<size_t> shard;
1719
+ float shardSize = (float) (order.size()) / size;
1720
+ size_t shardStart = (size_t) (shardSize * rank);
1721
+ size_t shardEnd = (size_t) (shardSize * (rank + 1));
1722
+ if (rank == size - 1) {
1723
+ shardEnd = inputSentences.size();
1724
+ shardSize = shardEnd - shardStart;
1725
+ }
1726
+ VERBOSE(1, "Rank " << rank << ", shard start: " << shardStart << " Shard end: " << shardEnd << endl);
1727
+ VERBOSE(1, "Rank " << rank << ", shard size: " << shardSize << endl);
1728
+ shard.resize(shardSize);
1729
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
1730
+
1731
+ // open files for writing
1732
+ stringstream fname;
1733
+ fname << filename << ".rank" << rank;
1734
+ filename = fname.str();
1735
+ ostringstream filename_nbest;
1736
+ filename_nbest << filename << "." << n << "best";
1737
+ ofstream out(filename.c_str());
1738
+ ofstream nbest_out((filename_nbest.str()).c_str());
1739
+ if (!out) {
1740
+ ostringstream msg;
1741
+ msg << "Unable to open " << fname.str();
1742
+ throw runtime_error(msg.str());
1743
+ }
1744
+ if (!nbest_out) {
1745
+ ostringstream msg;
1746
+ msg << "Unable to open " << filename_nbest;
1747
+ throw runtime_error(msg.str());
1748
+ }
1749
+
1750
+ for (size_t i = 0; i < shard.size(); ++i) {
1751
+ size_t sid = shard[i];
1752
+ string& input = inputSentences[sid];
1753
+
1754
+ vector<vector<ScoreComponentCollection> > dummyFeatureValues;
1755
+ vector<vector<float> > dummyBleuScores;
1756
+ vector<vector<float> > dummyModelScores;
1757
+
1758
+ vector<ScoreComponentCollection> newFeatureValues;
1759
+ vector<float> newScores;
1760
+ dummyFeatureValues.push_back(newFeatureValues);
1761
+ dummyBleuScores.push_back(newScores);
1762
+ dummyModelScores.push_back(newScores);
1763
+
1764
+ float factor = 0.0;
1765
+ if (decode == 1) factor = 1.0;
1766
+ if (decode == 2) factor = -1.0;
1767
+ cerr << "Rank " << rank << ", translating sentence " << sid << endl;
1768
+ bool realBleu = false;
1769
+ vector< vector<const Word*> > nbestOutput = decoder->getNBest(input, sid, n, factor, bleuWeight, dummyFeatureValues[0],
1770
+ dummyBleuScores[0], dummyModelScores[0], n, realBleu, true, false, rank, 0, "");
1771
+ cerr << endl;
1772
+ decoder->cleanup(StaticData::Instance().IsChart());
1773
+
1774
+ for (size_t i = 0; i < nbestOutput.size(); ++i) {
1775
+ vector<const Word*> output = nbestOutput[i];
1776
+ stringstream translation;
1777
+ for (size_t k = 0; k < output.size(); ++k) {
1778
+ Word* w = const_cast<Word*>(output[k]);
1779
+ translation << w->GetString(0);
1780
+ translation << " ";
1781
+ }
1782
+
1783
+ if (i == 0)
1784
+ out << translation.str() << endl;
1785
+ nbest_out << sid << " ||| " << translation.str() << " ||| " << dummyFeatureValues[0][i] <<
1786
+ " ||| " << dummyModelScores[0][i] << " ||| sBleu=" << dummyBleuScores[0][i] << endl;
1787
+ }
1788
+ }
1789
+
1790
+ out.close();
1791
+ nbest_out.close();
1792
+ cerr << "Closing files " << filename << " and " << filename_nbest.str() << endl;
1793
+
1794
+ #ifdef MPI_ENABLE
1795
+ MPI_Finalize();
1796
+ #endif
1797
+
1798
+ time_t now;
1799
+ time(&now);
1800
+ cerr << "Rank " << rank << ", " << ctime(&now);
1801
+
1802
+ delete decoder;
1803
+ exit(0);
1804
+ }
1805
+
1806
+ void applyLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0)
1807
+ {
1808
+ for (size_t i=0; i<featureValues.size(); ++i) // each item in batch
1809
+ for (size_t j=0; j<featureValues[i].size(); ++j) // each item in nbest
1810
+ featureValues[i][j].MultiplyEquals(core_r0, sparse_r0);
1811
+ }
1812
+
1813
+ void applyPerFeatureLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0)
1814
+ {
1815
+ for (size_t i=0; i<featureValues.size(); ++i) // each item in batch
1816
+ for (size_t j=0; j<featureValues[i].size(); ++j) // each item in nbest
1817
+ featureValues[i][j].MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
1818
+ }
1819
+
1820
+ void scaleFeatureScore(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
1821
+ {
1822
+ string name = sp->GetScoreProducerDescription();
1823
+
1824
+ // scale down score
1825
+ float featureScore;
1826
+ for (size_t i=0; i<featureValues.size(); ++i) { // each item in batch
1827
+ for (size_t j=0; j<featureValues[i].size(); ++j) { // each item in nbest
1828
+ featureScore = featureValues[i][j].GetScoreForProducer(sp);
1829
+ featureValues[i][j].Assign(sp, featureScore*scaling_factor);
1830
+ //cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << featureScore << " to " << featureScore/scaling_factor << endl;
1831
+ }
1832
+ }
1833
+ }
1834
+
1835
+ void scaleFeatureScores(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
1836
+ {
1837
+ string name = sp->GetScoreProducerDescription();
1838
+
1839
+ // scale down score
1840
+ for (size_t i=0; i<featureValues.size(); ++i) { // each item in batch
1841
+ for (size_t j=0; j<featureValues[i].size(); ++j) { // each item in nbest
1842
+ vector<float> featureScores = featureValues[i][j].GetScoresForProducer(sp);
1843
+ for (size_t k=0; k<featureScores.size(); ++k)
1844
+ featureScores[k] *= scaling_factor;
1845
+ featureValues[i][j].Assign(sp, featureScores);
1846
+ //cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << featureScore << " to " << featureScore/scaling_factor << endl;
1847
+ }
1848
+ }
1849
+ }
mosesdecoder/contrib/mira/Perceptron.cpp ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "Optimiser.h"
21
+
22
+ using namespace Moses;
23
+ using namespace std;
24
+
25
+ namespace Mira
26
+ {
27
+
28
+ size_t Perceptron::updateWeightsHopeFear(
29
+ ScoreComponentCollection& weightUpdate,
30
+ const vector< vector<ScoreComponentCollection> >& featureValuesHope,
31
+ const vector< vector<ScoreComponentCollection> >& featureValuesFear,
32
+ const vector< vector<float> >& dummy1,
33
+ const vector< vector<float> >& dummy2,
34
+ const vector< vector<float> >& dummy3,
35
+ const vector< vector<float> >& dummy4,
36
+ float perceptron_learning_rate,
37
+ size_t rank,
38
+ size_t epoch,
39
+ int updatePosition)
40
+ {
41
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope[0][0] << endl;
42
+ cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear[0][0] << endl;
43
+ ScoreComponentCollection featureValueDiff = featureValuesHope[0][0];
44
+ featureValueDiff.MinusEquals(featureValuesFear[0][0]);
45
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
46
+ featureValueDiff.MultiplyEquals(perceptron_learning_rate);
47
+ weightUpdate.PlusEquals(featureValueDiff);
48
+ cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << featureValueDiff << endl;
49
+ return 0;
50
+ }
51
+
52
+ }
53
+
mosesdecoder/contrib/mira/mira.xcodeproj/project.pbxproj ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // !$*UTF8*$!
2
+ {
3
+ archiveVersion = 1;
4
+ classes = {
5
+ };
6
+ objectVersion = 45;
7
+ objects = {
8
+
9
+ /* Begin PBXBuildFile section */
10
+ 1E141A311243527800123194 /* Perceptron.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E141A2F1243527800123194 /* Perceptron.cpp */; };
11
+ 1E56EBF51243B91600E8315C /* MiraOptimiser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E56EBF41243B91600E8315C /* MiraOptimiser.cpp */; };
12
+ 1E9DC63C1242602F0059001A /* Decoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E9DC6391242602F0059001A /* Decoder.cpp */; };
13
+ 1E9DC63D1242602F0059001A /* Main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E9DC63B1242602F0059001A /* Main.cpp */; };
14
+ 1E9DC6DA1242684C0059001A /* libmoses-chart.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E9DC6D1124268310059001A /* libmoses-chart.a */; };
15
+ 1E9DC6DB124268510059001A /* libmoses.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E9DC6CB124268270059001A /* libmoses.a */; };
16
+ 1E9DC6DC124268580059001A /* libOnDiskPt.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E9DC6D9124268440059001A /* libOnDiskPt.a */; };
17
+ 8DD76F6A0486A84900D96B5E /* mira.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6859E8B029090EE04C91782 /* mira.1 */; };
18
+ /* End PBXBuildFile section */
19
+
20
+ /* Begin PBXContainerItemProxy section */
21
+ 1E9DC6CA124268270059001A /* PBXContainerItemProxy */ = {
22
+ isa = PBXContainerItemProxy;
23
+ containerPortal = 1E9DC6C6124268270059001A /* moses.xcodeproj */;
24
+ proxyType = 2;
25
+ remoteGlobalIDString = D2AAC046055464E500DB518D;
26
+ remoteInfo = moses;
27
+ };
28
+ 1E9DC6D0124268310059001A /* PBXContainerItemProxy */ = {
29
+ isa = PBXContainerItemProxy;
30
+ containerPortal = 1E9DC6CC124268310059001A /* moses-chart.xcodeproj */;
31
+ proxyType = 2;
32
+ remoteGlobalIDString = D2AAC046055464E500DB518D;
33
+ remoteInfo = "moses-chart";
34
+ };
35
+ 1E9DC6D8124268440059001A /* PBXContainerItemProxy */ = {
36
+ isa = PBXContainerItemProxy;
37
+ containerPortal = 1E9DC6D4124268440059001A /* OnDiskPt.xcodeproj */;
38
+ proxyType = 2;
39
+ remoteGlobalIDString = D2AAC046055464E500DB518D;
40
+ remoteInfo = OnDiskPt;
41
+ };
42
+ 1EF4E84C12440612006233A0 /* PBXContainerItemProxy */ = {
43
+ isa = PBXContainerItemProxy;
44
+ containerPortal = 1E9DC6C6124268270059001A /* moses.xcodeproj */;
45
+ proxyType = 1;
46
+ remoteGlobalIDString = D2AAC045055464E500DB518D /* moses */;
47
+ remoteInfo = moses;
48
+ };
49
+ 1EF4E84E12440612006233A0 /* PBXContainerItemProxy */ = {
50
+ isa = PBXContainerItemProxy;
51
+ containerPortal = 1E9DC6CC124268310059001A /* moses-chart.xcodeproj */;
52
+ proxyType = 1;
53
+ remoteGlobalIDString = D2AAC045055464E500DB518D /* moses-chart */;
54
+ remoteInfo = "moses-chart";
55
+ };
56
+ 1EF4E85012440612006233A0 /* PBXContainerItemProxy */ = {
57
+ isa = PBXContainerItemProxy;
58
+ containerPortal = 1E9DC6D4124268440059001A /* OnDiskPt.xcodeproj */;
59
+ proxyType = 1;
60
+ remoteGlobalIDString = D2AAC045055464E500DB518D /* OnDiskPt */;
61
+ remoteInfo = OnDiskPt;
62
+ };
63
+ /* End PBXContainerItemProxy section */
64
+
65
+ /* Begin PBXCopyFilesBuildPhase section */
66
+ 8DD76F690486A84900D96B5E /* CopyFiles */ = {
67
+ isa = PBXCopyFilesBuildPhase;
68
+ buildActionMask = 8;
69
+ dstPath = /usr/share/man/man1/;
70
+ dstSubfolderSpec = 0;
71
+ files = (
72
+ 8DD76F6A0486A84900D96B5E /* mira.1 in CopyFiles */,
73
+ );
74
+ runOnlyForDeploymentPostprocessing = 1;
75
+ };
76
+ /* End PBXCopyFilesBuildPhase section */
77
+
78
+ /* Begin PBXFileReference section */
79
+ 1E141A2F1243527800123194 /* Perceptron.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Perceptron.cpp; sourceTree = "<group>"; };
80
+ 1E56EBF41243B91600E8315C /* MiraOptimiser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MiraOptimiser.cpp; sourceTree = "<group>"; };
81
+ 1E9DC6391242602F0059001A /* Decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Decoder.cpp; sourceTree = "<group>"; };
82
+ 1E9DC63A1242602F0059001A /* Decoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Decoder.h; sourceTree = "<group>"; };
83
+ 1E9DC63B1242602F0059001A /* Main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Main.cpp; sourceTree = "<group>"; };
84
+ 1E9DC63E124260370059001A /* Optimiser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Optimiser.h; sourceTree = "<group>"; };
85
+ 1E9DC6C6124268270059001A /* moses.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; name = moses.xcodeproj; path = ../moses/moses.xcodeproj; sourceTree = SOURCE_ROOT; };
86
+ 1E9DC6CC124268310059001A /* moses-chart.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; name = "moses-chart.xcodeproj"; path = "../moses-chart/moses-chart.xcodeproj"; sourceTree = SOURCE_ROOT; };
87
+ 1E9DC6D4124268440059001A /* OnDiskPt.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; name = OnDiskPt.xcodeproj; path = ../OnDiskPt/OnDiskPt.xcodeproj; sourceTree = SOURCE_ROOT; };
88
+ 1E9DC76712426FC60059001A /* Main.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Main.h; sourceTree = "<group>"; };
89
+ 8DD76F6C0486A84900D96B5E /* mira */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = mira; sourceTree = BUILT_PRODUCTS_DIR; };
90
+ C6859E8B029090EE04C91782 /* mira.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = mira.1; sourceTree = "<group>"; };
91
+ /* End PBXFileReference section */
92
+
93
+ /* Begin PBXFrameworksBuildPhase section */
94
+ 8DD76F660486A84900D96B5E /* Frameworks */ = {
95
+ isa = PBXFrameworksBuildPhase;
96
+ buildActionMask = 2147483647;
97
+ files = (
98
+ 1E9DC6DC124268580059001A /* libOnDiskPt.a in Frameworks */,
99
+ 1E9DC6DB124268510059001A /* libmoses.a in Frameworks */,
100
+ 1E9DC6DA1242684C0059001A /* libmoses-chart.a in Frameworks */,
101
+ );
102
+ runOnlyForDeploymentPostprocessing = 0;
103
+ };
104
+ /* End PBXFrameworksBuildPhase section */
105
+
106
+ /* Begin PBXGroup section */
107
+ 08FB7794FE84155DC02AAC07 /* mira */ = {
108
+ isa = PBXGroup;
109
+ children = (
110
+ 1E9DC6D4124268440059001A /* OnDiskPt.xcodeproj */,
111
+ 1E9DC6CC124268310059001A /* moses-chart.xcodeproj */,
112
+ 1E9DC6C6124268270059001A /* moses.xcodeproj */,
113
+ 08FB7795FE84155DC02AAC07 /* Source */,
114
+ C6859E8C029090F304C91782 /* Documentation */,
115
+ 1AB674ADFE9D54B511CA2CBB /* Products */,
116
+ );
117
+ name = mira;
118
+ sourceTree = "<group>";
119
+ };
120
+ 08FB7795FE84155DC02AAC07 /* Source */ = {
121
+ isa = PBXGroup;
122
+ children = (
123
+ 1E56EBF41243B91600E8315C /* MiraOptimiser.cpp */,
124
+ 1E141A2F1243527800123194 /* Perceptron.cpp */,
125
+ 1E9DC63E124260370059001A /* Optimiser.h */,
126
+ 1E9DC6391242602F0059001A /* Decoder.cpp */,
127
+ 1E9DC63A1242602F0059001A /* Decoder.h */,
128
+ 1E9DC63B1242602F0059001A /* Main.cpp */,
129
+ 1E9DC76712426FC60059001A /* Main.h */,
130
+ );
131
+ name = Source;
132
+ sourceTree = "<group>";
133
+ };
134
+ 1AB674ADFE9D54B511CA2CBB /* Products */ = {
135
+ isa = PBXGroup;
136
+ children = (
137
+ 8DD76F6C0486A84900D96B5E /* mira */,
138
+ );
139
+ name = Products;
140
+ sourceTree = "<group>";
141
+ };
142
+ 1E9DC6C7124268270059001A /* Products */ = {
143
+ isa = PBXGroup;
144
+ children = (
145
+ 1E9DC6CB124268270059001A /* libmoses.a */,
146
+ );
147
+ name = Products;
148
+ sourceTree = "<group>";
149
+ };
150
+ 1E9DC6CD124268310059001A /* Products */ = {
151
+ isa = PBXGroup;
152
+ children = (
153
+ 1E9DC6D1124268310059001A /* libmoses-chart.a */,
154
+ );
155
+ name = Products;
156
+ sourceTree = "<group>";
157
+ };
158
+ 1E9DC6D5124268440059001A /* Products */ = {
159
+ isa = PBXGroup;
160
+ children = (
161
+ 1E9DC6D9124268440059001A /* libOnDiskPt.a */,
162
+ );
163
+ name = Products;
164
+ sourceTree = "<group>";
165
+ };
166
+ C6859E8C029090F304C91782 /* Documentation */ = {
167
+ isa = PBXGroup;
168
+ children = (
169
+ C6859E8B029090EE04C91782 /* mira.1 */,
170
+ );
171
+ name = Documentation;
172
+ sourceTree = "<group>";
173
+ };
174
+ /* End PBXGroup section */
175
+
176
+ /* Begin PBXNativeTarget section */
177
+ 8DD76F620486A84900D96B5E /* mira */ = {
178
+ isa = PBXNativeTarget;
179
+ buildConfigurationList = 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "mira" */;
180
+ buildPhases = (
181
+ 8DD76F640486A84900D96B5E /* Sources */,
182
+ 8DD76F660486A84900D96B5E /* Frameworks */,
183
+ 8DD76F690486A84900D96B5E /* CopyFiles */,
184
+ );
185
+ buildRules = (
186
+ );
187
+ dependencies = (
188
+ 1EF4E84D12440612006233A0 /* PBXTargetDependency */,
189
+ 1EF4E84F12440612006233A0 /* PBXTargetDependency */,
190
+ 1EF4E85112440612006233A0 /* PBXTargetDependency */,
191
+ );
192
+ name = mira;
193
+ productInstallPath = "$(HOME)/bin";
194
+ productName = mira;
195
+ productReference = 8DD76F6C0486A84900D96B5E /* mira */;
196
+ productType = "com.apple.product-type.tool";
197
+ };
198
+ /* End PBXNativeTarget section */
199
+
200
+ /* Begin PBXProject section */
201
+ 08FB7793FE84155DC02AAC07 /* Project object */ = {
202
+ isa = PBXProject;
203
+ buildConfigurationList = 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "mira" */;
204
+ compatibilityVersion = "Xcode 3.1";
205
+ hasScannedForEncodings = 1;
206
+ mainGroup = 08FB7794FE84155DC02AAC07 /* mira */;
207
+ projectDirPath = "";
208
+ projectReferences = (
209
+ {
210
+ ProductGroup = 1E9DC6CD124268310059001A /* Products */;
211
+ ProjectRef = 1E9DC6CC124268310059001A /* moses-chart.xcodeproj */;
212
+ },
213
+ {
214
+ ProductGroup = 1E9DC6C7124268270059001A /* Products */;
215
+ ProjectRef = 1E9DC6C6124268270059001A /* moses.xcodeproj */;
216
+ },
217
+ {
218
+ ProductGroup = 1E9DC6D5124268440059001A /* Products */;
219
+ ProjectRef = 1E9DC6D4124268440059001A /* OnDiskPt.xcodeproj */;
220
+ },
221
+ );
222
+ projectRoot = "";
223
+ targets = (
224
+ 8DD76F620486A84900D96B5E /* mira */,
225
+ );
226
+ };
227
+ /* End PBXProject section */
228
+
229
+ /* Begin PBXReferenceProxy section */
230
+ 1E9DC6CB124268270059001A /* libmoses.a */ = {
231
+ isa = PBXReferenceProxy;
232
+ fileType = archive.ar;
233
+ path = libmoses.a;
234
+ remoteRef = 1E9DC6CA124268270059001A /* PBXContainerItemProxy */;
235
+ sourceTree = BUILT_PRODUCTS_DIR;
236
+ };
237
+ 1E9DC6D1124268310059001A /* libmoses-chart.a */ = {
238
+ isa = PBXReferenceProxy;
239
+ fileType = archive.ar;
240
+ path = "libmoses-chart.a";
241
+ remoteRef = 1E9DC6D0124268310059001A /* PBXContainerItemProxy */;
242
+ sourceTree = BUILT_PRODUCTS_DIR;
243
+ };
244
+ 1E9DC6D9124268440059001A /* libOnDiskPt.a */ = {
245
+ isa = PBXReferenceProxy;
246
+ fileType = archive.ar;
247
+ path = libOnDiskPt.a;
248
+ remoteRef = 1E9DC6D8124268440059001A /* PBXContainerItemProxy */;
249
+ sourceTree = BUILT_PRODUCTS_DIR;
250
+ };
251
+ /* End PBXReferenceProxy section */
252
+
253
+ /* Begin PBXSourcesBuildPhase section */
254
+ 8DD76F640486A84900D96B5E /* Sources */ = {
255
+ isa = PBXSourcesBuildPhase;
256
+ buildActionMask = 2147483647;
257
+ files = (
258
+ 1E9DC63C1242602F0059001A /* Decoder.cpp in Sources */,
259
+ 1E9DC63D1242602F0059001A /* Main.cpp in Sources */,
260
+ 1E141A311243527800123194 /* Perceptron.cpp in Sources */,
261
+ 1E56EBF51243B91600E8315C /* MiraOptimiser.cpp in Sources */,
262
+ );
263
+ runOnlyForDeploymentPostprocessing = 0;
264
+ };
265
+ /* End PBXSourcesBuildPhase section */
266
+
267
+ /* Begin PBXTargetDependency section */
268
+ 1EF4E84D12440612006233A0 /* PBXTargetDependency */ = {
269
+ isa = PBXTargetDependency;
270
+ name = moses;
271
+ targetProxy = 1EF4E84C12440612006233A0 /* PBXContainerItemProxy */;
272
+ };
273
+ 1EF4E84F12440612006233A0 /* PBXTargetDependency */ = {
274
+ isa = PBXTargetDependency;
275
+ name = "moses-chart";
276
+ targetProxy = 1EF4E84E12440612006233A0 /* PBXContainerItemProxy */;
277
+ };
278
+ 1EF4E85112440612006233A0 /* PBXTargetDependency */ = {
279
+ isa = PBXTargetDependency;
280
+ name = OnDiskPt;
281
+ targetProxy = 1EF4E85012440612006233A0 /* PBXContainerItemProxy */;
282
+ };
283
+ /* End PBXTargetDependency section */
284
+
285
+ /* Begin XCBuildConfiguration section */
286
+ 1DEB923208733DC60010E9CD /* Debug */ = {
287
+ isa = XCBuildConfiguration;
288
+ buildSettings = {
289
+ ALWAYS_SEARCH_USER_PATHS = NO;
290
+ COPY_PHASE_STRIP = NO;
291
+ GCC_DYNAMIC_NO_PIC = NO;
292
+ GCC_ENABLE_FIX_AND_CONTINUE = YES;
293
+ GCC_MODEL_TUNING = G5;
294
+ GCC_OPTIMIZATION_LEVEL = 0;
295
+ INSTALL_PATH = /usr/local/bin;
296
+ LIBRARY_SEARCH_PATHS = (
297
+ ../irstlm/lib/i386,
298
+ ../srilm/lib/macosx,
299
+ );
300
+ OTHER_LDFLAGS = (
301
+ "-lboost_program_options",
302
+ "-lz",
303
+ "-lirstlm",
304
+ "-lmisc",
305
+ "-ldstruct",
306
+ "-loolm",
307
+ "-lflm",
308
+ "-llattice",
309
+ );
310
+ PRODUCT_NAME = mira;
311
+ };
312
+ name = Debug;
313
+ };
314
+ 1DEB923308733DC60010E9CD /* Release */ = {
315
+ isa = XCBuildConfiguration;
316
+ buildSettings = {
317
+ ALWAYS_SEARCH_USER_PATHS = NO;
318
+ DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
319
+ GCC_MODEL_TUNING = G5;
320
+ INSTALL_PATH = /usr/local/bin;
321
+ LIBRARY_SEARCH_PATHS = (
322
+ ../irstlm/lib/i386,
323
+ ../srilm/lib/macosx,
324
+ );
325
+ OTHER_LDFLAGS = (
326
+ "-lboost_program_options",
327
+ "-lz",
328
+ "-lirstlm",
329
+ "-lmisc",
330
+ "-ldstruct",
331
+ "-loolm",
332
+ "-lflm",
333
+ "-llattice",
334
+ );
335
+ PRODUCT_NAME = mira;
336
+ };
337
+ name = Release;
338
+ };
339
+ 1DEB923608733DC60010E9CD /* Debug */ = {
340
+ isa = XCBuildConfiguration;
341
+ buildSettings = {
342
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
343
+ GCC_C_LANGUAGE_STANDARD = gnu99;
344
+ GCC_OPTIMIZATION_LEVEL = 0;
345
+ GCC_WARN_ABOUT_RETURN_TYPE = YES;
346
+ GCC_WARN_UNUSED_VARIABLE = YES;
347
+ HEADER_SEARCH_PATHS = (
348
+ /usr/local/include,
349
+ "../moses-chart/src",
350
+ ../moses/src,
351
+ ../irstlm/include,
352
+ );
353
+ ONLY_ACTIVE_ARCH = YES;
354
+ PREBINDING = NO;
355
+ SDKROOT = macosx10.6;
356
+ };
357
+ name = Debug;
358
+ };
359
+ 1DEB923708733DC60010E9CD /* Release */ = {
360
+ isa = XCBuildConfiguration;
361
+ buildSettings = {
362
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
363
+ GCC_C_LANGUAGE_STANDARD = gnu99;
364
+ GCC_WARN_ABOUT_RETURN_TYPE = YES;
365
+ GCC_WARN_UNUSED_VARIABLE = YES;
366
+ HEADER_SEARCH_PATHS = (
367
+ /usr/local/include,
368
+ "../moses-chart/src",
369
+ ../moses/src,
370
+ ../irstlm/include,
371
+ );
372
+ PREBINDING = NO;
373
+ SDKROOT = macosx10.6;
374
+ };
375
+ name = Release;
376
+ };
377
+ /* End XCBuildConfiguration section */
378
+
379
+ /* Begin XCConfigurationList section */
380
+ 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "mira" */ = {
381
+ isa = XCConfigurationList;
382
+ buildConfigurations = (
383
+ 1DEB923208733DC60010E9CD /* Debug */,
384
+ 1DEB923308733DC60010E9CD /* Release */,
385
+ );
386
+ defaultConfigurationIsVisible = 0;
387
+ defaultConfigurationName = Release;
388
+ };
389
+ 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "mira" */ = {
390
+ isa = XCConfigurationList;
391
+ buildConfigurations = (
392
+ 1DEB923608733DC60010E9CD /* Debug */,
393
+ 1DEB923708733DC60010E9CD /* Release */,
394
+ );
395
+ defaultConfigurationIsVisible = 0;
396
+ defaultConfigurationName = Release;
397
+ };
398
+ /* End XCConfigurationList section */
399
+ };
400
+ rootObject = 08FB7793FE84155DC02AAC07 /* Project object */;
401
+ }
mosesdecoder/contrib/moses-speedtest/README.md ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Moses speedtesting framework
2
+
3
+ ### Description
4
+
5
+ This is an automatic test framework that is designed to test the day to day performance changes in Moses.
6
+
7
+ ### Set up
8
+
9
+ #### Set up a Moses repo
10
+ Set up a Moses repo and build it with the desired configuration.
11
+ ```bash
12
+ git clone https://github.com/moses-smt/mosesdecoder.git
13
+ cd mosesdecoder
14
+ ./bjam -j10 --with-cmph=/usr/include/
15
+ ```
16
+ You need to build Moses first, so that the testsuite knows what command you want it to use when rebuilding against newer revisions.
17
+
18
+ #### Create a parent directory.
19
+ Create a parent directory where the **runtests.py** and related scripts and configuration file should reside.
20
+ This should also be the location of the TEST_DIR and TEST_LOG_DIR as explained in the next section.
21
+
22
+ #### Set up a global configuration file.
23
+ You need a configuration file for the testsuite. A sample configuration file is provided in **testsuite\_config**
24
+ <pre>
25
+ MOSES_REPO_PATH: /home/moses-speedtest/moses-standard/mosesdecoder
26
+ DROP_CACHES_COMM: sys_drop_caches 3
27
+ TEST_DIR: /home/moses-speedtest/phrase_tables/tests
28
+ TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
29
+ BASEBRANCH: RELEASE-2.1.1
30
+ MOSES_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-prof
31
+ MOSES_GOOGLE_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-gperftools
32
+ </pre>
33
+
34
+ The _MOSES\_REPO\_PATH_ is the place where you have set up and built moses.
35
+ The _DROP\_CACHES\_COMM_ is the command that would be used to drop caches. It should run without needing root access.
36
+ _TEST\_DIR_ is the directory where all the tests will reside.
37
+ _TEST\_LOG\_DIR_ is the directory where the performance logs will be gathered. It should be created before running the testsuite for the first time.
38
+ _BASEBRANCH_ is the branch against which all new tests will be compared. It should normally be set to be the latest Moses stable release.
39
+ _MOSES\_PROFILER\_REPO_ is a path to a moses repository set up and built with profiling enabled. Optional if you want to produce profiling results.
40
+ _MOSES\_GOOGLE\_PROFILER\_REPO is a path to moses repository set up with full tcmalloc and profiler, as well as shared link for use with gperftools.
41
+ ### Creating tests
42
+
43
+ In order to create a test one should go into the TEST_DIR and create a new folder. That folder will be used for the name of the test.
44
+ Inside that folder one should place a configuration file named **config**. The naming is mandatory.
45
+ An example such configuration file is **test\_config**
46
+
47
+ <pre>
48
+ Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
49
+ LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/,
50
+ Variants: vanilla, cached, ldpre, profile, google-profiler #Can't have cached without ldpre or vanilla
51
+ </pre>
52
+
53
+ The _Command:_ line specifies the executable (which is looked up in the /bin directory of the repo.) and any arguments necessary. Before running the test, the script cds to the current test directory so you can use relative paths.
54
+ The _LDPRE:_ specifies if tests should be run with any LD\_PRELOAD flags.
55
+ The _Variants:_ line specifies what type of tests should we run. This particular line will run the following tests:
56
+ 1. A Vanilla test meaning just the command after _Command_ will be issued.
57
+ 2. A vanilla cached test meaning that after the vanilla test, the test will be run again without dropping caches in order to benchmark performance on cached filesystem.
58
+ 3. A test with LD_PRELOAD ldpreloads moses -f command. For each available LDPRELOAD comma separated library to preload.
59
+ 4. A cached version of all LD_PRELOAD tests.
60
+ 5. A profile variant is only available if you have setup the profiler repository. It produces gprof outputs for all of the above in a subdirectory inside the _TEST\_LOG\_DIR.
61
+
62
+ #### Produce profiler results.
63
+ If you want to produce profiler results together in some tests you need to specify the _MOSES\_PROFILER\_REPO_ in the config
64
+ ```bash
65
+ git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-profile
66
+ cd mosesdecoder-profile
67
+ ./bjam -j10 --with-cmph=/usr/include/ variant=profile
68
+ ```
69
+
70
+ Afterwards for testcases which contain the **profile** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **gprof** output from every run (files ending in **\_profile**).
71
+
72
+ #### Produce google profiler results.
73
+ If you want to produce profiler results together in some tests you need to specify the _MOSES\_GOOGLE\_PROFILER\_REPO in the config
74
+ ```bash
75
+ git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-google-profile
76
+ cd mosesdecoder
77
+ ./bjam link=shared -j10 --full-tcmalloc --with-cmph=/usr/include/
78
+ ```
79
+
80
+ Afterwards for testcases which contain the **google-profiler** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **google-profiler** output from every run (files prefixed with **pprof**). To analyze the output you need to use [pprof](http://google-perftools.googlecode.com/svn/trunk/doc/cpuprofile.html).
81
+
82
+ ### Running tests.
83
+ Running the tests is done through the **runtests.py** script.
84
+
85
+ #### Running all tests.
86
+ To run all tests, with the base branch and the latests revision (and generate new basebranch test data if such is missing) do a:
87
+ ```bash
88
+ python3 runtests.py -c testsuite_config
89
+ ```
90
+
91
+ #### Running specific tests.
92
+ The script allows the user to manually run a particular test or to test against a specific branch or revision:
93
+ <pre>
94
+ moses-speedtest@crom:~/phrase_tables$ python3 runtests.py --help
95
+ usage: runtests.py [-h] -c CONFIGFILE [-s SINGLETESTDIR] [-r REVISION]
96
+ [-b BRANCH]
97
+
98
+ A python based speedtest suite for moses.
99
+
100
+ optional arguments:
101
+ -h, --help show this help message and exit
102
+ -c CONFIGFILE, --configfile CONFIGFILE
103
+ Specify test config file
104
+ -s SINGLETESTDIR, --singletest SINGLETESTDIR
105
+ Single test name directory. Specify directory name,
106
+ not full path!
107
+ -r REVISION, --revision REVISION
108
+ Specify a specific revison for the test.
109
+ -b BRANCH, --branch BRANCH
110
+ Specify a branch for the test.
111
+ </pre>
112
+
113
+ ### Generating HTML report.
114
+ To generate a summary of the test results use the **html\_gen.py** script. It places a file named *index.html* in the current script directory.
115
+ ```bash
116
+ python3 html_gen.py testsuite_config
117
+ ```
118
+ You should use the generated file with the **style.css** file provided in the html directory.
119
+
120
+ ### Command line regression testing.
121
+ Alternatively you could check for regressions from the command line using the **check\_fo\r_regression.py** script:
122
+ ```bash
123
+ python3 check_for_regression.py TESTLOGS_DIRECTORY
124
+ ```
125
+
126
+ Alternatively the results of all tests are logged inside the the specified TESTLOGS directory so you can manually check them for additional information such as date, time, revision, branch, etc...
127
+
128
+ ### Create a cron job:
129
+ Create a cron job to run the tests daily and generate an html report. An example *cronjob* is available.
130
+ ```bash
131
+ #!/bin/sh
132
+ cd /home/moses-speedtest/phrase_tables
133
+
134
+ python3 runtests.py -c testsuite_config #Run the tests.
135
+ python3 html_gen.py testsuite_config #Generate html
136
+
137
+ cp index.html /fs/thor4/html/www/speed-test/ #Update the html
138
+ ```
139
+
140
+ Place the script in _/etc/cron.daily_ for dayly testing
141
+
142
+ ###### Author
143
+ Nikolay Bogoychev, 2014
144
+
145
+ ###### License
146
+ This software is licensed under the LGPL.
mosesdecoder/contrib/moses-speedtest/check_for_regression.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Checks if any of the latests tests has performed considerably different than
2
+ the previous ones. Takes the log directory as an argument."""
3
+ import os
4
+ import sys
5
+ from testsuite_common import Result, processLogLine, bcolors, getLastTwoLines
6
+
7
+ LOGDIR = sys.argv[1] #Get the log directory as an argument
8
+ PERCENTAGE = 5 #Default value for how much a test shoudl change
9
+ if len(sys.argv) == 3:
10
+ PERCENTAGE = float(sys.argv[2]) #Default is 5%, but we can specify more
11
+ #line parameter
12
+
13
+ def printResults(regressed, better, unchanged, firsttime):
14
+ """Pretty print the results in different colours"""
15
+ if regressed != []:
16
+ for item in regressed:
17
+ print(bcolors.RED + "REGRESSION! " + item.testname + " Was: "\
18
+ + str(item.previous) + " Is: " + str(item.current) + " Change: "\
19
+ + str(abs(item.percentage)) + "%. Revision: " + item.revision\
20
+ + bcolors.ENDC)
21
+ print('\n')
22
+ if unchanged != []:
23
+ for item in unchanged:
24
+ print(bcolors.BLUE + "UNCHANGED: " + item.testname + " Revision: " +\
25
+ item.revision + bcolors.ENDC)
26
+ print('\n')
27
+ if better != []:
28
+ for item in better:
29
+ print(bcolors.GREEN + "IMPROVEMENT! " + item.testname + " Was: "\
30
+ + str(item.previous) + " Is: " + str(item.current) + " Change: "\
31
+ + str(abs(item.percentage)) + "%. Revision: " + item.revision\
32
+ + bcolors.ENDC)
33
+ if firsttime != []:
34
+ for item in firsttime:
35
+ print(bcolors.PURPLE + "First time test! " + item.testname +\
36
+ " Took: " + str(item.real) + " seconds. Revision: " +\
37
+ item.revision + bcolors.ENDC)
38
+
39
+
40
+ all_files = os.listdir(LOGDIR)
41
+ regressed = []
42
+ better = []
43
+ unchanged = []
44
+ firsttime = []
45
+
46
+ #Go through all log files and find which tests have performed better.
47
+ for logfile in all_files:
48
+ (line1, line2) = getLastTwoLines(logfile, LOGDIR)
49
+ log1 = processLogLine(line1)
50
+ if line2 == '\n': # Empty line, only one test ever run
51
+ firsttime.append(log1)
52
+ continue
53
+ log2 = processLogLine(line2)
54
+ res = Result(log1.testname, log1.real, log2.real, log2.revision,\
55
+ log2.branch, log1.revision, log1.branch)
56
+ if res.percentage < -PERCENTAGE:
57
+ regressed.append(res)
58
+ elif res.change > PERCENTAGE:
59
+ better.append(res)
60
+ else:
61
+ unchanged.append(res)
62
+
63
+ printResults(regressed, better, unchanged, firsttime)
mosesdecoder/contrib/moses-speedtest/cronjob ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ #!/bin/sh
2
+ cd /home/moses-speedtest/phrase_tables
3
+
4
+ python3 runtests.py -c testsuite_config #Run the tests.
5
+ python3 html_gen.py testsuite_config #Generate html
6
+
7
+ cp index.html /fs/thor4/html/www/speed-test/ #Update the html
mosesdecoder/contrib/moses-speedtest/runtests.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Given a config file, runs tests"""
2
+ import os
3
+ import subprocess
4
+ import time
5
+ import shutil
6
+ from argparse import ArgumentParser
7
+ from testsuite_common import processLogLine
8
+
9
+ def parse_cmd():
10
+ """Parse the command line arguments"""
11
+ description = "A python based speedtest suite for moses."
12
+ parser = ArgumentParser(description=description)
13
+ parser.add_argument("-c", "--configfile", action="store",\
14
+ dest="configfile", required=True,\
15
+ help="Specify test config file")
16
+ parser.add_argument("-s", "--singletest", action="store",\
17
+ dest="singletestdir", default=None,\
18
+ help="Single test name directory. Specify directory name,\
19
+ not full path!")
20
+ parser.add_argument("-r", "--revision", action="store",\
21
+ dest="revision", default=None,\
22
+ help="Specify a specific revison for the test.")
23
+ parser.add_argument("-b", "--branch", action="store",\
24
+ dest="branch", default=None,\
25
+ help="Specify a branch for the test.")
26
+
27
+ arguments = parser.parse_args()
28
+ return arguments
29
+
30
+ def repoinit(testconfig, profiler=None):
31
+ """Determines revision and sets up the repo. If given the profiler optional
32
+ argument, wil init the profiler repo instead of the default one."""
33
+ revision = ''
34
+ #Update the repo
35
+ if profiler == "gnu-profiler":
36
+ if testconfig.repo_prof is not None:
37
+ os.chdir(testconfig.repo_prof)
38
+ else:
39
+ raise ValueError('Profiling repo is not defined')
40
+ elif profiler == "google-profiler":
41
+ if testconfig.repo_gprof is not None:
42
+ os.chdir(testconfig.repo_gprof)
43
+ else:
44
+ raise ValueError('Profiling repo is not defined')
45
+ else:
46
+ os.chdir(testconfig.repo)
47
+ #Checkout specific branch, else maintain main branch
48
+ if testconfig.branch != 'master':
49
+ subprocess.call(['git', 'checkout', testconfig.branch])
50
+ rev, _ = subprocess.Popen(['git', 'rev-parse', 'HEAD'],\
51
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
52
+ revision = str(rev).replace("\\n'", '').replace("b'", '')
53
+ else:
54
+ subprocess.call(['git checkout master'], shell=True)
55
+
56
+ #Check a specific revision. Else checkout master.
57
+ if testconfig.revision:
58
+ subprocess.call(['git', 'checkout', testconfig.revision])
59
+ revision = testconfig.revision
60
+ elif testconfig.branch == 'master':
61
+ subprocess.call(['git pull'], shell=True)
62
+ rev, _ = subprocess.Popen(['git rev-parse HEAD'], stdout=subprocess.PIPE,\
63
+ stderr=subprocess.PIPE, shell=True).communicate()
64
+ revision = str(rev).replace("\\n'", '').replace("b'", '')
65
+
66
+ return revision
67
+
68
+ class Configuration:
69
+ """A simple class to hold all of the configuration constatns"""
70
+ def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev, repo_prof=None, repo_gprof=None):
71
+ self.repo = repo
72
+ self.repo_prof = repo_prof
73
+ self.repo_gprof = repo_gprof
74
+ self.drop_caches = drop_caches
75
+ self.tests = tests
76
+ self.testlogs = testlogs
77
+ self.basebranch = basebranch
78
+ self.baserev = baserev
79
+ self.singletest = None
80
+ self.revision = None
81
+ self.branch = 'master' # Default branch
82
+
83
+ def additional_args(self, singletest, revision, branch):
84
+ """Additional configuration from command line arguments"""
85
+ self.singletest = singletest
86
+ if revision is not None:
87
+ self.revision = revision
88
+ if branch is not None:
89
+ self.branch = branch
90
+
91
+ def set_revision(self, revision):
92
+ """Sets the current revision that is being tested"""
93
+ self.revision = revision
94
+
95
+
96
+ class Test:
97
+ """A simple class to contain all information about tests"""
98
+ def __init__(self, name, command, ldopts, permutations, prof_command=None, gprof_command=None):
99
+ self.name = name
100
+ self.command = command
101
+ self.prof_command = prof_command
102
+ self.gprof_command = gprof_command
103
+ self.ldopts = ldopts.replace(' ', '').split(',') #Not tested yet
104
+ self.permutations = permutations
105
+
106
+ def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None, moses_gprof_repo=None):
107
+ """Parses the config file"""
108
+ command, ldopts, prof_command, gprof_command = '', '', None, None
109
+ permutations = []
110
+ fileopen = open(conffile, 'r')
111
+ for line in fileopen:
112
+ line = line.split('#')[0] # Discard comments
113
+ if line == '' or line == '\n':
114
+ continue # Discard lines with comments only and empty lines
115
+ opt, args = line.split(' ', 1) # Get arguments
116
+
117
+ if opt == 'Command:':
118
+ command = args.replace('\n', '')
119
+ if moses_prof_repo is not None: # Get optional command for profiling
120
+ prof_command = moses_prof_repo + '/bin/' + command
121
+ if moses_gprof_repo is not None: # Get optional command for google-perftools
122
+ gprof_command = moses_gprof_repo + '/bin/' + command
123
+ command = moses_repo + '/bin/' + command
124
+ elif opt == 'LDPRE:':
125
+ ldopts = args.replace('\n', '')
126
+ elif opt == 'Variants:':
127
+ permutations = args.replace('\n', '').replace(' ', '').split(',')
128
+ else:
129
+ raise ValueError('Unrecognized option ' + opt)
130
+ #We use the testdir as the name.
131
+ testcase = Test(testdir, command, ldopts, permutations, prof_command, gprof_command)
132
+ fileopen.close()
133
+ return testcase
134
+
135
+ def parse_testconfig(conffile):
136
+ """Parses the config file for the whole testsuite."""
137
+ repo_path, drop_caches, tests_dir, testlog_dir = '', '', '', ''
138
+ basebranch, baserev, repo_prof_path, repo_gprof_path = '', '', None, None
139
+ fileopen = open(conffile, 'r')
140
+ for line in fileopen:
141
+ line = line.split('#')[0] # Discard comments
142
+ if line == '' or line == '\n':
143
+ continue # Discard lines with comments only and empty lines
144
+ opt, args = line.split(' ', 1) # Get arguments
145
+ if opt == 'MOSES_REPO_PATH:':
146
+ repo_path = args.replace('\n', '')
147
+ elif opt == 'DROP_CACHES_COMM:':
148
+ drop_caches = args.replace('\n', '')
149
+ elif opt == 'TEST_DIR:':
150
+ tests_dir = args.replace('\n', '')
151
+ elif opt == 'TEST_LOG_DIR:':
152
+ testlog_dir = args.replace('\n', '')
153
+ elif opt == 'BASEBRANCH:':
154
+ basebranch = args.replace('\n', '')
155
+ elif opt == 'BASEREV:':
156
+ baserev = args.replace('\n', '')
157
+ elif opt == 'MOSES_PROFILER_REPO:': # Optional
158
+ repo_prof_path = args.replace('\n', '')
159
+ elif opt == 'MOSES_GOOGLE_PROFILER_REPO:': # Optional
160
+ repo_gprof_path = args.replace('\n', '')
161
+ else:
162
+ raise ValueError('Unrecognized option ' + opt)
163
+ config = Configuration(repo_path, drop_caches, tests_dir, testlog_dir,\
164
+ basebranch, baserev, repo_prof_path, repo_gprof_path)
165
+ fileopen.close()
166
+ return config
167
+
168
+ def get_config():
169
+ """Builds the config object with all necessary attributes"""
170
+ args = parse_cmd()
171
+ config = parse_testconfig(args.configfile)
172
+ config.additional_args(args.singletestdir, args.revision, args.branch)
173
+ revision = repoinit(config)
174
+ if config.repo_prof is not None:
175
+ repoinit(config, "gnu-profiler")
176
+ if config.repo_gprof is not None:
177
+ repoinit(config, "google-profiler")
178
+ config.set_revision(revision)
179
+ return config
180
+
181
+ def check_for_basever(testlogfile, basebranch):
182
+ """Checks if the base revision is present in the testlogs"""
183
+ filetoopen = open(testlogfile, 'r')
184
+ for line in filetoopen:
185
+ templine = processLogLine(line)
186
+ if templine.branch == basebranch:
187
+ return True
188
+ return False
189
+
190
+ def split_time(filename):
191
+ """Splits the output of the time function into seperate parts.
192
+ We will write time to file, because many programs output to
193
+ stderr which makes it difficult to get only the exact results we need."""
194
+ timefile = open(filename, 'r')
195
+ realtime = float(timefile.readline().replace('\n', '').split()[1])
196
+ usertime = float(timefile.readline().replace('\n', '').split()[1])
197
+ systime = float(timefile.readline().replace('\n', '').split()[1])
198
+ timefile.close()
199
+
200
+ return (realtime, usertime, systime)
201
+
202
+
203
+ def write_log(time_file, logname, config):
204
+ """Writes to a logfile"""
205
+ log_write = open(config.testlogs + '/' + logname, 'a') # Open logfile
206
+ date_run = time.strftime("%d.%m.%Y %H:%M:%S") # Get the time of the test
207
+ realtime, usertime, systime = split_time(time_file) # Get the times in a nice form
208
+
209
+ # Append everything to a log file.
210
+ writestr = date_run + " " + config.revision + " Testname: " + logname +\
211
+ " RealTime: " + str(realtime) + " UserTime: " + str(usertime) +\
212
+ " SystemTime: " + str(systime) + " Branch: " + config.branch +'\n'
213
+ log_write.write(writestr)
214
+ log_write.close()
215
+
216
+ def write_gprof(command, name, variant, config):
217
+ """Produces a gprof report from a gmon file"""
218
+ #Check if we have a directory for the profiling of this testcase:
219
+ output_dir = config.testlogs + '/' + name
220
+ if not os.path.exists(output_dir):
221
+ os.makedirs(output_dir)
222
+ outputfile = output_dir + '/' + time.strftime("%d.%m.%Y_%H:%M:%S") + '_' + name + '_' + variant
223
+
224
+ #Compile a gprof command and output the file in the directory we just created
225
+ gmon_path = os.getcwd() + '/gmon.out' # Path to the profiling file
226
+ executable_path = command.split(' ')[0] # Path to the moses binary
227
+ gprof_command = 'gprof ' + executable_path + ' ' + gmon_path + ' > ' + outputfile
228
+ subprocess.call([gprof_command], shell=True)
229
+ os.remove(gmon_path) # After we are done discard the gmon file
230
+
231
+ def write_pprof(name, variant, config):
232
+ """Copies the google-perftools profiler output to the corresponding test directory"""
233
+ output_dir = config.testlogs + '/' + name
234
+ if not os.path.exists(output_dir):
235
+ os.makedirs(output_dir)
236
+ outputfile = output_dir + '/pprof_' + time.strftime("%d.%m.%Y_%H:%M:%S") + '_' + name + '_' + variant
237
+ shutil.move("/tmp/moses.prof", outputfile)
238
+
239
+
240
+ def execute_test(command, path, name, variant, config, profile=None):
241
+ """Executes a testcase given a whole command, path to the test file output,
242
+ name of the test and variant tested. Config is the global configuration"""
243
+ subprocess.Popen([command], stdout=None, stderr=subprocess.PIPE, shell=True).communicate()
244
+ if profile is None:
245
+ write_log(path, name + '_' + variant, config)
246
+ elif profile == "gnu-profiler": # Basically produce a gmon output
247
+ write_gprof(command, name, variant, config)
248
+ elif profile == "google-profiler":
249
+ write_pprof(name, variant, config)
250
+
251
+
252
+ def execute_tests(testcase, cur_directory, config):
253
+ """Executes timed tests based on the config file"""
254
+ #Several global commands related to the time wrapper
255
+ time_command = ' time -p -o /tmp/time_moses_tests '
256
+ time_path = '/tmp/time_moses_tests'
257
+
258
+ #Figure out the order of which tests must be executed.
259
+ #Change to the current test directory
260
+ os.chdir(config.tests + '/' + cur_directory)
261
+ #Clear caches
262
+ subprocess.call(['sync'], shell=True)
263
+ subprocess.call([config.drop_caches], shell=True)
264
+ #Perform vanilla test and if a cached test exists - as well
265
+ print(testcase.name)
266
+ if 'vanilla' in testcase.permutations:
267
+ #Create the command for executing moses
268
+ whole_command = time_command + testcase.command
269
+
270
+ #test normal and cached
271
+ execute_test(whole_command, time_path, testcase.name, 'vanilla', config)
272
+ if 'cached' in testcase.permutations:
273
+ execute_test(whole_command, time_path, testcase.name, 'vanilla_cached', config)
274
+
275
+ #Now perform LD_PRELOAD tests
276
+ if 'ldpre' in testcase.permutations:
277
+ for opt in testcase.ldopts:
278
+ #Clear caches
279
+ subprocess.call(['sync'], shell=True)
280
+ subprocess.call([config.drop_caches], shell=True)
281
+
282
+ #Create the command for executing moses:
283
+ whole_command = 'LD_PRELOAD=' + opt + time_command + testcase.command
284
+ variant = 'ldpre_' + opt
285
+
286
+ #test normal and cached
287
+ execute_test(whole_command, time_path, testcase.name, variant, config)
288
+ if 'cached' in testcase.permutations:
289
+ execute_test(whole_command, time_path, testcase.name, variant + '_cached', config)
290
+
291
+ #Perform profiling test. Mostly same as the above lines but necessary duplication.
292
+ #All actual code is inside execute_test so those lines shouldn't need modifying
293
+ if 'profile' in testcase.permutations:
294
+ subprocess.call(['sync'], shell=True) # Drop caches first
295
+ subprocess.call([config.drop_caches], shell=True)
296
+
297
+ if 'vanilla' in testcase.permutations:
298
+ whole_command = testcase.prof_command
299
+ execute_test(whole_command, time_path, testcase.name, 'profile', config, "gnu-profiler")
300
+ if 'cached' in testcase.permutations:
301
+ execute_test(whole_command, time_path, testcase.name, 'profile_cached', config, "gnu-profiler")
302
+
303
+ if 'ldpre' in testcase.permutations:
304
+ for opt in testcase.ldopts:
305
+ #Clear caches
306
+ subprocess.call(['sync'], shell=True)
307
+ subprocess.call([config.drop_caches], shell=True)
308
+
309
+ #Create the command for executing moses:
310
+ whole_command = 'LD_PRELOAD=' + opt + " " + testcase.prof_command
311
+ variant = 'profile_ldpre_' + opt
312
+
313
+ #test normal and cached
314
+ execute_test(whole_command, time_path, testcase.name, variant, config, "gnu-profiler")
315
+ if 'cached' in testcase.permutations:
316
+ execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, "gnu-profiler")
317
+
318
+ #Google-perftools profiler
319
+ if 'google-profiler' in testcase.permutations:
320
+ subprocess.call(['sync'], shell=True) # Drop caches first
321
+ subprocess.call([config.drop_caches], shell=True)
322
+
323
+ #Create the command for executing moses
324
+ whole_command = "CPUPROFILE=/tmp/moses.prof " + testcase.gprof_command
325
+
326
+ #test normal and cached
327
+ execute_test(whole_command, time_path, testcase.name, 'vanilla', config, 'google-profiler')
328
+ if 'cached' in testcase.permutations:
329
+ execute_test(whole_command, time_path, testcase.name, 'vanilla_cached', config, 'google-profiler')
330
+
331
+ #Now perform LD_PRELOAD tests
332
+ if 'ldpre' in testcase.permutations:
333
+ for opt in testcase.ldopts:
334
+ #Clear caches
335
+ subprocess.call(['sync'], shell=True)
336
+ subprocess.call([config.drop_caches], shell=True)
337
+
338
+ #Create the command for executing moses:
339
+ whole_command = 'LD_PRELOAD=' + opt + " " + whole_command
340
+ variant = 'ldpre_' + opt
341
+
342
+ #test normal and cached
343
+ execute_test(whole_command, time_path, testcase.name, variant, config, 'google-profiler')
344
+ if 'cached' in testcase.permutations:
345
+ execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, 'google-profiler')
346
+
347
+
348
+ # Go through all the test directories and executes tests
349
+ if __name__ == '__main__':
350
+ CONFIG = get_config()
351
+ ALL_DIR = os.listdir(CONFIG.tests)
352
+
353
+ #We should first check if any of the tests is run for the first time.
354
+ #If some of them are run for the first time we should first get their
355
+ #time with the base version (usually the previous release)
356
+ FIRSTTIME = []
357
+ TESTLOGS = []
358
+ #Strip filenames of test underscores
359
+ for listline in os.listdir(CONFIG.testlogs):
360
+ listline = listline.replace('_vanilla', '')
361
+ listline = listline.replace('_cached', '')
362
+ listline = listline.replace('_ldpre', '')
363
+ TESTLOGS.append(listline)
364
+ for directory in ALL_DIR:
365
+ if directory not in TESTLOGS:
366
+ FIRSTTIME.append(directory)
367
+
368
+ #Sometimes even though we have the log files, we will need to rerun them
369
+ #Against a base version, because we require a different baseversion (for
370
+ #example when a new version of Moses is released.) Therefore we should
371
+ #Check if the version of Moses that we have as a base version is in all
372
+ #of the log files.
373
+
374
+ for logfile in os.listdir(CONFIG.testlogs):
375
+ logfile_name = CONFIG.testlogs + '/' + logfile
376
+ if os.path.isfile(logfile_name) and not check_for_basever(logfile_name, CONFIG.basebranch):
377
+ logfile = logfile.replace('_vanilla', '')
378
+ logfile = logfile.replace('_cached', '')
379
+ logfile = logfile.replace('_ldpre', '')
380
+ FIRSTTIME.append(logfile)
381
+ FIRSTTIME = list(set(FIRSTTIME)) #Deduplicate
382
+
383
+ if FIRSTTIME != []:
384
+ #Create a new configuration for base version tests:
385
+ BASECONFIG = Configuration(CONFIG.repo, CONFIG.drop_caches,\
386
+ CONFIG.tests, CONFIG.testlogs, CONFIG.basebranch,\
387
+ CONFIG.baserev, CONFIG.repo_prof, CONFIG.repo_gprof)
388
+ BASECONFIG.additional_args(None, CONFIG.baserev, CONFIG.basebranch)
389
+ #Set up the repository and get its revision:
390
+ REVISION = repoinit(BASECONFIG)
391
+ BASECONFIG.set_revision(REVISION)
392
+ #Build
393
+ os.chdir(BASECONFIG.repo)
394
+ subprocess.call(['./previous.sh'], shell=True)
395
+ #If profiler configuration exists also init it
396
+ if BASECONFIG.repo_prof is not None:
397
+ repoinit(BASECONFIG, "gnu-profiler")
398
+ os.chdir(BASECONFIG.repo_prof)
399
+ subprocess.call(['./previous.sh'], shell=True)
400
+
401
+ if BASECONFIG.repo_gprof is not None:
402
+ repoinit(BASECONFIG, "google-profiler")
403
+ os.chdir(BASECONFIG.repo_gprof)
404
+ subprocess.call(['./previous.sh'], shell=True)
405
+
406
+ #Perform tests
407
+ for directory in FIRSTTIME:
408
+ cur_testcase = parse_configfile(BASECONFIG.tests + '/' + directory +\
409
+ '/config', directory, BASECONFIG.repo, BASECONFIG.repo_prof, BASECONFIG.repo_gprof)
410
+ execute_tests(cur_testcase, directory, BASECONFIG)
411
+
412
+ #Reset back the repository to the normal configuration
413
+ repoinit(CONFIG)
414
+ if BASECONFIG.repo_prof is not None:
415
+ repoinit(CONFIG, "gnu-profiler")
416
+
417
+ if BASECONFIG.repo_gprof is not None:
418
+ repoinit(CONFIG, "google-profiler")
419
+
420
+ #Builds moses
421
+ os.chdir(CONFIG.repo)
422
+ subprocess.call(['./previous.sh'], shell=True)
423
+ if CONFIG.repo_prof is not None:
424
+ os.chdir(CONFIG.repo_prof)
425
+ subprocess.call(['./previous.sh'], shell=True)
426
+
427
+ if CONFIG.repo_gprof is not None:
428
+ os.chdir(CONFIG.repo_gprof)
429
+ subprocess.call(['./previous.sh'], shell=True)
430
+
431
+ if CONFIG.singletest:
432
+ TESTCASE = parse_configfile(CONFIG.tests + '/' +\
433
+ CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof)
434
+ execute_tests(TESTCASE, CONFIG.singletest, CONFIG)
435
+ else:
436
+ for directory in ALL_DIR:
437
+ cur_testcase = parse_configfile(CONFIG.tests + '/' + directory +\
438
+ '/config', directory, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof)
439
+ execute_tests(cur_testcase, directory, CONFIG)
mosesdecoder/contrib/moses-speedtest/sys_drop_caches.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/spython
2
+ from sys import argv, stderr, exit
3
+ from os import linesep as ls
4
+ procfile = "/proc/sys/vm/drop_caches"
5
+ options = ["1","2","3"]
6
+ flush_type = None
7
+ try:
8
+ flush_type = argv[1][0:1]
9
+ if not flush_type in options:
10
+ raise IndexError, "not in options"
11
+ with open(procfile, "w") as f:
12
+ f.write("%s%s" % (flush_type,ls))
13
+ exit(0)
14
+ except IndexError, e:
15
+ stderr.write("Argument %s required.%s" % (options, ls))
16
+ except IOError, e:
17
+ stderr.write("Error writing to file.%s" % ls)
18
+ except StandardError, e:
19
+ stderr.write("Unknown Error.%s" % ls)
20
+
21
+ exit(1)
22
+
mosesdecoder/contrib/moses-speedtest/test_config ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
2
+ LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/,
3
+ Variants: vanilla, cached, ldpre #Can't have cached without ldpre or vanilla
mosesdecoder/contrib/moses-speedtest/testsuite_config ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ MOSES_REPO_PATH: /home/moses-speedtest/moses-standard/mosesdecoder
2
+ DROP_CACHES_COMM: sys_drop_caches 3
3
+ TEST_DIR: /home/moses-speedtest/phrase_tables/tests
4
+ TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
5
+ BASEBRANCH: RELEASE-2.1.1
mosesdecoder/contrib/picaro/README ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README - 16 Jan 2011b
2
+ Author: Jason Riesa <jason.riesa@gmail.com>
3
+
4
+ Picaro [v1.0]: A simple command-line alignment visualization tool.
5
+ Visualize alignments in grid-format.
6
+
7
+ This brief README is organized as follows:
8
+ I. REQUIREMENTS
9
+ II. USAGE
10
+ III. INPUT FORMAT
11
+ IV. EXAMPLE USAGE
12
+ V. NOTES
13
+
14
+ I. REQUIREMENTS
15
+ ===============
16
+ Python v2.5 or higher is required.
17
+
18
+ II. USAGE
19
+ =========
20
+ Picaro takes as input 3 mandatory arguments and up to 2 optional arguments:
21
+ Mandatory arguments:
22
+ 1. -a1 <alignment1> where alignment1 is a path to an alignment file
23
+ 2. -e <e> where e is a path to a file of English sentences
24
+ 3. -f <f> where f is a path to a file of French sentences
25
+ Optional arguments:
26
+ 1. -a2 <a2> path to alignment2 file in f-e format
27
+ 2. -maxlen <len> for each sentence pair, render only when each
28
+ sentence has length in words <= len
29
+
30
+ For historical reasons we use the labels e, f, English, and French,
31
+ but any language pair will do.
32
+
33
+ III. INPUT FORMAT
34
+ =================
35
+ - Files e and f must be sentence-aligned
36
+ - Alignment files must be in f-e format
37
+ See included sample files in zh/ and es/.
38
+
39
+ IV. EXAMPLE USAGE
40
+ =================
41
+ WITH A SINGLE ALIGNMENT:
42
+ $ picaro.py -e zh/sample.e -f zh/sample.f -a1 zh/sample.aln
43
+
44
+ COMPARING TWO ALIGNMENTS:
45
+ $ picaro.py -e zh/sample.e -f zh/sample.f -a1 zh/alternate.aln -a2 zh/sample.aln
46
+
47
+ When visualizing two alignments at once, refer to the following color scheme:
48
+ Green blocks: alignments a1 and a2 agree
49
+ Blue blocks: alignment a1 only
50
+ Gold blocks: alignment a2 only
51
+
52
+ V. NOTES
53
+ ========
54
+ RIGHT-TO-LEFT TEXT:
55
+ If you are using right-to-left text, e.g. Arabic, transliterate your text first.
56
+ Terminals generally render unexpectedly with mixed left-to-right and right-to-left text.
57
+ For Arabic, in particular, we use the Buckwalter transliteration scheme [1] when using this tool.
58
+ The following Perl module implements Buckwalter transliteration:
59
+ http://search.cpan.org/~smrz/Encode-Arabic-1.8/lib/Encode/Arabic.pm
60
+
61
+ [1] http://www.ldc.upenn.edu/myl/morph/buckwalter.html
62
+
mosesdecoder/contrib/picaro/es/README ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Spanish-English sample
2
+ sample.f Spanish text
3
+ sample.e English text
4
+ sample.aln Alignment file with links in f-e format
mosesdecoder/contrib/picaro/es/sample.aln ADDED
@@ -0,0 +1 @@
 
 
1
+ 0-0 0-1 1-2 1-3 2-4 3-5 4-6 5-7
mosesdecoder/contrib/picaro/es/sample.e ADDED
@@ -0,0 +1 @@
 
 
1
+ i want to go to spain tomorrow .
mosesdecoder/contrib/picaro/es/sample.f ADDED
@@ -0,0 +1 @@
 
 
1
+ quiero ir a españa mañana .
mosesdecoder/contrib/picaro/picaro.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ #
3
+ # Picaro: An simple command-line alignment visualization tool.
4
+ #
5
+ # picaro.py
6
+ # Visualize alignments between sentences in a grid format.
7
+ #
8
+ # Jason Riesa <riesa@isi.edu>
9
+ # version: 01-16-2010
10
+ #
11
+ # Copyright (C) 2013 Jason Riesa
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26
+
27
+ import sys, os, commands
28
+ from collections import defaultdict
29
+
30
+ #TC_BIN = "tc/tc.linux32"
31
+
32
+ a1_file_str = ""
33
+ a2_file_str = ""
34
+ f_file_str = ""
35
+ e_file_str = ""
36
+ SHOW_TC_A1 = 0
37
+ SHOW_TC_A2 = 0
38
+ maxlen = float('inf')
39
+
40
+ # Process command line options
41
+ try:
42
+ while len(sys.argv) > 1:
43
+ option = sys.argv[1]; del sys.argv[1]
44
+ if option == '-a1':
45
+ a1_file_str = sys.argv[1]; del sys.argv[1]
46
+ elif option == '-a2':
47
+ a2_file_str = sys.argv[1]; del sys.argv[1]
48
+ elif option == '-f':
49
+ f_file_str = sys.argv[1]; del sys.argv[1]
50
+ elif option == '-e':
51
+ e_file_str = sys.argv[1]; del sys.argv[1]
52
+ elif option == '-maxlen':
53
+ maxlen = int(sys.argv[1]); del sys.argv[1]
54
+ else:
55
+ sys.stderr.write("Invalid option: %s\n" % (option))
56
+ sys.exit(1)
57
+ '''
58
+ elif option == '-tc':
59
+ if sys.argv[1] == '1':
60
+ SHOW_TC_A1 = 1; del sys.argv[1]
61
+ elif sys.argv[1] == '2':
62
+ SHOW_TC_A2 = 2; del sys.argv[1]
63
+ else:
64
+ raise Exception, "Invalid argument to option -tc"
65
+ '''
66
+
67
+ if a1_file_str == "" or f_file_str == "" or e_file_str == "":
68
+ raise Exception, "Not all options properly specified."
69
+ # Make sure transitive closure binary exists if user has enabled this option
70
+ if SHOW_TC_A1 or SHOW_TC_A2:
71
+ if not os.path.exists(TC_BIN):
72
+ raise Exception, "Transitive closure binary "+TC_BIN+" not found."
73
+ except Exception, msg:
74
+ sys.stderr.write("%s: %s\n" % (sys.argv[0], msg))
75
+ sys.stderr.write("Usage: %s: -a1 <alignment1> -f <f> -e <e> [-a2 <alignment2>]\n" % (sys.argv[0]))
76
+ sys.stderr.write("Mandatory arguments:\n")
77
+ sys.stderr.write(" -a1 <a1>\t path to alignment 1 file in f-e format\n")
78
+ sys.stderr.write(" -f <f>\t\t path to source text f\n")
79
+ sys.stderr.write(" -e <e>\t\t path to target text e\n")
80
+ sys.stderr.write("Optional arguments:\n")
81
+ sys.stderr.write(" -a2 <a2>\t path to alignment 2 file in f-e format\n")
82
+ sys.stderr.write(" -maxlen <len>\t display alignment only when e and f have length <= len\n")
83
+ sys.exit(1)
84
+
85
+
86
+ a_file = open(a1_file_str, 'r')
87
+ f_file = open(f_file_str, 'r')
88
+ e_file = open(e_file_str, 'r')
89
+ if a2_file_str != "":
90
+ a2_file = open(a2_file_str, 'r')
91
+
92
+ sentenceNumber = 0
93
+ nextRequested = 1
94
+ for aline in a_file:
95
+ eline = e_file.readline()
96
+ fline = f_file.readline()
97
+ if a2_file_str != "":
98
+ a2line = a2_file.readline()
99
+
100
+ links = aline.split()
101
+ e_words = eline.split()
102
+ f_words = fline.split()
103
+ if a2_file_str != "":
104
+ links2 = a2line.split()
105
+
106
+ # Get transitive closure of links and links2
107
+ if SHOW_TC_A1:
108
+ cmd = 'echo "' + ' '.join(links) + '" | ' + TC_BIN
109
+ failure1, output1 = commands.getstatusoutput(cmd)
110
+ tc1 = output1.split()
111
+ if SHOW_TC_A2:
112
+ cmd = 'echo "' + ' '.join(links2) + '" | ' + TC_BIN
113
+ failure2, output2 = commands.getstatusoutput(cmd)
114
+ tc2 = output2.split()
115
+
116
+ # Update tracking counts
117
+ sentenceNumber += 1
118
+ if sentenceNumber < nextRequested:
119
+ continue
120
+
121
+ # Don't generate alignment grids for very large sentences
122
+ if len(e_words) > maxlen or len(f_words) > maxlen:
123
+ continue
124
+
125
+
126
+ print "== SENTENCE ",sentenceNumber," =="
127
+
128
+ # Initialize alignment objects
129
+ # a holds alignments of user-specified -a1 <file>
130
+ # a2 holds alignments of user-specified -a2 <file>
131
+ a = defaultdict(lambda: defaultdict(int))
132
+ a2 = defaultdict(lambda: defaultdict(int))
133
+
134
+ # Print e_words on the columns
135
+ # First, find the length of the longest word
136
+ longestEWordSize = 0
137
+ longestEWord = 0
138
+ for w in e_words:
139
+ if len(w) > longestEWordSize:
140
+ longestEWordSize = len(w)
141
+ longestEWord = w
142
+
143
+ # Now, print the e-words
144
+ for i in range(longestEWordSize, 0, -1):
145
+ for w in e_words:
146
+ if len(w) < i:
147
+ print " ",
148
+ else:
149
+ print w[(i*-1)],
150
+ print
151
+
152
+
153
+ # Fill in alignment matrix 1
154
+ for link in links:
155
+ i, j = map(int, link.split('-'))
156
+ a[int(i)][int(j)] = 1
157
+ # Fill in extra links added by transitive closure
158
+ if SHOW_TC_A1:
159
+ for link in tc1:
160
+ i, j = map(int, link.split('-'))
161
+ if(a[i][j] != 1):
162
+ a[i][j] = 2
163
+
164
+ # Fill in alignment matrix 2
165
+ if(a2_file_str != ""):
166
+ for link in links2:
167
+ i, j = map(int, link.split('-'))
168
+ a2[i][j] = 1
169
+ # Fill in extra links added by transitive closure
170
+ if SHOW_TC_A2:
171
+ for link in tc2:
172
+ i, j = map(int, link.split('-'))
173
+ if(a2[i][j] != 1):
174
+ a2[i][j] = 2
175
+
176
+ # Print filled-in alignment matrix
177
+ if a2_file_str == "":
178
+ for i, _ in enumerate(f_words):
179
+ for j, _ in enumerate(e_words):
180
+ val1 = a[i][j]
181
+ if val1 == 0:
182
+ # No link
183
+ print ':',
184
+ elif val1 == 1:
185
+ # Regular link
186
+ print u'\u001b[44m\u0020\u001b[0m',
187
+ elif val1 == 2:
188
+ # Link due to transitive closure
189
+ # Render as gray-shaded square
190
+ print 'O',
191
+ print f_words[i]
192
+ print
193
+ else:
194
+ for i, _ in enumerate(f_words):
195
+ for j, _ in enumerate(e_words):
196
+ val1 = a[i][j]
197
+ val2 = a2[i][j]
198
+
199
+ if val1 == 0 and val2 == 0:
200
+ # Link not in a nor a2
201
+ # Empty grid box
202
+ print ':',
203
+ # Link in both a and a2
204
+ elif val1 > 0 and val2 > 0:
205
+ # Green box
206
+ if val1 == 1:
207
+ if val2 == 1:
208
+ print u'\u001b[42m\u001b[1m\u0020\u001b[0m',
209
+ elif val2 == 2:
210
+ print u'\u001b[42m\u001b[30m2\u001b[0m',
211
+ elif val1 == 2:
212
+ if val2 == 1:
213
+ print u'\u001b[42m\u0020\u001b[0m',
214
+ elif val2 == 2:
215
+ print u'\u001b[42m\u001b[30m3\u001b[0m',
216
+ # Link in a2, but not a
217
+ elif val1 == 0 and val2 > 0:
218
+ if val2 == 1:
219
+ # Yellow box
220
+ print u'\u001b[1m\u001b[43m\u0020\u001b[0m',
221
+ elif val2 == 2:
222
+ # Artificial link by transitive closure
223
+ print u'\u001b[43m\u001b[30m2\u001b[0m',
224
+
225
+ # Link in a, but not a2
226
+ elif val1 > 0 and val2 == 0:
227
+ if val1 == 1:
228
+ # Blue box
229
+ print u'\u001b[1m\u001b[44m\u0020\u001b[0m',
230
+ elif val1 == 2:
231
+ print u'\u001b[44m\u001b[37m1\u001b[0m',
232
+ print f_words[i]
233
+ nextDefault = sentenceNumber + 1
234
+ sys.stdout.write("Enter next alignment number or 'q' to quit [%d]: " %(nextDefault))
235
+ user_input = sys.stdin.readline().strip()
236
+ if user_input == "":
237
+ nextRequested = nextDefault
238
+ elif user_input[0] == "q" or user_input == "quit":
239
+ sys.exit(1)
240
+ else:
241
+ try:
242
+ nextRequested = int(user_input)
243
+ except:
244
+ nextRequested = sentenceNumber + 1
245
+ sys.stdout.write("Unknown alignment id: %s\nContinuing with %d.\n" %(user_input, nextRequested))
246
+
247
+ a_file.close()
248
+ e_file.close()
249
+ f_file.close()
250
+
mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1.binphr.idx ADDED
Binary file (68 Bytes). View file
 
mosesdecoder/contrib/promix/test_data/esen.nc.model.filtered/phrase-table.0-0.1.1.binphr.idx ADDED
Binary file (76 Bytes). View file
 
mosesdecoder/contrib/promix/test_data/esen.nc.model.filtered/phrase-table.0-0.1.1.binphr.srctree.wa ADDED
Binary file (728 Bytes). View file