sleepyhead111 commited on
Commit
de68f2b
·
verified ·
1 Parent(s): 61f1661

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. mosesdecoder/contrib/m4m/examples/giza-vs-fast.m4m +99 -0
  2. mosesdecoder/contrib/m4m/modules/obsolete/Makefile +64 -0
  3. mosesdecoder/contrib/m4m/modules/obsolete/baseline-system.m4m +48 -0
  4. mosesdecoder/contrib/m4m/modules/obsolete/directory-structure.m4m +7 -0
  5. mosesdecoder/contrib/m4m/modules/obsolete/model-filtering.m4m +37 -0
  6. mosesdecoder/contrib/m4m/modules/obsolete/phrase-table.make.scratch +124 -0
  7. mosesdecoder/contrib/m4m/modules/obsolete/reporting.m4m +95 -0
  8. mosesdecoder/contrib/m4m/modules/obsolete/run-moses.m4m +37 -0
  9. mosesdecoder/contrib/m4m/modules/obsolete/setup-experiments.m4m +121 -0
  10. mosesdecoder/contrib/m4m/modules/obsolete/skip-steps.mak +19 -0
  11. mosesdecoder/contrib/m4m/modules/obsolete/system.m4m +38 -0
  12. mosesdecoder/contrib/m4m/modules/obsolete/template.m4m +66 -0
  13. mosesdecoder/contrib/m4m/modules/obsolete/tune.m4m +45 -0
  14. mosesdecoder/contrib/m4m/scripts/fast-align2bal.py +31 -0
  15. mosesdecoder/contrib/m4m/scripts/giza.txt2snt.sh +41 -0
  16. mosesdecoder/contrib/m4m/scripts/moses.extract-phrases.sh +63 -0
  17. mosesdecoder/contrib/m4m/scripts/moses.make-lex.py +86 -0
  18. mosesdecoder/contrib/m4m/scripts/moses.phrase-extract.sh +110 -0
  19. mosesdecoder/contrib/m4m/scripts/moses.score-phrases.sh +41 -0
  20. mosesdecoder/contrib/m4m/scripts/moses.transfer-weights.py +61 -0
  21. mosesdecoder/contrib/m4m/util/Jamfile +12 -0
  22. mosesdecoder/contrib/memscore/Makefile.in +581 -0
  23. mosesdecoder/contrib/memscore/configure.ac +84 -0
  24. mosesdecoder/contrib/memscore/lexdecom.h +41 -0
  25. mosesdecoder/contrib/memscore/memscore.cpp +85 -0
  26. mosesdecoder/contrib/memscore/memscore.h +57 -0
  27. mosesdecoder/contrib/memscore/missing +360 -0
  28. mosesdecoder/contrib/memscore/phraselm.h +45 -0
  29. mosesdecoder/contrib/memscore/phrasetable.cpp +348 -0
  30. mosesdecoder/contrib/memscore/scorer.h +71 -0
  31. mosesdecoder/contrib/memscore/timestamp.h +29 -0
  32. mosesdecoder/contrib/mira/Main.cpp +1849 -0
  33. mosesdecoder/contrib/mira/Perceptron.cpp +53 -0
  34. mosesdecoder/contrib/mira/mira.xcodeproj/project.pbxproj +401 -0
  35. mosesdecoder/contrib/moses-speedtest/README.md +146 -0
  36. mosesdecoder/contrib/moses-speedtest/check_for_regression.py +63 -0
  37. mosesdecoder/contrib/moses-speedtest/cronjob +7 -0
  38. mosesdecoder/contrib/moses-speedtest/runtests.py +439 -0
  39. mosesdecoder/contrib/moses-speedtest/sys_drop_caches.py +22 -0
  40. mosesdecoder/contrib/moses-speedtest/test_config +3 -0
  41. mosesdecoder/contrib/moses-speedtest/testsuite_config +5 -0
  42. mosesdecoder/contrib/picaro/README +62 -0
  43. mosesdecoder/contrib/picaro/es/README +4 -0
  44. mosesdecoder/contrib/picaro/es/sample.aln +1 -0
  45. mosesdecoder/contrib/picaro/es/sample.e +1 -0
  46. mosesdecoder/contrib/picaro/es/sample.f +1 -0
  47. mosesdecoder/contrib/picaro/picaro.py +250 -0
  48. mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1.binphr.idx +0 -0
  49. mosesdecoder/contrib/promix/test_data/esen.nc.model.filtered/phrase-table.0-0.1.1.binphr.idx +0 -0
  50. mosesdecoder/contrib/promix/test_data/esen.nc.model.filtered/phrase-table.0-0.1.1.binphr.srctree.wa +0 -0
mosesdecoder/contrib/m4m/examples/giza-vs-fast.m4m ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # some variables need to be set before m4m modules are included
4
+ .SECONDARY:
5
+
6
+ MOSES_ROOT = ${HOME}/code/moses/master/mosesdecoder
7
+ MGIZA_ROOT = ${HOME}/tools/mgiza
8
+ fast_align = ${HOME}/bin/fast_align
9
+
10
+ # L1: source language; L2: target language
11
+ L1 = de
12
+ L2 = en
13
+ WDIR = $(CURDIR)
14
+
15
+ include ${MOSES_ROOT}/contrib/m4m/modules/m4m.m4m
16
+
17
+ # both systems use the same language model
18
+ L2raw := $(wildcard ${WDIR}/crp/trn/*/raw/*.${L2}.gz)
19
+ L2data := $(subst /raw/,/cased/,${L2raw})
20
+ lm.order = 5
21
+ lm.factor = 0
22
+ lm.lazy = 1
23
+ lm.file = ${WDIR}/lm/${L2}.5-grams.kenlm
24
+ ${lm.file}: | $(L2data)
25
+ $(eval $(call add_kenlm,${lm.file},${lm.order},${lm.factor},${lm.lazy}))
26
+ .INTERMEDIATE: ${L2data}
27
+
28
+ dmodels = wbe-mslr-bidirectional-fe-allff
29
+ mysystem = systems/${word-alignment}-aligned
30
+ myptable = model/tm/${aligner}.${L1}-${L2}
31
+ mydtable = model/dm/${aligner}.${L1}-${L2}
32
+
33
+ wa ?= $(error wa not specified on command line)
34
+ SYSTEMS :=
35
+ aligner :=
36
+ $(foreach a,${wa},\
37
+ $(eval aligner:=${a});\
38
+ $(eval $(clear-ptables));\
39
+ $(eval $(clear-dtables));\
40
+ $(eval SYSTEMS+=systems/${a}-aligned);\
41
+ $(eval $(call add_binary_phrase_table,0,0,4,$${myptable}));\
42
+ $(eval $(call add_binary_reordering_table,0,0,8,\
43
+ ${dmodels},$${mydtable},$${myptable}));\
44
+ $(eval $(call create_moses_ini,$${mysystem})))
45
+
46
+ aln: $(foreach a,${wa},${WDIR}/crp/trn/aln/$a/${L1}-${L2}.symal.gz)
47
+ info:
48
+ dtable: ${DTABLES}
49
+ ptable: ${PTABLES}
50
+ system: $(addsuffix /moses.ini.0,${SYSTEMS})
51
+ eval: ${EVALUATIONS}
52
+
53
+
54
+ ifdef tune.runs
55
+
56
+ TUNED_SYSTEMS :=
57
+ EVALUATIONS :=
58
+ $(eval $(tune_all_systems))
59
+ $(eval $(bleu_score_all_systems))
60
+ tune: ${TUNED_SYSTEMS}
61
+ echo TUNED ${TUNED_SYSTEMS}
62
+ all: ${EVALUATIONS}
63
+
64
+ else
65
+
66
+ tune: all
67
+
68
+ # The recursive calls below make sure that tuning runs happen sequentially
69
+ # (moses runs multi-threaded anyway). The reason is that we may want to have
70
+ # first results as soon as possible.
71
+ tune.runs := 1 1
72
+ $(info TUNE RUNS ${tune.runs})
73
+ all:
74
+ $(foreach n,$(shell seq ${tune.runs}),\
75
+ ${MAKE} -f $(word 1, ${MAKEFILE_LIST}) \
76
+ tune.runs="$n $n" ${MAKECMDGOALS} -${MAKEFLAGS})
77
+
78
+ endif
79
+
80
+ .PHONY: $(addprefix reset-,lm tm dm all aln tune eval systems)
81
+ reset-aln: reset-mm
82
+ -rm -rf $(foreach a,${wa},crp/trn/aln/${a})
83
+ reset-mm: reset-dm reset-tm
84
+ -rm -rf $(foreach a,${wa},crp/trn/mm/${a})
85
+ reset-dm: reset-systems
86
+ -rm -rf $(foreach a,${wa},model/dm/${a}.*)
87
+ reset-tm: reset-systems
88
+ -rm -rf $(foreach a,${wa},model/tm/${a}.*)
89
+ reset-systems:
90
+ -rm -rf ${SYSTEMS}
91
+ reset-tune:
92
+ -rm -rf $(foreach s,${SYSTEMS}/$s/tune)
93
+ reset-eval:
94
+ -rm -rf $(foreach s,${SYSTEMS},$s/eval)
95
+ reset-lm:
96
+ -rm -rf lm
97
+ reset-all: reset-lm reset-aln
98
+ -rm -rf $(wildcard crp/trn/*/[ct]* crp/dev/[ct]* crp/tst/[ct]*)
99
+ -rm -rf auxiliary
mosesdecoder/contrib/m4m/modules/obsolete/Makefile ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+ # Mandatory at the beginning of the file, before m4m inclusions
3
+
4
+ # L1,L2: tags that identify translation source (L1)
5
+ # and translation target (L2) language
6
+ L1 ?= de
7
+ L2 ?= en
8
+
9
+ LL = $(word 1, $(sort ${L1} ${L2}))-$(word 2, $(sort ${L1} ${L2}))
10
+ # a name for this experiment
11
+ experiment = dynsa-vs-std-phrase-table
12
+
13
+ # the working directry
14
+ WDIR = $(CURDIR)
15
+ MOSES_ROOT = ${HOME}/code/moses/master/mosesdecoder
16
+
17
+ # include m4m boilerplate
18
+ include ${MOSES_ROOT}/contrib/m4m/modules/m4m.m4m
19
+
20
+ $(info M4MDIR=${m4mdir})
21
+
22
+ #include ${m4mdir}/baseline-system.make
23
+ #include ${m4mdir}dynsa-system.make
24
+ #$(info ${MY_EXPERIMENT})
25
+
26
+ tune.sets = $(subst /raw/,/cased/,$(wildcard crp/dev/raw/*.${L1}.gz))
27
+
28
+ all:
29
+ .PHONY: all
30
+
31
+ ifdef tune.runs
32
+ $(foreach tuneset, $(word 1,${tune.sets:.${L1}.gz=}),\
33
+ $(foreach run,$(shell seq ${tune.runs}),\
34
+ $(eval $(call tune_system,baseline/moses.ini.0,\
35
+ baseline/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
36
+ ${tuneset}.${L1},${tuneset}.${L2},0));\
37
+ $(if ,$(info $(call tune_system,baseline/moses.ini.0,\
38
+ baseline/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
39
+ ${tuneset}.${L1},${tuneset}.${L2},0));)\
40
+ $(eval $(call copy_weights,dynsa/moses.ini.0,\
41
+ baseline/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
42
+ dynsa/tuned/$(notdir ${tuneset})/${run}/moses.ini));\
43
+ $(if ,$(info $(call copy_weights,dynsa/moses.ini.0,\
44
+ baseline/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
45
+ dynsa/tuned/$(notdir ${tuneset})/${run}/moses.ini));)\
46
+ $(foreach evalset,$(word 2,${tune.sets:.${L1}.gz=}),\
47
+ $(foreach system,baseline dynsa,\
48
+ $(eval evaltarget:=${system}/eval/$(notdir ${tuneset})/${run}/$(notdir ${evalset}));\
49
+ $(eval $(call bleu_eval,${evaltarget},\
50
+ ${system}/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
51
+ ${evalset}.${L1},${moses.inputtype.plaintext},${evalset}.${L2}));\
52
+ $(if ,$(info $(call bleu_eval,${evaltarget},\
53
+ ${system}/tuned/$(notdir ${tuneset})/${run}/moses.ini,\
54
+ ${evalset}.${L1},${moses.inputtype.plaintext},${evalset}.${L2}));)\
55
+ ));\
56
+ ))
57
+
58
+ all: ${EVALUATIONS}
59
+ echo EVALS ${EVALUATIONS}
60
+ else
61
+ all:
62
+ $(foreach n,$(shell seq 1 1),${MAKE} tune.runs="$n $n";)
63
+ endif
64
+
mosesdecoder/contrib/m4m/modules/obsolete/baseline-system.m4m ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # This module defines a simple phrase-based baseline system
4
+ # - a single corpus
5
+ # - no factors
6
+ # - single ttable
7
+ # - single distortion model
8
+
9
+ # chose a name for the system
10
+ # ${system}/moses.ini.0 then defines the system
11
+ system = baseline
12
+ SYSTEMS += ${system}
13
+ .PHONY: ${system}
14
+ ${system}: ${system}/moses.ini.0
15
+
16
+ #################################################################################
17
+ #
18
+ # Create phrase table(s) and distortion model(s) that you want to use in this
19
+ # system. If you already have binary or text version of all tables, you don't
20
+ # need to specify pll.{txt1,txt2,aln}.
21
+ pll.txt1 = ${WDIR}/crp/trn/aln/fast/${L1}.txt.gz
22
+ pll.txt2 = ${WDIR}/crp/trn/aln/fast/${L2}.txt.gz
23
+ pll.aln = ${WDIR}/crp/trn/aln/fast/${L1}-${L2}.symal.gz
24
+ ptable = ${WDIR}/model/tm/ptable.${L1}-${L2}
25
+ dtable = ${WDIR}/model/dm/dtable.${L1}-${L2}
26
+ ptable.max-phrase-length = 7
27
+ # ptable.smoothing = --GoodTuring
28
+ # dmodels = wbe-mslr-bidirectional-fe-allff
29
+
30
+ LMODEL_ENTRIES = KENLM;name=KENLM0;order=5;factor=0;num-features=1;lazyken=0;path=$(abspath lm/europarl-v7.en.kenlm)
31
+ LMODELS = lm/europarl-v7.en.kenlm
32
+
33
+ MY_EXPERIMENT += $(call add_binary_phrase_table,0,0,5,${ptable})
34
+ $(eval $(call add_binary_phrase_table,0,0,5,${ptable}))
35
+
36
+ if 0
37
+ MY_EXPERIMENT += $(call add_binary_reordering_table,0,0,8,\
38
+ wbe-mslr-bidirectional-fe-allff,${dtable},${ptable})
39
+ $(eval $(call add_binary_reordering_table,0,0,8,\
40
+ wbe-mslr-bidirectional-fe-allff,${dtable},${ptable}))
41
+ endif
42
+
43
+ MY_EXPERIMENT += $(call create_moses_ini,${system})
44
+ $(eval $(call create_moses_ini,${system}))
45
+
46
+ #################################################################################
47
+
48
+
mosesdecoder/contrib/m4m/modules/obsolete/directory-structure.m4m ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # STANDARD LOCATIONS
4
+ basedir ?= $(CURDIR)
5
+ tune.dir ?= ${basedir}/tune
6
+ eval.dir ?= ${basedir}/eval
7
+ input.dir ?= ${basedir}/input
mosesdecoder/contrib/m4m/modules/obsolete/model-filtering.m4m ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+ #
3
+ # This module deals with model filtering (if necessary).
4
+ # It produces the moses.ini files for filtered models for
5
+ # tuning and evaluation.
6
+
7
+ ifndef ${moses_ini_for_tuning}
8
+ moses_ini_for_tuning = # WHAT'S THE DEFAULT LOCATION FOR THIS IN EMS?
9
+ endif
10
+
11
+ ifndef ${moses_ini_for_eval}
12
+ moses_ini_for_eval = # WHAT'S THE DEFAULT LOCATION FOR THIS IN EMS?
13
+ endif
14
+
15
+ # filter models if suggested by set-up
16
+ ifneq (${moses_ini_for_tuning}, ${untuned_moses_ini})
17
+ ${moses_ini_for_tuning}: | ${untuned_moses_ini}
18
+ ${moses_ini_for_tuning}: | ${tuning_input_ready}
19
+
20
+ # phrase table in text format?
21
+ ifeq ($(shell grep -v '^ *\#' ${untuned_moses_ini} \
22
+ | grep -A1 '\[ttable-file\]' | tail -n +2 \
23
+ | head -n1 | awk '{print $$1}'),0)
24
+ # ADD PHRASE TABLE FILTERING COMMAND HERE
25
+ endif
26
+
27
+ # how does moses know if a lexicalized distortion table is binary or not?
28
+ # ADD LEXICAL DISTORTION TABLE FILTERING COMMAND HERE
29
+
30
+ ifneq (${moses_ini_for_eval),$(tuned_moses_ini))
31
+ # add code for model filtering for eval here
32
+ endif
33
+
34
+
35
+
36
+
37
+
mosesdecoder/contrib/m4m/modules/obsolete/phrase-table.make.scratch ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # .PHONY: $1
2
+ # $1: $1.binphr.idx
3
+ # $1.txt.gz: | L1text = $4
4
+ # $1.txt.gz: | L2text = $5
5
+ # $1.txt.gz: | symal = $6
6
+ # ${moses.ini}: $1
7
+ # PTABLES += 1;$2;$3;5;$1
8
+ # endef
9
+
10
+
11
+ # ${target}.tmp/fwd/scored.gz: | ${target}/phrase-extraction.DONE
12
+ # | ${L1File} ${L2File} ${symal}
13
+
14
+
15
+ # # convert phrase table from text file to binary format
16
+ # %.binphr.idx: | %.txt.gz ${MOSES_BIN}/processPhraseTable
17
+ # $(lock)
18
+ # zcat -f $*.txt.gz | ${MOSES_BIN}/processPhraseTable \
19
+ # -ttable ${L1factors} ${L2factors} - -nscores 5 -out ${@D}/_${@F} \
20
+ # && mv ${@D}/_${@F} $@
21
+ # $(unlock)
22
+
23
+
24
+ # # directory definitions
25
+ # mo_mdl = model
26
+ # mo_tmp = model/tmp
27
+ # wrdaln = ${fstaln}/out
28
+ # # wrdaln should be set elsewhere!
29
+
30
+ # # milestone files created during phrase table construction
31
+ # ptable_bin = ${mo_mdl}/ptable.${L1}-${L2}
32
+ # ptable = ${mo_mdl}/ptable.${L1}-${L2}.txt.gz
33
+ # lex1given2 = ${mo_mdl}/${L1}-given-${L2}.lex.gz
34
+ # lex2given1 = ${mo_mdl}/${L2}-given-${L1}.lex.gz
35
+ # mosesinifile = ${mo_mdl}/moses.ini.0
36
+
37
+ # .PHONY: lex ptable
38
+ # lex: ${lex1given2} ${lex2given1}
39
+ # ptable: ${ptable_bin}
40
+
41
+ # # steps taken in this module
42
+
43
+ # # -------------------------------------------------------------------------------
44
+ # # --- STEP 1a: extract raw phrases from word-aligned corpus ---------------------
45
+ # # -------------------------------------------------------------------------------
46
+ # # Note: the script ${moses.extract-phrases} takes care of initial sorting
47
+ # ${mo_tmp}/phrase-extraction.DONE: | ${moses.extract-phrases}
48
+ # ${mo_tmp}/phrase-extraction.DONE: | ${moses.extract}
49
+ # ${mo_tmp}/phrase-extraction.DONE: | ${wrdaln}/${L1}.txt.gz
50
+ # ${mo_tmp}/phrase-extraction.DONE: | ${wrdaln}/${L2}.txt.gz
51
+ # ${mo_tmp}/phrase-extraction.DONE: | ${wrdaln}/${L1}-${L2}.symal.gz
52
+ # ${mo_tmp}/phrase-extraction.DONE:
53
+ # $(lock)
54
+ # ${moses.extract-phrases} \
55
+ # ${moses.extract} \
56
+ # ${wrdaln}/${L1}.txt.gz \
57
+ # ${wrdaln}/${L2}.txt.gz \
58
+ # ${wrdaln}/${L1}-${L2}.symal.gz \
59
+ # ${mo_tmp} ${max_phrase_length} \
60
+ # ${dmodel.type}-${dmodel.orientation} \
61
+ # && touch $@
62
+ # $(unlock)
63
+
64
+ # # -------------------------------------------------------------------------------
65
+ # # --- STEP 1a: extract word translation lexica from word-aligned corpus ---------
66
+ # # --- (for lexical phrase scoring) ---------
67
+ # # -------------------------------------------------------------------------------
68
+ # $(lex2given1): $(lex1given2)
69
+ # $(lex1given2): | ${wrdaln}/${L1}.txt.gz
70
+ # $(lex1given2): | ${wrdaln}/${L2}.txt.gz
71
+ # $(lex1given2): | ${wrdaln}/${L1}-${L2}.symal.gz
72
+ # $(lock)
73
+ # $(moses.make-lex) \
74
+ # ${wrdaln}/${L1}.txt.gz \
75
+ # ${wrdaln}/${L2}.txt.gz \
76
+ # ${wrdaln}/${L1}-${L2}.symal.gz \
77
+ # $(lex1given2) \
78
+ # $(lex2given1)
79
+ # $(unlock)
80
+
81
+ # # -------------------------------------------------------------------------------
82
+ # # --- STEP 2: score extracted phrase pairs --------------------------------------
83
+ # # -------------------------------------------------------------------------------
84
+ # ptfwdhalf = ${mo_tmp}/fwd/phrases.fwd.scored.gz
85
+ # ptbwdhalf = ${mo_tmp}/bwd/phrase-scoring.DONE
86
+
87
+ # # -------------------------------------------------------------------------------
88
+ # # --- STEP 2a: score phrases in the 'forward' direction -------------------------
89
+ # # -------------------------------------------------------------------------------
90
+ # $(ptfwdhalf): | ${mo_tmp}/phrase-extraction.DONE
91
+ # $(ptfwdhalf): | ${lex1given2}
92
+ # $(lock)
93
+ # $(merge-sorted) ${mo_tmp}/fwd/part.*.gz \
94
+ # | ${moses.score-phrases} ${MOSES_BIN}/score - ${lex1given2} ${@:.scored.gz=} \
95
+ # $(ptable.smoothing) && mv $@_ $@
96
+ # $(unlock)
97
+
98
+ # # -------------------------------------------------------------------------------
99
+ # # --- STEP 2b: score phrases in the 'backward' direction -------------------------
100
+ # # -------------------------------------------------------------------------------
101
+ # # Note: ${moses.score-phrases} re-sorts the scored backward phrases
102
+ # $(ptbwdhalf): | ${mo_tmp}/phrase-extraction.DONE
103
+ # $(ptbwdhalf): | ${lex2given1}
104
+ # $(lock)
105
+ # $(merge-sorted) ${mo_tmp}/bwd/part.*.gz \
106
+ # | ${moses.score-phrases} ${MOSES_BIN}/score - ${lex2given1} ${@D}/scored \
107
+ # "$(ptable.smoothing)" --Inverse && touch $@
108
+ # $(unlock)
109
+
110
+ # # -------------------------------------------------------------------------------
111
+ # # --- STEP 3: put the two phrase table halves together --------------------------
112
+ # # -------------------------------------------------------------------------------
113
+ # # ptfwdhalf is a single .gz file, ptbwdhalf is a collection .gz files
114
+ # $(ptable): | ${MOSES_BIN}/consolidate
115
+ # $(ptable): | $(ptfwdhalf) $(ptbwdhalf)
116
+ # $(lock)
117
+ # ${MOSES_BIN}/consolidate \
118
+ # <(zcat ${ptfwdhalf}) \
119
+ # <(${merge-sorted} ${mo_tmp}/bwd/scored.*.gz) /dev/stdout \
120
+ # $(if $(ptable.smoothing), \
121
+ # $(ptable.smoothing) $(ptfwdhalf:.sorted.gz=.coc)) \
122
+ # | gzip > $@_ && mv $@_ $@
123
+ # $(unlock)
124
+
mosesdecoder/contrib/m4m/modules/obsolete/reporting.m4m ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ rset = set=$2,type=$3,file=evaluation/$1/$2.$3
4
+ analyses = $(foreach e, ${eval-sets}, \
5
+ $(call rset,$1,$e,analysis-precision) \
6
+ $(call rset,$1,$e,analysis-coverage))
7
+ eval-scores = $(foreach e, ${eval-sets}, \
8
+ $(foreach m, ${eval-metrics}, \
9
+ $(call rset,$1,$e,$m)))
10
+ eval-results = $(foreach e, ${eval-sets}, \
11
+ $(foreach m, ${eval-metrics}, \
12
+ evaluation/$1/$e.$m))
13
+
14
+
15
+ .SECONDEXPANSION:
16
+ # NOTA BENE: setup-experiments.make adds additional dependencies for
17
+ # evaluation/%/report in the file experiments.make!
18
+ evaluation/%/report: sets = $(call eval-scores,$*)
19
+ #evaluation/%/report: sets += $(call analyses,$*)
20
+ #evaluation/%/report: tuned_moses_ini := $(if ${have_tuned_moses_ini},${have_tuned_moses_ini},tuning/$*/moses.tuned.ini)
21
+ evaluation/%/report: prereqs = $(call eval-results,$*)
22
+ evaluation/%/report: $$(prereqs)
23
+ echo $(foreach s, ${sets}, $s) $^
24
+ mkdir $@.lock
25
+ echo $(call lockline) > $@.lock/owner
26
+ ${report} ${sets} > $@_
27
+ mv $@_ $@
28
+ rm $@.lock/owner
29
+ rmdir $@.lock
30
+
31
+ %.analysis: params1 = -input ${$(notdir $*)-src}
32
+ %.analysis: params1 += -input-corpus ${crp_train}.${L1}
33
+ %.analysis: params1 += -ttable ${ttable} -dir $@
34
+ %.analysis: params2 = -precision-by-coverage
35
+ %.analysis: params2 += -reference ${$(notdir $*)-ref}
36
+ %.analysis: params2 += -system $*.truecased
37
+ %.analysis: params2 += -segmentation $*.output
38
+ %.analysis: params2 += -system-alignment $*.output.wa
39
+ %.analysis: params2 += -coverage $@
40
+ %.analysis: | ${ttable} ${crp_train}.${L1}
41
+ %.analysis: %.output.wa %.output %.truecased
42
+ @echo ANALYSING $^
43
+ @mkdir $@.lock
44
+ @echo $(call lockline) > $@.lock/owner
45
+ ${analyze} ${params1}
46
+ ${analyze} ${params1} ${params2}
47
+ @rm$@.lock/owner
48
+ @rmdir $@.lock
49
+
50
+ %.multi-bleu: %.cleaned
51
+ $(info )
52
+ $(info RUNNING MULTI-BLEU on $^)
53
+ @mkdir $@.lock
54
+ @echo $(call lockline) > $@.lock/owner
55
+ ${multi-bleu} ${$(notdir $*)-ref} < $< > $@_
56
+ @mv $@_ $@
57
+ @rm $@.lock/owner
58
+ @rmdir $@.lock
59
+
60
+ %.truecased: %.cleaned
61
+ mkdir $@.lock
62
+ $(detruecase) < $< > $@_
63
+ mv $@_ $@
64
+ rmdir $@.lock
65
+
66
+ %.cleaned: %.output
67
+ $(info )
68
+ $(info CLEANING UP DECODER OUTPUT: $<)
69
+ $(info )
70
+ mkdir $@.lock
71
+ echo $(call lockline) > $@.lock/owner
72
+ $(clean-decoder-output) < $< > $@_
73
+ mv $@_ $@
74
+ rm $@.lock/owner
75
+ rmdir $@.lock
76
+
77
+ %.output.wa: %.output
78
+ evaluation/%.output: decoder_flags += -threads ${moses.threads} -v 0
79
+ evaluation/%.output: decoder_flags += -inputtype ${input-type}
80
+ evaluation/%.output: decoder_flags += -alignment-output-file $@.wa
81
+ evaluation/%.output: decoder_flags += -t -text-type "test"
82
+ evaluation/%.output: decoder_flags += -f ${moses_ini}
83
+ evaluation/%.output: input = ${$(notdir $*)-src}
84
+ evaluation/%.output:
85
+ echo MOSES_INI = ${moses_ini}
86
+ @mkdir -p $(@D)
87
+ @mkdir $@.lock
88
+ @echo $(call lockline) > $@.lock/owner
89
+ ${decode} ${decoder_flags} < ${input} > $@_
90
+ @mv $@_ $@
91
+ @rm $@.lock/owner
92
+ @rmdir $@.lock
93
+
94
+ .SECONDARY:
95
+
mosesdecoder/contrib/m4m/modules/obsolete/run-moses.m4m ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # This make module deals with running the moses decoder.
4
+ # It sets default parameters and checks that parameters that
5
+ # need to be set elsewhere are actually set.
6
+
7
+ # The following parameters are translation-job specific and need to be set
8
+ # explicitly for each job.
9
+
10
+ moses.threads ?= 4
11
+ moses.flags += -threads ${moses.threads}
12
+ moses.flags += -v 0 -t -text-type "test"
13
+
14
+ %.moses-out.wa: moses.flags += -alignment-output-file $*.output.wa
15
+ %.moses-out.wa: %.moses-out
16
+
17
+
18
+ .SECONDEXPANSION:
19
+ %.moses-out:
20
+ echo MOSES $^
21
+ $(checkvar,moses.input)
22
+ $(checkvar,moses.ini)
23
+ $(lock)
24
+ ${moses} -i ${moses.input} -inputtype ${moses.inputtype} \
25
+ -f ${moses.ini} ${moses.flags} > $@_ && mv $@_ $@
26
+ $(unlock)
27
+
28
+ %.cleaned: %.moses-out
29
+ $(lock)
30
+ $(clean-decoder-output) < $< > $@_ && mv $@_ $@
31
+ $(unlock)
32
+
33
+ %.natcased: %.cleaned
34
+ $(eval $(call lock))
35
+ $(detruecase) < $*.cleaned > $@_ && mv $@_ $@
36
+ $(eval $(call unlock))
37
+
mosesdecoder/contrib/m4m/modules/obsolete/setup-experiments.m4m ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # This make module sets up the actual experiments
4
+
5
+ L1 = fr
6
+ L2 = en
7
+ tune-ref-ready = /fs/sif0/bhaddow/experiments/accept/symantec-baseline/tuning/reference.tc.18
8
+ eval-ref-ready = /fs/saxnot5/germann/accept/homophones/exp.new/evaluation/201201_devtest_b.reference.tok.1
9
+ crp_train = /fs/sif0/bhaddow/experiments/accept/symantec-baseline/training/corpus.19
10
+ ttable = /fs/sif0/bhaddow/experiments/accept/symantec-baseline/model/phrase-table.10
11
+
12
+ untuned_moses_ini = model/moses.ini.0
13
+ fixed-iweight = --activate-feature d_0,d_1,d_2,d_3,d_4,d_5,d_6,lm_0,w_0,tm_0,tm_1,tm_2,tm_3,tm_4
14
+
15
+ # list the evaluation metrics to be used for evaluation
16
+ # TO DO: list available metrics
17
+ eval-metrics = multi-bleu
18
+ moses-threads = 20
19
+ tuning-runs = $(shell seq 25)
20
+
21
+ # experiments.make: WSCHEMES = uniform unigram bigram bigram2
22
+ # experiments.make: DATASETS = tune eval
23
+ # experiments.make: PREPROC = baseline uniq multi
24
+ # experiments.make: CSETS = unfiltered filtered edited
25
+ experiments.make: WSCHEMES = bigram2
26
+ experiments.make: DATASETS = tune eval
27
+ experiments.make: PREPROC = baseline
28
+ experiments.make: CSETS = filtered
29
+ # remake experiments.make if this file changes
30
+ experiments.make: $(word $(words ${MAKEFILE_LIST}), ${MAKEFILE_LIST})
31
+ experiments.make:
32
+ mkdir $@.lock
33
+ echo $(call lockline) > $@.lock/owner
34
+ echo '# -*- Makefile -*-' > $@_
35
+ echo '# This file was automatically generated by setup-experiments.make.' >> $@_
36
+ echo 'experiments := ' >> $@_;
37
+ $(foreach p, ${PREPROC}, \
38
+ echo '# NEW EXPERIMENT #####################################' >> $@_; \
39
+ echo 'experiments += $p' >> $@_; \
40
+ echo 'ctr = $$(words $${experiments})' >> $@_; \
41
+ echo '$p: input-type = 0' >> $@_; \
42
+ echo '$p: eval-sets = $p.eval' >> $@_; \
43
+ echo '$p: tune-src = input/$p.tune.tc' >> $@_; \
44
+ echo '$p: tune-ref = ${tune-ref-ready}' >> $@_; \
45
+ echo '$p: $p.eval-src = input/$p.eval.tc' >> $@_; \
46
+ echo '$p: $p.eval-ref = ${eval-ref-ready}' >> $@_; \
47
+ echo '$p: evaluation/$${ctr}/report' >> $@_; \
48
+ echo >> $@_; \
49
+ echo 'evaluation/$p/%/$p.eval.output: input = input/$p.eval.tc' >> $@_; \
50
+ echo 'evaluation/$p/%/$p.eval.output: input/$p.eval.tc' >> $@_; \
51
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
52
+ 'evaluation/$${ctr}/$p.eval.output: ${tuned_moses_ini}', \
53
+ 'evaluation/$${ctr}/$p.eval.output: tuning/$${ctr}/moses.tuned.ini') >> $@_; \
54
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
55
+ 'evaluation/$${ctr}/$p.eval.output: moses_ini := ${tuned_moses_ini}', \
56
+ 'evaluation/$${ctr}/$p.eval.output: moses_ini := tuning/$${ctr}/moses.tuned.ini') >> $@_; \
57
+ echo 'evaluation/$${ctr}/$p.eval.multi-bleu: $${$p.eval-ref}' >> $@_; \
58
+ echo >> $@_;)
59
+ $(foreach c, ${CSETS}, \
60
+ $(foreach p, ${PREPROC}, \
61
+ $(foreach w, ${WSCHEMES}, \
62
+ echo '# NEW EXPERIMENT #####################################' >> $@_; \
63
+ echo 'experiments += $w-$c-$p' >> $@_; \
64
+ echo 'ctr = $$(words $${experiments})' >> $@_; \
65
+ echo '$w-$c-$p: input-type = 1' >> $@_; \
66
+ echo '$w-$c-$p: eval-sets = $w-$c-$p.eval' >> $@_; \
67
+ echo '$w-$c-$p: tune-src = input/$w-$c-$p.tune.cfn' >> $@_; \
68
+ echo '$w-$c-$p: tune-ref = ${tune-ref-ready}' >> $@_; \
69
+ echo '$w-$c-$p: $w-$c-$p.eval-src = input/$w-$c-$p.eval.cfn' >> $@_; \
70
+ echo '$w-$c-$p: $w-$c-$p.eval-ref = ${eval-ref-ready}' >> $@_; \
71
+ echo '$w-$c-$p: evaluation/$${ctr}/report' >> $@_; \
72
+ echo >> $@_; \
73
+ echo 'evaluation/$${ctr}/$w-$c-$p.eval.output: input = input/$w-$c-$p.eval.cfn' >> $@_; \
74
+ echo 'evaluation/$${ctr}/$w-$c-$p.eval.output: input/$w-$c-$p.eval.cfn' >> $@_; \
75
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
76
+ 'evaluation/$${ctr}/$w-$c-$p.eval.output: ${tuned_moses_ini}', \
77
+ 'evaluation/$${ctr}/$w-$c-$p.eval.output: tuning/$${ctr}/moses.tuned.ini') >> $@_; \
78
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
79
+ 'evaluation/$${ctr}/$w-$c-$p.eval.output: moses_ini := ${tuned_moses_ini}', \
80
+ 'evaluation/$${ctr}/$w-$c-$p.eval.output: moses_ini := tuning/$${ctr}/moses.tuned.ini') >> $@_; \
81
+ echo 'evaluation/$${ctr}/$w-$c-$p.eval.multi-bleu: $${$w-$c-$p.eval-ref}' >> $@_; \
82
+ echo >> $@_;\
83
+ $(foreach d, tune eval, \
84
+ echo 'cfn-targets += input/$w-$c-$p.$d.cfn' >> $@_; \
85
+ echo 'input/$w-$c-$p.$d.cfn: input/$p.$d.tc' >> $@_; \
86
+ printf '\t@mkdir $$@.lock\n\t@echo $$(call lockline) > $$@.lock/owner\n' >> $@_; \
87
+ printf '\tcreate-confusion-network.01.exe -q -w $w -s csets/csets.$c.txt -c ../mm/fr < $$< > $$@_ && mv $$@_ $$@\n' >> $@_;\
88
+ printf '\t@rm $$@.lock/owner\n\t@rmdir $$@.lock\n' >> $@_;))))
89
+ echo '.PHONY += $$(experiments) cfn' >> $@_
90
+ echo 'cfns: $${cfn-targets}' >> $@_
91
+ @mv $@_ $@
92
+ @rm $@.lock/owner
93
+ @rmdir $@.lock
94
+
95
+
96
+
97
+ # # echo 'ctr = $$(words $${experiments})' >> $@_; \
98
+ # echo 'eval-sets = $w-$c-$p.eval' >> $@_; \
99
+ # echo 'rx := $$(call report-prereqs,$${ctr},$${eval-sets})' >> $@_; \
100
+ # echo '$w-$c-$p: run-id := $${ctr}' >> $@_; \
101
+ # echo '$w-$c-$p: tune-input = input/$w-$c-$p.tune.cfn' >> $@_; \
102
+ # echo '$w-$c-$p: tune-src = input/$w-$c-$p.tune.cfn' >> $@_; \
103
+ # echo '$w-$c-$p: tune-ref = ${tune-ref-ready}' >> $@_; \
104
+ # echo '$w-$c-$p: $w-$c-$p.eval-src = input/$w-$c-$p.eval.cfn' >> $@_; \
105
+ # echo '$w-$c-$p: $w-$c-$p.eval-ref = ${eval-ref-ready}' >> $@_; \
106
+ # echo '$w-$c-$p: input-type = 1' >> $@_; \
107
+ # echo '$w-$c-$p: mert.options += $$(if $$(findstring uniform,$w),${fixed-iweight})' >> $@_; \
108
+ # echo '$w-$c-$p: evaluation/report.$${ctr}' >> $@_; \
109
+ # echo >> $@_; \
110
+ # echo 'evaluation/$w-$c-$p.eval.output.$${ctr}: input = input/$w-$c-$p.eval.cfn' >> $@_; \
111
+ # echo >> $@_; \
112
+ # $(foreach d, tune eval, \
113
+ # ofile=input/$w-$c-$p.$d.cfn; \
114
+ # ifile=input/$p.$d.tc; \
115
+ # echo "$$ofile: $$ifile" >> $@_ ; \
116
+ # printf '\t create-confusion-network.01.exe -w $w -s csets/cset.$c.txt -c ../mm/fr < $$< > $$@_ && mv $$@_ $$@\n' >> $@_ ; \
117
+ # echo >> $@_; ))))
118
+ # echo '.PHONY += $$(experiments)' >> $@_
119
+ # @mv $@_ $@
120
+ # @rm $@.lock/owner
121
+ # @rmdir $@.lock
mosesdecoder/contrib/m4m/modules/obsolete/skip-steps.mak ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # Specify in this file resources that you already have
4
+ run_id ?= 0
5
+
6
+ untuned_moses_ini := model/moses.ini.0
7
+ moses_ini_for_tuning = ${untuned_moses_ini}
8
+ moses_ini_for_eval = ${tuned_moses_ini}
9
+
10
+ # Notes:
11
+ #
12
+ # - if ${moses_ini_for_tuning} is different from ${untuned_mose_ini}, the phrase table and the
13
+ # lexical distortion table will be filtered for tuning (see tune.make)
14
+ # - if ${moses_ini_for_eval} is different from ${tuned_mose_ini}, the phrase table and the
15
+ # lexical distortion table will be filtered for evaluation (see eval.make)
16
+
17
+
18
+ all:
19
+ echo ";$(foo);"
mosesdecoder/contrib/m4m/modules/obsolete/system.m4m ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ # This module defines the actual system
4
+
5
+ # Choose names for translation and distortion model
6
+ ptable = model/tm/ptable.${L1}-${L2}
7
+ dtable = model/dm/dtable.${L1}-${L2}
8
+
9
+ # specify the underlying corpus
10
+ pll.txt1 ?= crp/trn/aln/${word-alignment}/${L1}.txt.gz
11
+ pll.txt2 ?= crp/trn/aln/${word-alignment}/${L2}.txt.gz
12
+ pll.aln ?= crp/trn/aln/${word-alignment}/${L1}-${L2}.symal.gz
13
+
14
+ # specify the distortion model parameters; we bunch them
15
+ # all together in one string
16
+ ${ptable}: dmodels = wbe-mslr-bidirectional-fe-allff
17
+
18
+ # phrase table parameters: maximum phrase length and smoothing
19
+ ptable.max-phrase-length = 7
20
+ ptable.smoothing = --GoodTuring
21
+
22
+ #$(info $(call add_binary_phrase_table,0,0,5,${ptable},info))
23
+ $(eval $(call add_binary_phrase_table,0,0,5,${ptable}))
24
+
25
+ $(eval $(call add_binary_reordering_table,\
26
+ 0-0,wbe-mslr-bidirectional-fe-allff,6,${dtable},${ptable}))
27
+
28
+ $(info $(call add_binary_reordering_table,\
29
+ 0-0,wbe-mslr-bidirectional-fe-allff,6,${dtable},${ptable},info))
30
+
31
+ # below: moses.ini.0 is the moses ini file PRE-TUNING!
32
+ define build_system
33
+ $1/moses.ini.0
34
+
35
+
36
+ makefile:
37
+ $(info $(call add_binary_phrase_table,0,0,5,${ptable},info))
38
+
mosesdecoder/contrib/m4m/modules/obsolete/template.m4m ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+
3
+ define setup =
4
+ echo 'experiments := ' >> $@_; \
5
+ $(foreach p, ${PREPROC}, \
6
+ echo '# NEW EXPERIMENT #####################################' >> $@_; \
7
+ echo 'experiments += ${tag}' >> $@_; \
8
+ echo 'ctr = $$(words $${experiments})' >> $@_; \
9
+ echo '$: input-type = $(2)' >> $@_; \
10
+ echo '${tag}: eval-sets = ${tag}.eval' >> $@_; \
11
+ echo '${tag}: tune-src = input/${tag}.tune.tc' >> $@_; \
12
+ echo '${tag}: tune-ref = ${tune-ref-ready}' >> $@_; \
13
+ echo '${tag}: ${tag}.eval-src = input/${tag}.eval.$(if $(findstring 1,$(2),cfn,tc))' >> $@_; \
14
+ echo '${tag}: ${tag}.eval-ref = ${eval-ref-ready}' >> $@_; \
15
+ echo '${tag}: evaluation/$${ctr}/report' >> $@_; \
16
+ $(foreach e, ${tag}.eval, \
17
+ $(foreach m, ${eval-metrics}, \
18
+ echo 'evaluation/$${ctr}/report: evaluation/$${ctr}/$e.$m' >> $@_;) \
19
+ echo 'evaluation/$${ctr}/report: evaluation/$${ctr}/$e.analysis' >> $@_;) \
20
+ echo >> $@_; \
21
+ echo 'evaluation/$${ctr}/${tag}.eval.output: input = input/${tag}.eval.tc' >> $@_; \
22
+ echo 'evaluation/$${ctr}/${tag}.eval.output: input/${tag}.eval.tc' >> $@_; \
23
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
24
+ 'evaluation/$${ctr}/${tag}.eval.output: ${tuned_moses_ini}', \
25
+ 'evaluation/$${ctr}/${tag}.eval.output: tuning/$${ctr}/moses.tuned.ini') >> $@_; \
26
+ echo 'evaluation/$${ctr}/${tag}.eval.multi-bleu: $${${tag}.eval-ref}' >> $@_; \
27
+ echo >> $@_;)
28
+ echo '.PHONY += $$(experiments)' >> $@_
29
+ @mv $@_ $@
30
+ @rm $@.lock/owner
31
+ @rmdir $@.lock
32
+
33
+
34
+ # $(1): system / input processing
35
+ # $(2): input type (cfn or text)
36
+ define setup_experiment =
37
+ echo 'experiments := ' >> $@_; \
38
+ $(foreach p, ${PREPROC}, \
39
+ echo '# NEW EXPERIMENT #####################################' >> $@_; \
40
+ echo 'experiments += $(1)' >> $@_; \
41
+ echo 'ctr = $$(words $${experiments})' >> $@_; \
42
+ echo '$(1): input-type = $(2)' >> $@_; \
43
+ echo '$(1): eval-sets = $(1).eval' >> $@_; \
44
+ echo '$(1): tune-src = input/$(1).tune.tc' >> $@_; \
45
+ echo '$(1): tune-ref = ${tune-ref-ready}' >> $@_; \
46
+ echo '$(1): $(1).eval-src = input/$(1).eval.$(if $(findstring 1,$(2),cfn,tc))' >> $@_; \
47
+ echo '$(1): $(1).eval-ref = ${eval-ref-ready}' >> $@_; \
48
+ echo '$(1): evaluation/$${ctr}/report' >> $@_; \
49
+ $(foreach e, $(1).eval, \
50
+ $(foreach m, ${eval-metrics}, \
51
+ echo 'evaluation/$${ctr}/report: evaluation/$${ctr}/$e.$m' >> $@_;) \
52
+ echo 'evaluation/$${ctr}/report: evaluation/$${ctr}/$e.analysis' >> $@_;) \
53
+ echo >> $@_; \
54
+ echo 'evaluation/$${ctr}/$(1).eval.output: input = input/$(1).eval.tc' >> $@_; \
55
+ echo 'evaluation/$${ctr}/$(1).eval.output: input/$(1).eval.tc' >> $@_; \
56
+ echo $(if $(findstring ini,${tuned_moses_ini}), \
57
+ 'evaluation/$${ctr}/$(1).eval.output: ${tuned_moses_ini}', \
58
+ 'evaluation/$${ctr}/$(1).eval.output: tuning/$${ctr}/moses.tuned.ini') >> $@_; \
59
+ echo 'evaluation/$${ctr}/$(1).eval.multi-bleu: $${$(1).eval-ref}' >> $@_; \
60
+ echo >> $@_;)
61
+ echo '.PHONY += $$(experiments)' >> $@_
62
+ @mv $@_ $@
63
+ @rm $@.lock/owner
64
+ @rmdir $@.lock
65
+
66
+ endef
mosesdecoder/contrib/m4m/modules/obsolete/tune.m4m ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- Makefile -*-
2
+ # make module for tuning a system
3
+
4
+ #tune.input ?= $(error missing
5
+ #tuned_moses_ini ?= tuning/moses.ini.${run_id}
6
+ #$(tuned_moses_ini): | ${untuned_moses_ini}
7
+ #$(tuned_moses_ini): | ${untuned_moses_ini}
8
+
9
+ # make sure that all necessary variables are set
10
+ untuned_moses_ini ?= $(error Fatal error: the required variable untuned_moses_ini is not set)
11
+ tuning_input ?= $(error Fatal error: the required variable tuning_input is not set)
12
+ tuning_reference ?= $(error Fatal error: the required variable tuning_reference is not set)
13
+ tuning_itype ?= $(error Fatal error: the required variable tuning_itype is not set)
14
+ tuning_wdir ?= $(error Fatal error: the required variable tuning_wdir is not set)
15
+
16
+ $tuning_root_dir ?= ${MOSES_ROOT}
17
+
18
+
19
+
20
+ # default tuning parameters
21
+ mert.nbest ?= 100
22
+ mert.decoder-threads ?= 4
23
+ tuning/%/tmp/moses.ini: mertcmd =
24
+ tuning/%/tmp/moses.ini: mert_flags += --working-dir $(CURDIR)/tuning/$*/tmp
25
+ tuning/%/tmp/moses.ini: mert_flags += --decoder-flags "${mert.decoder_flags} -inputtype ${input-type}"
26
+ tuning/%/tmp/moses.ini: mert_flags += --rootdir ${MOSES_ROOT}/scripts
27
+ tuning/%/tmp/moses.ini: mert_flags += --mertdir ${MOSES_BIN}
28
+ tuning/%/tmp/moses.ini: mert_flags += ${mert.options}
29
+ tuning/%/tmp/moses.ini: ${untuned_moses_ini}
30
+ $(info TUNING: ${tune} ${tune-src} ${tune-ref} ${decode} ${untuned_moses_ini} ${mert_flags})
31
+ @mkdir -p $(@D)
32
+ @mkdir $@.lock
33
+ @echo $(call lockline) > $@.lock/owner
34
+ ${tune} ${mert_flags} ${tune-src} ${tune-ref} ${decode} ${untuned_moses_ini}
35
+ @rm $@.lock/owner
36
+ @rmdir $@.lock
37
+
38
+ tuning/%/moses.tuned.ini: tuning/%/tmp/moses.ini
39
+ @mkdir -p $(@D)
40
+ @mkdir $@.lock
41
+ @echo $(call lockline) > $@.lock/owner
42
+ ${apply-weights} tuning/$*/tmp/moses.ini < ${untuned_moses_ini} > $@_
43
+ @mv $@_ $@
44
+ @rm $@.lock/owner
45
+ @rmdir $@.lock
mosesdecoder/contrib/m4m/scripts/fast-align2bal.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # Auxiliary script to convert fast_align output to the "bal" input format
3
+ # that symal requires.
4
+ # Script by Ulrich Germann.
5
+
6
+ # command line args:
7
+ # <L1 plain text> <L2 plain text> <L1-L2 alignments> <L2-L1 alignments>
8
+ #
9
+ # TO DO: - proper argument parsing with getopt
10
+ # - help text
11
+
12
+ import sys,os
13
+
14
+ (T1,T2,fwd,bwd) = [open(x) for x in sys.argv[1:]]
15
+
16
+ def alnvec(slen,alinks,mode):
17
+ d = dict([[int(x[mode]),int(x[(mode+1)%2])+1] for x
18
+ in [y.split('-') for y in alinks]])
19
+ return [d.get(i,0) for i in xrange(slen)]
20
+
21
+ ctr = 0
22
+ for t1 in T1:
23
+ t1 = t1.strip().split()
24
+ t2 = T2.readline().strip().split()
25
+ a1 = alnvec(len(t1),bwd.readline().split(),0)
26
+ a2 = alnvec(len(t2),fwd.readline().split(),1)
27
+ print 1
28
+ print len(t2), " ".join(t2), '#', " ".join(["%d"%x for x in a2])
29
+ print len(t1), " ".join(t1), '#', " ".join(["%d"%x for x in a1])
30
+ ctr += 1
31
+ pass
mosesdecoder/contrib/m4m/scripts/giza.txt2snt.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Wrapper script around plain2snt that allows us to generate the numberized
3
+ # files from gzipped text files via named pipes. (c) 2011-2012 Ulrich Germann
4
+
5
+ fail()
6
+ {
7
+ echo $@
8
+ exit 1
9
+ }
10
+
11
+ on_term()
12
+ {
13
+ rm $odir/${L1}
14
+ rm $odir/${L2}
15
+ }
16
+
17
+ trap 'on_term' TERM EXIT QUIT INT 0
18
+
19
+ if [ $# -lt 4 ]; then
20
+ fail "usage: $0 <txtdir> <L1> <L2> <odir>"
21
+ fi
22
+
23
+ txtdir=$1
24
+ L1=$2
25
+ L2=$3
26
+ odir=$4
27
+
28
+ mkdir -p $odir
29
+ mkfifo $odir/${L1} || exit 1
30
+ mkfifo $odir/${L2} || exit 1
31
+
32
+ find -L ${txtdir} -name "*.${L1}" -or -name "*.${L1}.gz" | sort | xargs zcat -f > $odir/${L1} &
33
+ find -L ${txtdir} -name "*.${L2}" -or -name "*.${L2}.gz" | sort | xargs zcat -f > $odir/${L2} &
34
+
35
+ pushd $odir
36
+ plain2snt ${L1} ${L2}
37
+ wait
38
+ mv ${L1}_${L2}.snt ${L1}-${L2}.snt
39
+ mv ${L2}_${L1}.snt ${L2}-${L1}.snt
40
+ wait
41
+ popd
mosesdecoder/contrib/m4m/scripts/moses.extract-phrases.sh ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # helper script for phrase extraction
3
+ # (c) 2011-2012 Ulrich Germann
4
+ # txtdir - directory with gzipped plain text files
5
+ # sntdir - directory with files in Giza's .snt format, also including the .OK files
6
+ # produced by giza.txt2snt.sh
7
+ # gizdir - directory where aligned corpus resides
8
+ # L1,L2 - language tags for L1,L2
9
+ # plmax - max phrase length to be extraced
10
+
11
+ extractor=$1
12
+ L1_text=$2
13
+ L2_text=$3
14
+ aln=$4
15
+ odir=$5
16
+ max_plen=$6
17
+ dmodel=$7
18
+
19
+
20
+ echo $#
21
+ if [ $# -lt 6 ] ; then
22
+ echo <<EOF \
23
+ "usage: $0 <moses-extract-command> <L1 text> <L2 text> <alignment file> <output dir> <max phrase length> <distortion-model>"
24
+ EOF
25
+ exit 1
26
+ fi
27
+
28
+ fifo=$odir/fifo.$$
29
+
30
+ cleanup()
31
+ {
32
+ if [ -e $fifo ] ; then rm $fifo; fi
33
+ if [ -e $fifo.inv ] ; then rm $fifo.inv; fi
34
+ if [ -e $fifo.o ] ; then rm $fifo.o; fi
35
+ }
36
+
37
+ trap 'cleanup' 0
38
+ export LC_ALL=C
39
+ mkdir -p $odir/fwd $odir/bwd $odir/dst
40
+ mkfifo $fifo
41
+ parallel < $fifo -j6 --pipe --blocksize 250M "sort -S 5G | gzip > $odir/fwd/part.{#}.gz" &
42
+ mkfifo $fifo.inv
43
+ parallel < $fifo.inv -j6 --pipe --blocksize 250M "sort -S 5G | gzip > $odir/bwd/part.{#}.gz" &
44
+ if [ "$dmodel" != "" ] ; then
45
+ mkfifo $fifo.o
46
+ parallel < $fifo.o -j6 --pipe --blocksize 250M "sort -S 5G | gzip > $odir/dst/part.{#}.gz" &
47
+ dmodel="orientation --model $dmodel"
48
+ fi
49
+ #echo "($extractor <(zcat -f $L2_text) <(zcat -f $L1_text) <(zcat -f $aln) $fifo $max_plen $dmodel) || exit 1"
50
+ ($extractor <(zcat -f $L2_text) <(zcat -f $L1_text) <(zcat -f $aln) $fifo $max_plen $dmodel) || exit 1
51
+
52
+ wait
53
+
54
+ # for part in fwd bwd dst; do
55
+ # echo -n '' > $odir/${part}/sort.batch
56
+ # for f in $odir/${part}/part.[0-9][0-9][0-9][0-9].gz; do
57
+ # g=`echo $f | sed 's/.gz$//'`
58
+ # # echo "f=$g; if [ -e \$f.gz ] ; then zcat \$f.gz | LC_ALL=C sort | gzip > \$f.gz_ && mv \$f.gz_ \$f.sorted.gz && rm \$f.gz; fi" \
59
+ # echo "f=$g; if [ -e \$f.gz ] ; then zcat \$f.gz | LC_ALL=C sort | gzip > \$f.gz_ && mv \$f.gz_ \$f.sorted.gz; fi" \
60
+ # >> $odir/${part}/sort.batch
61
+ # done
62
+ # done
63
+
mosesdecoder/contrib/m4m/scripts/moses.make-lex.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Quick hack to extract lexica from Giza-Aligned corpus
4
+ # (c) 2011 Ulrich Germann
5
+
6
+ import sys, os
7
+
8
+ D = os.popen("zcat %s" % sys.argv[1])
9
+ E = os.popen("zcat %s" % sys.argv[2])
10
+ A = os.popen("zcat %s" % sys.argv[3])
11
+ d_given_e = sys.argv[4]
12
+ e_given_d = sys.argv[5]
13
+
14
+ try:
15
+ os.makedirs(os.path.dirname(d_given_e))
16
+ os.makedirs(os.path.dirname(e_given_d))
17
+ except:
18
+ pass
19
+
20
+ WD = ["NULL","UNK"]
21
+ WE = ["NULL","UNK"]
22
+ VD = {}
23
+ VE = {}
24
+ JJ = []
25
+ MD = []
26
+ ME = []
27
+
28
+ def id(V,W,x):
29
+ i = V.setdefault(x,len(W))
30
+ if i == len(W): W.append(x)
31
+ return i
32
+
33
+ ctr = 0
34
+ for dline in D:
35
+ ctr += 1
36
+ #if ctr % 1000 == 0: sys.stderr.write('.')
37
+ eline = E.readline()
38
+ aline = A.readline()
39
+ d = [id(VD,WD,w) for w in dline.strip().split()]
40
+ e = [id(VE,WE,w) for w in eline.strip().split()]
41
+ a = [[int(y) for y in x.split('-')] for x in aline.split()]
42
+
43
+ while len(MD) <= len(VD) + 2:
44
+ MD.append(0)
45
+ JJ.append({})
46
+ pass
47
+
48
+ while len(ME) <= len(VE) + 2:
49
+ ME.append(0)
50
+ pass
51
+
52
+ fd = [0 for i in xrange(len(d))]
53
+ fe = [0 for i in xrange(len(e))]
54
+ for x,y in a:
55
+ fd[x] += 1
56
+ fe[y] += 1
57
+ MD[d[x]] += 1
58
+ ME[e[y]] += 1
59
+ JJ[d[x]][e[y]] = JJ[d[x]].setdefault(e[y],0) + 1
60
+ # print WD[d[x]],WE[e[y]],JJ[d[x]][e[y]]
61
+ pass
62
+ for i in [d[k] for k in xrange(len(d)) if fd[k] == 0]:
63
+ ME[0] += 1
64
+ MD[i] += 1
65
+ JJ[i][0] = JJ[i].setdefault(0,0) + 1
66
+ pass
67
+ for i in [e[k] for k in xrange(len(e)) if fe[k] == 0]:
68
+ ME[i] += 1
69
+ MD[0] += 1
70
+ JJ[0][i] = JJ[0].setdefault(i,0) + 1
71
+ pass
72
+ pass
73
+
74
+ ED = os.popen("gzip > %s" % e_given_d, 'w')
75
+ DE = os.popen("gzip > %s" % d_given_e, 'w')
76
+
77
+ for d in xrange(len(JJ)):
78
+ T = JJ[d]
79
+ for e,jj in T.items():
80
+ print >>ED, WE[e], WD[d], float(jj)/MD[d]
81
+ print >>DE, WD[d], WE[e], float(jj)/ME[e]
82
+ pass
83
+ pass
84
+
85
+ ED.close()
86
+ DE.close()
mosesdecoder/contrib/m4m/scripts/moses.phrase-extract.sh ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Helper script for phrase extraction from a single corpus shard.
3
+ # Written by Ulrich Germann.
4
+
5
+ # to be added: built-in factor filtering for factored models
6
+
7
+ cleanup()
8
+ {
9
+ if [ -e $fifo ] ; then rm $fifo; fi
10
+ if [ -e $fifo.inv ] ; then rm $fifo.inv; fi
11
+ if [ -e $fifo.o ] ; then rm $fifo.o; fi
12
+ }
13
+
14
+ usage()
15
+ {
16
+ echo
17
+ echo "$0: wrapper script to extract phrases from word-aligned corpus"
18
+ echo -e "usage:\n $0 <extractor> <ibase> <L1tag> <L2tag> [-x] "
19
+ echo "options:"
20
+ echo "-l: maximum phrase length ($plen)"
21
+ echo "-m: distortion model specification"
22
+ echo "-o: base name for output files .fwd.gz .bwd.gz [.<dmodel>.dst.gz]"
23
+ echo "-x: (no argument) don't create .fwd.gz and .bwd.gz"
24
+ echo
25
+ echo "required input files: <ibase>.<L1tag>.gz ibase.<L2tag>.gz ibase.<aln>.gz"
26
+ }
27
+
28
+ plen=7
29
+ nottable=
30
+ dmodel=
31
+ dspec=
32
+ pargs=
33
+ sfactors=
34
+ tfactors=
35
+ while [ $# -gt 0 ]; do
36
+ case $1 in
37
+ -l*) plen=${1#-l}
38
+ plen=${plen#=}
39
+ if [ -z $plen ] ; then
40
+ shift
41
+ plen=$1
42
+ fi
43
+ ;;
44
+ -m*) dmodel=${1#-m}
45
+ dmodel=${dmodel#=}
46
+ if [ -z $dmodel ] ; then
47
+ shift
48
+ dmodel="$1"
49
+ fi
50
+ ;;
51
+ -o*) obase=${1#-o}
52
+ obase=${obase#=}
53
+ if [ -z $obase ] ; then
54
+ shift
55
+ obase=$1
56
+ fi
57
+ ;;
58
+ -s*) sfactors=${1#-s}
59
+ sfactors=${sfactors#=}
60
+ if [ -z $sfactors ] ; then
61
+ shift
62
+ sfactors = $1
63
+ fi
64
+ ;;
65
+ -t*) tfactors=${1#-t}
66
+ tfactors=${tfactors#=}
67
+ if [ -z $tfactors ] ; then
68
+ shift
69
+ sfactors = $1
70
+ fi
71
+ ;;
72
+ -x) nottable=1;;
73
+ -h) usage; exit 0;;
74
+ *) pargs=(${pargs[*]} $1);;
75
+ esac
76
+ shift
77
+ done
78
+
79
+ if [ -n "$sfactors" ] || [ -n "$tfactors" ] ; then
80
+ echo "Factor filtering is not implemented yet!"
81
+ exit 2
82
+ fi
83
+
84
+ extract=${pargs[0]}
85
+ ibase=${pargs[1]}
86
+ L1tag=${pargs[2]}
87
+ L2tag=${pargs[3]}
88
+ obase=${obase:=$ibase}
89
+
90
+ fifo=$obase.$$
91
+ trap 'cleanup' 0
92
+
93
+ export LC_ALL=C
94
+ if [ -z "$nottable" ] ; then
95
+ mkfifo $fifo; sort -S 5G < $fifo | gzip > $obase.fwd.gz &
96
+ mkfifo $fifo.inv; sort -S 5G < $fifo.inv | gzip > $obase.bwd.gz &
97
+ fi
98
+ if [ -n "$dmodel" ] ; then
99
+ mkfifo $fifo.o
100
+ sort -S 5G < $fifo.o | gzip > $obase.dst.gz &
101
+ dspec="orientation --model "
102
+ dspec+=`echo $dmodel | perl -pe 's/((hier|phrase|wbe)-(msd|msrl|mono)).*/$1/;'`
103
+ fi
104
+
105
+ txt1=${ibase}.${L1tag}.gz
106
+ txt2=${ibase}.${L2tag}.gz
107
+ aln=${ibase}.aln.gz
108
+ echo "($extract <(zcat -f $txt1) <(zcat -f $txt2) <(zcat -f $aln) $fifo $plen $dspec) || exit 1"
109
+ ($extract <(zcat -f $txt2) <(zcat -f $txt1) <(zcat -f $aln) $fifo $plen $dspec) || exit 1
110
+ wait
mosesdecoder/contrib/m4m/scripts/moses.score-phrases.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Wrapper script around the moses phrase scoring utility.
3
+ # Script by Ulrich Germann. Called from within M4M.
4
+ #
5
+ # lexicon given should be
6
+ # de-given-en for fwd
7
+ # en-given-de for bwd
8
+
9
+ binary=$1
10
+ phrases=$2
11
+ lex=$3
12
+ obase=$4
13
+ smoothing=$5
14
+ inv=$6
15
+
16
+ cleanup()
17
+ {
18
+ if [ -e $obase.$$ ] ; then rm $obase.$$; fi
19
+ if [ -e $obase.$$.coc ] ; then mv $obase.$$.coc $obase.coc; fi
20
+ }
21
+
22
+ mkfifo $obase.$$ || exit 1
23
+
24
+ trap 'cleanup' 0
25
+
26
+ export LC_ALL=C
27
+ if [[ "$inv" == "--Inverse" ]] ; then
28
+ parallel --gnu < $obase.$$ -j10 --pipe --blocksize 250M "sort -S 10G | gzip > $obase.{#}.gz" &
29
+ else
30
+ gzip < $obase.$$ > $obase.scored.gz_ &
31
+ fi
32
+
33
+ if [[ $phrases != "-" && $phrases != "/dev/stdin" ]] ; then
34
+ $binary $phrases <(zcat -f $lex) $obase.$$ $smoothing $inv || exit 1
35
+ else
36
+ $binary /dev/stdin <(zcat -f $lex) $obase.$$ $smoothing $inv || exit 1
37
+ fi
38
+
39
+ if [ $? ] ; then exit $?; fi
40
+ wait
41
+ exit $?;
mosesdecoder/contrib/m4m/scripts/moses.transfer-weights.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Combines the system definition from one .ini file with the weights contained
4
+ # in another. Works for the new moses.ini format with fully named feature
5
+ # functions. Writes the new .ini file to stdout
6
+ # Script by Ulrich Germann.
7
+
8
+ import re,sys,os
9
+ from optparse import OptionParser
10
+
11
+ SectionHeaderPattern = re.compile(r'^\[(.*)\]\s*$')
12
+ def read_ini(filename):
13
+ '''
14
+ Reads a moses.ini file and returns a dictionary mapping
15
+ from section names to a list of lines contained in that section.
16
+ '''
17
+ AllSections = {}
18
+ CurSection = AllSections.setdefault('',[])
19
+ for line in open(filename):
20
+ line = line.strip()
21
+ m = SectionHeaderPattern.match(line)
22
+ if m:
23
+ CurSection = AllSections.setdefault(m.group(1),[])
24
+ elif len(line):
25
+ CurSection.append(line)
26
+ pass
27
+ pass
28
+ return AllSections
29
+
30
+ parser = OptionParser()
31
+ parser.add_option("-s", "--system", dest = "system",
32
+ help = "moses.ini file defining the system")
33
+ parser.add_option("-w", "--weights", dest = "weight",
34
+ help = "moses.ini file defining the system")
35
+
36
+ opts,args = parser.parse_args()
37
+
38
+ system = read_ini(opts.system)
39
+ weight = read_ini(opts.weight)
40
+
41
+ for s in system:
42
+ if len(s) == 0 or s[0:6] == 'weight': continue
43
+ print "[%s]"%s
44
+ print "\n".join(system[s])
45
+ print
46
+ pass
47
+
48
+ if 'weight' in weight:
49
+ print '[weight]'
50
+ print "\n".join(weight['weight'])
51
+ else:
52
+ for s in weight:
53
+ if s[0:6] != 'weight': continue
54
+ print "[%s]"%s
55
+ print "\n".join(system[s])
56
+ print
57
+ pass
58
+ pass
59
+
60
+
61
+
mosesdecoder/contrib/m4m/util/Jamfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ external-lib bzip2 ;
2
+ external-lib zlib ;
3
+
4
+ exe merge-sorted :
5
+ merge-sorted.cc
6
+ $(TOP)/moses/TranslationModel/UG/mm//mm
7
+ $(TOP)/moses/TranslationModel/UG/generic//generic
8
+ $(TOP)//boost_iostreams
9
+ $(TOP)//boost_program_options
10
+ ;
11
+
12
+
mosesdecoder/contrib/memscore/Makefile.in ADDED
@@ -0,0 +1,581 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Makefile.in generated by automake 1.9.6 from Makefile.am.
2
+ # @configure_input@
3
+
4
+ # Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
5
+ # 2003, 2004, 2005 Free Software Foundation, Inc.
6
+ # This Makefile.in is free software; the Free Software Foundation
7
+ # gives unlimited permission to copy and/or distribute it,
8
+ # with or without modifications, as long as this notice is preserved.
9
+
10
+ # This program is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY, to the extent permitted by law; without
12
+ # even the implied warranty of MERCHANTABILITY or FITNESS FOR A
13
+ # PARTICULAR PURPOSE.
14
+
15
+ @SET_MAKE@
16
+
17
+ # memscore - in-memory phrase scoring for Statistical Machine Translation
18
+ # Christian Hardmeier, FBK-irst, Trento, 2010
19
+ # $Id$
20
+
21
+ srcdir = @srcdir@
22
+ top_srcdir = @top_srcdir@
23
+ VPATH = @srcdir@
24
+ pkgdatadir = $(datadir)/@PACKAGE@
25
+ pkglibdir = $(libdir)/@PACKAGE@
26
+ pkgincludedir = $(includedir)/@PACKAGE@
27
+ top_builddir = .
28
+ am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
29
+ INSTALL = @INSTALL@
30
+ install_sh_DATA = $(install_sh) -c -m 644
31
+ install_sh_PROGRAM = $(install_sh) -c
32
+ install_sh_SCRIPT = $(install_sh) -c
33
+ INSTALL_HEADER = $(INSTALL_DATA)
34
+ transform = $(program_transform_name)
35
+ NORMAL_INSTALL = :
36
+ PRE_INSTALL = :
37
+ POST_INSTALL = :
38
+ NORMAL_UNINSTALL = :
39
+ PRE_UNINSTALL = :
40
+ POST_UNINSTALL = :
41
+ bin_PROGRAMS = memscore$(EXEEXT)
42
+ @IRSTLM_TRUE@am__append_1 = phraselm.cpp phraselm.h
43
+ @CHANNEL_SCORER_TRUE@am__append_2 = channel-scorer.cpp channel-scorer.h
44
+ subdir = .
45
+ DIST_COMMON = $(am__configure_deps) $(srcdir)/Makefile.am \
46
+ $(srcdir)/Makefile.in $(srcdir)/config.h.in \
47
+ $(top_srcdir)/configure depcomp install-sh missing
48
+ ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
49
+ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_boost_base.m4 \
50
+ $(top_srcdir)/configure.ac
51
+ am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
52
+ $(ACLOCAL_M4)
53
+ am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
54
+ configure.lineno configure.status.lineno
55
+ mkinstalldirs = $(install_sh) -d
56
+ CONFIG_HEADER = config.h
57
+ CONFIG_CLEAN_FILES =
58
+ am__installdirs = "$(DESTDIR)$(bindir)"
59
+ binPROGRAMS_INSTALL = $(INSTALL_PROGRAM)
60
+ PROGRAMS = $(bin_PROGRAMS)
61
+ am__memscore_SOURCES_DIST = datastorage.h memscore.h phrasetable.h \
62
+ scorer.h scorer-impl.h statistic.h timestamp.h phrasetable.cpp \
63
+ memscore.cpp scorer.cpp lexdecom.cpp lexdecom.h phraselm.cpp \
64
+ phraselm.h channel-scorer.cpp channel-scorer.h
65
+ @IRSTLM_TRUE@am__objects_1 = phraselm.$(OBJEXT)
66
+ @CHANNEL_SCORER_TRUE@am__objects_2 = channel-scorer.$(OBJEXT)
67
+ am_memscore_OBJECTS = phrasetable.$(OBJEXT) memscore.$(OBJEXT) \
68
+ scorer.$(OBJEXT) lexdecom.$(OBJEXT) $(am__objects_1) \
69
+ $(am__objects_2)
70
+ memscore_OBJECTS = $(am_memscore_OBJECTS)
71
+ memscore_DEPENDENCIES =
72
+ DEFAULT_INCLUDES = -I. -I$(srcdir) -I.
73
+ depcomp = $(SHELL) $(top_srcdir)/depcomp
74
+ am__depfiles_maybe = depfiles
75
+ CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
76
+ $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
77
+ CXXLD = $(CXX)
78
+ CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
79
+ -o $@
80
+ COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
81
+ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
82
+ CCLD = $(CC)
83
+ LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
84
+ SOURCES = $(memscore_SOURCES)
85
+ DIST_SOURCES = $(am__memscore_SOURCES_DIST)
86
+ ETAGS = etags
87
+ CTAGS = ctags
88
+ DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
89
+ distdir = $(PACKAGE)-$(VERSION)
90
+ top_distdir = $(distdir)
91
+ am__remove_distdir = \
92
+ { test ! -d $(distdir) \
93
+ || { find $(distdir) -type d ! -perm -200 -exec chmod u+w {} ';' \
94
+ && rm -fr $(distdir); }; }
95
+ DIST_ARCHIVES = $(distdir).tar.gz
96
+ GZIP_ENV = --best
97
+ distuninstallcheck_listfiles = find . -type f -print
98
+ distcleancheck_listfiles = find . -type f -print
99
+ ACLOCAL = @ACLOCAL@
100
+ AMDEP_FALSE = @AMDEP_FALSE@
101
+ AMDEP_TRUE = @AMDEP_TRUE@
102
+ AMTAR = @AMTAR@
103
+ AUTOCONF = @AUTOCONF@
104
+ AUTOHEADER = @AUTOHEADER@
105
+ AUTOMAKE = @AUTOMAKE@
106
+ AWK = @AWK@
107
+ BOOST_CPPFLAGS = @BOOST_CPPFLAGS@
108
+ BOOST_LDFLAGS = @BOOST_LDFLAGS@
109
+ CC = @CC@
110
+ CCDEPMODE = @CCDEPMODE@
111
+ CFLAGS = @CFLAGS@
112
+ CHANNEL_SCORER_FALSE = @CHANNEL_SCORER_FALSE@
113
+ CHANNEL_SCORER_TRUE = @CHANNEL_SCORER_TRUE@
114
+ CPPFLAGS = @CPPFLAGS@
115
+ CXX = @CXX@
116
+ CXXCPP = @CXXCPP@
117
+ CXXDEPMODE = @CXXDEPMODE@
118
+ CXXFLAGS = @CXXFLAGS@
119
+ CYGPATH_W = @CYGPATH_W@
120
+ DEFS = @DEFS@
121
+ DEPDIR = @DEPDIR@
122
+ ECHO_C = @ECHO_C@
123
+ ECHO_N = @ECHO_N@
124
+ ECHO_T = @ECHO_T@
125
+ EGREP = @EGREP@
126
+ EXEEXT = @EXEEXT@
127
+ GREP = @GREP@
128
+ INSTALL_DATA = @INSTALL_DATA@
129
+ INSTALL_PROGRAM = @INSTALL_PROGRAM@
130
+ INSTALL_SCRIPT = @INSTALL_SCRIPT@
131
+ INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
132
+ IRSTLM_FALSE = @IRSTLM_FALSE@
133
+ IRSTLM_TRUE = @IRSTLM_TRUE@
134
+ LDFLAGS = @LDFLAGS@
135
+ LIBOBJS = @LIBOBJS@
136
+ LIBS = @LIBS@
137
+ LTLIBOBJS = @LTLIBOBJS@
138
+ MAKEINFO = @MAKEINFO@
139
+ OBJEXT = @OBJEXT@
140
+ PACKAGE = @PACKAGE@
141
+ PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
142
+ PACKAGE_NAME = @PACKAGE_NAME@
143
+ PACKAGE_STRING = @PACKAGE_STRING@
144
+ PACKAGE_TARNAME = @PACKAGE_TARNAME@
145
+ PACKAGE_VERSION = @PACKAGE_VERSION@
146
+ PATH_SEPARATOR = @PATH_SEPARATOR@
147
+ SET_MAKE = @SET_MAKE@
148
+ SHELL = @SHELL@
149
+ STRIP = @STRIP@
150
+ VERSION = @VERSION@
151
+ ac_ct_CC = @ac_ct_CC@
152
+ ac_ct_CXX = @ac_ct_CXX@
153
+ am__fastdepCC_FALSE = @am__fastdepCC_FALSE@
154
+ am__fastdepCC_TRUE = @am__fastdepCC_TRUE@
155
+ am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@
156
+ am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@
157
+ am__include = @am__include@
158
+ am__leading_dot = @am__leading_dot@
159
+ am__quote = @am__quote@
160
+ am__tar = @am__tar@
161
+ am__untar = @am__untar@
162
+ bindir = @bindir@
163
+ build_alias = @build_alias@
164
+ datadir = @datadir@
165
+ datarootdir = @datarootdir@
166
+ docdir = @docdir@
167
+ dvidir = @dvidir@
168
+ exec_prefix = @exec_prefix@
169
+ host_alias = @host_alias@
170
+ htmldir = @htmldir@
171
+ includedir = @includedir@
172
+ infodir = @infodir@
173
+ install_sh = @install_sh@
174
+ libdir = @libdir@
175
+ libexecdir = @libexecdir@
176
+ localedir = @localedir@
177
+ localstatedir = @localstatedir@
178
+ mandir = @mandir@
179
+ mkdir_p = @mkdir_p@
180
+ oldincludedir = @oldincludedir@
181
+ pdfdir = @pdfdir@
182
+ prefix = @prefix@
183
+ program_transform_name = @program_transform_name@
184
+ psdir = @psdir@
185
+ sbindir = @sbindir@
186
+ sharedstatedir = @sharedstatedir@
187
+ sysconfdir = @sysconfdir@
188
+ target_alias = @target_alias@
189
+ ACLOCAL_AMFLAGS = -I m4
190
+ AUTOMAKE_OPTIONS = foreign
191
+ AM_CXXFLAGS = $(BOOST_CPPFLAGS) -Wall -ffast-math -ftrapping-math -fomit-frame-pointer
192
+ memscore_SOURCES = datastorage.h memscore.h phrasetable.h scorer.h \
193
+ scorer-impl.h statistic.h timestamp.h phrasetable.cpp \
194
+ memscore.cpp scorer.cpp lexdecom.cpp lexdecom.h \
195
+ $(am__append_1) $(am__append_2)
196
+ memscore_LDADD = $(IRSTLM_LIBS) $(GSL_LIBS)
197
+ all: config.h
198
+ $(MAKE) $(AM_MAKEFLAGS) all-am
199
+
200
+ .SUFFIXES:
201
+ .SUFFIXES: .cpp .o .obj
202
+ am--refresh:
203
+ @:
204
+ $(srcdir)/Makefile.in: $(srcdir)/Makefile.am $(am__configure_deps)
205
+ @for dep in $?; do \
206
+ case '$(am__configure_deps)' in \
207
+ *$$dep*) \
208
+ echo ' cd $(srcdir) && $(AUTOMAKE) --foreign '; \
209
+ cd $(srcdir) && $(AUTOMAKE) --foreign \
210
+ && exit 0; \
211
+ exit 1;; \
212
+ esac; \
213
+ done; \
214
+ echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign Makefile'; \
215
+ cd $(top_srcdir) && \
216
+ $(AUTOMAKE) --foreign Makefile
217
+ .PRECIOUS: Makefile
218
+ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
219
+ @case '$?' in \
220
+ *config.status*) \
221
+ echo ' $(SHELL) ./config.status'; \
222
+ $(SHELL) ./config.status;; \
223
+ *) \
224
+ echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \
225
+ cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \
226
+ esac;
227
+
228
+ $(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
229
+ $(SHELL) ./config.status --recheck
230
+
231
+ $(top_srcdir)/configure: $(am__configure_deps)
232
+ cd $(srcdir) && $(AUTOCONF)
233
+ $(ACLOCAL_M4): $(am__aclocal_m4_deps)
234
+ cd $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
235
+
236
+ config.h: stamp-h1
237
+ @if test ! -f $@; then \
238
+ rm -f stamp-h1; \
239
+ $(MAKE) stamp-h1; \
240
+ else :; fi
241
+
242
+ stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
243
+ @rm -f stamp-h1
244
+ cd $(top_builddir) && $(SHELL) ./config.status config.h
245
+ $(srcdir)/config.h.in: $(am__configure_deps)
246
+ cd $(top_srcdir) && $(AUTOHEADER)
247
+ rm -f stamp-h1
248
+ touch $@
249
+
250
+ distclean-hdr:
251
+ -rm -f config.h stamp-h1
252
+ install-binPROGRAMS: $(bin_PROGRAMS)
253
+ @$(NORMAL_INSTALL)
254
+ test -z "$(bindir)" || $(mkdir_p) "$(DESTDIR)$(bindir)"
255
+ @list='$(bin_PROGRAMS)'; for p in $$list; do \
256
+ p1=`echo $$p|sed 's/$(EXEEXT)$$//'`; \
257
+ if test -f $$p \
258
+ ; then \
259
+ f=`echo "$$p1" | sed 's,^.*/,,;$(transform);s/$$/$(EXEEXT)/'`; \
260
+ echo " $(INSTALL_PROGRAM_ENV) $(binPROGRAMS_INSTALL) '$$p' '$(DESTDIR)$(bindir)/$$f'"; \
261
+ $(INSTALL_PROGRAM_ENV) $(binPROGRAMS_INSTALL) "$$p" "$(DESTDIR)$(bindir)/$$f" || exit 1; \
262
+ else :; fi; \
263
+ done
264
+
265
+ uninstall-binPROGRAMS:
266
+ @$(NORMAL_UNINSTALL)
267
+ @list='$(bin_PROGRAMS)'; for p in $$list; do \
268
+ f=`echo "$$p" | sed 's,^.*/,,;s/$(EXEEXT)$$//;$(transform);s/$$/$(EXEEXT)/'`; \
269
+ echo " rm -f '$(DESTDIR)$(bindir)/$$f'"; \
270
+ rm -f "$(DESTDIR)$(bindir)/$$f"; \
271
+ done
272
+
273
+ clean-binPROGRAMS:
274
+ -test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS)
275
+ memscore$(EXEEXT): $(memscore_OBJECTS) $(memscore_DEPENDENCIES)
276
+ @rm -f memscore$(EXEEXT)
277
+ $(CXXLINK) $(memscore_LDFLAGS) $(memscore_OBJECTS) $(memscore_LDADD) $(LIBS)
278
+
279
+ mostlyclean-compile:
280
+ -rm -f *.$(OBJEXT)
281
+
282
+ distclean-compile:
283
+ -rm -f *.tab.c
284
+
285
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/channel-scorer.Po@am__quote@
286
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lexdecom.Po@am__quote@
287
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/memscore.Po@am__quote@
288
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phraselm.Po@am__quote@
289
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/phrasetable.Po@am__quote@
290
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/scorer.Po@am__quote@
291
+
292
+ .cpp.o:
293
+ @am__fastdepCXX_TRUE@ if $(CXXCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ $<; \
294
+ @am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi
295
+ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
296
+ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
297
+ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ $<
298
+
299
+ .cpp.obj:
300
+ @am__fastdepCXX_TRUE@ if $(CXXCOMPILE) -MT $@ -MD -MP -MF "$(DEPDIR)/$*.Tpo" -c -o $@ `$(CYGPATH_W) '$<'`; \
301
+ @am__fastdepCXX_TRUE@ then mv -f "$(DEPDIR)/$*.Tpo" "$(DEPDIR)/$*.Po"; else rm -f "$(DEPDIR)/$*.Tpo"; exit 1; fi
302
+ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
303
+ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@
304
+ @am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
305
+ uninstall-info-am:
306
+
307
+ ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
308
+ list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
309
+ unique=`for i in $$list; do \
310
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
311
+ done | \
312
+ $(AWK) ' { files[$$0] = 1; } \
313
+ END { for (i in files) print i; }'`; \
314
+ mkid -fID $$unique
315
+ tags: TAGS
316
+
317
+ TAGS: $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
318
+ $(TAGS_FILES) $(LISP)
319
+ tags=; \
320
+ here=`pwd`; \
321
+ list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
322
+ unique=`for i in $$list; do \
323
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
324
+ done | \
325
+ $(AWK) ' { files[$$0] = 1; } \
326
+ END { for (i in files) print i; }'`; \
327
+ if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \
328
+ test -n "$$unique" || unique=$$empty_fix; \
329
+ $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
330
+ $$tags $$unique; \
331
+ fi
332
+ ctags: CTAGS
333
+ CTAGS: $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
334
+ $(TAGS_FILES) $(LISP)
335
+ tags=; \
336
+ here=`pwd`; \
337
+ list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
338
+ unique=`for i in $$list; do \
339
+ if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
340
+ done | \
341
+ $(AWK) ' { files[$$0] = 1; } \
342
+ END { for (i in files) print i; }'`; \
343
+ test -z "$(CTAGS_ARGS)$$tags$$unique" \
344
+ || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
345
+ $$tags $$unique
346
+
347
+ GTAGS:
348
+ here=`$(am__cd) $(top_builddir) && pwd` \
349
+ && cd $(top_srcdir) \
350
+ && gtags -i $(GTAGS_ARGS) $$here
351
+
352
+ distclean-tags:
353
+ -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
354
+
355
+ distdir: $(DISTFILES)
356
+ $(am__remove_distdir)
357
+ mkdir $(distdir)
358
+ $(mkdir_p) $(distdir)/m4
359
+ @srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \
360
+ topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \
361
+ list='$(DISTFILES)'; for file in $$list; do \
362
+ case $$file in \
363
+ $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \
364
+ $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \
365
+ esac; \
366
+ if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
367
+ dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \
368
+ if test "$$dir" != "$$file" && test "$$dir" != "."; then \
369
+ dir="/$$dir"; \
370
+ $(mkdir_p) "$(distdir)$$dir"; \
371
+ else \
372
+ dir=''; \
373
+ fi; \
374
+ if test -d $$d/$$file; then \
375
+ if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
376
+ cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
377
+ fi; \
378
+ cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
379
+ else \
380
+ test -f $(distdir)/$$file \
381
+ || cp -p $$d/$$file $(distdir)/$$file \
382
+ || exit 1; \
383
+ fi; \
384
+ done
385
+ -find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \
386
+ ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
387
+ ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
388
+ ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \
389
+ || chmod -R a+r $(distdir)
390
+ dist-gzip: distdir
391
+ tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
392
+ $(am__remove_distdir)
393
+
394
+ dist-bzip2: distdir
395
+ tardir=$(distdir) && $(am__tar) | bzip2 -9 -c >$(distdir).tar.bz2
396
+ $(am__remove_distdir)
397
+
398
+ dist-tarZ: distdir
399
+ tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
400
+ $(am__remove_distdir)
401
+
402
+ dist-shar: distdir
403
+ shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
404
+ $(am__remove_distdir)
405
+
406
+ dist-zip: distdir
407
+ -rm -f $(distdir).zip
408
+ zip -rq $(distdir).zip $(distdir)
409
+ $(am__remove_distdir)
410
+
411
+ dist dist-all: distdir
412
+ tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
413
+ $(am__remove_distdir)
414
+
415
+ # This target untars the dist file and tries a VPATH configuration. Then
416
+ # it guarantees that the distribution is self-contained by making another
417
+ # tarfile.
418
+ distcheck: dist
419
+ case '$(DIST_ARCHIVES)' in \
420
+ *.tar.gz*) \
421
+ GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(am__untar) ;;\
422
+ *.tar.bz2*) \
423
+ bunzip2 -c $(distdir).tar.bz2 | $(am__untar) ;;\
424
+ *.tar.Z*) \
425
+ uncompress -c $(distdir).tar.Z | $(am__untar) ;;\
426
+ *.shar.gz*) \
427
+ GZIP=$(GZIP_ENV) gunzip -c $(distdir).shar.gz | unshar ;;\
428
+ *.zip*) \
429
+ unzip $(distdir).zip ;;\
430
+ esac
431
+ chmod -R a-w $(distdir); chmod a+w $(distdir)
432
+ mkdir $(distdir)/_build
433
+ mkdir $(distdir)/_inst
434
+ chmod a-w $(distdir)
435
+ dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
436
+ && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
437
+ && cd $(distdir)/_build \
438
+ && ../configure --srcdir=.. --prefix="$$dc_install_base" \
439
+ $(DISTCHECK_CONFIGURE_FLAGS) \
440
+ && $(MAKE) $(AM_MAKEFLAGS) \
441
+ && $(MAKE) $(AM_MAKEFLAGS) dvi \
442
+ && $(MAKE) $(AM_MAKEFLAGS) check \
443
+ && $(MAKE) $(AM_MAKEFLAGS) install \
444
+ && $(MAKE) $(AM_MAKEFLAGS) installcheck \
445
+ && $(MAKE) $(AM_MAKEFLAGS) uninstall \
446
+ && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \
447
+ distuninstallcheck \
448
+ && chmod -R a-w "$$dc_install_base" \
449
+ && ({ \
450
+ (cd ../.. && umask 077 && mkdir "$$dc_destdir") \
451
+ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \
452
+ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \
453
+ && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \
454
+ distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \
455
+ } || { rm -rf "$$dc_destdir"; exit 1; }) \
456
+ && rm -rf "$$dc_destdir" \
457
+ && $(MAKE) $(AM_MAKEFLAGS) dist \
458
+ && rm -rf $(DIST_ARCHIVES) \
459
+ && $(MAKE) $(AM_MAKEFLAGS) distcleancheck
460
+ $(am__remove_distdir)
461
+ @(echo "$(distdir) archives ready for distribution: "; \
462
+ list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \
463
+ sed -e '1{h;s/./=/g;p;x;}' -e '$${p;x;}'
464
+ distuninstallcheck:
465
+ @cd $(distuninstallcheck_dir) \
466
+ && test `$(distuninstallcheck_listfiles) | wc -l` -le 1 \
467
+ || { echo "ERROR: files left after uninstall:" ; \
468
+ if test -n "$(DESTDIR)"; then \
469
+ echo " (check DESTDIR support)"; \
470
+ fi ; \
471
+ $(distuninstallcheck_listfiles) ; \
472
+ exit 1; } >&2
473
+ distcleancheck: distclean
474
+ @if test '$(srcdir)' = . ; then \
475
+ echo "ERROR: distcleancheck can only run from a VPATH build" ; \
476
+ exit 1 ; \
477
+ fi
478
+ @test `$(distcleancheck_listfiles) | wc -l` -eq 0 \
479
+ || { echo "ERROR: files left in build directory after distclean:" ; \
480
+ $(distcleancheck_listfiles) ; \
481
+ exit 1; } >&2
482
+ check-am: all-am
483
+ check: check-am
484
+ all-am: Makefile $(PROGRAMS) config.h
485
+ installdirs:
486
+ for dir in "$(DESTDIR)$(bindir)"; do \
487
+ test -z "$$dir" || $(mkdir_p) "$$dir"; \
488
+ done
489
+ install: install-am
490
+ install-exec: install-exec-am
491
+ install-data: install-data-am
492
+ uninstall: uninstall-am
493
+
494
+ install-am: all-am
495
+ @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
496
+
497
+ installcheck: installcheck-am
498
+ install-strip:
499
+ $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
500
+ install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
501
+ `test -z '$(STRIP)' || \
502
+ echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
503
+ mostlyclean-generic:
504
+
505
+ clean-generic:
506
+
507
+ distclean-generic:
508
+ -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
509
+
510
+ maintainer-clean-generic:
511
+ @echo "This command is intended for maintainers to use"
512
+ @echo "it deletes files that may require special tools to rebuild."
513
+ clean: clean-am
514
+
515
+ clean-am: clean-binPROGRAMS clean-generic mostlyclean-am
516
+
517
+ distclean: distclean-am
518
+ -rm -f $(am__CONFIG_DISTCLEAN_FILES)
519
+ -rm -rf ./$(DEPDIR)
520
+ -rm -f Makefile
521
+ distclean-am: clean-am distclean-compile distclean-generic \
522
+ distclean-hdr distclean-tags
523
+
524
+ dvi: dvi-am
525
+
526
+ dvi-am:
527
+
528
+ html: html-am
529
+
530
+ info: info-am
531
+
532
+ info-am:
533
+
534
+ install-data-am:
535
+
536
+ install-exec-am: install-binPROGRAMS
537
+
538
+ install-info: install-info-am
539
+
540
+ install-man:
541
+
542
+ installcheck-am:
543
+
544
+ maintainer-clean: maintainer-clean-am
545
+ -rm -f $(am__CONFIG_DISTCLEAN_FILES)
546
+ -rm -rf $(top_srcdir)/autom4te.cache
547
+ -rm -rf ./$(DEPDIR)
548
+ -rm -f Makefile
549
+ maintainer-clean-am: distclean-am maintainer-clean-generic
550
+
551
+ mostlyclean: mostlyclean-am
552
+
553
+ mostlyclean-am: mostlyclean-compile mostlyclean-generic
554
+
555
+ pdf: pdf-am
556
+
557
+ pdf-am:
558
+
559
+ ps: ps-am
560
+
561
+ ps-am:
562
+
563
+ uninstall-am: uninstall-binPROGRAMS uninstall-info-am
564
+
565
+ .PHONY: CTAGS GTAGS all all-am am--refresh check check-am clean \
566
+ clean-binPROGRAMS clean-generic ctags dist dist-all dist-bzip2 \
567
+ dist-gzip dist-shar dist-tarZ dist-zip distcheck distclean \
568
+ distclean-compile distclean-generic distclean-hdr \
569
+ distclean-tags distcleancheck distdir distuninstallcheck dvi \
570
+ dvi-am html html-am info info-am install install-am \
571
+ install-binPROGRAMS install-data install-data-am install-exec \
572
+ install-exec-am install-info install-info-am install-man \
573
+ install-strip installcheck installcheck-am installdirs \
574
+ maintainer-clean maintainer-clean-generic mostlyclean \
575
+ mostlyclean-compile mostlyclean-generic pdf pdf-am ps ps-am \
576
+ tags uninstall uninstall-am uninstall-binPROGRAMS \
577
+ uninstall-info-am
578
+
579
+ # Tell versions [3.59,3.63) of GNU make to not export all variables.
580
+ # Otherwise a system limit (for SysV at least) may be exceeded.
581
+ .NOEXPORT:
mosesdecoder/contrib/memscore/configure.ac ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ # Christian Hardmeier, FBK-irst, Trento, 2010
3
+ # $Id$
4
+
5
+ # Process this file with autoconf to produce a configure script.
6
+
7
+ AC_INIT([memscore], [1.0], [hardmeier at fbk.eu])
8
+ AM_INIT_AUTOMAKE
9
+ AC_LANG([C++])
10
+
11
+ AC_ARG_WITH(irstlm,
12
+ [AC_HELP_STRING([--with-irstlm=PATH], [(optional) path to the IRSTLM toolkit])],
13
+ [with_irstlm=$withval],
14
+ [with_irstlm=check])
15
+
16
+ AC_ARG_WITH([gsl],
17
+ [AC_HELP_STRING([--with-gsl=PATH], [path to the GSL library])],
18
+ [with_gsl=$withval
19
+ CPPFLAGS="$CPPFLAGS -I$with_gsl/include"
20
+ LDFLAGS="$LDFLAGS -L$with_gsl/lib"],
21
+ [with_gsl=check])
22
+
23
+ AC_ARG_ENABLE([channel],
24
+ [AC_HELP_STRING([--enable-channel], [feature not yet publicly available])],
25
+ [AC_DEFINE(ENABLE_CHANNEL_SCORER, [], [Define to enable channel scorer])],
26
+ [enable_channel=no])
27
+
28
+ AC_PREREQ([2.63])
29
+ AC_CONFIG_SRCDIR([memscore.cpp])
30
+ AC_CONFIG_HEADERS([config.h])
31
+
32
+ # Checks for programs.
33
+ AC_PROG_CXX
34
+ AC_PROG_CC
35
+
36
+ # Checks for libraries.
37
+ AX_BOOST_BASE([1.35.0])
38
+
39
+ AC_CHECK_LIB([m], [cos])
40
+ AC_CHECK_LIB([z], [gzopen])
41
+
42
+ have_gsl=yes
43
+ AC_CHECK_LIB([gslcblas],[cblas_dgemm], [], [have_gsl=no])
44
+ AC_CHECK_LIB([gsl],[gsl_blas_dgemm], [], [have_gsl=no])
45
+
46
+ AS_IF([test x$with_irstlm = xcheck],
47
+ [AC_CHECK_HEADER([n_gram.h],
48
+ [AC_DEFINE([HAVE_IRSTLM], [], [flag for IRSTLM])],
49
+ [with_irstlm=no])]
50
+ ,
51
+ [SAVE_CPPFLAGS="$CPPFLAGS"
52
+ CPPFLAGS="$CPPFLAGS -I${with_irstlm}/include"
53
+
54
+ AC_CHECK_HEADER(n_gram.h,
55
+ [AC_DEFINE([HAVE_IRSTLM], [], [flag for IRSTLM])],
56
+ [AC_MSG_ERROR([Cannot find IRSTLM!])])
57
+
58
+ MY_ARCH=`uname -m`
59
+ LIB_IRSTLM="-lirstlm"
60
+ LDFLAGS="$LDFLAGS -L${with_irstlm}/lib/${MY_ARCH}"
61
+ LIBS="$LIBS $LIB_IRSTLM"
62
+ FMTLIBS="$FMTLIBS libirstlm.a"]
63
+ )
64
+ AM_CONDITIONAL([IRSTLM], [test x$with_irstlm != xno])
65
+
66
+ AS_IF([test x$enable_channel = xyes],
67
+ [AS_IF([test x$with_irstlm = xno || test x$have_gsl = xno],
68
+ [AC_MSG_ERROR([The channel scorer needs both GSL and irstlm.])])])
69
+
70
+ # Checks for header files.
71
+ #AC_CHECK_HEADERS([fenv.h sys/time.h])
72
+
73
+ # Checks for typedefs, structures, and compiler characteristics.
74
+ AC_TYPE_SIZE_T
75
+ AC_CHECK_TYPES([ptrdiff_t])
76
+
77
+ # Checks for library functions.
78
+ #AC_FUNC_MALLOC
79
+ #AC_CHECK_FUNCS([getpagesize gettimeofday])
80
+
81
+ AM_CONDITIONAL(CHANNEL_SCORER, test x$enable_channel = xyes)
82
+
83
+ AC_CONFIG_FILES([Makefile])
84
+ AC_OUTPUT
mosesdecoder/contrib/memscore/lexdecom.h ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * File: lexdecom.h
3
+ * Author: Felipe Sánchez-Martínez, Universitat d'Alacant <fsanchez@dlsi.ua.es>
4
+ *
5
+ * Created on 2010/01/27
6
+ */
7
+
8
+ #ifndef _LEXDECOM_H
9
+ #define _LEXDECOM_H
10
+
11
+ #include "phrasetable.h"
12
+ #include "scorer.h"
13
+
14
+ class LexicalDecompositionPhraseScorer : public PhraseScorer
15
+ {
16
+ private:
17
+ explicit LexicalDecompositionPhraseScorer(PhraseTable &pd, bool reverse, const String &lwfile,
18
+ const char *argv[], int &argp, const PhraseScorerFactory &ptf);
19
+
20
+ virtual void do_score_phrases();
21
+ virtual Score do_get_score(const PhraseTable::const_iterator &it);
22
+
23
+ Score get_weight(const String &s_src, const String &s_tgt) const;
24
+ Score get_weight(Count src, Count tgt) const;
25
+
26
+ typedef std::map<std::pair<Count,Count>, Score> WeightMapType_;
27
+
28
+ WeightMapType_ weight_map_;
29
+
30
+ // p(J|I) = probability of source-length J given target-length I
31
+ std::map<unsigned, std::map<unsigned, Score> > prob_srclen_tgtlen_;
32
+
33
+ Score get_noisy_or_combination(Count src_word, PhraseInfo &tgt_phrase);
34
+
35
+ PhraseScorer* black_box_scorer;
36
+
37
+ public:
38
+ static PhraseScorer *create_scorer(const char *argv[], int &argp, bool reverse, const PhraseScorerFactory &ptf);
39
+ };
40
+
41
+ #endif /* _LEXDECOM_H */
mosesdecoder/contrib/memscore/memscore.cpp ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #include <iostream>
6
+ #include <vector>
7
+
8
+ #include "phrasetable.h"
9
+ #include "scorer.h"
10
+
11
+ const char *progname;
12
+
13
+ typedef PhrasePairInfo::AlignmentVector::value_type VP;
14
+
15
+ bool cmp_counts(const VP &a1, const VP &a2);
16
+ int main(int argc, const char *argv[]);
17
+
18
+ bool cmp_counts(const VP &a1, const VP &a2)
19
+ {
20
+ return a1.second < a2.second;
21
+ }
22
+
23
+ int main(int argc, const char *argv[])
24
+ {
25
+ progname = argv[0];
26
+
27
+ if(argc == 1) {
28
+ std::cerr << "No scorers specified." << std::endl;
29
+ usage();
30
+ }
31
+
32
+ MemoryPhraseTable pt;
33
+ PhraseScorerFactory psf(pt);
34
+
35
+ typedef std::vector<PhraseScorer *> ScorerList;
36
+ ScorerList scorers;
37
+
38
+ for(int argp = 1; argp < argc; ) {
39
+ bool reverse;
40
+ if(!strcmp(argv[argp], "-s"))
41
+ reverse = false;
42
+ else if(!strcmp(argv[argp], "-r"))
43
+ reverse = true;
44
+ else
45
+ usage();
46
+
47
+ scorers.push_back(psf.create_scorer(argv, ++argp, reverse));
48
+ }
49
+
50
+ pt.load_data(std::cin);
51
+ pt.compute_phrase_statistics();
52
+
53
+ for(ScorerList::iterator s = scorers.begin(); s != scorers.end(); ++s)
54
+ (*s)->score_phrases();
55
+
56
+ for(PhrasePairCounts::const_iterator it = pt.raw_begin(); it != pt.raw_end(); ++it) {
57
+ PhrasePairInfo ppi(it);
58
+ Phrase src = ppi.get_src();
59
+ Phrase tgt = ppi.get_tgt();
60
+ const PhrasePairInfo::AlignmentVector av = ppi.get_alignments();
61
+
62
+ PhraseAlignment alig = std::max_element(av.begin(), av.end(), cmp_counts)->first;
63
+
64
+ std::cout << pt.get_src_phrase(src) << " ||| " << pt.get_tgt_phrase(tgt) << " ||| " << alig << " |||";
65
+
66
+ for(ScorerList::iterator s = scorers.begin(); s != scorers.end(); ++s)
67
+ std::cout << ' ' << (*s)->get_score(it);
68
+ std::cout << '\n'; // don't use std::endl to avoid flushing
69
+ }
70
+ }
71
+
72
+ void usage()
73
+ {
74
+ std::cerr << "Usage: " << progname << " <scorer1> <scorer2> ..." << std::endl <<
75
+ " where each scorer is specified as" << std::endl <<
76
+ " -s <scorer> <args> to estimate p(s|t)" << std::endl <<
77
+ " -r <scorer> <args> to estimate p(t|s)" << std::endl << std::endl;
78
+
79
+ std::cerr << "Implemented scorers:" << std::endl;
80
+
81
+ const std::vector<String> &v = PhraseScorerFactory::scorer_list();
82
+ std::copy(v.begin(), v.end(), std::ostream_iterator<std::string>(std::cerr, "\n"));
83
+
84
+ exit(1);
85
+ }
mosesdecoder/contrib/memscore/memscore.h ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #ifndef MEMSCORE_H
6
+ #define MEMSCORE_H
7
+
8
+ #include <sstream>
9
+ #include <string>
10
+ #include <utility>
11
+
12
+ #include "config.h"
13
+
14
+ #ifndef HAVE_PTRDIFF_T
15
+ typedef long ptrdiff_t;
16
+ #endif
17
+
18
+ #ifdef __GNUC__
19
+ #define NORETURN __attribute__ ((noreturn))
20
+ #else
21
+ #define NORETURN
22
+ #endif
23
+
24
+ void usage() NORETURN;
25
+
26
+ typedef double Score;
27
+ typedef unsigned int Count;
28
+ typedef unsigned int Phrase;
29
+ typedef ptrdiff_t DataIndex;
30
+ typedef std::pair<Phrase,Phrase> PhrasePair;
31
+ typedef char *PhrasePairData;
32
+ typedef std::string String;
33
+ typedef std::istringstream IStringStream;
34
+
35
+ /* phrasetable.h */
36
+
37
+ class PhraseText;
38
+ class PhraseInfo;
39
+ class PhraseInfoList;
40
+ class PhraseAlignment;
41
+ class PhrasePairInfo;
42
+ class PhraseTable;
43
+
44
+ /* scorer.h */
45
+
46
+ class PhraseScorer;
47
+
48
+ /* statistic.h */
49
+
50
+ class PhraseStatistic;
51
+
52
+ /* IRSTLM */
53
+
54
+ class lmtable;
55
+ class ngram;
56
+
57
+ #endif
mosesdecoder/contrib/memscore/missing ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #! /bin/sh
2
+ # Common stub for a few missing GNU programs while installing.
3
+
4
+ scriptversion=2005-06-08.21
5
+
6
+ # Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005
7
+ # Free Software Foundation, Inc.
8
+ # Originally by Fran,cois Pinard <pinard@iro.umontreal.ca>, 1996.
9
+
10
+ # This program is free software; you can redistribute it and/or modify
11
+ # it under the terms of the GNU General Public License as published by
12
+ # the Free Software Foundation; either version 2, or (at your option)
13
+ # any later version.
14
+
15
+ # This program is distributed in the hope that it will be useful,
16
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
17
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18
+ # GNU General Public License for more details.
19
+
20
+ # You should have received a copy of the GNU General Public License
21
+ # along with this program; if not, write to the Free Software
22
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23
+ # 02110-1301, USA.
24
+
25
+ # As a special exception to the GNU General Public License, if you
26
+ # distribute this file as part of a program that contains a
27
+ # configuration script generated by Autoconf, you may include it under
28
+ # the same distribution terms that you use for the rest of that program.
29
+
30
+ if test $# -eq 0; then
31
+ echo 1>&2 "Try \`$0 --help' for more information"
32
+ exit 1
33
+ fi
34
+
35
+ run=:
36
+
37
+ # In the cases where this matters, `missing' is being run in the
38
+ # srcdir already.
39
+ if test -f configure.ac; then
40
+ configure_ac=configure.ac
41
+ else
42
+ configure_ac=configure.in
43
+ fi
44
+
45
+ msg="missing on your system"
46
+
47
+ case "$1" in
48
+ --run)
49
+ # Try to run requested program, and just exit if it succeeds.
50
+ run=
51
+ shift
52
+ "$@" && exit 0
53
+ # Exit code 63 means version mismatch. This often happens
54
+ # when the user try to use an ancient version of a tool on
55
+ # a file that requires a minimum version. In this case we
56
+ # we should proceed has if the program had been absent, or
57
+ # if --run hadn't been passed.
58
+ if test $? = 63; then
59
+ run=:
60
+ msg="probably too old"
61
+ fi
62
+ ;;
63
+
64
+ -h|--h|--he|--hel|--help)
65
+ echo "\
66
+ $0 [OPTION]... PROGRAM [ARGUMENT]...
67
+
68
+ Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an
69
+ error status if there is no known handling for PROGRAM.
70
+
71
+ Options:
72
+ -h, --help display this help and exit
73
+ -v, --version output version information and exit
74
+ --run try to run the given command, and emulate it if it fails
75
+
76
+ Supported PROGRAM values:
77
+ aclocal touch file \`aclocal.m4'
78
+ autoconf touch file \`configure'
79
+ autoheader touch file \`config.h.in'
80
+ automake touch all \`Makefile.in' files
81
+ bison create \`y.tab.[ch]', if possible, from existing .[ch]
82
+ flex create \`lex.yy.c', if possible, from existing .c
83
+ help2man touch the output file
84
+ lex create \`lex.yy.c', if possible, from existing .c
85
+ makeinfo touch the output file
86
+ tar try tar, gnutar, gtar, then tar without non-portable flags
87
+ yacc create \`y.tab.[ch]', if possible, from existing .[ch]
88
+
89
+ Send bug reports to <bug-automake@gnu.org>."
90
+ exit $?
91
+ ;;
92
+
93
+ -v|--v|--ve|--ver|--vers|--versi|--versio|--version)
94
+ echo "missing $scriptversion (GNU Automake)"
95
+ exit $?
96
+ ;;
97
+
98
+ -*)
99
+ echo 1>&2 "$0: Unknown \`$1' option"
100
+ echo 1>&2 "Try \`$0 --help' for more information"
101
+ exit 1
102
+ ;;
103
+
104
+ esac
105
+
106
+ # Now exit if we have it, but it failed. Also exit now if we
107
+ # don't have it and --version was passed (most likely to detect
108
+ # the program).
109
+ case "$1" in
110
+ lex|yacc)
111
+ # Not GNU programs, they don't have --version.
112
+ ;;
113
+
114
+ tar)
115
+ if test -n "$run"; then
116
+ echo 1>&2 "ERROR: \`tar' requires --run"
117
+ exit 1
118
+ elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
119
+ exit 1
120
+ fi
121
+ ;;
122
+
123
+ *)
124
+ if test -z "$run" && ($1 --version) > /dev/null 2>&1; then
125
+ # We have it, but it failed.
126
+ exit 1
127
+ elif test "x$2" = "x--version" || test "x$2" = "x--help"; then
128
+ # Could not run --version or --help. This is probably someone
129
+ # running `$TOOL --version' or `$TOOL --help' to check whether
130
+ # $TOOL exists and not knowing $TOOL uses missing.
131
+ exit 1
132
+ fi
133
+ ;;
134
+ esac
135
+
136
+ # If it does not exist, or fails to run (possibly an outdated version),
137
+ # try to emulate it.
138
+ case "$1" in
139
+ aclocal*)
140
+ echo 1>&2 "\
141
+ WARNING: \`$1' is $msg. You should only need it if
142
+ you modified \`acinclude.m4' or \`${configure_ac}'. You might want
143
+ to install the \`Automake' and \`Perl' packages. Grab them from
144
+ any GNU archive site."
145
+ touch aclocal.m4
146
+ ;;
147
+
148
+ autoconf)
149
+ echo 1>&2 "\
150
+ WARNING: \`$1' is $msg. You should only need it if
151
+ you modified \`${configure_ac}'. You might want to install the
152
+ \`Autoconf' and \`GNU m4' packages. Grab them from any GNU
153
+ archive site."
154
+ touch configure
155
+ ;;
156
+
157
+ autoheader)
158
+ echo 1>&2 "\
159
+ WARNING: \`$1' is $msg. You should only need it if
160
+ you modified \`acconfig.h' or \`${configure_ac}'. You might want
161
+ to install the \`Autoconf' and \`GNU m4' packages. Grab them
162
+ from any GNU archive site."
163
+ files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}`
164
+ test -z "$files" && files="config.h"
165
+ touch_files=
166
+ for f in $files; do
167
+ case "$f" in
168
+ *:*) touch_files="$touch_files "`echo "$f" |
169
+ sed -e 's/^[^:]*://' -e 's/:.*//'`;;
170
+ *) touch_files="$touch_files $f.in";;
171
+ esac
172
+ done
173
+ touch $touch_files
174
+ ;;
175
+
176
+ automake*)
177
+ echo 1>&2 "\
178
+ WARNING: \`$1' is $msg. You should only need it if
179
+ you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'.
180
+ You might want to install the \`Automake' and \`Perl' packages.
181
+ Grab them from any GNU archive site."
182
+ find . -type f -name Makefile.am -print |
183
+ sed 's/\.am$/.in/' |
184
+ while read f; do touch "$f"; done
185
+ ;;
186
+
187
+ autom4te)
188
+ echo 1>&2 "\
189
+ WARNING: \`$1' is needed, but is $msg.
190
+ You might have modified some files without having the
191
+ proper tools for further handling them.
192
+ You can get \`$1' as part of \`Autoconf' from any GNU
193
+ archive site."
194
+
195
+ file=`echo "$*" | sed -n 's/.*--output[ =]*\([^ ]*\).*/\1/p'`
196
+ test -z "$file" && file=`echo "$*" | sed -n 's/.*-o[ ]*\([^ ]*\).*/\1/p'`
197
+ if test -f "$file"; then
198
+ touch $file
199
+ else
200
+ test -z "$file" || exec >$file
201
+ echo "#! /bin/sh"
202
+ echo "# Created by GNU Automake missing as a replacement of"
203
+ echo "# $ $@"
204
+ echo "exit 0"
205
+ chmod +x $file
206
+ exit 1
207
+ fi
208
+ ;;
209
+
210
+ bison|yacc)
211
+ echo 1>&2 "\
212
+ WARNING: \`$1' $msg. You should only need it if
213
+ you modified a \`.y' file. You may need the \`Bison' package
214
+ in order for those modifications to take effect. You can get
215
+ \`Bison' from any GNU archive site."
216
+ rm -f y.tab.c y.tab.h
217
+ if [ $# -ne 1 ]; then
218
+ eval LASTARG="\${$#}"
219
+ case "$LASTARG" in
220
+ *.y)
221
+ SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'`
222
+ if [ -f "$SRCFILE" ]; then
223
+ cp "$SRCFILE" y.tab.c
224
+ fi
225
+ SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'`
226
+ if [ -f "$SRCFILE" ]; then
227
+ cp "$SRCFILE" y.tab.h
228
+ fi
229
+ ;;
230
+ esac
231
+ fi
232
+ if [ ! -f y.tab.h ]; then
233
+ echo >y.tab.h
234
+ fi
235
+ if [ ! -f y.tab.c ]; then
236
+ echo 'main() { return 0; }' >y.tab.c
237
+ fi
238
+ ;;
239
+
240
+ lex|flex)
241
+ echo 1>&2 "\
242
+ WARNING: \`$1' is $msg. You should only need it if
243
+ you modified a \`.l' file. You may need the \`Flex' package
244
+ in order for those modifications to take effect. You can get
245
+ \`Flex' from any GNU archive site."
246
+ rm -f lex.yy.c
247
+ if [ $# -ne 1 ]; then
248
+ eval LASTARG="\${$#}"
249
+ case "$LASTARG" in
250
+ *.l)
251
+ SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'`
252
+ if [ -f "$SRCFILE" ]; then
253
+ cp "$SRCFILE" lex.yy.c
254
+ fi
255
+ ;;
256
+ esac
257
+ fi
258
+ if [ ! -f lex.yy.c ]; then
259
+ echo 'main() { return 0; }' >lex.yy.c
260
+ fi
261
+ ;;
262
+
263
+ help2man)
264
+ echo 1>&2 "\
265
+ WARNING: \`$1' is $msg. You should only need it if
266
+ you modified a dependency of a manual page. You may need the
267
+ \`Help2man' package in order for those modifications to take
268
+ effect. You can get \`Help2man' from any GNU archive site."
269
+
270
+ file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'`
271
+ if test -z "$file"; then
272
+ file=`echo "$*" | sed -n 's/.*--output=\([^ ]*\).*/\1/p'`
273
+ fi
274
+ if [ -f "$file" ]; then
275
+ touch $file
276
+ else
277
+ test -z "$file" || exec >$file
278
+ echo ".ab help2man is required to generate this page"
279
+ exit 1
280
+ fi
281
+ ;;
282
+
283
+ makeinfo)
284
+ echo 1>&2 "\
285
+ WARNING: \`$1' is $msg. You should only need it if
286
+ you modified a \`.texi' or \`.texinfo' file, or any other file
287
+ indirectly affecting the aspect of the manual. The spurious
288
+ call might also be the consequence of using a buggy \`make' (AIX,
289
+ DU, IRIX). You might want to install the \`Texinfo' package or
290
+ the \`GNU make' package. Grab either from any GNU archive site."
291
+ # The file to touch is that specified with -o ...
292
+ file=`echo "$*" | sed -n 's/.*-o \([^ ]*\).*/\1/p'`
293
+ if test -z "$file"; then
294
+ # ... or it is the one specified with @setfilename ...
295
+ infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'`
296
+ file=`sed -n '/^@setfilename/ { s/.* \([^ ]*\) *$/\1/; p; q; }' $infile`
297
+ # ... or it is derived from the source name (dir/f.texi becomes f.info)
298
+ test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info
299
+ fi
300
+ # If the file does not exist, the user really needs makeinfo;
301
+ # let's fail without touching anything.
302
+ test -f $file || exit 1
303
+ touch $file
304
+ ;;
305
+
306
+ tar)
307
+ shift
308
+
309
+ # We have already tried tar in the generic part.
310
+ # Look for gnutar/gtar before invocation to avoid ugly error
311
+ # messages.
312
+ if (gnutar --version > /dev/null 2>&1); then
313
+ gnutar "$@" && exit 0
314
+ fi
315
+ if (gtar --version > /dev/null 2>&1); then
316
+ gtar "$@" && exit 0
317
+ fi
318
+ firstarg="$1"
319
+ if shift; then
320
+ case "$firstarg" in
321
+ *o*)
322
+ firstarg=`echo "$firstarg" | sed s/o//`
323
+ tar "$firstarg" "$@" && exit 0
324
+ ;;
325
+ esac
326
+ case "$firstarg" in
327
+ *h*)
328
+ firstarg=`echo "$firstarg" | sed s/h//`
329
+ tar "$firstarg" "$@" && exit 0
330
+ ;;
331
+ esac
332
+ fi
333
+
334
+ echo 1>&2 "\
335
+ WARNING: I can't seem to be able to run \`tar' with the given arguments.
336
+ You may want to install GNU tar or Free paxutils, or check the
337
+ command line arguments."
338
+ exit 1
339
+ ;;
340
+
341
+ *)
342
+ echo 1>&2 "\
343
+ WARNING: \`$1' is needed, and is $msg.
344
+ You might have modified some files without having the
345
+ proper tools for further handling them. Check the \`README' file,
346
+ it often tells you about the needed prerequisites for installing
347
+ this package. You may also peek at any GNU archive site, in case
348
+ some other package would contain this missing \`$1' program."
349
+ exit 1
350
+ ;;
351
+ esac
352
+
353
+ exit 0
354
+
355
+ # Local variables:
356
+ # eval: (add-hook 'write-file-hooks 'time-stamp)
357
+ # time-stamp-start: "scriptversion="
358
+ # time-stamp-format: "%:y-%02m-%02d.%02H"
359
+ # time-stamp-end: "$"
360
+ # End:
mosesdecoder/contrib/memscore/phraselm.h ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #ifndef PHRASELM_H
6
+ #define PHRASELM_H
7
+
8
+ #include <cassert>
9
+
10
+ #include "memscore.h"
11
+ #include "phrasetable.h"
12
+ #include "statistic.h"
13
+
14
+ class lmtable;
15
+
16
+ class PhraseLanguageModel : public PhraseStatistic
17
+ {
18
+ protected:
19
+ String lmfile_;
20
+ Count score_idx_;
21
+
22
+ PhraseInfoList *phrase_info_list_;
23
+
24
+ void compute_lmscores(PhraseInfoList &phrase_info_list, bool closed_world);
25
+
26
+ public:
27
+ PhraseLanguageModel(String lmfile) : lmfile_(lmfile) {}
28
+
29
+ virtual void attach(PhraseInfoList &pilist);
30
+ virtual void compute_statistic();
31
+
32
+ virtual Score get_score(PhraseInfo &pi) {
33
+ assert(computation_done_);
34
+ return pi.data(score_idx_);
35
+ }
36
+ };
37
+
38
+ class ClosedPhraseLanguageModel : public PhraseLanguageModel
39
+ {
40
+ public:
41
+ ClosedPhraseLanguageModel(String lmfile) : PhraseLanguageModel(lmfile) {}
42
+ virtual void compute_statistic();
43
+ };
44
+
45
+ #endif
mosesdecoder/contrib/memscore/phrasetable.cpp ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #include "phrasetable.h"
6
+ #include "statistic.h"
7
+ #include "timestamp.h"
8
+
9
+ #include <iostream>
10
+ #include <limits>
11
+ #include <sstream>
12
+ #include <string>
13
+
14
+ /* PhraseText */
15
+
16
+ PhraseText::DictionaryType_ PhraseText::dictionary_;
17
+ Count PhraseText::last_id_ = 1;
18
+
19
+ PhraseText::PhraseText(const String &s)
20
+ {
21
+ IStringStream is(s);
22
+ while(is.good()) {
23
+ String w;
24
+ getline(is, w, ' ');
25
+ Count *id = boost::fast_pool_allocator<Count>::allocate(1);
26
+ *id = index_word(w);
27
+ word_list_.push_back(id);
28
+ }
29
+ }
30
+
31
+ std::ostream &operator<<(std::ostream &os, const PhraseText &pt)
32
+ {
33
+ bool print_space = false;
34
+ for(PhraseText::const_string_iterator it = pt.string_begin(); it != pt.string_end(); it++) {
35
+ if(print_space)
36
+ os << ' ';
37
+ else
38
+ print_space = true;
39
+
40
+ os << *it;
41
+ }
42
+
43
+ return os;
44
+ }
45
+
46
+ /* PhraseAlignment */
47
+
48
+ PhraseAlignment::Alignment::AlignmentMapType_ PhraseAlignment::Alignment::alignment_map_;
49
+ PhraseAlignment::Alignment::AlignmentVectorType_ PhraseAlignment::Alignment::alignment_vector_;
50
+
51
+ PhraseAlignment::Alignment::Alignment(Count slen, Count tlen, const String &alignment) :
52
+ slen_(slen), tlen_(tlen), matrix_(slen * tlen, false)
53
+ {
54
+ assert(slen_ > 0 && slen_ < 10);
55
+ IStringStream is(alignment);
56
+ while(is.good()) {
57
+ String a;
58
+ getline(is, a, ' ');
59
+ IStringStream ap(a);
60
+ Count s, t;
61
+ char dash;
62
+ ap >> s >> dash >> t;
63
+ assert(s < slen && t < tlen);
64
+ assert(dash == '-');
65
+ matrix_[t * slen + s] = true;
66
+ }
67
+ }
68
+
69
+ Count PhraseAlignment::Alignment::index_alignment(Count slen, Count tlen, const String &alignment)
70
+ {
71
+ AlignmentTuple_ tup = boost::make_tuple(slen, tlen, alignment);
72
+ AlignmentMapType_::const_iterator it = alignment_map_.find(tup);
73
+
74
+ if(it == alignment_map_.end()) {
75
+ const Alignment *pa = new Alignment(slen, tlen, alignment);
76
+ Count index = alignment_vector_.size();
77
+ alignment_map_.insert(std::make_pair(tup, index));
78
+ alignment_vector_.push_back(pa);
79
+ return index;
80
+ } else
81
+ return it->second;
82
+ }
83
+
84
+ std::ostream &operator<<(std::ostream &os, const PhraseAlignment::Alignment &pa)
85
+ {
86
+ bool print_space = false;
87
+ for(Count i = 0; i < pa.matrix_.size(); i++) {
88
+ if(print_space)
89
+ os << ' ';
90
+ else
91
+ print_space = true;
92
+
93
+ os << (i / pa.slen_) << '-' << (i % pa.slen_);
94
+ }
95
+
96
+ return os;
97
+ }
98
+
99
+ std::ostream &operator<<(std::ostream &os, const PhraseAlignment &pa)
100
+ {
101
+ for(Count s = 0; s < pa.get_source_length(); s++) {
102
+ os << '(';
103
+ bool print_comma = false;
104
+ for(Count t = 0; t < pa.get_target_length(); t++) {
105
+ if(pa.is_aligned(s, t)) {
106
+ if(print_comma)
107
+ os << ',';
108
+ else
109
+ print_comma = true;
110
+
111
+ os << t;
112
+ }
113
+ }
114
+ os << ") ";
115
+ }
116
+
117
+ os << "|||";
118
+
119
+ for(Count t = 0; t < pa.get_target_length(); t++) {
120
+ os << " (";
121
+ bool print_comma = false;
122
+ for(Count s = 0; s < pa.get_source_length(); s++) {
123
+ if(pa.is_aligned(s, t)) {
124
+ if(print_comma)
125
+ os << ',';
126
+ else
127
+ print_comma = true;
128
+
129
+ os << s;
130
+ }
131
+ }
132
+ os << ')';
133
+ }
134
+
135
+ return os;
136
+ }
137
+
138
+ /* PhrasePairInfo */
139
+
140
+ bool PhrasePairInfo::init_phase_ = true;
141
+ Count PhrasePairInfo::data_ncounts_ = COUNT_FREE_IDX;
142
+ Count PhrasePairInfo::data_nscores_ = SCORE_FREE_IDX;
143
+ const Count PhrasePairInfo::CONTINUATION_BIT = 1 << (std::numeric_limits<Count>::digits - 1);
144
+
145
+ PhrasePairInfo::PhrasePairInfo(Count src, Count tgt, Count alignment, Count count) :
146
+ src_(src), tgt_(tgt), data_(NULL), reverse_(false)
147
+ {
148
+ init_phase_ = false;
149
+ realloc_data(1);
150
+ count_data(COUNT_COUNT_IDX) = count;
151
+ Count *aligd = alignment_data(0);
152
+ aligd[0] = alignment;
153
+ aligd[1] = count;
154
+ }
155
+
156
+ DataIndex PhrasePairInfo::register_score_data(Count size)
157
+ {
158
+ assert(init_phase_);
159
+
160
+ Count start = data_nscores_;
161
+ data_nscores_ += size;
162
+ return start;
163
+ }
164
+
165
+ DataIndex PhrasePairInfo::register_count_data(Count size)
166
+ {
167
+ assert(init_phase_);
168
+
169
+ Count start = data_ncounts_;
170
+ data_ncounts_ += size;
171
+ return start;
172
+ }
173
+
174
+ PhrasePairInfo::AlignmentVector PhrasePairInfo::get_alignments() const
175
+ {
176
+ PhrasePairInfo::AlignmentVector vec;
177
+
178
+ Count i = 0;
179
+ bool last;
180
+ do {
181
+ const Count *aligd = alignment_data(i++);
182
+ last = !(aligd[0] & CONTINUATION_BIT);
183
+ Count alig = aligd[0] & ~CONTINUATION_BIT;
184
+ vec.push_back(std::make_pair(PhraseAlignment(alig, reverse_), aligd[1]));
185
+ } while(!last);
186
+
187
+ return vec;
188
+ }
189
+
190
+ void PhrasePairInfo::add_alignment(Count new_alignment)
191
+ {
192
+ Count i = 0;
193
+ bool last;
194
+ do {
195
+ Count *aligd = alignment_data(i++);
196
+ last = !(aligd[0] & CONTINUATION_BIT);
197
+ Count alig = aligd[0] & ~CONTINUATION_BIT;
198
+ if(alig == new_alignment) {
199
+ aligd[1]++;
200
+ return;
201
+ }
202
+ } while(!last);
203
+
204
+ realloc_data(i + 1);
205
+
206
+ Count *last_aligd = alignment_data(i - 1);
207
+ last_aligd[0] |= CONTINUATION_BIT;
208
+
209
+ Count *this_aligd = alignment_data(i);
210
+ this_aligd[0] = new_alignment;
211
+ this_aligd[1] = 1;
212
+ }
213
+
214
+ void PhrasePairInfo::realloc_data(Count nalignments)
215
+ {
216
+ static boost::pool<> *pool[3] = { NULL, NULL, NULL };
217
+
218
+ size_t fixed_size = data_nscores_ * sizeof(Score) + data_ncounts_ * sizeof(Count);
219
+ size_t new_data_size = fixed_size + COUNTS_PER_ALIGNMENT * nalignments * sizeof(Count);
220
+
221
+ PhrasePairData new_data;
222
+ if(nalignments <= 3) {
223
+ if(!pool[nalignments - 1])
224
+ pool[nalignments - 1] = new boost::pool<>(new_data_size);
225
+
226
+ new_data = reinterpret_cast<PhrasePairData>(pool[nalignments - 1]->malloc());
227
+ } else
228
+ new_data = new char[new_data_size];
229
+
230
+ if(data_) {
231
+ memcpy(new_data, data_, fixed_size);
232
+ Count i = 0;
233
+ Count *old_aligd, *new_aligd;
234
+ do {
235
+ assert(i < nalignments);
236
+ old_aligd = alignment_data(data_, i);
237
+ new_aligd = alignment_data(new_data, i);
238
+ new_aligd[0] = old_aligd[0];
239
+ new_aligd[1] = old_aligd[1];
240
+ i++;
241
+ } while(old_aligd[0] & CONTINUATION_BIT);
242
+ if(nalignments <= 4)
243
+ pool[nalignments - 2]->free(data_);
244
+ else
245
+ delete[] data_;
246
+ }
247
+
248
+ data_ = new_data;
249
+ }
250
+
251
+ /* PhraseInfoList */
252
+
253
+ Phrase PhraseInfoList::index_phrase(const String &s_phr)
254
+ {
255
+ IDMapType_::const_iterator it = idmap_.find(s_phr);
256
+ if(it != idmap_.end())
257
+ return it->second;
258
+
259
+ PhraseInfo *pi = phrase_info_pool_.construct(data_size_, s_phr);
260
+
261
+ list_.push_back(pi);
262
+ idmap_[s_phr] = list_.size() - 1;
263
+ return idmap_[s_phr];
264
+ }
265
+
266
+ DataIndex PhraseInfoList::register_data(Count size)
267
+ {
268
+ DataIndex start = data_size_;
269
+ data_size_ += size;
270
+ return start;
271
+ }
272
+
273
+ void PhraseInfoList::attach_statistic(PhraseStatistic &s)
274
+ {
275
+ statistics_.push_back(&s);
276
+ s.attach(*this);
277
+ }
278
+
279
+ void PhraseInfoList::compute_statistics()
280
+ {
281
+ while(!statistics_.empty()) {
282
+ statistics_.front()->compute_statistic();
283
+ statistics_.pop_front();
284
+ }
285
+ }
286
+
287
+ /* PhraseTable */
288
+
289
+ void MemoryPhraseTable::load_data(std::istream &instream)
290
+ {
291
+ Count total_count = 0;
292
+
293
+ Timestamp t_load;
294
+ Count nlines = 1;
295
+ String line;
296
+ while(getline(instream, line)) {
297
+ size_t sep1 = line.find(" ||| ");
298
+ if(sep1 == line.npos) {
299
+ std::cerr << "Phrase separator not found in: " << line << std::endl;
300
+ abort();
301
+ }
302
+ size_t sep2 = line.find(" ||| ", sep1 + 1);
303
+ String s_src(line, 0, sep1);
304
+ String s_tgt(line, sep1 + 5, sep2 - sep1 - 5);
305
+ String s_alignment(line, sep2 + 5);
306
+
307
+ Phrase src = src_info_.index_phrase(s_src);
308
+ Phrase tgt = tgt_info_.index_phrase(s_tgt);
309
+ Count alignment = PhraseAlignment::index_alignment(src_info_[src].get_phrase().size(), tgt_info_[tgt].get_phrase().size(), s_alignment);
310
+
311
+ src_info_[src].inc_count();
312
+ tgt_info_[tgt].inc_count();
313
+ total_count++;
314
+
315
+ PhrasePair stpair(src, tgt);
316
+ PhrasePairCounts::iterator it = joint_counts_.find(stpair);
317
+
318
+ if(it == joint_counts_.end()) {
319
+ src_info_[src].inc_distinct();
320
+ tgt_info_[tgt].inc_distinct();
321
+ joint_counts_.insert(std::make_pair(stpair, PhrasePairInfo(src, tgt, alignment, 1).get_phrase_pair_data()));
322
+ } else {
323
+ PhrasePairInfo pi(src, tgt, it->second);
324
+ pi.inc_count();
325
+ pi.add_alignment(alignment);
326
+ it->second = pi.get_phrase_pair_data(); // may have changed by adding the alignment
327
+ }
328
+ if(nlines % 50000 == 0)
329
+ std:: cerr << "Read " << nlines << " lines in " << (t_load.elapsed_time() / 1000) << " ms." << std::endl;
330
+ nlines++;
331
+ }
332
+ }
333
+
334
+ void MemoryPhraseTable::attach_src_statistic(PhraseStatistic &s)
335
+ {
336
+ src_info_.attach_statistic(s);
337
+ }
338
+
339
+ void MemoryPhraseTable::attach_tgt_statistic(PhraseStatistic &s)
340
+ {
341
+ tgt_info_.attach_statistic(s);
342
+ }
343
+
344
+ void MemoryPhraseTable::compute_phrase_statistics()
345
+ {
346
+ src_info_.compute_statistics();
347
+ tgt_info_.compute_statistics();
348
+ }
mosesdecoder/contrib/memscore/scorer.h ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #ifndef SCORER_H
6
+ #define SCORER_H
7
+
8
+ #include "memscore.h"
9
+
10
+ class PhraseScorerFactory
11
+ {
12
+ private:
13
+ PhraseTable &phrase_table_;
14
+
15
+ public:
16
+ explicit PhraseScorerFactory(PhraseTable &phrase_table) :
17
+ phrase_table_(phrase_table) {}
18
+
19
+ PhraseScorer *create_scorer(const char *argv[], int &argp, bool reverse);
20
+
21
+ PhraseTable &get_phrase_table() const {
22
+ return phrase_table_;
23
+ }
24
+
25
+ static const std::vector<String> &scorer_list();
26
+ };
27
+
28
+ class PhraseScorer
29
+ {
30
+ protected:
31
+ PhraseTable &phrase_table_;
32
+ bool reverse_;
33
+
34
+ explicit PhraseScorer(PhraseTable &pt, bool reverse) :
35
+ phrase_table_(!reverse ? pt : pt.reverse()), reverse_(reverse) {}
36
+
37
+ PhraseTable::iterator get_pair(Phrase src, Phrase tgt) {
38
+ PhraseTable::iterator it = phrase_table_.find(std::make_pair(src, tgt));
39
+ assert(it != phrase_table_.end());
40
+ return it;
41
+ }
42
+
43
+ private:
44
+ virtual void do_score_phrases() {}
45
+
46
+ virtual Score do_get_score(const PhraseTable::const_iterator &it) = 0;
47
+
48
+ public:
49
+ virtual ~PhraseScorer() {}
50
+
51
+ virtual Score get_discount() {}
52
+
53
+ void score_phrases() {
54
+ do_score_phrases();
55
+ }
56
+
57
+ Score get_score(const PhrasePairCounts::const_iterator &it) {
58
+ return do_get_score(phrase_table_.find(it));
59
+ }
60
+
61
+ Score get_score(const PhraseTable::const_iterator &it) {
62
+ return do_get_score(it);
63
+ }
64
+
65
+ Score get_score(Phrase src, Phrase tgt) {
66
+ PhraseTable::const_iterator it = get_pair(src, tgt);
67
+ return do_get_score(it);
68
+ }
69
+ };
70
+
71
+ #endif
mosesdecoder/contrib/memscore/timestamp.h ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // memscore - in-memory phrase scoring for Statistical Machine Translation
2
+ // Christian Hardmeier, FBK-irst, Trento, 2010
3
+ // $Id$
4
+
5
+ #ifndef TIMESTAMP_H
6
+ #define TIMESTAMP_H
7
+
8
+ #include <sys/time.h>
9
+
10
+ class Timestamp
11
+ {
12
+ private:
13
+ struct timeval tv_;
14
+
15
+ public:
16
+ typedef double time_difference;
17
+
18
+ Timestamp() {
19
+ gettimeofday(&tv_, NULL);
20
+ }
21
+
22
+ time_difference elapsed_time() const {
23
+ struct timeval tv2;
24
+ gettimeofday(&tv2, NULL);
25
+ return (tv2.tv_sec - tv_.tv_sec) * 1e6 + (tv2.tv_usec - tv_.tv_usec);
26
+ }
27
+ };
28
+
29
+ #endif
mosesdecoder/contrib/mira/Main.cpp ADDED
@@ -0,0 +1,1849 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <algorithm>
21
+ #include <cstdlib>
22
+ #include <ctime>
23
+ #include <string>
24
+ #include <vector>
25
+ #include <map>
26
+
27
+ #include <boost/program_options.hpp>
28
+ #include <boost/algorithm/string.hpp>
29
+
30
+ #ifdef MPI_ENABLE
31
+ #include <boost/mpi.hpp>
32
+ namespace mpi = boost::mpi;
33
+ #endif
34
+
35
+ #include "Main.h"
36
+ #include "Optimiser.h"
37
+ #include "Hildreth.h"
38
+ #include "HypothesisQueue.h"
39
+ #include "moses/StaticData.h"
40
+ #include "moses/ScoreComponentCollection.h"
41
+ #include "moses/ThreadPool.h"
42
+ #include "mert/BleuScorer.h"
43
+ #include "moses/FeatureVector.h"
44
+
45
+ #include "moses/FF/WordTranslationFeature.h"
46
+ #include "moses/FF/PhrasePairFeature.h"
47
+ #include "moses/FF/WordPenaltyProducer.h"
48
+ #include "moses/LM/Base.h"
49
+ #include "util/random.hh"
50
+
51
+ using namespace Mira;
52
+ using namespace std;
53
+ using namespace Moses;
54
+ namespace po = boost::program_options;
55
+
56
+ int main(int argc, char** argv)
57
+ {
58
+ util::rand_init();
59
+ size_t rank = 0;
60
+ size_t size = 1;
61
+ #ifdef MPI_ENABLE
62
+ mpi::environment env(argc,argv);
63
+ mpi::communicator world;
64
+ rank = world.rank();
65
+ size = world.size();
66
+ #endif
67
+
68
+ bool help;
69
+ int verbosity;
70
+ string mosesConfigFile;
71
+ string inputFile;
72
+ vector<string> referenceFiles;
73
+ vector<string> mosesConfigFilesFolds, inputFilesFolds, referenceFilesFolds;
74
+ // string coreWeightFile, startWeightFile;
75
+ size_t epochs;
76
+ string learner;
77
+ bool shuffle;
78
+ size_t mixingFrequency;
79
+ size_t weightDumpFrequency;
80
+ string weightDumpStem;
81
+ bool scale_margin;
82
+ bool scale_update;
83
+ size_t n;
84
+ size_t batchSize;
85
+ bool distinctNbest;
86
+ bool accumulateWeights;
87
+ float historySmoothing;
88
+ bool scaleByInputLength, scaleByAvgInputLength;
89
+ bool scaleByInverseLength, scaleByAvgInverseLength;
90
+ float scaleByX;
91
+ float slack;
92
+ bool averageWeights;
93
+ bool weightConvergence;
94
+ float learning_rate;
95
+ float mira_learning_rate;
96
+ float perceptron_learning_rate;
97
+ string decoder_settings;
98
+ float min_weight_change;
99
+ bool normaliseWeights, normaliseMargin;
100
+ bool print_feature_values;
101
+ bool historyBleu ;
102
+ bool sentenceBleu;
103
+ bool perceptron_update;
104
+ bool hope_fear;
105
+ bool model_hope_fear;
106
+ size_t hope_n, fear_n;
107
+ size_t bleu_smoothing_scheme;
108
+ float min_oracle_bleu;
109
+ float minBleuRatio, maxBleuRatio;
110
+ bool boost;
111
+ bool decode_hope, decode_fear, decode_model;
112
+ string decode_filename;
113
+ bool batchEqualsShard;
114
+ bool sparseAverage, dumpMixedWeights, sparseNoAverage;
115
+ int featureCutoff;
116
+ bool pruneZeroWeights;
117
+ bool printFeatureCounts, printNbestWithFeatures;
118
+ bool avgRefLength;
119
+ bool print_weights, print_core_weights, debug_model, scale_lm, scale_wp;
120
+ float scale_lm_factor, scale_wp_factor;
121
+ bool kbest;
122
+ string moses_src;
123
+ float sigmoidParam;
124
+ float bleuWeight, bleuWeight_hope, bleuWeight_fear;
125
+ bool bleu_weight_lm;
126
+ float bleu_weight_lm_factor;
127
+ bool l1_regularize, l2_regularize, l1_reg_sparse, l2_reg_sparse;
128
+ float l1_lambda, l2_lambda;
129
+ bool most_violated, most_violated_reg, all_violated, max_bleu_diff;
130
+ bool feature_confidence, signed_counts;
131
+ float decay_core, decay_sparse, core_r0, sparse_r0;
132
+ float bleu_weight_fear_factor;
133
+ bool hildreth;
134
+ float add2lm;
135
+
136
+ // compute real sentence Bleu scores on complete translations, disable Bleu feature
137
+ bool realBleu, disableBleuFeature;
138
+ bool rescaleSlack;
139
+ bool makePairs;
140
+ bool debug;
141
+ bool reg_on_every_mix;
142
+ size_t continue_epoch;
143
+ bool modelPlusBleu, simpleHistoryBleu;
144
+ po::options_description desc("Allowed options");
145
+ desc.add_options()
146
+ ("continue-epoch", po::value<size_t>(&continue_epoch)->default_value(0), "Continue an interrupted experiment from this epoch on")
147
+ ("freq-reg", po::value<bool>(&reg_on_every_mix)->default_value(false), "Regularize after every weight mixing")
148
+ ("l1sparse", po::value<bool>(&l1_reg_sparse)->default_value(true), "L1-regularization for sparse weights only")
149
+ ("l2sparse", po::value<bool>(&l2_reg_sparse)->default_value(true), "L2-regularization for sparse weights only")
150
+ ("mv-reg", po::value<bool>(&most_violated_reg)->default_value(false), "Regularize most violated constraint")
151
+ ("most-violated", po::value<bool>(&most_violated)->default_value(false), "Add most violated constraint")
152
+ ("all-violated", po::value<bool>(&all_violated)->default_value(false), "Add all violated constraints")
153
+ ("feature-confidence", po::value<bool>(&feature_confidence)->default_value(false), "Confidence-weighted learning")
154
+ ("signed-counts", po::value<bool>(&signed_counts)->default_value(false), "Use signed feature counts for CWL")
155
+ ("dbg", po::value<bool>(&debug)->default_value(true), "More debug output")
156
+ ("make-pairs", po::value<bool>(&makePairs)->default_value(true), "Make pairs of hypotheses for 1slack")
157
+ ("debug", po::value<bool>(&debug)->default_value(true), "More debug output")
158
+ ("rescale-slack", po::value<bool>(&rescaleSlack)->default_value(false), "Rescale slack in 1-slack formulation")
159
+ ("add2lm", po::value<float>(&add2lm)->default_value(0.0), "Add the specified amount to all LM weights")
160
+ ("hildreth", po::value<bool>(&hildreth)->default_value(false), "Prefer Hildreth over analytical update")
161
+ ("model-plus-bleu", po::value<bool>(&modelPlusBleu)->default_value(false), "Use the sum of model score and +/- bleu to select hope and fear translations")
162
+ ("simple-history-bleu", po::value<bool>(&simpleHistoryBleu)->default_value(false), "Simple history Bleu")
163
+
164
+ ("bleu-weight", po::value<float>(&bleuWeight)->default_value(1.0), "Bleu weight used in decoder objective")
165
+ ("bw-hope", po::value<float>(&bleuWeight_hope)->default_value(-1.0), "Bleu weight used in decoder objective for hope")
166
+ ("bw-fear", po::value<float>(&bleuWeight_fear)->default_value(-1.0), "Bleu weight used in decoder objective for fear")
167
+
168
+ ("core-r0", po::value<float>(&core_r0)->default_value(1.0), "Start learning rate for core features")
169
+ ("sparse-r0", po::value<float>(&sparse_r0)->default_value(1.0), "Start learning rate for sparse features")
170
+ ("decay-core", po::value<float>(&decay_core)->default_value(0.01), "Decay for core feature learning rate")
171
+ ("decay-sparse", po::value<float>(&decay_sparse)->default_value(0.01), "Decay for sparse feature learning rate")
172
+
173
+ ("tie-bw-to-lm", po::value<bool>(&bleu_weight_lm)->default_value(true), "Make bleu weight depend on lm weight")
174
+ ("bw-lm-factor", po::value<float>(&bleu_weight_lm_factor)->default_value(2.0), "Make bleu weight depend on lm weight by this factor")
175
+ ("bw-factor-fear", po::value<float>(&bleu_weight_fear_factor)->default_value(1.0), "Multiply fear weight by this factor")
176
+ ("accumulate-weights", po::value<bool>(&accumulateWeights)->default_value(false), "Accumulate and average weights over all epochs")
177
+ ("average-weights", po::value<bool>(&averageWeights)->default_value(false), "Set decoder weights to average weights after each update")
178
+ ("avg-ref-length", po::value<bool>(&avgRefLength)->default_value(false), "Use average reference length instead of shortest for BLEU score feature")
179
+ ("batch-equals-shard", po::value<bool>(&batchEqualsShard)->default_value(false), "Batch size is equal to shard size (purely batch)")
180
+ ("batch-size,b", po::value<size_t>(&batchSize)->default_value(1), "Size of batch that is send to optimiser for weight adjustments")
181
+ ("bleu-smoothing-scheme", po::value<size_t>(&bleu_smoothing_scheme)->default_value(1), "Set a smoothing scheme for sentence-Bleu: +1 (1), +0.1 (2), papineni (3) (default:1)")
182
+ ("boost", po::value<bool>(&boost)->default_value(false), "Apply boosting factor to updates on misranked candidates")
183
+ ("config,f", po::value<string>(&mosesConfigFile), "Moses ini-file")
184
+ ("configs-folds", po::value<vector<string> >(&mosesConfigFilesFolds), "Moses ini-files, one for each fold")
185
+ ("debug-model", po::value<bool>(&debug_model)->default_value(false), "Get best model translation for debugging purposes")
186
+ ("decode-hope", po::value<bool>(&decode_hope)->default_value(false), "Decode dev input set according to hope objective")
187
+ ("decode-fear", po::value<bool>(&decode_fear)->default_value(false), "Decode dev input set according to fear objective")
188
+ ("decode-model", po::value<bool>(&decode_model)->default_value(false), "Decode dev input set according to normal objective")
189
+ ("decode-filename", po::value<string>(&decode_filename), "Filename for Bleu objective translations")
190
+ ("decoder-settings", po::value<string>(&decoder_settings)->default_value(""), "Decoder settings for tuning runs")
191
+ ("distinct-nbest", po::value<bool>(&distinctNbest)->default_value(true), "Use n-best list with distinct translations in inference step")
192
+ ("dump-mixed-weights", po::value<bool>(&dumpMixedWeights)->default_value(false), "Dump mixed weights instead of averaged weights")
193
+ ("epochs,e", po::value<size_t>(&epochs)->default_value(10), "Number of epochs")
194
+ ("feature-cutoff", po::value<int>(&featureCutoff)->default_value(-1), "Feature cutoff as additional regularization for sparse features")
195
+ ("fear-n", po::value<size_t>(&fear_n)->default_value(1), "Number of fear translations used")
196
+ ("help", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
197
+ ("history-bleu", po::value<bool>(&historyBleu)->default_value(false), "Use 1best translations to update the history")
198
+ ("history-smoothing", po::value<float>(&historySmoothing)->default_value(0.9), "Adjust the factor for history smoothing")
199
+ ("hope-fear", po::value<bool>(&hope_fear)->default_value(true), "Use only hope and fear translations for optimisation (not model)")
200
+ ("hope-n", po::value<size_t>(&hope_n)->default_value(2), "Number of hope translations used")
201
+ ("input-file,i", po::value<string>(&inputFile), "Input file containing tokenised source")
202
+ ("input-files-folds", po::value<vector<string> >(&inputFilesFolds), "Input files containing tokenised source, one for each fold")
203
+ ("learner,l", po::value<string>(&learner)->default_value("mira"), "Learning algorithm")
204
+ ("l1-lambda", po::value<float>(&l1_lambda)->default_value(0.0001), "Lambda for l1-regularization (w_i +/- lambda)")
205
+ ("l2-lambda", po::value<float>(&l2_lambda)->default_value(0.01), "Lambda for l2-regularization (w_i * (1 - lambda))")
206
+ ("l1-reg", po::value<bool>(&l1_regularize)->default_value(false), "L1-regularization")
207
+ ("l2-reg", po::value<bool>(&l2_regularize)->default_value(false), "L2-regularization")
208
+ ("min-bleu-ratio", po::value<float>(&minBleuRatio)->default_value(-1), "Set a minimum BLEU ratio between hope and fear")
209
+ ("max-bleu-ratio", po::value<float>(&maxBleuRatio)->default_value(-1), "Set a maximum BLEU ratio between hope and fear")
210
+ ("max-bleu-diff", po::value<bool>(&max_bleu_diff)->default_value(true), "Select hope/fear with maximum Bleu difference")
211
+ ("min-oracle-bleu", po::value<float>(&min_oracle_bleu)->default_value(0), "Set a minimum oracle BLEU score")
212
+ ("min-weight-change", po::value<float>(&min_weight_change)->default_value(0.0001), "Set minimum weight change for stopping criterion")
213
+ ("mira-learning-rate", po::value<float>(&mira_learning_rate)->default_value(1), "Learning rate for MIRA (fixed or flexible)")
214
+ ("mixing-frequency", po::value<size_t>(&mixingFrequency)->default_value(10), "How often per epoch to mix weights, when using mpi")
215
+ ("model-hope-fear", po::value<bool>(&model_hope_fear)->default_value(false), "Use model, hope and fear translations for optimisation")
216
+ ("moses-src", po::value<string>(&moses_src)->default_value(""), "Moses source directory")
217
+ ("nbest,n", po::value<size_t>(&n)->default_value(30), "Number of translations in n-best list")
218
+ ("normalise-weights", po::value<bool>(&normaliseWeights)->default_value(false), "Whether to normalise the updated weights before passing them to the decoder")
219
+ ("normalise-margin", po::value<bool>(&normaliseMargin)->default_value(false), "Normalise the margin: squash between 0 and 1")
220
+ ("perceptron-learning-rate", po::value<float>(&perceptron_learning_rate)->default_value(0.01), "Perceptron learning rate")
221
+ ("print-feature-values", po::value<bool>(&print_feature_values)->default_value(false), "Print out feature values")
222
+ ("print-feature-counts", po::value<bool>(&printFeatureCounts)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
223
+ ("print-nbest-with-features", po::value<bool>(&printNbestWithFeatures)->default_value(false), "Print out feature values, print feature list with hope counts after 1st epoch")
224
+ ("print-weights", po::value<bool>(&print_weights)->default_value(false), "Print out current weights")
225
+ ("print-core-weights", po::value<bool>(&print_core_weights)->default_value(true), "Print out current core weights")
226
+ ("prune-zero-weights", po::value<bool>(&pruneZeroWeights)->default_value(false), "Prune zero-valued sparse feature weights")
227
+ ("reference-files,r", po::value<vector<string> >(&referenceFiles), "Reference translation files for training")
228
+ ("reference-files-folds", po::value<vector<string> >(&referenceFilesFolds), "Reference translation files for training, one for each fold")
229
+ ("kbest", po::value<bool>(&kbest)->default_value(true), "Select hope/fear pairs from a list of nbest translations")
230
+
231
+ ("scale-by-inverse-length", po::value<bool>(&scaleByInverseLength)->default_value(false), "Scale BLEU by (history of) inverse input length")
232
+ ("scale-by-input-length", po::value<bool>(&scaleByInputLength)->default_value(true), "Scale BLEU by (history of) input length")
233
+ ("scale-by-avg-input-length", po::value<bool>(&scaleByAvgInputLength)->default_value(false), "Scale BLEU by average input length")
234
+ ("scale-by-avg-inverse-length", po::value<bool>(&scaleByAvgInverseLength)->default_value(false), "Scale BLEU by average inverse input length")
235
+ ("scale-by-x", po::value<float>(&scaleByX)->default_value(0.1), "Scale the BLEU score by value x")
236
+ ("scale-lm", po::value<bool>(&scale_lm)->default_value(true), "Scale the language model feature")
237
+ ("scale-factor-lm", po::value<float>(&scale_lm_factor)->default_value(0.5), "Scale the language model feature by this factor")
238
+ ("scale-wp", po::value<bool>(&scale_wp)->default_value(false), "Scale the word penalty feature")
239
+ ("scale-factor-wp", po::value<float>(&scale_wp_factor)->default_value(2), "Scale the word penalty feature by this factor")
240
+ ("scale-margin", po::value<bool>(&scale_margin)->default_value(0), "Scale the margin by the Bleu score of the oracle translation")
241
+ ("sentence-level-bleu", po::value<bool>(&sentenceBleu)->default_value(true), "Use a sentences level Bleu scoring function")
242
+ ("shuffle", po::value<bool>(&shuffle)->default_value(false), "Shuffle input sentences before processing")
243
+ ("sigmoid-param", po::value<float>(&sigmoidParam)->default_value(1), "y=sigmoidParam is the axis that this sigmoid approaches")
244
+ ("slack", po::value<float>(&slack)->default_value(0.05), "Use slack in optimiser")
245
+ ("sparse-average", po::value<bool>(&sparseAverage)->default_value(false), "Average weights by the number of processes")
246
+ ("sparse-no-average", po::value<bool>(&sparseNoAverage)->default_value(false), "Don't average sparse weights, just sum")
247
+ ("stop-weights", po::value<bool>(&weightConvergence)->default_value(true), "Stop when weights converge")
248
+ ("verbosity,v", po::value<int>(&verbosity)->default_value(0), "Verbosity level")
249
+ ("weight-dump-frequency", po::value<size_t>(&weightDumpFrequency)->default_value(2), "How often per epoch to dump weights (mpi)")
250
+ ("weight-dump-stem", po::value<string>(&weightDumpStem)->default_value("weights"), "Stem of filename to use for dumping weights");
251
+
252
+ po::options_description cmdline_options;
253
+ cmdline_options.add(desc);
254
+ po::variables_map vm;
255
+ po::store(po::command_line_parser(argc, argv). options(cmdline_options).run(), vm);
256
+ po::notify(vm);
257
+
258
+ if (help) {
259
+ std::cout << "Usage: " + string(argv[0])
260
+ + " -f mosesini-file -i input-file -r reference-file(s) [options]" << std::endl;
261
+ std::cout << desc << std::endl;
262
+ return 0;
263
+ }
264
+
265
+ const StaticData &staticData = StaticData::Instance();
266
+
267
+ bool trainWithMultipleFolds = false;
268
+ if (mosesConfigFilesFolds.size() > 0 || inputFilesFolds.size() > 0 || referenceFilesFolds.size() > 0) {
269
+ if (rank == 0)
270
+ cerr << "Training with " << mosesConfigFilesFolds.size() << " folds" << endl;
271
+ trainWithMultipleFolds = true;
272
+ }
273
+
274
+ if (dumpMixedWeights && (mixingFrequency != weightDumpFrequency)) {
275
+ cerr << "Set mixing frequency = weight dump frequency for dumping mixed weights!" << endl;
276
+ exit(1);
277
+ }
278
+
279
+ if ((sparseAverage || sparseNoAverage) && averageWeights) {
280
+ cerr << "Parameters --sparse-average 1/--sparse-no-average 1 and --average-weights 1 are incompatible (not implemented)" << endl;
281
+ exit(1);
282
+ }
283
+
284
+ if (trainWithMultipleFolds) {
285
+ if (!mosesConfigFilesFolds.size()) {
286
+ cerr << "Error: No moses ini files specified for training with folds" << endl;
287
+ exit(1);
288
+ }
289
+
290
+ if (!inputFilesFolds.size()) {
291
+ cerr << "Error: No input files specified for training with folds" << endl;
292
+ exit(1);
293
+ }
294
+
295
+ if (!referenceFilesFolds.size()) {
296
+ cerr << "Error: No reference files specified for training with folds" << endl;
297
+ exit(1);
298
+ }
299
+ } else {
300
+ if (mosesConfigFile.empty()) {
301
+ cerr << "Error: No moses ini file specified" << endl;
302
+ return 1;
303
+ }
304
+
305
+ if (inputFile.empty()) {
306
+ cerr << "Error: No input file specified" << endl;
307
+ return 1;
308
+ }
309
+
310
+ if (!referenceFiles.size()) {
311
+ cerr << "Error: No reference files specified" << endl;
312
+ return 1;
313
+ }
314
+ }
315
+
316
+ // load input and references
317
+ vector<string> inputSentences;
318
+ size_t inputSize = trainWithMultipleFolds? inputFilesFolds.size(): 0;
319
+ size_t refSize = trainWithMultipleFolds? referenceFilesFolds.size(): referenceFiles.size();
320
+ vector<vector<string> > inputSentencesFolds(inputSize);
321
+ vector<vector<string> > referenceSentences(refSize);
322
+
323
+ // number of cores for each fold
324
+ size_t coresPerFold = 0, myFold = 0;
325
+ if (trainWithMultipleFolds) {
326
+ if (mosesConfigFilesFolds.size() > size) {
327
+ cerr << "Number of cores has to be a multiple of the number of folds" << endl;
328
+ exit(1);
329
+ }
330
+ coresPerFold = size/mosesConfigFilesFolds.size();
331
+ if (size % coresPerFold > 0) {
332
+ cerr << "Number of cores has to be a multiple of the number of folds" << endl;
333
+ exit(1);
334
+ }
335
+
336
+ if (rank == 0)
337
+ cerr << "Number of cores per fold: " << coresPerFold << endl;
338
+ myFold = rank/coresPerFold;
339
+ cerr << "Rank " << rank << ", my fold: " << myFold << endl;
340
+ }
341
+
342
+ // NOTE: we do not actually need the references here, because we are reading them in from StaticData
343
+ if (trainWithMultipleFolds) {
344
+ if (!loadSentences(inputFilesFolds[myFold], inputSentencesFolds[myFold])) {
345
+ cerr << "Error: Failed to load input sentences from " << inputFilesFolds[myFold] << endl;
346
+ exit(1);
347
+ }
348
+ VERBOSE(1, "Rank " << rank << " reading inputs from " << inputFilesFolds[myFold] << endl);
349
+
350
+ if (!loadSentences(referenceFilesFolds[myFold], referenceSentences[myFold])) {
351
+ cerr << "Error: Failed to load reference sentences from " << referenceFilesFolds[myFold] << endl;
352
+ exit(1);
353
+ }
354
+ if (referenceSentences[myFold].size() != inputSentencesFolds[myFold].size()) {
355
+ cerr << "Error: Input file length (" << inputSentencesFolds[myFold].size() << ") != ("
356
+ << referenceSentences[myFold].size() << ") reference file length (rank " << rank << ")" << endl;
357
+ exit(1);
358
+ }
359
+ VERBOSE(1, "Rank " << rank << " reading references from " << referenceFilesFolds[myFold] << endl);
360
+ } else {
361
+ if (!loadSentences(inputFile, inputSentences)) {
362
+ cerr << "Error: Failed to load input sentences from " << inputFile << endl;
363
+ return 1;
364
+ }
365
+
366
+ for (size_t i = 0; i < referenceFiles.size(); ++i) {
367
+ if (!loadSentences(referenceFiles[i], referenceSentences[i])) {
368
+ cerr << "Error: Failed to load reference sentences from "
369
+ << referenceFiles[i] << endl;
370
+ return 1;
371
+ }
372
+ if (referenceSentences[i].size() != inputSentences.size()) {
373
+ cerr << "Error: Input file length (" << inputSentences.size() << ") != ("
374
+ << referenceSentences[i].size() << ") length of reference file " << i
375
+ << endl;
376
+ return 1;
377
+ }
378
+ }
379
+ }
380
+
381
+ if (scaleByAvgInputLength || scaleByInverseLength || scaleByAvgInverseLength)
382
+ scaleByInputLength = false;
383
+
384
+ if (historyBleu || simpleHistoryBleu) {
385
+ sentenceBleu = false;
386
+ cerr << "Using history Bleu. " << endl;
387
+ }
388
+
389
+ if (kbest) {
390
+ realBleu = true;
391
+ disableBleuFeature = true;
392
+ cerr << "Use kbest lists and real Bleu scores, disable Bleu feature.." << endl;
393
+ }
394
+
395
+ // initialise Moses
396
+ // add references to initialize Bleu feature
397
+ boost::trim(decoder_settings);
398
+ decoder_settings += " -mira -n-best-list - " + boost::lexical_cast<string>(n) + " distinct";
399
+
400
+ vector<string> decoder_params;
401
+ boost::split(decoder_params, decoder_settings, boost::is_any_of("\t "));
402
+
403
+ // bleu feature
404
+ decoder_params.push_back("-feature-add");
405
+
406
+ decoder_settings = "BleuScoreFeature tuneable=false references=";
407
+ if (trainWithMultipleFolds) {
408
+ decoder_settings += referenceFilesFolds[myFold];
409
+ } else {
410
+ decoder_settings += referenceFiles[0];
411
+ for (size_t i=1; i < referenceFiles.size(); ++i) {
412
+ decoder_settings += ",";
413
+ decoder_settings += referenceFiles[i];
414
+ }
415
+ }
416
+ decoder_params.push_back(decoder_settings);
417
+
418
+ string configFile = trainWithMultipleFolds? mosesConfigFilesFolds[myFold] : mosesConfigFile;
419
+ VERBOSE(1, "Rank " << rank << " reading config file from " << configFile << endl);
420
+ MosesDecoder* decoder = new MosesDecoder(configFile, verbosity, decoder_params.size(), decoder_params);
421
+ decoder->setBleuParameters(disableBleuFeature, sentenceBleu, scaleByInputLength, scaleByAvgInputLength,
422
+ scaleByInverseLength, scaleByAvgInverseLength,
423
+ scaleByX, historySmoothing, bleu_smoothing_scheme, simpleHistoryBleu);
424
+ bool chartDecoding = staticData.IsChart();
425
+
426
+ // Optionally shuffle the sentences
427
+ vector<size_t> order;
428
+ if (trainWithMultipleFolds) {
429
+ for (size_t i = 0; i < inputSentencesFolds[myFold].size(); ++i) {
430
+ order.push_back(i);
431
+ }
432
+ } else {
433
+ if (rank == 0) {
434
+ for (size_t i = 0; i < inputSentences.size(); ++i) {
435
+ order.push_back(i);
436
+ }
437
+ }
438
+ }
439
+
440
+ // initialise optimizer
441
+ Optimiser* optimiser = NULL;
442
+ if (learner == "mira") {
443
+ if (rank == 0) {
444
+ cerr << "Optimising using Mira" << endl;
445
+ cerr << "slack: " << slack << ", learning rate: " << mira_learning_rate << endl;
446
+ if (normaliseMargin)
447
+ cerr << "sigmoid parameter: " << sigmoidParam << endl;
448
+ }
449
+ optimiser = new MiraOptimiser(slack, scale_margin, scale_update, boost, normaliseMargin, sigmoidParam);
450
+ learning_rate = mira_learning_rate;
451
+ perceptron_update = false;
452
+ } else if (learner == "perceptron") {
453
+ if (rank == 0) {
454
+ cerr << "Optimising using Perceptron" << endl;
455
+ }
456
+ optimiser = new Perceptron();
457
+ learning_rate = perceptron_learning_rate;
458
+ perceptron_update = true;
459
+ model_hope_fear = false; // mira only
460
+ hope_fear = false; // mira only
461
+ n = 1;
462
+ hope_n = 1;
463
+ fear_n = 1;
464
+ } else {
465
+ cerr << "Error: Unknown optimiser: " << learner << endl;
466
+ return 1;
467
+ }
468
+
469
+ // resolve parameter dependencies
470
+ if (batchSize > 1 && perceptron_update) {
471
+ batchSize = 1;
472
+ cerr << "Info: Setting batch size to 1 for perceptron update" << endl;
473
+ }
474
+
475
+ if (hope_n == 0)
476
+ hope_n = n;
477
+ if (fear_n == 0)
478
+ fear_n = n;
479
+
480
+ if (model_hope_fear || kbest)
481
+ hope_fear = false; // is true by default
482
+ if (learner == "mira" && !(hope_fear || model_hope_fear || kbest)) {
483
+ cerr << "Error: Need to select one of parameters --hope-fear/--model-hope-fear/--kbest for mira update." << endl;
484
+ return 1;
485
+ }
486
+
487
+ #ifdef MPI_ENABLE
488
+ if (!trainWithMultipleFolds)
489
+ mpi::broadcast(world, order, 0);
490
+ #endif
491
+
492
+ // Create shards according to the number of processes used
493
+ vector<size_t> shard;
494
+ if (trainWithMultipleFolds) {
495
+ size_t shardSize = order.size()/coresPerFold;
496
+ size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
497
+ size_t shardEnd = shardStart + shardSize;
498
+ if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold
499
+ shardEnd = order.size();
500
+ shardSize = shardEnd - shardStart;
501
+ }
502
+ VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
503
+ VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
504
+ shard.resize(shardSize);
505
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
506
+ batchSize = 1;
507
+ } else {
508
+ size_t shardSize = order.size() / size;
509
+ size_t shardStart = (size_t) (shardSize * rank);
510
+ size_t shardEnd = (size_t) (shardSize * (rank + 1));
511
+ if (rank == size - 1) {
512
+ shardEnd = order.size();
513
+ shardSize = shardEnd - shardStart;
514
+ }
515
+ VERBOSE(1, "Rank: " << rank << " Shard size: " << shardSize << endl);
516
+ VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
517
+ shard.resize(shardSize);
518
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
519
+ if (batchEqualsShard)
520
+ batchSize = shardSize;
521
+ }
522
+
523
+ // get reference to feature functions
524
+ // const vector<FeatureFunction*> &featureFunctions = FeatureFunction::GetFeatureFunctions();
525
+ ScoreComponentCollection initialWeights = decoder->getWeights();
526
+
527
+ if (add2lm != 0) {
528
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
529
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
530
+ const StatefulFeatureFunction *ff = statefulFFs[i];
531
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
532
+
533
+ if (lm) {
534
+ float lmWeight = initialWeights.GetScoreForProducer(lm) + add2lm;
535
+ initialWeights.Assign(lm, lmWeight);
536
+ cerr << "Rank " << rank << ", add " << add2lm << " to lm weight." << endl;
537
+ }
538
+ }
539
+ }
540
+
541
+ if (normaliseWeights) {
542
+ initialWeights.L1Normalise();
543
+ cerr << "Rank " << rank << ", normalised initial weights: " << initialWeights << endl;
544
+ }
545
+
546
+ decoder->setWeights(initialWeights);
547
+
548
+ // set bleu weight to twice the size of the language model weight(s)
549
+ if (bleu_weight_lm) {
550
+ float lmSum = 0;
551
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
552
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
553
+ const StatefulFeatureFunction *ff = statefulFFs[i];
554
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
555
+
556
+ if (lm) {
557
+ lmSum += abs(initialWeights.GetScoreForProducer(lm));
558
+ }
559
+ }
560
+
561
+ bleuWeight = lmSum * bleu_weight_lm_factor;
562
+ if (!kbest) cerr << "Set bleu weight to lm weight * " << bleu_weight_lm_factor << endl;
563
+ }
564
+
565
+ // bleu weights can be set separately for hope and fear; otherwise they are both set to 'lm weight * bleu_weight_lm_factor'
566
+ if (bleuWeight_hope == -1) {
567
+ bleuWeight_hope = bleuWeight;
568
+ }
569
+ if (bleuWeight_fear == -1) {
570
+ bleuWeight_fear = bleuWeight;
571
+ }
572
+ bleuWeight_fear *= bleu_weight_fear_factor;
573
+ if (!kbest) {
574
+ cerr << "Bleu weight: " << bleuWeight << endl;
575
+ cerr << "Bleu weight fear: " << bleuWeight_fear << endl;
576
+ }
577
+
578
+ if (decode_hope || decode_fear || decode_model) {
579
+ size_t decode = 1;
580
+ if (decode_fear) decode = 2;
581
+ if (decode_model) decode = 3;
582
+ decodeHopeOrFear(rank, size, decode, decode_filename, inputSentences, decoder, n, bleuWeight);
583
+ }
584
+
585
+ //Main loop:
586
+ ScoreComponentCollection cumulativeWeights; // collect weights per epoch to produce an average
587
+ ScoreComponentCollection cumulativeWeightsBinary;
588
+ size_t numberOfUpdates = 0;
589
+ size_t numberOfUpdatesThisEpoch = 0;
590
+
591
+ time_t now;
592
+ time(&now);
593
+ cerr << "Rank " << rank << ", " << ctime(&now);
594
+
595
+ float avgInputLength = 0;
596
+ float sumOfInputs = 0;
597
+ size_t numberOfInputs = 0;
598
+
599
+ ScoreComponentCollection mixedWeights;
600
+ ScoreComponentCollection mixedWeightsPrevious;
601
+ ScoreComponentCollection mixedWeightsBeforePrevious;
602
+ ScoreComponentCollection mixedAverageWeights;
603
+ ScoreComponentCollection mixedAverageWeightsPrevious;
604
+ ScoreComponentCollection mixedAverageWeightsBeforePrevious;
605
+
606
+ bool stop = false;
607
+ // int sumStillViolatedConstraints;
608
+ float epsilon = 0.0001;
609
+
610
+ // Variables for feature confidence
611
+ ScoreComponentCollection confidenceCounts, mixedConfidenceCounts, featureLearningRates;
612
+ featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0); //initialise core learning rates
613
+ cerr << "Initial learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
614
+
615
+ for (size_t epoch = continue_epoch; epoch < epochs && !stop; ++epoch) {
616
+ if (shuffle) {
617
+ if (trainWithMultipleFolds || rank == 0) {
618
+ cerr << "Rank " << rank << ", epoch " << epoch << ", shuffling input sentences.." << endl;
619
+ RandomIndex rindex;
620
+ random_shuffle(order.begin(), order.end(), rindex);
621
+ }
622
+
623
+ #ifdef MPI_ENABLE
624
+ if (!trainWithMultipleFolds)
625
+ mpi::broadcast(world, order, 0);
626
+ #endif
627
+
628
+ // redo shards
629
+ if (trainWithMultipleFolds) {
630
+ size_t shardSize = order.size()/coresPerFold;
631
+ size_t shardStart = (size_t) (shardSize * (rank % coresPerFold));
632
+ size_t shardEnd = shardStart + shardSize;
633
+ if (rank % coresPerFold == coresPerFold - 1) { // last rank of each fold
634
+ shardEnd = order.size();
635
+ shardSize = shardEnd - shardStart;
636
+ }
637
+ VERBOSE(1, "Rank: " << rank << ", shard size: " << shardSize << endl);
638
+ VERBOSE(1, "Rank: " << rank << ", shard start: " << shardStart << " shard end: " << shardEnd << endl);
639
+ shard.resize(shardSize);
640
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
641
+ batchSize = 1;
642
+ } else {
643
+ size_t shardSize = order.size()/size;
644
+ size_t shardStart = (size_t) (shardSize * rank);
645
+ size_t shardEnd = (size_t) (shardSize * (rank + 1));
646
+ if (rank == size - 1) {
647
+ shardEnd = order.size();
648
+ shardSize = shardEnd - shardStart;
649
+ }
650
+ VERBOSE(1, "Shard size: " << shardSize << endl);
651
+ VERBOSE(1, "Rank: " << rank << " Shard start: " << shardStart << " Shard end: " << shardEnd << endl);
652
+ shard.resize(shardSize);
653
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
654
+ if (batchEqualsShard)
655
+ batchSize = shardSize;
656
+ }
657
+ }
658
+
659
+ // sum of violated constraints in an epoch
660
+ // sumStillViolatedConstraints = 0;
661
+
662
+ numberOfUpdatesThisEpoch = 0;
663
+ // Sum up weights over one epoch, final average uses weights from last epoch
664
+ if (!accumulateWeights) {
665
+ cumulativeWeights.ZeroAll();
666
+ cumulativeWeightsBinary.ZeroAll();
667
+ }
668
+
669
+ // number of weight dumps this epoch
670
+ size_t weightMixingThisEpoch = 0;
671
+ size_t weightEpochDump = 0;
672
+
673
+ size_t shardPosition = 0;
674
+ vector<size_t>::const_iterator sid = shard.begin();
675
+ while (sid != shard.end()) {
676
+ // feature values for hypotheses i,j (matrix: batchSize x 3*n x featureValues)
677
+ vector<vector<ScoreComponentCollection> > featureValues;
678
+ vector<vector<float> > bleuScores;
679
+ vector<vector<float> > modelScores;
680
+
681
+ // variables for hope-fear/perceptron setting
682
+ vector<vector<ScoreComponentCollection> > featureValuesHope;
683
+ vector<vector<ScoreComponentCollection> > featureValuesFear;
684
+ vector<vector<float> > bleuScoresHope;
685
+ vector<vector<float> > bleuScoresFear;
686
+ vector<vector<float> > modelScoresHope;
687
+ vector<vector<float> > modelScoresFear;
688
+
689
+ // get moses weights
690
+ ScoreComponentCollection mosesWeights = decoder->getWeights();
691
+ VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", weights: " << mosesWeights << endl);
692
+
693
+ if (historyBleu || simpleHistoryBleu) {
694
+ decoder->printBleuFeatureHistory(cerr);
695
+ }
696
+
697
+ // BATCHING: produce nbest lists for all input sentences in batch
698
+ vector<float> oracleBleuScores;
699
+ vector<float> oracleModelScores;
700
+ vector<vector<const Word*> > oneBests;
701
+ vector<ScoreComponentCollection> oracleFeatureValues;
702
+ vector<size_t> inputLengths;
703
+ vector<size_t> ref_ids;
704
+ size_t actualBatchSize = 0;
705
+
706
+ size_t examples_in_batch = 0;
707
+ bool skip_example = false;
708
+ for (size_t batchPosition = 0; batchPosition < batchSize && sid
709
+ != shard.end(); ++batchPosition) {
710
+ string input;
711
+ if (trainWithMultipleFolds)
712
+ input = inputSentencesFolds[myFold][*sid];
713
+ else
714
+ input = inputSentences[*sid];
715
+
716
+ Moses::Sentence *sentence = new Sentence();
717
+ stringstream in(input + "\n");
718
+ const vector<FactorType> inputFactorOrder = staticData.GetInputFactorOrder();
719
+ sentence->Read(in,inputFactorOrder);
720
+ cerr << "\nRank " << rank << ", epoch " << epoch << ", input sentence " << *sid << ": \"";
721
+ sentence->Print(cerr);
722
+ cerr << "\"" << " (batch pos " << batchPosition << ")" << endl;
723
+ size_t current_input_length = (*sentence).GetSize();
724
+
725
+ if (epoch == 0 && (scaleByAvgInputLength || scaleByAvgInverseLength)) {
726
+ sumOfInputs += current_input_length;
727
+ ++numberOfInputs;
728
+ avgInputLength = sumOfInputs/numberOfInputs;
729
+ decoder->setAvgInputLength(avgInputLength);
730
+ cerr << "Rank " << rank << ", epoch 0, average input length: " << avgInputLength << endl;
731
+ }
732
+
733
+ vector<ScoreComponentCollection> newFeatureValues;
734
+ vector<float> newScores;
735
+ if (model_hope_fear) {
736
+ featureValues.push_back(newFeatureValues);
737
+ bleuScores.push_back(newScores);
738
+ modelScores.push_back(newScores);
739
+ }
740
+ if (hope_fear || perceptron_update) {
741
+ featureValuesHope.push_back(newFeatureValues);
742
+ featureValuesFear.push_back(newFeatureValues);
743
+ bleuScoresHope.push_back(newScores);
744
+ bleuScoresFear.push_back(newScores);
745
+ modelScoresHope.push_back(newScores);
746
+ modelScoresFear.push_back(newScores);
747
+ if (historyBleu || simpleHistoryBleu || debug_model) {
748
+ featureValues.push_back(newFeatureValues);
749
+ bleuScores.push_back(newScores);
750
+ modelScores.push_back(newScores);
751
+ }
752
+ }
753
+ if (kbest) {
754
+ // for decoding
755
+ featureValues.push_back(newFeatureValues);
756
+ bleuScores.push_back(newScores);
757
+ modelScores.push_back(newScores);
758
+
759
+ // for storing selected examples
760
+ featureValuesHope.push_back(newFeatureValues);
761
+ featureValuesFear.push_back(newFeatureValues);
762
+ bleuScoresHope.push_back(newScores);
763
+ bleuScoresFear.push_back(newScores);
764
+ modelScoresHope.push_back(newScores);
765
+ modelScoresFear.push_back(newScores);
766
+ }
767
+
768
+ size_t ref_length;
769
+ float avg_ref_length;
770
+
771
+ if (print_weights)
772
+ cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: " << mosesWeights << endl;
773
+ if (print_core_weights) {
774
+ cerr << "Rank " << rank << ", epoch " << epoch << ", current weights: ";
775
+ mosesWeights.PrintCoreFeatures();
776
+ cerr << endl;
777
+ }
778
+
779
+ // check LM weight
780
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
781
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
782
+ const StatefulFeatureFunction *ff = statefulFFs[i];
783
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
784
+
785
+ if (lm) {
786
+ float lmWeight = mosesWeights.GetScoreForProducer(lm);
787
+ cerr << "Rank " << rank << ", epoch " << epoch << ", lm weight: " << lmWeight << endl;
788
+ if (lmWeight <= 0) {
789
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: language model weight should never be <= 0." << endl;
790
+ mosesWeights.Assign(lm, 0.1);
791
+ cerr << "Rank " << rank << ", epoch " << epoch << ", assign lm weights of 0.1" << endl;
792
+ }
793
+ }
794
+ }
795
+
796
+ // select inference scheme
797
+ cerr << "Rank " << rank << ", epoch " << epoch << ", real Bleu? " << realBleu << endl;
798
+ if (hope_fear || perceptron_update) {
799
+ // HOPE
800
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << hope_n <<
801
+ "best hope translations" << endl;
802
+ vector< vector<const Word*> > outputHope = decoder->getNBest(input, *sid, hope_n, 1.0, bleuWeight_hope,
803
+ featureValuesHope[batchPosition], bleuScoresHope[batchPosition], modelScoresHope[batchPosition],
804
+ 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
805
+ vector<const Word*> oracle = outputHope[0];
806
+ decoder->cleanup(chartDecoding);
807
+ ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
808
+ avg_ref_length = ref_length;
809
+ float hope_length_ratio = (float)oracle.size()/ref_length;
810
+ cerr << endl;
811
+
812
+ // count sparse features occurring in hope translation
813
+ featureValuesHope[batchPosition][0].IncrementSparseHopeFeatures();
814
+
815
+ vector<const Word*> bestModel;
816
+ if (debug_model || historyBleu || simpleHistoryBleu) {
817
+ // MODEL (for updating the history only, using dummy vectors)
818
+ cerr << "Rank " << rank << ", epoch " << epoch << ", 1best wrt model score (debug or history)" << endl;
819
+ vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
820
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
821
+ 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
822
+ bestModel = outputModel[0];
823
+ decoder->cleanup(chartDecoding);
824
+ cerr << endl;
825
+ ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
826
+ }
827
+
828
+ // FEAR
829
+ //float fear_length_ratio = 0;
830
+ float bleuRatioHopeFear = 0;
831
+ //int fearSize = 0;
832
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << fear_n << "best fear translations" << endl;
833
+ vector< vector<const Word*> > outputFear = decoder->getNBest(input, *sid, fear_n, -1.0, bleuWeight_fear,
834
+ featureValuesFear[batchPosition], bleuScoresFear[batchPosition], modelScoresFear[batchPosition],
835
+ 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
836
+ vector<const Word*> fear = outputFear[0];
837
+ decoder->cleanup(chartDecoding);
838
+ ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
839
+ avg_ref_length += ref_length;
840
+ avg_ref_length /= 2;
841
+ //fear_length_ratio = (float)fear.size()/ref_length;
842
+ //fearSize = (int)fear.size();
843
+ cerr << endl;
844
+ for (size_t i = 0; i < fear.size(); ++i)
845
+ delete fear[i];
846
+
847
+ // count sparse features occurring in fear translation
848
+ featureValuesFear[batchPosition][0].IncrementSparseFearFeatures();
849
+
850
+ // Bleu-related example selection
851
+ bool skip = false;
852
+ bleuRatioHopeFear = bleuScoresHope[batchPosition][0] / bleuScoresFear[batchPosition][0];
853
+ if (minBleuRatio != -1 && bleuRatioHopeFear < minBleuRatio)
854
+ skip = true;
855
+ if(maxBleuRatio != -1 && bleuRatioHopeFear > maxBleuRatio)
856
+ skip = true;
857
+
858
+ // sanity check
859
+ if (historyBleu || simpleHistoryBleu) {
860
+ if (bleuScores[batchPosition][0] > bleuScoresHope[batchPosition][0] &&
861
+ modelScores[batchPosition][0] > modelScoresHope[batchPosition][0]) {
862
+ if (abs(bleuScores[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon &&
863
+ abs(modelScores[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
864
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: MODEL translation better than HOPE translation." << endl;
865
+ skip = true;
866
+ }
867
+ }
868
+ if (bleuScoresFear[batchPosition][0] > bleuScores[batchPosition][0] &&
869
+ modelScoresFear[batchPosition][0] > modelScores[batchPosition][0]) {
870
+ if (abs(bleuScoresFear[batchPosition][0] - bleuScores[batchPosition][0]) > epsilon &&
871
+ abs(modelScoresFear[batchPosition][0] - modelScores[batchPosition][0]) > epsilon) {
872
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than MODEL translation." << endl;
873
+ skip = true;
874
+ }
875
+ }
876
+ }
877
+ if (bleuScoresFear[batchPosition][0] > bleuScoresHope[batchPosition][0]) {
878
+ if (abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) > epsilon) {
879
+ // check if it's an error or a warning
880
+ skip = true;
881
+ if (modelScoresFear[batchPosition][0] > modelScoresHope[batchPosition][0] && abs(modelScoresFear[batchPosition][0] - modelScoresHope[batchPosition][0]) > epsilon) {
882
+ cerr << "Rank " << rank << ", epoch " << epoch << ", ERROR: FEAR translation better than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
883
+ } else {
884
+ cerr << "Rank " << rank << ", epoch " << epoch << ", WARNING: FEAR translation has better Bleu than HOPE translation. (abs-diff: " << abs(bleuScoresFear[batchPosition][0] - bleuScoresHope[batchPosition][0]) << ")" <<endl;
885
+ }
886
+ }
887
+ }
888
+
889
+ if (skip) {
890
+ cerr << "Rank " << rank << ", epoch " << epoch << ", skip example (" << hope_length_ratio << ", " << bleuRatioHopeFear << ").. " << endl;
891
+ featureValuesHope[batchPosition].clear();
892
+ featureValuesFear[batchPosition].clear();
893
+ bleuScoresHope[batchPosition].clear();
894
+ bleuScoresFear[batchPosition].clear();
895
+ if (historyBleu || simpleHistoryBleu || debug_model) {
896
+ featureValues[batchPosition].clear();
897
+ bleuScores[batchPosition].clear();
898
+ }
899
+ } else {
900
+ examples_in_batch++;
901
+
902
+ // needed for history
903
+ if (historyBleu || simpleHistoryBleu) {
904
+ inputLengths.push_back(current_input_length);
905
+ ref_ids.push_back(*sid);
906
+ oneBests.push_back(bestModel);
907
+ }
908
+ }
909
+ }
910
+ if (model_hope_fear) {
911
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best hope translations" << endl;
912
+ size_t oraclePos = featureValues[batchPosition].size();
913
+ decoder->getNBest(input, *sid, n, 1.0, bleuWeight_hope,
914
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
915
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
916
+ //vector<const Word*> oracle = outputHope[0];
917
+ // needed for history
918
+ inputLengths.push_back(current_input_length);
919
+ ref_ids.push_back(*sid);
920
+ decoder->cleanup(chartDecoding);
921
+ //ref_length = decoder->getClosestReferenceLength(*sid, oracle.size());
922
+ //float hope_length_ratio = (float)oracle.size()/ref_length;
923
+ cerr << endl;
924
+
925
+ oracleFeatureValues.push_back(featureValues[batchPosition][oraclePos]);
926
+ oracleBleuScores.push_back(bleuScores[batchPosition][oraclePos]);
927
+ oracleModelScores.push_back(modelScores[batchPosition][oraclePos]);
928
+
929
+ // MODEL
930
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
931
+ if (historyBleu || simpleHistoryBleu) {
932
+ vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
933
+ bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
934
+ modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
935
+ vector<const Word*> bestModel = outputModel[0];
936
+ oneBests.push_back(bestModel);
937
+ inputLengths.push_back(current_input_length);
938
+ ref_ids.push_back(*sid);
939
+ } else {
940
+ decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
941
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
942
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
943
+ }
944
+ decoder->cleanup(chartDecoding);
945
+ //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
946
+ //float model_length_ratio = (float)bestModel.size()/ref_length;
947
+ cerr << endl;
948
+
949
+ // FEAR
950
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best fear translations" << endl;
951
+ decoder->getNBest(input, *sid, n, -1.0, bleuWeight_fear,
952
+ featureValues[batchPosition], bleuScores[batchPosition], modelScores[batchPosition],
953
+ 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
954
+ decoder->cleanup(chartDecoding);
955
+ //ref_length = decoder->getClosestReferenceLength(*sid, fear.size());
956
+ //float fear_length_ratio = (float)fear.size()/ref_length;
957
+
958
+ examples_in_batch++;
959
+ }
960
+ if (kbest) {
961
+ // MODEL
962
+ cerr << "Rank " << rank << ", epoch " << epoch << ", " << n << "best wrt model score" << endl;
963
+ if (historyBleu || simpleHistoryBleu) {
964
+ vector< vector<const Word*> > outputModel = decoder->getNBest(input, *sid, n, 0.0,
965
+ bleuWeight, featureValues[batchPosition], bleuScores[batchPosition],
966
+ modelScores[batchPosition], 1, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
967
+ vector<const Word*> bestModel = outputModel[0];
968
+ oneBests.push_back(bestModel);
969
+ inputLengths.push_back(current_input_length);
970
+ ref_ids.push_back(*sid);
971
+ } else {
972
+ decoder->getNBest(input, *sid, n, 0.0, bleuWeight,
973
+ featureValues[batchPosition], bleuScores[batchPosition],
974
+ modelScores[batchPosition], 0, realBleu, distinctNbest, avgRefLength, rank, epoch, "");
975
+ }
976
+ decoder->cleanup(chartDecoding);
977
+ //ref_length = decoder->getClosestReferenceLength(*sid, bestModel.size());
978
+ //float model_length_ratio = (float)bestModel.size()/ref_length;
979
+ cerr << endl;
980
+
981
+ examples_in_batch++;
982
+
983
+ HypothesisQueue queueHope(hope_n);
984
+ HypothesisQueue queueFear(fear_n);
985
+ cerr << endl;
986
+ if (most_violated || all_violated) {
987
+ float bleuHope = -1000;
988
+ float bleuFear = 1000;
989
+ int indexHope = -1;
990
+ int indexFear = -1;
991
+
992
+ vector<float> bleuHopeList;
993
+ vector<float> bleuFearList;
994
+ vector<float> indexHopeList;
995
+ vector<float> indexFearList;
996
+
997
+ if (most_violated)
998
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with most violated constraint" << endl;
999
+ else if (all_violated)
1000
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with violated constraints";
1001
+ else
1002
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick all pairs with hope";
1003
+
1004
+ // find best hope, then find fear that violates our constraint most
1005
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
1006
+ if (abs(bleuScores[batchPosition][i] - bleuHope) < epsilon) { // equal bleu scores
1007
+ if (modelScores[batchPosition][i] > modelScores[batchPosition][indexHope]) {
1008
+ if (abs(modelScores[batchPosition][i] - modelScores[batchPosition][indexHope]) > epsilon) {
1009
+ // better model score
1010
+ bleuHope = bleuScores[batchPosition][i];
1011
+ indexHope = i;
1012
+ }
1013
+ }
1014
+ } else if (bleuScores[batchPosition][i] > bleuHope) { // better than current best
1015
+ bleuHope = bleuScores[batchPosition][i];
1016
+ indexHope = i;
1017
+ }
1018
+ }
1019
+
1020
+ float currentViolation = 0;
1021
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
1022
+ float bleuDiff = bleuHope - bleuScores[batchPosition][i];
1023
+ float modelDiff = modelScores[batchPosition][indexHope] - modelScores[batchPosition][i];
1024
+ if ((bleuDiff > epsilon) && (modelDiff < bleuDiff)) {
1025
+ float diff = bleuDiff - modelDiff;
1026
+ if (diff > epsilon) {
1027
+ if (all_violated) {
1028
+ cerr << ".. adding pair";
1029
+ bleuHopeList.push_back(bleuHope);
1030
+ bleuFearList.push_back(bleuScores[batchPosition][i]);
1031
+ indexHopeList.push_back(indexHope);
1032
+ indexFearList.push_back(i);
1033
+ } else if (most_violated && diff > currentViolation) {
1034
+ currentViolation = diff;
1035
+ bleuFear = bleuScores[batchPosition][i];
1036
+ indexFear = i;
1037
+ cerr << "Rank " << rank << ", epoch " << epoch << ", current violation: " << currentViolation << " (" << modelDiff << " >= " << bleuDiff << ")" << endl;
1038
+ }
1039
+ }
1040
+ }
1041
+ }
1042
+
1043
+ if (most_violated) {
1044
+ if (currentViolation > 0) {
1045
+ cerr << "Rank " << rank << ", epoch " << epoch << ", adding pair with violation " << currentViolation << endl;
1046
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
1047
+ bleuScoresHope[batchPosition].push_back(bleuHope);
1048
+ bleuScoresFear[batchPosition].push_back(bleuFear);
1049
+ featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
1050
+ featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
1051
+ float modelScoreHope = modelScores[batchPosition][indexHope];
1052
+ float modelScoreFear = modelScores[batchPosition][indexFear];
1053
+ if (most_violated_reg) {
1054
+ // reduce model score difference by factor ~0.5
1055
+ float reg = currentViolation/4;
1056
+ modelScoreHope += abs(reg);
1057
+ modelScoreFear -= abs(reg);
1058
+ float newViolation = (bleuHope - bleuFear) - (modelScoreHope - modelScoreFear);
1059
+ cerr << "Rank " << rank << ", epoch " << epoch << ", regularized violation: " << newViolation << endl;
1060
+ }
1061
+ modelScoresHope[batchPosition].push_back(modelScoreHope);
1062
+ modelScoresFear[batchPosition].push_back(modelScoreFear);
1063
+
1064
+ featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
1065
+ featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
1066
+ } else {
1067
+ cerr << "Rank " << rank << ", epoch " << epoch << ", no violated constraint found." << endl;
1068
+ skip_example = 1;
1069
+ }
1070
+ } else cerr << endl;
1071
+ }
1072
+ if (max_bleu_diff) {
1073
+ cerr << "Rank " << rank << ", epoch " << epoch << ", pick pair with max Bleu diff from list: " << bleuScores[batchPosition].size() << endl;
1074
+ for (size_t i=0; i<bleuScores[batchPosition].size(); ++i) {
1075
+ float hopeScore = bleuScores[batchPosition][i];
1076
+ if (modelPlusBleu) hopeScore += modelScores[batchPosition][i];
1077
+ BleuIndexPair hope(hopeScore, i);
1078
+ queueHope.Push(hope);
1079
+
1080
+ float fearScore = -1*(bleuScores[batchPosition][i]);
1081
+ if (modelPlusBleu) fearScore += modelScores[batchPosition][i];
1082
+ BleuIndexPair fear(fearScore, i);
1083
+ queueFear.Push(fear);
1084
+ }
1085
+ skip_example = 0;
1086
+ }
1087
+ cerr << endl;
1088
+
1089
+ vector<BleuIndexPair> hopeList, fearList;
1090
+ for (size_t i=0; i<hope_n && !queueHope.Empty(); ++i) hopeList.push_back(queueHope.Pop());
1091
+ for (size_t i=0; i<fear_n && !queueFear.Empty(); ++i) fearList.push_back(queueFear.Pop());
1092
+ for (size_t i=0; i<hopeList.size(); ++i) {
1093
+ //float bleuHope = hopeList[i].first;
1094
+ size_t indexHope = hopeList[i].second;
1095
+ float bleuHope = bleuScores[batchPosition][indexHope];
1096
+ for (size_t j=0; j<fearList.size(); ++j) {
1097
+ //float bleuFear = -1*(fearList[j].first);
1098
+ size_t indexFear = fearList[j].second;
1099
+ float bleuFear = bleuScores[batchPosition][indexFear];
1100
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << bleuHope << " (" << indexHope << "), fear: " << bleuFear << " (" << indexFear << ")" << endl;
1101
+ bleuScoresHope[batchPosition].push_back(bleuHope);
1102
+ bleuScoresFear[batchPosition].push_back(bleuFear);
1103
+ featureValuesHope[batchPosition].push_back(featureValues[batchPosition][indexHope]);
1104
+ featureValuesFear[batchPosition].push_back(featureValues[batchPosition][indexFear]);
1105
+ float modelScoreHope = modelScores[batchPosition][indexHope];
1106
+ float modelScoreFear = modelScores[batchPosition][indexFear];
1107
+
1108
+ modelScoresHope[batchPosition].push_back(modelScoreHope);
1109
+ modelScoresFear[batchPosition].push_back(modelScoreFear);
1110
+
1111
+ featureValues[batchPosition][indexHope].IncrementSparseHopeFeatures();
1112
+ featureValues[batchPosition][indexFear].IncrementSparseFearFeatures();
1113
+ }
1114
+ }
1115
+ if (!makePairs)
1116
+ cerr << "Rank " << rank << ", epoch " << epoch << "summing up hope and fear vectors, no pairs" << endl;
1117
+ }
1118
+
1119
+ // next input sentence
1120
+ ++sid;
1121
+ ++actualBatchSize;
1122
+ ++shardPosition;
1123
+ } // end of batch loop
1124
+
1125
+ if (examples_in_batch == 0 || (kbest && skip_example)) {
1126
+ cerr << "Rank " << rank << ", epoch " << epoch << ", batch is empty." << endl;
1127
+ } else {
1128
+ vector<vector<float> > losses(actualBatchSize);
1129
+ if (model_hope_fear) {
1130
+ // Set loss for each sentence as BLEU(oracle) - BLEU(hypothesis)
1131
+ for (size_t batchPosition = 0; batchPosition < actualBatchSize; ++batchPosition) {
1132
+ for (size_t j = 0; j < bleuScores[batchPosition].size(); ++j) {
1133
+ losses[batchPosition].push_back(oracleBleuScores[batchPosition] - bleuScores[batchPosition][j]);
1134
+ }
1135
+ }
1136
+ }
1137
+
1138
+ // set weight for bleu feature to 0 before optimizing
1139
+ vector<FeatureFunction*>::const_iterator iter;
1140
+ const vector<FeatureFunction*> &featureFunctions2 = FeatureFunction::GetFeatureFunctions();
1141
+ for (iter = featureFunctions2.begin(); iter != featureFunctions2.end(); ++iter) {
1142
+ if ((*iter)->GetScoreProducerDescription() == "BleuScoreFeature") {
1143
+ mosesWeights.Assign(*iter, 0);
1144
+ break;
1145
+ }
1146
+ }
1147
+
1148
+ // scale LM feature (to avoid rapid changes)
1149
+ if (scale_lm) {
1150
+ cerr << "scale lm" << endl;
1151
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
1152
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
1153
+ const StatefulFeatureFunction *ff = statefulFFs[i];
1154
+ const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff);
1155
+
1156
+ if (lm) {
1157
+ // scale down score
1158
+ if (model_hope_fear) {
1159
+ scaleFeatureScore(lm, scale_lm_factor, featureValues, rank, epoch);
1160
+ } else {
1161
+ scaleFeatureScore(lm, scale_lm_factor, featureValuesHope, rank, epoch);
1162
+ scaleFeatureScore(lm, scale_lm_factor, featureValuesFear, rank, epoch);
1163
+ }
1164
+ }
1165
+ }
1166
+ }
1167
+
1168
+ // scale WP
1169
+ if (scale_wp) {
1170
+ // scale up weight
1171
+ WordPenaltyProducer &wp = WordPenaltyProducer::InstanceNonConst();
1172
+
1173
+ // scale down score
1174
+ if (model_hope_fear) {
1175
+ scaleFeatureScore(&wp, scale_wp_factor, featureValues, rank, epoch);
1176
+ } else {
1177
+ scaleFeatureScore(&wp, scale_wp_factor, featureValuesHope, rank, epoch);
1178
+ scaleFeatureScore(&wp, scale_wp_factor, featureValuesFear, rank, epoch);
1179
+ }
1180
+ }
1181
+
1182
+ // print out the feature values
1183
+ if (print_feature_values) {
1184
+ cerr << "\nRank " << rank << ", epoch " << epoch << ", feature values: " << endl;
1185
+ if (model_hope_fear) printFeatureValues(featureValues);
1186
+ else {
1187
+ cerr << "hope: " << endl;
1188
+ printFeatureValues(featureValuesHope);
1189
+ cerr << "fear: " << endl;
1190
+ printFeatureValues(featureValuesFear);
1191
+ }
1192
+ }
1193
+
1194
+ // apply learning rates to feature vectors before optimization
1195
+ if (feature_confidence) {
1196
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply feature learning rates with decays " << decay_core << "/" << decay_sparse << ": " << featureLearningRates << endl;
1197
+ if (model_hope_fear) {
1198
+ applyPerFeatureLearningRates(featureValues, featureLearningRates, sparse_r0);
1199
+ } else {
1200
+ applyPerFeatureLearningRates(featureValuesHope, featureLearningRates, sparse_r0);
1201
+ applyPerFeatureLearningRates(featureValuesFear, featureLearningRates, sparse_r0);
1202
+ }
1203
+ } else {
1204
+ // apply fixed learning rates
1205
+ cerr << "Rank " << rank << ", epoch " << epoch << ", apply fixed learning rates, core: " << core_r0 << ", sparse: " << sparse_r0 << endl;
1206
+ if (core_r0 != 1.0 || sparse_r0 != 1.0) {
1207
+ if (model_hope_fear) {
1208
+ applyLearningRates(featureValues, core_r0, sparse_r0);
1209
+ } else {
1210
+ applyLearningRates(featureValuesHope, core_r0, sparse_r0);
1211
+ applyLearningRates(featureValuesFear, core_r0, sparse_r0);
1212
+ }
1213
+ }
1214
+ }
1215
+
1216
+ // Run optimiser on batch:
1217
+ VERBOSE(1, "\nRank " << rank << ", epoch " << epoch << ", run optimiser:" << endl);
1218
+ size_t update_status = 1;
1219
+ ScoreComponentCollection weightUpdate;
1220
+ if (perceptron_update) {
1221
+ vector<vector<float> > dummy1;
1222
+ update_status = optimiser->updateWeightsHopeFear( weightUpdate, featureValuesHope,
1223
+ featureValuesFear, dummy1, dummy1, dummy1, dummy1, learning_rate, rank, epoch);
1224
+ } else if (hope_fear) {
1225
+ if (bleuScoresHope[0][0] >= min_oracle_bleu) {
1226
+ if (hope_n == 1 && fear_n ==1 && batchSize == 1 && !hildreth) {
1227
+ update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(weightUpdate,
1228
+ featureValuesHope[0][0], featureValuesFear[0][0], bleuScoresHope[0][0],
1229
+ bleuScoresFear[0][0], modelScoresHope[0][0], modelScoresFear[0][0], learning_rate, rank, epoch);
1230
+ } else
1231
+ update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
1232
+ featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
1233
+ modelScoresFear, learning_rate, rank, epoch);
1234
+ } else
1235
+ update_status = 1;
1236
+ } else if (kbest) {
1237
+ if (batchSize == 1 && featureValuesHope[0].size() == 1 && !hildreth) {
1238
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
1239
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
1240
+ update_status = ((MiraOptimiser*) optimiser)->updateWeightsAnalytically(
1241
+ weightUpdate, featureValuesHope[0][0], featureValuesFear[0][0],
1242
+ bleuScoresHope[0][0], bleuScoresFear[0][0], modelScoresHope[0][0],
1243
+ modelScoresFear[0][0], learning_rate, rank, epoch);
1244
+ } else {
1245
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score hope: " << modelScoresHope[0][0] << endl;
1246
+ cerr << "Rank " << rank << ", epoch " << epoch << ", model score fear: " << modelScoresFear[0][0] << endl;
1247
+ update_status = optimiser->updateWeightsHopeFear(weightUpdate, featureValuesHope,
1248
+ featureValuesFear, bleuScoresHope, bleuScoresFear, modelScoresHope,
1249
+ modelScoresFear, learning_rate, rank, epoch);
1250
+ }
1251
+ } else {
1252
+ // model_hope_fear
1253
+ update_status = ((MiraOptimiser*) optimiser)->updateWeights(weightUpdate,
1254
+ featureValues, losses, bleuScores, modelScores, oracleFeatureValues,
1255
+ oracleBleuScores, oracleModelScores, learning_rate, rank, epoch);
1256
+ }
1257
+
1258
+ // sumStillViolatedConstraints += update_status;
1259
+
1260
+ if (update_status == 0) { // if weights were updated
1261
+ // apply weight update
1262
+ if (debug)
1263
+ cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << weightUpdate << endl;
1264
+
1265
+ if (feature_confidence) {
1266
+ // update confidence counts based on weight update
1267
+ confidenceCounts.UpdateConfidenceCounts(weightUpdate, signed_counts);
1268
+
1269
+ // update feature learning rates
1270
+ featureLearningRates.UpdateLearningRates(decay_core, decay_sparse, confidenceCounts, core_r0, sparse_r0);
1271
+ }
1272
+
1273
+ // apply weight update to Moses weights
1274
+ mosesWeights.PlusEquals(weightUpdate);
1275
+
1276
+ if (normaliseWeights)
1277
+ mosesWeights.L1Normalise();
1278
+
1279
+ cumulativeWeights.PlusEquals(mosesWeights);
1280
+ if (sparseAverage) {
1281
+ ScoreComponentCollection binary;
1282
+ binary.SetToBinaryOf(mosesWeights);
1283
+ cumulativeWeightsBinary.PlusEquals(binary);
1284
+ }
1285
+
1286
+ ++numberOfUpdates;
1287
+ ++numberOfUpdatesThisEpoch;
1288
+ if (averageWeights) {
1289
+ ScoreComponentCollection averageWeights(cumulativeWeights);
1290
+ if (accumulateWeights) {
1291
+ averageWeights.DivideEquals(numberOfUpdates);
1292
+ } else {
1293
+ averageWeights.DivideEquals(numberOfUpdatesThisEpoch);
1294
+ }
1295
+
1296
+ mosesWeights = averageWeights;
1297
+ }
1298
+
1299
+ // set new Moses weights
1300
+ decoder->setWeights(mosesWeights);
1301
+ //cerr << "Rank " << rank << ", epoch " << epoch << ", new weights: " << mosesWeights << endl;
1302
+ }
1303
+
1304
+ // update history (for approximate document Bleu)
1305
+ if (historyBleu || simpleHistoryBleu) {
1306
+ for (size_t i = 0; i < oneBests.size(); ++i)
1307
+ cerr << "Rank " << rank << ", epoch " << epoch << ", update history with 1best length: " << oneBests[i].size() << " ";
1308
+ decoder->updateHistory(oneBests, inputLengths, ref_ids, rank, epoch);
1309
+ deleteTranslations(oneBests);
1310
+ }
1311
+ } // END TRANSLATE AND UPDATE BATCH
1312
+
1313
+ // size of all shards except for the last one
1314
+ size_t generalShardSize;
1315
+ if (trainWithMultipleFolds)
1316
+ generalShardSize = order.size()/coresPerFold;
1317
+ else
1318
+ generalShardSize = order.size()/size;
1319
+
1320
+ size_t mixing_base = mixingFrequency == 0 ? 0 : generalShardSize / mixingFrequency;
1321
+ size_t dumping_base = weightDumpFrequency == 0 ? 0 : generalShardSize / weightDumpFrequency;
1322
+ bool mix = evaluateModulo(shardPosition, mixing_base, actualBatchSize);
1323
+
1324
+ // mix weights?
1325
+ if (mix) {
1326
+ #ifdef MPI_ENABLE
1327
+ cerr << "Rank " << rank << ", epoch " << epoch << ", mixing weights.. " << endl;
1328
+ // collect all weights in mixedWeights and divide by number of processes
1329
+ mpi::reduce(world, mosesWeights, mixedWeights, SCCPlus(), 0);
1330
+
1331
+ // mix confidence counts
1332
+ //mpi::reduce(world, confidenceCounts, mixedConfidenceCounts, SCCPlus(), 0);
1333
+ ScoreComponentCollection totalBinary;
1334
+ if (sparseAverage) {
1335
+ ScoreComponentCollection binary;
1336
+ binary.SetToBinaryOf(mosesWeights);
1337
+ mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
1338
+ }
1339
+ if (rank == 0) {
1340
+ // divide by number of processes
1341
+ if (sparseNoAverage)
1342
+ mixedWeights.CoreDivideEquals(size); // average only core weights
1343
+ else if (sparseAverage)
1344
+ mixedWeights.DivideEquals(totalBinary);
1345
+ else
1346
+ mixedWeights.DivideEquals(size);
1347
+
1348
+ // divide confidence counts
1349
+ //mixedConfidenceCounts.DivideEquals(size);
1350
+
1351
+ // normalise weights after averaging
1352
+ if (normaliseWeights) {
1353
+ mixedWeights.L1Normalise();
1354
+ }
1355
+
1356
+ ++weightMixingThisEpoch;
1357
+
1358
+ if (pruneZeroWeights) {
1359
+ size_t pruned = mixedWeights.PruneZeroWeightFeatures();
1360
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1361
+ << pruned << " zero-weighted features pruned from mixedWeights." << endl;
1362
+
1363
+ pruned = cumulativeWeights.PruneZeroWeightFeatures();
1364
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1365
+ << pruned << " zero-weighted features pruned from cumulativeWeights." << endl;
1366
+ }
1367
+
1368
+ if (featureCutoff != -1 && weightMixingThisEpoch == mixingFrequency) {
1369
+ size_t pruned = mixedWeights.PruneSparseFeatures(featureCutoff);
1370
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1371
+ << pruned << " features pruned from mixedWeights." << endl;
1372
+
1373
+ pruned = cumulativeWeights.PruneSparseFeatures(featureCutoff);
1374
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1375
+ << pruned << " features pruned from cumulativeWeights." << endl;
1376
+ }
1377
+
1378
+ if (weightMixingThisEpoch == mixingFrequency || reg_on_every_mix) {
1379
+ if (l1_regularize) {
1380
+ size_t pruned;
1381
+ if (l1_reg_sparse)
1382
+ pruned = mixedWeights.SparseL1Regularize(l1_lambda);
1383
+ else
1384
+ pruned = mixedWeights.L1Regularize(l1_lambda);
1385
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1386
+ << "l1-reg. on mixedWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
1387
+ }
1388
+ if (l2_regularize) {
1389
+ if (l2_reg_sparse)
1390
+ mixedWeights.SparseL2Regularize(l2_lambda);
1391
+ else
1392
+ mixedWeights.L2Regularize(l2_lambda);
1393
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1394
+ << "l2-reg. on mixedWeights with lambda=" << l2_lambda << endl;
1395
+ }
1396
+ }
1397
+ }
1398
+
1399
+ // broadcast average weights from process 0
1400
+ mpi::broadcast(world, mixedWeights, 0);
1401
+ decoder->setWeights(mixedWeights);
1402
+ mosesWeights = mixedWeights;
1403
+
1404
+ // broadcast summed confidence counts
1405
+ //mpi::broadcast(world, mixedConfidenceCounts, 0);
1406
+ //confidenceCounts = mixedConfidenceCounts;
1407
+ #endif
1408
+ #ifndef MPI_ENABLE
1409
+ //cerr << "\nRank " << rank << ", no mixing, weights: " << mosesWeights << endl;
1410
+ mixedWeights = mosesWeights;
1411
+ #endif
1412
+ } // end mixing
1413
+
1414
+ // Dump weights?
1415
+ if (trainWithMultipleFolds || weightEpochDump == weightDumpFrequency) {
1416
+ // dump mixed weights at end of every epoch to enable continuing a crashed experiment
1417
+ // (for jackknife every time the weights are mixed)
1418
+ ostringstream filename;
1419
+ if (epoch < 10)
1420
+ filename << weightDumpStem << "_mixed_0" << epoch;
1421
+ else
1422
+ filename << weightDumpStem << "_mixed_" << epoch;
1423
+
1424
+ if (weightDumpFrequency > 1)
1425
+ filename << "_" << weightEpochDump;
1426
+
1427
+ mixedWeights.Save(filename.str());
1428
+ cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
1429
+ }
1430
+ if (dumpMixedWeights) {
1431
+ if (mix && rank == 0 && !weightDumpStem.empty()) {
1432
+ // dump mixed weights instead of average weights
1433
+ ostringstream filename;
1434
+ if (epoch < 10)
1435
+ filename << weightDumpStem << "_0" << epoch;
1436
+ else
1437
+ filename << weightDumpStem << "_" << epoch;
1438
+
1439
+ if (weightDumpFrequency > 1)
1440
+ filename << "_" << weightEpochDump;
1441
+
1442
+ cerr << "Dumping mixed weights during epoch " << epoch << " to " << filename.str() << endl << endl;
1443
+ mixedWeights.Save(filename.str());
1444
+ ++weightEpochDump;
1445
+ }
1446
+ } else {
1447
+ if (evaluateModulo(shardPosition, dumping_base, actualBatchSize)) {
1448
+ cerr << "Rank " << rank << ", epoch " << epoch << ", dump weights.. (pos: " << shardPosition << ", base: " << dumping_base << ")" << endl;
1449
+ ScoreComponentCollection tmpAverageWeights(cumulativeWeights);
1450
+ bool proceed = false;
1451
+ if (accumulateWeights) {
1452
+ if (numberOfUpdates > 0) {
1453
+ tmpAverageWeights.DivideEquals(numberOfUpdates);
1454
+ proceed = true;
1455
+ }
1456
+ } else {
1457
+ if (numberOfUpdatesThisEpoch > 0) {
1458
+ if (sparseNoAverage) // average only core weights
1459
+ tmpAverageWeights.CoreDivideEquals(numberOfUpdatesThisEpoch);
1460
+ else if (sparseAverage)
1461
+ tmpAverageWeights.DivideEquals(cumulativeWeightsBinary);
1462
+ else
1463
+ tmpAverageWeights.DivideEquals(numberOfUpdatesThisEpoch);
1464
+ proceed = true;
1465
+ }
1466
+ }
1467
+
1468
+ if (proceed) {
1469
+ #ifdef MPI_ENABLE
1470
+ // average across processes
1471
+ mpi::reduce(world, tmpAverageWeights, mixedAverageWeights, SCCPlus(), 0);
1472
+ ScoreComponentCollection totalBinary;
1473
+ if (sparseAverage) {
1474
+ ScoreComponentCollection binary;
1475
+ binary.SetToBinaryOf(mosesWeights);
1476
+ mpi::reduce(world, binary, totalBinary, SCCPlus(), 0);
1477
+ }
1478
+ #endif
1479
+ #ifndef MPI_ENABLE
1480
+ mixedAverageWeights = tmpAverageWeights;
1481
+ //FIXME: What do to for non-mpi version
1482
+ ScoreComponentCollection totalBinary;
1483
+ #endif
1484
+ if (rank == 0 && !weightDumpStem.empty()) {
1485
+ // divide by number of processes
1486
+ if (sparseNoAverage)
1487
+ mixedAverageWeights.CoreDivideEquals(size); // average only core weights
1488
+ else if (sparseAverage)
1489
+ mixedAverageWeights.DivideEquals(totalBinary);
1490
+ else
1491
+ mixedAverageWeights.DivideEquals(size);
1492
+
1493
+ // normalise weights after averaging
1494
+ if (normaliseWeights) {
1495
+ mixedAverageWeights.L1Normalise();
1496
+ }
1497
+
1498
+ // dump final average weights
1499
+ ostringstream filename;
1500
+ if (epoch < 10) {
1501
+ filename << weightDumpStem << "_0" << epoch;
1502
+ } else {
1503
+ filename << weightDumpStem << "_" << epoch;
1504
+ }
1505
+
1506
+ if (weightDumpFrequency > 1) {
1507
+ filename << "_" << weightEpochDump;
1508
+ }
1509
+
1510
+ /*if (accumulateWeights) {
1511
+ cerr << "\nMixed average weights (cumulative) during epoch " << epoch << ": " << mixedAverageWeights << endl;
1512
+ } else {
1513
+ cerr << "\nMixed average weights during epoch " << epoch << ": " << mixedAverageWeights << endl;
1514
+ }*/
1515
+
1516
+ cerr << "Dumping mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
1517
+ mixedAverageWeights.Save(filename.str());
1518
+ ++weightEpochDump;
1519
+
1520
+ if (weightEpochDump == weightDumpFrequency) {
1521
+ if (l1_regularize) {
1522
+ size_t pruned = mixedAverageWeights.SparseL1Regularize(l1_lambda);
1523
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1524
+ << "l1-reg. on mixedAverageWeights with lambda=" << l1_lambda << ", pruned: " << pruned << endl;
1525
+
1526
+ }
1527
+ if (l2_regularize) {
1528
+ mixedAverageWeights.SparseL2Regularize(l2_lambda);
1529
+ cerr << "Rank " << rank << ", epoch " << epoch << ", "
1530
+ << "l2-reg. on mixedAverageWeights with lambda=" << l2_lambda << endl;
1531
+ }
1532
+
1533
+ if (l1_regularize || l2_regularize) {
1534
+ filename << "_reg";
1535
+ cerr << "Dumping regularized mixed average weights during epoch " << epoch << " to " << filename.str() << endl << endl;
1536
+ mixedAverageWeights.Save(filename.str());
1537
+ }
1538
+ }
1539
+
1540
+ if (weightEpochDump == weightDumpFrequency && printFeatureCounts) {
1541
+ // print out all features with counts
1542
+ stringstream s1, s2;
1543
+ s1 << "sparse_feature_hope_counts" << "_" << epoch;
1544
+ s2 << "sparse_feature_fear_counts" << "_" << epoch;
1545
+ ofstream sparseFeatureCountsHope(s1.str().c_str());
1546
+ ofstream sparseFeatureCountsFear(s2.str().c_str());
1547
+
1548
+ mixedAverageWeights.PrintSparseHopeFeatureCounts(sparseFeatureCountsHope);
1549
+ mixedAverageWeights.PrintSparseFearFeatureCounts(sparseFeatureCountsFear);
1550
+ sparseFeatureCountsHope.close();
1551
+ sparseFeatureCountsFear.close();
1552
+ }
1553
+ }
1554
+ }
1555
+ }// end dumping
1556
+ } // end if dump
1557
+ } // end of shard loop, end of this epoch
1558
+ cerr << "Rank " << rank << ", epoch " << epoch << ", end of epoch.." << endl;
1559
+
1560
+ if (historyBleu || simpleHistoryBleu) {
1561
+ cerr << "Bleu feature history after epoch " << epoch << endl;
1562
+ decoder->printBleuFeatureHistory(cerr);
1563
+ }
1564
+ // cerr << "Rank " << rank << ", epoch " << epoch << ", sum of violated constraints: " << sumStillViolatedConstraints << endl;
1565
+
1566
+ // Check whether there were any weight updates during this epoch
1567
+ size_t sumUpdates;
1568
+ size_t *sendbuf_uint, *recvbuf_uint;
1569
+ sendbuf_uint = (size_t *) malloc(sizeof(size_t));
1570
+ recvbuf_uint = (size_t *) malloc(sizeof(size_t));
1571
+ #ifdef MPI_ENABLE
1572
+ sendbuf_uint[0] = numberOfUpdatesThisEpoch;
1573
+ recvbuf_uint[0] = 0;
1574
+ MPI_Reduce(sendbuf_uint, recvbuf_uint, 1, MPI_UNSIGNED, MPI_SUM, 0, world);
1575
+ sumUpdates = recvbuf_uint[0];
1576
+ #endif
1577
+ #ifndef MPI_ENABLE
1578
+ sumUpdates = numberOfUpdatesThisEpoch;
1579
+ #endif
1580
+ if (rank == 0 && sumUpdates == 0) {
1581
+ cerr << "\nNo weight updates during this epoch.. stopping." << endl;
1582
+ stop = true;
1583
+ #ifdef MPI_ENABLE
1584
+ mpi::broadcast(world, stop, 0);
1585
+ #endif
1586
+ }
1587
+
1588
+ if (!stop) {
1589
+ // Test if weights have converged
1590
+ if (weightConvergence) {
1591
+ bool reached = true;
1592
+ if (rank == 0 && (epoch >= 2)) {
1593
+ ScoreComponentCollection firstDiff, secondDiff;
1594
+ if (dumpMixedWeights) {
1595
+ firstDiff = mixedWeights;
1596
+ firstDiff.MinusEquals(mixedWeightsPrevious);
1597
+ secondDiff = mixedWeights;
1598
+ secondDiff.MinusEquals(mixedWeightsBeforePrevious);
1599
+ } else {
1600
+ firstDiff = mixedAverageWeights;
1601
+ firstDiff.MinusEquals(mixedAverageWeightsPrevious);
1602
+ secondDiff = mixedAverageWeights;
1603
+ secondDiff.MinusEquals(mixedAverageWeightsBeforePrevious);
1604
+ }
1605
+ VERBOSE(1, "Average weight changes since previous epoch: " << firstDiff << " (max: " << firstDiff.GetLInfNorm() << ")" << endl);
1606
+ VERBOSE(1, "Average weight changes since before previous epoch: " << secondDiff << " (max: " << secondDiff.GetLInfNorm() << ")" << endl << endl);
1607
+
1608
+ // check whether stopping criterion has been reached
1609
+ // (both difference vectors must have all weight changes smaller than min_weight_change)
1610
+ if (firstDiff.GetLInfNorm() >= min_weight_change)
1611
+ reached = false;
1612
+ if (secondDiff.GetLInfNorm() >= min_weight_change)
1613
+ reached = false;
1614
+ if (reached) {
1615
+ // stop MIRA
1616
+ stop = true;
1617
+ cerr << "\nWeights have converged after epoch " << epoch << ".. stopping MIRA." << endl;
1618
+ ScoreComponentCollection dummy;
1619
+ ostringstream endfilename;
1620
+ endfilename << "stopping";
1621
+ dummy.Save(endfilename.str());
1622
+ }
1623
+ }
1624
+
1625
+ mixedWeightsBeforePrevious = mixedWeightsPrevious;
1626
+ mixedWeightsPrevious = mixedWeights;
1627
+ mixedAverageWeightsBeforePrevious = mixedAverageWeightsPrevious;
1628
+ mixedAverageWeightsPrevious = mixedAverageWeights;
1629
+ #ifdef MPI_ENABLE
1630
+ mpi::broadcast(world, stop, 0);
1631
+ #endif
1632
+ } //end if (weightConvergence)
1633
+ }
1634
+ } // end of epoch loop
1635
+
1636
+ #ifdef MPI_ENABLE
1637
+ MPI_Finalize();
1638
+ #endif
1639
+
1640
+ time(&now);
1641
+ cerr << "Rank " << rank << ", " << ctime(&now);
1642
+
1643
+ if (rank == 0) {
1644
+ ScoreComponentCollection dummy;
1645
+ ostringstream endfilename;
1646
+ endfilename << "finished";
1647
+ dummy.Save(endfilename.str());
1648
+ }
1649
+
1650
+ delete decoder;
1651
+ exit(0);
1652
+ }
1653
+
1654
// Read every line of `filename` into `sentences` (one element per line,
// newline stripped by getline). Returns false if the file cannot be opened,
// true otherwise; previously loaded contents of `sentences` are kept.
bool loadSentences(const string& filename, vector<string>& sentences)
{
  ifstream in(filename.c_str());
  if (!in)
    return false;
  // Append each line until EOF; the stream closes automatically on scope exit.
  for (string line; getline(in, line); )
    sentences.push_back(line);
  return true;
}
1664
+
1665
// Decide whether to mix or dump weights at this point in the shard.
// Returns true when any of the `actual_batch_size` positions just processed
// (shard_position, shard_position-1, ...) falls on a multiple of
// `mix_or_dump_base`; a base of 0 disables the check entirely.
bool evaluateModulo(size_t shard_position, size_t mix_or_dump_base, size_t actual_batch_size)
{
  if (mix_or_dump_base == 0)
    return false;

  // Single-sentence batches: a plain modulo test suffices.
  if (actual_batch_size <= 1)
    return (shard_position % mix_or_dump_base) == 0;

  // Batched case: the batch covers several shard positions, so test each
  // position the batch advanced over.
  for (size_t remaining = actual_batch_size; remaining > 0; --remaining) {
    if (shard_position % mix_or_dump_base == 0)
      return true;
    --shard_position;
  }
  return false;
}
1684
+
1685
+ void printFeatureValues(vector<vector<ScoreComponentCollection> > &featureValues)
1686
+ {
1687
+ for (size_t i = 0; i < featureValues.size(); ++i) {
1688
+ for (size_t j = 0; j < featureValues[i].size(); ++j) {
1689
+ cerr << featureValues[i][j] << endl;
1690
+ }
1691
+ }
1692
+ cerr << endl;
1693
+ }
1694
+
1695
+ void deleteTranslations(vector<vector<const Word*> > &translations)
1696
+ {
1697
+ for (size_t i = 0; i < translations.size(); ++i) {
1698
+ for (size_t j = 0; j < translations[i].size(); ++j) {
1699
+ delete translations[i][j];
1700
+ }
1701
+ }
1702
+ }
1703
+
1704
+ void decodeHopeOrFear(size_t rank, size_t size, size_t decode, string filename, vector<string> &inputSentences, MosesDecoder* decoder, size_t n, float bleuWeight)
1705
+ {
1706
+ if (decode == 1)
1707
+ cerr << "Rank " << rank << ", decoding dev input set according to hope objective.. " << endl;
1708
+ else if (decode == 2)
1709
+ cerr << "Rank " << rank << ", decoding dev input set according to fear objective.. " << endl;
1710
+ else
1711
+ cerr << "Rank " << rank << ", decoding dev input set according to normal objective.. " << endl;
1712
+
1713
+ // Create shards according to the number of processes used
1714
+ vector<size_t> order;
1715
+ for (size_t i = 0; i < inputSentences.size(); ++i)
1716
+ order.push_back(i);
1717
+
1718
+ vector<size_t> shard;
1719
+ float shardSize = (float) (order.size()) / size;
1720
+ size_t shardStart = (size_t) (shardSize * rank);
1721
+ size_t shardEnd = (size_t) (shardSize * (rank + 1));
1722
+ if (rank == size - 1) {
1723
+ shardEnd = inputSentences.size();
1724
+ shardSize = shardEnd - shardStart;
1725
+ }
1726
+ VERBOSE(1, "Rank " << rank << ", shard start: " << shardStart << " Shard end: " << shardEnd << endl);
1727
+ VERBOSE(1, "Rank " << rank << ", shard size: " << shardSize << endl);
1728
+ shard.resize(shardSize);
1729
+ copy(order.begin() + shardStart, order.begin() + shardEnd, shard.begin());
1730
+
1731
+ // open files for writing
1732
+ stringstream fname;
1733
+ fname << filename << ".rank" << rank;
1734
+ filename = fname.str();
1735
+ ostringstream filename_nbest;
1736
+ filename_nbest << filename << "." << n << "best";
1737
+ ofstream out(filename.c_str());
1738
+ ofstream nbest_out((filename_nbest.str()).c_str());
1739
+ if (!out) {
1740
+ ostringstream msg;
1741
+ msg << "Unable to open " << fname.str();
1742
+ throw runtime_error(msg.str());
1743
+ }
1744
+ if (!nbest_out) {
1745
+ ostringstream msg;
1746
+ msg << "Unable to open " << filename_nbest;
1747
+ throw runtime_error(msg.str());
1748
+ }
1749
+
1750
+ for (size_t i = 0; i < shard.size(); ++i) {
1751
+ size_t sid = shard[i];
1752
+ string& input = inputSentences[sid];
1753
+
1754
+ vector<vector<ScoreComponentCollection> > dummyFeatureValues;
1755
+ vector<vector<float> > dummyBleuScores;
1756
+ vector<vector<float> > dummyModelScores;
1757
+
1758
+ vector<ScoreComponentCollection> newFeatureValues;
1759
+ vector<float> newScores;
1760
+ dummyFeatureValues.push_back(newFeatureValues);
1761
+ dummyBleuScores.push_back(newScores);
1762
+ dummyModelScores.push_back(newScores);
1763
+
1764
+ float factor = 0.0;
1765
+ if (decode == 1) factor = 1.0;
1766
+ if (decode == 2) factor = -1.0;
1767
+ cerr << "Rank " << rank << ", translating sentence " << sid << endl;
1768
+ bool realBleu = false;
1769
+ vector< vector<const Word*> > nbestOutput = decoder->getNBest(input, sid, n, factor, bleuWeight, dummyFeatureValues[0],
1770
+ dummyBleuScores[0], dummyModelScores[0], n, realBleu, true, false, rank, 0, "");
1771
+ cerr << endl;
1772
+ decoder->cleanup(StaticData::Instance().IsChart());
1773
+
1774
+ for (size_t i = 0; i < nbestOutput.size(); ++i) {
1775
+ vector<const Word*> output = nbestOutput[i];
1776
+ stringstream translation;
1777
+ for (size_t k = 0; k < output.size(); ++k) {
1778
+ Word* w = const_cast<Word*>(output[k]);
1779
+ translation << w->GetString(0);
1780
+ translation << " ";
1781
+ }
1782
+
1783
+ if (i == 0)
1784
+ out << translation.str() << endl;
1785
+ nbest_out << sid << " ||| " << translation.str() << " ||| " << dummyFeatureValues[0][i] <<
1786
+ " ||| " << dummyModelScores[0][i] << " ||| sBleu=" << dummyBleuScores[0][i] << endl;
1787
+ }
1788
+ }
1789
+
1790
+ out.close();
1791
+ nbest_out.close();
1792
+ cerr << "Closing files " << filename << " and " << filename_nbest.str() << endl;
1793
+
1794
+ #ifdef MPI_ENABLE
1795
+ MPI_Finalize();
1796
+ #endif
1797
+
1798
+ time_t now;
1799
+ time(&now);
1800
+ cerr << "Rank " << rank << ", " << ctime(&now);
1801
+
1802
+ delete decoder;
1803
+ exit(0);
1804
+ }
1805
+
1806
+ void applyLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, float core_r0, float sparse_r0)
1807
+ {
1808
+ for (size_t i=0; i<featureValues.size(); ++i) // each item in batch
1809
+ for (size_t j=0; j<featureValues[i].size(); ++j) // each item in nbest
1810
+ featureValues[i][j].MultiplyEquals(core_r0, sparse_r0);
1811
+ }
1812
+
1813
+ void applyPerFeatureLearningRates(vector<vector<ScoreComponentCollection> > &featureValues, ScoreComponentCollection featureLearningRates, float sparse_r0)
1814
+ {
1815
+ for (size_t i=0; i<featureValues.size(); ++i) // each item in batch
1816
+ for (size_t j=0; j<featureValues[i].size(); ++j) // each item in nbest
1817
+ featureValues[i][j].MultiplyEqualsBackoff(featureLearningRates, sparse_r0);
1818
+ }
1819
+
1820
+ void scaleFeatureScore(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
1821
+ {
1822
+ string name = sp->GetScoreProducerDescription();
1823
+
1824
+ // scale down score
1825
+ float featureScore;
1826
+ for (size_t i=0; i<featureValues.size(); ++i) { // each item in batch
1827
+ for (size_t j=0; j<featureValues[i].size(); ++j) { // each item in nbest
1828
+ featureScore = featureValues[i][j].GetScoreForProducer(sp);
1829
+ featureValues[i][j].Assign(sp, featureScore*scaling_factor);
1830
+ //cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << featureScore << " to " << featureScore/scaling_factor << endl;
1831
+ }
1832
+ }
1833
+ }
1834
+
1835
+ void scaleFeatureScores(const FeatureFunction *sp, float scaling_factor, vector<vector<ScoreComponentCollection> > &featureValues, size_t rank, size_t epoch)
1836
+ {
1837
+ string name = sp->GetScoreProducerDescription();
1838
+
1839
+ // scale down score
1840
+ for (size_t i=0; i<featureValues.size(); ++i) { // each item in batch
1841
+ for (size_t j=0; j<featureValues[i].size(); ++j) { // each item in nbest
1842
+ vector<float> featureScores = featureValues[i][j].GetScoresForProducer(sp);
1843
+ for (size_t k=0; k<featureScores.size(); ++k)
1844
+ featureScores[k] *= scaling_factor;
1845
+ featureValues[i][j].Assign(sp, featureScores);
1846
+ //cerr << "Rank " << rank << ", epoch " << epoch << ", " << name << " score scaled from " << featureScore << " to " << featureScore/scaling_factor << endl;
1847
+ }
1848
+ }
1849
+ }
mosesdecoder/contrib/mira/Perceptron.cpp ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "Optimiser.h"
21
+
22
+ using namespace Moses;
23
+ using namespace std;
24
+
25
+ namespace Mira
26
+ {
27
+
28
+ size_t Perceptron::updateWeightsHopeFear(
29
+ ScoreComponentCollection& weightUpdate,
30
+ const vector< vector<ScoreComponentCollection> >& featureValuesHope,
31
+ const vector< vector<ScoreComponentCollection> >& featureValuesFear,
32
+ const vector< vector<float> >& dummy1,
33
+ const vector< vector<float> >& dummy2,
34
+ const vector< vector<float> >& dummy3,
35
+ const vector< vector<float> >& dummy4,
36
+ float perceptron_learning_rate,
37
+ size_t rank,
38
+ size_t epoch,
39
+ int updatePosition)
40
+ {
41
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope: " << featureValuesHope[0][0] << endl;
42
+ cerr << "Rank " << rank << ", epoch " << epoch << ", fear: " << featureValuesFear[0][0] << endl;
43
+ ScoreComponentCollection featureValueDiff = featureValuesHope[0][0];
44
+ featureValueDiff.MinusEquals(featureValuesFear[0][0]);
45
+ cerr << "Rank " << rank << ", epoch " << epoch << ", hope - fear: " << featureValueDiff << endl;
46
+ featureValueDiff.MultiplyEquals(perceptron_learning_rate);
47
+ weightUpdate.PlusEquals(featureValueDiff);
48
+ cerr << "Rank " << rank << ", epoch " << epoch << ", update: " << featureValueDiff << endl;
49
+ return 0;
50
+ }
51
+
52
+ }
53
+
mosesdecoder/contrib/mira/mira.xcodeproj/project.pbxproj ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // !$*UTF8*$!
2
+ {
3
+ archiveVersion = 1;
4
+ classes = {
5
+ };
6
+ objectVersion = 45;
7
+ objects = {
8
+
9
+ /* Begin PBXBuildFile section */
10
+ 1E141A311243527800123194 /* Perceptron.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E141A2F1243527800123194 /* Perceptron.cpp */; };
11
+ 1E56EBF51243B91600E8315C /* MiraOptimiser.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E56EBF41243B91600E8315C /* MiraOptimiser.cpp */; };
12
+ 1E9DC63C1242602F0059001A /* Decoder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E9DC6391242602F0059001A /* Decoder.cpp */; };
13
+ 1E9DC63D1242602F0059001A /* Main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E9DC63B1242602F0059001A /* Main.cpp */; };
14
+ 1E9DC6DA1242684C0059001A /* libmoses-chart.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E9DC6D1124268310059001A /* libmoses-chart.a */; };
15
+ 1E9DC6DB124268510059001A /* libmoses.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E9DC6CB124268270059001A /* libmoses.a */; };
16
+ 1E9DC6DC124268580059001A /* libOnDiskPt.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 1E9DC6D9124268440059001A /* libOnDiskPt.a */; };
17
+ 8DD76F6A0486A84900D96B5E /* mira.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6859E8B029090EE04C91782 /* mira.1 */; };
18
+ /* End PBXBuildFile section */
19
+
20
+ /* Begin PBXContainerItemProxy section */
21
+ 1E9DC6CA124268270059001A /* PBXContainerItemProxy */ = {
22
+ isa = PBXContainerItemProxy;
23
+ containerPortal = 1E9DC6C6124268270059001A /* moses.xcodeproj */;
24
+ proxyType = 2;
25
+ remoteGlobalIDString = D2AAC046055464E500DB518D;
26
+ remoteInfo = moses;
27
+ };
28
+ 1E9DC6D0124268310059001A /* PBXContainerItemProxy */ = {
29
+ isa = PBXContainerItemProxy;
30
+ containerPortal = 1E9DC6CC124268310059001A /* moses-chart.xcodeproj */;
31
+ proxyType = 2;
32
+ remoteGlobalIDString = D2AAC046055464E500DB518D;
33
+ remoteInfo = "moses-chart";
34
+ };
35
+ 1E9DC6D8124268440059001A /* PBXContainerItemProxy */ = {
36
+ isa = PBXContainerItemProxy;
37
+ containerPortal = 1E9DC6D4124268440059001A /* OnDiskPt.xcodeproj */;
38
+ proxyType = 2;
39
+ remoteGlobalIDString = D2AAC046055464E500DB518D;
40
+ remoteInfo = OnDiskPt;
41
+ };
42
+ 1EF4E84C12440612006233A0 /* PBXContainerItemProxy */ = {
43
+ isa = PBXContainerItemProxy;
44
+ containerPortal = 1E9DC6C6124268270059001A /* moses.xcodeproj */;
45
+ proxyType = 1;
46
+ remoteGlobalIDString = D2AAC045055464E500DB518D /* moses */;
47
+ remoteInfo = moses;
48
+ };
49
+ 1EF4E84E12440612006233A0 /* PBXContainerItemProxy */ = {
50
+ isa = PBXContainerItemProxy;
51
+ containerPortal = 1E9DC6CC124268310059001A /* moses-chart.xcodeproj */;
52
+ proxyType = 1;
53
+ remoteGlobalIDString = D2AAC045055464E500DB518D /* moses-chart */;
54
+ remoteInfo = "moses-chart";
55
+ };
56
+ 1EF4E85012440612006233A0 /* PBXContainerItemProxy */ = {
57
+ isa = PBXContainerItemProxy;
58
+ containerPortal = 1E9DC6D4124268440059001A /* OnDiskPt.xcodeproj */;
59
+ proxyType = 1;
60
+ remoteGlobalIDString = D2AAC045055464E500DB518D /* OnDiskPt */;
61
+ remoteInfo = OnDiskPt;
62
+ };
63
+ /* End PBXContainerItemProxy section */
64
+
65
+ /* Begin PBXCopyFilesBuildPhase section */
66
+ 8DD76F690486A84900D96B5E /* CopyFiles */ = {
67
+ isa = PBXCopyFilesBuildPhase;
68
+ buildActionMask = 8;
69
+ dstPath = /usr/share/man/man1/;
70
+ dstSubfolderSpec = 0;
71
+ files = (
72
+ 8DD76F6A0486A84900D96B5E /* mira.1 in CopyFiles */,
73
+ );
74
+ runOnlyForDeploymentPostprocessing = 1;
75
+ };
76
+ /* End PBXCopyFilesBuildPhase section */
77
+
78
+ /* Begin PBXFileReference section */
79
+ 1E141A2F1243527800123194 /* Perceptron.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Perceptron.cpp; sourceTree = "<group>"; };
80
+ 1E56EBF41243B91600E8315C /* MiraOptimiser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = MiraOptimiser.cpp; sourceTree = "<group>"; };
81
+ 1E9DC6391242602F0059001A /* Decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Decoder.cpp; sourceTree = "<group>"; };
82
+ 1E9DC63A1242602F0059001A /* Decoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Decoder.h; sourceTree = "<group>"; };
83
+ 1E9DC63B1242602F0059001A /* Main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Main.cpp; sourceTree = "<group>"; };
84
+ 1E9DC63E124260370059001A /* Optimiser.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Optimiser.h; sourceTree = "<group>"; };
85
+ 1E9DC6C6124268270059001A /* moses.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; name = moses.xcodeproj; path = ../moses/moses.xcodeproj; sourceTree = SOURCE_ROOT; };
86
+ 1E9DC6CC124268310059001A /* moses-chart.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; name = "moses-chart.xcodeproj"; path = "../moses-chart/moses-chart.xcodeproj"; sourceTree = SOURCE_ROOT; };
87
+ 1E9DC6D4124268440059001A /* OnDiskPt.xcodeproj */ = {isa = PBXFileReference; lastKnownFileType = "wrapper.pb-project"; name = OnDiskPt.xcodeproj; path = ../OnDiskPt/OnDiskPt.xcodeproj; sourceTree = SOURCE_ROOT; };
88
+ 1E9DC76712426FC60059001A /* Main.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Main.h; sourceTree = "<group>"; };
89
+ 8DD76F6C0486A84900D96B5E /* mira */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = mira; sourceTree = BUILT_PRODUCTS_DIR; };
90
+ C6859E8B029090EE04C91782 /* mira.1 */ = {isa = PBXFileReference; lastKnownFileType = text.man; path = mira.1; sourceTree = "<group>"; };
91
+ /* End PBXFileReference section */
92
+
93
+ /* Begin PBXFrameworksBuildPhase section */
94
+ 8DD76F660486A84900D96B5E /* Frameworks */ = {
95
+ isa = PBXFrameworksBuildPhase;
96
+ buildActionMask = 2147483647;
97
+ files = (
98
+ 1E9DC6DC124268580059001A /* libOnDiskPt.a in Frameworks */,
99
+ 1E9DC6DB124268510059001A /* libmoses.a in Frameworks */,
100
+ 1E9DC6DA1242684C0059001A /* libmoses-chart.a in Frameworks */,
101
+ );
102
+ runOnlyForDeploymentPostprocessing = 0;
103
+ };
104
+ /* End PBXFrameworksBuildPhase section */
105
+
106
+ /* Begin PBXGroup section */
107
+ 08FB7794FE84155DC02AAC07 /* mira */ = {
108
+ isa = PBXGroup;
109
+ children = (
110
+ 1E9DC6D4124268440059001A /* OnDiskPt.xcodeproj */,
111
+ 1E9DC6CC124268310059001A /* moses-chart.xcodeproj */,
112
+ 1E9DC6C6124268270059001A /* moses.xcodeproj */,
113
+ 08FB7795FE84155DC02AAC07 /* Source */,
114
+ C6859E8C029090F304C91782 /* Documentation */,
115
+ 1AB674ADFE9D54B511CA2CBB /* Products */,
116
+ );
117
+ name = mira;
118
+ sourceTree = "<group>";
119
+ };
120
+ 08FB7795FE84155DC02AAC07 /* Source */ = {
121
+ isa = PBXGroup;
122
+ children = (
123
+ 1E56EBF41243B91600E8315C /* MiraOptimiser.cpp */,
124
+ 1E141A2F1243527800123194 /* Perceptron.cpp */,
125
+ 1E9DC63E124260370059001A /* Optimiser.h */,
126
+ 1E9DC6391242602F0059001A /* Decoder.cpp */,
127
+ 1E9DC63A1242602F0059001A /* Decoder.h */,
128
+ 1E9DC63B1242602F0059001A /* Main.cpp */,
129
+ 1E9DC76712426FC60059001A /* Main.h */,
130
+ );
131
+ name = Source;
132
+ sourceTree = "<group>";
133
+ };
134
+ 1AB674ADFE9D54B511CA2CBB /* Products */ = {
135
+ isa = PBXGroup;
136
+ children = (
137
+ 8DD76F6C0486A84900D96B5E /* mira */,
138
+ );
139
+ name = Products;
140
+ sourceTree = "<group>";
141
+ };
142
+ 1E9DC6C7124268270059001A /* Products */ = {
143
+ isa = PBXGroup;
144
+ children = (
145
+ 1E9DC6CB124268270059001A /* libmoses.a */,
146
+ );
147
+ name = Products;
148
+ sourceTree = "<group>";
149
+ };
150
+ 1E9DC6CD124268310059001A /* Products */ = {
151
+ isa = PBXGroup;
152
+ children = (
153
+ 1E9DC6D1124268310059001A /* libmoses-chart.a */,
154
+ );
155
+ name = Products;
156
+ sourceTree = "<group>";
157
+ };
158
+ 1E9DC6D5124268440059001A /* Products */ = {
159
+ isa = PBXGroup;
160
+ children = (
161
+ 1E9DC6D9124268440059001A /* libOnDiskPt.a */,
162
+ );
163
+ name = Products;
164
+ sourceTree = "<group>";
165
+ };
166
+ C6859E8C029090F304C91782 /* Documentation */ = {
167
+ isa = PBXGroup;
168
+ children = (
169
+ C6859E8B029090EE04C91782 /* mira.1 */,
170
+ );
171
+ name = Documentation;
172
+ sourceTree = "<group>";
173
+ };
174
+ /* End PBXGroup section */
175
+
176
+ /* Begin PBXNativeTarget section */
177
+ 8DD76F620486A84900D96B5E /* mira */ = {
178
+ isa = PBXNativeTarget;
179
+ buildConfigurationList = 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "mira" */;
180
+ buildPhases = (
181
+ 8DD76F640486A84900D96B5E /* Sources */,
182
+ 8DD76F660486A84900D96B5E /* Frameworks */,
183
+ 8DD76F690486A84900D96B5E /* CopyFiles */,
184
+ );
185
+ buildRules = (
186
+ );
187
+ dependencies = (
188
+ 1EF4E84D12440612006233A0 /* PBXTargetDependency */,
189
+ 1EF4E84F12440612006233A0 /* PBXTargetDependency */,
190
+ 1EF4E85112440612006233A0 /* PBXTargetDependency */,
191
+ );
192
+ name = mira;
193
+ productInstallPath = "$(HOME)/bin";
194
+ productName = mira;
195
+ productReference = 8DD76F6C0486A84900D96B5E /* mira */;
196
+ productType = "com.apple.product-type.tool";
197
+ };
198
+ /* End PBXNativeTarget section */
199
+
200
+ /* Begin PBXProject section */
201
+ 08FB7793FE84155DC02AAC07 /* Project object */ = {
202
+ isa = PBXProject;
203
+ buildConfigurationList = 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "mira" */;
204
+ compatibilityVersion = "Xcode 3.1";
205
+ hasScannedForEncodings = 1;
206
+ mainGroup = 08FB7794FE84155DC02AAC07 /* mira */;
207
+ projectDirPath = "";
208
+ projectReferences = (
209
+ {
210
+ ProductGroup = 1E9DC6CD124268310059001A /* Products */;
211
+ ProjectRef = 1E9DC6CC124268310059001A /* moses-chart.xcodeproj */;
212
+ },
213
+ {
214
+ ProductGroup = 1E9DC6C7124268270059001A /* Products */;
215
+ ProjectRef = 1E9DC6C6124268270059001A /* moses.xcodeproj */;
216
+ },
217
+ {
218
+ ProductGroup = 1E9DC6D5124268440059001A /* Products */;
219
+ ProjectRef = 1E9DC6D4124268440059001A /* OnDiskPt.xcodeproj */;
220
+ },
221
+ );
222
+ projectRoot = "";
223
+ targets = (
224
+ 8DD76F620486A84900D96B5E /* mira */,
225
+ );
226
+ };
227
+ /* End PBXProject section */
228
+
229
+ /* Begin PBXReferenceProxy section */
230
+ 1E9DC6CB124268270059001A /* libmoses.a */ = {
231
+ isa = PBXReferenceProxy;
232
+ fileType = archive.ar;
233
+ path = libmoses.a;
234
+ remoteRef = 1E9DC6CA124268270059001A /* PBXContainerItemProxy */;
235
+ sourceTree = BUILT_PRODUCTS_DIR;
236
+ };
237
+ 1E9DC6D1124268310059001A /* libmoses-chart.a */ = {
238
+ isa = PBXReferenceProxy;
239
+ fileType = archive.ar;
240
+ path = "libmoses-chart.a";
241
+ remoteRef = 1E9DC6D0124268310059001A /* PBXContainerItemProxy */;
242
+ sourceTree = BUILT_PRODUCTS_DIR;
243
+ };
244
+ 1E9DC6D9124268440059001A /* libOnDiskPt.a */ = {
245
+ isa = PBXReferenceProxy;
246
+ fileType = archive.ar;
247
+ path = libOnDiskPt.a;
248
+ remoteRef = 1E9DC6D8124268440059001A /* PBXContainerItemProxy */;
249
+ sourceTree = BUILT_PRODUCTS_DIR;
250
+ };
251
+ /* End PBXReferenceProxy section */
252
+
253
+ /* Begin PBXSourcesBuildPhase section */
254
+ 8DD76F640486A84900D96B5E /* Sources */ = {
255
+ isa = PBXSourcesBuildPhase;
256
+ buildActionMask = 2147483647;
257
+ files = (
258
+ 1E9DC63C1242602F0059001A /* Decoder.cpp in Sources */,
259
+ 1E9DC63D1242602F0059001A /* Main.cpp in Sources */,
260
+ 1E141A311243527800123194 /* Perceptron.cpp in Sources */,
261
+ 1E56EBF51243B91600E8315C /* MiraOptimiser.cpp in Sources */,
262
+ );
263
+ runOnlyForDeploymentPostprocessing = 0;
264
+ };
265
+ /* End PBXSourcesBuildPhase section */
266
+
267
+ /* Begin PBXTargetDependency section */
268
+ 1EF4E84D12440612006233A0 /* PBXTargetDependency */ = {
269
+ isa = PBXTargetDependency;
270
+ name = moses;
271
+ targetProxy = 1EF4E84C12440612006233A0 /* PBXContainerItemProxy */;
272
+ };
273
+ 1EF4E84F12440612006233A0 /* PBXTargetDependency */ = {
274
+ isa = PBXTargetDependency;
275
+ name = "moses-chart";
276
+ targetProxy = 1EF4E84E12440612006233A0 /* PBXContainerItemProxy */;
277
+ };
278
+ 1EF4E85112440612006233A0 /* PBXTargetDependency */ = {
279
+ isa = PBXTargetDependency;
280
+ name = OnDiskPt;
281
+ targetProxy = 1EF4E85012440612006233A0 /* PBXContainerItemProxy */;
282
+ };
283
+ /* End PBXTargetDependency section */
284
+
285
+ /* Begin XCBuildConfiguration section */
286
+ 1DEB923208733DC60010E9CD /* Debug */ = {
287
+ isa = XCBuildConfiguration;
288
+ buildSettings = {
289
+ ALWAYS_SEARCH_USER_PATHS = NO;
290
+ COPY_PHASE_STRIP = NO;
291
+ GCC_DYNAMIC_NO_PIC = NO;
292
+ GCC_ENABLE_FIX_AND_CONTINUE = YES;
293
+ GCC_MODEL_TUNING = G5;
294
+ GCC_OPTIMIZATION_LEVEL = 0;
295
+ INSTALL_PATH = /usr/local/bin;
296
+ LIBRARY_SEARCH_PATHS = (
297
+ ../irstlm/lib/i386,
298
+ ../srilm/lib/macosx,
299
+ );
300
+ OTHER_LDFLAGS = (
301
+ "-lboost_program_options",
302
+ "-lz",
303
+ "-lirstlm",
304
+ "-lmisc",
305
+ "-ldstruct",
306
+ "-loolm",
307
+ "-lflm",
308
+ "-llattice",
309
+ );
310
+ PRODUCT_NAME = mira;
311
+ };
312
+ name = Debug;
313
+ };
314
+ 1DEB923308733DC60010E9CD /* Release */ = {
315
+ isa = XCBuildConfiguration;
316
+ buildSettings = {
317
+ ALWAYS_SEARCH_USER_PATHS = NO;
318
+ DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
319
+ GCC_MODEL_TUNING = G5;
320
+ INSTALL_PATH = /usr/local/bin;
321
+ LIBRARY_SEARCH_PATHS = (
322
+ ../irstlm/lib/i386,
323
+ ../srilm/lib/macosx,
324
+ );
325
+ OTHER_LDFLAGS = (
326
+ "-lboost_program_options",
327
+ "-lz",
328
+ "-lirstlm",
329
+ "-lmisc",
330
+ "-ldstruct",
331
+ "-loolm",
332
+ "-lflm",
333
+ "-llattice",
334
+ );
335
+ PRODUCT_NAME = mira;
336
+ };
337
+ name = Release;
338
+ };
339
+ 1DEB923608733DC60010E9CD /* Debug */ = {
340
+ isa = XCBuildConfiguration;
341
+ buildSettings = {
342
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
343
+ GCC_C_LANGUAGE_STANDARD = gnu99;
344
+ GCC_OPTIMIZATION_LEVEL = 0;
345
+ GCC_WARN_ABOUT_RETURN_TYPE = YES;
346
+ GCC_WARN_UNUSED_VARIABLE = YES;
347
+ HEADER_SEARCH_PATHS = (
348
+ /usr/local/include,
349
+ "../moses-chart/src",
350
+ ../moses/src,
351
+ ../irstlm/include,
352
+ );
353
+ ONLY_ACTIVE_ARCH = YES;
354
+ PREBINDING = NO;
355
+ SDKROOT = macosx10.6;
356
+ };
357
+ name = Debug;
358
+ };
359
+ 1DEB923708733DC60010E9CD /* Release */ = {
360
+ isa = XCBuildConfiguration;
361
+ buildSettings = {
362
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
363
+ GCC_C_LANGUAGE_STANDARD = gnu99;
364
+ GCC_WARN_ABOUT_RETURN_TYPE = YES;
365
+ GCC_WARN_UNUSED_VARIABLE = YES;
366
+ HEADER_SEARCH_PATHS = (
367
+ /usr/local/include,
368
+ "../moses-chart/src",
369
+ ../moses/src,
370
+ ../irstlm/include,
371
+ );
372
+ PREBINDING = NO;
373
+ SDKROOT = macosx10.6;
374
+ };
375
+ name = Release;
376
+ };
377
+ /* End XCBuildConfiguration section */
378
+
379
+ /* Begin XCConfigurationList section */
380
+ 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "mira" */ = {
381
+ isa = XCConfigurationList;
382
+ buildConfigurations = (
383
+ 1DEB923208733DC60010E9CD /* Debug */,
384
+ 1DEB923308733DC60010E9CD /* Release */,
385
+ );
386
+ defaultConfigurationIsVisible = 0;
387
+ defaultConfigurationName = Release;
388
+ };
389
+ 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "mira" */ = {
390
+ isa = XCConfigurationList;
391
+ buildConfigurations = (
392
+ 1DEB923608733DC60010E9CD /* Debug */,
393
+ 1DEB923708733DC60010E9CD /* Release */,
394
+ );
395
+ defaultConfigurationIsVisible = 0;
396
+ defaultConfigurationName = Release;
397
+ };
398
+ /* End XCConfigurationList section */
399
+ };
400
+ rootObject = 08FB7793FE84155DC02AAC07 /* Project object */;
401
+ }
mosesdecoder/contrib/moses-speedtest/README.md ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Moses speedtesting framework
2
+
3
+ ### Description
4
+
5
+ This is an automatic test framework that is designed to test the day to day performance changes in Moses.
6
+
7
+ ### Set up
8
+
9
+ #### Set up a Moses repo
10
+ Set up a Moses repo and build it with the desired configuration.
11
+ ```bash
12
+ git clone https://github.com/moses-smt/mosesdecoder.git
13
+ cd mosesdecoder
14
+ ./bjam -j10 --with-cmph=/usr/include/
15
+ ```
16
+ You need to build Moses first, so that the testsuite knows what command you want it to use when rebuilding against newer revisions.
17
+
18
+ #### Create a parent directory.
19
+ Create a parent directory where the **runtests.py** and related scripts and configuration file should reside.
20
+ This should also be the location of the TEST_DIR and TEST_LOG_DIR as explained in the next section.
21
+
22
+ #### Set up a global configuration file.
23
+ You need a configuration file for the testsuite. A sample configuration file is provided in **testsuite\_config**
24
+ <pre>
25
+ MOSES_REPO_PATH: /home/moses-speedtest/moses-standard/mosesdecoder
26
+ DROP_CACHES_COMM: sys_drop_caches 3
27
+ TEST_DIR: /home/moses-speedtest/phrase_tables/tests
28
+ TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
29
+ BASEBRANCH: RELEASE-2.1.1
30
+ MOSES_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-prof
31
+ MOSES_GOOGLE_PROFILER_REPO: /home/moses-speedtest/moses-standard/mosesdecoder-variant-gperftools
32
+ </pre>
33
+
34
+ The _MOSES\_REPO\_PATH_ is the place where you have set up and built moses.
35
+ The _DROP\_CACHES\_COMM_ is the command that would be used to drop caches. It should run without needing root access.
36
+ _TEST\_DIR_ is the directory where all the tests will reside.
37
+ _TEST\_LOG\_DIR_ is the directory where the performance logs will be gathered. It should be created before running the testsuite for the first time.
38
+ _BASEBRANCH_ is the branch against which all new tests will be compared. It should normally be set to be the latest Moses stable release.
39
+ _MOSES\_PROFILER\_REPO_ is a path to a moses repository set up and built with profiling enabled. Optional if you want to produce profiling results.
40
+ _MOSES\_GOOGLE\_PROFILER\_REPO is a path to moses repository set up with full tcmalloc and profiler, as well as shared link for use with gperftools.
41
+ ### Creating tests
42
+
43
+ In order to create a test one should go into the TEST_DIR and create a new folder. That folder will be used for the name of the test.
44
+ Inside that folder one should place a configuration file named **config**. The naming is mandatory.
45
+ An example such configuration file is **test\_config**
46
+
47
+ <pre>
48
+ Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
49
+ LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/,
50
+ Variants: vanilla, cached, ldpre, profile, google-profiler #Can't have cached without ldpre or vanilla
51
+ </pre>
52
+
53
+ The _Command:_ line specifies the executable (which is looked up in the /bin directory of the repo.) and any arguments necessary. Before running the test, the script cds to the current test directory so you can use relative paths.
54
+ The _LDPRE:_ specifies if tests should be run with any LD\_PRELOAD flags.
55
+ The _Variants:_ line specifies what type of tests should we run. This particular line will run the following tests:
56
+ 1. A Vanilla test meaning just the command after _Command_ will be issued.
57
+ 2. A vanilla cached test meaning that after the vanilla test, the test will be run again without dropping caches in order to benchmark performance on cached filesystem.
58
+ 3. A test with LD_PRELOAD ldpreloads moses -f command. For each available LDPRELOAD comma separated library to preload.
59
+ 4. A cached version of all LD_PRELOAD tests.
60
+ 5. A profile variant is only available if you have setup the profiler repository. It produces gprof outputs for all of the above in a subdirectory inside the _TEST\_LOG\_DIR.
61
+
62
+ #### Produce profiler results.
63
+ If you want to produce profiler results together in some tests you need to specify the _MOSES\_PROFILER\_REPO_ in the config
64
+ ```bash
65
+ git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-profile
66
+ cd mosesdecoder-profile
67
+ ./bjam -j10 --with-cmph=/usr/include/ variant=profile
68
+ ```
69
+
70
+ Afterwards for testcases which contain the **profile** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **gprof** output from every run (files ending in **\_profile**).
71
+
72
+ #### Produce google profiler results.
73
+ If you want to produce profiler results together in some tests you need to specify the _MOSES\_GOOGLE\_PROFILER\_REPO in the config
74
+ ```bash
75
+ git clone https://github.com/moses-smt/mosesdecoder.git mosesdecoder-google-profile
76
+ cd mosesdecoder
77
+ ./bjam link=shared -j10 --full-tcmalloc --with-cmph=/usr/include/
78
+ ```
79
+
80
+ Afterwards for testcases which contain the **google-profiler** keyword in **Variants** you will see a directory inside _TEST\_LOG\_DIR which contains the **google-profiler** output from every run (files prefixed with **pprof**). To analyze the output you need to use [pprof](http://google-perftools.googlecode.com/svn/trunk/doc/cpuprofile.html).
81
+
82
+ ### Running tests.
83
+ Running the tests is done through the **runtests.py** script.
84
+
85
+ #### Running all tests.
86
+ To run all tests, with the base branch and the latests revision (and generate new basebranch test data if such is missing) do a:
87
+ ```bash
88
+ python3 runtests.py -c testsuite_config
89
+ ```
90
+
91
+ #### Running specific tests.
92
+ The script allows the user to manually run a particular test or to test against a specific branch or revision:
93
+ <pre>
94
+ moses-speedtest@crom:~/phrase_tables$ python3 runtests.py --help
95
+ usage: runtests.py [-h] -c CONFIGFILE [-s SINGLETESTDIR] [-r REVISION]
96
+ [-b BRANCH]
97
+
98
+ A python based speedtest suite for moses.
99
+
100
+ optional arguments:
101
+ -h, --help show this help message and exit
102
+ -c CONFIGFILE, --configfile CONFIGFILE
103
+ Specify test config file
104
+ -s SINGLETESTDIR, --singletest SINGLETESTDIR
105
+ Single test name directory. Specify directory name,
106
+ not full path!
107
+ -r REVISION, --revision REVISION
108
+ Specify a specific revison for the test.
109
+ -b BRANCH, --branch BRANCH
110
+ Specify a branch for the test.
111
+ </pre>
112
+
113
+ ### Generating HTML report.
114
+ To generate a summary of the test results use the **html\_gen.py** script. It places a file named *index.html* in the current script directory.
115
+ ```bash
116
+ python3 html_gen.py testsuite_config
117
+ ```
118
+ You should use the generated file with the **style.css** file provided in the html directory.
119
+
120
+ ### Command line regression testing.
121
+ Alternatively you could check for regressions from the command line using the **check\_fo\r_regression.py** script:
122
+ ```bash
123
+ python3 check_for_regression.py TESTLOGS_DIRECTORY
124
+ ```
125
+
126
+ Alternatively the results of all tests are logged inside the the specified TESTLOGS directory so you can manually check them for additional information such as date, time, revision, branch, etc...
127
+
128
+ ### Create a cron job:
129
+ Create a cron job to run the tests daily and generate an html report. An example *cronjob* is available.
130
+ ```bash
131
+ #!/bin/sh
132
+ cd /home/moses-speedtest/phrase_tables
133
+
134
+ python3 runtests.py -c testsuite_config #Run the tests.
135
+ python3 html_gen.py testsuite_config #Generate html
136
+
137
+ cp index.html /fs/thor4/html/www/speed-test/ #Update the html
138
+ ```
139
+
140
+ Place the script in _/etc/cron.daily_ for dayly testing
141
+
142
+ ###### Author
143
+ Nikolay Bogoychev, 2014
144
+
145
+ ###### License
146
+ This software is licensed under the LGPL.
mosesdecoder/contrib/moses-speedtest/check_for_regression.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Checks if any of the latests tests has performed considerably different than
2
+ the previous ones. Takes the log directory as an argument."""
3
+ import os
4
+ import sys
5
+ from testsuite_common import Result, processLogLine, bcolors, getLastTwoLines
6
+
7
+ LOGDIR = sys.argv[1] #Get the log directory as an argument
8
+ PERCENTAGE = 5 #Default value for how much a test shoudl change
9
+ if len(sys.argv) == 3:
10
+ PERCENTAGE = float(sys.argv[2]) #Default is 5%, but we can specify more
11
+ #line parameter
12
+
13
+ def printResults(regressed, better, unchanged, firsttime):
14
+ """Pretty print the results in different colours"""
15
+ if regressed != []:
16
+ for item in regressed:
17
+ print(bcolors.RED + "REGRESSION! " + item.testname + " Was: "\
18
+ + str(item.previous) + " Is: " + str(item.current) + " Change: "\
19
+ + str(abs(item.percentage)) + "%. Revision: " + item.revision\
20
+ + bcolors.ENDC)
21
+ print('\n')
22
+ if unchanged != []:
23
+ for item in unchanged:
24
+ print(bcolors.BLUE + "UNCHANGED: " + item.testname + " Revision: " +\
25
+ item.revision + bcolors.ENDC)
26
+ print('\n')
27
+ if better != []:
28
+ for item in better:
29
+ print(bcolors.GREEN + "IMPROVEMENT! " + item.testname + " Was: "\
30
+ + str(item.previous) + " Is: " + str(item.current) + " Change: "\
31
+ + str(abs(item.percentage)) + "%. Revision: " + item.revision\
32
+ + bcolors.ENDC)
33
+ if firsttime != []:
34
+ for item in firsttime:
35
+ print(bcolors.PURPLE + "First time test! " + item.testname +\
36
+ " Took: " + str(item.real) + " seconds. Revision: " +\
37
+ item.revision + bcolors.ENDC)
38
+
39
+
40
+ all_files = os.listdir(LOGDIR)
41
+ regressed = []
42
+ better = []
43
+ unchanged = []
44
+ firsttime = []
45
+
46
+ #Go through all log files and find which tests have performed better.
47
+ for logfile in all_files:
48
+ (line1, line2) = getLastTwoLines(logfile, LOGDIR)
49
+ log1 = processLogLine(line1)
50
+ if line2 == '\n': # Empty line, only one test ever run
51
+ firsttime.append(log1)
52
+ continue
53
+ log2 = processLogLine(line2)
54
+ res = Result(log1.testname, log1.real, log2.real, log2.revision,\
55
+ log2.branch, log1.revision, log1.branch)
56
+ if res.percentage < -PERCENTAGE:
57
+ regressed.append(res)
58
+ elif res.change > PERCENTAGE:
59
+ better.append(res)
60
+ else:
61
+ unchanged.append(res)
62
+
63
+ printResults(regressed, better, unchanged, firsttime)
mosesdecoder/contrib/moses-speedtest/cronjob ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ #!/bin/sh
2
+ cd /home/moses-speedtest/phrase_tables
3
+
4
+ python3 runtests.py -c testsuite_config #Run the tests.
5
+ python3 html_gen.py testsuite_config #Generate html
6
+
7
+ cp index.html /fs/thor4/html/www/speed-test/ #Update the html
mosesdecoder/contrib/moses-speedtest/runtests.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Given a config file, runs tests"""
2
+ import os
3
+ import subprocess
4
+ import time
5
+ import shutil
6
+ from argparse import ArgumentParser
7
+ from testsuite_common import processLogLine
8
+
9
+ def parse_cmd():
10
+ """Parse the command line arguments"""
11
+ description = "A python based speedtest suite for moses."
12
+ parser = ArgumentParser(description=description)
13
+ parser.add_argument("-c", "--configfile", action="store",\
14
+ dest="configfile", required=True,\
15
+ help="Specify test config file")
16
+ parser.add_argument("-s", "--singletest", action="store",\
17
+ dest="singletestdir", default=None,\
18
+ help="Single test name directory. Specify directory name,\
19
+ not full path!")
20
+ parser.add_argument("-r", "--revision", action="store",\
21
+ dest="revision", default=None,\
22
+ help="Specify a specific revison for the test.")
23
+ parser.add_argument("-b", "--branch", action="store",\
24
+ dest="branch", default=None,\
25
+ help="Specify a branch for the test.")
26
+
27
+ arguments = parser.parse_args()
28
+ return arguments
29
+
30
+ def repoinit(testconfig, profiler=None):
31
+ """Determines revision and sets up the repo. If given the profiler optional
32
+ argument, wil init the profiler repo instead of the default one."""
33
+ revision = ''
34
+ #Update the repo
35
+ if profiler == "gnu-profiler":
36
+ if testconfig.repo_prof is not None:
37
+ os.chdir(testconfig.repo_prof)
38
+ else:
39
+ raise ValueError('Profiling repo is not defined')
40
+ elif profiler == "google-profiler":
41
+ if testconfig.repo_gprof is not None:
42
+ os.chdir(testconfig.repo_gprof)
43
+ else:
44
+ raise ValueError('Profiling repo is not defined')
45
+ else:
46
+ os.chdir(testconfig.repo)
47
+ #Checkout specific branch, else maintain main branch
48
+ if testconfig.branch != 'master':
49
+ subprocess.call(['git', 'checkout', testconfig.branch])
50
+ rev, _ = subprocess.Popen(['git', 'rev-parse', 'HEAD'],\
51
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
52
+ revision = str(rev).replace("\\n'", '').replace("b'", '')
53
+ else:
54
+ subprocess.call(['git checkout master'], shell=True)
55
+
56
+ #Check a specific revision. Else checkout master.
57
+ if testconfig.revision:
58
+ subprocess.call(['git', 'checkout', testconfig.revision])
59
+ revision = testconfig.revision
60
+ elif testconfig.branch == 'master':
61
+ subprocess.call(['git pull'], shell=True)
62
+ rev, _ = subprocess.Popen(['git rev-parse HEAD'], stdout=subprocess.PIPE,\
63
+ stderr=subprocess.PIPE, shell=True).communicate()
64
+ revision = str(rev).replace("\\n'", '').replace("b'", '')
65
+
66
+ return revision
67
+
68
+ class Configuration:
69
+ """A simple class to hold all of the configuration constatns"""
70
+ def __init__(self, repo, drop_caches, tests, testlogs, basebranch, baserev, repo_prof=None, repo_gprof=None):
71
+ self.repo = repo
72
+ self.repo_prof = repo_prof
73
+ self.repo_gprof = repo_gprof
74
+ self.drop_caches = drop_caches
75
+ self.tests = tests
76
+ self.testlogs = testlogs
77
+ self.basebranch = basebranch
78
+ self.baserev = baserev
79
+ self.singletest = None
80
+ self.revision = None
81
+ self.branch = 'master' # Default branch
82
+
83
+ def additional_args(self, singletest, revision, branch):
84
+ """Additional configuration from command line arguments"""
85
+ self.singletest = singletest
86
+ if revision is not None:
87
+ self.revision = revision
88
+ if branch is not None:
89
+ self.branch = branch
90
+
91
+ def set_revision(self, revision):
92
+ """Sets the current revision that is being tested"""
93
+ self.revision = revision
94
+
95
+
96
+ class Test:
97
+ """A simple class to contain all information about tests"""
98
+ def __init__(self, name, command, ldopts, permutations, prof_command=None, gprof_command=None):
99
+ self.name = name
100
+ self.command = command
101
+ self.prof_command = prof_command
102
+ self.gprof_command = gprof_command
103
+ self.ldopts = ldopts.replace(' ', '').split(',') #Not tested yet
104
+ self.permutations = permutations
105
+
106
+ def parse_configfile(conffile, testdir, moses_repo, moses_prof_repo=None, moses_gprof_repo=None):
107
+ """Parses the config file"""
108
+ command, ldopts, prof_command, gprof_command = '', '', None, None
109
+ permutations = []
110
+ fileopen = open(conffile, 'r')
111
+ for line in fileopen:
112
+ line = line.split('#')[0] # Discard comments
113
+ if line == '' or line == '\n':
114
+ continue # Discard lines with comments only and empty lines
115
+ opt, args = line.split(' ', 1) # Get arguments
116
+
117
+ if opt == 'Command:':
118
+ command = args.replace('\n', '')
119
+ if moses_prof_repo is not None: # Get optional command for profiling
120
+ prof_command = moses_prof_repo + '/bin/' + command
121
+ if moses_gprof_repo is not None: # Get optional command for google-perftools
122
+ gprof_command = moses_gprof_repo + '/bin/' + command
123
+ command = moses_repo + '/bin/' + command
124
+ elif opt == 'LDPRE:':
125
+ ldopts = args.replace('\n', '')
126
+ elif opt == 'Variants:':
127
+ permutations = args.replace('\n', '').replace(' ', '').split(',')
128
+ else:
129
+ raise ValueError('Unrecognized option ' + opt)
130
+ #We use the testdir as the name.
131
+ testcase = Test(testdir, command, ldopts, permutations, prof_command, gprof_command)
132
+ fileopen.close()
133
+ return testcase
134
+
135
+ def parse_testconfig(conffile):
136
+ """Parses the config file for the whole testsuite."""
137
+ repo_path, drop_caches, tests_dir, testlog_dir = '', '', '', ''
138
+ basebranch, baserev, repo_prof_path, repo_gprof_path = '', '', None, None
139
+ fileopen = open(conffile, 'r')
140
+ for line in fileopen:
141
+ line = line.split('#')[0] # Discard comments
142
+ if line == '' or line == '\n':
143
+ continue # Discard lines with comments only and empty lines
144
+ opt, args = line.split(' ', 1) # Get arguments
145
+ if opt == 'MOSES_REPO_PATH:':
146
+ repo_path = args.replace('\n', '')
147
+ elif opt == 'DROP_CACHES_COMM:':
148
+ drop_caches = args.replace('\n', '')
149
+ elif opt == 'TEST_DIR:':
150
+ tests_dir = args.replace('\n', '')
151
+ elif opt == 'TEST_LOG_DIR:':
152
+ testlog_dir = args.replace('\n', '')
153
+ elif opt == 'BASEBRANCH:':
154
+ basebranch = args.replace('\n', '')
155
+ elif opt == 'BASEREV:':
156
+ baserev = args.replace('\n', '')
157
+ elif opt == 'MOSES_PROFILER_REPO:': # Optional
158
+ repo_prof_path = args.replace('\n', '')
159
+ elif opt == 'MOSES_GOOGLE_PROFILER_REPO:': # Optional
160
+ repo_gprof_path = args.replace('\n', '')
161
+ else:
162
+ raise ValueError('Unrecognized option ' + opt)
163
+ config = Configuration(repo_path, drop_caches, tests_dir, testlog_dir,\
164
+ basebranch, baserev, repo_prof_path, repo_gprof_path)
165
+ fileopen.close()
166
+ return config
167
+
168
+ def get_config():
169
+ """Builds the config object with all necessary attributes"""
170
+ args = parse_cmd()
171
+ config = parse_testconfig(args.configfile)
172
+ config.additional_args(args.singletestdir, args.revision, args.branch)
173
+ revision = repoinit(config)
174
+ if config.repo_prof is not None:
175
+ repoinit(config, "gnu-profiler")
176
+ if config.repo_gprof is not None:
177
+ repoinit(config, "google-profiler")
178
+ config.set_revision(revision)
179
+ return config
180
+
181
+ def check_for_basever(testlogfile, basebranch):
182
+ """Checks if the base revision is present in the testlogs"""
183
+ filetoopen = open(testlogfile, 'r')
184
+ for line in filetoopen:
185
+ templine = processLogLine(line)
186
+ if templine.branch == basebranch:
187
+ return True
188
+ return False
189
+
190
+ def split_time(filename):
191
+ """Splits the output of the time function into seperate parts.
192
+ We will write time to file, because many programs output to
193
+ stderr which makes it difficult to get only the exact results we need."""
194
+ timefile = open(filename, 'r')
195
+ realtime = float(timefile.readline().replace('\n', '').split()[1])
196
+ usertime = float(timefile.readline().replace('\n', '').split()[1])
197
+ systime = float(timefile.readline().replace('\n', '').split()[1])
198
+ timefile.close()
199
+
200
+ return (realtime, usertime, systime)
201
+
202
+
203
+ def write_log(time_file, logname, config):
204
+ """Writes to a logfile"""
205
+ log_write = open(config.testlogs + '/' + logname, 'a') # Open logfile
206
+ date_run = time.strftime("%d.%m.%Y %H:%M:%S") # Get the time of the test
207
+ realtime, usertime, systime = split_time(time_file) # Get the times in a nice form
208
+
209
+ # Append everything to a log file.
210
+ writestr = date_run + " " + config.revision + " Testname: " + logname +\
211
+ " RealTime: " + str(realtime) + " UserTime: " + str(usertime) +\
212
+ " SystemTime: " + str(systime) + " Branch: " + config.branch +'\n'
213
+ log_write.write(writestr)
214
+ log_write.close()
215
+
216
+ def write_gprof(command, name, variant, config):
217
+ """Produces a gprof report from a gmon file"""
218
+ #Check if we have a directory for the profiling of this testcase:
219
+ output_dir = config.testlogs + '/' + name
220
+ if not os.path.exists(output_dir):
221
+ os.makedirs(output_dir)
222
+ outputfile = output_dir + '/' + time.strftime("%d.%m.%Y_%H:%M:%S") + '_' + name + '_' + variant
223
+
224
+ #Compile a gprof command and output the file in the directory we just created
225
+ gmon_path = os.getcwd() + '/gmon.out' # Path to the profiling file
226
+ executable_path = command.split(' ')[0] # Path to the moses binary
227
+ gprof_command = 'gprof ' + executable_path + ' ' + gmon_path + ' > ' + outputfile
228
+ subprocess.call([gprof_command], shell=True)
229
+ os.remove(gmon_path) # After we are done discard the gmon file
230
+
231
+ def write_pprof(name, variant, config):
232
+ """Copies the google-perftools profiler output to the corresponding test directory"""
233
+ output_dir = config.testlogs + '/' + name
234
+ if not os.path.exists(output_dir):
235
+ os.makedirs(output_dir)
236
+ outputfile = output_dir + '/pprof_' + time.strftime("%d.%m.%Y_%H:%M:%S") + '_' + name + '_' + variant
237
+ shutil.move("/tmp/moses.prof", outputfile)
238
+
239
+
240
+ def execute_test(command, path, name, variant, config, profile=None):
241
+ """Executes a testcase given a whole command, path to the test file output,
242
+ name of the test and variant tested. Config is the global configuration"""
243
+ subprocess.Popen([command], stdout=None, stderr=subprocess.PIPE, shell=True).communicate()
244
+ if profile is None:
245
+ write_log(path, name + '_' + variant, config)
246
+ elif profile == "gnu-profiler": # Basically produce a gmon output
247
+ write_gprof(command, name, variant, config)
248
+ elif profile == "google-profiler":
249
+ write_pprof(name, variant, config)
250
+
251
+
252
+ def execute_tests(testcase, cur_directory, config):
253
+ """Executes timed tests based on the config file"""
254
+ #Several global commands related to the time wrapper
255
+ time_command = ' time -p -o /tmp/time_moses_tests '
256
+ time_path = '/tmp/time_moses_tests'
257
+
258
+ #Figure out the order of which tests must be executed.
259
+ #Change to the current test directory
260
+ os.chdir(config.tests + '/' + cur_directory)
261
+ #Clear caches
262
+ subprocess.call(['sync'], shell=True)
263
+ subprocess.call([config.drop_caches], shell=True)
264
+ #Perform vanilla test and if a cached test exists - as well
265
+ print(testcase.name)
266
+ if 'vanilla' in testcase.permutations:
267
+ #Create the command for executing moses
268
+ whole_command = time_command + testcase.command
269
+
270
+ #test normal and cached
271
+ execute_test(whole_command, time_path, testcase.name, 'vanilla', config)
272
+ if 'cached' in testcase.permutations:
273
+ execute_test(whole_command, time_path, testcase.name, 'vanilla_cached', config)
274
+
275
+ #Now perform LD_PRELOAD tests
276
+ if 'ldpre' in testcase.permutations:
277
+ for opt in testcase.ldopts:
278
+ #Clear caches
279
+ subprocess.call(['sync'], shell=True)
280
+ subprocess.call([config.drop_caches], shell=True)
281
+
282
+ #Create the command for executing moses:
283
+ whole_command = 'LD_PRELOAD=' + opt + time_command + testcase.command
284
+ variant = 'ldpre_' + opt
285
+
286
+ #test normal and cached
287
+ execute_test(whole_command, time_path, testcase.name, variant, config)
288
+ if 'cached' in testcase.permutations:
289
+ execute_test(whole_command, time_path, testcase.name, variant + '_cached', config)
290
+
291
+ #Perform profiling test. Mostly same as the above lines but necessary duplication.
292
+ #All actual code is inside execute_test so those lines shouldn't need modifying
293
+ if 'profile' in testcase.permutations:
294
+ subprocess.call(['sync'], shell=True) # Drop caches first
295
+ subprocess.call([config.drop_caches], shell=True)
296
+
297
+ if 'vanilla' in testcase.permutations:
298
+ whole_command = testcase.prof_command
299
+ execute_test(whole_command, time_path, testcase.name, 'profile', config, "gnu-profiler")
300
+ if 'cached' in testcase.permutations:
301
+ execute_test(whole_command, time_path, testcase.name, 'profile_cached', config, "gnu-profiler")
302
+
303
+ if 'ldpre' in testcase.permutations:
304
+ for opt in testcase.ldopts:
305
+ #Clear caches
306
+ subprocess.call(['sync'], shell=True)
307
+ subprocess.call([config.drop_caches], shell=True)
308
+
309
+ #Create the command for executing moses:
310
+ whole_command = 'LD_PRELOAD=' + opt + " " + testcase.prof_command
311
+ variant = 'profile_ldpre_' + opt
312
+
313
+ #test normal and cached
314
+ execute_test(whole_command, time_path, testcase.name, variant, config, "gnu-profiler")
315
+ if 'cached' in testcase.permutations:
316
+ execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, "gnu-profiler")
317
+
318
+ #Google-perftools profiler
319
+ if 'google-profiler' in testcase.permutations:
320
+ subprocess.call(['sync'], shell=True) # Drop caches first
321
+ subprocess.call([config.drop_caches], shell=True)
322
+
323
+ #Create the command for executing moses
324
+ whole_command = "CPUPROFILE=/tmp/moses.prof " + testcase.gprof_command
325
+
326
+ #test normal and cached
327
+ execute_test(whole_command, time_path, testcase.name, 'vanilla', config, 'google-profiler')
328
+ if 'cached' in testcase.permutations:
329
+ execute_test(whole_command, time_path, testcase.name, 'vanilla_cached', config, 'google-profiler')
330
+
331
+ #Now perform LD_PRELOAD tests
332
+ if 'ldpre' in testcase.permutations:
333
+ for opt in testcase.ldopts:
334
+ #Clear caches
335
+ subprocess.call(['sync'], shell=True)
336
+ subprocess.call([config.drop_caches], shell=True)
337
+
338
+ #Create the command for executing moses:
339
+ whole_command = 'LD_PRELOAD=' + opt + " " + whole_command
340
+ variant = 'ldpre_' + opt
341
+
342
+ #test normal and cached
343
+ execute_test(whole_command, time_path, testcase.name, variant, config, 'google-profiler')
344
+ if 'cached' in testcase.permutations:
345
+ execute_test(whole_command, time_path, testcase.name, variant + '_cached', config, 'google-profiler')
346
+
347
+
348
+ # Go through all the test directories and executes tests
349
+ if __name__ == '__main__':
350
+ CONFIG = get_config()
351
+ ALL_DIR = os.listdir(CONFIG.tests)
352
+
353
+ #We should first check if any of the tests is run for the first time.
354
+ #If some of them are run for the first time we should first get their
355
+ #time with the base version (usually the previous release)
356
+ FIRSTTIME = []
357
+ TESTLOGS = []
358
+ #Strip filenames of test underscores
359
+ for listline in os.listdir(CONFIG.testlogs):
360
+ listline = listline.replace('_vanilla', '')
361
+ listline = listline.replace('_cached', '')
362
+ listline = listline.replace('_ldpre', '')
363
+ TESTLOGS.append(listline)
364
+ for directory in ALL_DIR:
365
+ if directory not in TESTLOGS:
366
+ FIRSTTIME.append(directory)
367
+
368
+ #Sometimes even though we have the log files, we will need to rerun them
369
+ #Against a base version, because we require a different baseversion (for
370
+ #example when a new version of Moses is released.) Therefore we should
371
+ #Check if the version of Moses that we have as a base version is in all
372
+ #of the log files.
373
+
374
+ for logfile in os.listdir(CONFIG.testlogs):
375
+ logfile_name = CONFIG.testlogs + '/' + logfile
376
+ if os.path.isfile(logfile_name) and not check_for_basever(logfile_name, CONFIG.basebranch):
377
+ logfile = logfile.replace('_vanilla', '')
378
+ logfile = logfile.replace('_cached', '')
379
+ logfile = logfile.replace('_ldpre', '')
380
+ FIRSTTIME.append(logfile)
381
+ FIRSTTIME = list(set(FIRSTTIME)) #Deduplicate
382
+
383
+ if FIRSTTIME != []:
384
+ #Create a new configuration for base version tests:
385
+ BASECONFIG = Configuration(CONFIG.repo, CONFIG.drop_caches,\
386
+ CONFIG.tests, CONFIG.testlogs, CONFIG.basebranch,\
387
+ CONFIG.baserev, CONFIG.repo_prof, CONFIG.repo_gprof)
388
+ BASECONFIG.additional_args(None, CONFIG.baserev, CONFIG.basebranch)
389
+ #Set up the repository and get its revision:
390
+ REVISION = repoinit(BASECONFIG)
391
+ BASECONFIG.set_revision(REVISION)
392
+ #Build
393
+ os.chdir(BASECONFIG.repo)
394
+ subprocess.call(['./previous.sh'], shell=True)
395
+ #If profiler configuration exists also init it
396
+ if BASECONFIG.repo_prof is not None:
397
+ repoinit(BASECONFIG, "gnu-profiler")
398
+ os.chdir(BASECONFIG.repo_prof)
399
+ subprocess.call(['./previous.sh'], shell=True)
400
+
401
+ if BASECONFIG.repo_gprof is not None:
402
+ repoinit(BASECONFIG, "google-profiler")
403
+ os.chdir(BASECONFIG.repo_gprof)
404
+ subprocess.call(['./previous.sh'], shell=True)
405
+
406
+ #Perform tests
407
+ for directory in FIRSTTIME:
408
+ cur_testcase = parse_configfile(BASECONFIG.tests + '/' + directory +\
409
+ '/config', directory, BASECONFIG.repo, BASECONFIG.repo_prof, BASECONFIG.repo_gprof)
410
+ execute_tests(cur_testcase, directory, BASECONFIG)
411
+
412
+ #Reset back the repository to the normal configuration
413
+ repoinit(CONFIG)
414
+ if BASECONFIG.repo_prof is not None:
415
+ repoinit(CONFIG, "gnu-profiler")
416
+
417
+ if BASECONFIG.repo_gprof is not None:
418
+ repoinit(CONFIG, "google-profiler")
419
+
420
+ #Builds moses
421
+ os.chdir(CONFIG.repo)
422
+ subprocess.call(['./previous.sh'], shell=True)
423
+ if CONFIG.repo_prof is not None:
424
+ os.chdir(CONFIG.repo_prof)
425
+ subprocess.call(['./previous.sh'], shell=True)
426
+
427
+ if CONFIG.repo_gprof is not None:
428
+ os.chdir(CONFIG.repo_gprof)
429
+ subprocess.call(['./previous.sh'], shell=True)
430
+
431
+ if CONFIG.singletest:
432
+ TESTCASE = parse_configfile(CONFIG.tests + '/' +\
433
+ CONFIG.singletest + '/config', CONFIG.singletest, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof)
434
+ execute_tests(TESTCASE, CONFIG.singletest, CONFIG)
435
+ else:
436
+ for directory in ALL_DIR:
437
+ cur_testcase = parse_configfile(CONFIG.tests + '/' + directory +\
438
+ '/config', directory, CONFIG.repo, CONFIG.repo_prof, CONFIG.repo_gprof)
439
+ execute_tests(cur_testcase, directory, CONFIG)
mosesdecoder/contrib/moses-speedtest/sys_drop_caches.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/spython
2
+ from sys import argv, stderr, exit
3
+ from os import linesep as ls
4
+ procfile = "/proc/sys/vm/drop_caches"
5
+ options = ["1","2","3"]
6
+ flush_type = None
7
+ try:
8
+ flush_type = argv[1][0:1]
9
+ if not flush_type in options:
10
+ raise IndexError, "not in options"
11
+ with open(procfile, "w") as f:
12
+ f.write("%s%s" % (flush_type,ls))
13
+ exit(0)
14
+ except IndexError, e:
15
+ stderr.write("Argument %s required.%s" % (options, ls))
16
+ except IOError, e:
17
+ stderr.write("Error writing to file.%s" % ls)
18
+ except StandardError, e:
19
+ stderr.write("Unknown Error.%s" % ls)
20
+
21
+ exit(1)
22
+
mosesdecoder/contrib/moses-speedtest/test_config ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ Command: moses -f ... -i fff #Looks for the command in the /bin directory of the repo specified in the testsuite_config
2
+ LDPRE: ldpreloads #Comma separated LD_LIBRARY_PATH:/,
3
+ Variants: vanilla, cached, ldpre #Can't have cached without ldpre or vanilla
mosesdecoder/contrib/moses-speedtest/testsuite_config ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ MOSES_REPO_PATH: /home/moses-speedtest/moses-standard/mosesdecoder
2
+ DROP_CACHES_COMM: sys_drop_caches 3
3
+ TEST_DIR: /home/moses-speedtest/phrase_tables/tests
4
+ TEST_LOG_DIR: /home/moses-speedtest/phrase_tables/testlogs
5
+ BASEBRANCH: RELEASE-2.1.1
mosesdecoder/contrib/picaro/README ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ README - 16 Jan 2011b
2
+ Author: Jason Riesa <jason.riesa@gmail.com>
3
+
4
+ Picaro [v1.0]: A simple command-line alignment visualization tool.
5
+ Visualize alignments in grid-format.
6
+
7
+ This brief README is organized as follows:
8
+ I. REQUIREMENTS
9
+ II. USAGE
10
+ III. INPUT FORMAT
11
+ IV. EXAMPLE USAGE
12
+ V. NOTES
13
+
14
+ I. REQUIREMENTS
15
+ ===============
16
+ Python v2.5 or higher is required.
17
+
18
+ II. USAGE
19
+ =========
20
+ Picaro takes as input 3 mandatory arguments and up to 2 optional arguments:
21
+ Mandatory arguments:
22
+ 1. -a1 <alignment1> where alignment1 is a path to an alignment file
23
+ 2. -e <e> where e is a path to a file of English sentences
24
+ 3. -f <f> where f is a path to a file of French sentences
25
+ Optional arguments:
26
+ 1. -a2 <a2> path to alignment2 file in f-e format
27
+ 2. -maxlen <len> for each sentence pair, render only when each
28
+ sentence has length in words <= len
29
+
30
+ For historical reasons we use the labels e, f, English, and French,
31
+ but any language pair will do.
32
+
33
+ III. INPUT FORMAT
34
+ =================
35
+ - Files e and f must be sentence-aligned
36
+ - Alignment files must be in f-e format
37
+ See included sample files in zh/ and es/.
38
+
39
+ IV. EXAMPLE USAGE
40
+ =================
41
+ WITH A SINGLE ALIGNMENT:
42
+ $ picaro.py -e zh/sample.e -f zh/sample.f -a1 zh/sample.aln
43
+
44
+ COMPARING TWO ALIGNMENTS:
45
+ $ picaro.py -e zh/sample.e -f zh/sample.f -a1 zh/alternate.aln -a2 zh/sample.aln
46
+
47
+ When visualizing two alignments at once, refer to the following color scheme:
48
+ Green blocks: alignments a1 and a2 agree
49
+ Blue blocks: alignment a1 only
50
+ Gold blocks: alignment a2 only
51
+
52
+ V. NOTES
53
+ ========
54
+ RIGHT-TO-LEFT TEXT:
55
+ If you are using right-to-left text, e.g. Arabic, transliterate your text first.
56
+ Terminals generally render unexpectedly with mixed left-to-right and right-to-left text.
57
+ For Arabic, in particular, we use the Buckwalter transliteration scheme [1] when using this tool.
58
+ The following Perl module implements Buckwalter transliteration:
59
+ http://search.cpan.org/~smrz/Encode-Arabic-1.8/lib/Encode/Arabic.pm
60
+
61
+ [1] http://www.ldc.upenn.edu/myl/morph/buckwalter.html
62
+
mosesdecoder/contrib/picaro/es/README ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Spanish-English sample
2
+ sample.f Spanish text
3
+ sample.e English text
4
+ sample.aln Alignment file with links in f-e format
mosesdecoder/contrib/picaro/es/sample.aln ADDED
@@ -0,0 +1 @@
 
 
1
+ 0-0 0-1 1-2 1-3 2-4 3-5 4-6 5-7
mosesdecoder/contrib/picaro/es/sample.e ADDED
@@ -0,0 +1 @@
 
 
1
+ i want to go to spain tomorrow .
mosesdecoder/contrib/picaro/es/sample.f ADDED
@@ -0,0 +1 @@
 
 
1
+ quiero ir a españa mañana .
mosesdecoder/contrib/picaro/picaro.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ #
3
+ # Picaro: An simple command-line alignment visualization tool.
4
+ #
5
+ # picaro.py
6
+ # Visualize alignments between sentences in a grid format.
7
+ #
8
+ # Jason Riesa <riesa@isi.edu>
9
+ # version: 01-16-2010
10
+ #
11
+ # Copyright (C) 2013 Jason Riesa
12
+ #
13
+ # This library is free software; you can redistribute it and/or
14
+ # modify it under the terms of the GNU Lesser General Public
15
+ # License as published by the Free Software Foundation; either
16
+ # version 2.1 of the License, or (at your option) any later version.
17
+ #
18
+ # This library is distributed in the hope that it will be useful,
19
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21
+ # Lesser General Public License for more details.
22
+ #
23
+ # You should have received a copy of the GNU Lesser General Public
24
+ # License along with this library; if not, write to the Free Software
25
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26
+
27
+ import sys, os, commands
28
+ from collections import defaultdict
29
+
30
+ #TC_BIN = "tc/tc.linux32"
31
+
32
+ a1_file_str = ""
33
+ a2_file_str = ""
34
+ f_file_str = ""
35
+ e_file_str = ""
36
+ SHOW_TC_A1 = 0
37
+ SHOW_TC_A2 = 0
38
+ maxlen = float('inf')
39
+
40
+ # Process command line options
41
+ try:
42
+ while len(sys.argv) > 1:
43
+ option = sys.argv[1]; del sys.argv[1]
44
+ if option == '-a1':
45
+ a1_file_str = sys.argv[1]; del sys.argv[1]
46
+ elif option == '-a2':
47
+ a2_file_str = sys.argv[1]; del sys.argv[1]
48
+ elif option == '-f':
49
+ f_file_str = sys.argv[1]; del sys.argv[1]
50
+ elif option == '-e':
51
+ e_file_str = sys.argv[1]; del sys.argv[1]
52
+ elif option == '-maxlen':
53
+ maxlen = int(sys.argv[1]); del sys.argv[1]
54
+ else:
55
+ sys.stderr.write("Invalid option: %s\n" % (option))
56
+ sys.exit(1)
57
+ '''
58
+ elif option == '-tc':
59
+ if sys.argv[1] == '1':
60
+ SHOW_TC_A1 = 1; del sys.argv[1]
61
+ elif sys.argv[1] == '2':
62
+ SHOW_TC_A2 = 2; del sys.argv[1]
63
+ else:
64
+ raise Exception, "Invalid argument to option -tc"
65
+ '''
66
+
67
+ if a1_file_str == "" or f_file_str == "" or e_file_str == "":
68
+ raise Exception, "Not all options properly specified."
69
+ # Make sure transitive closure binary exists if user has enabled this option
70
+ if SHOW_TC_A1 or SHOW_TC_A2:
71
+ if not os.path.exists(TC_BIN):
72
+ raise Exception, "Transitive closure binary "+TC_BIN+" not found."
73
+ except Exception, msg:
74
+ sys.stderr.write("%s: %s\n" % (sys.argv[0], msg))
75
+ sys.stderr.write("Usage: %s: -a1 <alignment1> -f <f> -e <e> [-a2 <alignment2>]\n" % (sys.argv[0]))
76
+ sys.stderr.write("Mandatory arguments:\n")
77
+ sys.stderr.write(" -a1 <a1>\t path to alignment 1 file in f-e format\n")
78
+ sys.stderr.write(" -f <f>\t\t path to source text f\n")
79
+ sys.stderr.write(" -e <e>\t\t path to target text e\n")
80
+ sys.stderr.write("Optional arguments:\n")
81
+ sys.stderr.write(" -a2 <a2>\t path to alignment 2 file in f-e format\n")
82
+ sys.stderr.write(" -maxlen <len>\t display alignment only when e and f have length <= len\n")
83
+ sys.exit(1)
84
+
85
+
86
+ a_file = open(a1_file_str, 'r')
87
+ f_file = open(f_file_str, 'r')
88
+ e_file = open(e_file_str, 'r')
89
+ if a2_file_str != "":
90
+ a2_file = open(a2_file_str, 'r')
91
+
92
+ sentenceNumber = 0
93
+ nextRequested = 1
94
+ for aline in a_file:
95
+ eline = e_file.readline()
96
+ fline = f_file.readline()
97
+ if a2_file_str != "":
98
+ a2line = a2_file.readline()
99
+
100
+ links = aline.split()
101
+ e_words = eline.split()
102
+ f_words = fline.split()
103
+ if a2_file_str != "":
104
+ links2 = a2line.split()
105
+
106
+ # Get transitive closure of links and links2
107
+ if SHOW_TC_A1:
108
+ cmd = 'echo "' + ' '.join(links) + '" | ' + TC_BIN
109
+ failure1, output1 = commands.getstatusoutput(cmd)
110
+ tc1 = output1.split()
111
+ if SHOW_TC_A2:
112
+ cmd = 'echo "' + ' '.join(links2) + '" | ' + TC_BIN
113
+ failure2, output2 = commands.getstatusoutput(cmd)
114
+ tc2 = output2.split()
115
+
116
+ # Update tracking counts
117
+ sentenceNumber += 1
118
+ if sentenceNumber < nextRequested:
119
+ continue
120
+
121
+ # Don't generate alignment grids for very large sentences
122
+ if len(e_words) > maxlen or len(f_words) > maxlen:
123
+ continue
124
+
125
+
126
+ print "== SENTENCE ",sentenceNumber," =="
127
+
128
+ # Initialize alignment objects
129
+ # a holds alignments of user-specified -a1 <file>
130
+ # a2 holds alignments of user-specified -a2 <file>
131
+ a = defaultdict(lambda: defaultdict(int))
132
+ a2 = defaultdict(lambda: defaultdict(int))
133
+
134
+ # Print e_words on the columns
135
+ # First, find the length of the longest word
136
+ longestEWordSize = 0
137
+ longestEWord = 0
138
+ for w in e_words:
139
+ if len(w) > longestEWordSize:
140
+ longestEWordSize = len(w)
141
+ longestEWord = w
142
+
143
+ # Now, print the e-words
144
+ for i in range(longestEWordSize, 0, -1):
145
+ for w in e_words:
146
+ if len(w) < i:
147
+ print " ",
148
+ else:
149
+ print w[(i*-1)],
150
+ print
151
+
152
+
153
+ # Fill in alignment matrix 1
154
+ for link in links:
155
+ i, j = map(int, link.split('-'))
156
+ a[int(i)][int(j)] = 1
157
+ # Fill in extra links added by transitive closure
158
+ if SHOW_TC_A1:
159
+ for link in tc1:
160
+ i, j = map(int, link.split('-'))
161
+ if(a[i][j] != 1):
162
+ a[i][j] = 2
163
+
164
+ # Fill in alignment matrix 2
165
+ if(a2_file_str != ""):
166
+ for link in links2:
167
+ i, j = map(int, link.split('-'))
168
+ a2[i][j] = 1
169
+ # Fill in extra links added by transitive closure
170
+ if SHOW_TC_A2:
171
+ for link in tc2:
172
+ i, j = map(int, link.split('-'))
173
+ if(a2[i][j] != 1):
174
+ a2[i][j] = 2
175
+
176
+ # Print filled-in alignment matrix
177
+ if a2_file_str == "":
178
+ for i, _ in enumerate(f_words):
179
+ for j, _ in enumerate(e_words):
180
+ val1 = a[i][j]
181
+ if val1 == 0:
182
+ # No link
183
+ print ':',
184
+ elif val1 == 1:
185
+ # Regular link
186
+ print u'\u001b[44m\u0020\u001b[0m',
187
+ elif val1 == 2:
188
+ # Link due to transitive closure
189
+ # Render as gray-shaded square
190
+ print 'O',
191
+ print f_words[i]
192
+ print
193
+ else:
194
+ for i, _ in enumerate(f_words):
195
+ for j, _ in enumerate(e_words):
196
+ val1 = a[i][j]
197
+ val2 = a2[i][j]
198
+
199
+ if val1 == 0 and val2 == 0:
200
+ # Link not in a nor a2
201
+ # Empty grid box
202
+ print ':',
203
+ # Link in both a and a2
204
+ elif val1 > 0 and val2 > 0:
205
+ # Green box
206
+ if val1 == 1:
207
+ if val2 == 1:
208
+ print u'\u001b[42m\u001b[1m\u0020\u001b[0m',
209
+ elif val2 == 2:
210
+ print u'\u001b[42m\u001b[30m2\u001b[0m',
211
+ elif val1 == 2:
212
+ if val2 == 1:
213
+ print u'\u001b[42m\u0020\u001b[0m',
214
+ elif val2 == 2:
215
+ print u'\u001b[42m\u001b[30m3\u001b[0m',
216
+ # Link in a2, but not a
217
+ elif val1 == 0 and val2 > 0:
218
+ if val2 == 1:
219
+ # Yellow box
220
+ print u'\u001b[1m\u001b[43m\u0020\u001b[0m',
221
+ elif val2 == 2:
222
+ # Artificial link by transitive closure
223
+ print u'\u001b[43m\u001b[30m2\u001b[0m',
224
+
225
+ # Link in a, but not a2
226
+ elif val1 > 0 and val2 == 0:
227
+ if val1 == 1:
228
+ # Blue box
229
+ print u'\u001b[1m\u001b[44m\u0020\u001b[0m',
230
+ elif val1 == 2:
231
+ print u'\u001b[44m\u001b[37m1\u001b[0m',
232
+ print f_words[i]
233
+ nextDefault = sentenceNumber + 1
234
+ sys.stdout.write("Enter next alignment number or 'q' to quit [%d]: " %(nextDefault))
235
+ user_input = sys.stdin.readline().strip()
236
+ if user_input == "":
237
+ nextRequested = nextDefault
238
+ elif user_input[0] == "q" or user_input == "quit":
239
+ sys.exit(1)
240
+ else:
241
+ try:
242
+ nextRequested = int(user_input)
243
+ except:
244
+ nextRequested = sentenceNumber + 1
245
+ sys.stdout.write("Unknown alignment id: %s\nContinuing with %d.\n" %(user_input, nextRequested))
246
+
247
+ a_file.close()
248
+ e_file.close()
249
+ f_file.close()
250
+
mosesdecoder/contrib/promix/test_data/esen.ep.model.filtered/phrase-table.0-0.1.1.binphr.idx ADDED
Binary file (68 Bytes). View file
 
mosesdecoder/contrib/promix/test_data/esen.nc.model.filtered/phrase-table.0-0.1.1.binphr.idx ADDED
Binary file (76 Bytes). View file
 
mosesdecoder/contrib/promix/test_data/esen.nc.model.filtered/phrase-table.0-0.1.1.binphr.srctree.wa ADDED
Binary file (728 Bytes). View file