wiki-cased / tools /mosesdecoder /contrib /m4m /modules /phrase-table.m4m
Quagmire1's picture
Upload folder using huggingface_hub
41f6dd8 verified
# -*- makefile -*-
# Makefile for building a Moses system from a word-aligned corpus
# (c) 2011 - 2012 Ulrich Germann
# by default, we use good-turing smoothing for phrase tables
# (is that actually worth it?)
# the following variables should be defined in moses-parameters.make:
# ptable.max-phrase-length
# ptable.smoothing
# ptable.source-factors
# ptable.target-factors
# Name of the word aligner whose output directory supplies the default
# training data; may be preset by the including makefile.
word-alignment ?= fast

# Default locations of the two sides of the parallel text and of the symal
# alignment file. These are deliberately recursive (=) assignments so that
# WDIR, L1, L2 and word-alignment are looked up when the values are used.
pll.txt1 = $(WDIR)/crp/trn/aln/$(word-alignment)/$(L1).txt.gz
pll.txt2 = $(WDIR)/crp/trn/aln/$(word-alignment)/$(L2).txt.gz
pll.aln = $(WDIR)/crp/trn/aln/$(word-alignment)/$(L1)-$(L2).symal.gz
# phrase_extract_cmd: command line of a single phrase extraction job.
# $1: path of an alignment shard; the .aln.gz suffix is removed and the
#     remaining stem is handed to the extraction script
# $2: value passed after the -m option
define phrase_extract_cmd
$(moses.extract-phrases) $(moses.extract) $(patsubst %.aln.gz,%,$(1)) $(L1) $(L2) \
-l $(ptable.max-phrase-length) -m $(2)
endef
#################################################################################
# extract_phrases: rules that split the training data into shards, write a
# batch file with one extraction command per line, and run that batch file
# through ${parallel}.
# $1: stem of phrase extractions (everything is written below $1.shards/)
# $2: L1 text
# $3: L2 text
# $4: symal file
# If $2 ... $4 are empty, the defaults ${pll.txt1} ${pll.txt2} ${pll.aln} are used.
#
# Each *-DONE stamp is touched by its own recipe; without the stamp the
# sharding step would be considered out of date on every run.
# Every command written to extract.batch is terminated with ';' so that the
# generated echo commands stay separate shell commands, also when several
# distortion models (dmodels) are configured.
define extract_phrases
SHARDS = $$(foreach x, $${L1} $${L2} aln, $1.shards/$$x-DONE)
$1.shards/${L1}-DONE: $(if $2,$2,$$(pll.txt1))
	$$(lock)
	zcat -f $$< \
	| ${parallel} --pipe -k -N ${SHARDSIZE} "gzip > $${@D}/{#}.${L1}.gz"
	touch $$@
	$$(unlock)
$1.shards/${L2}-DONE: $(if $3,$3,$$(pll.txt2))
	$$(lock)
	zcat -f $$< \
	| ${parallel} --pipe -k -N ${SHARDSIZE} "gzip > $${@D}/{#}.${L2}.gz"
	touch $$@
	$$(unlock)
$1.shards/aln-DONE: $(if $4,$4,$$(pll.aln))
	$$(lock)
	zcat -f $$< \
	| ${parallel} --pipe -k -N ${SHARDSIZE} "gzip > $${@D}/{#}.aln.gz"
	touch $$@
	$$(unlock)
$1.shards/extract.batch: $$(SHARDS)
	$$(info SHARDS $1 $$(shell ls $${@D}/*.aln.gz))
	$$(lock)
	echo -n '' > $$@_
	$$(foreach x, $$(shell ls $${@D}/*.aln.gz 2>/dev/null),\
	echo "$$(call phrase_extract_cmd,$$x,$$(word 1,$${dmodels}))" >> $$@_;\
	$$(foreach i, $$(shell seq 2 $$(words $${dmodels})),\
	echo "$$(call phrase_extract_cmd,$$x,$$(word $$i,$${dmodels})) -x" >> $$@_;))
	mv $$@_ $$@;
	$$(unlock)
$1.shards/extract.done: $1.shards/extract.batch
	$$(lock)
	${parallel} -j$(shell echo $$((${NUMCORES}/4))) < $1.shards/extract.batch
	touch $$@
	$$(unlock)
endef
#################################################################################
# create_phrase_table: add the rules that build a standard phrase table
# $1.txt.gz from a parallel text and its word alignment.
# $1: stem of the phrase table
#
# The input data are taken from $(pll.txt1), $(pll.txt2) and $(pll.aln), which
# can be overridden as target-specific variables like this:
# $1.txt.gz: pll.txt1 = ...
# $1.txt.gz: pll.txt2 = ...
# $1.txt.gz: pll.aln = ...
# Within this file the template is expanded from add_binary_phrase_table and
# add_text_phrase_table.
#
# Steps: phrase extraction (extract_phrases), lexical tables, forward and
# backward scoring, and finally consolidation into $1.txt.gz.
#
# Note kept from the original author: this section should be improved
# - split into shards
# - create bash file with jobs
# - run batch file in parallel
#--------------------------------------------------------------------------------
define create_phrase_table
$(call extract_phrases,$1,$${pll.txt1},$${pll.txt2},$${pll.aln})
ptable: $1.txt.gz
$1.txt.gz: | $(merge-sorted) $(MOSES_BIN)/consolidate
$1.txt.gz: | $1.tmp/fwd.scored.gz $1.tmp/bwd/scoring.done
	$$(lock)
	$(MOSES_BIN)/consolidate \
	<(zcat -f $1.tmp/fwd.scored.gz) \
	<($(merge-sorted) $1.tmp/bwd/scored.*.gz) /dev/stdout \
	$(if $(ptable.smoothing), $(ptable.smoothing) $1.tmp/fwd.coc) \
	| gzip > $$@_
	mv $$@_ $$@
	$$(unlock)
$1.tmp/fwd.scored.gz: | $(merge-sorted) $1.shards/extract.done
$1.tmp/fwd.scored.gz: | $1.$(L1)-given-$(L2).lex.gz
	$$(lock)
	$(merge-sorted) $1.shards/*.fwd.gz \
	| $(moses.score-phrases) $(MOSES_BIN)/score - $1.$(L1)-given-$(L2).lex.gz \
	$$(@D)/fwd $(ptable.smoothing) && mv $$@_ $$@
	$$(unlock)
$1.tmp/bwd/scoring.done: | $1.shards/extract.done $1.$(L2)-given-$(L1).lex.gz
	$$(lock)
	$(merge-sorted) $1.shards/*.bwd.gz \
	| $(moses.score-phrases) $(MOSES_BIN)/score - $1.$(L2)-given-$(L1).lex.gz \
	$$(@D)/scored $(ptable.smoothing) --Inverse && touch $$@
	$$(unlock)
# reminder: arguments 2, 3 and 4 are L1 text, L2 text and alignment
$1.$(L2)-given-$(L1).lex.gz: | $1.$(L1)-given-$(L2).lex.gz
$1.$(L1)-given-$(L2).lex.gz: | $(if $2,$2,$$(pll.txt1)) $(if $3,$3,$$(pll.txt2))
$1.$(L1)-given-$(L2).lex.gz: | $(if $4,$4,$$(pll.aln))
	$$(lock)
	$(moses.make-lex) \
	$(if $2,$2,$$(pll.txt1)) \
	$(if $3,$3,$$(pll.txt2)) \
	$(if $4,$4,$$(pll.aln)) \
	$1.$(L1)-given-$(L2).lex.gz \
	$1.$(L2)-given-$(L1).lex.gz
	$$(unlock)
endef
# end of create_phrase_table
#################################################################################
#################################################################################
# add_binary_phrase_table: create the rules for a phrase table and register
# its binarized form (PhraseDictionaryBinary) in PTABLE_ENTRIES and PTABLES.
# $1: input factor(s)
# $2: output factor(s)
# $3: number of features
# $4: stem of phrase table
# The fields of the entry are collected in MY_ENTRY and joined with ';'.
# infactor, outfactor and nscores are set as target-specific variables for
# the .binphr.idx target, where the %.binphr.idx recipe reads them.
define add_binary_phrase_table
$(call create_phrase_table,$4)
mystem := $(strip $4)
ffname := TranslationModel$(words $(PTABLE_ENTRIES))
MY_ENTRY := PhraseDictionaryBinary name=$$(ffname) num-features=$(strip $3)
MY_ENTRY += input-factor=$(strip $1) output-factor=$(strip $2)
MY_ENTRY += path=$(abspath $(strip $4))
$(if $(moses.ini_ttable-limit),MY_ENTRY += table-limit=$(moses.ini_ttable-limit))
PTABLE_ENTRIES += $$(subst $$(space),;,$$(MY_ENTRY))
PTABLES += $(strip $4).binphr.idx
$(strip $4).binphr.idx: infactor=$(1)
$(strip $4).binphr.idx: outfactor=$(2)
$(strip $4).binphr.idx: nscores=$(3)
endef
#################################################################################
# Binarize a text phrase table with processPhraseTable. The tool writes its
# output files into $@.lock/ (presumably set up by $(lock) -- defined
# elsewhere); they are then moved into the directory of the target.
# infactor, outfactor and nscores are expected as target-specific variables.
%.binphr.idx: %.txt.gz | $(MOSES_BIN)/processPhraseTable
	$(lock)
	zcat -f $< | $(MOSES_BIN)/processPhraseTable \
	-ttable $(infactor) $(outfactor) - -nscores $(nscores) -out $@.lock/$(notdir $*)
	mv $@.lock/$(notdir $*).* $(@D)
	$(unlock)
#################################################################################
# add_text_phrase_table: create the rules for a phrase table and register the
# gzipped text table itself (PhraseDictionaryMemory) in PTABLE_ENTRIES and
# PTABLES.
# $1: input factor(s)
# $2: output factor(s)
# $3: number of features
# $4: stem of phrase table
define add_text_phrase_table
$(call create_phrase_table,$4)
ffname := TranslationModel$(words $(PTABLE_ENTRIES))
MY_ENTRY := PhraseDictionaryMemory name=$$(ffname) num-features=$(strip $3)
MY_ENTRY += input-factor=$(strip $1) output-factor=$(strip $2)
MY_ENTRY += path=$(abspath $4).txt.gz
$(if $(moses.ini_ttable-limit),MY_ENTRY += table-limit=$(moses.ini_ttable-limit))
PTABLE_ENTRIES += $$(subst $$(space),;,$$(MY_ENTRY))
PTABLES += $(strip $4).txt.gz
endef
#################################################################################
# .SECONDEXPANSION:
#################################################################################
# create_lexical_reordering_table: rule that scores a lexical reordering
# table from the *.dst.gz shards of a phrase extraction.
# $1: stem of the table; the rule target is $1.$2.gz
# $2: model specification; its first two '-'-separated fields are passed on
#     as model type and orientation
# $3: stem of the phrase extraction (the shards live in $3.shards/)
# The list of shards is a target-specific variable, so it is only looked up
# when the recipe is expanded; a warning is issued if no shard is found.
define create_lexical_reordering_table
mystem := $(strip $1).$(strip $2)
$$(mystem).gz: dmshards = $$(shell ls $3.shards/*.dst.gz 2>/dev/null)
$$(mystem).gz: dm.type=$(word 1,$(subst -, ,$2))
$$(mystem).gz: dm.orient=$(word 2,$(subst -, ,$2))
$$(mystem).gz: $(strip $3).shards/extract.done
	$$(lock)
	$(moses.score-reordering) \
	<($(merge-sorted) \
	$$(if $$(dmshards),$$(dmshards),$$(warning No dst shards found!))) \
	$(dmodel.smooth) $$@.lock/$$(notdir $(strip $1)). --model "$$(dm.type) $$(dm.orient) $2"
	mv $$@.lock/$$(notdir $(strip $1).$(strip $2)).gz $$(@D)
	$$(unlock)
endef
#################################################################################
# add_binary_reordering_table: create the rules for a lexical reordering
# table and register its binarized form in DTABLE_ENTRIES and DTABLES.
# $1: input factor(s)
# $2: output factor(s)
# $3: number of features
# $4: reordering model type (also part of the table's file name)
# $5: stem of the reordering table
# $6: handed to create_lexical_reordering_table as its third argument
define add_binary_reordering_table
$(call create_lexical_reordering_table,$5,$4,$6)
mystem := $(strip $5).$(strip $4)
ffname := LexicalReordering$(words $(DTABLE_ENTRIES))
MY_ENTRY := LexicalReordering name=$$(ffname)
MY_ENTRY += input-factor=$(strip $1) output-factor=$(strip $2)
MY_ENTRY += num-features=$(strip $3) type=$(strip $4)
MY_ENTRY += path=$$(abspath $$(mystem))
DTABLE_ENTRIES += $$(subst $$(space),;,$$(MY_ENTRY))
DTABLES += $(strip $5).$(strip $4).binlexr.idx
endef
# Binarize a lexical reordering table with processLexicalTable. The gzipped
# text table is only an order-only prerequisite. Output is written into
# $@.lock/ and then moved into the directory of the target.
%.binlexr.idx: | %.gz
	$(lock)
	$(MOSES_BIN)/processLexicalTable \
	-in <(zcat -f $*.gz) -out $@.lock/$(notdir $*)
	mv $@.lock/$(notdir $*).* $(@D)
	$(unlock)
#################################################################################
# add_dynsa_phrase_table: register a dynamic suffix array phrase table
# (PhraseDictionaryDynSuffixArray) in PTABLE_ENTRIES and PTABLES.
# $1: source (input) factor(s)
# $2: target (output) factor(s)
# $3: number of features
# $4: source text (gzipped)
# $5: target text (gzipped)
# $6: word alignment in gzipped symal format
# The three data files are also added to MOSES_INI_PREREQ.
#--------------------------------------------------------------------------------
define add_dynsa_phrase_table
ffname := TranslationModel$(words $(PTABLE_ENTRIES))
MY_ENTRY := PhraseDictionaryDynSuffixArray name=$$(ffname)
MY_ENTRY += input-factor=$(strip $1) output-factor=$(strip $2)
MY_ENTRY += num-features=$(strip $3)
MY_ENTRY += source=$(abspath $4) target=$(abspath $5)
MY_ENTRY += alignment=$(strip $6)
MOSES_INI_PREREQ += $4 $5 $6
PTABLE_ENTRIES += $$(subst $$(space),;,$$(MY_ENTRY))
PTABLES += $(strip $4) $(strip $5) $(strip $6)
endef
#################################################################################
# add_mmsapt: register a memory-mapped suffix array phrase table (Mmsapt) in
# PTABLE_ENTRIES and add its data files to MOSES_INI_PREREQ.
# $1: input factor
# $2: output factor
# $3: num-features
# $4: path to mmapped data
# $5: path to text data
# $6: path and basename of dynamic data (not referenced in this template)
# The rules for the mmapped files come from mmap_bitext, which is defined
# elsewhere.
# The input-factor field is appended to MY_ENTRY like all other fields
# (it used to go to a misspelled variable and was missing from the entry).
define add_mmsapt
$(call mmap_bitext,$(strip $5),$(strip $4))
ffname := PT$(words ${PTABLE_ENTRIES})
MY_ENTRY := Mmsapt
MY_ENTRY += name=$$(ffname)
MY_ENTRY += input-factor=$(strip $1)
MY_ENTRY += output-factor=$(strip $2)
MY_ENTRY += num-features=$(strip $3)
MY_ENTRY += base=$(abspath $4)/ L1=${L1} L2=${L2}
PTABLE_ENTRIES += $$(subst $$(space),;,$${MY_ENTRY})
MOSES_INI_PREREQ += $(addprefix $(strip $4)/${L1},.mct .tdx .sfa)
MOSES_INI_PREREQ += $(addprefix $(strip $4)/${L2},.mct .tdx .sfa)
MOSES_INI_PREREQ += $(strip $4)/${L1}-${L2}.mam
MOSES_INI_PREREQ += $(strip $4)/${L1}-${L2}.lex
endef