wiki-cased / tools /mosesdecoder /contrib /m4m /modules /obsolete /phrase-table.make.scratch
Quagmire1's picture
Upload folder using huggingface_hub
41f6dd8 verified
# .PHONY: $1
# $1: $1.binphr.idx
# $1.txt.gz: | L1text = $4
# $1.txt.gz: | L2text = $5
# $1.txt.gz: | symal = $6
# ${moses.ini}: $1
# PTABLES += 1;$2;$3;5;$1
# endef
# ${target}.tmp/fwd/scored.gz: | ${target}/phrase-extraction.DONE
# | ${L1File} ${L2File} ${symal}
# # convert phrase table from text file to binary format
# %.binphr.idx: | %.txt.gz ${MOSES_BIN}/processPhraseTable
# $(lock)
# zcat -f $*.txt.gz | ${MOSES_BIN}/processPhraseTable \
# -ttable ${L1factors} ${L2factors} - -nscores 5 -out ${@D}/_${@F} \
# && mv ${@D}/_${@F} $@
# $(unlock)
# # directory definitions
# mo_mdl = model
# mo_tmp = model/tmp
# wrdaln = ${fstaln}/out
# # wrdaln should be set elsewhere!
# # milestone files created during phrase table construction
# ptable_bin = ${mo_mdl}/ptable.${L1}-${L2}
# ptable = ${mo_mdl}/ptable.${L1}-${L2}.txt.gz
# lex1given2 = ${mo_mdl}/${L1}-given-${L2}.lex.gz
# lex2given1 = ${mo_mdl}/${L2}-given-${L1}.lex.gz
# mosesinifile = ${mo_mdl}/moses.ini.0
# .PHONY: lex ptable
# lex: ${lex1given2} ${lex2given1}
# ptable: ${ptable_bin}
# # steps taken in this module
# # -------------------------------------------------------------------------------
# # --- STEP 1a: extract raw phrases from word-aligned corpus ---------------------
# # -------------------------------------------------------------------------------
# # Note: the script ${moses.extract-phrases} takes care of initial sorting
# ${mo_tmp}/phrase-extraction.DONE: | ${moses.extract-phrases}
# ${mo_tmp}/phrase-extraction.DONE: | ${moses.extract}
# ${mo_tmp}/phrase-extraction.DONE: | ${wrdaln}/${L1}.txt.gz
# ${mo_tmp}/phrase-extraction.DONE: | ${wrdaln}/${L2}.txt.gz
# ${mo_tmp}/phrase-extraction.DONE: | ${wrdaln}/${L1}-${L2}.symal.gz
# ${mo_tmp}/phrase-extraction.DONE:
# $(lock)
# ${moses.extract-phrases} \
# ${moses.extract} \
# ${wrdaln}/${L1}.txt.gz \
# ${wrdaln}/${L2}.txt.gz \
# ${wrdaln}/${L1}-${L2}.symal.gz \
# ${mo_tmp} ${max_phrase_length} \
# ${dmodel.type}-${dmodel.orientation} \
# && touch $@
# $(unlock)
# # -------------------------------------------------------------------------------
# # --- STEP 1b: extract word translation lexica from word-aligned corpus ---------
# # --- (for lexical phrase scoring) ---------
# # -------------------------------------------------------------------------------
# $(lex2given1): $(lex1given2)
# $(lex1given2): | ${wrdaln}/${L1}.txt.gz
# $(lex1given2): | ${wrdaln}/${L2}.txt.gz
# $(lex1given2): | ${wrdaln}/${L1}-${L2}.symal.gz
# $(lock)
# $(moses.make-lex) \
# ${wrdaln}/${L1}.txt.gz \
# ${wrdaln}/${L2}.txt.gz \
# ${wrdaln}/${L1}-${L2}.symal.gz \
# $(lex1given2) \
# $(lex2given1)
# $(unlock)
# # -------------------------------------------------------------------------------
# # --- STEP 2: score extracted phrase pairs --------------------------------------
# # -------------------------------------------------------------------------------
# ptfwdhalf = ${mo_tmp}/fwd/phrases.fwd.scored.gz
# ptbwdhalf = ${mo_tmp}/bwd/phrase-scoring.DONE
# # -------------------------------------------------------------------------------
# # --- STEP 2a: score phrases in the 'forward' direction -------------------------
# # -------------------------------------------------------------------------------
# $(ptfwdhalf): | ${mo_tmp}/phrase-extraction.DONE
# $(ptfwdhalf): | ${lex1given2}
# $(lock)
# $(merge-sorted) ${mo_tmp}/fwd/part.*.gz \
# | ${moses.score-phrases} ${MOSES_BIN}/score - ${lex1given2} ${@:.scored.gz=} \
# $(ptable.smoothing) && mv $@_ $@
# $(unlock)
# # -------------------------------------------------------------------------------
# # --- STEP 2b: score phrases in the 'backward' direction -------------------------
# # -------------------------------------------------------------------------------
# # Note: ${moses.score-phrases} re-sorts the scored backward phrases
# $(ptbwdhalf): | ${mo_tmp}/phrase-extraction.DONE
# $(ptbwdhalf): | ${lex2given1}
# $(lock)
# $(merge-sorted) ${mo_tmp}/bwd/part.*.gz \
# | ${moses.score-phrases} ${MOSES_BIN}/score - ${lex2given1} ${@D}/scored \
# "$(ptable.smoothing)" --Inverse && touch $@
# $(unlock)
# # -------------------------------------------------------------------------------
# # --- STEP 3: put the two phrase table halves together --------------------------
# # -------------------------------------------------------------------------------
# # ptfwdhalf is a single .gz file; ptbwdhalf is a stamp file marking a collection of .gz files (bwd/scored.*.gz)
# $(ptable): | ${MOSES_BIN}/consolidate
# $(ptable): | $(ptfwdhalf) $(ptbwdhalf)
# $(lock)
# ${MOSES_BIN}/consolidate \
# <(zcat ${ptfwdhalf}) \
# <(${merge-sorted} ${mo_tmp}/bwd/scored.*.gz) /dev/stdout \
# $(if $(ptable.smoothing), \
# $(ptable.smoothing) $(ptfwdhalf:.sorted.gz=.coc)) \
# | gzip > $@_ && mv $@_ $@
# $(unlock)