wiki-cased / tools /mosesdecoder /contrib /m4m /scripts /moses.extract-phrases.sh
Quagmire1's picture
Upload folder using huggingface_hub
41f6dd8 verified
#!/bin/bash
# helper script for phrase extraction
# (c) 2011-2012 Ulrich Germann
# txtdir - directory with gzipped plain text files
# sntdir - directory with files in Giza's .snt format, also including the .OK files
# produced by giza.txt2snt.sh
# gizdir - directory where aligned corpus resides
# L1,L2 - language tags for L1,L2
# plmax - max phrase length to be extracted
# Positional arguments (the seventh is optional):
#   $1  extractor - phrase extraction command to run
#   $2  L1_text   - L1 side of the corpus
#   $3  L2_text   - L2 side of the corpus
#   $4  aln       - word alignment file
#   $5  odir      - output directory
#   $6  max_plen  - maximum phrase length
#   $7  dmodel    - distortion model; may be omitted
extractor=$1
L1_text=$2
L2_text=$3
aln=$4
odir=$5
max_plen=$6
dmodel=$7

# Diagnostic: print the number of arguments received.
echo $#

# Refuse to run with fewer than the six mandatory arguments.  The usage
# text is a diagnostic, so it goes to stderr.
if [ $# -lt 6 ] ; then
    printf '%s\n' "usage: $0 <moses-extract-command> <L1 text> <L2 text> <alignment file> <output dir> <max phrase length> <distortion-model>" >&2
    exit 1
fi
# Base path of the named pipes used by this run.  The shell's PID ($$) is
# part of the name, so two runs sharing an output directory get distinct
# pipe names.
fifo="$odir/fifo.$$"

# cleanup - remove the named pipes belonging to this run.
# Globals:   fifo (read) - base path; "$fifo", "$fifo.inv" and "$fifo.o"
#            are the three paths that get removed.
# Arguments: none
# Paths that do not exist are skipped, so it is safe to call at any time
# and more than once.  Every expansion is quoted so that a base path
# containing whitespace is still handled as a single file name.
cleanup()
{
    local pipe
    for pipe in "$fifo" "$fifo.inv" "$fifo.o"; do
        if [ -e "$pipe" ] ; then
            rm -f -- "$pipe"
        fi
    done
}

# Signal spec 0 is the EXIT pseudo-signal: run cleanup whenever the
# script terminates.
trap 'cleanup' 0
# Force byte-wise collation for every sort started below, independent of
# the caller's locale.
export LC_ALL=C

mkdir -p "$odir/fwd" "$odir/bwd" "$odir/dst" || exit 1

# The output directory is spliced into a command line that GNU parallel
# passes to a shell, so escape it once here.  For a path without special
# characters the escaped form is identical to the path itself.
printf -v odir_q '%q' "$odir"

# PIDs of the background sorters, so each exit status can be collected.
sorter_pids=()

# start_sorter PIPE PART
#   Create the named pipe PIPE and start a background GNU parallel that
#   reads from it.  With --pipe the input is cut into blocks (250M here),
#   each block is sorted and gzipped by one of up to six concurrent jobs,
#   and the result is written to $odir/PART/part.<job number>.gz.
#   The FIFO is opened by the background child, so this call does not
#   block even though no writer exists yet.
#   Returns non-zero if the pipe cannot be created.
start_sorter()
{
    mkfifo "$1" || return 1
    parallel < "$1" -j6 --pipe --blocksize 250M "sort -S 5G | gzip > $odir_q/$2/part.{#}.gz" &
    sorter_pids+=("$!")
}

# The extractor is handed only the base path "$fifo".  The pipes with the
# .inv and .o suffixes are presumably the names it derives from that base
# for its inverse and orientation output -- TODO confirm against the
# extractor's documentation.
start_sorter "$fifo" fwd || exit 1
start_sorter "$fifo.inv" bwd || exit 1

# Extra extractor arguments; empty unless a distortion model was given.
extra_args=()
if [ "$dmodel" != "" ] ; then
    start_sorter "$fifo.o" dst || exit 1
    # $dmodel is left unquoted on purpose: the value is word-split exactly
    # as before.
    extra_args=(orientation --model $dmodel)
fi

# Run the extractor on the decompressed inputs (zcat -f also passes plain
# text through unchanged).  Note the argument order: L2 text first, then
# L1 text, then the alignment.
# $extractor is left unquoted so that a command given together with
# options still splits into separate words.
if ! $extractor <(zcat -f "$L2_text") <(zcat -f "$L1_text") <(zcat -f "$aln") "$fifo" "$max_plen" "${extra_args[@]}" ; then
    # A reader whose pipe was never opened for writing would block forever
    # once the script is gone; signal the sorters before giving up.
    kill "${sorter_pids[@]}" 2>/dev/null
    exit 1
fi

# Wait for every sorter and notice failures: a bare `wait` would discard
# their exit statuses and let a failed sort/gzip go unreported.
sorters_failed=0
for pid in "${sorter_pids[@]}"; do
    wait "$pid" || sorters_failed=1
done
if [ "$sorters_failed" -ne 0 ] ; then
    printf '%s\n' "$0: a sort/gzip stage failed; output in $odir is incomplete" >&2
    exit 1
fi
# for part in fwd bwd dst; do
# echo -n '' > $odir/${part}/sort.batch
# for f in $odir/${part}/part.[0-9][0-9][0-9][0-9].gz; do
# g=`echo $f | sed 's/.gz$//'`
# # echo "f=$g; if [ -e \$f.gz ] ; then zcat \$f.gz | LC_ALL=C sort | gzip > \$f.gz_ && mv \$f.gz_ \$f.sorted.gz && rm \$f.gz; fi" \
# echo "f=$g; if [ -e \$f.gz ] ; then zcat \$f.gz | LC_ALL=C sort | gzip > \$f.gz_ && mv \$f.gz_ \$f.sorted.gz; fi" \
# >> $odir/${part}/sort.batch
# done
# done