| |
| |
| |
| |
|
|
| use warnings; |
| use strict; |
| use IPC::Open3; |
| use File::Temp qw/tempdir/; |
| use File::Path qw/rmtree/; |
| use Getopt::Long "GetOptions"; |
| use Symbol; |
|
|
# Put the UTF-8 layer on all three standard streams.
foreach my $stream (\*STDIN, \*STDOUT, \*STDERR) {
    binmode($stream, ":utf8");
}
|
|
# Defaults; both can be overridden on the command line (--srilm, --tempdir).
my $SRILM = "/home/pkoehn/moses/srilm/bin/i686-m64";
my $TEMPDIR = "/tmp";
my ($TUNING,$LM,$NAME,$GROUP,$WEIGHTS,$CONTINUE);

# Parse the command line; on an unknown or malformed option print the usage
# line (which lists every accepted option) and stop.
die("interpolate-lm.perl --tuning set --name out-lm --lm lm0,lm1,lm2,lm3 [--srilm srilm-dir --tempdir tempdir --group \"0,1 2,3\" --weights w0,w1,w2,w3 --continue]")
    unless GetOptions('tuning=s'  => \$TUNING,
                      'name=s'    => \$NAME,
                      'srilm=s'   => \$SRILM,
                      'tempdir=s' => \$TEMPDIR,
                      'continue'  => \$CONTINUE,
                      'group=s'   => \$GROUP,
                      'weights=s' => \$WEIGHTS,
                      'lm=s'      => \$LM);

# Check mandatory options and the SRILM installation before any work starts.
die("ERROR: please specify output language model name --name") unless defined($NAME);
die("ERROR: please specify tuning set with --tuning") unless defined($TUNING);
die("ERROR: please specify language models with --lm") unless defined($LM);
# -r (not just -e) so that the check matches what the message promises.
die("ERROR: can't read $TUNING") unless -r $TUNING;
die("ERROR: did not find srilm dir") unless -e $SRILM;
die("ERROR: cannot run ngram") unless -x $SRILM."/ngram";
# compute-best-mix is only needed when the weights have to be estimated.
die("ERROR: cannot run compute-best-mix")
    unless defined($WEIGHTS) || -x $SRILM."/compute-best-mix";

# --lm and --weights are comma-separated lists that must line up one-to-one.
my @LM = split(/,/,$LM);
die("ERROR: please specify language models with --lm") unless @LM;
my @WEIGHT;
@WEIGHT = split(/,/,$WEIGHTS) if defined($WEIGHTS);
die("ERROR: different number of weights and language models: ".scalar(@WEIGHT)." vs. ".scalar(@LM))
    if defined($WEIGHTS) && scalar(@WEIGHT) != scalar(@LM);
|
|
| |
# Determine the n-gram order shared by all language models by reading the
# header of each file: the last "ngram N=..." line seen before the "1-grams"
# section gives that model's order.  All models must agree.
my $order = 0;
foreach my $lm (@LM) {
    # $lm aliases the @LM element, so switching to the gzipped file here
    # also changes the name that the later steps use.
    $lm .= ".gz" if (! -e $lm && -e "$lm.gz");

    # Lexical handle and explicit open modes; the list form of the pipe open
    # runs zcat without a shell, so the file name is never shell-interpreted.
    my $lm_fh;
    if ($lm =~ /gz$/) {
        open($lm_fh, "-|", "zcat", $lm) || die("ERROR: could not find language model file '$lm'");
    }
    else {
        open($lm_fh, "<", $lm) || die("ERROR: could not find language model file '$lm'");
    }

    my $lm_order;
    while (my $line = <$lm_fh>) {
        $lm_order = $1 if $line =~ /ngram\s+(\d+)/;
        last if $line =~ /1-grams/;
    }
    # Reading stops early, so a failing close on the zcat pipe is expected
    # and its status is deliberately not checked.
    close($lm_fh);

    # A pipe open succeeds even when zcat cannot read the file, and a file
    # may simply not have the expected header; report both cases here
    # instead of comparing an undefined order.
    die("ERROR: could not determine order of language model '$lm'")
        unless defined($lm_order);

    $order = $lm_order if $order == 0;
    die("ERROR: language models have different order") if $order != $lm_order;
}
print STDERR "language models have order $order.\n";
|
|
| |
# With more than 10 models and no explicit --group, build a group definition
# automatically: consecutive model indices are split into groups of (nearly)
# equal size, written as comma-separated ids with groups separated by spaces.
if (scalar(@LM) > 10 && !defined($GROUP)) {
    print STDERR "more than 10, automatically grouping language models.\n";
    my $lm_count = scalar(@LM);
    my $group_count = int($lm_count/10 + 0.99);
    my $group_size = int($lm_count/$group_count + 0.99);

    my @group_specs;
    foreach my $group_index (0 .. $group_count-1) {
        my $first_id = $group_index * $group_size;
        # the last group may be shorter: drop ids past the end of @LM
        my @member_ids = grep { $_ < $lm_count } ($first_id .. $first_id + $group_size - 1);
        push @group_specs, join(",", @member_ids);
    }
    $GROUP = join(" ", @group_specs);
    print STDERR "groups: $GROUP\n";
}
|
|
| |
# Simple case: no grouping, interpolate all models in a single step
# (using the --weights values if they were given).
if (!defined($GROUP)) {
    interpolate($NAME,\@WEIGHT,@LM);
    exit;
}

# Grouped case: every group is first interpolated into a sub model
# "<name>.group-a", "<name>.group-b", ...; the sub models and all models
# that belong to no group are then interpolated into the final model.
# Weights are always estimated on this path.
print STDERR "WARNING: --weights is ignored when language models are grouped\n"
    if @WEIGHT;

# Parse and validate the complete group definition before building anything,
# so that a bad definition is reported before any sub model is computed.
my %ALREADY;
my @SUBGROUP;
foreach my $subgroup (split(' ',$GROUP)) {
    my @SUB_LM;
    foreach my $lm_i (split(/,/,$subgroup)) {
        # a negative id would otherwise silently index @LM from the end
        die("ERROR: LM id '$lm_i' in group definition is not a non-negative integer")
            unless $lm_i =~ /^\d+$/;
        die("ERROR: LM id $lm_i in group definition out of range") if $lm_i >= scalar(@LM);
        push @SUB_LM,$LM[$lm_i];
        # normalise the key so that "01" and "1" mark the same model
        $ALREADY{int($lm_i)} = 1;
    }
    die("ERROR: empty group '$subgroup' in group definition") unless @SUB_LM;
    die("cannot interpolate more than 10 language models at once: ",join(",",@SUB_LM))
        if scalar(@SUB_LM) > 10;
    push @SUBGROUP,\@SUB_LM;
}

# Models that are not part of any group take part in the final step directly.
my @UNGROUPED;
for(my $lm_i=0; $lm_i < scalar(@LM); $lm_i++) {
    next if defined($ALREADY{$lm_i});
    push @UNGROUPED, $LM[$lm_i];
}
die("ERROR: the final interpolation would combine ".(scalar(@SUBGROUP)+scalar(@UNGROUPED))
    ." models (groups plus ungrouped language models), but at most 10 are supported")
    if scalar(@SUBGROUP)+scalar(@UNGROUPED) > 10;

# Build the sub models; with --continue an existing sub model file is reused.
my $g = 0;
my @SUB_NAME;
foreach my $sub_lm (@SUBGROUP) {
    my $name = $NAME.".group-".chr(97+($g++));
    push @SUB_NAME,$name;
    print STDERR "\n=== BUILDING SUB LM $name from\n\t".join("\n\t",@$sub_lm)."\n===\n\n";
    interpolate($name, undef, @$sub_lm) unless $CONTINUE && -e $name;
}
push @SUB_NAME, @UNGROUPED;

print STDERR "\n=== BUILDING FINAL LM ===\n\n";
interpolate($NAME, undef, @SUB_NAME);
|
|
| |
# Interpolate a set of language models into a single model with SRILM.
#
# Arguments:
#   $name   - file name of the language model to write (ngram -write-lm)
#   $WEIGHT - array reference with one interpolation weight per model, or
#             undef.  If it is undef, or its length differs from the number
#             of models, the weights are estimated instead: "ngram -ppl" is
#             run on the tuning set for every model and the outputs are
#             passed to compute-best-mix.
#   @LM     - file names of the models to combine (at least 1, at most 10)
#
# Uses the file-level settings $SRILM, $TEMPDIR, $TUNING and $order.
# Dies (or exits via safesystem/saferun3) if any step fails.
#
# NOTE: file names are interpolated unquoted into shell command lines.
sub interpolate {
    my ($name,$WEIGHT,@LM) = @_;

    die("cannot interpolate an empty list of language models") unless @LM;
    die("cannot interpolate more than 10 language models at once: ",join(",",@LM))
        if scalar(@LM) > 10;

    # CLEANUP => 1 removes the directory at program exit, which also covers
    # the die/exit paths below; the rmtree at the end removes it early on
    # success.
    my $tmp = tempdir(DIR=>$TEMPDIR, CLEANUP=>1);
    my @LAMBDA;

    # use the given weights if there is exactly one per model ...
    if (defined($WEIGHT) && scalar(@$WEIGHT) == scalar(@LM)) {
        @LAMBDA = @$WEIGHT;
    }
    # ... otherwise estimate them on the tuning set
    else {
        # per-word perplexity output of every model on the tuning set
        my @ppl_file;
        foreach my $i (0 .. $#LM) {
            my $lm = $LM[$i];
            my $ppl_file = "$tmp/iplm.$$.$i";
            print STDERR "compute perplexity for $lm\n";
            safesystem("$SRILM/ngram -unk -order $order -lm $lm -ppl $TUNING -debug 2 > $ppl_file") or die "Failed to compute perplexity for $lm\n";
            print STDERR `tail -n 2 $ppl_file`;
            push @ppl_file, $ppl_file;
        }

        # find the best mixture weights from the perplexity outputs
        print STDERR "computing lambdas...\n";
        my $cmd = join(" ", "$SRILM/compute-best-mix", @ppl_file);
        my ($mixout, $mixerr, $mixexitcode) = saferun3($cmd);
        die "Failed to mix models: $mixerr" if $mixexitcode != 0;
        unlink(@ppl_file);
        $mixout =~ /best lambda \(([\d\. e-]+)\)/ || die("ERROR: computing lambdas failed: $mixout");
        @LAMBDA = split(' ',$1);

        # a missing lambda would otherwise end up as an empty argument in
        # the ngram command line below
        die("ERROR: got ".scalar(@LAMBDA)." lambdas for ".scalar(@LM)." language models: $mixout")
            unless scalar(@LAMBDA) == scalar(@LM);
    }

    # Build the ngram call.  The first model takes -lm/-lambda, the second
    # -mix-lm without a weight option, and every further model N takes
    # -mix-lmN/-mix-lambdaN.
    print STDERR "creating new language model...\n";
    my $cmd = "$SRILM/ngram -unk -order $order -write-lm $name";
    foreach my $i (0 .. $#LM) {
        if ($i == 0) {
            $cmd .= " -lm $LM[$i] -lambda $LAMBDA[$i]";
        }
        elsif ($i == 1) {
            $cmd .= " -mix-lm $LM[$i]";
        }
        else {
            $cmd .= " -mix-lm$i $LM[$i] -mix-lambda$i $LAMBDA[$i]";
        }
    }
    safesystem($cmd) or die "Failed.";

    rmtree($tmp);
    print STDERR "done.\n";
}
|
|
# Run a command with system() and log what happens to STDERR.
#
# Arguments are passed to system() unchanged.
# Returns true if the command exited with code 0 and false otherwise.
# Terminates the program with exit(1) if the command could not be started
# or was killed by a signal.
sub safesystem {
    print STDERR "Executing: @_\n";
    system(@_);
    my $status = $?;

    # the command could not be run at all
    if ($status == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }

    # low 7 bits: number of the signal that killed the command, if any;
    # bit 128 tells whether a core dump was written
    my $signal = $status & 127;
    if ($signal) {
        printf STDERR "Execution of: @_\n died with signal %d, %s coredump\n",
            $signal, ($status & 128) ? 'with' : 'without';
        exit(1);
    }

    # normal termination: the exit code is in the high byte
    my $exitcode = $status >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
}
|
|
# Run a command with open3() and capture its output.
#
# Arguments are passed to open3() unchanged as the command to run.
# Returns the list (stdout text, stderr text, exit code).
# Terminates the program with exit(1) if the child could not be waited for
# or was killed by a signal.
#
# Stdout and stderr are read as data becomes available on either of them.
# Reading one stream to its end before touching the other blocks forever as
# soon as the child fills the pipe of the stream that is not being read.
sub saferun3 {
    print STDERR "Executing: @_\n";
    require IO::Select;

    my $wtr = gensym();
    my $rdr = gensym();
    my $err = gensym();
    my $pid = open3($wtr, $rdr, $err, @_);
    close($wtr);    # the child gets no input

    my $gotout = "";
    my $goterr = "";
    # map each pipe to the buffer that collects its data
    my %buffer_for = (
        fileno($rdr) => \$gotout,
        fileno($err) => \$goterr,
    );

    my $pending = IO::Select->new($rdr, $err);
    while ($pending->count()) {
        foreach my $fh ($pending->can_read()) {
            my $nread = sysread($fh, my $chunk, 65536);
            # interrupted by a signal: just try again
            next if !defined($nread) && $!{EINTR};
            if ($nread) {
                ${ $buffer_for{fileno($fh)} } .= $chunk;
            }
            else {
                # end of file (0) or read error (undef): stop watching it
                $pending->remove($fh);
                close($fh);
            }
        }
    }

    waitpid($pid, 0);
    if ($? == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }
    elsif ($? & 127) {
        # the command text is passed as an argument, not used as the format,
        # so a '%' in it cannot be taken for a conversion
        printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
            "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
        exit(1);
    }
    else {
        my $exitcode = $? >> 8;
        print STDERR "Exit code: $exitcode\n" if $exitcode;
        return ( $gotout, $goterr, $exitcode );
    }
}
|
|