| |
|
|
| |
| |
| |
| |
|
|
|
|
| use strict; |
| use open ':utf8'; |
| binmode STDIN, ':utf8'; |
| binmode STDOUT, ':utf8'; |
|
|
| use Getopt::Long "GetOptions"; |
|
|
| sub main { |
| my $usage = " |
| USAGE |
| ----- |
| combine-ptables.pl --mode=(interp|union|fillup|backoff|intersect1|stats) ptable1 ptable2 ... ptableN > combined-ptable |
| combine-ptables.pl --mode=intersect1 reotable-unpruned ptable-pruned > reotable-pruned |
| ----- |
| # |
| # This scripts reads two or more *sorted* phrase tables and combines them in different modes. |
| # |
| # (Note: if present, word alignments are ignored). |
| # |
| # ---------------- |
| # OPTIONS |
| # ---------------- |
| # |
| # Required: |
| # --mode fillup: Each entry is taken only from the first table that contains it. |
| # A binary feature is added from each table except the first. |
| # backoff: Each entry is taken only from the first table that contains it. |
| # NO binary feature is added. |
| # interp: Linear interpolation. |
| # union: Union of entries, feature vectors are concatenated. |
| # intersect1: Intersection of entries, feature vectors taken from the first table. |
| # stats: Only compute some statistics about tables overlap. No table is produced. |
| # |
| # NOTE: if present, additional fields such as word alignment, phrase counts etc. are always |
| # taken from the first table. |
| # |
| # Generic options: |
| # --phpenalty=FLOAT Constant value for phrase penalty. Default is exp(1)=2.718 |
| # --phpenalty-at=N The (N+1)th score of each table is considered as phrase penalty with a constant value. |
| # In 'interp' mode, the corresponding feature is not interpolated but simply set to the constant. |
| # In 'union' mode, the ph.penalty (constant) is output only once, after all the other scores. |
| # By default, no score is considered as phrase penalty. |
| # |
| # |
| # Options for 'fillup': |
| # --newSourceMaxLength=INT Don't include \"new\" source phrases if longer than INT words. |
| # |
| # Options for 'backoff': |
| # --newSourceMaxLength=INT Don't include \"new\" source phrases if longer than INT words. |
| # |
| # Options for 'interp': |
| # --weights=W1,W2,...WN Weights for interpolation. By default, uniform weights are applied. |
| # --epsilon=X Score to assume when a phrase pair is not contained in a table (in 'interp' and 'union' modes). |
| # Default epsilon is 1e-06. |
| # |
| # Options for 'union': |
| # |
| # |
| "; |
|
|
| my $combination_mode = ''; |
| my $debug = ''; |
| my $weights_str = ''; |
| my $epsilon = 0.000001; |
| my $phPenalty = 2.718; |
| my $phPenalty_idx = -1; |
| my $delim= " ||| "; |
| my $delim_RE= ' \|\|\| '; |
| my $exp_one = 2.718; |
| my $exp_zero = 1; |
| my $newSourceMaxLength = -1; |
| my $help = ''; |
|
|
| GetOptions ('debug' => \$debug, |
| 'mode=s' => \$combination_mode, |
| 'weights=s' => \$weights_str, |
| 'epsilon=f' => \$epsilon, |
| 'phpenalty=f' => \$phPenalty, |
| 'phpenalty-at=i' => \$phPenalty_idx, |
| 'newSourceMaxLength=i' => \$newSourceMaxLength, |
| 'help' => \$help); |
|
|
| if($help) { die "$usage\n\n"; } |
|
|
| if($combination_mode!~/(interp|union|fillup|backoff|intersect1|stats)/) {die "$usage\nUnknown combination mode!\n"}; |
|
|
| if(@ARGV < 2) {die "$usage\n\n Please provide at least 2 tables to combine \n\n";} |
|
|
| print STDERR " |
| WARNING: Your phrase tables must be sorted (with LC_ALL=C) !! |
| ****************************** |
| Combination mode is [$combination_mode] |
| ****************************** |
| "; |
|
|
| my @tables = @ARGV; |
| my $nbtables = scalar(@tables); |
|
|
| |
|
|
| |
| my %sourcePhrasesPT1; |
| if((($combination_mode eq "fillup") || ($combination_mode eq "backoff")) && $newSourceMaxLength>-1) { |
| my $table1=$tables[0]; |
| $table1 =~ s/(.*\.gz)\s*$/gzip -dc < $1|/; |
| open(TABLE1, "$table1") or die "Cannot open $table1: ($!)\n"; |
| while(my $line=<TABLE1>) { |
| $line=~m/^(.*?)$delim_RE/; |
| $sourcePhrasesPT1{$1}++; |
| } |
| close(TABLE1); |
| } |
|
|
| my @table_files=(); |
| foreach my $table (@tables) { |
| $table =~ s/(.*\.gz)\s*$/gzip -dc < $1|/; |
| |
| local *FILE; |
| open(FILE, "$table") or die "Cannot open $table: ($!)\n"; |
| push(@table_files, *FILE); |
| } |
|
|
|
|
| |
| my @read_ppairs=(); |
| my $nbscores = &read_line_from_tables(\@table_files, \@read_ppairs); |
| print STDERR "Each phrase table contains $nbscores features.\n"; |
|
|
| |
|
|
| if($phPenalty_idx!=-1) { |
| if($phPenalty_idx<0 || $phPenalty_idx>=$nbscores) { |
| die "Invalid value for option phpenalty-at! Should be in the range [0,($nbscores-1)]\n\n"; |
| } |
| else { print STDERR "Phrase penalty at index $phPenalty_idx\n"; } |
| } |
|
|
| |
| |
|
|
| my @weights=(); |
| if($combination_mode eq "interp") { |
| my @table_level_weights=(); |
| if($weights_str eq "") { |
| @table_level_weights= ((1/$nbtables) x $nbtables); |
| } |
| else { |
| @table_level_weights= split(/,/, $weights_str); |
| if(scalar(@table_level_weights) != $nbtables) { |
| die "$usage\n Invalid string for option --weights! Must be a comma-separated list of floats, one per ph.table.\n"; |
| } |
| } |
|
|
| for(my $i=0; $i<$nbtables; $i++) { |
| my @weights_pt = (($table_level_weights[$i]) x $nbscores); |
| if($phPenalty_idx!=-1) { |
| $weights_pt[$phPenalty_idx]=0; |
| } |
| print STDERR "WEIGHTS-PT_$i: ", join(" -- ", @weights_pt), "\n"; |
| $weights[$i] = \@weights_pt; |
| } |
| print STDERR "EPSILON: $epsilon \n"; |
| } |
|
|
|
|
| |
|
|
| my @empty_ppair=(""); |
| my @epsilons = (($epsilon) x $nbscores); |
| if($phPenalty_idx>-1) { |
| pop @epsilons; |
| } |
|
|
| my $nbPpairs_inAll=0; |
| my @nbPairs_found_only_in=((0) x $nbtables); |
| my $MINSCORE=1; |
|
|
| print STDERR "Working...\n\n"; |
| while(1) { |
| my $min_ppair=""; |
| my $reached_end_of_tables=1; |
| my @tablesContainingPpair=((0) x $nbtables); |
| for(my $i=0; $i<$nbtables; $i++) { |
| my $ppair=$read_ppairs[$i]->[0]; |
| if($ppair ne "") { |
| $reached_end_of_tables=0; |
| if($min_ppair eq "" || $ppair lt $min_ppair) { |
| $min_ppair=$ppair; |
| @tablesContainingPpair=((0) x $nbtables); |
| $tablesContainingPpair[$i]=1; |
| } |
| elsif($ppair eq $min_ppair) { |
| $tablesContainingPpair[$i]=1; |
| } |
| } |
| } |
| last if($reached_end_of_tables); |
|
|
| |
| &combine_ppair(\@read_ppairs, \@tablesContainingPpair); |
|
|
| &read_line_from_tables(\@table_files, \@read_ppairs, \@tablesContainingPpair); |
| |
| } |
|
|
| print STDERR "...done!\n"; |
|
|
| print STDERR "The minimum score in all tables is $MINSCORE\n"; |
|
|
| if($combination_mode eq "stats") { |
| my $tot_ppairs=0; |
| print " |
| # entries |
| found in all tables: $nbPpairs_inAll\n"; |
|
|
| for(my $i=0; $i<$nbtables; $i++) { |
| print "found only in PT_$i: $nbPairs_found_only_in[$i]\n"; |
| } |
|
|
| } |
|
|
| |
| sub combine_ppair(PPAIRS_REFARRAY, TABLE_INDICES_REFARRAY) { |
| my $ra_ppairs=shift; |
| |
| |
|
|
| my $ra_toRead=shift; |
|
|
| my $ppair=""; |
| my @scores=(); |
| my $additional_info=""; |
|
|
| my $to_print=1; |
|
|
| if($debug) { |
| print STDERR "combine_ppair:\n"; |
| for(my $i=0; $i<$nbtables; $i++) { |
| if($ra_toRead->[$i]) { |
| print STDERR "ppair_$i= ", join (" // ", @{$ra_ppairs->[$i]}), "\n"; |
| } |
| } |
| } |
|
|
| if($combination_mode eq "stats") { |
| $to_print=0; |
| my $found_in=-1; |
| my $nb_found=0; |
| for(my $i=0; $i<$nbtables; $i++) { |
| if($ra_toRead->[$i]) { |
| $found_in=$i; |
| $nb_found++; |
| } |
| } |
| if($nb_found==1) { $nbPairs_found_only_in[$found_in]++; } |
| elsif($nb_found==$nbtables) { $nbPpairs_inAll++; } |
| } |
| |
| elsif($combination_mode eq "fillup") { |
| my @bin_feats=(($exp_zero) x ($nbtables-1)); |
| for(my $i=0; $i<$nbtables; $i++) { |
| if($ra_toRead->[$i]) { |
| $ppair= shift(@{$ra_ppairs->[$i]}); |
| |
| if($i>0 && $newSourceMaxLength>-1) { |
| $ppair=~m/^(.*?)$delim_RE/; |
| if(scalar(split(/ +/, $1)) > $newSourceMaxLength && |
| !defined($sourcePhrasesPT1{$1})) |
| { $to_print=0; } |
| } |
| |
| @scores = @{shift(@{$ra_ppairs->[$i]})}; |
| |
| if($i>0) { $bin_feats[$i-1]=$exp_one; } |
| $additional_info=shift(@{$ra_ppairs->[$i]}); |
| last; |
| } |
| } |
| push(@scores, @bin_feats); |
| } |
| |
| elsif($combination_mode eq "backoff") { |
| |
| for(my $i=0; $i<$nbtables; $i++) { |
| if($ra_toRead->[$i]) { |
| $ppair= shift(@{$ra_ppairs->[$i]}); |
| |
| if($i>0 && $newSourceMaxLength>-1) { |
| $ppair=~m/^(.*?)$delim_RE/; |
| if(scalar(split(/ +/, $1)) > $newSourceMaxLength && |
| !defined($sourcePhrasesPT1{$1})) |
| { $to_print=0; } |
| } |
| @scores = @{shift(@{$ra_ppairs->[$i]})}; |
| $additional_info=shift(@{$ra_ppairs->[$i]}); |
| last; |
| } |
| } |
| } |
| |
| elsif($combination_mode eq "interp") { |
| my $firstPpair=-1; |
| @scores=((0) x $nbscores); |
| for(my $i=0; $i<$nbtables; $i++) { |
| if($ra_toRead->[$i]) { |
| if($firstPpair==-1) { $firstPpair=$i; } |
| $ppair= shift(@{$ra_ppairs->[$i]}); |
| my @scoresPT = @{shift(@{$ra_ppairs->[$i]})}; |
| for(my $j=0; $j<$nbscores; $j++) { |
| |
| $scores[$j]+= $weights[$i]->[$j]* $scoresPT[$j]; |
| } |
| } |
| else { |
| for(my $j=0; $j<$nbscores; $j++) { |
| $scores[$j]+= $weights[$i]->[$j]* $epsilon; |
| } |
| } |
| if($phPenalty_idx!=-1) { |
| $scores[$phPenalty_idx]= $phPenalty; |
| } |
| } |
| if($debug) { print STDERR "..taking info from ptable_$firstPpair\n"; } |
| $additional_info= shift(@{$ra_ppairs->[$firstPpair]}); |
| } |
| |
| elsif($combination_mode eq "union") { |
| my $firstPpair=-1; |
| for(my $i=0; $i<$nbtables; $i++) { |
| if($ra_toRead->[$i]) { |
| if($firstPpair==-1) { $firstPpair=$i; } |
| $ppair= shift(@{$ra_ppairs->[$i]}); |
| my @scoresPT= @{shift(@{$ra_ppairs->[$i]})}; |
| if($phPenalty_idx!=-1) { |
| |
| splice(@scoresPT, $phPenalty_idx, 1); |
| } |
| |
| push(@scores, @scoresPT); |
| } |
| else { |
| push(@scores, @epsilons); |
| } |
| } |
| if($phPenalty_idx!=-1) { |
| push(@scores, $phPenalty); |
| } |
| if($debug) { print STDERR "..taking info from ptable_$firstPpair\n"; } |
| $additional_info= shift(@{$ra_ppairs->[$firstPpair]}); |
| } |
| |
| elsif($combination_mode eq "intersect1") { |
| $to_print=0; |
| my $found_in_all=1; |
| for(my $i=0; $i<$nbtables; $i++) { |
| if(!$ra_toRead->[$i]) { |
| $found_in_all=0; |
| last; |
| } |
| } |
| if($found_in_all) { |
| $to_print=1; |
| $ppair= shift(@{$ra_ppairs->[0]}); |
| |
| @scores= @{shift(@{$ra_ppairs->[0]})}; |
| $additional_info= shift(@{$ra_ppairs->[0]}); |
| } |
| } |
| else { |
| die "$usage\nUnknown combination mode!\n"; |
| } |
|
|
|
|
| if($to_print) { |
| if($additional_info eq "") { |
| print $ppair, join(" ", @scores), "\n"; |
| }else { |
| print $ppair, join(" ", @scores), $delim, $additional_info, "\n"; |
| } |
| } |
| } |
|
|
| |
| |
| |
| |
| sub read_line_from_tables(FILES_REFARRAY, PPAIRS_REFARRAY, TABLE_INDICES_REFARRAY) { |
| my $ra_files=shift; |
| my $ra_ppairs=shift; |
|
|
| my $ra_toRead=shift; |
| my @toRead=((1) x $nbtables); |
| if($ra_toRead ne "") { |
| @toRead=@$ra_toRead; |
| } |
|
|
| my $nbscores=-1; |
| my $key=""; my $additional_info=""; |
| for(my $i=0; $i<$nbtables; $i++) { |
| next if($toRead[$i]==0); |
| my @ppair=(); |
| my $file=$ra_files->[$i]; |
| if(my $line = <$file>) { |
| chomp $line; |
| my @fields = split(/$delim_RE/, $line); |
| if(scalar(@fields)<3) { |
| die "Invalid phrase table entry:\n$line\n"; |
| } |
| my @scores = split(/\s+/, $fields[2]); |
| foreach my $score (@scores) { |
| if($score<$MINSCORE) { $MINSCORE=$score; } |
| } |
| |
| |
| if($nbscores==-1) { |
| $nbscores=scalar(@scores); |
| } elsif($nbscores!=scalar(@scores) && $combination_mode ne "intersect1") { |
| die "Wrong number of scores in table-$i! Should be $nbscores\n"; |
| } |
| |
| if(scalar(@fields)>3) { |
| $additional_info=join($delim, splice(@fields,3)); |
| |
| } |
| my $key = "$fields[0]$delim$fields[1]$delim"; |
| push(@ppair, $key, \@scores, $additional_info); |
| } |
| else { |
| push(@ppair, ""); |
| } |
| $ra_ppairs->[$i]=\@ppair; |
| } |
|
|
| return $nbscores; |
| } |
|
|
| |
| } |
|
|
|
|
| &main; |
|
|