File size: 3,627 Bytes
fd49381 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
use strict;
# Build necessary files for sparse lexical features
# * target word insertion
# * source word deletion
# * word translation
# * phrase length
# Command line arguments (positional):
#   corpus            stem of the corpus; word counts are read from
#                     "$corpus.$input_extension" / "$corpus.$output_extension"
#   input_extension   extension of the source side of the corpus
#   output_extension  extension of the target side of the corpus
#   outfile_prefix    prefix of all files written here
#                     ("$outfile_prefix.ini" and the top-word lists)
#   specification     comma-separated feature specifications, each one of
#                       target-word-insertion (all | top N)
#                       source-word-deletion  (all | top N)
#                       word-translation      (all | top N-SOURCE N-TARGET)
#                       phrase-length
#                     optionally with a " factor F" clause (" factor F-G"
#                     for word-translation) after the feature type
my ($corpus,$input_extension,$output_extension,$outfile_prefix,$specification) = @ARGV;
die("ERROR: syntax: $0 corpus input-extension output-extension outfile-prefix specification\n")
  unless defined($outfile_prefix);
# a missing specification just produces an empty [feature] section
$specification = "" unless defined($specification);

my $ini = "[feature]\n";
my %ALREADY; # word lists already written by create_top_words
my %ID;      # number of times each feature type was requested, see get_id

# features that look at one side of the corpus only:
# feature type => [ feature function, name prefix, corpus extension ]
my %ONE_SIDED = (
  'target-word-insertion' => [ 'TargetWordInsertionFeature', 'TWI', $output_extension ],
  'source-word-deletion'  => [ 'SourceWordDeletionFeature',  'SWD', $input_extension ],
);

foreach my $feature_spec (split(/,\s*/,$specification)) {
  # remove the optional factor clause BEFORE tokenizing, so that it does
  # not shift the positions of the remaining parameters
  my $factor;
  $factor = $1 if $feature_spec =~ s/ factor ([\d\-]+)//;

  my @SPEC = split(' ',$feature_spec);
  my $type = defined($SPEC[0]) ? $SPEC[0] : "";
  my $mode = defined($SPEC[1]) ? $SPEC[1] : "";
  $factor = ($type eq 'word-translation') ? "0-0" : "0" unless defined($factor);

  if (exists($ONE_SIDED{$type})) {
    my ($feature_function,$prefix,$extension) = @{$ONE_SIDED{$type}};
    $ini .= "$feature_function name=$prefix".get_id($type)." factor=$factor";
    if ($mode eq 'top' && defined($SPEC[2]) && $SPEC[2] =~ /^\d+$/) {
      # restrict the feature to the most frequent words
      my $file = create_top_words($extension, $SPEC[2]);
      $ini .= " path=$file";
    }
    elsif ($mode ne 'all') {
      die("ERROR: Unknown parameter specification in '$feature_spec'\n");
    }
    $ini .= "\n";
  }
  elsif ($type eq 'word-translation') {
    my $extra_ini = "";
    if ($mode eq 'top' && defined($SPEC[2]) && $SPEC[2] =~ /^\d+$/
                       && defined($SPEC[3]) && $SPEC[3] =~ /^\d+$/) {
      # first number limits the source words, second the target words
      my $file_in = create_top_words($input_extension, $SPEC[2]);
      my $file_out = create_top_words($output_extension, $SPEC[3]);
      $extra_ini .= " source-path=$file_in target-path=$file_out";
    }
    elsif ($mode ne 'all') {
      die("ERROR: Unknown parameter specification in '$feature_spec'\n");
    }
    my ($input_factor,$output_factor) = split(/\-/,$factor);
    $ini .= "WordTranslationFeature name=WT".get_id($type)." input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n";
  }
  elsif ($type eq 'phrase-length') {
    $ini .= "PhraseLengthFeature name=PL\n";
  }
  else {
    die("ERROR: Unknown feature type '$type' in specification '$feature_spec'\nfull spec: '$specification'\n");
  }
}

# write the collected feature lines
my $ini_file = "$outfile_prefix.ini";
open(my $INI,'>',$ini_file) or die("ERROR: could not open '$ini_file' for writing: $!\n");
print $INI "$ini\n\n";
close($INI) or die("ERROR: could not write '$ini_file': $!\n");
# Write a list of the most frequent words of one side of the corpus.
#
# Reads "$corpus.$extension", counts every whitespace-separated token
# (with factors after a "|" stripped) and writes at most $count words,
# one per line and most frequent first, to
# "$outfile_prefix.$extension.top$count". Words that occur 3 times or
# less and words that contain a colon are never written. Words with the
# same count are written in descending string order.
#
# Each combination of $extension and $count is only processed once per
# run; later calls just return the file name.
#
# Parameters:
#   $extension  extension of the corpus file to count words in
#   $count      maximum number of words to write
# Returns: name of the word list file
# Dies if the corpus cannot be read or the word list cannot be written.
sub create_top_words {
  my ($extension, $count) = @_;
  my $file = "$outfile_prefix.$extension.top$count";
  return $file if defined($ALREADY{"$extension,$count"});
  $ALREADY{"$extension,$count"}++;

  # get counts
  my %COUNT;
  my $corpus_file = "$corpus.$extension";
  open(my $CORPUS,'<',$corpus_file) or die("ERROR: could not open corpus '$corpus_file': $!\n");
  while(my $line = <$CORPUS>) {
    # chomp, not chop: the last line may come without a newline, and
    # chop would then cut the final character off its last word
    chomp($line);
    foreach my $word (split(' ',$line)) {
      $word =~ s/\|.+//; # only surface factor at this point
      $COUNT{$word}++ unless $word eq "";
    }
  }
  close($CORPUS);

  # sort by count (numerically, so the size of the counts does not
  # matter), ties by word, both descending
  my @CANDIDATE = grep {
    $COUNT{$_} > 3   # avoid large tail
    && $_ !~ /:/     # avoid colon bug
  } keys %COUNT;
  my @SORTED = sort { $COUNT{$b} <=> $COUNT{$a} || $b cmp $a } @CANDIDATE;

  # write top n to file
  open(my $TOP,'>',$file) or die("ERROR: could not open '$file' for writing: $!\n");
  my $written = 0;
  foreach my $word (@SORTED) {
    last if $written++ >= $count;
    print $TOP "$word\n";
  }
  close($TOP) or die("ERROR: could not write '$file': $!\n");

  return $file;
}
# Return the suffix that makes the name of a feature unique.
#
# Keeps a per-name request counter in %ID: the first request for a
# name yields the empty string, every later request yields the number
# of requests made for that name so far (2, 3, ...).
sub get_id {
  my $feature_type = shift;
  my $requests = ++$ID{$feature_type};
  return $requests == 1 ? "" : $requests;
}
|