File size: 3,627 Bytes
fd49381
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

# Build necessary files for sparse lexical features
# * target word insertion
# * source word deletion
# * word translation
# * phrase length

# Command line:
#   corpus            stem of the training corpus ("$corpus.$extension" is read)
#   input_extension   extension of the source side of the corpus
#   output_extension  extension of the target side of the corpus
#   outfile_prefix    prefix of every file written by this script
#   specification     comma-separated list of feature specifications, e.g.
#                     "target-word-insertion top 50, word-translation all"
my ($corpus,$input_extension,$output_extension,$outfile_prefix,$specification) = @ARGV;
die("Usage: $0 corpus input-extension output-extension outfile-prefix specification\n")
  unless defined($outfile_prefix);
# a missing specification is treated like an empty one: only the section
# header is written to the ini file
$specification = "" unless defined($specification);

my $ini = "[feature]\n";
my %ALREADY; # word lists already written by create_top_words
my %ID;      # per-feature-type counters used by get_id

# features that are configured by a single word list:
# feature type => [ feature function, name prefix, corpus side of the word list ]
my %WORD_LIST_FEATURE = (
  'target-word-insertion' => ['TargetWordInsertionFeature', 'TWI', $output_extension],
  'source-word-deletion'  => ['SourceWordDeletionFeature',  'SWD', $input_extension],
);

foreach my $feature_spec (split(/,\s*/,$specification)) {
  my @SPEC = split(/\s+/,$feature_spec);

  # absent parameters compare as empty strings, so that an incomplete
  # specification reaches the error messages below without warnings
  my ($type,$mode,$count1,$count2) = @SPEC;
  foreach ($type,$mode,$count1,$count2) {
    $_ = "" unless defined($_);
  }

  # optional " factor N" (or " factor N-M" for word translation); it is
  # stripped from the specification once it has been read
  my $factor = ($type eq 'word-translation') ? "0-0" : "0";
  if ($feature_spec =~ s/ factor ([\d\-]+)//) {
    $factor = $1;
  }

  if (my $feature = $WORD_LIST_FEATURE{$type}) {
    my ($class,$prefix,$extension) = @$feature;
    $ini .= "$class name=$prefix".get_id($type)." factor=$factor";

    if ($mode eq 'top' && $count1 =~ /^\d+$/) {
      # restrict the feature to the most frequent words
      my $file = create_top_words($extension, $count1);
      $ini .= " path=$file";
    }
    elsif ($mode ne 'all') {
      die("ERROR: Unknown parameter specification in '$feature_spec'\n");
    }
    $ini .= "\n";
  }
  elsif ($type eq 'word-translation') {
    my $extra_ini = "";
    if ($mode eq 'top' && $count1 =~ /^\d+$/ && $count2 =~ /^\d+$/) {
      # separate word lists for the source and the target side
      my $file_in  = create_top_words($input_extension,  $count1);
      my $file_out = create_top_words($output_extension, $count2);
      $extra_ini = " source-path=$file_in target-path=$file_out";
    }
    elsif ($mode ne 'all') {
      # report the whole specification, like the other feature types do
      die("ERROR: Unknown parameter specification in '$feature_spec'\n");
    }
    my ($input_factor,$output_factor) = split(/\-/,$factor);
    $ini .= "WordTranslationFeature name=WT".get_id($type)." input-factor=$input_factor output-factor=$output_factor simple=1 source-context=0 target-context=0$extra_ini\n";
  }
  elsif ($type eq 'phrase-length') {
    $ini .= "PhraseLengthFeature name=PL\n";
  }
  else {
    die("ERROR: Unknown feature type '$type' in specification '$feature_spec'\nfull spec: '$specification'\n");
  }
}

# write the collected feature definitions
my $ini_file = "$outfile_prefix.ini";
open(my $INI, '>', $ini_file)
  or die("ERROR: cannot write to '$ini_file': $!\n");
print $INI "$ini\n\n";
close($INI)
  or die("ERROR: failed to close '$ini_file': $!\n");

# Write the most frequent words of one side of the corpus to a file.
#
# Reads "$corpus.$extension", counts the surface form of every token
# (anything from the first "|" on is dropped) and writes at most $count words
# to "$outfile_prefix.$extension.top$count", one per line, most frequent
# first. Words seen 3 times or less and words containing a colon are never
# written. Each extension/count combination is built only once per run;
# later calls just return the file name.
#
# Parameters: $extension - corpus file extension
#             $count     - maximum number of words to write
# Returns:    name of the word list file
# Dies if the corpus cannot be read or the word list cannot be written.
sub create_top_words {
  my ($extension, $count) = @_;
  my $file = "$outfile_prefix.$extension.top$count";
  return $file if exists($ALREADY{"$extension,$count"});
  $ALREADY{"$extension,$count"}++;

  # get counts
  my %COUNT;
  my $corpus_file = "$corpus.$extension";
  open(my $CORPUS, '<', $corpus_file)
    or die("ERROR: cannot read corpus '$corpus_file': $!\n");
  while(my $line = <$CORPUS>) {
    # chomp, not chop: the last line may lack a newline, and chop would
    # then cut off the final character of its last word
    chomp($line);
    foreach my $word (split(' ',$line)) {
      $word =~ s/\|.+//; # only surface factor at this point
      $COUNT{$word}++ unless $word eq "";
    }
  }
  close($CORPUS);

  # sort: by count, most frequent first; ties in reverse string order
  my @CANDIDATE = grep {
    $COUNT{$_} > 3   # avoid large tail
    && $_ !~ /:/     # avoid colon bug
  } keys %COUNT;
  my @SORTED = sort { $COUNT{$b} <=> $COUNT{$a} || $b cmp $a } @CANDIDATE;

  # write top n to file
  my $n = ($count < scalar(@SORTED)) ? $count : scalar(@SORTED);
  open(my $TOP, '>', $file)
    or die("ERROR: cannot write to '$file': $!\n");
  foreach my $word (@SORTED[0 .. $n-1]) {
    print $TOP "$word\n";
  }
  close($TOP)
    or die("ERROR: failed to close '$file': $!\n");

  return $file;
}

# Return the suffix that distinguishes repeated requests for the same name:
# the empty string on the first call for $name, and the number of calls made
# so far for that name (2, 3, ...) on every later call. The per-name
# counters are kept in %ID.
sub get_id {
  my $name = shift;
  my $calls = ++$ID{$name};
  return ($calls == 1) ? "" : $calls;
}