| | |
| | |
| | |
| | |
| |
|
| | use warnings; |
| | use strict; |
| | use Getopt::Long "GetOptions"; |
| | use FindBin qw($RealBin); |
| |
|
# Command-line settings.  Each one stays undefined until GetOptions fills it
# in, except $UNPARSEABLE, which starts out as 0.
my ($EGRET_DIR, $MOSES_DIR, $TREE_CONVERTER);
my ($FOREST, $SPLIT_HYPHEN, $SPLIT_SLASH, $MARK_SPLIT, $BINARIZE);
my ($RAW_IN, $RAW_OUT, $EGRET_OPTIONS, $TREE_CONVERTER_OPTIONS);
my $UNPARSEABLE = 0;

my $usage = "ERROR: syntax is: parse-en-egret.perl [-forest] [-split-hyphen] [-split-slash] [-mark-split] [-binarize] [-unparseable] [-raw-in PATH] [-raw-out PATH] [-egret-options OPTIONS] [-tree-converter-options OPTIONS] -egret-dir DIR -moses-dir DIR -tree-converter PATH < in > out\n";

# Option name => destination variable.  Entries ending in "=s" take a string
# argument; the others are plain flags.
my @option_spec = (
    'egret-dir=s'              => \$EGRET_DIR,
    'moses-dir=s'              => \$MOSES_DIR,
    'tree-converter=s'         => \$TREE_CONVERTER,
    'forest'                   => \$FOREST,
    'split-hyphen'             => \$SPLIT_HYPHEN,
    'split-slash'              => \$SPLIT_SLASH,
    'mark-split'               => \$MARK_SPLIT,
    'binarize'                 => \$BINARIZE,
    'unparseable'              => \$UNPARSEABLE,
    'raw-in=s'                 => \$RAW_IN,
    'raw-out=s'                => \$RAW_OUT,
    'egret-options=s'          => \$EGRET_OPTIONS,
    'tree-converter-options=s' => \$TREE_CONVERTER_OPTIONS,
);

# Print the usage line and stop if parsing failed or if any of the three
# mandatory options (-egret-dir, -moses-dir, -tree-converter) is missing.
my $options_ok = GetOptions(@option_spec);
if (   !$options_ok
    || !defined($EGRET_DIR)
    || !defined($MOSES_DIR)
    || !defined($TREE_CONVERTER))
{
    die($usage);
}

# The two directories must exist and the tree converter must be executable.
if (!-d $EGRET_DIR) {
    die("ERROR: could not find egret directory: '$EGRET_DIR'\n");
}
if (!-d $MOSES_DIR) {
    die("ERROR: could not find moses directory: '$MOSES_DIR'\n");
}
if (!-x $TREE_CONVERTER) {
    die("ERROR: file not found or not executable: '$TREE_CONVERTER'\n");
}
| |
|
| | |
| |
|
# Scratch files used by the rest of the script; the names embed the process
# id ($$).
my $tmpEscaped = "/tmp/parse-en-egret.1.$$";
my $tmpDeescaped = "/tmp/parse-en-egret.2.$$";
my $tmpSplitPoints = "/tmp/parse-en-egret.3.$$";

# Open the plain files in truncate mode.  The de-escaped file is written with
# a shell '>' redirect and therefore always starts empty; appending to the
# other two would leave them out of step with it if stale files from an
# earlier run with the same process id were still lying around.
open(my $escaped_fh, '>', $tmpEscaped)
    or die("ERROR: could not open '$tmpEscaped' for writing: $!\n");
my $deescape_cmd =
    "$RealBin/../../tokenizer/deescape-special-chars.perl > $tmpDeescaped";
open(my $deescaped_fh, '|-', $deescape_cmd)
    or die("ERROR: could not start '$deescape_cmd': $!\n");
open(my $split_points_fh, '>', $tmpSplitPoints)
    or die("ERROR: could not open '$tmpSplitPoints' for writing: $!\n");

# For every input line:
#   * copy the line unchanged to the escaped file;
#   * write a version in which each "@-@ X" / "@/@ X" token pair (when the
#     matching -split-hyphen / -split-slash option was given) is glued onto
#     the preceding token as "-X" / "/X" to the de-escaping filter;
#   * write one "tokenIndex,charOffset,splitChar " record per glued pair to
#     the split-points file.
while (my $line = <STDIN>) {
    print {$escaped_fh} $line;

    my @tokens = split(' ', $line);
    my $new_token = "";     # output token currently being assembled
    my $i = 0;              # index into @tokens
    my $j = -1;             # index of $new_token among the output tokens
    my $s = "";             # rewritten sentence
    my $t = "";             # split-point records for this sentence

    while ($i <= $#tokens) {
        # A split marker only counts when a token follows it.
        my $has_next = $i <= $#tokens - 1;
        my $split_char;
        if (defined($SPLIT_HYPHEN) && $has_next && $tokens[$i] eq '@-@') {
            $split_char = '-';
        } elsif (defined($SPLIT_SLASH) && $has_next && $tokens[$i] eq '@/@') {
            $split_char = '/';
        }

        if (defined($split_char)) {
            # Record where the separator lands inside the glued token, then
            # consume both the marker and the token that follows it.
            my $pos = length $new_token;
            $new_token .= "$split_char$tokens[$i+1]";
            $t .= "$j,$pos,$split_char ";
            $i += 2;
        } else {
            # Flush the finished token and start a new one.
            $s .= "$new_token ";
            $new_token = $tokens[$i];
            $i++;
            $j++;
        }
    }
    $s .= "$new_token";

    # The first flush emits an empty token followed by a space; drop it.
    $s =~ s/^\s+//;
    $t =~ s/^\s+//;
    print {$deescaped_fh} "$s\n";
    print {$split_points_fh} "$t\n";
}

# Buffered write errors and a failing de-escape filter only become visible
# when the handles are closed, so check every close.
close($split_points_fh)
    or die("ERROR: could not write '$tmpSplitPoints': $!\n");
close($deescaped_fh)
    or die("ERROR: '$deescape_cmd' failed"
           . ($! ? ": $!" : " (wait status $?)") . "\n");
close($escaped_fh)
    or die("ERROR: could not write '$tmpEscaped': $!\n");
| |
|
| | |
| |
|
| | |
# Build the shell pipeline that produces the parses.  The finished string
# ends in '|' because it is later handed to open() as a command to read from.

# Quote a string so that /bin/sh treats it as one literal word.  Used for the
# paths that were validated as literal file names, so that directories
# containing spaces or shell metacharacters do not break the command line.
my $sh_quote = sub {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
};

my $pipeline = "";

# Source of raw parser output: either a previously saved file (-raw-in) or a
# fresh Egret run over the de-escaped input.
if (defined($RAW_IN)) {
    # Kept in double quotes exactly as before.
    $pipeline .= "cat \"$RAW_IN\" |";
} else {
    $pipeline .= $sh_quote->("$EGRET_DIR/egret");
    $pipeline .= " -lapcfg";
    $pipeline .= " -data=" . $sh_quote->("$EGRET_DIR/eng_grammar");
    $pipeline .= " -printForest" if $FOREST;
    $pipeline .= " -i=" . $sh_quote->($tmpDeescaped);
    # Free-form option string: deliberately left unquoted so that it can
    # carry several options.
    $pipeline .= " $EGRET_OPTIONS" if defined($EGRET_OPTIONS);
    $pipeline .= " |";
}

# Optionally keep a copy of the raw parser output (-raw-out).
if (defined($RAW_OUT)) {
    $pipeline .= "tee \"$RAW_OUT\" |";
}

# In tree mode, strip one leading '(' and one trailing ')' from each line and
# convert from penn to egret format.
unless ($FOREST) {
    $pipeline .= 'sed \'s/^(//\' |';
    $pipeline .= 'sed \'s/)$//\' |';
    $pipeline .= $sh_quote->($TREE_CONVERTER);
    $pipeline .= " -input_format penn";
    $pipeline .= " -output_format egret";
    $pipeline .= " |";
}

# Post-process, passing the split-points file written earlier.
$pipeline .= $sh_quote->("$MOSES_DIR/bin/postprocess-egret-forests");
$pipeline .= " --Escape" if $FOREST;
$pipeline .= " --MarkSplitPoints " . $sh_quote->($tmpSplitPoints);
$pipeline .= " |";

# Convert from egret format to the final format: egret in forest mode, penn
# otherwise.
my $output_format = $FOREST ? "egret" : "penn";
$pipeline .= $sh_quote->($TREE_CONVERTER);
$pipeline .= " -input_format egret";
$pipeline .= " -output_format $output_format";
$pipeline .= " -split \@\-\@" if defined($SPLIT_HYPHEN);
$pipeline .= " -split \@\/\@" if defined($SPLIT_SLASH);
# Free-form option string: deliberately left unquoted.
$pipeline .= " $TREE_CONVERTER_OPTIONS" if defined($TREE_CONVERTER_OPTIONS);
$pipeline .= " |";

# In tree mode, re-wrap each tree in an extra pair of parentheses (a bare
# "()" or empty line becomes "(())"), convert to Moses XML, and rename a
# leading TOP label to ROOT.
unless ($FOREST) {
    $pipeline .= 'sed \'s/^()$//\' |';
    $pipeline .= 'sed \'s/^(/( (/\' |';
    $pipeline .= 'sed \'s/)$/))/\' |';
    $pipeline .= 'sed \'s/^$/(())/\' |';
    $pipeline .= $sh_quote->("$RealBin/berkeleyparsed2mosesxml.perl") . " |";
    $pipeline .= 'sed \'s/^<tree label="TOP"/<tree label="ROOT"/\' |';
}
| |
|
| | |
| |
|
# Run the pipeline, copy its output to STDOUT, then remove the scratch files.

my @tmp_files = ($tmpSplitPoints, $tmpDeescaped, $tmpEscaped);

# Remove the scratch files, reporting (but not dying on) any that cannot be
# deleted.
my $remove_tmp_files = sub {
    for my $path (@tmp_files) {
        unlink($path)
            or warn("WARNING: could not remove '$path': $!\n");
    }
};

# The pipeline string ends in '|'; the explicit '-|' mode expects the bare
# command, so drop that trailing pipe symbol.
(my $parse_cmd = $pipeline) =~ s/\s*\|\s*\z//;

my $parse_fh;
unless (open($parse_fh, '-|', $parse_cmd)) {
    my $reason = $!;
    $remove_tmp_files->();
    die("ERROR: could not run parsing pipeline: $reason\n");
}

if ($FOREST) {
    # Forest mode: pass the pipeline output through untouched.
    while (my $outLine = <$parse_fh>) {
        print $outLine;
    }
} else {
    # Tree mode: read the saved original input in lock-step with the parser
    # output so that an empty result can be replaced by the original line
    # when -unparseable was given.
    my $escaped_fh;
    unless (open($escaped_fh, '<', $tmpEscaped)) {
        my $reason = $!;
        close($parse_fh);
        $remove_tmp_files->();
        die("ERROR: could not open '$tmpEscaped' for reading: $reason\n");
    }
    while (my $outLine = <$parse_fh>) {
        my $unparsedLine = <$escaped_fh>;
        # A line of length 1 holds nothing but its newline.  Fall back to
        # the parser's own (empty) line if the saved input has run out.
        if ($UNPARSEABLE == 1 && length($outLine) == 1
            && defined($unparsedLine)) {
            print $unparsedLine;
        } else {
            print $outLine;
        }
    }
    close($escaped_fh);
}

# close() on a pipe waits for the command and returns false if it exited
# with a non-zero status; capture that before cleaning up.
my $pipeline_ok = close($parse_fh);
my ($close_errno, $wait_status) = ("$!", $?);

$remove_tmp_files->();

unless ($pipeline_ok) {
    die("ERROR: parsing pipeline failed"
        . ($wait_status ? " (wait status $wait_status)" : ": $close_errno")
        . "\n");
}
| |
|