| |
| |
| |
| |
|
|
| use warnings; |
| use strict; |
| use Getopt::Long "GetOptions"; |
| use FindBin qw($RealBin); |
|
|
# Command-line options.  All of these are file-scope variables that the rest
# of the script reads directly.
#   -egret-dir, -moses-dir, -tree-converter   required locations
#   -forest, -split-hyphen, -split-slash,
#   -mark-split, -binarize, -unparseable      boolean flags
#   -raw-in, -raw-out                         paths for raw parser output
#   -egret-options, -tree-converter-options   extra arguments passed through
# NOTE(review): $MARK_SPLIT and $BINARIZE are accepted here but are not
# referenced anywhere else in the visible script -- confirm whether they are
# intentionally ignored.
my ($EGRET_DIR,$MOSES_DIR,$TREE_CONVERTER,$FOREST,$SPLIT_HYPHEN,$SPLIT_SLASH,$MARK_SPLIT,$BINARIZE,$UNPARSEABLE,$RAW_IN,$RAW_OUT,$EGRET_OPTIONS,$TREE_CONVERTER_OPTIONS);

# -unparseable is compared numerically later on, so it needs a defined default.
$UNPARSEABLE = 0;

my $usage = "ERROR: syntax is: parse-en-egret.perl [-forest] [-split-hyphen] [-split-slash] [-mark-split] [-binarize] [-unparseable] [-raw-in PATH] [-raw-out PATH] [-egret-options OPTIONS] [-tree-converter-options OPTIONS] -egret-dir DIR -moses-dir DIR -tree-converter PATH < in > out\n";

# Call GetOptions as a normal function (not with the legacy & sigil form).
my $options_ok = GetOptions(
    'egret-dir=s'              => \$EGRET_DIR,
    'moses-dir=s'              => \$MOSES_DIR,
    'tree-converter=s'         => \$TREE_CONVERTER,
    'forest'                   => \$FOREST,
    'split-hyphen'             => \$SPLIT_HYPHEN,
    'split-slash'              => \$SPLIT_SLASH,
    'mark-split'               => \$MARK_SPLIT,
    'binarize'                 => \$BINARIZE,
    'unparseable'              => \$UNPARSEABLE,
    'raw-in=s'                 => \$RAW_IN,
    'raw-out=s'                => \$RAW_OUT,
    'egret-options=s'          => \$EGRET_OPTIONS,
    'tree-converter-options=s' => \$TREE_CONVERTER_OPTIONS,
);

# Abort with the usage message on a malformed command line or when any of the
# three required options is missing.
die($usage)
    unless $options_ok
        && defined($EGRET_DIR)
        && defined($MOSES_DIR)
        && defined($TREE_CONVERTER);

# Validate the supplied locations before doing any work.
die("ERROR: could not find egret directory: '$EGRET_DIR'\n") unless -d $EGRET_DIR;
die("ERROR: could not find moses directory: '$MOSES_DIR'\n") unless -d $MOSES_DIR;

# -x alone is also true for a searchable directory, so additionally require a
# plain file.
die("ERROR: file not found or not executable: '$TREE_CONVERTER'\n")
    unless -f $TREE_CONVERTER && -x $TREE_CONVERTER;
|
|
| |
|
|
# Temporary files; the process ID ($$) is embedded in each name.
#   $tmpEscaped     verbatim copy of STDIN (read back by the output stage)
#   $tmpDeescaped   input after token merging, written through
#                   deescape-special-chars.perl (handed to egret via -i=)
#   $tmpSplitPoints one line of split-point records per input sentence
# FIXME: the names are predictable; File::Temp would be safer on a shared /tmp.
my $tmpEscaped = "/tmp/parse-en-egret.1.$$";
my $tmpDeescaped = "/tmp/parse-en-egret.2.$$";
my $tmpSplitPoints = "/tmp/parse-en-egret.3.$$";

# Open with truncation rather than append: a stale file left behind under the
# same name must not be prepended to this run's data, or the per-line
# alignment between the three files and the parser output would be lost.
open(my $escaped_fh, '>', $tmpEscaped)
    or die("ERROR: could not open '$tmpEscaped' for writing: $!\n");
open(my $deescaped_fh, '|-',
     "$RealBin/../../tokenizer/deescape-special-chars.perl > $tmpDeescaped")
    or die("ERROR: could not start deescape-special-chars.perl: $!\n");
open(my $split_points_fh, '>', $tmpSplitPoints)
    or die("ERROR: could not open '$tmpSplitPoints' for writing: $!\n");

# Marker tokens that are merged back into the surrounding word, mapped to the
# character they stand for.  A marker is only active when its option was given.
my %joiner_for;
$joiner_for{'@-@'} = '-' if defined($SPLIT_HYPHEN);
$joiner_for{'@/@'} = '/' if defined($SPLIT_SLASH);

# For every input sentence:
#   * copy it unchanged to $tmpEscaped;
#   * glue "A @-@ B" / "A @/@ B" sequences back into "A-B" / "A/B" and send
#     the result to the de-escaper;
#   * record each merge as "WORD_INDEX,OFFSET,CHAR " in $tmpSplitPoints, where
#     WORD_INDEX is the 0-based index of the merged word and OFFSET is the
#     length of that word before the joiner was appended.
while (my $line = <STDIN>) {
    print {$escaped_fh} $line;

    my @tokens = split(' ', $line);
    my $new_token = "";    # word currently being assembled
    my $word_index = -1;   # index of $new_token among the output words
    my $s = "";            # merged sentence
    my $t = "";            # split-point records for this sentence
    my $i = 0;
    while ($i <= $#tokens) {
        # A marker only counts when another token follows it.
        if ($i < $#tokens && exists($joiner_for{$tokens[$i]})) {
            my $joiner = $joiner_for{$tokens[$i]};
            my $pos = length($new_token);
            $new_token .= $joiner . $tokens[$i + 1];
            $t .= "$word_index,$pos,$joiner ";
            $i += 2;
        }
        else {
            # Flush the finished word and start a new one.
            $s .= "$new_token ";
            $new_token = $tokens[$i];
            $i++;
            $word_index++;
        }
    }
    $s .= $new_token;

    # The first flush emits the empty initial word followed by a space.
    $s =~ s/^\s+//;
    $t =~ s/^\s+//;

    print {$deescaped_fh} "$s\n";
    print {$split_points_fh} "$t\n";
}

# Buffered write errors and the de-escaper's exit status only surface at
# close(), so check every one; continuing with damaged input is pointless.
close($split_points_fh)
    or die("ERROR: could not write '$tmpSplitPoints': $!\n");
close($deescaped_fh)
    or die($! ? "ERROR: could not close pipe to deescape-special-chars.perl: $!\n"
              : "ERROR: deescape-special-chars.perl failed (wait status $?)\n");
close($escaped_fh)
    or die("ERROR: could not write '$tmpEscaped': $!\n");
|
|
| |
|
|
| |
# Assemble the shell pipeline as a list of stages.  Every stage is later
# terminated with " |", so the finished string also ends in a pipe symbol.
my @stages;

# Stage 1: where the raw parser output comes from -- either a previously saved
# file (-raw-in) or a fresh egret run over the de-escaped input.
if (defined($RAW_IN)) {
    push(@stages, "cat \"$RAW_IN\"");
}
else {
    my $egret = "$EGRET_DIR/egret -lapcfg -data=$EGRET_DIR/eng_grammar";
    $egret .= " -printForest" if $FOREST;
    $egret .= " -i=$tmpDeescaped";
    $egret .= " $EGRET_OPTIONS" if defined($EGRET_OPTIONS);
    push(@stages, $egret);
}

# Optionally keep a copy of the raw parser output (-raw-out).
push(@stages, "tee \"$RAW_OUT\"") if defined($RAW_OUT);

# Tree mode only: drop the leading "(" and trailing ")" of each line, then
# run the tree converter from penn to egret format.
if (!$FOREST) {
    push(@stages,
         q{sed 's/^(//'},
         q{sed 's/)$//'},
         "$TREE_CONVERTER -input_format penn -output_format egret");
}

# Post-process with the Moses tool; the split-point file is always supplied,
# escaping is requested in forest mode only.
my $postprocess = "$MOSES_DIR/bin/postprocess-egret-forests";
$postprocess .= " --Escape" if $FOREST;
$postprocess .= " --MarkSplitPoints $tmpSplitPoints";
push(@stages, $postprocess);

# Convert from egret format to the final format (egret for forests, penn for
# trees), passing on the split markers and any user-supplied options.
my $output_format = $FOREST ? "egret" : "penn";
my $convert = "$TREE_CONVERTER -input_format egret -output_format $output_format";
$convert .= ' -split @-@' if defined($SPLIT_HYPHEN);
$convert .= ' -split @/@' if defined($SPLIT_SLASH);
$convert .= " $TREE_CONVERTER_OPTIONS" if defined($TREE_CONVERTER_OPTIONS);
push(@stages, $convert);

# Tree mode only: blank out "()" lines, double the outer parentheses, turn
# empty lines into "(())", convert to Moses XML and relabel TOP as ROOT.
if (!$FOREST) {
    push(@stages,
         q{sed 's/^()$//'},
         q{sed 's/^(/( (/'},
         q{sed 's/)$/))/'},
         q{sed 's/^$/(())/'},
         "$RealBin/berkeleyparsed2mosesxml.perl",
         q{sed 's/^<tree label="TOP"/<tree label="ROOT"/'});
}

my $pipeline = join("", map { "$_ |" } @stages);
|
|
| |
|
|
# Run the assembled pipeline and copy its output to STDOUT.
# $pipeline ends in " |" (the two-argument open convention for "read from
# this command"); strip that marker and use the explicit three-argument form.
(my $command = $pipeline) =~ s/\s*\|\s*\z//;
open(my $parse_fh, '-|', $command)
    or die("ERROR: could not run parser pipeline: $!\n");

if ($FOREST) {
    # Forest output is passed through unchanged.
    while (my $outLine = <$parse_fh>) {
        print $outLine;
    }
}
else {
    # Tree output is read in step with the saved copy of the original input,
    # one input line per output line.
    open(my $escaped_in, '<', $tmpEscaped)
        or die("ERROR: could not open '$tmpEscaped' for reading: $!\n");
    while (my $outLine = <$parse_fh>) {
        my $unparsedLine = <$escaped_in>;
        # With -unparseable, an output line holding nothing but its newline is
        # replaced by the corresponding original input line.  If the input
        # copy has run out, fall back to the pipeline's own line instead of
        # printing an undefined value.
        if ($UNPARSEABLE == 1 && length($outLine) == 1 && defined($unparsedLine)) {
            print $unparsedLine;
        }
        else {
            print $outLine;
        }
    }
    close($escaped_in);
}

# close() on a pipe reports the exit status of the command.  Output has
# already been emitted, so report a failure but still clean up.
close($parse_fh)
    or warn("WARNING: parser pipeline did not finish cleanly"
            . ($! ? ": $!" : " (wait status $?)") . "\n");

# Remove the temporary files directly instead of spawning a shell for rm.
for my $tmp_file ($tmpSplitPoints, $tmpDeescaped, $tmpEscaped) {
    unlink($tmp_file)
        or warn("WARNING: could not remove '$tmp_file': $!\n");
}
|
|