| |
| |
| |
| |
|
|
| use warnings; |
| use strict; |
| use File::Temp qw/tempfile/; |
| use Getopt::Long "GetOptions"; |
| use File::Basename; |
| use FindBin qw($RealBin); |
| use Cwd 'abs_path'; |
|
|
sub GetFactors;

# ---------------------------------------------------------------------------
# Command-line options
#   --tmpdir DIR     parent directory for the working files (default "tmp")
#   --keep-tmp       sets $KEEP_TMP to a true value
#   --mada-dir DIR   directory holding MADAMIRA.jar (required)
#   --factors LIST   comma-separated factor names to append to every word
#   --config FILE    MADAMIRA configuration file; defaults to
#                    samples/sampleConfigFile.xml below --mada-dir
# ---------------------------------------------------------------------------
my $TMPDIR   = "tmp";
my $KEEP_TMP = 0;
my $MADA_DIR;
my $CONFIG;

my $FACTORS_STR;
my @FACTORS;

GetOptions(
    "tmpdir=s"   => \$TMPDIR,
    "keep-tmp"   => \$KEEP_TMP,
    "mada-dir=s" => \$MADA_DIR,
    "factors=s"  => \$FACTORS_STR,
    "config=s"   => \$CONFIG
) or die("ERROR: unknown options");

# $MADA_DIR is interpolated into the default config path and into the shell
# command that launches MADAMIRA, so it must be present before going further.
if (!defined($MADA_DIR) || $MADA_DIR eq "") {
    die("ERROR: --mada-dir is required\n");
}

# The MADAMIRA command changes into $MADA_DIR before it uses these paths, so
# relative paths have to be made absolute here or they would be resolved
# against the wrong directory.
my $absMadaDir = abs_path($MADA_DIR);
if (!defined($absMadaDir) || !-d $absMadaDir) {
    die("ERROR: --mada-dir '$MADA_DIR' is not an accessible directory\n");
}
$MADA_DIR = $absMadaDir;

if (!defined($CONFIG)) {
    $CONFIG = "$MADA_DIR/samples/sampleConfigFile.xml";
}
else {
    # Keep the user's spelling if the path cannot be resolved; the failure
    # will then surface when MADAMIRA tries to read the file.
    my $absConfig = abs_path($CONFIG);
    $CONFIG = $absConfig if (defined($absConfig));
}

# An unresolved tmpdir would otherwise be interpolated as an empty string and
# place the working directory at the filesystem root.
my $absTmpDir = abs_path($TMPDIR);
if (!defined($absTmpDir)) {
    die("ERROR: cannot resolve --tmpdir '$TMPDIR'\n");
}
$TMPDIR = $absTmpDir;
print STDERR "TMPDIR=$TMPDIR \n";

if (defined($FACTORS_STR)) {
    @FACTORS = split(",", $FACTORS_STR);
}
|
|
| |
| |
|
|
# Work inside a sub-directory named after the process id so that several runs
# sharing one --tmpdir do not overwrite each other's files.
$TMPDIR = "$TMPDIR/madamira.$$";
foreach my $dir ($TMPDIR, "$TMPDIR/split", "$TMPDIR/out") {
    # List-form system() keeps the shell away from the directory name.
    system("mkdir", "-p", $dir) == 0
        or die("ERROR: cannot create directory $dir\n");
}

my $infile = "$TMPDIR/input";
print STDERR $infile."\n";

# Copy STDIN to a file so it can be split into chunks.
open(my $inputFh, ">", $infile)
    or die("ERROR: cannot open $infile for writing: $!\n");
while (my $inputLine = <STDIN>) {
    print {$inputFh} $inputLine
        or die("ERROR: cannot write to $infile: $!\n");
}
# Buffered write errors only show up at close time.
close($inputFh)
    or die("ERROR: cannot close $infile: $!\n");

# Runs a shell command and reports a non-zero exit status on STDERR.
# Backticks are used on purpose: they capture the child's stdout, so nothing
# the child prints can end up in this script's own STDOUT.
# A failure is reported but does not abort the run.
my $runShell = sub {
    my ($shellCmd) = @_;
    `$shellCmd`;
    if ($? != 0) {
        warn("WARNING: command failed (wait status $?): $shellCmd\n");
    }
};

my $cmd;

# Use gsplit when it is available, otherwise fall back to plain split.
my $SPLIT_EXEC = `gsplit --help 2>/dev/null`;
if ($SPLIT_EXEC) {
    $SPLIT_EXEC = 'gsplit';
}
else {
    $SPLIT_EXEC = 'split';
}

# Cut the input into chunks of 10000 lines with 7-digit numeric suffixes.
$cmd = "$SPLIT_EXEC -l 10000 -a 7 -d $TMPDIR/input $TMPDIR/split/x";
$runShell->($cmd);

# Analyse the chunks, four java processes at a time.
$cmd = "cd $MADA_DIR && parallel --jobs 4 java -Xmx2500m -Xms2500m -XX:NewRatio=3 -jar $MADA_DIR/MADAMIRA.jar -rawinput {} -rawoutdir $TMPDIR/out -rawconfig $CONFIG ::: $TMPDIR/split/x*";
print STDERR "Executing: $cmd\n";
$runShell->($cmd);

# Join the per-chunk results; the shell expands the glob in sorted order, and
# the fixed-width numeric suffixes make that the original chunk order.
$cmd = "cat $TMPDIR/out/x*.mada > $infile.mada";
print STDERR "Executing: $cmd\n";
$runShell->($cmd);
|
|
| |
# Turn the concatenated analysis file into one output line per sentence,
# each word followed by the factors requested with --factors.
open(my $madaFh, "<", "$infile.mada")
    or die("ERROR: cannot open $infile.mada for reading: $!\n");

while (my $line = <$madaFh>) {
    chomp($line);

    if (index($line, "SENTENCE BREAK") == 0) {
        # End of a sentence: finish the current output line.
        print "\n";
    }
    elsif (index($line, ";;WORD") == 0) {
        # The word starts at offset 7; the length argument also drops the
        # final character of the line.
        # NOTE(review): presumably a trailing separator in the .mada format -
        # confirm against real MADAMIRA output.
        my $word = substr($line, 7, length($line) - 8);

        # The fourth line after the ";;WORD" line is the one whose key:value
        # tokens are handed to GetFactors.
        for (my $i = 0; $i < 4; ++$i) {
            $line = <$madaFh>;
        }

        # The file may end inside a word entry; carry on with an empty
        # analysis instead of passing undef downstream.
        if (!defined($line)) {
            warn("WARNING: $infile.mada ended inside the entry for '$word'\n");
            $line = "";
        }

        my $factors = GetFactors($line, \@FACTORS);
        $word .= $factors;

        print "$word ";
    }
    else {
        # Any other line is ignored.
    }
}
close($madaFh);
|
|
|
|
# Remove the per-run working directory unless --keep-tmp was given.
if ($KEEP_TMP == 0) {
    # Only ever delete a directory that carries the "madamira.<pid>" name this
    # script gives its working directory, so a bad $TMPDIR cannot cause
    # anything else to be removed.
    if (defined($TMPDIR) && $TMPDIR =~ m{/madamira\.\d+\z} && -d $TMPDIR) {
        # List-form system() keeps the shell away from the path.
        system("rm", "-rf", "--", $TMPDIR) == 0
            or warn("WARNING: could not remove $TMPDIR\n");
    }
}
|
|
|
|
| |
# GetFactors($line, \@factors)
#
# Builds the factor suffix that is appended to a word.
#
#   $line     - whitespace-separated tokens; every token after the first is
#               read as "key:value". The first token is skipped.
#               undef is treated as an empty line.
#   \@factors - reference to the list of keys to extract, in output order.
#
# Returns a string holding "|value" for each requested key, in the order
# given. A key that is absent from the line contributes a bare "|", so the
# number of factors per word stays constant. Returns "" when no keys are
# requested.
sub GetFactors
{
    my ($line, $factorsRef) = @_;
    my @factors = defined($factorsRef) ? @{$factorsRef} : ();

    $line = "" unless (defined($line));

    # Collect every key:value pair of the line; a repeated key keeps its last
    # value.
    my %allFactors;
    my @toks = split(" ", $line);
    shift(@toks);    # the first token is not a key:value pair
    foreach my $tok (@toks) {
        # Limit of 2: split on the first colon only, so a value that itself
        # contains ":" is kept whole instead of being cut short.
        my ($key, $value) = split(":", $tok, 2);
        next unless (defined($key) && length($key));
        $allFactors{$key} = $value;
    }

    my $ret = "";
    foreach my $factorType (@factors) {
        my $value = $allFactors{$factorType};
        # An unknown key yields an empty factor rather than an
        # "uninitialized value" warning.
        $value = "" unless (defined($value));
        $ret .= "|$value";
    }

    return $ret;
}
|
|
|
|