| |
| |
| |
| |
|
|
| use warnings; |
| use strict; |
| use File::Basename; |
| use File::Temp qw/tempfile/; |
| use Getopt::Long "GetOptions"; |
|
|
# Default tool locations and run options; each one below can be overridden
# from the command line (see GetOptions).
my $COLLINS = "/exports/home/s0565741/work/bin/COLLINS-PARSER";
my $MXPOST = "/exports/home/s0565741/work/bin/mxpost";
my $TMPDIR = "tmp";     # directory that receives the intermediate chunk files
my $KEEP_TMP = 0;       # when true, the File::Temp placeholder file is not unlinked
my $RAW = undef;        # NOTE(review): --raw is accepted but never read in this file

# NOTE(review): $BASIC is declared but not referenced anywhere else in this file.
my $BASIC = 0;
GetOptions(
    "collins=s" => \$COLLINS,
    "mxpost=s"  => \$MXPOST,
    "tmpdir=s"  => \$TMPDIR,
    "keep-tmp"  => \$KEEP_TMP,
    "raw=s"     => \$RAW
) or die("ERROR: unknown options");

# Create the temporary directory. The list form of system() bypasses the
# shell, so a path containing spaces or shell metacharacters is passed to
# mkdir verbatim, and a failure is reported instead of being ignored.
system("mkdir", "-p", $TMPDIR) == 0
    or die("ERROR: could not create temporary directory '$TMPDIR'");

# Limits enforced by check_length() before a sentence is handed to the parser.
my $MaxChar = 10000;
my $MaxWord = 120;

# Parser binary and model files, all located relative to the Collins parser root.
my $ParserBin = "$COLLINS/code/parser";
my $ParserEvn = "$COLLINS/models/model2/events.gz";
my $ParserGrm = "$COLLINS/models/model2/grammar";

# Reserve a unique file name inside $TMPDIR. The chunk files written later are
# named after it ("$tmpfile.NNNNN" and "$tmpfile.NNNNN.out").
my ($scriptname, $directories) = fileparse($0);
my ($TMP, $tmpfile) = tempfile("$scriptname-XXXXXXXXXX", DIR=>$TMPDIR, UNLINK=>!$KEEP_TMP);
|
|
| |
# Build the shell pipeline that produces POS-tagged text. It has no input
# file, so its first stage reads this script's standard input.
#   stage 1: loads Encode and calls encode()/decode(), but the result is
#            discarded, so each line is printed unchanged.
#            NOTE(review): confirm whether "$_ = encode(...)" was intended.
#   stage 2: deletes every character outside \x20-\x7f (this includes the
#            newline) and re-appends "\n".
#   stage 3: runs the MXPOST tagger.
my $pipeline = "perl -ne 'use Encode; encode(\"iso-8859-1\", decode(\"utf8\", \$_)); print \$_;' |";
$pipeline .= "perl -ne 'tr/\\x20-\\x7f//cd; print \$_.\"\\n\";' | ";
$pipeline .= "$MXPOST/mxpost $MXPOST/tagger.project";

# Read the tagger output and write it, converted to the parser's input format,
# into chunk files of 2000 sentences each: "$tmpfile.00000", "$tmpfile.00001", ...
# The chunk size must match the one used by the parsing loop further down.
open(my $tag_fh, "-|", $pipeline)
    or die("ERROR: cannot start tagging pipeline: $!");
my $sentence_count = 0;
my $parser_in;
while (my $tagged = <$tag_fh>) {
    if ($sentence_count % 2000 == 0) {
        # Start a new chunk file, finishing the previous one first.
        if (defined $parser_in) {
            close($parser_in)
                or die("ERROR: cannot finish writing parser input: $!");
        }
        my $chunk_file = sprintf("%s.%05d", $tmpfile, $sentence_count / 2000);
        open($parser_in, ">", $chunk_file)
            or die("ERROR: cannot write parser input '$chunk_file': $!");
    }
    $sentence_count++;
    # chomp (not chop) so that an unterminated last line keeps its final character.
    chomp($tagged);

    # "word_TAG ..." -> "N word TAG ..."
    my $line = conv_posfmt($tagged);

    # Replace sentences rejected by check_length() with a one-token placeholder,
    # so the parser output stays aligned line by line with the input.
    $line = "1 SentenceTooLong NN" if (!check_length($line));

    print {$parser_in} "$line\n";
}
close($tag_fh)
    or warn("WARNING: tagging pipeline did not finish cleanly (exit status $?)\n");
# With empty input no chunk file was ever opened.
if (defined $parser_in) {
    close($parser_in)
        or die("ERROR: cannot finish writing parser input: $!");
}
|
|
| |
# Parse every chunk file written above. There is one chunk per started block
# of 2000 sentences, hence ceil($sentence_count / 2000) chunks (none at all
# when no sentence was read). The parser reads the gunzipped events file on
# its standard input and its output is redirected to "<chunk>.out".
# NOTE(review): the trailing arguments "10000 1 1 1 1" are passed through
# unchanged; their meaning is defined by the parser, not by this script.
foreach my $chunk_index (0 .. int(($sentence_count + 1999) / 2000) - 1) {
    my $chunk_file = sprintf("%s.%05d", $tmpfile, $chunk_index);
    my $parse_command = "gunzip -c $ParserEvn | $ParserBin $chunk_file $ParserGrm 10000 1 1 1 1 > $chunk_file.out";
    qx{$parse_command};
}
|
|
| |
# Set $DEBUG to a true value to trace the tree conversion on STDERR.
my $DEBUG = 0;
my $DEBUG_SPACE = " ";

# Stream the parser output of all chunks (the shell expands the ????? glob)
# and print one converted line per parse tree on STDOUT.
open(my $parser_out, "-|", "cat $tmpfile.?????.out")
    or die("ERROR: cannot read parser output $tmpfile.?????.out: $!");
while (my $line = <$parser_out>) {
    # Only lines that start with "(" carry a parse tree.
    next unless $line =~ /^\(/;
    # Placeholder sentences produce an empty output line.
    if ($line =~ /SentenceTooLong/) {
        print "\n";
        next;
    }
    # chomp (not chop) so an unterminated last line keeps its closing bracket.
    chomp($line);
    print join(" ", _tree_line_to_xml($line)), "\n";
}
close($parser_out);

# Convert one bracketed parse tree line into a list of markup fragments:
# '<tree label="X">' for every opening bracket, '</tree>' for every closing
# bracket and '<tree label="POS"> word </tree>' for every word/POS token.
# Dies if the brackets of the line are not balanced.
sub _tree_line_to_xml {
    my ($line) = @_;
    my @LABEL = ();     # stack of labels whose bracket is still open
    my @OUT = ();
    for (my $i = 0; $i < length($line); $i++) {
        my $char = substr($line, $i, 1);

        if ($char eq "(") {
            # The label runs up to the next "(" or space.
            my ($label) = split(/[\( ]/, substr($line, $i + 1));
            $label = "" unless defined $label;
            print STDERR substr($DEBUG_SPACE,0,scalar @LABEL)."BEGINNING of $label\n" if $DEBUG;
            # Advance by the length of the label as it appears in the input,
            # before it is rewritten below.
            $i += length($label);
            $label =~ s/\$/PUNC/g;
            $label =~ s/\|/:/g;
            $label =~ s/\~.+//;     # drop everything from the first "~" on
            push @OUT, "<tree label=\"$label\">";
            push @LABEL, $label;
            # Skip up to two spaces after the label.
            $i++ if substr($line, $i + 1, 1) eq " ";
            $i++ if substr($line, $i + 1, 1) eq " ";
        }
        elsif ($char eq ")") {
            die("ERROR: NO LABEL ON STACK") unless @LABEL;
            my $label = pop @LABEL;
            print STDERR substr($DEBUG_SPACE,0,scalar @LABEL)."END of $label\n" if $DEBUG;
            push @OUT, "</tree>";
            $i++ if substr($line, $i + 1, 1) eq " ";
        }
        else {
            # A token runs up to the next space.
            my ($word) = split(/ /, substr($line, $i));
            $word = "" unless defined $word;
            if (substr($line, $i, 2) eq "\\)") {
                $word = substr($line, $i, 2);
            }
            $i += length($word);
            print STDERR substr($DEBUG_SPACE,0,scalar @LABEL)."WORD $word\n" if $DEBUG;

            # Split "word/POS" at the last slash. A token without that shape
            # still yields an (empty) element, as before, but is now reported
            # once instead of triggering uninitialized-value warnings.
            my ($w, $p) = ("", "");
            if ($word =~ /^(.+)\/([^\/]+)$/) {
                ($w, $p) = ($1, $2);
            }
            else {
                warn("WARNING: token '$word' is not in word/POS form\n");
            }
            # Undo the bracket replacement done by conv_posfmt().
            $w = "(" if $w eq "-LRB-";
            $w = ")" if $w eq "-RRB-";
            $w = escape($w);
            $p =~ s/^-//;
            $p =~ s/-$//;
            push @OUT, "<tree label=\"$p\"> $w </tree>";
        }
    }
    die("ERROR: STACK NOT EMPTY $#LABEL\n") if @LABEL;
    return @OUT;
}
|
|
# Escape the characters that are special in XML so that a word can be placed
# inside the <tree> markup without breaking it.
#   $text   - the raw word
# Returns the word with "&", "<" and ">" replaced by their entities.
sub escape {
    my ($text) = @_;
    # "&" has to be handled first; otherwise the ampersands introduced by the
    # two replacements below would be escaped a second time.
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}
|
|
# Decide whether a converted sentence ("N word TAG word TAG ...") may be sent
# to the parser.
#   $line   - sentence in the format produced by conv_posfmt()
# Returns a false value when the sentence is empty, consists of a number
# followed only by non-alphanumeric characters, or exceeds the $MaxChar /
# $MaxWord limits; returns a true value otherwise.
sub check_length {
    my ($line) = @_;

    # Empty sentences and sentences without any letter or digit after the
    # leading count are rejected outright.
    if ($line eq "0" or $line eq "0 " or $line =~ /^\d+ [^a-z0-9]+$/i) {
        return 0;
    }

    my $char_total = length($line);
    my @fields = split(" ", $line);
    # Every word contributes two fields (word and tag); the leading count is
    # included in the field total as well.
    my $word_total = scalar(@fields) / 2;

    return ($char_total <= $MaxChar) && ($word_total <= $MaxWord);
}
|
|
# Convert one line of "word_TAG word_TAG ..." tokens into "N word TAG word TAG ...",
# where N is the number of tokens on the line.
#   $line   - whitespace-separated tokens, each split at its LAST underscore
# Returns the converted line with every "(" replaced by "-LRB-" and every ")"
# by "-RRB-". An empty input line yields "0 ".
sub conv_posfmt {
    my ($line) = @_;

    my @fields;
    my $token_total = 0;
    foreach my $token (split(" ", $line)) {
        my $cut = rindex($token, "_");
        # Text before the last underscore is the word, text after it the tag.
        push @fields, substr($token, 0, $cut), substr($token, $cut + 1);
        $token_total++;
    }
    my $converted = $token_total . " " . join(" ", @fields);

    # Round brackets are replaced by the -LRB- / -RRB- tokens.
    my %bracket_token = ("(" => "-LRB-", ")" => "-RRB-");
    $converted =~ s/([()])/$bracket_token{$1}/g;

    return $converted;
}
|
|