#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
use strict;
# ( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) )
# ( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed)
# (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the)
# (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP
#(NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and))
# (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again))
# (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a)
# (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope))
#(SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a)
# (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) )
# Convert one line of Moses XML tree markup (<tree label="X"> ... </tree>)
# into Berkeley-parser style bracketing and return the converted line.
#
#   $line   - one input line, including its trailing newline if present
#   returns - the converted line; the trailing newline is left in place
sub _mosesxml_to_berkeley {
    my ($line) = @_;

    # parentheses: literal brackets would clash with the tree bracketing,
    # so they are rewritten before any tree bracket is produced
    $line =~ s/\(/-LRB-/g;                  # tokens
    $line =~ s/\)/-RRB-/g;
    $line =~ s/"LRB"/"-LRB-"/g;             # labels
    $line =~ s/"RRB"/"-RRB-"/g;

    # main: opening tag -> "(LABEL", closing tag -> ")"
    $line =~ s/<tree label="([^"]+)">/($1/g;
    $line =~ s/ *<\/tree>/)/g;
    $line =~ s/^\(TOP/(/;                   # the root node carries no label

    # de-escape
    $line =~ s/&bar;/|/g;                   # factor separator
    $line =~ s/&lt;/</g;                    # xml
    $line =~ s/&gt;/>/g;                    # xml
    $line =~ s/&bra;/[/g;                   # syntax non-terminal (legacy)
    $line =~ s/&ket;/]/g;                   # syntax non-terminal (legacy)
    $line =~ s/&quot;/"/g;                  # xml
    $line =~ s/&apos;/'/g;                  # xml
    $line =~ s/&#91;/[/g;                   # syntax non-terminal
    $line =~ s/&#93;/]/g;                   # syntax non-terminal
    # escape escape: must stay last, so that an escaped entity such as
    # "&amp;lt;" ends up as the literal text "&lt;" and not as "<"
    $line =~ s/&amp;/&/g;

    # cleanup: squeeze runs of spaces, drop a trailing space, and set the
    # final closing bracket off with a space
    $line =~ s/ +/ /g;
    $line =~ s/ $//g;
    $line =~ s/\)$/ )/g;

    return $line;
}

# Read trees from STDIN one per line and print the converted form to STDOUT.
while (my $line = <STDIN>) {
    if ($line =~ /^$/) {
        print "\n";    # parse failures
        next;
    }

    # output
    print _mosesxml_to_berkeley($line);
}
|