#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
#
# Reads trees written as XML markup (<tree label="..."> ... </tree>) from
# STDIN, one per line, and prints each of them to STDOUT in bracketed form.
# Empty input lines (parse failures) are passed through as empty lines.

use warnings;
use strict;

# Example trees in the bracketed format:
#
# ( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) )
#
# ( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed)
#   (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the)
#   (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP
#   (NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and))
#   (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again))
#   (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a)
#   (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope))
#   (SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a)
#   (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) )

# convert_line($line)
#
# Converts one line of XML tree markup into its bracketed equivalent.
#
#   $line   - one input line, normally still carrying its trailing newline.
#   returns - the converted line; "\n" when the input line is empty.
sub convert_line {
    my ($line) = @_;

    # An empty line marks a parse failure; keep the line so that input and
    # output stay aligned line by line.
    return "\n" if $line =~ /^$/;

    # Parentheses: literal brackets would clash with the tree brackets that
    # are about to be generated, so rewrite them first.
    $line =~ s/\(/-LRB-/g;          # tokens
    $line =~ s/\)/-RRB-/g;
    $line =~ s/"LRB"/"-LRB-"/g;     # labels
    $line =~ s/"RRB"/"-RRB-"/g;

    # Main conversion: opening tag -> "(LABEL", closing tag -> ")", and the
    # TOP label is dropped from the outermost bracket.
    $line =~ s/<tree label="([^"]+)">/($1/g;
    $line =~ s{ *</tree>}{)}g;
    $line =~ s/^\(TOP/(/;

    # De-escape entities.  "&amp;" must stay last: decoding it earlier would
    # turn an escaped "&amp;lt;" into "<" instead of "&lt;".
    $line =~ s/&bar;/|/g;           # factor separator
    $line =~ s/&lt;/</g;            # xml
    $line =~ s/&gt;/>/g;            # xml
    $line =~ s/&bra;/[/g;           # syntax non-terminal (legacy)
    $line =~ s/&ket;/]/g;           # syntax non-terminal (legacy)
    $line =~ s/&quot;/"/g;          # xml
    $line =~ s/&apos;/'/g;          # xml
    $line =~ s/&#91;/[/g;           # syntax non-terminal
    $line =~ s/&#93;/]/g;           # syntax non-terminal
    $line =~ s/&amp;/&/g;           # escape escape

    # Cleanup: collapse runs of spaces, drop a trailing space, and separate
    # the final closing bracket from the rest of the tree with a space.
    $line =~ s/ {2,}/ /g;
    $line =~ s/ $//;
    $line =~ s/\)$/ )/;

    return $line;
}

# Filters STDIN to STDOUT, converting one line at a time.
sub main {
    while (my $line = <STDIN>) {
        print convert_line($line);
    }
    return;
}

# Run the filter only when executed as a script, so that convert_line can be
# loaded (via require/do) without consuming STDIN.
main() unless caller;