File size: 1,667 Bytes
ee0c4e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

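# Examples of the bracketed trees this script prints (the second tree is
# wrapped over several comment lines here):
# ( (NP (NP (NN resumption)) (PP (IN of) (NP (DT the) (NN session)))) )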
# ( (S (@S (@S (@S (S (NP (PRP I)) (VP (VB declare) (VP (@VP (VBD resumed)
# (NP (@NP (NP (DT the) (NN session)) (PP (IN of) (NP (@NP (DT the)
# (NNP European)) (NNP Parliament)))) (VP (VBN adjourned) (PP (IN on) (NP
#(NNP Friday) (CD 17)))))) (NP (NNP December) (CD 1999))))) (, ,)) (CC and))
# (S (NP (PRP I)) (VP (MD would) (VP (VB like) (S (ADVP (RB once) (RB again))
# (VP (TO to) (VP (@VP (VB wish) (NP (PRP you))) (NP (NP (@NP (@NP (DT a)
# (JJ happy)) (JJ new)) (NN year)) (PP (IN in) (NP (@NP (DT the) (NN hope))
#(SBAR (IN that) (S (NP (PRP you)) (VP (VBD enjoyed) (NP (@NP (@NP (DT a)
# (JJ pleasant)) (JJ festive)) (NN period))))))))))))))) (. .)) )

# Filter: read one XML-style tree (<tree label="..."> ... </tree>) per line
# from STDIN and print the same tree in bracketed form on STDOUT.
while (my $line = <STDIN>) {
  # An empty input line stands for a parse failure; echo it as an empty line
  # so input and output stay aligned line by line.
  if ($line =~ /^$/) {
    print "\n";
    next;
  }

  # Literal parentheses must be rewritten BEFORE the tags are turned into
  # brackets below, otherwise they would be indistinguishable from structure.
  $line =~ s/\(/\-LRB\-/g;              # "(" occurring as a token
  $line =~ s/\)/\-RRB\-/g;              # ")" occurring as a token
  $line =~ s/\"LRB\"/\"\-LRB\-\"/g;     # quoted label LRB gets the dashes too
  $line =~ s/\"RRB\"/\"\-RRB\-\"/g;     # quoted label RRB gets the dashes too

  # Structural conversion: an opening tag becomes "(LABEL", a closing tag
  # (with any spaces in front of it) becomes ")", and a leading TOP label is
  # dropped so the line starts with a bare "(".
  $line =~ s/<tree label=\"([^\"]+)\">/\($1/g;
  $line =~ s/ *<\/tree>/\)/g;
  $line =~ s/^\(TOP/\(/;

  # Undo the entity escaping. "&amp;" is handled last so that an escaped
  # ampersand cannot combine with following text into another entity.
  $line =~ s/\&bar;/\|/g;    # factor separator
  $line =~ s/\&lt;/\</g;     # xml
  $line =~ s/\&gt;/\>/g;     # xml
  $line =~ s/\&bra;/\[/g;    # syntax non-terminal (legacy)
  $line =~ s/\&ket;/\]/g;    # syntax non-terminal (legacy)
  $line =~ s/\&quot;/\"/g;   # xml
  $line =~ s/\&apos;/\'/g;   # xml
  $line =~ s/\&#91;/\[/g;    # syntax non-terminal
  $line =~ s/\&#93;/\]/g;    # syntax non-terminal
  $line =~ s/\&amp;/\&/g;    # escape escape

  # Whitespace normalisation: squeeze runs of spaces, strip a trailing
  # space, then put a single space before the final closing bracket.
  $line =~ s/ +/ /g;
  $line =~ s/ $//g;
  $line =~ s/\)$/ \)/g;

  # Emit the converted tree (the line still carries its newline).
  print $line;
}