File size: 3,801 Bytes
fd49381
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#include "ForestParser.h"

#include <istream>
#include <string>

#include <boost/make_shared.hpp>

#include "util/tokenize_piece.hh"

#include "syntax-common/exception.h"

namespace MosesTraining
{
namespace Syntax
{
namespace PostprocessEgretForests
{

// Constructs a parser with no input stream attached.  Because m_input is
// null, operator++ is a no-op on such an object and it compares equal to any
// other parser whose stream pointer is also null.
ForestParser::ForestParser() : m_input(0) {}

// Attaches the parser to `input` and immediately advances once, so that the
// first forest entry (if any) has been read by the time construction
// completes.  The stream is held by pointer; it is not copied.
ForestParser::ForestParser(std::istream &input)
  : m_input(&input)
{
  // Load the first entry straight away.
  this->operator++();
}

// Advances the parser to the next forest entry of the input stream.
//
// An entry is read as: one sentence-number line, one sentence line, then
// hyperedge lines terminated by a blank line.  If the first line after the
// sentence is blank, the entry is treated as a parse failure (empty forest)
// and one further blank line is consumed.
//
// If no further entry can be started, the stream pointer is reset to null
// and subsequent calls are no-ops.  Input that ends in the middle of an entry
// (for example, a missing terminating blank line) ends the entry at that
// point instead of re-parsing stale buffer contents.
//
// Returns *this.  Exceptions thrown by ParseSentenceNumLine and
// ParseHyperedgeLine propagate to the caller.
ForestParser &ForestParser::operator++()
{
  if (!m_input) {
    // No stream attached (default-constructed or already exhausted).
    return *this;
  }
  // Drop the vertices collected for the previous entry.
  m_vertexSet.clear();
  m_entry.forest.vertices.clear();
  if (!std::getline(*m_input, m_tmpLine)) {
    m_input = 0;
    return *this;
  }
  // The first line contains the sentence number.
  ParseSentenceNumLine(m_tmpLine, m_entry.sentNum);
  // The second line contains the sentence string.
  if (!std::getline(*m_input, m_entry.sentence)) {
    // A failed getline can leave its target untouched; make sure the
    // previous entry's sentence is not reported for this one.
    m_entry.sentence.clear();
  }
  // Subsequent lines contain hyperedges -- or a blank line if there was a
  // parse failure -- terminated by a blank line.
  if (!std::getline(*m_input, m_tmpLine)) {
    // Input ended before the hyperedge section.  m_tmpLine may still hold
    // the sentence-number line, so it must not be parsed as a hyperedge.
    return *this;
  }
  if (m_tmpLine.empty()) {  // Parse failure
    std::getline(*m_input, m_tmpLine);
    assert(m_tmpLine == "");
    return *this;
  }
  // Stop at the blank terminator line, or at end of input if the terminator
  // is missing (otherwise the last line read would be parsed forever).
  do {
    ParseHyperedgeLine(m_tmpLine, m_entry.forest);
  } while (std::getline(*m_input, m_tmpLine) && !m_tmpLine.empty());
  return *this;
}

// Inserts `v` into the parser's vertex set and returns the vertex that the
// set holds for it.  When the insertion actually takes place (the set did not
// already contain a matching vertex), the vertex is also appended to the
// current forest's vertex list; otherwise the forest is left unchanged and
// the previously stored vertex is returned.
boost::shared_ptr<Forest::Vertex> ForestParser::AddVertex(const VertexSP &v)
{
  const std::pair<VertexSet::iterator, bool> result = m_vertexSet.insert(v);
  const VertexSP &stored = *result.first;
  const bool isNew = result.second;
  if (isNew) {
    m_entry.forest.vertices.push_back(stored);
  }
  return stored;
}

// Parses the sentence-number line that opens a forest entry.
//
// The line is split on spaces and tabs (empty tokens are skipped).  The first
// token must be the literal word "sentence"; the second token is converted
// with std::atoi and stored in `sentNum`.  Any further tokens are ignored.
//
// Throws Exception if the line does not start with "sentence" or if no
// second token is present.
void ForestParser::ParseSentenceNumLine(const std::string &line,
                                        std::size_t &sentNum)
{
  const util::AnyCharacter delimiter(" \t");
  util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
  if (!p || *p != "sentence") {
    throw Exception(
      "malformed sentence number line: expected line to begin with "
      "'sentence'");
  }
  ++p;
  if (!p) {
    // Without this check an absent number would silently be read as 0.
    throw Exception(
      "malformed sentence number line: missing sentence number");
  }
  std::string tmp;
  p->CopyToString(&tmp);
  sentNum = std::atoi(tmp.c_str());
}

// Parses one hyperedge line of the form
//
//   HEAD => TAIL_1 ... TAIL_n ||| WEIGHT
//
// (tokens separated by spaces or tabs) and attaches the resulting hyperedge
// to its head vertex.  Head and tail vertices are registered through
// AddVertex, so vertices are shared between hyperedges of the same entry.
// Tail vertices parsed without a span (start == -1) are given the head's
// start position for both start and end.
//
// The `forest` parameter is not used directly; vertices are added to the
// forest of the current entry via AddVertex.
//
// Throws Exception if the line has no tokens, if "=>" does not follow the
// head, if the "|||" separator is missing, or if no weight follows it.
void ForestParser::ParseHyperedgeLine(const std::string &line, Forest &forest)
{
  const util::AnyCharacter delimiter(" \t");
  util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
  if (!p) {
    throw Exception("malformed hyperedge line: no tokens found");
  }
  VertexSP v = AddVertex(ParseVertex(*p));
  HyperedgeSP e = boost::make_shared<Forest::Hyperedge>();
  e->head = v.get();
  ++p;
  if (!p || *p != "=>") {
    throw Exception(
      "malformed hyperedge line: expected '=>' after head vertex");
  }
  // Also stop when the tokens run out: an exhausted iterator never compares
  // equal to "|||", so without the check a missing separator loops forever.
  for (++p; p && *p != "|||"; ++p) {
    v = ParseVertex(*p);
    if (v->start == -1) {
      // Egret does not give start/end for terminals.
      v->start = v->end = e->head->start;
    }
    e->tail.push_back(AddVertex(v).get());
  }
  if (!p) {
    throw Exception(
      "malformed hyperedge line: missing '|||' separator before weight");
  }
  ++p;
  if (!p) {
    throw Exception("malformed hyperedge line: missing weight after '|||'");
  }
  std::string tmp;
  p->CopyToString(&tmp);
  e->weight = std::atof(tmp.c_str());
  e->head->incoming.push_back(e);
}

// Builds a vertex from a single token of a hyperedge line.
//
// A token of the form SYMBOL[START,END] yields a non-terminal vertex with
// the given symbol and span; START and END are converted with std::atoi.
// If the symbol part ends in "^g" (and has at least one character before
// it), that suffix is dropped from the symbol.
//
// Any other token -- including one that contains '[' but lacks the ','
// or the closing ']' of a span -- yields a terminal vertex whose symbol is
// the whole token and whose start and end are both -1.  Previously such a
// token was parsed as a non-terminal with wrapped-around substring offsets,
// giving meaningless span values.
boost::shared_ptr<Forest::Vertex> ForestParser::ParseVertex(
  const StringPiece &s)
{
  VertexSP v = boost::make_shared<Forest::Vertex>();
  const std::size_t open = s.rfind('[');
  // Position of the ',' separating START and END, or npos if the token does
  // not carry a well-formed span.
  std::size_t comma = std::string::npos;
  if (open != std::string::npos && s[s.size()-1] == ']') {
    // As before, the search begins one character past the start of the
    // span, so START must be at least one character long.
    comma = s.find(',', open+2);
  }
  if (comma == std::string::npos) {
    s.CopyToString(&v->symbol.value);
    v->symbol.isNonTerminal = false;
    v->start = v->end = -1;
    return v;
  }
  if (open > 2 && s[open-2] == '^' && s[open-1] == 'g') {
    s.substr(0, open-2).CopyToString(&v->symbol.value);
  } else {
    s.substr(0, open).CopyToString(&v->symbol.value);
  }
  v->symbol.isNonTerminal = true;
  const std::size_t begin = open + 1;
  std::string tmp;
  s.substr(begin, comma-begin).CopyToString(&tmp);
  v->start = std::atoi(tmp.c_str());
  // The final character is the closing ']', hence size - comma - 2.
  s.substr(comma+1, s.size()-comma-2).CopyToString(&tmp);
  v->end = std::atoi(tmp.c_str());
  return v;
}

// Two parsers are considered equal when they hold the same stream pointer.
// In particular, every parser whose stream pointer is null (never attached,
// or reset by operator++ once input ran out) compares equal to every other.
bool operator==(const ForestParser &lhs, const ForestParser &rhs)
{
  // TODO Is this right?  Compare values of istreams if non-zero?
  const bool sameStream = (lhs.m_input == rhs.m_input);
  return sameStream;
}

// Inequality is defined as the negation of operator== for ForestParser.
bool operator!=(const ForestParser &lhs, const ForestParser &rhs)
{
  const bool equal = (lhs == rhs);
  return !equal;
}

}  // namespace PostprocessEgretForests
}  // namespace Syntax
}  // namespace MosesTraining