File size: 3,801 Bytes
fd49381 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
#include "ForestParser.h"

#include <cassert>
#include <cstdlib>
#include <istream>
#include <string>

#include <boost/make_shared.hpp>

#include "util/tokenize_piece.hh"

#include "syntax-common/exception.h"
namespace MosesTraining
{
namespace Syntax
{
namespace PostprocessEgretForests
{
// Constructs a parser that has no input stream attached (m_input is null).
// This is the same state a stream-backed parser ends up in once its input is
// exhausted, and operator++ does nothing on a parser in this state.
ForestParser::ForestParser() : m_input(0) {}
// Constructs a parser that reads from `input` and immediately advances to the
// first entry, so the parser is positioned on an entry (or is already
// exhausted) as soon as construction finishes.
//
// Only the address of `input` is stored, so the stream has to stay alive for
// as long as the parser is advanced.
ForestParser::ForestParser(std::istream &input) : m_input(&input)
{
  operator++();
}
// Advances the parser to the next forest entry in the input stream.
//
// An entry is read as:
//   line 1    sentence number line (see ParseSentenceNumLine)
//   line 2    the sentence string
//   line 3..  hyperedge lines (see ParseHyperedgeLine), or a single blank
//             line if there was a parse failure
//   last      a terminating blank line
//
// When no further entry can be read, m_input is reset to 0; calling this on
// a parser whose m_input is already 0 is a no-op.  Exceptions thrown by the
// line parsers propagate to the caller.
//
// Every std::getline result is checked: a failed read can leave the target
// string holding its previous contents, so relying on the string alone could
// re-parse a stale line or loop forever on input that ends without the
// terminating blank line.
ForestParser &ForestParser::operator++()
{
  if (!m_input) {
    return *this;
  }
  m_vertexSet.clear();
  m_entry.forest.vertices.clear();
  if (!std::getline(*m_input, m_tmpLine)) {
    m_input = 0;
    return *this;
  }
  // The first line contains the sentence number.
  ParseSentenceNumLine(m_tmpLine, m_entry.sentNum);
  // The second line contains the sentence string.  On a truncated entry make
  // sure the previous entry's sentence is not left behind.
  if (!std::getline(*m_input, m_entry.sentence)) {
    m_entry.sentence.clear();
  }
  // Subsequent lines contain hyperedges -- or a blank line if there was a
  // parse failure -- terminated by a blank line.  A failed read is treated
  // like a blank line so that stale buffer contents are never parsed.
  if (!std::getline(*m_input, m_tmpLine)) {
    m_tmpLine.clear();
  }
  if (m_tmpLine.empty()) {  // Parse failure
    std::getline(*m_input, m_tmpLine);
    assert(m_tmpLine == "");
    return *this;
  }
  // m_tmpLine holds the first hyperedge line here.  Stop at the terminating
  // blank line, or at end of input if that blank line is missing.
  do {
    ParseHyperedgeLine(m_tmpLine, m_entry.forest);
  } while (std::getline(*m_input, m_tmpLine) && !m_tmpLine.empty());
  return *this;
}
// Registers `v` with the current entry and returns the vertex stored in
// m_vertexSet for it.
//
// If the set did not already hold an equivalent vertex (by whatever
// equivalence m_vertexSet uses), `v` is inserted and also appended to
// m_entry.forest.vertices.  Otherwise the previously stored vertex is
// returned and `v` is not added anywhere.
boost::shared_ptr<Forest::Vertex> ForestParser::AddVertex(const VertexSP &v)
{
  const std::pair<VertexSet::iterator, bool> inserted = m_vertexSet.insert(v);
  const VertexSP &stored = *inserted.first;
  const bool isNew = inserted.second;
  if (isNew) {
    m_entry.forest.vertices.push_back(stored);
  }
  return stored;
}
// Parses the first line of an entry, which must start with the token
// "sentence" followed by the sentence number.  Tokens are separated by
// spaces and/or tabs; any tokens after the number are ignored.
//
// line:    the raw line read from the input.
// sentNum: receives the parsed sentence number.
//
// Throws Exception if the line does not start with "sentence", if the number
// token is missing, or if that token does not begin with a decimal number.
void ForestParser::ParseSentenceNumLine(const std::string &line,
                                        std::size_t &sentNum)
{
  const util::AnyCharacter delimiter(" \t");
  util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
  if (!p || *p != "sentence") {
    throw Exception("malformed sentence number line (expected it to start "
                    "with \"sentence\"): " + line);
  }
  ++p;
  if (!p) {
    throw Exception("malformed sentence number line (missing sentence "
                    "number): " + line);
  }
  std::string tmp;
  p->CopyToString(&tmp);
  // strtoul reports how much of the token it consumed, which lets a token
  // with no digits be told apart from a genuine sentence number of 0.
  char *end = 0;
  const unsigned long value = std::strtoul(tmp.c_str(), &end, 10);
  if (end == tmp.c_str()) {
    throw Exception("malformed sentence number line (sentence number is not "
                    "numeric): " + line);
  }
  sentNum = static_cast<std::size_t>(value);
}
// Parses one hyperedge line of the form
//
//   HEAD => TAIL_1 ... TAIL_n ||| WEIGHT
//
// with tokens separated by spaces and/or tabs.  The head and tail vertices
// are registered through AddVertex (so equivalent vertices are shared within
// the current entry) and the new hyperedge is appended to the head vertex's
// incoming list.
//
// line:   the raw hyperedge line.
// forest: not referenced by this function; vertices are recorded in
//         m_entry.forest through AddVertex.
//
// Throws Exception if the head vertex, the "=>" separator, the "|||"
// separator or the weight is missing.  Vertices added before the error was
// detected remain registered.
void ForestParser::ParseHyperedgeLine(const std::string &line, Forest &forest)
{
  const util::AnyCharacter delimiter(" \t");
  util::TokenIter<util::AnyCharacter, true> p(line, delimiter);
  if (!p) {
    throw Exception("malformed hyperedge line (missing head vertex): " + line);
  }
  VertexSP v = AddVertex(ParseVertex(*p));
  HyperedgeSP e = boost::make_shared<Forest::Hyperedge>();
  e->head = v.get();
  ++p;
  if (!p || *p != "=>") {
    throw Exception("malformed hyperedge line (expected \"=>\" after head "
                    "vertex): " + line);
  }
  // The iterator must be tested for exhaustion as well as for "|||": an
  // exhausted iterator dereferences to an empty piece, which never equals
  // "|||", so without the test a line lacking the separator loops forever.
  for (++p; p && *p != "|||"; ++p) {
    v = ParseVertex(*p);
    if (v->start == -1) {
      // Egret does not give start/end for terminals.
      v->start = v->end = e->head->start;
    }
    e->tail.push_back(AddVertex(v).get());
  }
  if (!p) {
    throw Exception("malformed hyperedge line (missing \"|||\" before "
                    "weight): " + line);
  }
  ++p;
  if (!p) {
    throw Exception("malformed hyperedge line (missing weight after "
                    "\"|||\"): " + line);
  }
  std::string tmp;
  p->CopyToString(&tmp);
  e->weight = std::atof(tmp.c_str());
  e->head->incoming.push_back(e);
}
// Builds a new vertex from a single token of a hyperedge line.
//
// A token with no '[' becomes a terminal: the whole token is the symbol and
// start/end are both set to -1.
//
// Any token containing '[' becomes a non-terminal.  The symbol is the text
// before the last '[', minus a trailing "^g" when at least one character
// precedes it.  The text after the '[' is split at the first ',' found from
// the second character onwards; the two parts are converted with atoi into
// start and end, with the token's final character (presumably ']') excluded
// from the end part.
//
// NOTE(review): the span is not validated.  A token that contains '[' but no
// ',' is still treated as a non-terminal and gets whatever atoi makes of the
// clamped substrings.
boost::shared_ptr<Forest::Vertex> ForestParser::ParseVertex(
  const StringPiece &s)
{
  VertexSP vertex = boost::make_shared<Forest::Vertex>();
  const std::size_t bracket = s.rfind('[');

  if (bracket != std::string::npos) {
    // Non-terminal: label first, then the "start,end" span.
    const bool hasCaretG =
      bracket > 2 && s[bracket-2] == '^' && s[bracket-1] == 'g';
    const std::size_t labelLen = hasCaretG ? bracket - 2 : bracket;
    s.substr(0, labelLen).CopyToString(&vertex->symbol.value);
    vertex->symbol.isNonTerminal = true;

    const std::size_t spanBegin = bracket + 1;
    const std::size_t comma = s.find(',', spanBegin + 1);
    std::string digits;
    s.substr(spanBegin, comma - spanBegin).CopyToString(&digits);
    vertex->start = std::atoi(digits.c_str());
    s.substr(comma + 1, s.size() - comma - 2).CopyToString(&digits);
    vertex->end = std::atoi(digits.c_str());
    return vertex;
  }

  // Terminal: no span information is available in the token.
  s.CopyToString(&vertex->symbol.value);
  vertex->symbol.isNonTerminal = false;
  vertex->start = vertex->end = -1;
  return vertex;
}
// Two parsers compare equal when they hold the same stream pointer.  Because
// an exhausted parser and a default-constructed parser both hold a null
// pointer, they compare equal to each other.
bool operator==(const ForestParser &lhs, const ForestParser &rhs)
{
  // TODO Is this right? Compare values of istreams if non-zero?
  const bool sameStream = (lhs.m_input == rhs.m_input);
  return sameStream;
}
// Negation of operator== for ForestParser.
bool operator!=(const ForestParser &lhs, const ForestParser &rhs)
{
  if (lhs == rhs) {
    return false;
  }
  return true;
}
} // namespace PostprocessEgretForests
} // namespace Syntax
} // namespace MosesTraining
|