|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "AlignedSentenceSyntax.h" |
|
|
#include "Parameter.h" |
|
|
#include "pugixml.hpp" |
|
|
#include "moses/Util.h" |
|
|
|
|
|
using namespace std; |
|
|
|
|
|
AlignedSentenceSyntax::AlignedSentenceSyntax(int lineNum, |
|
|
const std::string &source, |
|
|
const std::string &target, |
|
|
const std::string &alignment) |
|
|
:AlignedSentence(lineNum) |
|
|
,m_sourceStr(source) |
|
|
,m_targetStr(target) |
|
|
,m_alignmentStr(alignment) |
|
|
{ |
|
|
} |
|
|
|
|
|
AlignedSentenceSyntax::~AlignedSentenceSyntax() |
|
|
{ |
|
|
|
|
|
} |
|
|
|
|
|
void AlignedSentenceSyntax::Populate(bool isSyntax, int mixedSyntaxType, const Parameter ¶ms, |
|
|
string line, Phrase &phrase, SyntaxTree &tree) |
|
|
{ |
|
|
|
|
|
if (isSyntax) { |
|
|
line = "<xml><tree label=\"X\">" + line + "</tree></xml>"; |
|
|
XMLParse(phrase, tree, line, params); |
|
|
|
|
|
if (mixedSyntaxType != 0) { |
|
|
|
|
|
tree.SetHieroLabel(params.hieroNonTerm); |
|
|
if (mixedSyntaxType == 2) { |
|
|
tree.AddToAll(params.hieroNonTerm); |
|
|
} |
|
|
} |
|
|
} else { |
|
|
PopulateWordVec(phrase, line); |
|
|
tree.SetHieroLabel(params.hieroNonTerm); |
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
void AlignedSentenceSyntax::Create(const Parameter ¶ms) |
|
|
{ |
|
|
Populate(params.sourceSyntax, params.mixedSyntaxType, params, m_sourceStr, |
|
|
m_source, m_sourceTree); |
|
|
Populate(params.targetSyntax, params.mixedSyntaxType, params, m_targetStr, |
|
|
m_target, m_targetTree); |
|
|
|
|
|
PopulateAlignment(m_alignmentStr); |
|
|
CreateConsistentPhrases(params); |
|
|
|
|
|
|
|
|
CreateNonTerms(); |
|
|
} |
|
|
|
|
|
void Escape(string &text) |
|
|
{ |
|
|
text = Moses::Replace(text, "&", "&"); |
|
|
text = Moses::Replace(text, "|", "|"); |
|
|
text = Moses::Replace(text, "<", "<"); |
|
|
text = Moses::Replace(text, ">", ">"); |
|
|
text = Moses::Replace(text, "'", "'"); |
|
|
text = Moses::Replace(text, "\"", """); |
|
|
text = Moses::Replace(text, "[", "["); |
|
|
text = Moses::Replace(text, "]", "]"); |
|
|
|
|
|
} |
|
|
|
|
|
void AlignedSentenceSyntax::XMLParse(Phrase &output, |
|
|
SyntaxTree &tree, |
|
|
const pugi::xml_node &parentNode, |
|
|
const Parameter ¶ms) |
|
|
{ |
|
|
int childNum = 0; |
|
|
for (pugi::xml_node childNode = parentNode.first_child(); childNode; childNode = childNode.next_sibling()) { |
|
|
string nodeName = childNode.name(); |
|
|
|
|
|
|
|
|
string label; |
|
|
int startPos = output.size(); |
|
|
|
|
|
if (!nodeName.empty()) { |
|
|
pugi::xml_attribute attribute = childNode.attribute("label"); |
|
|
label = attribute.as_string(); |
|
|
|
|
|
|
|
|
XMLParse(output, tree, childNode, params); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
string text = childNode.value(); |
|
|
Escape(text); |
|
|
|
|
|
|
|
|
std::vector<string> toks; |
|
|
Moses::Tokenize(toks, text); |
|
|
|
|
|
for (size_t i = 0; i < toks.size(); ++i) { |
|
|
const string &tok = toks[i]; |
|
|
Word *word = new Word(output.size(), tok); |
|
|
output.push_back(word); |
|
|
} |
|
|
|
|
|
|
|
|
int endPos = output.size() - 1; |
|
|
|
|
|
|
|
|
if (!label.empty()) { |
|
|
label = "[" + label + "]"; |
|
|
tree.Add(startPos, endPos, label, params); |
|
|
} |
|
|
|
|
|
++childNum; |
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
void AlignedSentenceSyntax::XMLParse(Phrase &output, |
|
|
SyntaxTree &tree, |
|
|
const std::string input, |
|
|
const Parameter ¶ms) |
|
|
{ |
|
|
pugi::xml_document doc; |
|
|
pugi::xml_parse_result result = doc.load(input.c_str(), |
|
|
pugi::parse_default | pugi::parse_comments); |
|
|
|
|
|
pugi::xml_node topNode = doc.child("xml"); |
|
|
XMLParse(output, tree, topNode, params); |
|
|
} |
|
|
|
|
|
void AlignedSentenceSyntax::CreateNonTerms() |
|
|
{ |
|
|
for (int sourceStart = 0; sourceStart < m_source.size(); ++sourceStart) { |
|
|
for (int sourceEnd = sourceStart; sourceEnd < m_source.size(); ++sourceEnd) { |
|
|
ConsistentPhrases::Coll &coll = m_consistentPhrases.GetColl(sourceStart, sourceEnd); |
|
|
const SyntaxTree::Labels &sourceLabels = m_sourceTree.Find(sourceStart, sourceEnd); |
|
|
|
|
|
ConsistentPhrases::Coll::iterator iter; |
|
|
for (iter = coll.begin(); iter != coll.end(); ++iter) { |
|
|
ConsistentPhrase &cp = **iter; |
|
|
|
|
|
int targetStart = cp.corners[2]; |
|
|
int targetEnd = cp.corners[3]; |
|
|
const SyntaxTree::Labels &targetLabels = m_targetTree.Find(targetStart, targetEnd); |
|
|
|
|
|
CreateNonTerms(cp, sourceLabels, targetLabels); |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
void AlignedSentenceSyntax::CreateNonTerms(ConsistentPhrase &cp, |
|
|
const SyntaxTree::Labels &sourceLabels, |
|
|
const SyntaxTree::Labels &targetLabels) |
|
|
{ |
|
|
SyntaxTree::Labels::const_iterator iterSource; |
|
|
for (iterSource = sourceLabels.begin(); iterSource != sourceLabels.end(); ++iterSource) { |
|
|
const string &sourceLabel = *iterSource; |
|
|
|
|
|
SyntaxTree::Labels::const_iterator iterTarget; |
|
|
for (iterTarget = targetLabels.begin(); iterTarget != targetLabels.end(); ++iterTarget) { |
|
|
const string &targetLabel = *iterTarget; |
|
|
cp.AddNonTerms(sourceLabel, targetLabel); |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|