Upload 356 files
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +19 -0
- mosesdecoder/phrase-extract/Alignment.cpp +70 -0
- mosesdecoder/phrase-extract/Alignment.h +35 -0
- mosesdecoder/phrase-extract/AlignmentPhrase.cpp +49 -0
- mosesdecoder/phrase-extract/AlignmentPhrase.h +74 -0
- mosesdecoder/phrase-extract/DomainFeature.cpp +170 -0
- mosesdecoder/phrase-extract/DomainFeature.h +143 -0
- mosesdecoder/phrase-extract/ExtractedRule.h +83 -0
- mosesdecoder/phrase-extract/ExtractionPhrasePair.cpp +584 -0
- mosesdecoder/phrase-extract/ExtractionPhrasePair.h +179 -0
- mosesdecoder/phrase-extract/Hole.h +116 -0
- mosesdecoder/phrase-extract/HoleCollection.cpp +77 -0
- mosesdecoder/phrase-extract/HoleCollection.h +95 -0
- mosesdecoder/phrase-extract/InputFileStream.cpp +61 -0
- mosesdecoder/phrase-extract/InputFileStream.h +48 -0
- mosesdecoder/phrase-extract/InternalStructFeature.cpp +57 -0
- mosesdecoder/phrase-extract/InternalStructFeature.h +64 -0
- mosesdecoder/phrase-extract/Jamfile +19 -0
- mosesdecoder/phrase-extract/OutputFileStream.cpp +90 -0
- mosesdecoder/phrase-extract/OutputFileStream.h +81 -0
- mosesdecoder/phrase-extract/PhraseExtractionOptions.h +193 -0
- mosesdecoder/phrase-extract/PhraseOrientation.cpp +481 -0
- mosesdecoder/phrase-extract/PhraseOrientation.h +127 -0
- mosesdecoder/phrase-extract/PropertiesConsolidator.cpp +350 -0
- mosesdecoder/phrase-extract/PropertiesConsolidator.h +67 -0
- mosesdecoder/phrase-extract/RuleExist.h +65 -0
- mosesdecoder/phrase-extract/RuleExtractionOptions.h +95 -0
- mosesdecoder/phrase-extract/ScoreFeature.cpp +114 -0
- mosesdecoder/phrase-extract/ScoreFeature.h +143 -0
- mosesdecoder/phrase-extract/ScoreFeatureTest.cpp +140 -0
- mosesdecoder/phrase-extract/SentenceAlignment.cpp +144 -0
- mosesdecoder/phrase-extract/SentenceAlignment.h +59 -0
- mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.cpp +78 -0
- mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.h +69 -0
- mosesdecoder/phrase-extract/SyntaxNode.h +46 -0
- mosesdecoder/phrase-extract/SyntaxNodeCollection.cpp +163 -0
- mosesdecoder/phrase-extract/SyntaxNodeCollection.h +91 -0
- mosesdecoder/phrase-extract/SyntaxTree.h +12 -0
- mosesdecoder/phrase-extract/XmlException.h +46 -0
- mosesdecoder/phrase-extract/XmlTree.cpp +430 -0
- mosesdecoder/phrase-extract/XmlTree.h +41 -0
- mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ExtractionPhrasePair.o +0 -0
- mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest +3 -0
- mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.o +0 -0
- mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.output +8 -0
- mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.run +8 -0
- mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.test +1 -0
- mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ThreadPool.o +0 -0
- mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Timer.o +0 -0
- mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Util.o +0 -0
.gitattributes
CHANGED
|
@@ -105,3 +105,22 @@ mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/lmbrgrid fi
|
|
| 105 |
mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/moses filter=lfs diff=lfs merge=lfs -text
|
| 106 |
mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/vwtrainer filter=lfs diff=lfs merge=lfs -text
|
| 107 |
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/moses filter=lfs diff=lfs merge=lfs -text
|
| 106 |
mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/vwtrainer filter=lfs diff=lfs merge=lfs -text
|
| 107 |
mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt filter=lfs diff=lfs merge=lfs -text
|
| 108 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate filter=lfs diff=lfs merge=lfs -text
|
| 109 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate-direct filter=lfs diff=lfs merge=lfs -text
|
| 110 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate-reverse filter=lfs diff=lfs merge=lfs -text
|
| 111 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract filter=lfs diff=lfs merge=lfs -text
|
| 112 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract-lex filter=lfs diff=lfs merge=lfs -text
|
| 113 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract-rules filter=lfs diff=lfs merge=lfs -text
|
| 114 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/relax-parse filter=lfs diff=lfs merge=lfs -text
|
| 115 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/score filter=lfs diff=lfs merge=lfs -text
|
| 116 |
+
mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/statistics filter=lfs diff=lfs merge=lfs -text
|
| 117 |
+
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest filter=lfs diff=lfs merge=lfs -text
|
| 118 |
+
mosesdecoder/phrase-extract/extract-ghkm/bin/gcc-9/release/link-static/threading-multi/extract-ghkm filter=lfs diff=lfs merge=lfs -text
|
| 119 |
+
mosesdecoder/phrase-extract/extract-mixed-syntax/bin/gcc-9/release/link-static/threading-multi/extract-mixed-syntax filter=lfs diff=lfs merge=lfs -text
|
| 120 |
+
mosesdecoder/phrase-extract/filter-rule-table/bin/gcc-9/release/link-static/threading-multi/filter-rule-table filter=lfs diff=lfs merge=lfs -text
|
| 121 |
+
mosesdecoder/phrase-extract/lexical-reordering/bin/gcc-9/release/link-static/threading-multi/lexical-reordering-score filter=lfs diff=lfs merge=lfs -text
|
| 122 |
+
mosesdecoder/phrase-extract/postprocess-egret-forests/bin/gcc-9/release/link-static/threading-multi/postprocess-egret-forests filter=lfs diff=lfs merge=lfs -text
|
| 123 |
+
mosesdecoder/phrase-extract/score-stsg/bin/gcc-9/release/link-static/threading-multi/score-stsg filter=lfs diff=lfs merge=lfs -text
|
| 124 |
+
mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
|
| 125 |
+
mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_fragment_tokenizer_test filter=lfs diff=lfs merge=lfs -text
|
| 126 |
+
mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_test filter=lfs diff=lfs merge=lfs -text
|
mosesdecoder/phrase-extract/Alignment.cpp
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "Alignment.h"
|
| 21 |
+
|
| 22 |
+
#include "phrase-extract/syntax-common/exception.h"
|
| 23 |
+
|
| 24 |
+
#include <algorithm>
|
| 25 |
+
#include <cassert>
|
| 26 |
+
#include <cstdlib>
|
| 27 |
+
|
| 28 |
+
namespace MosesTraining
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
void ReadAlignment(const std::string &s, Alignment &a)
|
| 32 |
+
{
|
| 33 |
+
const std::string digits = "0123456789";
|
| 34 |
+
|
| 35 |
+
a.clear();
|
| 36 |
+
|
| 37 |
+
std::string::size_type begin = 0;
|
| 38 |
+
while (true) {
|
| 39 |
+
std::string::size_type end = s.find("-", begin);
|
| 40 |
+
if (end == std::string::npos) {
|
| 41 |
+
return;
|
| 42 |
+
}
|
| 43 |
+
int src = std::atoi(s.substr(begin, end-begin).c_str());
|
| 44 |
+
if (end+1 == s.size()) {
|
| 45 |
+
throw Syntax::Exception("Target index missing");
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
begin = end+1;
|
| 49 |
+
end = s.find_first_not_of(digits, begin+1);
|
| 50 |
+
int tgt;
|
| 51 |
+
if (end == std::string::npos) {
|
| 52 |
+
tgt = std::atoi(s.substr(begin).c_str());
|
| 53 |
+
a.push_back(std::make_pair(src, tgt));
|
| 54 |
+
return;
|
| 55 |
+
} else {
|
| 56 |
+
tgt = std::atoi(s.substr(begin, end-begin).c_str());
|
| 57 |
+
a.push_back(std::make_pair(src, tgt));
|
| 58 |
+
}
|
| 59 |
+
begin = end+1;
|
| 60 |
+
}
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
// Reverses the direction of an alignment in place by exchanging the two
// indices of every pair.
void FlipAlignment(Alignment &a)
{
  Alignment::iterator link = a.begin();
  const Alignment::iterator last = a.end();
  while (link != last) {
    std::swap(link->first, link->second);
    ++link;
  }
}
|
| 69 |
+
|
| 70 |
+
} // namespace MosesTraining
|
mosesdecoder/phrase-extract/Alignment.h
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include <string>
|
| 23 |
+
#include <utility>
|
| 24 |
+
#include <vector>
|
| 25 |
+
|
| 26 |
+
namespace MosesTraining
|
| 27 |
+
{
|
| 28 |
+
|
| 29 |
+
// An alignment: a list of (first, second) integer index pairs.
typedef std::vector< std::pair<int, int> > Alignment;

// Replaces the contents of a with the "src-tgt" pairs parsed from s.
void ReadAlignment(const std::string &s, Alignment &a);

// Swaps the two indices of every pair in a, in place.
void FlipAlignment(Alignment &a);
|
| 34 |
+
|
| 35 |
+
} // namespace MosesTraining
|
mosesdecoder/phrase-extract/AlignmentPhrase.cpp
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based language decoder
|
| 4 |
+
Copyright (C) 2006 University of Edinburgh
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
|
| 21 |
+
#include <algorithm>
|
| 22 |
+
#include <iostream>
|
| 23 |
+
#include "AlignmentPhrase.h"
|
| 24 |
+
|
| 25 |
+
using namespace std;
|
| 26 |
+
|
| 27 |
+
namespace MosesTraining
|
| 28 |
+
{
|
| 29 |
+
|
| 30 |
+
void AlignmentElement::Merge(size_t align)
|
| 31 |
+
{
|
| 32 |
+
m_elements.insert(align);
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
void AlignmentPhrase::Merge(const std::vector< std::vector<size_t> > &source)
|
| 36 |
+
{
|
| 37 |
+
for (size_t idx = 0 ; idx < source.size() ; ++idx) {
|
| 38 |
+
AlignmentElement &currElement = m_elements[idx];
|
| 39 |
+
const vector<size_t> &newElement = source[idx];
|
| 40 |
+
|
| 41 |
+
for (size_t pos = 0 ; pos < newElement.size() ; ++pos) {
|
| 42 |
+
currElement.Merge(newElement[pos]);
|
| 43 |
+
}
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
} // namespace
|
| 48 |
+
|
| 49 |
+
|
mosesdecoder/phrase-extract/AlignmentPhrase.h
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based language decoder
|
| 4 |
+
Copyright (C) 2006 University of Edinburgh
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
|
| 21 |
+
#pragma once
|
| 22 |
+
|
| 23 |
+
#include <vector>
|
| 24 |
+
#include <set>
|
| 25 |
+
|
| 26 |
+
namespace MosesTraining
|
| 27 |
+
{
|
| 28 |
+
|
| 29 |
+
class WordsRange;
|
| 30 |
+
|
| 31 |
+
// A set of alignment indices (presumably the positions one word is aligned
// to -- confirm against callers).
class AlignmentElement
{
protected:
  // Stored indices; std::set keeps them unique and sorted.
  std::set<size_t> m_elements;

public:
  typedef std::set<size_t>::iterator iterator;
  typedef std::set<size_t>::const_iterator const_iterator;

  // Creates an element with no indices.
  AlignmentElement() {}

  // Read-only iteration over the stored indices.
  const_iterator begin() const { return m_elements.begin(); }
  const_iterator end() const { return m_elements.end(); }

  // Number of distinct indices stored.
  size_t GetSize() const { return m_elements.size(); }

  // Adds one index to the set.
  void Merge(size_t align);
};
|
| 54 |
+
|
| 55 |
+
class AlignmentPhrase
|
| 56 |
+
{
|
| 57 |
+
protected:
|
| 58 |
+
std::vector<AlignmentElement> m_elements;
|
| 59 |
+
public:
|
| 60 |
+
AlignmentPhrase(size_t size)
|
| 61 |
+
:m_elements(size) {
|
| 62 |
+
}
|
| 63 |
+
void Merge(const AlignmentPhrase &newAlignment, const WordsRange &newAlignmentRange);
|
| 64 |
+
void Merge(const std::vector< std::vector<size_t> > &source);
|
| 65 |
+
size_t GetSize() const {
|
| 66 |
+
return m_elements.size();
|
| 67 |
+
}
|
| 68 |
+
const AlignmentElement &GetElement(size_t pos) const {
|
| 69 |
+
return m_elements[pos];
|
| 70 |
+
}
|
| 71 |
+
};
|
| 72 |
+
|
| 73 |
+
} // namespace
|
| 74 |
+
|
mosesdecoder/phrase-extract/DomainFeature.cpp
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "DomainFeature.h"
|
| 2 |
+
#include "ExtractionPhrasePair.h"
|
| 3 |
+
#include "tables-core.h"
|
| 4 |
+
#include "InputFileStream.h"
|
| 5 |
+
#include "util/tokenize.hh"
|
| 6 |
+
|
| 7 |
+
using namespace std;
|
| 8 |
+
|
| 9 |
+
namespace MosesTraining
|
| 10 |
+
{
|
| 11 |
+
|
| 12 |
+
// handling of domain names: load database with sentence-id / domain name info
|
| 13 |
+
void Domain::load( const std::string &domainFileName )
|
| 14 |
+
{
|
| 15 |
+
Moses::InputFileStream fileS( domainFileName );
|
| 16 |
+
istream *fileP = &fileS;
|
| 17 |
+
|
| 18 |
+
string line;
|
| 19 |
+
while(getline(*fileP, line)) {
|
| 20 |
+
// read
|
| 21 |
+
const vector< string > domainSpecLine = util::tokenize( line );
|
| 22 |
+
int lineNumber;
|
| 23 |
+
if (domainSpecLine.size() != 2 ||
|
| 24 |
+
! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
|
| 25 |
+
std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
|
| 26 |
+
exit(1);
|
| 27 |
+
}
|
| 28 |
+
// store
|
| 29 |
+
const string &name = domainSpecLine[1];
|
| 30 |
+
spec.push_back( make_pair( lineNumber, name ));
|
| 31 |
+
if (name2id.find( name ) == name2id.end()) {
|
| 32 |
+
name2id[ name ] = list.size();
|
| 33 |
+
list.push_back( name );
|
| 34 |
+
}
|
| 35 |
+
}
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
// get domain name based on sentence number
|
| 39 |
+
// Scans spec in order and returns the name of the first entry whose number is
// greater than or equal to sentenceId; returns "undefined" if there is none.
string Domain::getDomainOfSentence( int sentenceId ) const
{
  typedef vector< pair< int, string > >::const_iterator SpecIter;

  for (SpecIter entry = spec.begin(); entry != spec.end(); ++entry) {
    if (entry->first >= sentenceId) {
      return entry->second;
    }
  }
  return "undefined";
}
|
| 48 |
+
|
| 49 |
+
// Sets the property key to "domain" and loads the domain specification from
// domainFile into m_domain.
DomainFeature::DomainFeature(const string& domainFile)
  : m_propertyKey("domain")
{
  m_domain.load(domainFile);
}
|
| 54 |
+
|
| 55 |
+
// Looks up the domain name for sentenceId and adds it to phrasePair as a
// property under m_propertyKey, together with count.
void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
    float count,
    int sentenceId) const
{
  std::string domainName = m_domain.getDomainOfSentence(sentenceId);
  phrasePair.AddProperty(m_propertyKey, domainName, count);
}
|
| 62 |
+
|
| 63 |
+
void DomainFeature::add(const ScoreFeatureContext& context,
|
| 64 |
+
std::vector<float>& denseValues,
|
| 65 |
+
std::map<std::string,float>& sparseValues) const
|
| 66 |
+
{
|
| 67 |
+
const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
|
| 68 |
+
assert( domainCount != NULL );
|
| 69 |
+
add(*domainCount,
|
| 70 |
+
context.phrasePair.GetCount(),
|
| 71 |
+
context.maybeLog,
|
| 72 |
+
denseValues, sparseValues);
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
void SubsetDomainFeature::add(const map<string,float>& domainCount,
|
| 76 |
+
float count,
|
| 77 |
+
const MaybeLog& maybeLog,
|
| 78 |
+
std::vector<float>& denseValues,
|
| 79 |
+
std::map<std::string,float>& sparseValues) const
|
| 80 |
+
{
|
| 81 |
+
if (m_domain.list.size() > 6) {
|
| 82 |
+
UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
|
| 83 |
+
"too many domains for core domain subset features");
|
| 84 |
+
}
|
| 85 |
+
size_t bitmap = 0;
|
| 86 |
+
for(size_t bit = 0; bit < m_domain.list.size(); bit++) {
|
| 87 |
+
if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) {
|
| 88 |
+
bitmap += 1 << bit;
|
| 89 |
+
}
|
| 90 |
+
}
|
| 91 |
+
for(size_t i = 1; i < (1 << m_domain.list.size()); i++) {
|
| 92 |
+
denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 ));
|
| 93 |
+
}
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float count,
|
| 97 |
+
const MaybeLog& maybeLog,
|
| 98 |
+
std::vector<float>& denseValues,
|
| 99 |
+
std::map<std::string,float>& sparseValues) const
|
| 100 |
+
{
|
| 101 |
+
typedef vector<string>::const_iterator I;
|
| 102 |
+
ostringstream key;
|
| 103 |
+
key << "doms";
|
| 104 |
+
for (I i = m_domain.list.begin(); i != m_domain.list.end(); ++i) {
|
| 105 |
+
if (domainCount.find(*i) != domainCount.end()) {
|
| 106 |
+
key << "_" << *i;
|
| 107 |
+
}
|
| 108 |
+
}
|
| 109 |
+
sparseValues[key.str()] = 1;
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
void RatioDomainFeature::add(const map<string,float>& domainCount,float count,
|
| 114 |
+
const MaybeLog& maybeLog,
|
| 115 |
+
std::vector<float>& denseValues,
|
| 116 |
+
std::map<std::string,float>& sparseValues) const
|
| 117 |
+
{
|
| 118 |
+
typedef vector< string >::const_iterator I;
|
| 119 |
+
for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
|
| 120 |
+
map<string,float>::const_iterator dci = domainCount.find(*i);
|
| 121 |
+
if (dci == domainCount.end() ) {
|
| 122 |
+
denseValues.push_back(maybeLog( 1 ));
|
| 123 |
+
} else {
|
| 124 |
+
denseValues.push_back(maybeLog(exp( dci->second / count ) ));
|
| 125 |
+
}
|
| 126 |
+
}
|
| 127 |
+
}
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float count,
|
| 131 |
+
const MaybeLog& maybeLog,
|
| 132 |
+
std::vector<float>& denseValues,
|
| 133 |
+
std::map<std::string,float>& sparseValues) const
|
| 134 |
+
{
|
| 135 |
+
typedef map< string, float >::const_iterator I;
|
| 136 |
+
for (I i=domainCount.begin(); i != domainCount.end(); i++) {
|
| 137 |
+
sparseValues["domr_" + i->first] = (i->second / count);
|
| 138 |
+
}
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
void IndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
|
| 143 |
+
const MaybeLog& maybeLog,
|
| 144 |
+
std::vector<float>& denseValues,
|
| 145 |
+
std::map<std::string,float>& sparseValues) const
|
| 146 |
+
{
|
| 147 |
+
typedef vector< string >::const_iterator I;
|
| 148 |
+
for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
|
| 149 |
+
map<string,float>::const_iterator dci = domainCount.find(*i);
|
| 150 |
+
if (dci == domainCount.end() ) {
|
| 151 |
+
denseValues.push_back(maybeLog( 1 ));
|
| 152 |
+
} else {
|
| 153 |
+
denseValues.push_back(maybeLog(2.718));
|
| 154 |
+
}
|
| 155 |
+
}
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
|
| 159 |
+
const MaybeLog& maybeLog,
|
| 160 |
+
std::vector<float>& denseValues,
|
| 161 |
+
std::map<std::string,float>& sparseValues) const
|
| 162 |
+
{
|
| 163 |
+
typedef map< string, float >::const_iterator I;
|
| 164 |
+
for (I i=domainCount.begin(); i != domainCount.end(); i++) {
|
| 165 |
+
sparseValues["dom_" + i->first] = 1;
|
| 166 |
+
}
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
}
|
| 170 |
+
|
mosesdecoder/phrase-extract/DomainFeature.h
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
#ifndef _DOMAIN_H
|
| 4 |
+
#define _DOMAIN_H
|
| 5 |
+
|
| 6 |
+
#include <iostream>
|
| 7 |
+
#include <fstream>
|
| 8 |
+
#include <cassert>
|
| 9 |
+
#include <cstdlib>
|
| 10 |
+
#include <string>
|
| 11 |
+
#include <queue>
|
| 12 |
+
#include <map>
|
| 13 |
+
#include <cmath>
|
| 14 |
+
|
| 15 |
+
#include "ScoreFeature.h"
|
| 16 |
+
|
| 17 |
+
namespace MosesTraining
|
| 18 |
+
{
|
| 19 |
+
|
| 20 |
+
// Domain information read from a domain specification file.
class Domain
{
public:
  // Fills spec, list and name2id from the given file.
  void load( const std::string &fileName );

  // Returns the domain name recorded for sentenceId.
  std::string getDomainOfSentence( int sentenceId ) const;

  // (number, domain name) pairs in the order they were read.
  std::vector< std::pair< int, std::string > > spec;
  // Distinct domain names in order of first appearance.
  std::vector< std::string > list;
  // Domain name -> index of that name in list.
  std::map< std::string, int > name2id;
};
|
| 29 |
+
|
| 30 |
+
class DomainFeature : public ScoreFeature
|
| 31 |
+
{
|
| 32 |
+
public:
|
| 33 |
+
|
| 34 |
+
DomainFeature(const std::string& domainFile);
|
| 35 |
+
|
| 36 |
+
void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
| 37 |
+
float count,
|
| 38 |
+
int sentenceId) const;
|
| 39 |
+
|
| 40 |
+
void add(const ScoreFeatureContext& context,
|
| 41 |
+
std::vector<float>& denseValues,
|
| 42 |
+
std::map<std::string,float>& sparseValues) const;
|
| 43 |
+
|
| 44 |
+
protected:
|
| 45 |
+
/** Overridden in subclass */
|
| 46 |
+
virtual void add(const std::map<std::string,float>& domainCounts, float count,
|
| 47 |
+
const MaybeLog& maybeLog,
|
| 48 |
+
std::vector<float>& denseValues,
|
| 49 |
+
std::map<std::string,float>& sparseValues) const = 0;
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
Domain m_domain;
|
| 53 |
+
|
| 54 |
+
const std::string m_propertyKey;
|
| 55 |
+
|
| 56 |
+
};
|
| 57 |
+
|
| 58 |
+
class SubsetDomainFeature : public DomainFeature
|
| 59 |
+
{
|
| 60 |
+
public:
|
| 61 |
+
SubsetDomainFeature(const std::string& domainFile) :
|
| 62 |
+
DomainFeature(domainFile) {}
|
| 63 |
+
|
| 64 |
+
protected:
|
| 65 |
+
virtual void add(const std::map<std::string,float>& domainCounts, float count,
|
| 66 |
+
const MaybeLog& maybeLog,
|
| 67 |
+
std::vector<float>& denseValues,
|
| 68 |
+
std::map<std::string,float>& sparseValues) const;
|
| 69 |
+
};
|
| 70 |
+
|
| 71 |
+
class SparseSubsetDomainFeature : public DomainFeature
|
| 72 |
+
{
|
| 73 |
+
public:
|
| 74 |
+
SparseSubsetDomainFeature(const std::string& domainFile) :
|
| 75 |
+
DomainFeature(domainFile) {}
|
| 76 |
+
|
| 77 |
+
protected:
|
| 78 |
+
virtual void add(const std::map<std::string,float>& domainCounts, float count,
|
| 79 |
+
const MaybeLog& maybeLog,
|
| 80 |
+
std::vector<float>& denseValues,
|
| 81 |
+
std::map<std::string,float>& sparseValues) const;
|
| 82 |
+
|
| 83 |
+
};
|
| 84 |
+
|
| 85 |
+
class IndicatorDomainFeature : public DomainFeature
|
| 86 |
+
{
|
| 87 |
+
public:
|
| 88 |
+
IndicatorDomainFeature(const std::string& domainFile) :
|
| 89 |
+
DomainFeature(domainFile) {}
|
| 90 |
+
|
| 91 |
+
protected:
|
| 92 |
+
virtual void add(const std::map<std::string,float>& domainCounts, float count,
|
| 93 |
+
const MaybeLog& maybeLog,
|
| 94 |
+
std::vector<float>& denseValues,
|
| 95 |
+
std::map<std::string,float>& sparseValues) const;
|
| 96 |
+
};
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
class SparseIndicatorDomainFeature : public DomainFeature
|
| 100 |
+
{
|
| 101 |
+
public:
|
| 102 |
+
SparseIndicatorDomainFeature(const std::string& domainFile) :
|
| 103 |
+
DomainFeature(domainFile) {}
|
| 104 |
+
|
| 105 |
+
protected:
|
| 106 |
+
virtual void add(const std::map<std::string,float>& domainCounts, float count,
|
| 107 |
+
const MaybeLog& maybeLog,
|
| 108 |
+
std::vector<float>& denseValues,
|
| 109 |
+
std::map<std::string,float>& sparseValues) const;
|
| 110 |
+
};
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class RatioDomainFeature : public DomainFeature
|
| 114 |
+
{
|
| 115 |
+
public:
|
| 116 |
+
RatioDomainFeature(const std::string& domainFile) :
|
| 117 |
+
DomainFeature(domainFile) {}
|
| 118 |
+
|
| 119 |
+
protected:
|
| 120 |
+
virtual void add(const std::map<std::string,float>& domainCounts, float count,
|
| 121 |
+
const MaybeLog& maybeLog,
|
| 122 |
+
std::vector<float>& denseValues,
|
| 123 |
+
std::map<std::string,float>& sparseValues) const;
|
| 124 |
+
};
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
class SparseRatioDomainFeature : public DomainFeature
|
| 128 |
+
{
|
| 129 |
+
public:
|
| 130 |
+
SparseRatioDomainFeature(const std::string& domainFile) :
|
| 131 |
+
DomainFeature(domainFile) {}
|
| 132 |
+
|
| 133 |
+
protected:
|
| 134 |
+
virtual void add(const std::map<std::string,float>& domainCounts, float count,
|
| 135 |
+
const MaybeLog& maybeLog,
|
| 136 |
+
std::vector<float>& denseValues,
|
| 137 |
+
std::map<std::string,float>& sparseValues) const;
|
| 138 |
+
};
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
#endif
|
mosesdecoder/phrase-extract/ExtractedRule.h
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
#ifndef EXTRACTEDRULE_H_INCLUDED_
|
| 22 |
+
#define EXTRACTEDRULE_H_INCLUDED_
|
| 23 |
+
|
| 24 |
+
#include <string>
|
| 25 |
+
#include <iostream>
|
| 26 |
+
#include <sstream>
|
| 27 |
+
#include <map>
|
| 28 |
+
|
| 29 |
+
#include "PhraseOrientation.h"
|
| 30 |
+
|
| 31 |
+
namespace MosesTraining
|
| 32 |
+
{
|
| 33 |
+
|
| 34 |
+
// sentence-level collection of rules
|
| 35 |
+
class ExtractedRule
|
| 36 |
+
{
|
| 37 |
+
public:
|
| 38 |
+
std::string source;
|
| 39 |
+
std::string target;
|
| 40 |
+
std::string alignment;
|
| 41 |
+
std::string alignmentInv;
|
| 42 |
+
std::string sourceContextLeft;
|
| 43 |
+
std::string sourceContextRight;
|
| 44 |
+
std::string targetContextLeft;
|
| 45 |
+
std::string targetContextRight;
|
| 46 |
+
std::string sourceHoleString;
|
| 47 |
+
std::string targetHoleString;
|
| 48 |
+
std::string targetSyntacticPreference;
|
| 49 |
+
int startT;
|
| 50 |
+
int endT;
|
| 51 |
+
int startS;
|
| 52 |
+
int endS;
|
| 53 |
+
float count;
|
| 54 |
+
double pcfgScore;
|
| 55 |
+
PhraseOrientation::REO_CLASS l2rOrientation;
|
| 56 |
+
PhraseOrientation::REO_CLASS r2lOrientation;
|
| 57 |
+
|
| 58 |
+
ExtractedRule(int sT, int eT, int sS, int eS)
|
| 59 |
+
: source()
|
| 60 |
+
, target()
|
| 61 |
+
, alignment()
|
| 62 |
+
, alignmentInv()
|
| 63 |
+
, sourceContextLeft()
|
| 64 |
+
, sourceContextRight()
|
| 65 |
+
, targetContextLeft()
|
| 66 |
+
, targetContextRight()
|
| 67 |
+
, sourceHoleString()
|
| 68 |
+
, targetHoleString()
|
| 69 |
+
, targetSyntacticPreference()
|
| 70 |
+
, startT(sT)
|
| 71 |
+
, endT(eT)
|
| 72 |
+
, startS(sS)
|
| 73 |
+
, endS(eS)
|
| 74 |
+
, count(0)
|
| 75 |
+
, pcfgScore(0.0)
|
| 76 |
+
, l2rOrientation(PhraseOrientation::REO_CLASS_UNKNOWN)
|
| 77 |
+
, r2lOrientation(PhraseOrientation::REO_CLASS_UNKNOWN)
|
| 78 |
+
{ }
|
| 79 |
+
};
|
| 80 |
+
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
#endif
|
mosesdecoder/phrase-extract/ExtractionPhrasePair.cpp
ADDED
|
@@ -0,0 +1,584 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2009 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include <sstream>
|
| 21 |
+
#include "ExtractionPhrasePair.h"
|
| 22 |
+
#include "tables-core.h"
|
| 23 |
+
#include "score.h"
|
| 24 |
+
#include "moses/Util.h"
|
| 25 |
+
|
| 26 |
+
#include <cstdlib>
|
| 27 |
+
|
| 28 |
+
using namespace std;
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
namespace MosesTraining
|
| 32 |
+
{
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
extern Vocabulary vcbT;
|
| 36 |
+
extern Vocabulary vcbS;
|
| 37 |
+
|
| 38 |
+
extern bool hierarchicalFlag;
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
// Creates a phrase pair from its first observation.
// The given alignment pointer is stored as a key of m_targetToSourceAlignments
// with the initial count, and remembered as the "last" alignment.
// phraseSource must not be empty (asserted).
ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
    const PHRASE *phraseTarget,
    ALIGNMENT *targetToSourceAlignment,
    float count, float pcfgSum )
  : m_isValid(true)
  , m_phraseSource(phraseSource)
  , m_phraseTarget(phraseTarget)
  , m_count(count)
  , m_pcfgSum(pcfgSum)
  , m_lastCount(count)
  , m_lastPcfgSum(pcfgSum)
{
  assert(!phraseSource->empty());

  typedef std::map<ALIGNMENT*,float> AlignmentCounts;
  m_lastTargetToSourceAlignment =
    m_targetToSourceAlignments.insert( AlignmentCounts::value_type(targetToSourceAlignment, count) ).first;
}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
// All owned resources are released by Clear().
ExtractionPhrasePair::~ExtractionPhrasePair()
{
  Clear();
}
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
// return value: true if the given alignment was seen for the first time and thus will be stored,
|
| 73 |
+
// false if it was present already (the pointer may thus be deleted(
|
| 74 |
+
bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment,
|
| 75 |
+
float count, float pcfgSum )
|
| 76 |
+
{
|
| 77 |
+
m_count += count;
|
| 78 |
+
m_pcfgSum += pcfgSum;
|
| 79 |
+
|
| 80 |
+
m_lastCount = count;
|
| 81 |
+
m_lastPcfgSum = pcfgSum;
|
| 82 |
+
|
| 83 |
+
std::map<ALIGNMENT*,float>::iterator iter = m_lastTargetToSourceAlignment;
|
| 84 |
+
if ( *(iter->first) == *targetToSourceAlignment ) {
|
| 85 |
+
iter->second += count;
|
| 86 |
+
return false;
|
| 87 |
+
} else {
|
| 88 |
+
std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
|
| 89 |
+
m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
|
| 90 |
+
if ( !insertedAlignment.second ) {
|
| 91 |
+
// the alignment already exists: increment count
|
| 92 |
+
insertedAlignment.first->second += count;
|
| 93 |
+
return false;
|
| 94 |
+
}
|
| 95 |
+
m_lastTargetToSourceAlignment = insertedAlignment.first;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
return true;
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum )
|
| 103 |
+
{
|
| 104 |
+
m_count += count;
|
| 105 |
+
m_pcfgSum += pcfgSum;
|
| 106 |
+
m_lastTargetToSourceAlignment->second += count;
|
| 107 |
+
// properties
|
| 108 |
+
for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
|
| 109 |
+
iter !=m_properties.end(); ++iter ) {
|
| 110 |
+
LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
|
| 111 |
+
(*lastPropertyValue)->second += count;
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
m_lastCount = count;
|
| 115 |
+
m_lastPcfgSum = pcfgSum;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
// Check for lexical match
|
| 120 |
+
// and in case of SCFG rules for equal non-terminal alignment.
|
| 121 |
+
bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
|
| 122 |
+
const PHRASE *otherPhraseTarget,
|
| 123 |
+
ALIGNMENT *otherTargetToSourceAlignment ) const
|
| 124 |
+
{
|
| 125 |
+
if (*otherPhraseTarget != *m_phraseTarget) {
|
| 126 |
+
return false;
|
| 127 |
+
}
|
| 128 |
+
if (*otherPhraseSource != *m_phraseSource) {
|
| 129 |
+
return false;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
return MatchesAlignment( otherTargetToSourceAlignment );
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
// Check for lexical match
|
| 136 |
+
// and in case of SCFG rules for equal non-terminal alignment.
|
| 137 |
+
// Set boolean indicators.
|
| 138 |
+
// (Note that we check in the order: target - source - alignment
|
| 139 |
+
// and do not touch the subsequent boolean indicators once a previous one has been set to false.)
|
| 140 |
+
bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
|
| 141 |
+
const PHRASE *otherPhraseTarget,
|
| 142 |
+
ALIGNMENT *otherTargetToSourceAlignment,
|
| 143 |
+
bool &sourceMatch,
|
| 144 |
+
bool &targetMatch,
|
| 145 |
+
bool &alignmentMatch ) const
|
| 146 |
+
{
|
| 147 |
+
if (*otherPhraseSource != *m_phraseSource) {
|
| 148 |
+
sourceMatch = false;
|
| 149 |
+
return false;
|
| 150 |
+
} else {
|
| 151 |
+
sourceMatch = true;
|
| 152 |
+
}
|
| 153 |
+
if (*otherPhraseTarget != *m_phraseTarget) {
|
| 154 |
+
targetMatch = false;
|
| 155 |
+
return false;
|
| 156 |
+
} else {
|
| 157 |
+
targetMatch = true;
|
| 158 |
+
}
|
| 159 |
+
if ( !MatchesAlignment(otherTargetToSourceAlignment) ) {
|
| 160 |
+
alignmentMatch = false;
|
| 161 |
+
return false;
|
| 162 |
+
} else {
|
| 163 |
+
alignmentMatch = true;
|
| 164 |
+
}
|
| 165 |
+
return true;
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
// Check for equal non-terminal alignment in case of SCFG rules.
|
| 169 |
+
// Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first
|
| 170 |
+
bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const
|
| 171 |
+
{
|
| 172 |
+
if (!hierarchicalFlag) return true;
|
| 173 |
+
|
| 174 |
+
// all or none of the phrasePair's word alignment matrices match, so just pick one
|
| 175 |
+
const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first;
|
| 176 |
+
|
| 177 |
+
assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1);
|
| 178 |
+
assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size());
|
| 179 |
+
|
| 180 |
+
// loop over all symbols but the left hand side of the rule
|
| 181 |
+
for (size_t i=0; i<thisTargetToSourceAlignment->size()-1; ++i) {
|
| 182 |
+
if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) {
|
| 183 |
+
size_t thisAlign = *(thisTargetToSourceAlignment->at(i).begin());
|
| 184 |
+
size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin());
|
| 185 |
+
|
| 186 |
+
if (thisTargetToSourceAlignment->at(i).size() != 1 ||
|
| 187 |
+
otherTargetToSourceAlignment->at(i).size() != 1 ||
|
| 188 |
+
thisAlign != otherAlign) {
|
| 189 |
+
return false;
|
| 190 |
+
}
|
| 191 |
+
}
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
return true;
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
void ExtractionPhrasePair::Clear()
|
| 198 |
+
{
|
| 199 |
+
delete m_phraseSource;
|
| 200 |
+
delete m_phraseTarget;
|
| 201 |
+
|
| 202 |
+
m_count = 0.0f;
|
| 203 |
+
m_pcfgSum = 0.0f;
|
| 204 |
+
|
| 205 |
+
for ( std::map<ALIGNMENT*,float>::iterator iter=m_targetToSourceAlignments.begin();
|
| 206 |
+
iter!=m_targetToSourceAlignments.end(); ++iter) {
|
| 207 |
+
delete iter->first;
|
| 208 |
+
}
|
| 209 |
+
m_targetToSourceAlignments.clear();
|
| 210 |
+
|
| 211 |
+
for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
|
| 212 |
+
iter!=m_properties.end(); ++iter) {
|
| 213 |
+
delete (iter->second).second;
|
| 214 |
+
delete (iter->second).first;
|
| 215 |
+
}
|
| 216 |
+
m_properties.clear();
|
| 217 |
+
|
| 218 |
+
m_lastCount = 0.0f;
|
| 219 |
+
m_lastPcfgSum = 0.0f;
|
| 220 |
+
m_lastTargetToSourceAlignment = m_targetToSourceAlignments.begin();
|
| 221 |
+
|
| 222 |
+
m_isValid = false;
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, float count )
|
| 227 |
+
{
|
| 228 |
+
if (propertiesString.empty()) {
|
| 229 |
+
return;
|
| 230 |
+
}
|
| 231 |
+
|
| 232 |
+
vector<std::string> toks;
|
| 233 |
+
Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
|
| 234 |
+
for (size_t i = 1; i < toks.size(); ++i) {
|
| 235 |
+
std::string &tok = toks[i];
|
| 236 |
+
if (tok.empty()) {
|
| 237 |
+
continue;
|
| 238 |
+
}
|
| 239 |
+
size_t endPos = tok.rfind("}");
|
| 240 |
+
tok = tok.substr(0, endPos - 1);
|
| 241 |
+
|
| 242 |
+
vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
|
| 243 |
+
if (keyValue.size() == 2) {
|
| 244 |
+
AddProperty(keyValue[0], keyValue[1], count);
|
| 245 |
+
}
|
| 246 |
+
}
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
// Returns the stored alignment with the highest count, or NULL if no
// alignment is stored. Ties on the count are resolved in favour of the
// alignment that compares greater (operator> on ALIGNMENT).
const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const
{
  typedef std::map<ALIGNMENT*,float>::const_iterator AlignmentIter;

  const AlignmentIter none = m_targetToSourceAlignments.end();
  AlignmentIter winner = none;
  float winnerCount = -1;

  for (AlignmentIter cand = m_targetToSourceAlignments.begin(); cand != none; ++cand) {
    bool takeIt = cand->second > winnerCount;
    if ( !takeIt && cand->second == winnerCount ) {
      // equal count: break the tie on the alignment itself
      takeIt = *(cand->first) > *(winner->first);
    }
    if ( takeIt ) {
      winnerCount = cand->second;
      winner = cand;
    }
  }

  if ( winner == none ) {
    return NULL;
  }
  return winner->first;
}
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
const std::string *ExtractionPhrasePair::FindBestPropertyValue(const std::string &key) const
|
| 275 |
+
{
|
| 276 |
+
float bestPropertyCount = -1;
|
| 277 |
+
|
| 278 |
+
const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
|
| 279 |
+
if ( allPropertyValues == NULL ) {
|
| 280 |
+
return NULL;
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
PROPERTY_VALUES::const_iterator bestPropertyValue = allPropertyValues->end();
|
| 284 |
+
|
| 285 |
+
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
| 286 |
+
iter!=allPropertyValues->end(); ++iter) {
|
| 287 |
+
if ( (iter->second > bestPropertyCount) ||
|
| 288 |
+
( (iter->second == bestPropertyCount) &&
|
| 289 |
+
(iter->first > bestPropertyValue->first) ) ) {
|
| 290 |
+
bestPropertyCount = iter->second;
|
| 291 |
+
bestPropertyValue = iter;
|
| 292 |
+
}
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
if ( bestPropertyValue == allPropertyValues->end()) {
|
| 296 |
+
return NULL;
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
return &(bestPropertyValue->first);
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &key) const
|
| 304 |
+
{
|
| 305 |
+
const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
|
| 306 |
+
|
| 307 |
+
if ( allPropertyValues == NULL ) {
|
| 308 |
+
return "";
|
| 309 |
+
}
|
| 310 |
+
|
| 311 |
+
std::ostringstream oss;
|
| 312 |
+
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
| 313 |
+
iter!=allPropertyValues->end(); ++iter) {
|
| 314 |
+
if (!(iter->first).empty()) {
|
| 315 |
+
if (iter!=allPropertyValues->begin()) {
|
| 316 |
+
oss << " ";
|
| 317 |
+
}
|
| 318 |
+
oss << iter->first;
|
| 319 |
+
oss << " ";
|
| 320 |
+
oss << iter->second;
|
| 321 |
+
}
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
std::string allPropertyValuesString(oss.str());
|
| 325 |
+
return allPropertyValuesString;
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
std::string ExtractionPhrasePair::CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
|
| 330 |
+
std::set<std::string>& labelSet,
|
| 331 |
+
boost::unordered_map<std::string,float>& countsLabelsLHS,
|
| 332 |
+
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& jointCountsRulesTargetLHSAndLabelsLHS,
|
| 333 |
+
Vocabulary &vcbT) const
|
| 334 |
+
{
|
| 335 |
+
const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
|
| 336 |
+
|
| 337 |
+
if ( allPropertyValues == NULL ) {
|
| 338 |
+
return "";
|
| 339 |
+
}
|
| 340 |
+
|
| 341 |
+
std::string lhs="", rhs="", currentRhs="";
|
| 342 |
+
float currentRhsCount = 0.0;
|
| 343 |
+
std::list< std::pair<std::string,float> > lhsGivenCurrentRhsCounts;
|
| 344 |
+
|
| 345 |
+
std::ostringstream oss;
|
| 346 |
+
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
| 347 |
+
iter!=allPropertyValues->end(); ++iter) {
|
| 348 |
+
|
| 349 |
+
size_t space = (iter->first).find_last_of(' ');
|
| 350 |
+
if ( space == string::npos ) {
|
| 351 |
+
lhs = iter->first;
|
| 352 |
+
rhs.clear();
|
| 353 |
+
} else {
|
| 354 |
+
lhs = (iter->first).substr(space+1);
|
| 355 |
+
rhs = (iter->first).substr(0,space);
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
labelSet.insert(lhs);
|
| 359 |
+
|
| 360 |
+
if ( rhs.compare(currentRhs) ) {
|
| 361 |
+
|
| 362 |
+
if ( iter!=allPropertyValues->begin() ) {
|
| 363 |
+
if ( !currentRhs.empty() ) {
|
| 364 |
+
istringstream tokenizer(currentRhs);
|
| 365 |
+
std::string rhsLabel;
|
| 366 |
+
while ( tokenizer.peek() != EOF ) {
|
| 367 |
+
tokenizer >> rhsLabel;
|
| 368 |
+
labelSet.insert(rhsLabel);
|
| 369 |
+
}
|
| 370 |
+
oss << " " << currentRhs << " " << currentRhsCount;
|
| 371 |
+
}
|
| 372 |
+
if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
|
| 373 |
+
if ( !currentRhs.empty() ) {
|
| 374 |
+
oss << " " << lhsGivenCurrentRhsCounts.size();
|
| 375 |
+
}
|
| 376 |
+
for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
|
| 377 |
+
iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
|
| 378 |
+
oss << " " << iter2->first << " " << iter2->second;
|
| 379 |
+
|
| 380 |
+
// update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS
|
| 381 |
+
std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
|
| 382 |
+
ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets
|
| 383 |
+
ruleTargetLhs.erase(ruleTargetLhs.size()-1);
|
| 384 |
+
|
| 385 |
+
std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
|
| 386 |
+
countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
| 387 |
+
if (!insertedCountsLabelsLHS.second) {
|
| 388 |
+
(insertedCountsLabelsLHS.first)->second += iter2->second;
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
|
| 392 |
+
jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
|
| 393 |
+
if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
|
| 394 |
+
boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
|
| 395 |
+
jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
| 396 |
+
jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
|
| 397 |
+
} else {
|
| 398 |
+
boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
|
| 399 |
+
std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
|
| 400 |
+
jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
| 401 |
+
if (!insertedJointCounts.second) {
|
| 402 |
+
(insertedJointCounts.first)->second += iter2->second;
|
| 403 |
+
}
|
| 404 |
+
}
|
| 405 |
+
|
| 406 |
+
}
|
| 407 |
+
}
|
| 408 |
+
|
| 409 |
+
lhsGivenCurrentRhsCounts.clear();
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
currentRhsCount = 0.0;
|
| 413 |
+
currentRhs = rhs;
|
| 414 |
+
}
|
| 415 |
+
|
| 416 |
+
currentRhsCount += iter->second;
|
| 417 |
+
lhsGivenCurrentRhsCounts.push_back( std::pair<std::string,float>(lhs,iter->second) );
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
if ( !currentRhs.empty() ) {
|
| 421 |
+
istringstream tokenizer(currentRhs);
|
| 422 |
+
std::string rhsLabel;
|
| 423 |
+
while ( tokenizer.peek() != EOF ) {
|
| 424 |
+
tokenizer >> rhsLabel;
|
| 425 |
+
labelSet.insert(rhsLabel);
|
| 426 |
+
}
|
| 427 |
+
oss << " " << currentRhs << " " << currentRhsCount;
|
| 428 |
+
}
|
| 429 |
+
if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
|
| 430 |
+
if ( !currentRhs.empty() ) {
|
| 431 |
+
oss << " " << lhsGivenCurrentRhsCounts.size();
|
| 432 |
+
}
|
| 433 |
+
for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
|
| 434 |
+
iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
|
| 435 |
+
oss << " " << iter2->first << " " << iter2->second;
|
| 436 |
+
|
| 437 |
+
// update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS
|
| 438 |
+
std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
|
| 439 |
+
ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets
|
| 440 |
+
ruleTargetLhs.erase(ruleTargetLhs.size()-1);
|
| 441 |
+
|
| 442 |
+
std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
|
| 443 |
+
countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
| 444 |
+
if (!insertedCountsLabelsLHS.second) {
|
| 445 |
+
(insertedCountsLabelsLHS.first)->second += iter2->second;
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
|
| 449 |
+
jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
|
| 450 |
+
if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
|
| 451 |
+
boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
|
| 452 |
+
jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
| 453 |
+
jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
|
| 454 |
+
} else {
|
| 455 |
+
boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
|
| 456 |
+
std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
|
| 457 |
+
jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
|
| 458 |
+
if (!insertedJointCounts.second) {
|
| 459 |
+
(insertedJointCounts.first)->second += iter2->second;
|
| 460 |
+
}
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
}
|
| 464 |
+
}
|
| 465 |
+
|
| 466 |
+
std::string allPropertyValuesString(oss.str());
|
| 467 |
+
return allPropertyValuesString;
|
| 468 |
+
}
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key,
|
| 472 |
+
const std::vector<float> &orientationClassPriorsL2R,
|
| 473 |
+
const std::vector<float> &orientationClassPriorsR2L,
|
| 474 |
+
double smoothingFactor,
|
| 475 |
+
std::ostream &out) const
|
| 476 |
+
{
|
| 477 |
+
assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dleft dright
|
| 478 |
+
|
| 479 |
+
const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
|
| 480 |
+
|
| 481 |
+
if ( allPropertyValues == NULL ) {
|
| 482 |
+
return;
|
| 483 |
+
}
|
| 484 |
+
|
| 485 |
+
// bidirectional MSLR phrase orientation with 2x4 orientation classes:
|
| 486 |
+
// mono swap dright dleft
|
| 487 |
+
std::vector<float> orientationClassCountSumL2R(4,0);
|
| 488 |
+
std::vector<float> orientationClassCountSumR2L(4,0);
|
| 489 |
+
|
| 490 |
+
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
| 491 |
+
iter!=allPropertyValues->end(); ++iter) {
|
| 492 |
+
std::string l2rOrientationClass, r2lOrientationClass;
|
| 493 |
+
try {
|
| 494 |
+
istringstream tokenizer(iter->first);
|
| 495 |
+
tokenizer >> l2rOrientationClass;
|
| 496 |
+
tokenizer >> r2lOrientationClass;
|
| 497 |
+
if ( tokenizer.peek() != EOF ) {
|
| 498 |
+
UTIL_THROW(util::Exception, "ExtractionPhrasePair"
|
| 499 |
+
<< ": Collecting phrase orientations failed. "
|
| 500 |
+
<< "Too many tokens?");
|
| 501 |
+
}
|
| 502 |
+
} catch (const std::exception &e) {
|
| 503 |
+
UTIL_THROW(util::Exception, "ExtractionPhrasePair"
|
| 504 |
+
<< ": Collecting phrase orientations failed. "
|
| 505 |
+
<< "Flawed property value in extract file?");
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
int l2rOrientationClassId = -1;
|
| 509 |
+
if (!l2rOrientationClass.compare("mono")) {
|
| 510 |
+
l2rOrientationClassId = 0;
|
| 511 |
+
}
|
| 512 |
+
if (!l2rOrientationClass.compare("swap")) {
|
| 513 |
+
l2rOrientationClassId = 1;
|
| 514 |
+
}
|
| 515 |
+
if (!l2rOrientationClass.compare("dleft")) {
|
| 516 |
+
l2rOrientationClassId = 2;
|
| 517 |
+
}
|
| 518 |
+
if (!l2rOrientationClass.compare("dright")) {
|
| 519 |
+
l2rOrientationClassId = 3;
|
| 520 |
+
}
|
| 521 |
+
if (l2rOrientationClassId == -1) {
|
| 522 |
+
UTIL_THROW(util::Exception, "ExtractionPhrasePair"
|
| 523 |
+
<< ": Collecting phrase orientations failed. "
|
| 524 |
+
<< "Unknown orientation class \"" << l2rOrientationClass << "\"." );
|
| 525 |
+
}
|
| 526 |
+
int r2lOrientationClassId = -1;
|
| 527 |
+
if (!r2lOrientationClass.compare("mono")) {
|
| 528 |
+
r2lOrientationClassId = 0;
|
| 529 |
+
}
|
| 530 |
+
if (!r2lOrientationClass.compare("swap")) {
|
| 531 |
+
r2lOrientationClassId = 1;
|
| 532 |
+
}
|
| 533 |
+
if (!r2lOrientationClass.compare("dleft")) {
|
| 534 |
+
r2lOrientationClassId = 2;
|
| 535 |
+
}
|
| 536 |
+
if (!r2lOrientationClass.compare("dright")) {
|
| 537 |
+
r2lOrientationClassId = 3;
|
| 538 |
+
}
|
| 539 |
+
if (r2lOrientationClassId == -1) {
|
| 540 |
+
UTIL_THROW(util::Exception, "ExtractionPhrasePair"
|
| 541 |
+
<< ": Collecting phrase orientations failed. "
|
| 542 |
+
<< "Unknown orientation class \"" << r2lOrientationClass << "\"." );
|
| 543 |
+
}
|
| 544 |
+
|
| 545 |
+
orientationClassCountSumL2R[l2rOrientationClassId] += iter->second;
|
| 546 |
+
orientationClassCountSumR2L[r2lOrientationClassId] += iter->second;
|
| 547 |
+
}
|
| 548 |
+
|
| 549 |
+
for (size_t i=0; i<4; ++i) {
|
| 550 |
+
if (i>0) {
|
| 551 |
+
out << " ";
|
| 552 |
+
}
|
| 553 |
+
out << (float)( (smoothingFactor*orientationClassPriorsL2R[i] + orientationClassCountSumL2R[i]) / (smoothingFactor + m_count) );
|
| 554 |
+
}
|
| 555 |
+
for (size_t i=0; i<4; ++i) {
|
| 556 |
+
out << " " << (float)( (smoothingFactor*orientationClassPriorsR2L[i] + orientationClassCountSumR2L[i]) / (smoothingFactor + m_count) );
|
| 557 |
+
}
|
| 558 |
+
}
|
| 559 |
+
|
| 560 |
+
|
| 561 |
+
void ExtractionPhrasePair::UpdateVocabularyFromValueTokens(const std::string& propertyKey,
|
| 562 |
+
std::set<std::string>& vocabulary) const
|
| 563 |
+
{
|
| 564 |
+
const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
|
| 565 |
+
|
| 566 |
+
if ( allPropertyValues == NULL ) {
|
| 567 |
+
return;
|
| 568 |
+
}
|
| 569 |
+
|
| 570 |
+
for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
|
| 571 |
+
iter!=allPropertyValues->end(); ++iter) {
|
| 572 |
+
|
| 573 |
+
std::vector<std::string> tokens = Moses::Tokenize(iter->first);
|
| 574 |
+
for (std::vector<std::string>::const_iterator tokenIt=tokens.begin();
|
| 575 |
+
tokenIt!=tokens.end(); ++tokenIt) {
|
| 576 |
+
vocabulary.insert(*tokenIt);
|
| 577 |
+
}
|
| 578 |
+
}
|
| 579 |
+
}
|
| 580 |
+
|
| 581 |
+
|
| 582 |
+
|
| 583 |
+
}
|
| 584 |
+
|
mosesdecoder/phrase-extract/ExtractionPhrasePair.h
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2009 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
#include "tables-core.h"
|
| 22 |
+
|
| 23 |
+
#include <vector>
|
| 24 |
+
#include <set>
|
| 25 |
+
#include <map>
|
| 26 |
+
#include <boost/unordered_map.hpp>
|
| 27 |
+
|
| 28 |
+
namespace MosesTraining
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
typedef std::vector< std::set<size_t> > ALIGNMENT;
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class ExtractionPhrasePair
|
| 36 |
+
{
|
| 37 |
+
|
| 38 |
+
protected:
|
| 39 |
+
|
| 40 |
+
typedef std::map<std::string,float> PROPERTY_VALUES;
|
| 41 |
+
typedef std::map<std::string,float>::iterator LAST_PROPERTY_VALUE;
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
bool m_isValid;
|
| 45 |
+
|
| 46 |
+
const PHRASE *m_phraseSource;
|
| 47 |
+
const PHRASE *m_phraseTarget;
|
| 48 |
+
|
| 49 |
+
float m_count;
|
| 50 |
+
float m_pcfgSum;
|
| 51 |
+
|
| 52 |
+
std::map<ALIGNMENT*,float> m_targetToSourceAlignments;
|
| 53 |
+
std::map<std::string,
|
| 54 |
+
std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > m_properties;
|
| 55 |
+
|
| 56 |
+
float m_lastCount;
|
| 57 |
+
float m_lastPcfgSum;
|
| 58 |
+
std::map<ALIGNMENT*,float>::iterator m_lastTargetToSourceAlignment;
|
| 59 |
+
|
| 60 |
+
public:
|
| 61 |
+
|
| 62 |
+
ExtractionPhrasePair( const PHRASE *phraseSource,
|
| 63 |
+
const PHRASE *phraseTarget,
|
| 64 |
+
ALIGNMENT *targetToSourceAlignment,
|
| 65 |
+
float count, float pcfgSum );
|
| 66 |
+
|
| 67 |
+
~ExtractionPhrasePair();
|
| 68 |
+
|
| 69 |
+
bool Add( ALIGNMENT *targetToSourceAlignment,
|
| 70 |
+
float count, float pcfgSum );
|
| 71 |
+
|
| 72 |
+
void IncrementPrevious( float count, float pcfgSum );
|
| 73 |
+
|
| 74 |
+
bool Matches( const PHRASE *otherPhraseSource,
|
| 75 |
+
const PHRASE *otherPhraseTarget,
|
| 76 |
+
ALIGNMENT *otherTargetToSourceAlignment ) const;
|
| 77 |
+
|
| 78 |
+
bool Matches( const PHRASE *otherPhraseSource,
|
| 79 |
+
const PHRASE *otherPhraseTarget,
|
| 80 |
+
ALIGNMENT *otherTargetToSourceAlignment,
|
| 81 |
+
bool &sourceMatch,
|
| 82 |
+
bool &targetMatch,
|
| 83 |
+
bool &alignmentMatch ) const;
|
| 84 |
+
|
| 85 |
+
bool MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const;
|
| 86 |
+
|
| 87 |
+
void Clear();
|
| 88 |
+
|
| 89 |
+
bool IsValid() const {
|
| 90 |
+
return m_isValid;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
const PHRASE *GetSource() const {
|
| 95 |
+
return m_phraseSource;
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
const PHRASE *GetTarget() const {
|
| 99 |
+
return m_phraseTarget;
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
float GetCount() const {
|
| 103 |
+
return m_count;
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
float GetPcfgScore() const {
|
| 107 |
+
return m_pcfgSum;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
const size_t GetNumberOfProperties() const {
|
| 111 |
+
return m_properties.size();
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
const std::map<std::string,float> *GetProperty( const std::string &key ) const {
|
| 115 |
+
std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::const_iterator iter;
|
| 116 |
+
iter = m_properties.find(key);
|
| 117 |
+
if (iter == m_properties.end()) {
|
| 118 |
+
return NULL;
|
| 119 |
+
} else {
|
| 120 |
+
return iter->second.first;
|
| 121 |
+
}
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
const ALIGNMENT *FindBestAlignmentTargetToSource() const;
|
| 125 |
+
|
| 126 |
+
const std::string *FindBestPropertyValue(const std::string &key) const;
|
| 127 |
+
|
| 128 |
+
std::string CollectAllPropertyValues(const std::string &key) const;
|
| 129 |
+
|
| 130 |
+
std::string CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
|
| 131 |
+
std::set<std::string>& sourceLabelSet,
|
| 132 |
+
boost::unordered_map<std::string,float>& sourceLHSCounts,
|
| 133 |
+
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& sourceRHSAndLHSJointCounts,
|
| 134 |
+
Vocabulary &vcbT) const;
|
| 135 |
+
|
| 136 |
+
void CollectAllPhraseOrientations(const std::string &key,
|
| 137 |
+
const std::vector<float> &orientationClassPriorsL2R,
|
| 138 |
+
const std::vector<float> &orientationClassPriorsR2L,
|
| 139 |
+
double smoothingFactor,
|
| 140 |
+
std::ostream &out) const;
|
| 141 |
+
|
| 142 |
+
void UpdateVocabularyFromValueTokens(const std::string& propertyKey,
|
| 143 |
+
std::set<std::string>& vocabulary) const;
|
| 144 |
+
|
| 145 |
+
void AddProperties(const std::string &str, float count);
|
| 146 |
+
|
| 147 |
+
void AddProperty(const std::string &key, const std::string &value, float count) {
|
| 148 |
+
std::map<std::string,
|
| 149 |
+
std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
|
| 150 |
+
if ( iter == m_properties.end() ) {
|
| 151 |
+
// key not found: insert property key and value
|
| 152 |
+
PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
|
| 153 |
+
std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
|
| 154 |
+
LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
|
| 155 |
+
m_properties[key] = std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* >(propertyValues, lastPropertyValue);
|
| 156 |
+
} else {
|
| 157 |
+
LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
|
| 158 |
+
if ( (*lastPropertyValue)->first == value ) { // same property key-value pair has been seen right before
|
| 159 |
+
// property key-value pair exists already: add count
|
| 160 |
+
(*lastPropertyValue)->second += count;
|
| 161 |
+
} else { // need to check whether the property key-value pair has appeared before (insert if not)
|
| 162 |
+
// property key exists, but not in combination with this value:
|
| 163 |
+
// add new value with count
|
| 164 |
+
PROPERTY_VALUES *propertyValues = (iter->second).first;
|
| 165 |
+
std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
|
| 166 |
+
if ( !insertedProperty.second ) { // property value for this key appeared before: add count
|
| 167 |
+
insertedProperty.first->second += count;
|
| 168 |
+
}
|
| 169 |
+
LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
|
| 170 |
+
delete (iter->second).second;
|
| 171 |
+
(iter->second).second = lastPropertyValue;
|
| 172 |
+
}
|
| 173 |
+
}
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
};
|
| 177 |
+
|
| 178 |
+
}
|
| 179 |
+
|
mosesdecoder/phrase-extract/Hole.h
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
#ifndef HOLE_H_INCLUDED_
|
| 22 |
+
#define HOLE_H_INCLUDED_
|
| 23 |
+
|
| 24 |
+
#include <cassert>
|
| 25 |
+
#include <list>
|
| 26 |
+
#include <string>
|
| 27 |
+
#include <vector>
|
| 28 |
+
|
| 29 |
+
namespace MosesTraining
|
| 30 |
+
{
|
| 31 |
+
|
| 32 |
+
/**
 * A gap (non-terminal slot) inside a phrase pair, described by an inclusive
 * [start, end] span, a position and a label for each of two directions.
 * Direction index 0 holds the source-side values and index 1 the target-side
 * values (see the four-argument constructor).
 */
class Hole
{
protected:
  std::vector<int> m_start, m_end, m_pos;
  std::vector<std::string> m_label;

public:
  /** Create a hole whose spans and positions are all 0 and whose labels are empty. */
  Hole()
    : m_start(2)
    , m_end(2)
    , m_pos(2)
    , m_label(2) {
  }

  /** Member-wise copy. */
  Hole(const Hole &copy)
    : m_start(copy.m_start)
    , m_end(copy.m_end)
    , m_pos(copy.m_pos)
    , m_label(copy.m_label) {
  }

  /**
   * Create a hole from its source span [startS, endS] and target span
   * [startT, endT]; positions stay 0 and labels stay empty.
   */
  Hole(int startS, int endS, int startT, int endT)
    : m_start(2)
    , m_end(2)
    , m_pos(2)
    , m_label(2) {
    m_start[0] = startS;
    m_end[0] = endS;
    m_start[1] = startT;
    m_end[1] = endT;
  }

  int GetStart(size_t direction) const {
    return m_start[direction];
  }

  int GetEnd(size_t direction) const {
    return m_end[direction];
  }

  /** Number of positions covered by the span (both ends inclusive). */
  int GetSize(size_t direction) const {
    return m_end[direction] - m_start[direction] + 1;
  }

  void SetPos(int pos, size_t direction) {
    m_pos[direction] = pos;
  }

  int GetPos(size_t direction) const {
    return m_pos[direction];
  }

  void SetLabel(const std::string &label, size_t direction) {
    m_label[direction] = label;
  }

  const std::string &GetLabel(size_t direction) const {
    return m_label[direction];
  }

  /** True if the two spans share at least one position in the given direction. */
  bool Overlap(const Hole &otherHole, size_t direction) const {
    return ! ( otherHole.GetEnd(direction) < GetStart(direction) ||
               otherHole.GetStart(direction) > GetEnd(direction) );
  }

  /** True if the other span ends directly before, or starts directly after, this one. */
  bool Neighbor(const Hole &otherHole, size_t direction) const {
    return ( otherHole.GetEnd(direction)+1 == GetStart(direction) ||
             otherHole.GetStart(direction) == GetEnd(direction)+1 );
  }
};
|
| 102 |
+
|
| 103 |
+
typedef std::list<Hole> HoleList;
|
| 104 |
+
|
| 105 |
+
class HoleSourceOrderer
|
| 106 |
+
{
|
| 107 |
+
public:
|
| 108 |
+
bool operator()(const Hole* holeA, const Hole* holeB) const {
|
| 109 |
+
assert(holeA->GetStart(0) != holeB->GetStart(0));
|
| 110 |
+
return holeA->GetStart(0) < holeB->GetStart(0);
|
| 111 |
+
}
|
| 112 |
+
};
|
| 113 |
+
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
#endif
|
mosesdecoder/phrase-extract/HoleCollection.cpp
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "HoleCollection.h"
|
| 21 |
+
|
| 22 |
+
#include <algorithm>
|
| 23 |
+
|
| 24 |
+
namespace MosesTraining
|
| 25 |
+
{
|
| 26 |
+
|
| 27 |
+
void HoleCollection::SortSourceHoles()
|
| 28 |
+
{
|
| 29 |
+
assert(m_sortedSourceHoles.size() == 0);
|
| 30 |
+
|
| 31 |
+
// add
|
| 32 |
+
HoleList::iterator iter;
|
| 33 |
+
for (iter = m_holes.begin(); iter != m_holes.end(); ++iter) {
|
| 34 |
+
Hole &currHole = *iter;
|
| 35 |
+
m_sortedSourceHoles.push_back(&currHole);
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
// sort
|
| 39 |
+
std::sort(m_sortedSourceHoles.begin(), m_sortedSourceHoles.end(), HoleSourceOrderer());
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
void HoleCollection::Add(int startT, int endT, int startS, int endS)
|
| 43 |
+
{
|
| 44 |
+
Hole hole(startS, endS, startT, endT);
|
| 45 |
+
m_scope.push_back(Scope(hole));
|
| 46 |
+
m_sourceHoleStartPoints.push_back(startS);
|
| 47 |
+
m_sourceHoleEndPoints.push_back(endS);
|
| 48 |
+
m_holes.push_back(hole);
|
| 49 |
+
m_sortedSourceHoles.clear();
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
void HoleCollection::RemoveLast()
|
| 53 |
+
{
|
| 54 |
+
m_scope.pop_back();
|
| 55 |
+
m_sourceHoleStartPoints.pop_back();
|
| 56 |
+
m_sourceHoleEndPoints.pop_back();
|
| 57 |
+
m_holes.pop_back();
|
| 58 |
+
m_sortedSourceHoles.clear();
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
int HoleCollection::Scope(const Hole &proposedHole) const
|
| 62 |
+
{
|
| 63 |
+
const int holeStart = proposedHole.GetStart(0);
|
| 64 |
+
const int holeEnd = proposedHole.GetEnd(0);
|
| 65 |
+
int scope = m_scope.back();
|
| 66 |
+
if (holeStart == m_sourcePhraseStart.back() ||
|
| 67 |
+
find(m_sourceHoleEndPoints.begin(), m_sourceHoleEndPoints.end(), holeStart-1) != m_sourceHoleEndPoints.end()) {
|
| 68 |
+
++scope; // Adding hole would introduce choice point at start of hole.
|
| 69 |
+
}
|
| 70 |
+
if (holeEnd == m_sourcePhraseEnd.back() ||
|
| 71 |
+
find(m_sourceHoleStartPoints.begin(), m_sourceHoleStartPoints.end(), holeEnd-1) != m_sourceHoleStartPoints.end()) {
|
| 72 |
+
++scope; // Adding hole would introduce choice point at end of hole.
|
| 73 |
+
}
|
| 74 |
+
return scope;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
}
|
mosesdecoder/phrase-extract/HoleCollection.h
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
#ifndef HOLECOLLECTION_H_INCLUDED_
|
| 22 |
+
#define HOLECOLLECTION_H_INCLUDED_
|
| 23 |
+
|
| 24 |
+
#include <set>
|
| 25 |
+
#include <vector>
|
| 26 |
+
|
| 27 |
+
#include "Hole.h"
|
| 28 |
+
|
| 29 |
+
namespace MosesTraining
|
| 30 |
+
{
|
| 31 |
+
|
| 32 |
+
class HoleCollection
|
| 33 |
+
{
|
| 34 |
+
protected:
|
| 35 |
+
HoleList m_holes;
|
| 36 |
+
std::vector<Hole*> m_sortedSourceHoles;
|
| 37 |
+
std::vector<int> m_sourceHoleStartPoints;
|
| 38 |
+
std::vector<int> m_sourceHoleEndPoints;
|
| 39 |
+
std::vector<int> m_scope;
|
| 40 |
+
std::vector<int> m_sourcePhraseStart;
|
| 41 |
+
std::vector<int> m_sourcePhraseEnd;
|
| 42 |
+
|
| 43 |
+
public:
|
| 44 |
+
HoleCollection(int sourcePhraseStart, int sourcePhraseEnd)
|
| 45 |
+
: m_scope(1, 0)
|
| 46 |
+
, m_sourcePhraseStart(1, sourcePhraseStart)
|
| 47 |
+
, m_sourcePhraseEnd(1, sourcePhraseEnd) {
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
const HoleList &GetHoles() const {
|
| 51 |
+
return m_holes;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
HoleList &GetHoles() {
|
| 55 |
+
return m_holes;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
std::vector<Hole*> &GetSortedSourceHoles() {
|
| 59 |
+
return m_sortedSourceHoles;
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
void Add(int startT, int endT, int startS, int endS);
|
| 63 |
+
|
| 64 |
+
void RemoveLast();
|
| 65 |
+
|
| 66 |
+
bool OverlapSource(const Hole &sourceHole) const {
|
| 67 |
+
HoleList::const_iterator iter;
|
| 68 |
+
for (iter = m_holes.begin(); iter != m_holes.end(); ++iter) {
|
| 69 |
+
const Hole &currHole = *iter;
|
| 70 |
+
if (currHole.Overlap(sourceHole, 0))
|
| 71 |
+
return true;
|
| 72 |
+
}
|
| 73 |
+
return false;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
bool ConsecSource(const Hole &sourceHole) const {
|
| 77 |
+
HoleList::const_iterator iter;
|
| 78 |
+
for (iter = m_holes.begin(); iter != m_holes.end(); ++iter) {
|
| 79 |
+
const Hole &currHole = *iter;
|
| 80 |
+
if (currHole.Neighbor(sourceHole, 0))
|
| 81 |
+
return true;
|
| 82 |
+
}
|
| 83 |
+
return false;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
// Determine the scope that would result from adding the given hole.
|
| 87 |
+
int Scope(const Hole &proposedHole) const;
|
| 88 |
+
|
| 89 |
+
void SortSourceHoles();
|
| 90 |
+
|
| 91 |
+
};
|
| 92 |
+
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
#endif
|
mosesdecoder/phrase-extract/InputFileStream.cpp
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include "InputFileStream.h"
|
| 23 |
+
#include "gzfilebuf.h"
|
| 24 |
+
#include <iostream>
|
| 25 |
+
|
| 26 |
+
using namespace std;
|
| 27 |
+
|
| 28 |
+
namespace Moses
|
| 29 |
+
{
|
| 30 |
+
// Open filePath for reading. Paths longer than three characters that end in
// ".gz" are read through a gzfilebuf; everything else goes through a plain
// std::filebuf. If the plain file cannot be opened, a message is written to
// stderr and the process exits with status 1.
InputFileStream::InputFileStream(const std::string &filePath)
  : std::istream(NULL)
  , m_streambuf(NULL)
{
  const std::string::size_type pathLength = filePath.size();
  const bool hasGzSuffix =
    pathLength > 3 && filePath.compare(pathLength - 3, 3, ".gz") == 0;

  if (hasGzSuffix) {
    m_streambuf = new gzfilebuf(filePath.c_str());
  } else {
    std::filebuf *fileBuffer = new std::filebuf();
    // filebuf::open() hands back the buffer itself on success, NULL on failure
    if (fileBuffer->open(filePath.c_str(), std::ios::in) == NULL) {
      cerr << "Can't read " << filePath.c_str() << endl;
      exit(1);
    }
    m_streambuf = fileBuffer;
  }

  this->init(m_streambuf);
}
|
| 48 |
+
|
| 49 |
+
// Release the stream buffer allocated by the constructor.
InputFileStream::~InputFileStream()
{
  delete m_streambuf;
  m_streambuf = NULL;
}
|
| 54 |
+
|
| 55 |
+
// Intentionally does nothing; the stream buffer is released by the destructor.
void InputFileStream::Close()
{
}
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
}
|
| 61 |
+
|
mosesdecoder/phrase-extract/InputFileStream.h
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_InputFileStream_h
|
| 23 |
+
#define moses_InputFileStream_h
|
| 24 |
+
|
| 25 |
+
#include <cstdlib>
|
| 26 |
+
#include <fstream>
|
| 27 |
+
#include <string>
|
| 28 |
+
|
| 29 |
+
namespace Moses
|
| 30 |
+
{
|
| 31 |
+
|
| 32 |
+
/** Input stream that can be used wherever a std::istream is expected and
 *  that reads gzip-compressed files when the path ends in ".gz".
 *  The stream owns the buffer it reads from (m_streambuf).
 */
class InputFileStream : public std::istream
{
protected:
  std::streambuf *m_streambuf;

public:
  /** Open the file at filePath for reading. */
  explicit InputFileStream(const std::string &filePath);

  /** Free the owned stream buffer. */
  ~InputFileStream();

  void Close();
};
|
| 45 |
+
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
#endif
|
mosesdecoder/phrase-extract/InternalStructFeature.cpp
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "InternalStructFeature.h"
|
| 2 |
+
#include <map>
|
| 3 |
+
|
| 4 |
+
using namespace std;
|
| 5 |
+
|
| 6 |
+
namespace MosesTraining
|
| 7 |
+
{
|
| 8 |
+
|
| 9 |
+
void InternalStructFeature::add(const ScoreFeatureContext& context,
|
| 10 |
+
std::vector<float>& denseValues,
|
| 11 |
+
std::map<std::string,float>& sparseValues) const
|
| 12 |
+
{
|
| 13 |
+
const std::map<std::string,float> *allTrees = context.phrasePair.GetProperty("Tree"); // our would we rather want to take the most frequent one only?
|
| 14 |
+
for ( std::map<std::string,float>::const_iterator iter=allTrees->begin();
|
| 15 |
+
iter!=allTrees->end(); ++iter ) {
|
| 16 |
+
add(&(iter->first), iter->second, denseValues, sparseValues);
|
| 17 |
+
}
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
void InternalStructFeatureDense::add(const std::string *treeFragment,
|
| 21 |
+
float count,
|
| 22 |
+
std::vector<float>& denseValues,
|
| 23 |
+
std::map<std::string,float>& sparseValues) const
|
| 24 |
+
{
|
| 25 |
+
//cout<<"Dense: "<<*internalStruct<<endl;
|
| 26 |
+
size_t start=0;
|
| 27 |
+
int countNP=0;
|
| 28 |
+
while((start = treeFragment->find("NP", start)) != string::npos) {
|
| 29 |
+
countNP += count;
|
| 30 |
+
start+=2; //length of "NP"
|
| 31 |
+
}
|
| 32 |
+
//should add e^countNP so in the decoder I get log(e^countNP)=countNP -> but is log or ln?
|
| 33 |
+
//should use this but don't know what it does? -> maybeLog( (bitmap == i) ? 2.718 : 1 )
|
| 34 |
+
denseValues.push_back(exp(countNP));
|
| 35 |
+
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
void InternalStructFeatureSparse::add(const std::string *treeFragment,
|
| 39 |
+
float count,
|
| 40 |
+
std::vector<float>& denseValues,
|
| 41 |
+
std::map<std::string,float>& sparseValues) const
|
| 42 |
+
{
|
| 43 |
+
//cout<<"Sparse: "<<*internalStruct<<endl;
|
| 44 |
+
if(treeFragment->find("VBZ")!=std::string::npos)
|
| 45 |
+
sparseValues["NTVBZ"] += count;
|
| 46 |
+
if(treeFragment->find("VBD")!=std::string::npos)
|
| 47 |
+
sparseValues["NTVBD"] += count;
|
| 48 |
+
if(treeFragment->find("VBP")!=std::string::npos)
|
| 49 |
+
sparseValues["NTVBP"] += count;
|
| 50 |
+
if(treeFragment->find("PP")!=std::string::npos)
|
| 51 |
+
sparseValues["NTPP"] += count;
|
| 52 |
+
if(treeFragment->find("SBAR")!=std::string::npos)
|
| 53 |
+
sparseValues["NTSBAR"] += count;
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
}
|
mosesdecoder/phrase-extract/InternalStructFeature.h
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <iostream>
|
| 2 |
+
#include <fstream>
|
| 3 |
+
#include <cassert>
|
| 4 |
+
#include <cstdlib>
|
| 5 |
+
#include <string>
|
| 6 |
+
#include <queue>
|
| 7 |
+
#include <map>
|
| 8 |
+
#include <cmath>
|
| 9 |
+
|
| 10 |
+
#include "ScoreFeature.h"
|
| 11 |
+
#include "extract-ghkm/Node.h"
|
| 12 |
+
|
| 13 |
+
namespace MosesTraining
|
| 14 |
+
{
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class InternalStructFeature : public ScoreFeature
|
| 18 |
+
{
|
| 19 |
+
public:
|
| 20 |
+
InternalStructFeature() : m_type(0) {};
|
| 21 |
+
/** Add the values for this feature function. */
|
| 22 |
+
void add(const ScoreFeatureContext& context,
|
| 23 |
+
std::vector<float>& denseValues,
|
| 24 |
+
std::map<std::string,float>& sparseValues) const;
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
protected:
|
| 28 |
+
/** Overridden in subclass */
|
| 29 |
+
virtual void add(const std::string *treeFragment,
|
| 30 |
+
float count,
|
| 31 |
+
std::vector<float>& denseValues,
|
| 32 |
+
std::map<std::string,float>& sparseValues) const = 0;
|
| 33 |
+
int m_type;
|
| 34 |
+
};
|
| 35 |
+
|
| 36 |
+
class InternalStructFeatureDense : public InternalStructFeature
|
| 37 |
+
{
|
| 38 |
+
public:
|
| 39 |
+
InternalStructFeatureDense()
|
| 40 |
+
:InternalStructFeature() {
|
| 41 |
+
m_type=1;
|
| 42 |
+
} //std::cout<<"InternalStructFeatureDense: Construct "<<m_type<<"\n";}
|
| 43 |
+
protected:
|
| 44 |
+
virtual void add(const std::string *treeFragment,
|
| 45 |
+
float count,
|
| 46 |
+
std::vector<float>& denseValues,
|
| 47 |
+
std::map<std::string,float>& sparseValues) const;
|
| 48 |
+
};
|
| 49 |
+
|
| 50 |
+
class InternalStructFeatureSparse : public InternalStructFeature
|
| 51 |
+
{
|
| 52 |
+
public:
|
| 53 |
+
InternalStructFeatureSparse()
|
| 54 |
+
:InternalStructFeature() {
|
| 55 |
+
m_type=2;
|
| 56 |
+
}// std::cout<<"InternalStructFeatureSparse: Construct "<<m_type<<"\n";}
|
| 57 |
+
protected:
|
| 58 |
+
virtual void add(const std::string *treeFragment,
|
| 59 |
+
float count,
|
| 60 |
+
std::vector<float>& denseValues,
|
| 61 |
+
std::map<std::string,float>& sparseValues) const;
|
| 62 |
+
};
|
| 63 |
+
|
| 64 |
+
}
|
mosesdecoder/phrase-extract/Jamfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# All .cpp files in this directory apart from ExtractionPhrasePair.cpp,
# the unit tests (*Test.cpp) and the program entry points (*-main.cpp).
local most-deps = [ glob *.cpp : ExtractionPhrasePair.cpp *Test.cpp *-main.cpp ] ;

# Compile each of them once into an object file so every target can reuse it.
for local d in $(most-deps) {
  obj $(d:B).o : $(d) ;
}

# Bundle the objects and the libraries they link against under one alias.
alias deps : $(most-deps:B).o ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ../moses//ThreadPool ../moses//Util ../util//kenutil ;

# ExtractionPhrasePair.cpp requires that main define some global variables.
# Build the mains that do not need these global variables.
for local m in [ glob *-main.cpp : score-main.cpp ] {
  exe [ MATCH "(.*)-main.cpp" : $(m) ] : $(m) deps ;
}

# The side dishes that use ExtractionPhrasePair.cpp
exe score : ExtractionPhrasePair.cpp score-main.cpp deps ;

import testing ;
run ScoreFeatureTest.cpp ExtractionPhrasePair.cpp deps ..//boost_unit_test_framework ..//boost_iostreams : : test.domain ;
|
mosesdecoder/phrase-extract/OutputFileStream.cpp
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include <iostream>
|
| 23 |
+
#include <boost/algorithm/string/predicate.hpp>
|
| 24 |
+
#include <boost/iostreams/filter/gzip.hpp>
|
| 25 |
+
#include "OutputFileStream.h"
|
| 26 |
+
#include "gzfilebuf.h"
|
| 27 |
+
|
| 28 |
+
using namespace std;
|
| 29 |
+
using namespace boost::algorithm;
|
| 30 |
+
|
| 31 |
+
namespace Moses
|
| 32 |
+
{
|
| 33 |
+
/// Build a stream with nothing attached yet: no file is owned and the
/// stream is marked as not open. Open() must be called before writing.
OutputFileStream::OutputFileStream()
  : boost::iostreams::filtering_ostream(),
    m_outFile(NULL),
    m_open(false)
{}
|
| 39 |
+
|
| 40 |
+
/// Build a stream and immediately try to attach it to filePath via Open().
/// The boolean result of Open() is not inspected here.
OutputFileStream::OutputFileStream(const std::string &filePath)
  : m_outFile(NULL),
    m_open(false)
{
  this->Open(filePath);
}
|
| 46 |
+
|
| 47 |
+
/// Destroying the stream closes it; Close() is a no-op if it is not open.
OutputFileStream::~OutputFileStream()
{
  this->Close();
}
|
| 51 |
+
|
| 52 |
+
bool OutputFileStream::Open(const std::string &filePath)
|
| 53 |
+
{
|
| 54 |
+
assert(!m_open);
|
| 55 |
+
if (filePath == std::string("-")) {
|
| 56 |
+
// Write to standard output. Leave m_outFile null.
|
| 57 |
+
this->push(std::cout);
|
| 58 |
+
} else {
|
| 59 |
+
m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
|
| 60 |
+
if (m_outFile->fail()) {
|
| 61 |
+
return false;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
if (ends_with(filePath, ".gz")) {
|
| 65 |
+
this->push(boost::iostreams::gzip_compressor());
|
| 66 |
+
}
|
| 67 |
+
this->push(*m_outFile);
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
m_open = true;
|
| 71 |
+
return true;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
void OutputFileStream::Close()
|
| 75 |
+
{
|
| 76 |
+
if (!m_open) return;
|
| 77 |
+
this->flush();
|
| 78 |
+
if (m_outFile) {
|
| 79 |
+
this->pop(); // file
|
| 80 |
+
|
| 81 |
+
m_outFile->close();
|
| 82 |
+
delete m_outFile;
|
| 83 |
+
m_outFile = NULL;
|
| 84 |
+
}
|
| 85 |
+
m_open = false;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
}
|
| 90 |
+
|
mosesdecoder/phrase-extract/OutputFileStream.h
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#pragma once
|
| 23 |
+
|
| 24 |
+
#include <cstdlib>
|
| 25 |
+
#include <fstream>
|
| 26 |
+
#include <string>
|
| 27 |
+
#include <iostream>
|
| 28 |
+
#include <boost/iostreams/filtering_stream.hpp>
|
| 29 |
+
|
| 30 |
+
namespace Moses
|
| 31 |
+
{
|
| 32 |
+
|
| 33 |
+
/** Version of std::ostream with transparent compression.
|
| 34 |
+
*
|
| 35 |
+
* Transparently compresses output when writing to a file whose name ends in
|
| 36 |
+
* ".gz". Or, writes to stdout instead of a file when given a filename
|
| 37 |
+
* consisting of just a dash ("-").
|
| 38 |
+
*/
|
| 39 |
+
class OutputFileStream : public boost::iostreams::filtering_ostream
|
| 40 |
+
{
|
| 41 |
+
private:
|
| 42 |
+
/** File that needs flushing & closing when we close this stream.
|
| 43 |
+
*
|
| 44 |
+
* Is NULL when no file is opened, e.g. when writing to standard output.
|
| 45 |
+
*/
|
| 46 |
+
std::ofstream *m_outFile;
|
| 47 |
+
|
| 48 |
+
/// Is this stream open?
|
| 49 |
+
bool m_open;
|
| 50 |
+
|
| 51 |
+
public:
|
| 52 |
+
/** Create an unopened OutputFileStream.
|
| 53 |
+
*
|
| 54 |
+
* Until it's been opened, nothing can be done with this stream.
|
| 55 |
+
*/
|
| 56 |
+
OutputFileStream();
|
| 57 |
+
|
| 58 |
+
/// Create an OutputFileStream, and open it by calling Open().
|
| 59 |
+
OutputFileStream(const std::string &filePath);
|
| 60 |
+
virtual ~OutputFileStream();
|
| 61 |
+
|
| 62 |
+
// TODO: Can we please just always throw an exception when this fails?
|
| 63 |
+
/** Open stream.
|
| 64 |
+
*
|
| 65 |
+
* If filePath is "-" (just a dash), this opens the stream for writing to
|
| 66 |
+
* standard output. Otherwise, it opens the given file. If the filename
|
| 67 |
+
* has the ".gz" suffix, output will be transparently compressed.
|
| 68 |
+
*
|
| 69 |
+
* Call Close() to close the file.
|
| 70 |
+
*
|
| 71 |
+
* Returns whether opening the file was successful. It may also throw an
|
| 72 |
+
* exception on failure.
|
| 73 |
+
*/
|
| 74 |
+
bool Open(const std::string &filePath);
|
| 75 |
+
|
| 76 |
+
/// Flush and close stream. After this, the stream can be opened again.
|
| 77 |
+
void Close();
|
| 78 |
+
};
|
| 79 |
+
|
| 80 |
+
}
|
| 81 |
+
|
mosesdecoder/phrase-extract/PhraseExtractionOptions.h
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
/***********************************************************************
|
| 3 |
+
Moses - factored phrase-based language decoder
|
| 4 |
+
Copyright (C) 2010 University of Edinburgh
|
| 5 |
+
|
| 6 |
+
This library is free software; you can redistribute it and/or
|
| 7 |
+
modify it under the terms of the GNU Lesser General Public
|
| 8 |
+
License as published by the Free Software Foundation; either
|
| 9 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 10 |
+
|
| 11 |
+
This library is distributed in the hope that it will be useful,
|
| 12 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 13 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 14 |
+
Lesser General Public License for more details.
|
| 15 |
+
|
| 16 |
+
You should have received a copy of the GNU Lesser General Public
|
| 17 |
+
License along with this library; if not, write to the Free Software
|
| 18 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 19 |
+
***********************************************************************/
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
#include <string>
|
| 23 |
+
#include <vector>
|
| 24 |
+
|
| 25 |
+
namespace MosesTraining
{
enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO};
enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN};


/** Plain container for the settings of a phrase-extraction run.
 *
 * A few fields are public data members; the remaining ones are private and
 * reached through the init*() setters and is*()/get*() getters below.
 */
class PhraseExtractionOptions
{

public:
  int maxPhraseLength;
  int minPhraseLength;
  std::string separator;

private:
  bool allModelsOutputFlag;
  bool wordModel;
  REO_MODEL_TYPE wordType;
  bool phraseModel;
  REO_MODEL_TYPE phraseType;
  bool hierModel;
  REO_MODEL_TYPE hierType;
  bool orientationFlag;
  bool translationFlag;
  bool includeSentenceIdFlag; //include sentence id in extract file
  bool onlyOutputSpanInfo;
  bool gzOutput;
  std::string instanceWeightsFile; //weights for each sentence
  bool targetConstituentConstrainedFlag;
  bool targetConstituentBoundariesFlag;
  bool flexScoreFlag;
  bool singleWordHeuristicFlag;

public:
  std::vector<std::string> placeholders;
  bool debug;

  /** Set every option to its default.
   *
   * @param initmaxPhraseLength  value stored in maxPhraseLength.
   *
   * Defaults: minPhraseLength 3, separator "|||", translationFlag true,
   * every other flag false, every model type REO_MSD, and an empty
   * instance-weights file name and placeholder list.
   */
  PhraseExtractionOptions(const int initmaxPhraseLength):
    maxPhraseLength(initmaxPhraseLength),
    minPhraseLength(3),
    separator("|||"),
    allModelsOutputFlag(false),
    wordModel(false),
    wordType(REO_MSD),
    phraseModel(false),
    phraseType(REO_MSD),
    hierModel(false),
    hierType(REO_MSD),
    orientationFlag(false),
    translationFlag(true),
    includeSentenceIdFlag(false),
    onlyOutputSpanInfo(false),
    gzOutput(false),
    instanceWeightsFile(),
    targetConstituentConstrainedFlag(false),
    targetConstituentBoundariesFlag(false),
    flexScoreFlag(false),
    singleWordHeuristicFlag(false),
    placeholders(),
    debug(false) {
  }

  //functions for initialization of options
  void initAllModelsOutputFlag(const bool initallModelsOutputFlag) {
    allModelsOutputFlag=initallModelsOutputFlag;
  }
  void initWordModel(const bool initwordModel) {
    wordModel=initwordModel;
  }
  void initWordType(REO_MODEL_TYPE initwordType ) {
    wordType=initwordType;
  }
  void initPhraseModel(const bool initphraseModel ) {
    phraseModel=initphraseModel;
  }
  void initPhraseType(REO_MODEL_TYPE initphraseType) {
    phraseType=initphraseType;
  }
  void initHierModel(const bool inithierModel) {
    hierModel=inithierModel;
  }
  void initHierType(REO_MODEL_TYPE inithierType) {
    hierType=inithierType;
  }
  void initOrientationFlag(const bool initorientationFlag) {
    orientationFlag=initorientationFlag;
  }
  void initTranslationFlag(const bool inittranslationFlag) {
    translationFlag=inittranslationFlag;
  }
  void initIncludeSentenceIdFlag(const bool initincludeSentenceIdFlag) {
    includeSentenceIdFlag=initincludeSentenceIdFlag;
  }
  void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo) {
    onlyOutputSpanInfo= initonlyOutputSpanInfo;
  }
  void initGzOutput (const bool initgzOutput) {
    gzOutput= initgzOutput;
  }
  /// Store the instance-weights file name; a null pointer stores an empty name.
  void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
    // Building a std::string from a null pointer is undefined behaviour,
    // so map null to the empty string explicitly.
    instanceWeightsFile = std::string(initInstanceWeightsFile ? initInstanceWeightsFile : "");
  }
  void initTargetConstituentConstrainedFlag(const bool initTargetConstituentConstrainedFlag) {
    targetConstituentConstrainedFlag = initTargetConstituentConstrainedFlag;
  }
  void initTargetConstituentBoundariesFlag(const bool initTargetConstituentBoundariesFlag) {
    targetConstituentBoundariesFlag = initTargetConstituentBoundariesFlag;
  }
  void initFlexScoreFlag(const bool initflexScoreFlag) {
    flexScoreFlag=initflexScoreFlag;
  }
  void initSingleWordHeuristicFlag(const bool initSingleWordHeuristicFlag) {
    singleWordHeuristicFlag = initSingleWordHeuristicFlag;
  }

  // functions for getting values
  bool isAllModelsOutputFlag() const {
    return allModelsOutputFlag;
  }
  bool isWordModel() const {
    return wordModel;
  }
  REO_MODEL_TYPE isWordType() const {
    return wordType;
  }
  bool isPhraseModel() const {
    return phraseModel;
  }
  REO_MODEL_TYPE isPhraseType() const {
    return phraseType;
  }
  bool isHierModel() const {
    return hierModel;
  }
  REO_MODEL_TYPE isHierType() const {
    return hierType;
  }
  bool isOrientationFlag() const {
    return orientationFlag;
  }
  bool isTranslationFlag() const {
    return translationFlag;
  }
  bool isIncludeSentenceIdFlag() const {
    return includeSentenceIdFlag;
  }
  bool isOnlyOutputSpanInfo() const {
    return onlyOutputSpanInfo;
  }
  bool isGzOutput () const {
    return gzOutput;
  }
  std::string getInstanceWeightsFile() const {
    return instanceWeightsFile;
  }
  bool isTargetConstituentConstrainedFlag() const {
    return targetConstituentConstrainedFlag;
  }
  bool isTargetConstituentBoundariesFlag() const {
    return targetConstituentBoundariesFlag;
  }
  bool isFlexScoreFlag() const {
    return flexScoreFlag;
  }
  bool isSingleWordHeuristicFlag() const {
    return singleWordHeuristicFlag;
  }
};

}
|
| 193 |
+
|
mosesdecoder/phrase-extract/PhraseOrientation.cpp
ADDED
|
@@ -0,0 +1,481 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "PhraseOrientation.h"
|
| 21 |
+
|
| 22 |
+
#include <iostream>
|
| 23 |
+
#include <sstream>
|
| 24 |
+
#include <limits>
|
| 25 |
+
#include <cassert>
|
| 26 |
+
|
| 27 |
+
#include <boost/assign/list_of.hpp>
|
| 28 |
+
|
| 29 |
+
namespace MosesTraining
|
| 30 |
+
{
|
| 31 |
+
|
| 32 |
+
// Orientation prior counts for each direction: five entries, all zero.
std::vector<float> PhraseOrientation::m_l2rOrientationPriorCounts(5, 0.0f);
std::vector<float> PhraseOrientation::m_r2lOrientationPriorCounts(5, 0.0f);
|
| 34 |
+
|
| 35 |
+
/** Build orientation data from a word alignment.
 *
 * @param sourceSize  number of source positions (stored in m_countF).
 * @param targetSize  number of target positions (stored in m_countE).
 * @param alignment   alignment points; `first` is used as a source index
 *                    and `second` as a target index.
 *
 * The alignment is turned into per-position lookup tables, which are then
 * handed to Init().
 */
PhraseOrientation::PhraseOrientation(int sourceSize,
                                     int targetSize,
                                     const Alignment &alignment)
  : m_countF(sourceSize)
  , m_countE(targetSize)
{
  // prepare data structures for alignments:
  // one empty list per source position ...
  std::vector<std::vector<int> > alignedToS(m_countF > 0 ? m_countF : 0);
  // ... and one empty list appended per target position.
  if (m_countE > 0) {
    m_alignedToT.insert(m_alignedToT.end(), m_countE, std::vector<int>());
  }
  std::vector<int> alignedCountS(m_countF,0);

  // Record every alignment point in both directions and count the links
  // of each source position.
  Alignment::const_iterator point = alignment.begin();
  const Alignment::const_iterator pointsEnd = alignment.end();
  while (point != pointsEnd) {
    alignedToS[point->first].push_back(point->second);
    ++alignedCountS[point->first];
    m_alignedToT[point->second].push_back(point->first);
    ++point;
  }

  Init(sourceSize, targetSize, m_alignedToT, alignedToS, alignedCountS);
}
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
/** Build orientation data from two alignment sets.
 *
 * @param sourceSize    number of source positions (stored in m_countF).
 * @param targetSize    number of target positions (stored in m_countE).
 * @param alignTerm     first set of alignment points.
 * @param alignNonTerm  second set of alignment points, merged into the same
 *                      tables after alignTerm.
 *
 * In both sets `first` is used as a source index and `second` as a target
 * index. The merged tables are handed to Init().
 */
PhraseOrientation::PhraseOrientation(int sourceSize,
                                     int targetSize,
                                     const Moses::AlignmentInfo &alignTerm,
                                     const Moses::AlignmentInfo &alignNonTerm)
  : m_countF(sourceSize)
  , m_countE(targetSize)
{
  // prepare data structures for alignments:
  // one empty list per source position ...
  std::vector<std::vector<int> > alignedToS(m_countF > 0 ? m_countF : 0);
  // ... and one empty list appended per target position.
  if (m_countE > 0) {
    m_alignedToT.insert(m_alignedToT.end(), m_countE, std::vector<int>());
  }
  std::vector<int> alignedCountS(m_countF,0);

  // First pass: points from alignTerm.
  Moses::AlignmentInfo::const_iterator point = alignTerm.begin();
  while (point != alignTerm.end()) {
    alignedToS[point->first].push_back(point->second);
    ++alignedCountS[point->first];
    m_alignedToT[point->second].push_back(point->first);
    ++point;
  }

  // Second pass: points from alignNonTerm, added to the same tables.
  point = alignNonTerm.begin();
  while (point != alignNonTerm.end()) {
    alignedToS[point->first].push_back(point->second);
    ++alignedCountS[point->first];
    m_alignedToT[point->second].push_back(point->first);
    ++point;
  }

  Init(sourceSize, targetSize, m_alignedToT, alignedToS, alignedCountS);
}
|
| 98 |
+
|
| 99 |
+
/** Build orientation data from ready-made alignment tables.
 *
 * @param sourceSize     number of source positions (stored in m_countF).
 * @param targetSize     number of target positions (stored in m_countE).
 * @param alignedToT     for each target position, the aligned source
 *                       positions; copied into m_alignedToT.
 * @param alignedToS     for each source position, the aligned target
 *                       positions.
 * @param alignedCountS  for each source position, its number of links.
 */
PhraseOrientation::PhraseOrientation(int sourceSize,
                                     int targetSize,
                                     const std::vector<std::vector<int> > &alignedToT,
                                     const std::vector<std::vector<int> > &alignedToS,
                                     const std::vector<int> &alignedCountS)
  : m_countF(sourceSize),
    m_countE(targetSize),
    m_alignedToT(alignedToT)
{
  // Init() is given the member copy rather than the caller's table.
  Init(sourceSize, targetSize, m_alignedToT, alignedToS, alignedCountS);
}
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
void PhraseOrientation::Init(int sourceSize,
|
| 113 |
+
int targetSize,
|
| 114 |
+
const std::vector<std::vector<int> > &alignedToT,
|
| 115 |
+
const std::vector<std::vector<int> > &alignedToS,
|
| 116 |
+
const std::vector<int> &alignedCountS)
|
| 117 |
+
{
|
| 118 |
+
for (int startF=0; startF<m_countF; ++startF) {
|
| 119 |
+
for (int endF=startF; endF<m_countF; ++endF) {
|
| 120 |
+
|
| 121 |
+
int minE = std::numeric_limits<int>::max();
|
| 122 |
+
int maxE = -1;
|
| 123 |
+
for (int fi=startF; fi<=endF; ++fi) {
|
| 124 |
+
for (size_t i=0; i<alignedToS[fi].size(); ++i) {
|
| 125 |
+
int ei = alignedToS[fi][i];
|
| 126 |
+
if (ei<minE) {
|
| 127 |
+
minE = ei;
|
| 128 |
+
}
|
| 129 |
+
if (ei>maxE) {
|
| 130 |
+
maxE = ei;
|
| 131 |
+
}
|
| 132 |
+
}
|
| 133 |
+
}
|
| 134 |
+
|
| 135 |
+
m_minAndMaxAlignedToSourceSpan[ std::pair<int,int>(startF,endF) ] = std::pair<int,int>(minE,maxE);
|
| 136 |
+
}
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
// check alignments for target phrase startE...endE
|
| 140 |
+
// loop over continuous phrases which are compatible with the word alignments
|
| 141 |
+
for (int startE=0; startE<m_countE; ++startE) {
|
| 142 |
+
for (int endE=startE; endE<m_countE; ++endE) {
|
| 143 |
+
|
| 144 |
+
int minF = std::numeric_limits<int>::max();
|
| 145 |
+
int maxF = -1;
|
| 146 |
+
std::vector< int > usedF = alignedCountS;
|
| 147 |
+
for (int ei=startE; ei<=endE; ++ei) {
|
| 148 |
+
for (size_t i=0; i<alignedToT[ei].size(); ++i) {
|
| 149 |
+
int fi = alignedToT[ei][i];
|
| 150 |
+
if (fi<minF) {
|
| 151 |
+
minF = fi;
|
| 152 |
+
}
|
| 153 |
+
if (fi>maxF) {
|
| 154 |
+
maxF = fi;
|
| 155 |
+
}
|
| 156 |
+
usedF[fi]--;
|
| 157 |
+
}
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
m_minAndMaxAlignedToTargetSpan[ std::pair<int,int>(startE,endE) ] = std::pair<int,int>(minF,maxF);
|
| 161 |
+
|
| 162 |
+
if (maxF >= 0) { // aligned to any source words at all
|
| 163 |
+
|
| 164 |
+
// check if source words are aligned to out of bounds target words
|
| 165 |
+
bool out_of_bounds = false;
|
| 166 |
+
for (int fi=minF; fi<=maxF && !out_of_bounds; ++fi)
|
| 167 |
+
if (usedF[fi]>0) {
|
| 168 |
+
// cout << "out of bounds: " << fi << "\n";
|
| 169 |
+
out_of_bounds = true;
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
// cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
|
| 173 |
+
if (!out_of_bounds) {
|
| 174 |
+
// start point of source phrase may retreat over unaligned
|
| 175 |
+
for (int startF=minF;
|
| 176 |
+
(startF>=0 &&
|
| 177 |
+
(startF==minF || alignedCountS[startF]==0)); // unaligned
|
| 178 |
+
startF--) {
|
| 179 |
+
// end point of source phrase may advance over unaligned
|
| 180 |
+
for (int endF=maxF;
|
| 181 |
+
(endF<m_countF &&
|
| 182 |
+
(endF==maxF || alignedCountS[endF]==0)); // unaligned
|
| 183 |
+
endF++) { // at this point we have extracted a phrase
|
| 184 |
+
|
| 185 |
+
InsertPhraseVertices(m_topLeft, m_topRight, m_bottomLeft, m_bottomRight,
|
| 186 |
+
startF, startE, endF, endE);
|
| 187 |
+
}
|
| 188 |
+
}
|
| 189 |
+
}
|
| 190 |
+
}
|
| 191 |
+
}
|
| 192 |
+
}
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
/** Record the point (x,y) in a corner table.
 *
 * @param corners  table keyed by y, holding the set of x values seen for
 *                 that y.
 * @param x        value added to the set stored under y.
 * @param y        key; an empty set is created first if y is not yet present.
 */
void PhraseOrientation::InsertVertex( HSentenceVertices & corners, int x, int y )
{
  // operator[] creates the empty set on first use of y; the set insert
  // ignores an x that is already stored.
  corners[y].insert(x);
}
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
/** Register the four corners of one phrase, one per corner table.
 *
 * Each table is keyed by the target coordinate and stores source
 * coordinates:
 *   topLeft     gets (startF, startE)
 *   topRight    gets (endF,   startE)
 *   bottomLeft  gets (startF, endE)
 *   bottomRight gets (endF,   endE)
 */
void PhraseOrientation::InsertPhraseVertices(HSentenceVertices & topLeft,
    HSentenceVertices & topRight,
    HSentenceVertices & bottomLeft,
    HSentenceVertices & bottomRight,
    int startF, int startE, int endF, int endE)
{
  // Corners on the first target row of the phrase.
  InsertVertex(topLeft,     startF, startE);
  InsertVertex(topRight,    endF,   startE);
  // Corners on the last target row of the phrase.
  InsertVertex(bottomLeft,  startF, endE);
  InsertVertex(bottomRight, endF,   endE);
}
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
const std::string PhraseOrientation::GetOrientationInfoString(int startF, int endF, REO_DIR direction) const
|
| 222 |
+
{
|
| 223 |
+
boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator foundMinMax
|
| 224 |
+
= m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) );
|
| 225 |
+
|
| 226 |
+
if ( foundMinMax != m_minAndMaxAlignedToSourceSpan.end() ) {
|
| 227 |
+
int startE = (foundMinMax->second).first;
|
| 228 |
+
int endE = (foundMinMax->second).second;
|
| 229 |
+
// std::cerr << "Phrase orientation for"
|
| 230 |
+
// << " startF=" << startF
|
| 231 |
+
// << " endF=" << endF
|
| 232 |
+
// << " startE=" << startE
|
| 233 |
+
// << " endE=" << endE
|
| 234 |
+
// << std::endl;
|
| 235 |
+
return GetOrientationInfoString(startF, startE, endF, endE, direction);
|
| 236 |
+
} else {
|
| 237 |
+
std::cerr << "PhraseOrientation::GetOrientationInfoString(): Error: not able to determine phrase orientation" << std::endl;
|
| 238 |
+
std::exit(1);
|
| 239 |
+
}
|
| 240 |
+
}
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
const std::string PhraseOrientation::GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction) const
|
| 244 |
+
{
|
| 245 |
+
REO_CLASS hierPrevOrient=REO_CLASS_UNKNOWN, hierNextOrient=REO_CLASS_UNKNOWN;
|
| 246 |
+
|
| 247 |
+
if ( direction == REO_DIR_L2R || direction == REO_DIR_BIDIR )
|
| 248 |
+
hierPrevOrient = GetOrientationInfo(startF, startE, endF, endE, REO_DIR_L2R);
|
| 249 |
+
|
| 250 |
+
if ( direction == REO_DIR_R2L || direction == REO_DIR_BIDIR )
|
| 251 |
+
hierNextOrient = GetOrientationInfo(startF, startE, endF, endE, REO_DIR_R2L);
|
| 252 |
+
|
| 253 |
+
switch (direction) {
|
| 254 |
+
case REO_DIR_L2R:
|
| 255 |
+
return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR);
|
| 256 |
+
break;
|
| 257 |
+
case REO_DIR_R2L:
|
| 258 |
+
return GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR);
|
| 259 |
+
break;
|
| 260 |
+
case REO_DIR_BIDIR:
|
| 261 |
+
return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR);
|
| 262 |
+
break;
|
| 263 |
+
default:
|
| 264 |
+
return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR);
|
| 265 |
+
break;
|
| 266 |
+
}
|
| 267 |
+
return "PhraseOrientationERROR";
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
/** Orientation class for a source span.
 *
 * Looks up the target extent stored for startF...endF in
 * m_minAndMaxAlignedToSourceSpan and forwards to the five-argument
 * overload. If the span has no entry, an error is printed to std::cerr and
 * the process exits with status 1.
 */
PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientationInfo(int startF, int endF, REO_DIR direction) const
{
  typedef boost::unordered_map< std::pair<int,int> , std::pair<int,int> > SpanExtentMap;

  const SpanExtentMap::const_iterator span =
    m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) );

  if ( span == m_minAndMaxAlignedToSourceSpan.end() ) {
    std::cerr << "PhraseOrientation::GetOrientationInfo(): Error: not able to determine phrase orientation" << std::endl;
    std::exit(1);
  }

  const int startE = span->second.first;
  const int endE = span->second.second;
  return GetOrientationInfo(startF, startE, endF, endE, direction);
}
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
/** Orientation class for an explicit source/target phrase.
 *
 * @param direction  must be REO_DIR_L2R or REO_DIR_R2L; any other value
 *                   prints an error to std::cerr and exits with status 1.
 *
 * Both directions delegate to GetOrientHierModel() with the MSLR model
 * type. The right-to-left call swaps the start/end arguments and the
 * comparison callbacks, uses -1 instead of 1 as the step argument, and
 * reads the top corner tables instead of the bottom ones.
 */
PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const
{
  switch (direction) {
  case REO_DIR_L2R:
    return GetOrientHierModel(REO_MODEL_TYPE_MSLR,
                              startF, endF, startE, endE, m_countF-1, 0, 0, 1,
                              &ge, &le,
                              m_bottomRight, m_bottomLeft);

  case REO_DIR_R2L:
    return GetOrientHierModel(REO_MODEL_TYPE_MSLR,
                              endF, startF, endE, startE, 0, m_countF-1, m_countE-1, -1,
                              &le, &ge,
                              m_topLeft, m_topRight);

  default:
    std::cerr << "PhraseOrientation::GetOrientationInfo(): Error: direction should be either L2R or R2L" << std::endl;
    std::exit(1);
  }

  // Not reached; kept so that every path visibly returns a value.
  return REO_CLASS_UNKNOWN;
}
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
// to be called with countF-1 instead of countF
|
| 317 |
+
PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientHierModel(REO_MODEL_TYPE modelType,
|
| 318 |
+
int startF, int endF, int startE, int endE, int countF, int zeroF, int zeroE, int unit,
|
| 319 |
+
bool (*ge)(int, int), bool (*le)(int, int),
|
| 320 |
+
const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const
|
| 321 |
+
{
|
| 322 |
+
bool leftSourceSpanIsAligned = ( (startF != zeroF) && SourceSpanIsAligned(zeroF,startF-unit) );
|
| 323 |
+
bool topTargetSpanIsAligned = ( (startE != zeroE) && TargetSpanIsAligned(zeroE,startE-unit) );
|
| 324 |
+
|
| 325 |
+
if (!topTargetSpanIsAligned && !leftSourceSpanIsAligned)
|
| 326 |
+
return REO_CLASS_LEFT;
|
| 327 |
+
|
| 328 |
+
HSentenceVertices::const_iterator it;
|
| 329 |
+
|
| 330 |
+
if (//(connectedLeftTop && !connectedRightTop) ||
|
| 331 |
+
((it = bottomRight.find(startE - unit)) != bottomRight.end() &&
|
| 332 |
+
it->second.find(startF-unit) != it->second.end()))
|
| 333 |
+
return REO_CLASS_LEFT;
|
| 334 |
+
|
| 335 |
+
if (modelType == REO_MODEL_TYPE_MONO)
|
| 336 |
+
return REO_CLASS_UNKNOWN;
|
| 337 |
+
|
| 338 |
+
if (//(!connectedLeftTop && connectedRightTop) ||
|
| 339 |
+
((it = bottomLeft.find(startE - unit)) != bottomLeft.end() &&
|
| 340 |
+
it->second.find(endF + unit) != it->second.end()))
|
| 341 |
+
return REO_CLASS_RIGHT;
|
| 342 |
+
|
| 343 |
+
if (modelType == REO_MODEL_TYPE_MSD)
|
| 344 |
+
return REO_CLASS_UNKNOWN;
|
| 345 |
+
|
| 346 |
+
for (int indexF=startF-2*unit; (*ge)(indexF, zeroF); indexF=indexF-unit) {
|
| 347 |
+
if ((it = bottomRight.find(startE - unit)) != bottomRight.end() &&
|
| 348 |
+
it->second.find(indexF) != it->second.end())
|
| 349 |
+
return REO_CLASS_DLEFT;
|
| 350 |
+
}
|
| 351 |
+
|
| 352 |
+
for (int indexF=endF+2*unit; (*le)(indexF, countF); indexF=indexF+unit) {
|
| 353 |
+
if ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() &&
|
| 354 |
+
it->second.find(indexF) != it->second.end())
|
| 355 |
+
return REO_CLASS_DRIGHT;
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
return REO_CLASS_UNKNOWN;
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
bool PhraseOrientation::SourceSpanIsAligned(int index1, int index2) const
|
| 362 |
+
{
|
| 363 |
+
return SpanIsAligned(index1, index2, m_minAndMaxAlignedToSourceSpan);
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
+
bool PhraseOrientation::TargetSpanIsAligned(int index1, int index2) const
|
| 367 |
+
{
|
| 368 |
+
return SpanIsAligned(index1, index2, m_minAndMaxAlignedToTargetSpan);
|
| 369 |
+
}
|
| 370 |
+
|
| 371 |
+
// Looks up the span (min(index1,index2), max(index1,index2)) in
// minAndMaxAligned and reports whether it is aligned.
//
// A span counts as unaligned when the first component of its stored pair
// equals std::numeric_limits<int>::max(). A span that is missing from the
// map is treated as a fatal error: a message is written to stderr and the
// process exits with status 1.
bool PhraseOrientation::SpanIsAligned(int index1, int index2, const boost::unordered_map< std::pair<int,int> , std::pair<int,int> > &minAndMaxAligned) const
{
  typedef boost::unordered_map< std::pair<int,int> , std::pair<int,int> > SpanMap;

  const int lower = std::min(index1,index2);
  const int upper = std::max(index1,index2);

  const SpanMap::const_iterator entry = minAndMaxAligned.find(std::pair<int,int>(lower,upper));

  if (entry == minAndMaxAligned.end()) {
    std::cerr << "PhraseOrientation::SourceSpanIsAligned(): Error" << std::endl;
    std::exit(1);
  }

  return entry->second.first != std::numeric_limits<int>::max();
}
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
const std::string PhraseOrientation::GetOrientationString(const REO_CLASS orient, const REO_MODEL_TYPE modelType)
|
| 389 |
+
{
|
| 390 |
+
std::ostringstream oss;
|
| 391 |
+
WriteOrientation(oss, orient, modelType);
|
| 392 |
+
return oss.str();
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
void PhraseOrientation::WriteOrientation(std::ostream& out, const REO_CLASS orient, const REO_MODEL_TYPE modelType)
|
| 397 |
+
{
|
| 398 |
+
switch(orient) {
|
| 399 |
+
case REO_CLASS_LEFT:
|
| 400 |
+
out << "mono";
|
| 401 |
+
break;
|
| 402 |
+
case REO_CLASS_RIGHT:
|
| 403 |
+
out << "swap";
|
| 404 |
+
break;
|
| 405 |
+
case REO_CLASS_DLEFT:
|
| 406 |
+
out << "dleft";
|
| 407 |
+
break;
|
| 408 |
+
case REO_CLASS_DRIGHT:
|
| 409 |
+
out << "dright";
|
| 410 |
+
break;
|
| 411 |
+
case REO_CLASS_UNKNOWN:
|
| 412 |
+
switch(modelType) {
|
| 413 |
+
case REO_MODEL_TYPE_MONO:
|
| 414 |
+
out << "nomono";
|
| 415 |
+
break;
|
| 416 |
+
case REO_MODEL_TYPE_MSD:
|
| 417 |
+
out << "other";
|
| 418 |
+
break;
|
| 419 |
+
case REO_MODEL_TYPE_MSLR:
|
| 420 |
+
out << "dleft";
|
| 421 |
+
break;
|
| 422 |
+
}
|
| 423 |
+
break;
|
| 424 |
+
}
|
| 425 |
+
}
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
bool PhraseOrientation::IsAligned(int fi, int ei) const
|
| 429 |
+
{
|
| 430 |
+
if (ei == -1 && fi == -1)
|
| 431 |
+
return true;
|
| 432 |
+
|
| 433 |
+
if (ei <= -1 || fi <= -1)
|
| 434 |
+
return false;
|
| 435 |
+
|
| 436 |
+
if (ei == m_countE && fi == m_countF)
|
| 437 |
+
return true;
|
| 438 |
+
|
| 439 |
+
if (ei >= m_countE || fi >= m_countF)
|
| 440 |
+
return false;
|
| 441 |
+
|
| 442 |
+
for (size_t i=0; i<m_alignedToT[ei].size(); ++i)
|
| 443 |
+
if (m_alignedToT[ei][i] == fi)
|
| 444 |
+
return true;
|
| 445 |
+
|
| 446 |
+
return false;
|
| 447 |
+
}
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
// Adds increment to the prior count of orientation class orient, in the
// counter vector belonging to the given direction (L2R or R2L).
// Any other direction trips the assertion; with assertions compiled out it
// leaves both counter vectors untouched.
void PhraseOrientation::IncrementPriorCount(REO_DIR direction, REO_CLASS orient, float increment)
{
  assert(direction==REO_DIR_L2R || direction==REO_DIR_R2L);

  switch (direction) {
  case REO_DIR_L2R:
    m_l2rOrientationPriorCounts[orient] += increment;
    break;
  case REO_DIR_R2L:
    m_r2lOrientationPriorCounts[orient] += increment;
    break;
  default:
    break;
  }
}
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType)
|
| 462 |
+
{
|
| 463 |
+
std::map<std::string,float> l2rOrientationPriorCountsMap;
|
| 464 |
+
std::map<std::string,float> r2lOrientationPriorCountsMap;
|
| 465 |
+
for (int orient=0; orient<=REO_CLASS_UNKNOWN; ++orient) {
|
| 466 |
+
l2rOrientationPriorCountsMap[GetOrientationString((REO_CLASS)orient, modelType)] += m_l2rOrientationPriorCounts[orient];
|
| 467 |
+
}
|
| 468 |
+
for (int orient=0; orient<=REO_CLASS_UNKNOWN; ++orient) {
|
| 469 |
+
r2lOrientationPriorCountsMap[GetOrientationString((REO_CLASS)orient, modelType)] += m_r2lOrientationPriorCounts[orient];
|
| 470 |
+
}
|
| 471 |
+
for (std::map<std::string,float>::const_iterator l2rOrientationPriorCountsMapIt = l2rOrientationPriorCountsMap.begin();
|
| 472 |
+
l2rOrientationPriorCountsMapIt != l2rOrientationPriorCountsMap.end(); ++l2rOrientationPriorCountsMapIt) {
|
| 473 |
+
out << "L2R_" << l2rOrientationPriorCountsMapIt->first << " " << l2rOrientationPriorCountsMapIt->second << std::endl;
|
| 474 |
+
}
|
| 475 |
+
for (std::map<std::string,float>::const_iterator r2lOrientationPriorCountsMapIt = r2lOrientationPriorCountsMap.begin();
|
| 476 |
+
r2lOrientationPriorCountsMapIt != r2lOrientationPriorCountsMap.end(); ++r2lOrientationPriorCountsMapIt) {
|
| 477 |
+
out << "R2L_" << r2lOrientationPriorCountsMapIt->first << " " << r2lOrientationPriorCountsMapIt->second << std::endl;
|
| 478 |
+
}
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
} // namespace MosesTraining
|
mosesdecoder/phrase-extract/PhraseOrientation.h
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2011 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include <map>
|
| 23 |
+
#include <set>
|
| 24 |
+
#include <string>
|
| 25 |
+
#include <vector>
|
| 26 |
+
|
| 27 |
+
#include <boost/unordered_map.hpp>
|
| 28 |
+
|
| 29 |
+
#include "moses/AlignmentInfo.h"
|
| 30 |
+
|
| 31 |
+
#include "Alignment.h"
|
| 32 |
+
|
| 33 |
+
namespace MosesTraining
|
| 34 |
+
{
|
| 35 |
+
|
| 36 |
+
// The key of the map is the English index and the value is a set of the source ones
|
| 37 |
+
typedef std::map <int, std::set<int> > HSentenceVertices;
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class PhraseOrientation
|
| 41 |
+
{
|
| 42 |
+
public:
|
| 43 |
+
|
| 44 |
+
enum REO_MODEL_TYPE {REO_MODEL_TYPE_MSD, REO_MODEL_TYPE_MSLR, REO_MODEL_TYPE_MONO};
|
| 45 |
+
enum REO_CLASS {REO_CLASS_LEFT, REO_CLASS_RIGHT, REO_CLASS_DLEFT, REO_CLASS_DRIGHT, REO_CLASS_UNKNOWN};
|
| 46 |
+
enum REO_DIR {REO_DIR_L2R, REO_DIR_R2L, REO_DIR_BIDIR};
|
| 47 |
+
|
| 48 |
+
PhraseOrientation() {};
|
| 49 |
+
|
| 50 |
+
PhraseOrientation(int sourceSize,
|
| 51 |
+
int targetSize,
|
| 52 |
+
const Alignment &alignment);
|
| 53 |
+
|
| 54 |
+
PhraseOrientation(int sourceSize,
|
| 55 |
+
int targetSize,
|
| 56 |
+
const Moses::AlignmentInfo &alignTerm,
|
| 57 |
+
const Moses::AlignmentInfo &alignNonTerm);
|
| 58 |
+
|
| 59 |
+
PhraseOrientation(int sourceSize,
|
| 60 |
+
int targetSize,
|
| 61 |
+
const std::vector<std::vector<int> > &alignedToT,
|
| 62 |
+
const std::vector<std::vector<int> > &alignedToS,
|
| 63 |
+
const std::vector<int> &alignedCountS);
|
| 64 |
+
|
| 65 |
+
REO_CLASS GetOrientationInfo(int startF, int endF, REO_DIR direction) const;
|
| 66 |
+
REO_CLASS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const;
|
| 67 |
+
const std::string GetOrientationInfoString(int startF, int endF, REO_DIR direction=REO_DIR_BIDIR) const;
|
| 68 |
+
const std::string GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction=REO_DIR_BIDIR) const;
|
| 69 |
+
static const std::string GetOrientationString(const REO_CLASS orient, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR);
|
| 70 |
+
static void WriteOrientation(std::ostream& out, const REO_CLASS orient, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR);
|
| 71 |
+
void IncrementPriorCount(REO_DIR direction, REO_CLASS orient, float increment);
|
| 72 |
+
static void WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR);
|
| 73 |
+
bool SourceSpanIsAligned(int index1, int index2) const;
|
| 74 |
+
bool TargetSpanIsAligned(int index1, int index2) const;
|
| 75 |
+
|
| 76 |
+
private:
|
| 77 |
+
|
| 78 |
+
void Init(int sourceSize, int targetSize,
|
| 79 |
+
const std::vector<std::vector<int> > &alignedToT,
|
| 80 |
+
const std::vector<std::vector<int> > &alignedToS,
|
| 81 |
+
const std::vector<int> &alignedCountS);
|
| 82 |
+
|
| 83 |
+
void InsertVertex( HSentenceVertices & corners, int x, int y );
|
| 84 |
+
|
| 85 |
+
void InsertPhraseVertices(HSentenceVertices & topLeft,
|
| 86 |
+
HSentenceVertices & topRight,
|
| 87 |
+
HSentenceVertices & bottomLeft,
|
| 88 |
+
HSentenceVertices & bottomRight,
|
| 89 |
+
int startF, int startE, int endF, int endE);
|
| 90 |
+
|
| 91 |
+
REO_CLASS GetOrientHierModel(REO_MODEL_TYPE modelType,
|
| 92 |
+
int startF, int endF, int startE, int endE, int countF, int zeroF, int zeroE, int unit,
|
| 93 |
+
bool (*ge)(int, int), bool (*lt)(int, int),
|
| 94 |
+
const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const;
|
| 95 |
+
|
| 96 |
+
bool SpanIsAligned(int index1, int index2, const boost::unordered_map< std::pair<int,int> , std::pair<int,int> > &minAndMaxAligned) const;
|
| 97 |
+
|
| 98 |
+
bool IsAligned(int fi, int ei) const;
|
| 99 |
+
|
| 100 |
+
static bool ge(int first, int second) {
|
| 101 |
+
return first >= second;
|
| 102 |
+
};
|
| 103 |
+
static bool le(int first, int second) {
|
| 104 |
+
return first <= second;
|
| 105 |
+
};
|
| 106 |
+
static bool lt(int first, int second) {
|
| 107 |
+
return first < second;
|
| 108 |
+
};
|
| 109 |
+
|
| 110 |
+
int m_countF;
|
| 111 |
+
int m_countE;
|
| 112 |
+
|
| 113 |
+
std::vector<std::vector<int> > m_alignedToT;
|
| 114 |
+
|
| 115 |
+
HSentenceVertices m_topLeft;
|
| 116 |
+
HSentenceVertices m_topRight;
|
| 117 |
+
HSentenceVertices m_bottomLeft;
|
| 118 |
+
HSentenceVertices m_bottomRight;
|
| 119 |
+
|
| 120 |
+
boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToSourceSpan;
|
| 121 |
+
boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToTargetSpan;
|
| 122 |
+
|
| 123 |
+
static std::vector<float> m_l2rOrientationPriorCounts;
|
| 124 |
+
static std::vector<float> m_r2lOrientationPriorCounts;
|
| 125 |
+
};
|
| 126 |
+
|
| 127 |
+
} // namespace MosesTraining
|
mosesdecoder/phrase-extract/PropertiesConsolidator.cpp
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "PropertiesConsolidator.h"
|
| 21 |
+
|
| 22 |
+
#include <sstream>
|
| 23 |
+
#include <limits>
|
| 24 |
+
#include <vector>
|
| 25 |
+
|
| 26 |
+
#include "moses/Util.h"
|
| 27 |
+
#include "phrase-extract/InputFileStream.h"
|
| 28 |
+
#include "phrase-extract/OutputFileStream.h"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
namespace MosesTraining
|
| 32 |
+
{
|
| 33 |
+
|
| 34 |
+
void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
|
| 35 |
+
{
|
| 36 |
+
Moses::InputFileStream inFile(sourceLabelSetFile);
|
| 37 |
+
|
| 38 |
+
// read source label set
|
| 39 |
+
m_sourceLabels.clear();
|
| 40 |
+
std::string line;
|
| 41 |
+
while (getline(inFile, line)) {
|
| 42 |
+
std::istringstream tokenizer(line);
|
| 43 |
+
std::string label;
|
| 44 |
+
size_t index;
|
| 45 |
+
try {
|
| 46 |
+
tokenizer >> label >> index;
|
| 47 |
+
} catch (const std::exception &e) {
|
| 48 |
+
UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " .");
|
| 49 |
+
}
|
| 50 |
+
std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
|
| 51 |
+
UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
inFile.Close();
|
| 55 |
+
|
| 56 |
+
m_sourceLabelsFlag = true;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile)
|
| 61 |
+
{
|
| 62 |
+
Moses::InputFileStream inFile(partsOfSpeechFile);
|
| 63 |
+
|
| 64 |
+
// read parts-of-speech vocabulary
|
| 65 |
+
m_partsOfSpeechVocabulary.clear();
|
| 66 |
+
std::string line;
|
| 67 |
+
while (getline(inFile, line)) {
|
| 68 |
+
std::istringstream tokenizer(line);
|
| 69 |
+
std::string label;
|
| 70 |
+
size_t index;
|
| 71 |
+
try {
|
| 72 |
+
tokenizer >> label >> index;
|
| 73 |
+
} catch (const std::exception &e) {
|
| 74 |
+
UTIL_THROW2("Error reading part-of-speech vocabulary file " << partsOfSpeechFile << " .");
|
| 75 |
+
}
|
| 76 |
+
std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_partsOfSpeechVocabulary.insert( std::pair<std::string,size_t>(label,index) );
|
| 77 |
+
UTIL_THROW_IF2(!inserted.second,"Part-of-speech vocabulary file " << partsOfSpeechFile << " should contain each POS tag only once.");
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
inFile.Close();
|
| 81 |
+
|
| 82 |
+
m_partsOfSpeechFlag = true;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
void PropertiesConsolidator::ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile)
|
| 87 |
+
{
|
| 88 |
+
Moses::InputFileStream inFile(targetSyntacticPreferencesLabelSetFile);
|
| 89 |
+
|
| 90 |
+
// read target syntactic preferences label set
|
| 91 |
+
m_targetSyntacticPreferencesLabels.clear();
|
| 92 |
+
std::string line;
|
| 93 |
+
while (getline(inFile, line)) {
|
| 94 |
+
std::istringstream tokenizer(line);
|
| 95 |
+
std::string label;
|
| 96 |
+
size_t index;
|
| 97 |
+
try {
|
| 98 |
+
tokenizer >> label >> index;
|
| 99 |
+
} catch (const std::exception &e) {
|
| 100 |
+
UTIL_THROW2("Error reading target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " .");
|
| 101 |
+
}
|
| 102 |
+
std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_targetSyntacticPreferencesLabels.insert( std::pair<std::string,size_t>(label,index) );
|
| 103 |
+
UTIL_THROW_IF2(!inserted.second,"Target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " should contain each syntactic label only once.");
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
inFile.Close();
|
| 107 |
+
|
| 108 |
+
m_targetSyntacticPreferencesFlag = true;
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
void PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const
|
| 113 |
+
{
|
| 114 |
+
if ( propertiesString.empty() ) {
|
| 115 |
+
return;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
std::vector<std::string> toks;
|
| 119 |
+
Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
|
| 120 |
+
for (size_t i = 1; i < toks.size(); ++i) {
|
| 121 |
+
std::string &tok = toks[i];
|
| 122 |
+
if (tok.empty()) {
|
| 123 |
+
continue;
|
| 124 |
+
}
|
| 125 |
+
size_t endPos = tok.rfind("}");
|
| 126 |
+
tok = tok.substr(0, endPos - 1);
|
| 127 |
+
std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
|
| 128 |
+
assert(keyValue.size() == 2);
|
| 129 |
+
|
| 130 |
+
if ( !keyValue[0].compare("SourceLabels") ) {
|
| 131 |
+
|
| 132 |
+
if ( m_sourceLabelsFlag ) {
|
| 133 |
+
|
| 134 |
+
// SourceLabels property: replace strings with vocabulary indices
|
| 135 |
+
out << " {{" << keyValue[0];
|
| 136 |
+
ProcessSourceLabelsPropertyValue(keyValue[1], out);
|
| 137 |
+
out << "}}";
|
| 138 |
+
|
| 139 |
+
} else { // don't process SourceLabels property
|
| 140 |
+
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
} else if ( !keyValue[0].compare("POS") ) {
|
| 144 |
+
|
| 145 |
+
/* DO NOTHING (property is not registered in the decoder at the moment)
|
| 146 |
+
if ( m_partsOfSpeechFlag ) {
|
| 147 |
+
|
| 148 |
+
// POS property: replace strings with vocabulary indices
|
| 149 |
+
out << " {{" << keyValue[0];
|
| 150 |
+
ProcessPOSPropertyValue(keyValue[1], out);
|
| 151 |
+
out << "}}";
|
| 152 |
+
|
| 153 |
+
} else { // don't process POS property
|
| 154 |
+
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
| 155 |
+
}
|
| 156 |
+
*/
|
| 157 |
+
|
| 158 |
+
} else if ( !keyValue[0].compare("TargetPreferences") ) {
|
| 159 |
+
|
| 160 |
+
if ( m_targetSyntacticPreferencesFlag ) {
|
| 161 |
+
|
| 162 |
+
// TargetPreferences property: replace strings with vocabulary indices
|
| 163 |
+
out << " {{" << keyValue[0];
|
| 164 |
+
ProcessTargetSyntacticPreferencesPropertyValue(keyValue[1], out);
|
| 165 |
+
out << "}}";
|
| 166 |
+
|
| 167 |
+
} else { // don't process TargetPreferences property
|
| 168 |
+
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
} else {
|
| 172 |
+
|
| 173 |
+
// output other property
|
| 174 |
+
out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
|
| 175 |
+
}
|
| 176 |
+
}
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
void PropertiesConsolidator::ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
|
| 181 |
+
{
|
| 182 |
+
// SourceLabels property: replace strings with vocabulary indices
|
| 183 |
+
std::istringstream tokenizer(value);
|
| 184 |
+
|
| 185 |
+
size_t nNTs;
|
| 186 |
+
double totalCount;
|
| 187 |
+
|
| 188 |
+
if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
|
| 189 |
+
UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
|
| 190 |
+
<< "Flawed SourceLabels property?");
|
| 191 |
+
}
|
| 192 |
+
assert( nNTs > 0 );
|
| 193 |
+
out << " " << nNTs;
|
| 194 |
+
|
| 195 |
+
if (! (tokenizer >> totalCount)) { // second token: overall rule count
|
| 196 |
+
UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
|
| 197 |
+
<< "Flawed SourceLabels property?");
|
| 198 |
+
}
|
| 199 |
+
assert( totalCount > 0.0 );
|
| 200 |
+
out << " " << totalCount;
|
| 201 |
+
|
| 202 |
+
while (tokenizer.peek() != EOF) {
|
| 203 |
+
try {
|
| 204 |
+
|
| 205 |
+
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
|
| 206 |
+
|
| 207 |
+
std::string token;
|
| 208 |
+
|
| 209 |
+
if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
|
| 210 |
+
for (size_t i=0; i<nNTs-1; ++i) { // RHS source non-terminal labels
|
| 211 |
+
tokenizer >> token; // RHS source non-terminal label
|
| 212 |
+
std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
|
| 213 |
+
UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
|
| 214 |
+
out << " " << found->second;
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
tokenizer >> token; // sourceLabelsRHSCount
|
| 218 |
+
out << " " << token;
|
| 219 |
+
|
| 220 |
+
tokenizer >> numberOfLHSsGivenRHS;
|
| 221 |
+
out << " " << numberOfLHSsGivenRHS;
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
|
| 225 |
+
tokenizer >> token; // LHS source non-terminal label
|
| 226 |
+
std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
|
| 227 |
+
UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
|
| 228 |
+
out << " " << found->second;
|
| 229 |
+
|
| 230 |
+
tokenizer >> token; // ruleSourceLabelledCount
|
| 231 |
+
out << " " << token;
|
| 232 |
+
}
|
| 233 |
+
|
| 234 |
+
} catch (const std::exception &e) {
|
| 235 |
+
UTIL_THROW2("Flawed item in SourceLabels property?");
|
| 236 |
+
}
|
| 237 |
+
}
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
void PropertiesConsolidator::ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
|
| 242 |
+
{
|
| 243 |
+
std::istringstream tokenizer(value);
|
| 244 |
+
while (tokenizer.peek() != EOF) {
|
| 245 |
+
std::string token;
|
| 246 |
+
tokenizer >> token;
|
| 247 |
+
std::map<std::string,size_t>::const_iterator found = m_partsOfSpeechVocabulary.find(token);
|
| 248 |
+
UTIL_THROW_IF2(found == m_partsOfSpeechVocabulary.end() ,"Part-of-speech \"" << token << "\" from the phrase table not found in given part-of-speech vocabulary.");
|
| 249 |
+
out << " " << found->second;
|
| 250 |
+
}
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
bool PropertiesConsolidator::GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const
|
| 255 |
+
{
|
| 256 |
+
out.clear();
|
| 257 |
+
if ( propertiesString.empty() ) {
|
| 258 |
+
return false;
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
std::vector<std::string> toks;
|
| 262 |
+
Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
|
| 263 |
+
for (size_t i = 1; i < toks.size(); ++i) {
|
| 264 |
+
std::string &tok = toks[i];
|
| 265 |
+
if (tok.empty()) {
|
| 266 |
+
continue;
|
| 267 |
+
}
|
| 268 |
+
size_t endPos = tok.rfind("}");
|
| 269 |
+
tok = tok.substr(0, endPos - 1);
|
| 270 |
+
std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
|
| 271 |
+
assert(keyValue.size() == 2);
|
| 272 |
+
|
| 273 |
+
if ( !keyValue[0].compare("POS") ) {
|
| 274 |
+
std::istringstream tokenizer(keyValue[1]);
|
| 275 |
+
while (tokenizer.peek() != EOF) {
|
| 276 |
+
std::string token;
|
| 277 |
+
tokenizer >> token;
|
| 278 |
+
out.push_back(token);
|
| 279 |
+
}
|
| 280 |
+
return true;
|
| 281 |
+
}
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
return false;
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
void PropertiesConsolidator::ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
|
| 289 |
+
{
|
| 290 |
+
// TargetPreferences property: replace strings with vocabulary indices
|
| 291 |
+
std::istringstream tokenizer(value);
|
| 292 |
+
|
| 293 |
+
size_t nNTs;
|
| 294 |
+
double totalCount;
|
| 295 |
+
|
| 296 |
+
if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
|
| 297 |
+
UTIL_THROW2("Not able to read number of non-terminals from TargetPreferences property. "
|
| 298 |
+
<< "Flawed TargetPreferences property?");
|
| 299 |
+
}
|
| 300 |
+
assert( nNTs > 0 );
|
| 301 |
+
out << " " << nNTs;
|
| 302 |
+
|
| 303 |
+
if (! (tokenizer >> totalCount)) { // second token: overall rule count
|
| 304 |
+
UTIL_THROW2("Not able to read overall rule count from TargetPreferences property. "
|
| 305 |
+
<< "Flawed TargetPreferences property?");
|
| 306 |
+
}
|
| 307 |
+
assert( totalCount > 0.0 );
|
| 308 |
+
out << " " << totalCount;
|
| 309 |
+
|
| 310 |
+
while (tokenizer.peek() != EOF) {
|
| 311 |
+
try {
|
| 312 |
+
|
| 313 |
+
size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
|
| 314 |
+
|
| 315 |
+
std::string token;
|
| 316 |
+
|
| 317 |
+
if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
|
| 318 |
+
for (size_t i=0; i<nNTs-1; ++i) { // RHS target preference non-terminal labels
|
| 319 |
+
tokenizer >> token; // RHS target preference non-terminal label
|
| 320 |
+
std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
|
| 321 |
+
UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
|
| 322 |
+
out << " " << found->second;
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
tokenizer >> token; // targetPreferenceRHSCount
|
| 326 |
+
out << " " << token;
|
| 327 |
+
|
| 328 |
+
tokenizer >> numberOfLHSsGivenRHS;
|
| 329 |
+
out << " " << numberOfLHSsGivenRHS;
|
| 330 |
+
}
|
| 331 |
+
|
| 332 |
+
for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS target preference non-terminal labels seen with this RHS
|
| 333 |
+
tokenizer >> token; // LHS target preference non-terminal label
|
| 334 |
+
std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
|
| 335 |
+
UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
|
| 336 |
+
out << " " << found->second;
|
| 337 |
+
|
| 338 |
+
tokenizer >> token; // ruleTargetPreferenceLabelledCount
|
| 339 |
+
out << " " << token;
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
+
} catch (const std::exception &e) {
|
| 343 |
+
UTIL_THROW2("Flawed item in TargetPreferences property?");
|
| 344 |
+
}
|
| 345 |
+
}
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
} // namespace MosesTraining
|
| 350 |
+
|
mosesdecoder/phrase-extract/PropertiesConsolidator.h
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
#pragma once
|
| 22 |
+
|
| 23 |
+
#include <string>
|
| 24 |
+
#include <map>
|
| 25 |
+
#include <vector>
|
| 26 |
+
|
| 27 |
+
#include "OutputFileStream.h"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
namespace MosesTraining
|
| 31 |
+
{
|
| 32 |
+
|
| 33 |
+
class PropertiesConsolidator
|
| 34 |
+
{
|
| 35 |
+
public:
|
| 36 |
+
|
| 37 |
+
PropertiesConsolidator()
|
| 38 |
+
: m_sourceLabelsFlag(false)
|
| 39 |
+
, m_partsOfSpeechFlag(false)
|
| 40 |
+
, m_targetSyntacticPreferencesFlag(false)
|
| 41 |
+
{};
|
| 42 |
+
|
| 43 |
+
void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
|
| 44 |
+
void ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile);
|
| 45 |
+
void ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile);
|
| 46 |
+
|
| 47 |
+
bool GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const;
|
| 48 |
+
|
| 49 |
+
void ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const;
|
| 50 |
+
|
| 51 |
+
protected:
|
| 52 |
+
|
| 53 |
+
void ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
|
| 54 |
+
void ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
|
| 55 |
+
void ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
|
| 56 |
+
|
| 57 |
+
bool m_sourceLabelsFlag;
|
| 58 |
+
std::map<std::string,size_t> m_sourceLabels;
|
| 59 |
+
bool m_partsOfSpeechFlag;
|
| 60 |
+
std::map<std::string,size_t> m_partsOfSpeechVocabulary;
|
| 61 |
+
bool m_targetSyntacticPreferencesFlag;
|
| 62 |
+
std::map<std::string,size_t> m_targetSyntacticPreferencesLabels;
|
| 63 |
+
|
| 64 |
+
};
|
| 65 |
+
|
| 66 |
+
} // namespace MosesTraining
|
| 67 |
+
|
mosesdecoder/phrase-extract/RuleExist.h
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
#ifndef RULEEXIST_H_INCLUDED_
|
| 22 |
+
#define RULEEXIST_H_INCLUDED_
|
| 23 |
+
|
| 24 |
+
#include <vector>
|
| 25 |
+
|
| 26 |
+
#include "Hole.h"
|
| 27 |
+
|
| 28 |
+
namespace MosesTraining
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
// repository of extracted phrase pairs
|
| 32 |
+
// which are potential holes in larger phrase pairs
|
| 33 |
+
class RuleExist
|
| 34 |
+
{
|
| 35 |
+
protected:
|
| 36 |
+
std::vector< std::vector<HoleList> > m_phraseExist;
|
| 37 |
+
// indexed by source pos. and source length
|
| 38 |
+
// maps to list of holes where <int, int> are target pos
|
| 39 |
+
|
| 40 |
+
public:
|
| 41 |
+
RuleExist(size_t size)
|
| 42 |
+
:m_phraseExist(size) {
|
| 43 |
+
// size is the length of the source sentence
|
| 44 |
+
for (size_t pos = 0; pos < size; ++pos) {
|
| 45 |
+
// create empty hole lists
|
| 46 |
+
std::vector<HoleList> &endVec = m_phraseExist[pos];
|
| 47 |
+
endVec.resize(size - pos);
|
| 48 |
+
}
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
void Add(int startT, int endT, int startS, int endS) {
|
| 52 |
+
m_phraseExist[startT][endT - startT].push_back(Hole(startS, endS, startT, endT));
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
const HoleList &GetSourceHoles(int startT, int endT) const {
|
| 56 |
+
const HoleList &sourceHoles = m_phraseExist[startT][endT - startT];
|
| 57 |
+
return sourceHoles;
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
};
|
| 61 |
+
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
#endif
|
mosesdecoder/phrase-extract/RuleExtractionOptions.h
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
namespace MosesTraining
|
| 23 |
+
{
|
| 24 |
+
|
| 25 |
+
struct RuleExtractionOptions {
|
| 26 |
+
public:
|
| 27 |
+
int maxSpan;
|
| 28 |
+
int minHoleSource;
|
| 29 |
+
int minHoleTarget;
|
| 30 |
+
int minWords;
|
| 31 |
+
int maxSymbolsTarget;
|
| 32 |
+
int maxSymbolsSource;
|
| 33 |
+
int maxNonTerm;
|
| 34 |
+
int maxScope;
|
| 35 |
+
bool onlyDirectFlag;
|
| 36 |
+
bool glueGrammarFlag;
|
| 37 |
+
bool unknownWordLabelFlag;
|
| 38 |
+
bool onlyOutputSpanInfo;
|
| 39 |
+
bool noFileLimit;
|
| 40 |
+
bool properConditioning;
|
| 41 |
+
bool nonTermFirstWord;
|
| 42 |
+
bool nonTermConsecTarget;
|
| 43 |
+
bool nonTermConsecSource;
|
| 44 |
+
bool requireAlignedWord;
|
| 45 |
+
bool sourceSyntax;
|
| 46 |
+
bool targetSyntax;
|
| 47 |
+
bool targetSyntacticPreferences;
|
| 48 |
+
bool duplicateRules;
|
| 49 |
+
bool fractionalCounting;
|
| 50 |
+
bool pcfgScore;
|
| 51 |
+
bool gzOutput;
|
| 52 |
+
bool unpairedExtractFormat;
|
| 53 |
+
bool conditionOnTargetLhs;
|
| 54 |
+
bool boundaryRules;
|
| 55 |
+
bool flexScoreFlag;
|
| 56 |
+
bool phraseOrientation;
|
| 57 |
+
|
| 58 |
+
RuleExtractionOptions()
|
| 59 |
+
: maxSpan(10)
|
| 60 |
+
, minHoleSource(2)
|
| 61 |
+
, minHoleTarget(1)
|
| 62 |
+
, minWords(1)
|
| 63 |
+
, maxSymbolsTarget(999)
|
| 64 |
+
, maxSymbolsSource(5)
|
| 65 |
+
, maxNonTerm(2)
|
| 66 |
+
, maxScope(999)
|
| 67 |
+
// int minHoleSize(1)
|
| 68 |
+
// int minSubPhraseSize(1) // minimum size of a remaining lexical phrase
|
| 69 |
+
, onlyDirectFlag(false)
|
| 70 |
+
, glueGrammarFlag(false)
|
| 71 |
+
, unknownWordLabelFlag(false)
|
| 72 |
+
, onlyOutputSpanInfo(false)
|
| 73 |
+
, noFileLimit(false)
|
| 74 |
+
//bool zipFiles(false)
|
| 75 |
+
, properConditioning(false)
|
| 76 |
+
, nonTermFirstWord(true)
|
| 77 |
+
, nonTermConsecTarget(true)
|
| 78 |
+
, nonTermConsecSource(false)
|
| 79 |
+
, requireAlignedWord(true)
|
| 80 |
+
, sourceSyntax(false)
|
| 81 |
+
, targetSyntax(false)
|
| 82 |
+
, targetSyntacticPreferences(false)
|
| 83 |
+
, duplicateRules(true)
|
| 84 |
+
, fractionalCounting(true)
|
| 85 |
+
, pcfgScore(false)
|
| 86 |
+
, gzOutput(false)
|
| 87 |
+
, unpairedExtractFormat(false)
|
| 88 |
+
, conditionOnTargetLhs(false)
|
| 89 |
+
, boundaryRules(false)
|
| 90 |
+
, flexScoreFlag(false)
|
| 91 |
+
, phraseOrientation(false) {}
|
| 92 |
+
};
|
| 93 |
+
|
| 94 |
+
}
|
| 95 |
+
|
mosesdecoder/phrase-extract/ScoreFeature.cpp
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2012- University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include <boost/algorithm/string/predicate.hpp>
|
| 21 |
+
#include "ScoreFeature.h"
|
| 22 |
+
#include "DomainFeature.h"
|
| 23 |
+
#include "InternalStructFeature.h"
|
| 24 |
+
|
| 25 |
+
using namespace std;
|
| 26 |
+
using namespace boost::algorithm;
|
| 27 |
+
|
| 28 |
+
namespace MosesTraining
|
| 29 |
+
{
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
const string& ScoreFeatureManager::usage() const
|
| 33 |
+
{
|
| 34 |
+
const static string& usage = "[--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]" ;
|
| 35 |
+
return usage;
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
void ScoreFeatureManager::configure(const std::vector<std::string> args)
|
| 39 |
+
{
|
| 40 |
+
bool domainAdded = false;
|
| 41 |
+
bool sparseDomainAdded = false;
|
| 42 |
+
|
| 43 |
+
for (size_t i = 0; i < args.size(); ++i) {
|
| 44 |
+
if (args[i] == "--IgnoreSentenceId") {
|
| 45 |
+
m_includeSentenceId = true;
|
| 46 |
+
} else if (starts_with(args[i], "--Domain")) {
|
| 47 |
+
string type = args[i].substr(8);
|
| 48 |
+
++i;
|
| 49 |
+
UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
|
| 50 |
+
string domainFile = args[i];
|
| 51 |
+
UTIL_THROW_IF(domainAdded, ScoreFeatureArgumentException,
|
| 52 |
+
"Only allowed one domain feature");
|
| 53 |
+
if (type == "Subset") {
|
| 54 |
+
m_features.push_back(ScoreFeaturePtr(new SubsetDomainFeature(domainFile)));
|
| 55 |
+
} else if (type == "Ratio") {
|
| 56 |
+
m_features.push_back(ScoreFeaturePtr(new RatioDomainFeature(domainFile)));
|
| 57 |
+
} else if (type == "Indicator") {
|
| 58 |
+
m_features.push_back(ScoreFeaturePtr(new IndicatorDomainFeature(domainFile)));
|
| 59 |
+
} else {
|
| 60 |
+
UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
|
| 61 |
+
}
|
| 62 |
+
domainAdded = true;
|
| 63 |
+
m_includeSentenceId = true;
|
| 64 |
+
} else if (starts_with(args[i], "--SparseDomain")) {
|
| 65 |
+
string type = args[i].substr(14);
|
| 66 |
+
++i;
|
| 67 |
+
UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
|
| 68 |
+
string domainFile = args[i];
|
| 69 |
+
UTIL_THROW_IF(sparseDomainAdded, ScoreFeatureArgumentException,
|
| 70 |
+
"Only allowed one sparse domain feature");
|
| 71 |
+
if (type == "Subset") {
|
| 72 |
+
m_features.push_back(ScoreFeaturePtr(new SparseSubsetDomainFeature(domainFile)));
|
| 73 |
+
} else if (type == "Ratio") {
|
| 74 |
+
m_features.push_back(ScoreFeaturePtr(new SparseRatioDomainFeature(domainFile)));
|
| 75 |
+
} else if (type == "Indicator") {
|
| 76 |
+
m_features.push_back(ScoreFeaturePtr(new SparseIndicatorDomainFeature(domainFile)));
|
| 77 |
+
} else {
|
| 78 |
+
UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
|
| 79 |
+
}
|
| 80 |
+
sparseDomainAdded = true;
|
| 81 |
+
m_includeSentenceId = true;
|
| 82 |
+
} else if(args[i] == "--TreeFeatureSparse") {
|
| 83 |
+
//MARIA
|
| 84 |
+
m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureSparse()));
|
| 85 |
+
} else if(args[i] == "--TreeFeatureDense") {
|
| 86 |
+
//MARIA
|
| 87 |
+
m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureDense()));
|
| 88 |
+
} else {
|
| 89 |
+
UTIL_THROW(ScoreFeatureArgumentException,"Unknown score argument " << args[i]);
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
void ScoreFeatureManager::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
| 97 |
+
float count,
|
| 98 |
+
int sentenceId) const
|
| 99 |
+
{
|
| 100 |
+
for (size_t i = 0; i < m_features.size(); ++i) {
|
| 101 |
+
m_features[i]->addPropertiesToPhrasePair(phrasePair, count, sentenceId);
|
| 102 |
+
}
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
void ScoreFeatureManager::addFeatures(const ScoreFeatureContext& context,
|
| 106 |
+
std::vector<float>& denseValues,
|
| 107 |
+
std::map<std::string,float>& sparseValues) const
|
| 108 |
+
{
|
| 109 |
+
for (size_t i = 0; i < m_features.size(); ++i) {
|
| 110 |
+
m_features[i]->add(context, denseValues, sparseValues);
|
| 111 |
+
}
|
| 112 |
+
}
|
| 113 |
+
}
|
| 114 |
+
|
mosesdecoder/phrase-extract/ScoreFeature.h
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2012- University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
/**
|
| 21 |
+
* This contains extra features that can be added to the scorer. To add a new feature:
|
| 22 |
+
* 1. Implement a subclass of ScoreFeature
|
| 23 |
+
* 2. Update ScoreFeatureManager.configure() to configure your feature, and usage() to
|
| 24 |
+
* display usage info.
|
| 25 |
+
* 3. Write unit tests (see ScoreFeatureTest.cpp) and regression tests
|
| 26 |
+
**/
|
| 27 |
+
|
| 28 |
+
#pragma once
|
| 29 |
+
|
| 30 |
+
#include <string>
|
| 31 |
+
#include <map>
|
| 32 |
+
#include <vector>
|
| 33 |
+
|
| 34 |
+
#include <boost/shared_ptr.hpp>
|
| 35 |
+
|
| 36 |
+
#include "util/exception.hh"
|
| 37 |
+
|
| 38 |
+
#include "ExtractionPhrasePair.h"
|
| 39 |
+
|
| 40 |
+
namespace MosesTraining
|
| 41 |
+
{
|
| 42 |
+
|
| 43 |
+
struct MaybeLog {
|
| 44 |
+
MaybeLog(bool useLog, float negativeLog):
|
| 45 |
+
m_useLog(useLog), m_negativeLog(negativeLog) {}
|
| 46 |
+
|
| 47 |
+
inline float operator() (float a) const {
|
| 48 |
+
return m_useLog ? m_negativeLog*log(a) : a;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
bool m_useLog; // flag set from the constructor's bool useLog: when true operator() returns m_negativeLog*log(a), otherwise a unchanged
|
| 52 |
+
float m_negativeLog;
|
| 53 |
+
};
|
| 54 |
+
|
| 55 |
+
class ScoreFeatureArgumentException : public util::Exception
|
| 56 |
+
{
|
| 57 |
+
public:
|
| 58 |
+
ScoreFeatureArgumentException() throw() {
|
| 59 |
+
*this << "Unable to configure features: ";
|
| 60 |
+
}
|
| 61 |
+
~ScoreFeatureArgumentException() throw() {}
|
| 62 |
+
};
|
| 63 |
+
|
| 64 |
+
/** Passed to each feature to be used to calculate its values */
|
| 65 |
+
struct ScoreFeatureContext {
|
| 66 |
+
ScoreFeatureContext(
|
| 67 |
+
const ExtractionPhrasePair &thePhrasePair,
|
| 68 |
+
const MaybeLog& theMaybeLog
|
| 69 |
+
) :
|
| 70 |
+
phrasePair(thePhrasePair),
|
| 71 |
+
maybeLog(theMaybeLog) {
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
const ExtractionPhrasePair &phrasePair;
|
| 75 |
+
MaybeLog maybeLog;
|
| 76 |
+
};
|
| 77 |
+
|
| 78 |
+
/**
|
| 79 |
+
* Abstract base class for extra features that can be added to the phrase table
|
| 80 |
+
* during scoring.
|
| 81 |
+
**/
|
| 82 |
+
class ScoreFeature
|
| 83 |
+
{
|
| 84 |
+
public:
|
| 85 |
+
|
| 86 |
+
/** Some features might need to store properties in ExtractionPhrasePair,
|
| 87 |
+
* e.g. to pass along external information loaded by a feature
|
| 88 |
+
* which may distinguish several phrase occurrences based on sentence ID */
|
| 89 |
+
virtual void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
| 90 |
+
float count,
|
| 91 |
+
int sentenceId) const {};
|
| 92 |
+
|
| 93 |
+
/** Add the values for this score feature. */
|
| 94 |
+
virtual void add(const ScoreFeatureContext& context,
|
| 95 |
+
std::vector<float>& denseValues,
|
| 96 |
+
std::map<std::string,float>& sparseValues) const = 0;
|
| 97 |
+
|
| 98 |
+
virtual ~ScoreFeature() {}
|
| 99 |
+
|
| 100 |
+
};
|
| 101 |
+
|
| 102 |
+
typedef boost::shared_ptr<ScoreFeature> ScoreFeaturePtr;
|
| 103 |
+
class ScoreFeatureManager
|
| 104 |
+
{
|
| 105 |
+
public:
|
| 106 |
+
ScoreFeatureManager():
|
| 107 |
+
m_includeSentenceId(false) {}
|
| 108 |
+
|
| 109 |
+
/** To be appended to the score usage message */
|
| 110 |
+
const std::string& usage() const;
|
| 111 |
+
|
| 112 |
+
/** Pass the unused command-line arguments to configure the extra features */
|
| 113 |
+
void configure(const std::vector<std::string> args);
|
| 114 |
+
|
| 115 |
+
/** Some features might need to store properties in ExtractionPhrasePair,
|
| 116 |
+
* e.g. to pass along external information loaded by a feature
|
| 117 |
+
* which may distinguish several phrase occurrences based on sentence ID */
|
| 118 |
+
void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
|
| 119 |
+
float count,
|
| 120 |
+
int sentenceId) const;
|
| 121 |
+
|
| 122 |
+
/** Add all the features */
|
| 123 |
+
void addFeatures(const ScoreFeatureContext& context,
|
| 124 |
+
std::vector<float>& denseValues,
|
| 125 |
+
std::map<std::string,float>& sparseValues) const;
|
| 126 |
+
|
| 127 |
+
const std::vector<ScoreFeaturePtr>& getFeatures() const {
|
| 128 |
+
return m_features;
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
/** Do we need to include sentence ids in phrase pairs? */
|
| 132 |
+
bool includeSentenceId() const {
|
| 133 |
+
return m_includeSentenceId;
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
private:
|
| 137 |
+
std::vector<ScoreFeaturePtr> m_features;
|
| 138 |
+
bool m_includeSentenceId;
|
| 139 |
+
};
|
| 140 |
+
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
|
mosesdecoder/phrase-extract/ScoreFeatureTest.cpp
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2012- University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "DomainFeature.h"
|
| 21 |
+
#include "ScoreFeature.h"
|
| 22 |
+
#include "tables-core.h"
|
| 23 |
+
|
| 24 |
+
#define BOOST_TEST_MODULE MosesTrainingScoreFeature
|
| 25 |
+
#include <boost/test/test_tools.hpp>
|
| 26 |
+
#include <boost/test/unit_test.hpp>
|
| 27 |
+
#include <boost/assign/list_of.hpp>
|
| 28 |
+
|
| 29 |
+
#include <unordered_set>
|
| 30 |
+
#include <unordered_map>
|
| 31 |
+
|
| 32 |
+
using namespace MosesTraining;
|
| 33 |
+
using namespace std;
|
| 34 |
+
|
| 35 |
+
//pesky global variables
|
| 36 |
+
namespace MosesTraining
|
| 37 |
+
{
|
| 38 |
+
bool hierarchicalFlag = false;
|
| 39 |
+
Vocabulary vcbT;
|
| 40 |
+
Vocabulary vcbS;
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
const char *DomainFileLocation()
|
| 45 |
+
{
|
| 46 |
+
if (boost::unit_test::framework::master_test_suite().argc < 2) {
|
| 47 |
+
return "test.domain";
|
| 48 |
+
}
|
| 49 |
+
return boost::unit_test::framework::master_test_suite().argv[1];
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
BOOST_AUTO_TEST_CASE(manager_configure_domain_except)
|
| 54 |
+
{
|
| 55 |
+
//Check that configure rejects illegal domain arg combinations
|
| 56 |
+
ScoreFeatureManager manager;
|
| 57 |
+
BOOST_CHECK_THROW(
|
| 58 |
+
manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")),
|
| 59 |
+
ScoreFeatureArgumentException);
|
| 60 |
+
BOOST_CHECK_THROW(
|
| 61 |
+
manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")),
|
| 62 |
+
ScoreFeatureArgumentException);
|
| 63 |
+
BOOST_CHECK_THROW(
|
| 64 |
+
manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")),
|
| 65 |
+
ScoreFeatureArgumentException);
|
| 66 |
+
BOOST_CHECK_THROW(
|
| 67 |
+
manager.configure(boost::assign::list_of("--DomainSubset")),
|
| 68 |
+
ScoreFeatureArgumentException);
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
template <class Expected>
|
| 72 |
+
static void checkDomainConfigured(
|
| 73 |
+
const vector<string>& args)
|
| 74 |
+
{
|
| 75 |
+
ScoreFeatureManager manager;
|
| 76 |
+
manager.configure(args);
|
| 77 |
+
const std::vector<ScoreFeaturePtr>& features = manager.getFeatures();
|
| 78 |
+
//BOOST_REQUIRE_EQUAL(features.size(), 2);
|
| 79 |
+
//if I add to features this check will fail?
|
| 80 |
+
BOOST_REQUIRE_EQUAL(features.size(), 1); //MARIA -> what is this check and why does it fail when I add my feature?
|
| 81 |
+
Expected* feature = dynamic_cast<Expected*>(features[0].get());
|
| 82 |
+
BOOST_REQUIRE(feature);
|
| 83 |
+
BOOST_CHECK(manager.includeSentenceId());
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
template<typename T>
|
| 87 |
+
T adder(T v)
|
| 88 |
+
{
|
| 89 |
+
return v;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
template<typename T, typename... Args>
|
| 93 |
+
T adder(T first, Args... args)
|
| 94 |
+
{
|
| 95 |
+
return first + adder(args...);
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
BOOST_AUTO_TEST_CASE(manager_config_domain)
|
| 100 |
+
{
|
| 101 |
+
checkDomainConfigured<RatioDomainFeature>
|
| 102 |
+
(boost::assign::list_of("--DomainRatio")("/dev/null"));
|
| 103 |
+
checkDomainConfigured<IndicatorDomainFeature>
|
| 104 |
+
(boost::assign::list_of("--DomainIndicator")("/dev/null"));
|
| 105 |
+
checkDomainConfigured<SubsetDomainFeature>
|
| 106 |
+
(boost::assign::list_of("--DomainSubset")("/dev/null"));
|
| 107 |
+
checkDomainConfigured<SparseRatioDomainFeature>
|
| 108 |
+
(boost::assign::list_of("--SparseDomainRatio")("/dev/null"));
|
| 109 |
+
checkDomainConfigured<SparseIndicatorDomainFeature>
|
| 110 |
+
(boost::assign::list_of("--SparseDomainIndicator")("/dev/null"));
|
| 111 |
+
checkDomainConfigured<SparseSubsetDomainFeature>
|
| 112 |
+
(boost::assign::list_of("--SparseDomainSubset")("/dev/null"));
|
| 113 |
+
|
| 114 |
+
// C++11 testing
|
| 115 |
+
unordered_set<int> s;
|
| 116 |
+
s.insert(4);
|
| 117 |
+
s.insert(7);
|
| 118 |
+
s.insert(4);
|
| 119 |
+
s.insert(1);
|
| 120 |
+
|
| 121 |
+
for (auto i: s) {
|
| 122 |
+
cerr << i << " ";
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
unordered_map<std::string, int> m;
|
| 126 |
+
m["a"] = 4;
|
| 127 |
+
m["ba"] = 6;
|
| 128 |
+
m["aabc"] = 7;
|
| 129 |
+
|
| 130 |
+
for (auto i: m) {
|
| 131 |
+
cerr << i.first << "=" << i.second << " ";
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
long sum = adder(1, 2, 3, 8, 7);
|
| 135 |
+
|
| 136 |
+
std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy";
|
| 137 |
+
std::string ssum = adder(s1, s2, s3, s4);
|
| 138 |
+
|
| 139 |
+
}
|
| 140 |
+
|
mosesdecoder/phrase-extract/SentenceAlignment.cpp
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "SentenceAlignment.h"
|
| 21 |
+
|
| 22 |
+
#include <map>
|
| 23 |
+
#include <set>
|
| 24 |
+
#include <string>
|
| 25 |
+
|
| 26 |
+
#include "tables-core.h"
|
| 27 |
+
#include "util/tokenize.hh"
|
| 28 |
+
|
| 29 |
+
using namespace std;
|
| 30 |
+
|
| 31 |
+
namespace MosesTraining
|
| 32 |
+
{
|
| 33 |
+
|
| 34 |
+
SentenceAlignment::~SentenceAlignment() {}
|
| 35 |
+
|
| 36 |
+
void addBoundaryWords(vector<string> &phrase)
|
| 37 |
+
{
|
| 38 |
+
phrase.insert(phrase.begin(), "<s>");
|
| 39 |
+
phrase.push_back("</s>");
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
|
| 43 |
+
{
|
| 44 |
+
target = util::tokenize(targetString);
|
| 45 |
+
if (boundaryRules)
|
| 46 |
+
addBoundaryWords(target);
|
| 47 |
+
return true;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
|
| 51 |
+
{
|
| 52 |
+
source = util::tokenize(sourceString);
|
| 53 |
+
if (boundaryRules)
|
| 54 |
+
addBoundaryWords(source);
|
| 55 |
+
return true;
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
bool SentenceAlignment::create(const char targetString[],
|
| 59 |
+
const char sourceString[],
|
| 60 |
+
const char alignmentString[],
|
| 61 |
+
const char weightString[],
|
| 62 |
+
int sentenceID, bool boundaryRules)
|
| 63 |
+
{
|
| 64 |
+
using namespace std;
|
| 65 |
+
this->sentenceID = sentenceID;
|
| 66 |
+
this->weightString = std::string(weightString);
|
| 67 |
+
|
| 68 |
+
// process sentence strings and store in target and source members.
|
| 69 |
+
if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
|
| 70 |
+
return false;
|
| 71 |
+
}
|
| 72 |
+
if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
|
| 73 |
+
return false;
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
// check if sentences are empty
|
| 77 |
+
if (target.size() == 0 || source.size() == 0) {
|
| 78 |
+
cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words << end insentence " << sentenceID << endl;
|
| 79 |
+
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
| 80 |
+
return false;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
// prepare data structures for alignments
|
| 84 |
+
for(size_t i=0; i<source.size(); i++) {
|
| 85 |
+
alignedCountS.push_back( 0 );
|
| 86 |
+
}
|
| 87 |
+
for(size_t i=0; i<target.size(); i++) {
|
| 88 |
+
vector< int > dummy;
|
| 89 |
+
alignedToT.push_back( dummy );
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
// reading in alignments
|
| 93 |
+
vector<string> alignmentSequence = util::tokenize( alignmentString );
|
| 94 |
+
for(size_t i=0; i<alignmentSequence.size(); i++) {
|
| 95 |
+
int s,t;
|
| 96 |
+
// cout << "scaning " << alignmentSequence[i].c_str() << endl;
|
| 97 |
+
if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t)) {
|
| 98 |
+
cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
|
| 99 |
+
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
| 100 |
+
return false;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
if (boundaryRules) {
|
| 104 |
+
++s;
|
| 105 |
+
++t;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
|
| 109 |
+
if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
|
| 110 |
+
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
|
| 111 |
+
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
|
| 112 |
+
return false;
|
| 113 |
+
}
|
| 114 |
+
alignedToT[t].push_back( s );
|
| 115 |
+
alignedCountS[s]++;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
if (boundaryRules) {
|
| 119 |
+
alignedToT[0].push_back(0);
|
| 120 |
+
alignedCountS[0]++;
|
| 121 |
+
|
| 122 |
+
alignedToT.back().push_back(alignedCountS.size() - 1);
|
| 123 |
+
alignedCountS.back()++;
|
| 124 |
+
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
return true;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
void SentenceAlignment::invertAlignment()
|
| 131 |
+
{
|
| 132 |
+
alignedToS.resize(source.size());
|
| 133 |
+
for (size_t targetPos = 0; targetPos < alignedToT.size(); ++targetPos) {
|
| 134 |
+
const std::vector<int> &vec = alignedToT[targetPos];
|
| 135 |
+
for (size_t i = 0; i < vec.size(); ++i) {
|
| 136 |
+
int sourcePos = vec[i];
|
| 137 |
+
alignedToS[sourcePos].push_back(targetPos);
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
}
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
}
|
| 144 |
+
|
mosesdecoder/phrase-extract/SentenceAlignment.h
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
#ifndef SENTENCE_ALIGNMENT_H_INCLUDED_
|
| 22 |
+
#define SENTENCE_ALIGNMENT_H_INCLUDED_
|
| 23 |
+
|
| 24 |
+
#include <string>
|
| 25 |
+
#include <vector>
|
| 26 |
+
|
| 27 |
+
namespace MosesTraining
|
| 28 |
+
{
|
| 29 |
+
|
| 30 |
+
/** One sentence pair together with its word alignment.
 *
 *  All data members are public and are filled in by create().
 */
class SentenceAlignment
{
public:
  //! Target-side tokens, set by processTargetSentence().
  std::vector<std::string> target;
  //! Source-side tokens, set by processSourceSentence().
  std::vector<std::string> source;
  //! alignedCountS[s]: number of alignment points at source position s.
  std::vector<int> alignedCountS;
  //! alignedToT[t]: source positions aligned to target position t.
  std::vector<std::vector<int> > alignedToT;
  //! alignedToS[s]: target positions aligned to source position s
  //! (only populated by invertAlignment()).
  std::vector<std::vector<int> > alignedToS;
  //! Identifier passed to create(); used in diagnostics.
  int sentenceID;
  //! Weight string passed to create(), stored verbatim.
  std::string weightString;

  virtual ~SentenceAlignment();

  //! Turn the raw target sentence into tokens; returns false on failure.
  virtual bool processTargetSentence(const char *, int, bool boundaryRules);

  //! Turn the raw source sentence into tokens; returns false on failure.
  virtual bool processSourceSentence(const char *, int, bool boundaryRules);

  //! Initialize from a sentence pair plus "s-t" alignment points;
  //! returns false if the input is rejected.
  bool create(const char targetString[],
              const char sourceString[],
              const char alignmentString[],
              const char weightString[],
              int sentenceID, bool boundaryRules);

  //! Derive alignedToS from alignedToT.
  void invertAlignment();

};
|
| 55 |
+
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
#endif
|
mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.cpp
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "SentenceAlignmentWithSyntax.h"
|
| 21 |
+
|
| 22 |
+
#include <map>
|
| 23 |
+
#include <set>
|
| 24 |
+
#include <string>
|
| 25 |
+
|
| 26 |
+
#include "tables-core.h"
|
| 27 |
+
#include "XmlException.h"
|
| 28 |
+
#include "XmlTree.h"
|
| 29 |
+
#include "util/tokenize.hh"
|
| 30 |
+
|
| 31 |
+
using namespace std;
|
| 32 |
+
|
| 33 |
+
namespace MosesTraining
|
| 34 |
+
{
|
| 35 |
+
|
| 36 |
+
bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
|
| 37 |
+
{
|
| 38 |
+
if (!m_targetSyntax) {
|
| 39 |
+
return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
string targetStringCPP(targetString);
|
| 43 |
+
try {
|
| 44 |
+
ProcessAndStripXMLTags(targetStringCPP, targetTree,
|
| 45 |
+
m_targetLabelCollection,
|
| 46 |
+
m_targetTopLabelCollection,
|
| 47 |
+
false);
|
| 48 |
+
} catch (const XmlException & e) {
|
| 49 |
+
std::cerr << "WARNING: failed to process target sentence at line "
|
| 50 |
+
<< sentenceID << ": " << e.getMsg() << std::endl;
|
| 51 |
+
return false;
|
| 52 |
+
}
|
| 53 |
+
target = util::tokenize(targetStringCPP);
|
| 54 |
+
return true;
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
|
| 58 |
+
{
|
| 59 |
+
if (!m_sourceSyntax) {
|
| 60 |
+
return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
string sourceStringCPP(sourceString);
|
| 64 |
+
try {
|
| 65 |
+
ProcessAndStripXMLTags(sourceStringCPP, sourceTree,
|
| 66 |
+
m_sourceLabelCollection ,
|
| 67 |
+
m_sourceTopLabelCollection,
|
| 68 |
+
false);
|
| 69 |
+
} catch (const XmlException & e) {
|
| 70 |
+
std::cerr << "WARNING: failed to process source sentence at line "
|
| 71 |
+
<< sentenceID << ": " << e.getMsg() << std::endl;
|
| 72 |
+
return false;
|
| 73 |
+
}
|
| 74 |
+
source = util::tokenize(sourceStringCPP);
|
| 75 |
+
return true;
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
} // namespace
|
mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.h
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include <map>
|
| 23 |
+
#include <set>
|
| 24 |
+
#include <string>
|
| 25 |
+
#include <vector>
|
| 26 |
+
|
| 27 |
+
#include "RuleExtractionOptions.h"
|
| 28 |
+
#include "SentenceAlignment.h"
|
| 29 |
+
#include "SyntaxNodeCollection.h"
|
| 30 |
+
|
| 31 |
+
namespace MosesTraining
|
| 32 |
+
{
|
| 33 |
+
|
| 34 |
+
class SentenceAlignmentWithSyntax : public SentenceAlignment
|
| 35 |
+
{
|
| 36 |
+
public:
|
| 37 |
+
SyntaxNodeCollection targetTree;
|
| 38 |
+
SyntaxNodeCollection sourceTree;
|
| 39 |
+
std::set<std::string> & m_targetLabelCollection;
|
| 40 |
+
std::set<std::string> & m_sourceLabelCollection;
|
| 41 |
+
std::map<std::string, int> & m_targetTopLabelCollection;
|
| 42 |
+
std::map<std::string, int> & m_sourceTopLabelCollection;
|
| 43 |
+
const bool m_targetSyntax, m_sourceSyntax;
|
| 44 |
+
|
| 45 |
+
SentenceAlignmentWithSyntax(std::set<std::string> & tgtLabelColl,
|
| 46 |
+
std::set<std::string> & srcLabelColl,
|
| 47 |
+
std::map<std::string,int> & tgtTopLabelColl,
|
| 48 |
+
std::map<std::string,int> & srcTopLabelColl,
|
| 49 |
+
bool targetSyntax,
|
| 50 |
+
bool sourceSyntax)
|
| 51 |
+
: m_targetLabelCollection(tgtLabelColl)
|
| 52 |
+
, m_sourceLabelCollection(srcLabelColl)
|
| 53 |
+
, m_targetTopLabelCollection(tgtTopLabelColl)
|
| 54 |
+
, m_sourceTopLabelCollection(srcTopLabelColl)
|
| 55 |
+
, m_targetSyntax(targetSyntax)
|
| 56 |
+
, m_sourceSyntax(sourceSyntax) {
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
virtual ~SentenceAlignmentWithSyntax() {}
|
| 60 |
+
|
| 61 |
+
bool
|
| 62 |
+
processTargetSentence(const char *, int, bool boundaryRules);
|
| 63 |
+
|
| 64 |
+
bool
|
| 65 |
+
processSourceSentence(const char *, int, bool boundaryRules);
|
| 66 |
+
};
|
| 67 |
+
|
| 68 |
+
}
|
| 69 |
+
|
mosesdecoder/phrase-extract/SyntaxNode.h
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2009 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include <map>
|
| 23 |
+
#include <string>
|
| 24 |
+
|
| 25 |
+
namespace MosesTraining
|
| 26 |
+
{
|
| 27 |
+
|
| 28 |
+
/*! A single node of a syntactic structure (tree, lattice, etc.).
 *  Each node carries a label, a span given by a start and an end position,
 *  and an open-ended set of name/value attributes (initially empty).
 */
struct SyntaxNode {
  typedef std::map<std::string, std::string> AttributeMap;

  //! Create a node with the given label and span and no attributes.
  SyntaxNode(const std::string &label_, int start_, int end_)
    : label(label_), start(start_), end(end_), attributes()
  {
  }

  std::string label;        //!< node label
  int start;                //!< start position of the span
  int end;                  //!< end position of the span
  AttributeMap attributes;  //!< arbitrary name/value pairs
};
|
| 45 |
+
|
| 46 |
+
} // namespace MosesTraining
|
mosesdecoder/phrase-extract/SyntaxNodeCollection.cpp
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2009 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
#include "SyntaxNodeCollection.h"
|
| 22 |
+
|
| 23 |
+
#include <cassert>
|
| 24 |
+
#include <iostream>
|
| 25 |
+
|
| 26 |
+
namespace MosesTraining
|
| 27 |
+
{
|
| 28 |
+
|
| 29 |
+
// Destroys the collection; Clear() deletes the SyntaxNodes it holds.
SyntaxNodeCollection::~SyntaxNodeCollection()
{
  this->Clear();
}
|
| 33 |
+
|
| 34 |
+
void SyntaxNodeCollection::Clear()
|
| 35 |
+
{
|
| 36 |
+
// loop through all m_nodes, delete them
|
| 37 |
+
for(size_t i=0; i<m_nodes.size(); i++) {
|
| 38 |
+
delete m_nodes[i];
|
| 39 |
+
}
|
| 40 |
+
m_nodes.clear();
|
| 41 |
+
m_index.clear();
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
// Allocates a SyntaxNode for the span [startPos, endPos] with the given
// label, registers it in every index and returns it. The collection keeps
// the pointer in m_nodes, which Clear() later deletes.
SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
    const std::string &label)
{
  SyntaxNode *const node = new SyntaxNode(label, startPos, endPos);

  m_nodes.push_back(node);
  m_index[startPos][endPos].push_back(node);
  m_endPositionsIndex[endPos].push_back(node);
  // TODO: may not need this: access m_index by startPos and iterate over its
  // InnerNodeIndex (= end positions)?
  m_startPositionsIndex[startPos].push_back(node);

  // The word count is one more than the largest end position seen so far.
  const int wordsNeeded = endPos + 1;
  if (m_numWords < wordsNeeded) {
    m_numWords = wordsNeeded;
  }
  return node;
}
|
| 55 |
+
|
| 56 |
+
bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
|
| 57 |
+
{
|
| 58 |
+
return GetNodes( startPos, endPos).size() > 0;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
|
| 62 |
+
int startPos, int endPos ) const
|
| 63 |
+
{
|
| 64 |
+
NodeIndex::const_iterator startIndex = m_index.find( startPos );
|
| 65 |
+
if (startIndex == m_index.end() )
|
| 66 |
+
return m_emptyNode;
|
| 67 |
+
|
| 68 |
+
InnerNodeIndex::const_iterator endIndex = startIndex->second.find( endPos );
|
| 69 |
+
if (endIndex == startIndex->second.end())
|
| 70 |
+
return m_emptyNode;
|
| 71 |
+
|
| 72 |
+
return endIndex->second;
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
bool SyntaxNodeCollection::HasNodeStartingAtPosition( int startPos ) const
|
| 76 |
+
{
|
| 77 |
+
return GetNodesByStartPosition(startPos).size() > 0;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByStartPosition(
|
| 81 |
+
int startPos ) const
|
| 82 |
+
{
|
| 83 |
+
InnerNodeIndex::const_iterator startIndex = m_startPositionsIndex.find( startPos );
|
| 84 |
+
if (startIndex == m_startPositionsIndex.end() )
|
| 85 |
+
return m_emptyNode;
|
| 86 |
+
|
| 87 |
+
return startIndex->second;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
bool SyntaxNodeCollection::HasNodeEndingAtPosition( int endPos ) const
|
| 91 |
+
{
|
| 92 |
+
return GetNodesByEndPosition(endPos).size() > 0;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByEndPosition(
|
| 96 |
+
int endPos ) const
|
| 97 |
+
{
|
| 98 |
+
InnerNodeIndex::const_iterator endIndex = m_endPositionsIndex.find( endPos );
|
| 99 |
+
if (endIndex == m_endPositionsIndex.end() )
|
| 100 |
+
return m_emptyNode;
|
| 101 |
+
|
| 102 |
+
return endIndex->second;
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
|
| 106 |
+
{
|
| 107 |
+
std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
|
| 108 |
+
|
| 109 |
+
// Create a SyntaxTree object for each SyntaxNode.
|
| 110 |
+
for (std::vector<SyntaxNode*>::const_iterator p = m_nodes.begin();
|
| 111 |
+
p != m_nodes.end(); ++p) {
|
| 112 |
+
nodeToTree[*p] = new SyntaxTree(**p);
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
// Connect the SyntaxTrees.
|
| 116 |
+
typedef NodeIndex::const_iterator OuterIterator;
|
| 117 |
+
typedef InnerNodeIndex::const_reverse_iterator InnerIterator;
|
| 118 |
+
|
| 119 |
+
SyntaxTree *root = 0;
|
| 120 |
+
SyntaxNode *prevNode = 0;
|
| 121 |
+
SyntaxTree *prevTree = 0;
|
| 122 |
+
// Iterate over all start indices from lowest to highest.
|
| 123 |
+
for (OuterIterator p = m_index.begin(); p != m_index.end(); ++p) {
|
| 124 |
+
const InnerNodeIndex &inner = p->second;
|
| 125 |
+
// Iterate over all end indices from highest to lowest.
|
| 126 |
+
for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
|
| 127 |
+
const std::vector<SyntaxNode*> &nodes = q->second;
|
| 128 |
+
// Iterate over all nodes that cover the same span in order of tree
|
| 129 |
+
// depth, top-most first.
|
| 130 |
+
for (std::vector<SyntaxNode*>::const_reverse_iterator r = nodes.rbegin();
|
| 131 |
+
r != nodes.rend(); ++r) {
|
| 132 |
+
SyntaxNode *node = *r;
|
| 133 |
+
SyntaxTree *tree = nodeToTree[node];
|
| 134 |
+
if (!prevNode) {
|
| 135 |
+
// node is the root.
|
| 136 |
+
root = tree;
|
| 137 |
+
tree->parent() = 0;
|
| 138 |
+
} else if (prevNode->start == node->start) {
|
| 139 |
+
// prevNode is the parent of node.
|
| 140 |
+
assert(prevNode->end >= node->end);
|
| 141 |
+
tree->parent() = prevTree;
|
| 142 |
+
prevTree->children().push_back(tree);
|
| 143 |
+
} else {
|
| 144 |
+
// prevNode is a descendant of node's parent. The lowest common
|
| 145 |
+
// ancestor of prevNode and node will be node's parent.
|
| 146 |
+
SyntaxTree *ancestor = prevTree->parent();
|
| 147 |
+
while (ancestor->value().end < tree->value().end) {
|
| 148 |
+
ancestor = ancestor->parent();
|
| 149 |
+
}
|
| 150 |
+
assert(ancestor);
|
| 151 |
+
tree->parent() = ancestor;
|
| 152 |
+
ancestor->children().push_back(tree);
|
| 153 |
+
}
|
| 154 |
+
prevNode = node;
|
| 155 |
+
prevTree = tree;
|
| 156 |
+
}
|
| 157 |
+
}
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
return std::auto_ptr<SyntaxTree>(root);
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
} // namespace MosesTraining
|
mosesdecoder/phrase-extract/SyntaxNodeCollection.h
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2009 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include <map>
|
| 23 |
+
#include <memory>
|
| 24 |
+
#include <sstream>
|
| 25 |
+
#include <string>
|
| 26 |
+
#include <vector>
|
| 27 |
+
|
| 28 |
+
#include "SyntaxNode.h"
|
| 29 |
+
#include "SyntaxTree.h"
|
| 30 |
+
|
| 31 |
+
namespace MosesTraining
|
| 32 |
+
{
|
| 33 |
+
|
| 34 |
+
/** A collection of SyntaxNodes organized by start and end position.
|
| 35 |
+
*
|
| 36 |
+
*/
|
| 37 |
+
class SyntaxNodeCollection
|
| 38 |
+
{
|
| 39 |
+
public:
|
| 40 |
+
SyntaxNodeCollection() : m_numWords(0) {}
|
| 41 |
+
|
| 42 |
+
~SyntaxNodeCollection();
|
| 43 |
+
|
| 44 |
+
//! Construct and insert a new SyntaxNode.
|
| 45 |
+
SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
|
| 46 |
+
|
| 47 |
+
//! Return true iff there are one or more SyntaxNodes with the given span.
|
| 48 |
+
bool HasNode( int startPos, int endPos ) const;
|
| 49 |
+
|
| 50 |
+
//! Lookup the SyntaxNodes for a given span.
|
| 51 |
+
const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
|
| 52 |
+
|
| 53 |
+
bool HasNodeStartingAtPosition( int startPos ) const;
|
| 54 |
+
const std::vector< SyntaxNode* >& GetNodesByStartPosition( int startPos ) const;
|
| 55 |
+
bool HasNodeEndingAtPosition( int endPos ) const;
|
| 56 |
+
const std::vector< SyntaxNode* >& GetNodesByEndPosition( int endPos ) const;
|
| 57 |
+
|
| 58 |
+
//! Get a vector of pointers to all SyntaxNodes (unordered).
|
| 59 |
+
const std::vector< SyntaxNode* >& GetAllNodes() {
|
| 60 |
+
return m_nodes;
|
| 61 |
+
};
|
| 62 |
+
|
| 63 |
+
//! Get the number of words (defined as 1 + the max end pos of any node).
|
| 64 |
+
std::size_t GetNumWords() const {
|
| 65 |
+
return m_numWords;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
//! Clear the container (this deletes the SyntaxNodes).
|
| 69 |
+
void Clear();
|
| 70 |
+
|
| 71 |
+
//! Extract a SyntaxTree (assuming the collection's nodes constitute a tree).
|
| 72 |
+
std::auto_ptr<SyntaxTree> ExtractTree();
|
| 73 |
+
|
| 74 |
+
private:
|
| 75 |
+
typedef std::map< int, std::vector< SyntaxNode* > > InnerNodeIndex;
|
| 76 |
+
typedef std::map< int, InnerNodeIndex > NodeIndex;
|
| 77 |
+
|
| 78 |
+
// Not copyable.
|
| 79 |
+
SyntaxNodeCollection(const SyntaxNodeCollection &);
|
| 80 |
+
SyntaxNodeCollection &operator=(const SyntaxNodeCollection &);
|
| 81 |
+
|
| 82 |
+
std::vector< SyntaxNode* > m_nodes;
|
| 83 |
+
NodeIndex m_index;
|
| 84 |
+
int m_numWords;
|
| 85 |
+
std::vector< SyntaxNode* > m_emptyNode;
|
| 86 |
+
|
| 87 |
+
InnerNodeIndex m_endPositionsIndex;
|
| 88 |
+
InnerNodeIndex m_startPositionsIndex;
|
| 89 |
+
};
|
| 90 |
+
|
| 91 |
+
} // namespace MosesTraining
|
mosesdecoder/phrase-extract/SyntaxTree.h
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include "syntax-common/tree.h"
|
| 4 |
+
|
| 5 |
+
#include "SyntaxNode.h"
|
| 6 |
+
|
| 7 |
+
namespace MosesTraining
|
| 8 |
+
{
|
| 9 |
+
|
| 10 |
+
//! The Syntax::Tree template (from syntax-common/tree.h) instantiated with
//! SyntaxNode as its value type.
typedef Syntax::Tree<SyntaxNode> SyntaxTree;
|
| 11 |
+
|
| 12 |
+
} // namespace MosesTraining
|
mosesdecoder/phrase-extract/XmlException.h
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
#pragma once
|
| 20 |
+
#ifndef XMLEXCEPTION_H_INCLUDED_
|
| 21 |
+
#define XMLEXCEPTION_H_INCLUDED_
|
| 22 |
+
|
| 23 |
+
#include <string>
|
| 24 |
+
|
| 25 |
+
namespace MosesTraining
|
| 26 |
+
{
|
| 27 |
+
|
| 28 |
+
/** Exception type carrying a human-readable message; callers read the
 *  message back through getMsg().
 */
class XmlException
{
public:
  //! Store a copy of msg as the message of this exception.
  XmlException(const std::string & msg) : m_msg(msg) {}

  //! The message given at construction time.
  const std::string & getMsg() const { return m_msg; }

private:
  std::string m_msg;
};
|
| 43 |
+
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
#endif
|
mosesdecoder/phrase-extract/XmlTree.cpp
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2006 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include <cassert>
|
| 21 |
+
#include <vector>
|
| 22 |
+
#include <string>
|
| 23 |
+
#include <set>
|
| 24 |
+
#include <iostream>
|
| 25 |
+
#include <cstdlib>
|
| 26 |
+
#include <sstream>
|
| 27 |
+
|
| 28 |
+
#include "SyntaxNodeCollection.h"
|
| 29 |
+
#include "XmlException.h"
|
| 30 |
+
|
| 31 |
+
using namespace std;
|
| 32 |
+
|
| 33 |
+
namespace MosesTraining
|
| 34 |
+
{
|
| 35 |
+
|
| 36 |
+
/**
 * Split a string into the maximal runs of characters that are not delimiters.
 *
 * Leading, trailing and repeated delimiters never produce empty tokens.
 *
 * \param str        text to split
 * \param delimiters set of characters that separate tokens
 * \return the tokens in order of appearance (empty if str holds no token)
 */
inline std::vector<std::string> Tokenize(const std::string& str,
    const std::string& delimiters = " \t")
{
  typedef std::string::size_type Pos;

  std::vector<std::string> result;

  // tokenBegin always points at the first character of the next token,
  // or is npos once only delimiters (or nothing) remain.
  Pos tokenBegin = str.find_first_not_of(delimiters);
  while (tokenBegin != std::string::npos) {
    // tokenEnd is npos for the last token; substr then runs to the end.
    const Pos tokenEnd = str.find_first_of(delimiters, tokenBegin);
    result.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));
    tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
  }

  return result;
}
|
| 56 |
+
|
| 57 |
+
/**
 * Remove leading and trailing characters that belong to dropChars.
 *
 * \param str       text to trim
 * \param dropChars set of characters to strip from both ends
 * \return the trimmed copy; empty if str consists only of dropChars
 */
std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
{
  const std::string::size_type first = str.find_first_not_of(dropChars);
  if (first == std::string::npos) {
    // nothing but droppable characters (or an empty string)
    return std::string();
  }
  const std::string::size_type last = str.find_last_not_of(dropChars);
  return str.substr(first, last - first + 1);
}
|
| 63 |
+
|
| 64 |
+
/**
 * Extract the value of one attribute from the content of an XML tag.
 *
 * The attribute is located by searching for attributeName followed by
 * ="; the value runs up to the next double quote that is not preceded by
 * a backslash. Backslash escapes are NOT removed from the returned value.
 *
 * \param tag           tag content, e.g. label="NP" span="1-2"
 * \param attributeName name of the attribute to look up
 * \return the raw attribute value; an empty string if the attribute is
 *         absent, has an empty value, or is missing its closing quote
 *         (the latter is also reported on stderr)
 */
std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName)
{
  /*TODO deal with unescaping \"*/
  const std::string tagOpen = attributeName + "=\"";
  std::string::size_type contentsStart = tag.find(tagOpen);
  if (contentsStart == std::string::npos) return "";
  contentsStart += tagOpen.size();

  // Start looking for the closing quote at the first character of the value.
  // Searching from contentsStart+1 would step over the closing quote of an
  // empty value (attr="") and return text belonging to the next attribute.
  std::string::size_type contentsEnd = tag.find('"', contentsStart);
  if (contentsEnd == std::string::npos) {
    std::cerr << "Malformed XML attribute: "<< tag;
    return "";
  }

  // A quote preceded by a backslash is escaped: keep scanning for the next
  // one. If none follows, the escaped quote is used as the end of the value.
  std::string::size_type possibleEnd;
  while (tag.at(contentsEnd-1) == '\\' &&
         (possibleEnd = tag.find('"', contentsEnd+1)) != std::string::npos) {
    contentsEnd = possibleEnd;
  }
  return tag.substr(contentsStart, contentsEnd-contentsStart);
}
|
| 82 |
+
|
| 83 |
+
// s should be a sequence of name=attribute pairs separated by whitespace.
|
| 84 |
+
// e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\""
|
| 85 |
+
void ParseXmlTagAttributes(const std::string &s,
|
| 86 |
+
std::map<std::string, std::string> &attributes)
|
| 87 |
+
{
|
| 88 |
+
std::size_t begin = 0;
|
| 89 |
+
while (true) {
|
| 90 |
+
std::size_t pos = s.find('=', begin);
|
| 91 |
+
if (pos == std::string::npos) {
|
| 92 |
+
return;
|
| 93 |
+
}
|
| 94 |
+
std::string name = Trim(s.substr(begin, pos-begin));
|
| 95 |
+
begin = s.find('"', pos+1);
|
| 96 |
+
if (begin == std::string::npos) {
|
| 97 |
+
throw XmlException("invalid tag content");
|
| 98 |
+
}
|
| 99 |
+
pos = s.find('"', begin+1);
|
| 100 |
+
if (pos == std::string::npos) {
|
| 101 |
+
throw XmlException("invalid tag content");
|
| 102 |
+
}
|
| 103 |
+
while (s[pos-1] == '\\') {
|
| 104 |
+
pos = s.find('"', pos+1);
|
| 105 |
+
if (pos == std::string::npos) {
|
| 106 |
+
throw XmlException("invalid tag content");
|
| 107 |
+
}
|
| 108 |
+
}
|
| 109 |
+
if (name != "label" && name != "span") {
|
| 110 |
+
attributes[name] = s.substr(begin+1, pos-begin-1);
|
| 111 |
+
}
|
| 112 |
+
begin = pos+1;
|
| 113 |
+
}
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
/**
 * Strip the enclosing "<" and ">" from an XML tag.
 *
 * \param str xml token to be stripped
 * \return str without its first and last character when it starts with
 *         '<' and ends with '>'; otherwise str unchanged
 */
std::string TrimXml(const std::string& str)
{
  const std::string::size_type len = str.size();

  // needs at least "<>" and both brackets in place to count as a tag
  const bool isBracketed = len >= 2 && str[0] == '<' && str[len - 1] == '>';

  return isBracketed ? str.substr(1, len - 2) : str;
}
|
| 135 |
+
|
| 136 |
+
/**
 * Tell whether a token is an XML tag, i.e. whether it starts with "<".
 *
 * \param tag token to be checked
 * \return true if the first character is '<'; false otherwise, including
 *         for an empty token
 */
bool isXmlTag(const std::string& tag)
{
  if (tag.empty()) {
    return false;
  }
  return tag[0] == '<';
}
|
| 145 |
+
|
| 146 |
+
/**
|
| 147 |
+
* Unescape XML special characters.
|
| 148 |
+
*/
|
| 149 |
+
string unescape(const string& str)
|
| 150 |
+
{
|
| 151 |
+
string s;
|
| 152 |
+
s.reserve(str.size());
|
| 153 |
+
string::size_type n;
|
| 154 |
+
string::size_type start = 0;
|
| 155 |
+
while ((n = str.find('&', start)) != string::npos) {
|
| 156 |
+
s += str.substr(start, n-start);
|
| 157 |
+
string::size_type end = str.find(';', n);
|
| 158 |
+
assert(n != string::npos);
|
| 159 |
+
string name = str.substr(n+1, end-n-1);
|
| 160 |
+
if (name == "lt") {
|
| 161 |
+
s += string("<");
|
| 162 |
+
} else if (name == "gt") {
|
| 163 |
+
s += string(">");
|
| 164 |
+
} else if (name == "#91") {
|
| 165 |
+
s += string("[");
|
| 166 |
+
} else if (name == "#93") {
|
| 167 |
+
s += string("]");
|
| 168 |
+
} else if (name == "bra") {
|
| 169 |
+
s += string("[");
|
| 170 |
+
} else if (name == "ket") {
|
| 171 |
+
s += string("]");
|
| 172 |
+
} else if (name == "bar" || name == "#124") {
|
| 173 |
+
s += string("|");
|
| 174 |
+
} else if (name == "amp") {
|
| 175 |
+
s += string("&");
|
| 176 |
+
} else if (name == "apos") {
|
| 177 |
+
s += string("'");
|
| 178 |
+
} else if (name == "quot") {
|
| 179 |
+
s += string("\"");
|
| 180 |
+
} else {
|
| 181 |
+
// Currently only handles the following five XML escape sequences:
|
| 182 |
+
// < <
|
| 183 |
+
// > >
|
| 184 |
+
// & &
|
| 185 |
+
// ' '
|
| 186 |
+
// " "
|
| 187 |
+
// Numeric character references (like ö) are not supported.
|
| 188 |
+
std::ostringstream msg;
|
| 189 |
+
msg << "unsupported XML escape sequence: &" << name << ";";
|
| 190 |
+
throw XmlException(msg.str());
|
| 191 |
+
}
|
| 192 |
+
if (end == str.size()-1) {
|
| 193 |
+
return s;
|
| 194 |
+
}
|
| 195 |
+
start = end + 1;
|
| 196 |
+
}
|
| 197 |
+
s += str.substr(start);
|
| 198 |
+
return s;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
/**
 * Split up the input character string into tokens made up of
 * either XML tags or text.
 * example: this <b> is a </b> test .
 * => (this ), (<b>), ( is a ), (</b>), ( test .)
 *
 * A tag runs from a "<" to the next ">". Text between tags is kept verbatim,
 * including its whitespace; empty text pieces are not emitted.
 *
 * \param str input string
 * \return the tokens in order. If a "<" has no matching ">", an error is
 *         written to stderr and only the tokens collected before that
 *         point are returned.
 */
std::vector<std::string> TokenizeXml(const std::string& str)
{
  typedef std::string::size_type Pos;

  std::vector<std::string> pieces;
  Pos cursor = 0;  // first character not yet assigned to a token

  while (cursor != str.size()) {
    const Pos tagOpen = str.find('<', cursor);
    if (tagOpen == std::string::npos) {
      // no more tags: the remainder is one text token
      pieces.push_back(str.substr(cursor));
      break;
    }

    const Pos tagClose = str.find('>', tagOpen);
    if (tagClose == std::string::npos) {
      // sanity check: there has to be a closing ">"
      std::cerr << "ERROR: malformed XML: " << str << std::endl;
      return pieces;
    }

    // text in front of the tag, if there is any
    if (tagOpen != cursor) {
      pieces.push_back(str.substr(cursor, tagOpen - cursor));
    }

    // the tag itself, brackets included
    pieces.push_back(str.substr(tagOpen, tagClose - tagOpen + 1));
    cursor = tagClose + 1;
  }

  return pieces;
}
|
| 246 |
+
|
| 247 |
+
/**
|
| 248 |
+
* Process a sentence with XML-style annotation of syntactic nodes.
|
| 249 |
+
*
|
| 250 |
+
* \param line[in,out] in: sentence, out: sentence without the XML
|
| 251 |
+
* \param nodeCollection[out] the collection of SyntaxNode objects for this
|
| 252 |
+
* sentence
|
| 253 |
+
* \param labelCollection[out] label values are inserted into this set
|
| 254 |
+
* \param topLabelCollection[out] top labels (key) and their counts (value)
|
| 255 |
+
* are inserted into this map
|
| 256 |
+
* \param unescapeSpecialChars flag indicating whether XML special characters
|
| 257 |
+
* should be unescaped
|
| 258 |
+
*/
|
| 259 |
+
bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
|
| 260 |
+
set< string > &labelCollection,
|
| 261 |
+
map< string, int > &topLabelCollection,
|
| 262 |
+
bool unescapeSpecialChars )
|
| 263 |
+
{
|
| 264 |
+
//parse XML markup in translation line
|
| 265 |
+
|
| 266 |
+
// no xml tag? we're done.
|
| 267 |
+
if (line.find_first_of('<') == string::npos) {
|
| 268 |
+
return true;
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
// break up input into a vector of xml tags and text
|
| 272 |
+
// example: (this), (<b>), (is a), (</b>), (test .)
|
| 273 |
+
vector<string> xmlTokens = TokenizeXml(line);
|
| 274 |
+
|
| 275 |
+
// we need to store opened tags, until they are closed
|
| 276 |
+
// tags are stored as tripled (tagname, startpos, contents)
|
| 277 |
+
typedef pair< string, pair< size_t, string > > OpenedTag;
|
| 278 |
+
vector< OpenedTag > tagStack; // stack that contains active opened tags
|
| 279 |
+
|
| 280 |
+
string cleanLine; // return string (text without xml)
|
| 281 |
+
size_t wordPos = 0; // position in sentence (in terms of number of words)
|
| 282 |
+
|
| 283 |
+
// loop through the tokens
|
| 284 |
+
for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
|
| 285 |
+
// not a xml tag, but regular text (may contain many words)
|
| 286 |
+
if(!isXmlTag(xmlTokens[xmlTokenPos])) {
|
| 287 |
+
// add a space at boundary, if necessary
|
| 288 |
+
if (cleanLine.size()>0 &&
|
| 289 |
+
cleanLine[cleanLine.size() - 1] != ' ' &&
|
| 290 |
+
xmlTokens[xmlTokenPos][0] != ' ') {
|
| 291 |
+
cleanLine += " ";
|
| 292 |
+
}
|
| 293 |
+
// add words to output
|
| 294 |
+
if (unescapeSpecialChars) {
|
| 295 |
+
cleanLine += unescape(xmlTokens[xmlTokenPos]);
|
| 296 |
+
} else {
|
| 297 |
+
cleanLine += xmlTokens[xmlTokenPos];
|
| 298 |
+
}
|
| 299 |
+
wordPos = Tokenize(cleanLine).size(); // count all the words
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
// process xml tag
|
| 303 |
+
else {
|
| 304 |
+
// *** get essential information about tag ***
|
| 305 |
+
|
| 306 |
+
// strip extra boundary spaces and "<" and ">"
|
| 307 |
+
string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
|
| 308 |
+
// cerr << "XML TAG IS: " << tag << std::endl;
|
| 309 |
+
|
| 310 |
+
if (tag.size() == 0) {
|
| 311 |
+
cerr << "ERROR: empty tag name: " << line << endl;
|
| 312 |
+
return false;
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
// check if unary (e.g., "<wall/>")
|
| 316 |
+
bool isUnary = ( tag[tag.size() - 1] == '/' );
|
| 317 |
+
|
| 318 |
+
// check if opening tag (e.g. "<a>", not "</a>")g
|
| 319 |
+
bool isClosed = ( tag[0] == '/' );
|
| 320 |
+
bool isOpen = !isClosed;
|
| 321 |
+
|
| 322 |
+
if (isClosed && isUnary) {
|
| 323 |
+
cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
|
| 324 |
+
return false;
|
| 325 |
+
}
|
| 326 |
+
|
| 327 |
+
if (isClosed)
|
| 328 |
+
tag = tag.substr(1); // remove "/" at the beginning
|
| 329 |
+
if (isUnary)
|
| 330 |
+
tag = tag.substr(0,tag.size()-1); // remove "/" at the end
|
| 331 |
+
|
| 332 |
+
// find the tag name and contents
|
| 333 |
+
string::size_type endOfName = tag.find_first_of(' ');
|
| 334 |
+
string tagName = tag;
|
| 335 |
+
string tagContent = "";
|
| 336 |
+
if (endOfName != string::npos) {
|
| 337 |
+
tagName = tag.substr(0,endOfName);
|
| 338 |
+
tagContent = tag.substr(endOfName+1);
|
| 339 |
+
}
|
| 340 |
+
|
| 341 |
+
// *** process new tag ***
|
| 342 |
+
|
| 343 |
+
if (isOpen || isUnary) {
|
| 344 |
+
// put the tag on the tag stack
|
| 345 |
+
OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
|
| 346 |
+
tagStack.push_back( openedTag );
|
| 347 |
+
// cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
// *** process completed tag ***
|
| 351 |
+
|
| 352 |
+
if (isClosed || isUnary) {
|
| 353 |
+
// pop last opened tag from stack;
|
| 354 |
+
if (tagStack.size() == 0) {
|
| 355 |
+
cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
|
| 356 |
+
return false;
|
| 357 |
+
}
|
| 358 |
+
OpenedTag openedTag = tagStack.back();
|
| 359 |
+
tagStack.pop_back();
|
| 360 |
+
|
| 361 |
+
// tag names have to match
|
| 362 |
+
if (openedTag.first != tagName) {
|
| 363 |
+
cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
|
| 364 |
+
return false;
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
// assemble remaining information about tag
|
| 368 |
+
size_t startPos = openedTag.second.first;
|
| 369 |
+
string tagContent = openedTag.second.second;
|
| 370 |
+
size_t endPos = wordPos;
|
| 371 |
+
|
| 372 |
+
// span attribute overwrites position
|
| 373 |
+
string span = ParseXmlTagAttribute(tagContent,"span");
|
| 374 |
+
if (! span.empty()) {
|
| 375 |
+
vector<string> ij = Tokenize(span, "-");
|
| 376 |
+
if (ij.size() != 1 && ij.size() != 2) {
|
| 377 |
+
cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
|
| 378 |
+
return false;
|
| 379 |
+
}
|
| 380 |
+
startPos = atoi(ij[0].c_str());
|
| 381 |
+
if (ij.size() == 1) endPos = startPos + 1;
|
| 382 |
+
else endPos = atoi(ij[1].c_str()) + 1;
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
// cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;
|
| 386 |
+
|
| 387 |
+
if (startPos > endPos) {
|
| 388 |
+
cerr << "ERROR: tag " << tagName << " startPos is bigger than endPos (" << startPos << "-" << endPos << "): " << line << endl;
|
| 389 |
+
return false;
|
| 390 |
+
} else if (startPos == endPos) {
|
| 391 |
+
cerr << "WARNING: tag " << tagName << ". Ignoring 0 span (" << startPos << "-" << endPos << "): " << line << endl;
|
| 392 |
+
continue;
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
string label = ParseXmlTagAttribute(tagContent,"label");
|
| 396 |
+
labelCollection.insert( label );
|
| 397 |
+
|
| 398 |
+
// report what we have processed so far
|
| 399 |
+
if (0) {
|
| 400 |
+
cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
|
| 401 |
+
cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
|
| 402 |
+
cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
|
| 403 |
+
}
|
| 404 |
+
SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label );
|
| 405 |
+
ParseXmlTagAttributes(tagContent, node->attributes);
|
| 406 |
+
}
|
| 407 |
+
}
|
| 408 |
+
}
|
| 409 |
+
// we are done. check if there are tags that are still open
|
| 410 |
+
if (tagStack.size() > 0) {
|
| 411 |
+
cerr << "ERROR: some opened tags were never closed: " << line << endl;
|
| 412 |
+
return false;
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
// collect top labels
|
| 416 |
+
const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 );
|
| 417 |
+
for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
|
| 418 |
+
SyntaxNode *n = *node;
|
| 419 |
+
const string &label = n->label;
|
| 420 |
+
if (topLabelCollection.find( label ) == topLabelCollection.end())
|
| 421 |
+
topLabelCollection[ label ] = 0;
|
| 422 |
+
topLabelCollection[ label ]++;
|
| 423 |
+
}
|
| 424 |
+
|
| 425 |
+
// return de-xml'ed sentence in line
|
| 426 |
+
line = cleanLine;
|
| 427 |
+
return true;
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
}
|
mosesdecoder/phrase-extract/XmlTree.h
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2006 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#pragma once
|
| 21 |
+
|
| 22 |
+
#include <string>
|
| 23 |
+
#include <vector>
|
| 24 |
+
#include <set>
|
| 25 |
+
#include <map>
|
| 26 |
+
|
| 27 |
+
#include "SyntaxNodeCollection.h"
|
| 28 |
+
|
| 29 |
+
namespace MosesTraining
|
| 30 |
+
{
|
| 31 |
+
|
| 32 |
+
std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
|
| 33 |
+
std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r");
|
| 34 |
+
std::string TrimXml(const std::string& str);
|
| 35 |
+
bool isXmlTag(const std::string& tag);
|
| 36 |
+
std::vector<std::string> TokenizeXml(const std::string& str);
|
| 37 |
+
bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true);
|
| 38 |
+
std::string unescape(const std::string &str);
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
} // namespace MosesTraining
|
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ExtractionPhrasePair.o
ADDED
|
Binary file (116 kB). View file
|
|
|
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52c08921a91130c8d12538e7afc1d9f9d47f1c6e041cd15ad2243bbd64fc7a45
|
| 3 |
+
size 10954640
|
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.o
ADDED
|
Binary file (149 kB). View file
|
|
|
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.output
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Boost.Test WARNING: token "phrase-extract/test.domain" does not correspond to the Boost.Test argument
|
| 2 |
+
and should be placed after all Boost.Test arguments and the -- separator.
|
| 3 |
+
For example: ScoreFeatureTest --random -- phrase-extract/test.domain
|
| 4 |
+
Running 2 test cases...
|
| 5 |
+
1 7 4 aabc=7 ba=6 a=4
|
| 6 |
+
*** No errors detected
|
| 7 |
+
|
| 8 |
+
EXIT STATUS: 0
|
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.run
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Boost.Test WARNING: token "phrase-extract/test.domain" does not correspond to the Boost.Test argument
|
| 2 |
+
and should be placed after all Boost.Test arguments and the -- separator.
|
| 3 |
+
For example: ScoreFeatureTest --random -- phrase-extract/test.domain
|
| 4 |
+
Running 2 test cases...
|
| 5 |
+
1 7 4 aabc=7 ba=6 a=4
|
| 6 |
+
*** No errors detected
|
| 7 |
+
|
| 8 |
+
EXIT STATUS: 0
|
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.test
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
passed
|
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ThreadPool.o
ADDED
|
Binary file (263 kB). View file
|
|
|
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Timer.o
ADDED
|
Binary file (118 kB). View file
|
|
|
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Util.o
ADDED
|
Binary file (196 kB). View file
|
|
|