suricodes committed on
Commit
b7a24d4
·
verified ·
1 Parent(s): c4f5ca4

Upload 356 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +19 -0
  2. mosesdecoder/phrase-extract/Alignment.cpp +70 -0
  3. mosesdecoder/phrase-extract/Alignment.h +35 -0
  4. mosesdecoder/phrase-extract/AlignmentPhrase.cpp +49 -0
  5. mosesdecoder/phrase-extract/AlignmentPhrase.h +74 -0
  6. mosesdecoder/phrase-extract/DomainFeature.cpp +170 -0
  7. mosesdecoder/phrase-extract/DomainFeature.h +143 -0
  8. mosesdecoder/phrase-extract/ExtractedRule.h +83 -0
  9. mosesdecoder/phrase-extract/ExtractionPhrasePair.cpp +584 -0
  10. mosesdecoder/phrase-extract/ExtractionPhrasePair.h +179 -0
  11. mosesdecoder/phrase-extract/Hole.h +116 -0
  12. mosesdecoder/phrase-extract/HoleCollection.cpp +77 -0
  13. mosesdecoder/phrase-extract/HoleCollection.h +95 -0
  14. mosesdecoder/phrase-extract/InputFileStream.cpp +61 -0
  15. mosesdecoder/phrase-extract/InputFileStream.h +48 -0
  16. mosesdecoder/phrase-extract/InternalStructFeature.cpp +57 -0
  17. mosesdecoder/phrase-extract/InternalStructFeature.h +64 -0
  18. mosesdecoder/phrase-extract/Jamfile +19 -0
  19. mosesdecoder/phrase-extract/OutputFileStream.cpp +90 -0
  20. mosesdecoder/phrase-extract/OutputFileStream.h +81 -0
  21. mosesdecoder/phrase-extract/PhraseExtractionOptions.h +193 -0
  22. mosesdecoder/phrase-extract/PhraseOrientation.cpp +481 -0
  23. mosesdecoder/phrase-extract/PhraseOrientation.h +127 -0
  24. mosesdecoder/phrase-extract/PropertiesConsolidator.cpp +350 -0
  25. mosesdecoder/phrase-extract/PropertiesConsolidator.h +67 -0
  26. mosesdecoder/phrase-extract/RuleExist.h +65 -0
  27. mosesdecoder/phrase-extract/RuleExtractionOptions.h +95 -0
  28. mosesdecoder/phrase-extract/ScoreFeature.cpp +114 -0
  29. mosesdecoder/phrase-extract/ScoreFeature.h +143 -0
  30. mosesdecoder/phrase-extract/ScoreFeatureTest.cpp +140 -0
  31. mosesdecoder/phrase-extract/SentenceAlignment.cpp +144 -0
  32. mosesdecoder/phrase-extract/SentenceAlignment.h +59 -0
  33. mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.cpp +78 -0
  34. mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.h +69 -0
  35. mosesdecoder/phrase-extract/SyntaxNode.h +46 -0
  36. mosesdecoder/phrase-extract/SyntaxNodeCollection.cpp +163 -0
  37. mosesdecoder/phrase-extract/SyntaxNodeCollection.h +91 -0
  38. mosesdecoder/phrase-extract/SyntaxTree.h +12 -0
  39. mosesdecoder/phrase-extract/XmlException.h +46 -0
  40. mosesdecoder/phrase-extract/XmlTree.cpp +430 -0
  41. mosesdecoder/phrase-extract/XmlTree.h +41 -0
  42. mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ExtractionPhrasePair.o +0 -0
  43. mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest +3 -0
  44. mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.o +0 -0
  45. mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.output +8 -0
  46. mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.run +8 -0
  47. mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.test +1 -0
  48. mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ThreadPool.o +0 -0
  49. mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Timer.o +0 -0
  50. mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Util.o +0 -0
.gitattributes CHANGED
@@ -105,3 +105,22 @@ mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/lmbrgrid fi
105
  mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/moses filter=lfs diff=lfs merge=lfs -text
106
  mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/vwtrainer filter=lfs diff=lfs merge=lfs -text
107
  mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/moses filter=lfs diff=lfs merge=lfs -text
106
  mosesdecoder/moses-cmd/bin/gcc-9/release/link-static/threading-multi/vwtrainer filter=lfs diff=lfs merge=lfs -text
107
  mosesdecoder/OnDiskPt/bin/gcc-9/release/link-static/threading-multi/CreateOnDiskPt filter=lfs diff=lfs merge=lfs -text
108
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate filter=lfs diff=lfs merge=lfs -text
109
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate-direct filter=lfs diff=lfs merge=lfs -text
110
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/consolidate-reverse filter=lfs diff=lfs merge=lfs -text
111
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract filter=lfs diff=lfs merge=lfs -text
112
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract-lex filter=lfs diff=lfs merge=lfs -text
113
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/extract-rules filter=lfs diff=lfs merge=lfs -text
114
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/relax-parse filter=lfs diff=lfs merge=lfs -text
115
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/score filter=lfs diff=lfs merge=lfs -text
116
+ mosesdecoder/phrase-extract/bin/gcc-9/release/link-static/threading-multi/statistics filter=lfs diff=lfs merge=lfs -text
117
+ mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest filter=lfs diff=lfs merge=lfs -text
118
+ mosesdecoder/phrase-extract/extract-ghkm/bin/gcc-9/release/link-static/threading-multi/extract-ghkm filter=lfs diff=lfs merge=lfs -text
119
+ mosesdecoder/phrase-extract/extract-mixed-syntax/bin/gcc-9/release/link-static/threading-multi/extract-mixed-syntax filter=lfs diff=lfs merge=lfs -text
120
+ mosesdecoder/phrase-extract/filter-rule-table/bin/gcc-9/release/link-static/threading-multi/filter-rule-table filter=lfs diff=lfs merge=lfs -text
121
+ mosesdecoder/phrase-extract/lexical-reordering/bin/gcc-9/release/link-static/threading-multi/lexical-reordering-score filter=lfs diff=lfs merge=lfs -text
122
+ mosesdecoder/phrase-extract/postprocess-egret-forests/bin/gcc-9/release/link-static/threading-multi/postprocess-egret-forests filter=lfs diff=lfs merge=lfs -text
123
+ mosesdecoder/phrase-extract/score-stsg/bin/gcc-9/release/link-static/threading-multi/score-stsg filter=lfs diff=lfs merge=lfs -text
124
+ mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/libsyntax_common.a filter=lfs diff=lfs merge=lfs -text
125
+ mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_fragment_tokenizer_test filter=lfs diff=lfs merge=lfs -text
126
+ mosesdecoder/phrase-extract/syntax-common/bin/gcc-9/release/link-static/threading-multi/tree_test filter=lfs diff=lfs merge=lfs -text
mosesdecoder/phrase-extract/Alignment.cpp ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "Alignment.h"
21
+
22
+ #include "phrase-extract/syntax-common/exception.h"
23
+
24
+ #include <algorithm>
25
+ #include <cassert>
26
+ #include <cstdlib>
27
+
28
+ namespace MosesTraining
29
+ {
30
+
31
+ void ReadAlignment(const std::string &s, Alignment &a)
32
+ {
33
+ const std::string digits = "0123456789";
34
+
35
+ a.clear();
36
+
37
+ std::string::size_type begin = 0;
38
+ while (true) {
39
+ std::string::size_type end = s.find("-", begin);
40
+ if (end == std::string::npos) {
41
+ return;
42
+ }
43
+ int src = std::atoi(s.substr(begin, end-begin).c_str());
44
+ if (end+1 == s.size()) {
45
+ throw Syntax::Exception("Target index missing");
46
+ }
47
+
48
+ begin = end+1;
49
+ end = s.find_first_not_of(digits, begin+1);
50
+ int tgt;
51
+ if (end == std::string::npos) {
52
+ tgt = std::atoi(s.substr(begin).c_str());
53
+ a.push_back(std::make_pair(src, tgt));
54
+ return;
55
+ } else {
56
+ tgt = std::atoi(s.substr(begin, end-begin).c_str());
57
+ a.push_back(std::make_pair(src, tgt));
58
+ }
59
+ begin = end+1;
60
+ }
61
+ }
62
+
63
+ void FlipAlignment(Alignment &a)
64
+ {
65
+ for (Alignment::iterator p = a.begin(); p != a.end(); ++p) {
66
+ std::swap(p->first, p->second);
67
+ }
68
+ }
69
+
70
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/Alignment.h ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include <string>
23
+ #include <utility>
24
+ #include <vector>
25
+
26
namespace MosesTraining
{

/** A word alignment stored as a list of (first, second) index pairs. */
typedef std::vector<std::pair<int, int> > Alignment;

/**
 * Parse the "src-tgt" alignment points contained in `s` into `a`.
 * Any previous content of `a` is discarded.
 */
void ReadAlignment(const std::string &s, Alignment &a);

/** Swap the two indices of every point in `a`, in place. */
void FlipAlignment(Alignment &a);

} // namespace MosesTraining
mosesdecoder/phrase-extract/AlignmentPhrase.cpp ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based language decoder
4
+ Copyright (C) 2006 University of Edinburgh
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
+ #include <algorithm>
22
+ #include <iostream>
23
+ #include "AlignmentPhrase.h"
24
+
25
+ using namespace std;
26
+
27
+ namespace MosesTraining
28
+ {
29
+
30
+ void AlignmentElement::Merge(size_t align)
31
+ {
32
+ m_elements.insert(align);
33
+ }
34
+
35
+ void AlignmentPhrase::Merge(const std::vector< std::vector<size_t> > &source)
36
+ {
37
+ for (size_t idx = 0 ; idx < source.size() ; ++idx) {
38
+ AlignmentElement &currElement = m_elements[idx];
39
+ const vector<size_t> &newElement = source[idx];
40
+
41
+ for (size_t pos = 0 ; pos < newElement.size() ; ++pos) {
42
+ currElement.Merge(newElement[pos]);
43
+ }
44
+ }
45
+ }
46
+
47
+ } // namespace
48
+
49
+
mosesdecoder/phrase-extract/AlignmentPhrase.h ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ /***********************************************************************
3
+ Moses - factored phrase-based language decoder
4
+ Copyright (C) 2006 University of Edinburgh
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
+ #pragma once
22
+
23
+ #include <vector>
24
+ #include <set>
25
+
26
namespace MosesTraining
{

class WordsRange;

/** The set of positions aligned to one word; duplicates are collapsed. */
class AlignmentElement
{
public:
  typedef std::set<size_t>::iterator iterator;
  typedef std::set<size_t>::const_iterator const_iterator;

  /** Creates an element with no aligned positions. */
  AlignmentElement()
    : m_elements() {
  }

  /** Iterator to the first stored position (positions are ordered by the set). */
  const_iterator begin() const {
    return m_elements.begin();
  }

  /** Past-the-end iterator of the stored positions. */
  const_iterator end() const {
    return m_elements.end();
  }

  /** Number of distinct positions stored. */
  size_t GetSize() const {
    return m_elements.size();
  }

  /** Add one position to the set. */
  void Merge(size_t align);

protected:
  std::set<size_t> m_elements;
};

/** A fixed-length sequence of AlignmentElement objects, one per position. */
class AlignmentPhrase
{
public:
  /** Creates a phrase with `size` empty alignment elements. */
  AlignmentPhrase(size_t size)
    : m_elements(size) {
  }

  // NOTE(review): only declared here; the definition is not part of this header.
  void Merge(const AlignmentPhrase &newAlignment, const WordsRange &newAlignmentRange);

  /** Merge every value of source[k] into the k-th element. */
  void Merge(const std::vector< std::vector<size_t> > &source);

  /** Number of elements in the phrase. */
  size_t GetSize() const {
    return m_elements.size();
  }

  /** Read-only access to the element at `pos`; `pos` is not range-checked. */
  const AlignmentElement &GetElement(size_t pos) const {
    return m_elements[pos];
  }

protected:
  std::vector<AlignmentElement> m_elements;
};

} // namespace
74
+
mosesdecoder/phrase-extract/DomainFeature.cpp ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "DomainFeature.h"
2
+ #include "ExtractionPhrasePair.h"
3
+ #include "tables-core.h"
4
+ #include "InputFileStream.h"
5
+ #include "util/tokenize.hh"
6
+
7
+ using namespace std;
8
+
9
+ namespace MosesTraining
10
+ {
11
+
12
+ // handling of domain names: load database with sentence-id / domain name info
13
+ void Domain::load( const std::string &domainFileName )
14
+ {
15
+ Moses::InputFileStream fileS( domainFileName );
16
+ istream *fileP = &fileS;
17
+
18
+ string line;
19
+ while(getline(*fileP, line)) {
20
+ // read
21
+ const vector< string > domainSpecLine = util::tokenize( line );
22
+ int lineNumber;
23
+ if (domainSpecLine.size() != 2 ||
24
+ ! sscanf(domainSpecLine[0].c_str(), "%d", &lineNumber)) {
25
+ std::cerr << "ERROR: in domain specification line: '" << line << "'" << endl;
26
+ exit(1);
27
+ }
28
+ // store
29
+ const string &name = domainSpecLine[1];
30
+ spec.push_back( make_pair( lineNumber, name ));
31
+ if (name2id.find( name ) == name2id.end()) {
32
+ name2id[ name ] = list.size();
33
+ list.push_back( name );
34
+ }
35
+ }
36
+ }
37
+
38
+ // get domain name based on sentence number
39
+ string Domain::getDomainOfSentence( int sentenceId ) const
40
+ {
41
+ for(size_t i=0; i<spec.size(); i++) {
42
+ if (sentenceId <= spec[i].first) {
43
+ return spec[i].second;
44
+ }
45
+ }
46
+ return "undefined";
47
+ }
48
+
49
+ DomainFeature::DomainFeature(const string& domainFile) : m_propertyKey("domain")
50
+ {
51
+ //process domain file
52
+ m_domain.load(domainFile);
53
+ }
54
+
55
+ void DomainFeature::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
56
+ float count,
57
+ int sentenceId) const
58
+ {
59
+ std::string value = m_domain.getDomainOfSentence(sentenceId);
60
+ phrasePair.AddProperty(m_propertyKey, value, count);
61
+ }
62
+
63
+ void DomainFeature::add(const ScoreFeatureContext& context,
64
+ std::vector<float>& denseValues,
65
+ std::map<std::string,float>& sparseValues) const
66
+ {
67
+ const map<string,float> *domainCount = context.phrasePair.GetProperty(m_propertyKey);
68
+ assert( domainCount != NULL );
69
+ add(*domainCount,
70
+ context.phrasePair.GetCount(),
71
+ context.maybeLog,
72
+ denseValues, sparseValues);
73
+ }
74
+
75
+ void SubsetDomainFeature::add(const map<string,float>& domainCount,
76
+ float count,
77
+ const MaybeLog& maybeLog,
78
+ std::vector<float>& denseValues,
79
+ std::map<std::string,float>& sparseValues) const
80
+ {
81
+ if (m_domain.list.size() > 6) {
82
+ UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
83
+ "too many domains for core domain subset features");
84
+ }
85
+ size_t bitmap = 0;
86
+ for(size_t bit = 0; bit < m_domain.list.size(); bit++) {
87
+ if (domainCount.find( m_domain.list[ bit ] ) != domainCount.end()) {
88
+ bitmap += 1 << bit;
89
+ }
90
+ }
91
+ for(size_t i = 1; i < (1 << m_domain.list.size()); i++) {
92
+ denseValues.push_back(maybeLog( (bitmap == i) ? 2.718 : 1 ));
93
+ }
94
+ }
95
+
96
+ void SparseSubsetDomainFeature::add(const map<string,float>& domainCount,float count,
97
+ const MaybeLog& maybeLog,
98
+ std::vector<float>& denseValues,
99
+ std::map<std::string,float>& sparseValues) const
100
+ {
101
+ typedef vector<string>::const_iterator I;
102
+ ostringstream key;
103
+ key << "doms";
104
+ for (I i = m_domain.list.begin(); i != m_domain.list.end(); ++i) {
105
+ if (domainCount.find(*i) != domainCount.end()) {
106
+ key << "_" << *i;
107
+ }
108
+ }
109
+ sparseValues[key.str()] = 1;
110
+ }
111
+
112
+
113
+ void RatioDomainFeature::add(const map<string,float>& domainCount,float count,
114
+ const MaybeLog& maybeLog,
115
+ std::vector<float>& denseValues,
116
+ std::map<std::string,float>& sparseValues) const
117
+ {
118
+ typedef vector< string >::const_iterator I;
119
+ for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
120
+ map<string,float>::const_iterator dci = domainCount.find(*i);
121
+ if (dci == domainCount.end() ) {
122
+ denseValues.push_back(maybeLog( 1 ));
123
+ } else {
124
+ denseValues.push_back(maybeLog(exp( dci->second / count ) ));
125
+ }
126
+ }
127
+ }
128
+
129
+
130
+ void SparseRatioDomainFeature::add(const map<string,float>& domainCount,float count,
131
+ const MaybeLog& maybeLog,
132
+ std::vector<float>& denseValues,
133
+ std::map<std::string,float>& sparseValues) const
134
+ {
135
+ typedef map< string, float >::const_iterator I;
136
+ for (I i=domainCount.begin(); i != domainCount.end(); i++) {
137
+ sparseValues["domr_" + i->first] = (i->second / count);
138
+ }
139
+ }
140
+
141
+
142
+ void IndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
143
+ const MaybeLog& maybeLog,
144
+ std::vector<float>& denseValues,
145
+ std::map<std::string,float>& sparseValues) const
146
+ {
147
+ typedef vector< string >::const_iterator I;
148
+ for (I i = m_domain.list.begin(); i != m_domain.list.end(); i++ ) {
149
+ map<string,float>::const_iterator dci = domainCount.find(*i);
150
+ if (dci == domainCount.end() ) {
151
+ denseValues.push_back(maybeLog( 1 ));
152
+ } else {
153
+ denseValues.push_back(maybeLog(2.718));
154
+ }
155
+ }
156
+ }
157
+
158
+ void SparseIndicatorDomainFeature::add(const map<string,float>& domainCount,float count,
159
+ const MaybeLog& maybeLog,
160
+ std::vector<float>& denseValues,
161
+ std::map<std::string,float>& sparseValues) const
162
+ {
163
+ typedef map< string, float >::const_iterator I;
164
+ for (I i=domainCount.begin(); i != domainCount.end(); i++) {
165
+ sparseValues["dom_" + i->first] = 1;
166
+ }
167
+ }
168
+
169
+ }
170
+
mosesdecoder/phrase-extract/DomainFeature.h ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ #ifndef _DOMAIN_H
4
+ #define _DOMAIN_H
5
+
6
+ #include <iostream>
7
+ #include <fstream>
8
+ #include <cassert>
9
+ #include <cstdlib>
10
+ #include <string>
11
+ #include <queue>
12
+ #include <map>
13
+ #include <cmath>
14
+
15
+ #include "ScoreFeature.h"
16
+
17
+ namespace MosesTraining
18
+ {
19
+
20
/** Database mapping sentence numbers to domain names. */
class Domain
{
public:
  /** (integer, domain name) pairs, in the order they were loaded. */
  std::vector< std::pair< int, std::string > > spec;

  /** Distinct domain names, in order of first appearance. */
  std::vector< std::string > list;

  /** Domain name -> index of that name in `list`. */
  std::map< std::string, int > name2id;

  /** Fill spec, list and name2id from the domain specification file. */
  void load( const std::string &fileName );

  /**
   * Name of the first entry in `spec` whose integer is >= sentenceId,
   * or "undefined" if there is none.
   */
  std::string getDomainOfSentence( int sentenceId ) const;
};
29
+
30
+ class DomainFeature : public ScoreFeature
31
+ {
32
+ public:
33
+
34
+ DomainFeature(const std::string& domainFile);
35
+
36
+ void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
37
+ float count,
38
+ int sentenceId) const;
39
+
40
+ void add(const ScoreFeatureContext& context,
41
+ std::vector<float>& denseValues,
42
+ std::map<std::string,float>& sparseValues) const;
43
+
44
+ protected:
45
+ /** Overridden in subclass */
46
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
47
+ const MaybeLog& maybeLog,
48
+ std::vector<float>& denseValues,
49
+ std::map<std::string,float>& sparseValues) const = 0;
50
+
51
+
52
+ Domain m_domain;
53
+
54
+ const std::string m_propertyKey;
55
+
56
+ };
57
+
58
+ class SubsetDomainFeature : public DomainFeature
59
+ {
60
+ public:
61
+ SubsetDomainFeature(const std::string& domainFile) :
62
+ DomainFeature(domainFile) {}
63
+
64
+ protected:
65
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
66
+ const MaybeLog& maybeLog,
67
+ std::vector<float>& denseValues,
68
+ std::map<std::string,float>& sparseValues) const;
69
+ };
70
+
71
+ class SparseSubsetDomainFeature : public DomainFeature
72
+ {
73
+ public:
74
+ SparseSubsetDomainFeature(const std::string& domainFile) :
75
+ DomainFeature(domainFile) {}
76
+
77
+ protected:
78
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
79
+ const MaybeLog& maybeLog,
80
+ std::vector<float>& denseValues,
81
+ std::map<std::string,float>& sparseValues) const;
82
+
83
+ };
84
+
85
+ class IndicatorDomainFeature : public DomainFeature
86
+ {
87
+ public:
88
+ IndicatorDomainFeature(const std::string& domainFile) :
89
+ DomainFeature(domainFile) {}
90
+
91
+ protected:
92
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
93
+ const MaybeLog& maybeLog,
94
+ std::vector<float>& denseValues,
95
+ std::map<std::string,float>& sparseValues) const;
96
+ };
97
+
98
+
99
+ class SparseIndicatorDomainFeature : public DomainFeature
100
+ {
101
+ public:
102
+ SparseIndicatorDomainFeature(const std::string& domainFile) :
103
+ DomainFeature(domainFile) {}
104
+
105
+ protected:
106
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
107
+ const MaybeLog& maybeLog,
108
+ std::vector<float>& denseValues,
109
+ std::map<std::string,float>& sparseValues) const;
110
+ };
111
+
112
+
113
+ class RatioDomainFeature : public DomainFeature
114
+ {
115
+ public:
116
+ RatioDomainFeature(const std::string& domainFile) :
117
+ DomainFeature(domainFile) {}
118
+
119
+ protected:
120
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
121
+ const MaybeLog& maybeLog,
122
+ std::vector<float>& denseValues,
123
+ std::map<std::string,float>& sparseValues) const;
124
+ };
125
+
126
+
127
+ class SparseRatioDomainFeature : public DomainFeature
128
+ {
129
+ public:
130
+ SparseRatioDomainFeature(const std::string& domainFile) :
131
+ DomainFeature(domainFile) {}
132
+
133
+ protected:
134
+ virtual void add(const std::map<std::string,float>& domainCounts, float count,
135
+ const MaybeLog& maybeLog,
136
+ std::vector<float>& denseValues,
137
+ std::map<std::string,float>& sparseValues) const;
138
+ };
139
+
140
+
141
+ }
142
+
143
+ #endif
mosesdecoder/phrase-extract/ExtractedRule.h ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+ #ifndef EXTRACTEDRULE_H_INCLUDED_
22
+ #define EXTRACTEDRULE_H_INCLUDED_
23
+
24
+ #include <string>
25
+ #include <iostream>
26
+ #include <sstream>
27
+ #include <map>
28
+
29
+ #include "PhraseOrientation.h"
30
+
31
+ namespace MosesTraining
32
+ {
33
+
34
+ // sentence-level collection of rules
35
+ class ExtractedRule
36
+ {
37
+ public:
38
+ std::string source;
39
+ std::string target;
40
+ std::string alignment;
41
+ std::string alignmentInv;
42
+ std::string sourceContextLeft;
43
+ std::string sourceContextRight;
44
+ std::string targetContextLeft;
45
+ std::string targetContextRight;
46
+ std::string sourceHoleString;
47
+ std::string targetHoleString;
48
+ std::string targetSyntacticPreference;
49
+ int startT;
50
+ int endT;
51
+ int startS;
52
+ int endS;
53
+ float count;
54
+ double pcfgScore;
55
+ PhraseOrientation::REO_CLASS l2rOrientation;
56
+ PhraseOrientation::REO_CLASS r2lOrientation;
57
+
58
+ ExtractedRule(int sT, int eT, int sS, int eS)
59
+ : source()
60
+ , target()
61
+ , alignment()
62
+ , alignmentInv()
63
+ , sourceContextLeft()
64
+ , sourceContextRight()
65
+ , targetContextLeft()
66
+ , targetContextRight()
67
+ , sourceHoleString()
68
+ , targetHoleString()
69
+ , targetSyntacticPreference()
70
+ , startT(sT)
71
+ , endT(eT)
72
+ , startS(sS)
73
+ , endS(eS)
74
+ , count(0)
75
+ , pcfgScore(0.0)
76
+ , l2rOrientation(PhraseOrientation::REO_CLASS_UNKNOWN)
77
+ , r2lOrientation(PhraseOrientation::REO_CLASS_UNKNOWN)
78
+ { }
79
+ };
80
+
81
+ }
82
+
83
+ #endif
mosesdecoder/phrase-extract/ExtractionPhrasePair.cpp ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2009 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <sstream>
21
+ #include "ExtractionPhrasePair.h"
22
+ #include "tables-core.h"
23
+ #include "score.h"
24
+ #include "moses/Util.h"
25
+
26
+ #include <cstdlib>
27
+
28
+ using namespace std;
29
+
30
+
31
+ namespace MosesTraining
32
+ {
33
+
34
+
35
+ extern Vocabulary vcbT;
36
+ extern Vocabulary vcbS;
37
+
38
+ extern bool hierarchicalFlag;
39
+
40
+
41
+ ExtractionPhrasePair::ExtractionPhrasePair( const PHRASE *phraseSource,
42
+ const PHRASE *phraseTarget,
43
+ ALIGNMENT *targetToSourceAlignment,
44
+ float count, float pcfgSum ) :
45
+ m_phraseSource(phraseSource),
46
+ m_phraseTarget(phraseTarget),
47
+ m_count(count),
48
+ m_pcfgSum(pcfgSum)
49
+ {
50
+ assert(!phraseSource->empty());
51
+
52
+ m_count = count;
53
+ m_pcfgSum = pcfgSum;
54
+
55
+ std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
56
+ m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
57
+
58
+ m_lastTargetToSourceAlignment = insertedAlignment.first;
59
+ m_lastCount = m_count;
60
+ m_lastPcfgSum = m_pcfgSum;
61
+
62
+ m_isValid = true;
63
+ }
64
+
65
+
66
+ ExtractionPhrasePair::~ExtractionPhrasePair( )
67
+ {
68
+ Clear();
69
+ }
70
+
71
+
72
+ // return value: true if the given alignment was seen for the first time and thus will be stored,
73
+ // false if it was present already (the pointer may thus be deleted(
74
+ bool ExtractionPhrasePair::Add( ALIGNMENT *targetToSourceAlignment,
75
+ float count, float pcfgSum )
76
+ {
77
+ m_count += count;
78
+ m_pcfgSum += pcfgSum;
79
+
80
+ m_lastCount = count;
81
+ m_lastPcfgSum = pcfgSum;
82
+
83
+ std::map<ALIGNMENT*,float>::iterator iter = m_lastTargetToSourceAlignment;
84
+ if ( *(iter->first) == *targetToSourceAlignment ) {
85
+ iter->second += count;
86
+ return false;
87
+ } else {
88
+ std::pair< std::map<ALIGNMENT*,float>::iterator, bool > insertedAlignment =
89
+ m_targetToSourceAlignments.insert( std::pair<ALIGNMENT*,float>(targetToSourceAlignment,count) );
90
+ if ( !insertedAlignment.second ) {
91
+ // the alignment already exists: increment count
92
+ insertedAlignment.first->second += count;
93
+ return false;
94
+ }
95
+ m_lastTargetToSourceAlignment = insertedAlignment.first;
96
+ }
97
+
98
+ return true;
99
+ }
100
+
101
+
102
+ void ExtractionPhrasePair::IncrementPrevious( float count, float pcfgSum )
103
+ {
104
+ m_count += count;
105
+ m_pcfgSum += pcfgSum;
106
+ m_lastTargetToSourceAlignment->second += count;
107
+ // properties
108
+ for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
109
+ iter !=m_properties.end(); ++iter ) {
110
+ LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
111
+ (*lastPropertyValue)->second += count;
112
+ }
113
+
114
+ m_lastCount = count;
115
+ m_lastPcfgSum = pcfgSum;
116
+ }
117
+
118
+
119
+ // Check for lexical match
120
+ // and in case of SCFG rules for equal non-terminal alignment.
121
+ bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
122
+ const PHRASE *otherPhraseTarget,
123
+ ALIGNMENT *otherTargetToSourceAlignment ) const
124
+ {
125
+ if (*otherPhraseTarget != *m_phraseTarget) {
126
+ return false;
127
+ }
128
+ if (*otherPhraseSource != *m_phraseSource) {
129
+ return false;
130
+ }
131
+
132
+ return MatchesAlignment( otherTargetToSourceAlignment );
133
+ }
134
+
135
+ // Check for lexical match
136
+ // and in case of SCFG rules for equal non-terminal alignment.
137
+ // Set boolean indicators.
138
+ // (Note that we check in the order: target - source - alignment
139
+ // and do not touch the subsequent boolean indicators once a previous one has been set to false.)
140
+ bool ExtractionPhrasePair::Matches( const PHRASE *otherPhraseSource,
141
+ const PHRASE *otherPhraseTarget,
142
+ ALIGNMENT *otherTargetToSourceAlignment,
143
+ bool &sourceMatch,
144
+ bool &targetMatch,
145
+ bool &alignmentMatch ) const
146
+ {
147
+ if (*otherPhraseSource != *m_phraseSource) {
148
+ sourceMatch = false;
149
+ return false;
150
+ } else {
151
+ sourceMatch = true;
152
+ }
153
+ if (*otherPhraseTarget != *m_phraseTarget) {
154
+ targetMatch = false;
155
+ return false;
156
+ } else {
157
+ targetMatch = true;
158
+ }
159
+ if ( !MatchesAlignment(otherTargetToSourceAlignment) ) {
160
+ alignmentMatch = false;
161
+ return false;
162
+ } else {
163
+ alignmentMatch = true;
164
+ }
165
+ return true;
166
+ }
167
+
168
+ // Check for equal non-terminal alignment in case of SCFG rules.
169
+ // Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first
170
+ bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const
171
+ {
172
+ if (!hierarchicalFlag) return true;
173
+
174
+ // all or none of the phrasePair's word alignment matrices match, so just pick one
175
+ const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first;
176
+
177
+ assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1);
178
+ assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size());
179
+
180
+ // loop over all symbols but the left hand side of the rule
181
+ for (size_t i=0; i<thisTargetToSourceAlignment->size()-1; ++i) {
182
+ if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) {
183
+ size_t thisAlign = *(thisTargetToSourceAlignment->at(i).begin());
184
+ size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin());
185
+
186
+ if (thisTargetToSourceAlignment->at(i).size() != 1 ||
187
+ otherTargetToSourceAlignment->at(i).size() != 1 ||
188
+ thisAlign != otherAlign) {
189
+ return false;
190
+ }
191
+ }
192
+ }
193
+
194
+ return true;
195
+ }
196
+
197
+ void ExtractionPhrasePair::Clear()
198
+ {
199
+ delete m_phraseSource;
200
+ delete m_phraseTarget;
201
+
202
+ m_count = 0.0f;
203
+ m_pcfgSum = 0.0f;
204
+
205
+ for ( std::map<ALIGNMENT*,float>::iterator iter=m_targetToSourceAlignments.begin();
206
+ iter!=m_targetToSourceAlignments.end(); ++iter) {
207
+ delete iter->first;
208
+ }
209
+ m_targetToSourceAlignments.clear();
210
+
211
+ for ( std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter=m_properties.begin();
212
+ iter!=m_properties.end(); ++iter) {
213
+ delete (iter->second).second;
214
+ delete (iter->second).first;
215
+ }
216
+ m_properties.clear();
217
+
218
+ m_lastCount = 0.0f;
219
+ m_lastPcfgSum = 0.0f;
220
+ m_lastTargetToSourceAlignment = m_targetToSourceAlignments.begin();
221
+
222
+ m_isValid = false;
223
+ }
224
+
225
+
226
+ void ExtractionPhrasePair::AddProperties( const std::string &propertiesString, float count )
227
+ {
228
+ if (propertiesString.empty()) {
229
+ return;
230
+ }
231
+
232
+ vector<std::string> toks;
233
+ Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
234
+ for (size_t i = 1; i < toks.size(); ++i) {
235
+ std::string &tok = toks[i];
236
+ if (tok.empty()) {
237
+ continue;
238
+ }
239
+ size_t endPos = tok.rfind("}");
240
+ tok = tok.substr(0, endPos - 1);
241
+
242
+ vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
243
+ if (keyValue.size() == 2) {
244
+ AddProperty(keyValue[0], keyValue[1], count);
245
+ }
246
+ }
247
+ }
248
+
249
+
250
+ const ALIGNMENT *ExtractionPhrasePair::FindBestAlignmentTargetToSource() const
251
+ {
252
+ float bestAlignmentCount = -1;
253
+
254
+ std::map<ALIGNMENT*,float>::const_iterator bestAlignment = m_targetToSourceAlignments.end();
255
+
256
+ for (std::map<ALIGNMENT*,float>::const_iterator iter=m_targetToSourceAlignments.begin();
257
+ iter!=m_targetToSourceAlignments.end(); ++iter) {
258
+ if ( (iter->second > bestAlignmentCount) ||
259
+ ( (iter->second == bestAlignmentCount) &&
260
+ (*(iter->first) > *(bestAlignment->first)) ) ) {
261
+ bestAlignmentCount = iter->second;
262
+ bestAlignment = iter;
263
+ }
264
+ }
265
+
266
+ if ( bestAlignment == m_targetToSourceAlignments.end()) {
267
+ return NULL;
268
+ }
269
+
270
+ return bestAlignment->first;
271
+ }
272
+
273
+
274
+ const std::string *ExtractionPhrasePair::FindBestPropertyValue(const std::string &key) const
275
+ {
276
+ float bestPropertyCount = -1;
277
+
278
+ const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
279
+ if ( allPropertyValues == NULL ) {
280
+ return NULL;
281
+ }
282
+
283
+ PROPERTY_VALUES::const_iterator bestPropertyValue = allPropertyValues->end();
284
+
285
+ for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
286
+ iter!=allPropertyValues->end(); ++iter) {
287
+ if ( (iter->second > bestPropertyCount) ||
288
+ ( (iter->second == bestPropertyCount) &&
289
+ (iter->first > bestPropertyValue->first) ) ) {
290
+ bestPropertyCount = iter->second;
291
+ bestPropertyValue = iter;
292
+ }
293
+ }
294
+
295
+ if ( bestPropertyValue == allPropertyValues->end()) {
296
+ return NULL;
297
+ }
298
+
299
+ return &(bestPropertyValue->first);
300
+ }
301
+
302
+
303
+ std::string ExtractionPhrasePair::CollectAllPropertyValues(const std::string &key) const
304
+ {
305
+ const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
306
+
307
+ if ( allPropertyValues == NULL ) {
308
+ return "";
309
+ }
310
+
311
+ std::ostringstream oss;
312
+ for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
313
+ iter!=allPropertyValues->end(); ++iter) {
314
+ if (!(iter->first).empty()) {
315
+ if (iter!=allPropertyValues->begin()) {
316
+ oss << " ";
317
+ }
318
+ oss << iter->first;
319
+ oss << " ";
320
+ oss << iter->second;
321
+ }
322
+ }
323
+
324
+ std::string allPropertyValuesString(oss.str());
325
+ return allPropertyValuesString;
326
+ }
327
+
328
+
329
+ std::string ExtractionPhrasePair::CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
330
+ std::set<std::string>& labelSet,
331
+ boost::unordered_map<std::string,float>& countsLabelsLHS,
332
+ boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& jointCountsRulesTargetLHSAndLabelsLHS,
333
+ Vocabulary &vcbT) const
334
+ {
335
+ const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
336
+
337
+ if ( allPropertyValues == NULL ) {
338
+ return "";
339
+ }
340
+
341
+ std::string lhs="", rhs="", currentRhs="";
342
+ float currentRhsCount = 0.0;
343
+ std::list< std::pair<std::string,float> > lhsGivenCurrentRhsCounts;
344
+
345
+ std::ostringstream oss;
346
+ for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
347
+ iter!=allPropertyValues->end(); ++iter) {
348
+
349
+ size_t space = (iter->first).find_last_of(' ');
350
+ if ( space == string::npos ) {
351
+ lhs = iter->first;
352
+ rhs.clear();
353
+ } else {
354
+ lhs = (iter->first).substr(space+1);
355
+ rhs = (iter->first).substr(0,space);
356
+ }
357
+
358
+ labelSet.insert(lhs);
359
+
360
+ if ( rhs.compare(currentRhs) ) {
361
+
362
+ if ( iter!=allPropertyValues->begin() ) {
363
+ if ( !currentRhs.empty() ) {
364
+ istringstream tokenizer(currentRhs);
365
+ std::string rhsLabel;
366
+ while ( tokenizer.peek() != EOF ) {
367
+ tokenizer >> rhsLabel;
368
+ labelSet.insert(rhsLabel);
369
+ }
370
+ oss << " " << currentRhs << " " << currentRhsCount;
371
+ }
372
+ if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
373
+ if ( !currentRhs.empty() ) {
374
+ oss << " " << lhsGivenCurrentRhsCounts.size();
375
+ }
376
+ for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
377
+ iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
378
+ oss << " " << iter2->first << " " << iter2->second;
379
+
380
+ // update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS
381
+ std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
382
+ ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets
383
+ ruleTargetLhs.erase(ruleTargetLhs.size()-1);
384
+
385
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
386
+ countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
387
+ if (!insertedCountsLabelsLHS.second) {
388
+ (insertedCountsLabelsLHS.first)->second += iter2->second;
389
+ }
390
+
391
+ boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
392
+ jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
393
+ if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
394
+ boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
395
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
396
+ jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
397
+ } else {
398
+ boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
399
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
400
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
401
+ if (!insertedJointCounts.second) {
402
+ (insertedJointCounts.first)->second += iter2->second;
403
+ }
404
+ }
405
+
406
+ }
407
+ }
408
+
409
+ lhsGivenCurrentRhsCounts.clear();
410
+ }
411
+
412
+ currentRhsCount = 0.0;
413
+ currentRhs = rhs;
414
+ }
415
+
416
+ currentRhsCount += iter->second;
417
+ lhsGivenCurrentRhsCounts.push_back( std::pair<std::string,float>(lhs,iter->second) );
418
+ }
419
+
420
+ if ( !currentRhs.empty() ) {
421
+ istringstream tokenizer(currentRhs);
422
+ std::string rhsLabel;
423
+ while ( tokenizer.peek() != EOF ) {
424
+ tokenizer >> rhsLabel;
425
+ labelSet.insert(rhsLabel);
426
+ }
427
+ oss << " " << currentRhs << " " << currentRhsCount;
428
+ }
429
+ if ( lhsGivenCurrentRhsCounts.size() > 0 ) {
430
+ if ( !currentRhs.empty() ) {
431
+ oss << " " << lhsGivenCurrentRhsCounts.size();
432
+ }
433
+ for ( std::list< std::pair<std::string,float> >::const_iterator iter2=lhsGivenCurrentRhsCounts.begin();
434
+ iter2!=lhsGivenCurrentRhsCounts.end(); ++iter2 ) {
435
+ oss << " " << iter2->first << " " << iter2->second;
436
+
437
+ // update countsLabelsLHS and jointCountsRulesTargetLHSAndLabelsLHS
438
+ std::string ruleTargetLhs = vcbT.getWord(m_phraseTarget->back());
439
+ ruleTargetLhs.erase(ruleTargetLhs.begin()); // strip square brackets
440
+ ruleTargetLhs.erase(ruleTargetLhs.size()-1);
441
+
442
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedCountsLabelsLHS =
443
+ countsLabelsLHS.insert(std::pair<std::string,float>(iter2->first,iter2->second));
444
+ if (!insertedCountsLabelsLHS.second) {
445
+ (insertedCountsLabelsLHS.first)->second += iter2->second;
446
+ }
447
+
448
+ boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >::iterator jointCountsRulesTargetLHSAndLabelsLHSIter =
449
+ jointCountsRulesTargetLHSAndLabelsLHS.find(ruleTargetLhs);
450
+ if ( jointCountsRulesTargetLHSAndLabelsLHSIter == jointCountsRulesTargetLHSAndLabelsLHS.end() ) {
451
+ boost::unordered_map<std::string,float>* jointCounts = new boost::unordered_map<std::string,float>;
452
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
453
+ jointCountsRulesTargetLHSAndLabelsLHS.insert(std::pair<std::string,boost::unordered_map<std::string,float>* >(ruleTargetLhs,jointCounts));
454
+ } else {
455
+ boost::unordered_map<std::string,float>* jointCounts = jointCountsRulesTargetLHSAndLabelsLHSIter->second;
456
+ std::pair< boost::unordered_map<std::string,float>::iterator, bool > insertedJointCounts =
457
+ jointCounts->insert(std::pair<std::string,float>(iter2->first,iter2->second));
458
+ if (!insertedJointCounts.second) {
459
+ (insertedJointCounts.first)->second += iter2->second;
460
+ }
461
+ }
462
+
463
+ }
464
+ }
465
+
466
+ std::string allPropertyValuesString(oss.str());
467
+ return allPropertyValuesString;
468
+ }
469
+
470
+
471
+ void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key,
472
+ const std::vector<float> &orientationClassPriorsL2R,
473
+ const std::vector<float> &orientationClassPriorsR2L,
474
+ double smoothingFactor,
475
+ std::ostream &out) const
476
+ {
477
+ assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dleft dright
478
+
479
+ const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
480
+
481
+ if ( allPropertyValues == NULL ) {
482
+ return;
483
+ }
484
+
485
+ // bidirectional MSLR phrase orientation with 2x4 orientation classes:
486
+ // mono swap dright dleft
487
+ std::vector<float> orientationClassCountSumL2R(4,0);
488
+ std::vector<float> orientationClassCountSumR2L(4,0);
489
+
490
+ for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
491
+ iter!=allPropertyValues->end(); ++iter) {
492
+ std::string l2rOrientationClass, r2lOrientationClass;
493
+ try {
494
+ istringstream tokenizer(iter->first);
495
+ tokenizer >> l2rOrientationClass;
496
+ tokenizer >> r2lOrientationClass;
497
+ if ( tokenizer.peek() != EOF ) {
498
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
499
+ << ": Collecting phrase orientations failed. "
500
+ << "Too many tokens?");
501
+ }
502
+ } catch (const std::exception &e) {
503
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
504
+ << ": Collecting phrase orientations failed. "
505
+ << "Flawed property value in extract file?");
506
+ }
507
+
508
+ int l2rOrientationClassId = -1;
509
+ if (!l2rOrientationClass.compare("mono")) {
510
+ l2rOrientationClassId = 0;
511
+ }
512
+ if (!l2rOrientationClass.compare("swap")) {
513
+ l2rOrientationClassId = 1;
514
+ }
515
+ if (!l2rOrientationClass.compare("dleft")) {
516
+ l2rOrientationClassId = 2;
517
+ }
518
+ if (!l2rOrientationClass.compare("dright")) {
519
+ l2rOrientationClassId = 3;
520
+ }
521
+ if (l2rOrientationClassId == -1) {
522
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
523
+ << ": Collecting phrase orientations failed. "
524
+ << "Unknown orientation class \"" << l2rOrientationClass << "\"." );
525
+ }
526
+ int r2lOrientationClassId = -1;
527
+ if (!r2lOrientationClass.compare("mono")) {
528
+ r2lOrientationClassId = 0;
529
+ }
530
+ if (!r2lOrientationClass.compare("swap")) {
531
+ r2lOrientationClassId = 1;
532
+ }
533
+ if (!r2lOrientationClass.compare("dleft")) {
534
+ r2lOrientationClassId = 2;
535
+ }
536
+ if (!r2lOrientationClass.compare("dright")) {
537
+ r2lOrientationClassId = 3;
538
+ }
539
+ if (r2lOrientationClassId == -1) {
540
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
541
+ << ": Collecting phrase orientations failed. "
542
+ << "Unknown orientation class \"" << r2lOrientationClass << "\"." );
543
+ }
544
+
545
+ orientationClassCountSumL2R[l2rOrientationClassId] += iter->second;
546
+ orientationClassCountSumR2L[r2lOrientationClassId] += iter->second;
547
+ }
548
+
549
+ for (size_t i=0; i<4; ++i) {
550
+ if (i>0) {
551
+ out << " ";
552
+ }
553
+ out << (float)( (smoothingFactor*orientationClassPriorsL2R[i] + orientationClassCountSumL2R[i]) / (smoothingFactor + m_count) );
554
+ }
555
+ for (size_t i=0; i<4; ++i) {
556
+ out << " " << (float)( (smoothingFactor*orientationClassPriorsR2L[i] + orientationClassCountSumR2L[i]) / (smoothingFactor + m_count) );
557
+ }
558
+ }
559
+
560
+
561
+ void ExtractionPhrasePair::UpdateVocabularyFromValueTokens(const std::string& propertyKey,
562
+ std::set<std::string>& vocabulary) const
563
+ {
564
+ const PROPERTY_VALUES *allPropertyValues = GetProperty( propertyKey );
565
+
566
+ if ( allPropertyValues == NULL ) {
567
+ return;
568
+ }
569
+
570
+ for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin();
571
+ iter!=allPropertyValues->end(); ++iter) {
572
+
573
+ std::vector<std::string> tokens = Moses::Tokenize(iter->first);
574
+ for (std::vector<std::string>::const_iterator tokenIt=tokens.begin();
575
+ tokenIt!=tokens.end(); ++tokenIt) {
576
+ vocabulary.insert(*tokenIt);
577
+ }
578
+ }
579
+ }
580
+
581
+
582
+
583
+ }
584
+
mosesdecoder/phrase-extract/ExtractionPhrasePair.h ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2009 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+ #include "tables-core.h"
22
+
23
+ #include <vector>
24
+ #include <set>
25
+ #include <map>
26
+ #include <boost/unordered_map.hpp>
27
+
28
+ namespace MosesTraining
29
+ {
30
+
31
+
32
+ typedef std::vector< std::set<size_t> > ALIGNMENT;
33
+
34
+
35
+ class ExtractionPhrasePair
36
+ {
37
+
38
+ protected:
39
+
40
+ typedef std::map<std::string,float> PROPERTY_VALUES;
41
+ typedef std::map<std::string,float>::iterator LAST_PROPERTY_VALUE;
42
+
43
+
44
+ bool m_isValid;
45
+
46
+ const PHRASE *m_phraseSource;
47
+ const PHRASE *m_phraseTarget;
48
+
49
+ float m_count;
50
+ float m_pcfgSum;
51
+
52
+ std::map<ALIGNMENT*,float> m_targetToSourceAlignments;
53
+ std::map<std::string,
54
+ std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > > m_properties;
55
+
56
+ float m_lastCount;
57
+ float m_lastPcfgSum;
58
+ std::map<ALIGNMENT*,float>::iterator m_lastTargetToSourceAlignment;
59
+
60
+ public:
61
+
62
+ ExtractionPhrasePair( const PHRASE *phraseSource,
63
+ const PHRASE *phraseTarget,
64
+ ALIGNMENT *targetToSourceAlignment,
65
+ float count, float pcfgSum );
66
+
67
+ ~ExtractionPhrasePair();
68
+
69
+ bool Add( ALIGNMENT *targetToSourceAlignment,
70
+ float count, float pcfgSum );
71
+
72
+ void IncrementPrevious( float count, float pcfgSum );
73
+
74
+ bool Matches( const PHRASE *otherPhraseSource,
75
+ const PHRASE *otherPhraseTarget,
76
+ ALIGNMENT *otherTargetToSourceAlignment ) const;
77
+
78
+ bool Matches( const PHRASE *otherPhraseSource,
79
+ const PHRASE *otherPhraseTarget,
80
+ ALIGNMENT *otherTargetToSourceAlignment,
81
+ bool &sourceMatch,
82
+ bool &targetMatch,
83
+ bool &alignmentMatch ) const;
84
+
85
+ bool MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const;
86
+
87
+ void Clear();
88
+
89
+ bool IsValid() const {
90
+ return m_isValid;
91
+ }
92
+
93
+
94
+ const PHRASE *GetSource() const {
95
+ return m_phraseSource;
96
+ }
97
+
98
+ const PHRASE *GetTarget() const {
99
+ return m_phraseTarget;
100
+ }
101
+
102
+ float GetCount() const {
103
+ return m_count;
104
+ }
105
+
106
+ float GetPcfgScore() const {
107
+ return m_pcfgSum;
108
+ }
109
+
110
+ const size_t GetNumberOfProperties() const {
111
+ return m_properties.size();
112
+ }
113
+
114
+ const std::map<std::string,float> *GetProperty( const std::string &key ) const {
115
+ std::map<std::string, std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::const_iterator iter;
116
+ iter = m_properties.find(key);
117
+ if (iter == m_properties.end()) {
118
+ return NULL;
119
+ } else {
120
+ return iter->second.first;
121
+ }
122
+ }
123
+
124
+ const ALIGNMENT *FindBestAlignmentTargetToSource() const;
125
+
126
+ const std::string *FindBestPropertyValue(const std::string &key) const;
127
+
128
+ std::string CollectAllPropertyValues(const std::string &key) const;
129
+
130
+ std::string CollectAllLabelsSeparateLHSAndRHS(const std::string& propertyKey,
131
+ std::set<std::string>& sourceLabelSet,
132
+ boost::unordered_map<std::string,float>& sourceLHSCounts,
133
+ boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& sourceRHSAndLHSJointCounts,
134
+ Vocabulary &vcbT) const;
135
+
136
+ void CollectAllPhraseOrientations(const std::string &key,
137
+ const std::vector<float> &orientationClassPriorsL2R,
138
+ const std::vector<float> &orientationClassPriorsR2L,
139
+ double smoothingFactor,
140
+ std::ostream &out) const;
141
+
142
+ void UpdateVocabularyFromValueTokens(const std::string& propertyKey,
143
+ std::set<std::string>& vocabulary) const;
144
+
145
+ void AddProperties(const std::string &str, float count);
146
+
147
+ void AddProperty(const std::string &key, const std::string &value, float count) {
148
+ std::map<std::string,
149
+ std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* > >::iterator iter = m_properties.find(key);
150
+ if ( iter == m_properties.end() ) {
151
+ // key not found: insert property key and value
152
+ PROPERTY_VALUES *propertyValues = new PROPERTY_VALUES();
153
+ std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
154
+ LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
155
+ m_properties[key] = std::pair< PROPERTY_VALUES*, LAST_PROPERTY_VALUE* >(propertyValues, lastPropertyValue);
156
+ } else {
157
+ LAST_PROPERTY_VALUE *lastPropertyValue = (iter->second).second;
158
+ if ( (*lastPropertyValue)->first == value ) { // same property key-value pair has been seen right before
159
+ // property key-value pair exists already: add count
160
+ (*lastPropertyValue)->second += count;
161
+ } else { // need to check whether the property key-value pair has appeared before (insert if not)
162
+ // property key exists, but not in combination with this value:
163
+ // add new value with count
164
+ PROPERTY_VALUES *propertyValues = (iter->second).first;
165
+ std::pair<LAST_PROPERTY_VALUE,bool> insertedProperty = propertyValues->insert( std::pair<std::string,float>(value,count) );
166
+ if ( !insertedProperty.second ) { // property value for this key appeared before: add count
167
+ insertedProperty.first->second += count;
168
+ }
169
+ LAST_PROPERTY_VALUE *lastPropertyValue = new LAST_PROPERTY_VALUE(insertedProperty.first);
170
+ delete (iter->second).second;
171
+ (iter->second).second = lastPropertyValue;
172
+ }
173
+ }
174
+ }
175
+
176
+ };
177
+
178
+ }
179
+
mosesdecoder/phrase-extract/Hole.h ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+ #ifndef HOLE_H_INCLUDED_
22
+ #define HOLE_H_INCLUDED_
23
+
24
+ #include <cassert>
25
+ #include <list>
26
+ #include <string>
27
+ #include <vector>
28
+
29
+ namespace MosesTraining
30
+ {
31
+
32
// A gap (non-terminal slot) inside a phrase pair, described by an inclusive
// start/end span, a position and a label for each of two directions.
// The four-argument constructor stores the source span under direction 0 and
// the target span under direction 1.
class Hole
{
protected:
  std::vector<int> m_start, m_end, m_pos;
  std::vector<std::string> m_label;

public:
  // All spans and positions are 0, all labels empty.
  Hole()
    : m_start(2)
    , m_end(2)
    , m_pos(2)
    , m_label(2) {
  }

  Hole(const Hole &copy)
    : m_start(copy.m_start)
    , m_end(copy.m_end)
    , m_pos(copy.m_pos)
    , m_label(copy.m_label) {
  }

  Hole(int startS, int endS, int startT, int endT)
    : m_start(2)
    , m_end(2)
    , m_pos(2)
    , m_label(2) {
    // direction 0
    m_start[0] = startS;
    m_end[0] = endS;
    // direction 1
    m_start[1] = startT;
    m_end[1] = endT;
  }

  int GetStart(size_t direction) const {
    return m_start[direction];
  }

  int GetEnd(size_t direction) const {
    return m_end[direction];
  }

  // Number of positions covered by the span (both ends inclusive).
  int GetSize(size_t direction) const {
    return GetEnd(direction) - GetStart(direction) + 1;
  }

  void SetPos(int pos, size_t direction) {
    m_pos[direction] = pos;
  }

  int GetPos(size_t direction) const {
    return m_pos[direction];
  }

  void SetLabel(const std::string &label, size_t direction) {
    m_label[direction] = label;
  }

  const std::string &GetLabel(size_t direction) const {
    return m_label[direction];
  }

  // True if the two spans share at least one position in the given direction.
  bool Overlap(const Hole &otherHole, size_t direction) const {
    const bool otherEndsBeforeThis = otherHole.GetEnd(direction) < GetStart(direction);
    const bool otherStartsAfterThis = otherHole.GetStart(direction) > GetEnd(direction);
    return !otherEndsBeforeThis && !otherStartsAfterThis;
  }

  // True if the other span ends directly before this one starts, or starts
  // directly after this one ends.
  bool Neighbor(const Hole &otherHole, size_t direction) const {
    if ( otherHole.GetEnd(direction)+1 == GetStart(direction) ) {
      return true;
    }
    return otherHole.GetStart(direction) == GetEnd(direction)+1;
  }
};
102
+
103
+ typedef std::list<Hole> HoleList;
104
+
105
+ class HoleSourceOrderer
106
+ {
107
+ public:
108
+ bool operator()(const Hole* holeA, const Hole* holeB) const {
109
+ assert(holeA->GetStart(0) != holeB->GetStart(0));
110
+ return holeA->GetStart(0) < holeB->GetStart(0);
111
+ }
112
+ };
113
+
114
+ }
115
+
116
+ #endif
mosesdecoder/phrase-extract/HoleCollection.cpp ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "HoleCollection.h"
21
+
22
+ #include <algorithm>
23
+
24
+ namespace MosesTraining
25
+ {
26
+
27
+ void HoleCollection::SortSourceHoles()
28
+ {
29
+ assert(m_sortedSourceHoles.size() == 0);
30
+
31
+ // add
32
+ HoleList::iterator iter;
33
+ for (iter = m_holes.begin(); iter != m_holes.end(); ++iter) {
34
+ Hole &currHole = *iter;
35
+ m_sortedSourceHoles.push_back(&currHole);
36
+ }
37
+
38
+ // sort
39
+ std::sort(m_sortedSourceHoles.begin(), m_sortedSourceHoles.end(), HoleSourceOrderer());
40
+ }
41
+
42
+ void HoleCollection::Add(int startT, int endT, int startS, int endS)
43
+ {
44
+ Hole hole(startS, endS, startT, endT);
45
+ m_scope.push_back(Scope(hole));
46
+ m_sourceHoleStartPoints.push_back(startS);
47
+ m_sourceHoleEndPoints.push_back(endS);
48
+ m_holes.push_back(hole);
49
+ m_sortedSourceHoles.clear();
50
+ }
51
+
52
+ void HoleCollection::RemoveLast()
53
+ {
54
+ m_scope.pop_back();
55
+ m_sourceHoleStartPoints.pop_back();
56
+ m_sourceHoleEndPoints.pop_back();
57
+ m_holes.pop_back();
58
+ m_sortedSourceHoles.clear();
59
+ }
60
+
61
+ int HoleCollection::Scope(const Hole &proposedHole) const
62
+ {
63
+ const int holeStart = proposedHole.GetStart(0);
64
+ const int holeEnd = proposedHole.GetEnd(0);
65
+ int scope = m_scope.back();
66
+ if (holeStart == m_sourcePhraseStart.back() ||
67
+ find(m_sourceHoleEndPoints.begin(), m_sourceHoleEndPoints.end(), holeStart-1) != m_sourceHoleEndPoints.end()) {
68
+ ++scope; // Adding hole would introduce choice point at start of hole.
69
+ }
70
+ if (holeEnd == m_sourcePhraseEnd.back() ||
71
+ find(m_sourceHoleStartPoints.begin(), m_sourceHoleStartPoints.end(), holeEnd-1) != m_sourceHoleStartPoints.end()) {
72
+ ++scope; // Adding hole would introduce choice point at end of hole.
73
+ }
74
+ return scope;
75
+ }
76
+
77
+ }
mosesdecoder/phrase-extract/HoleCollection.h ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+ #ifndef HOLECOLLECTION_H_INCLUDED_
22
+ #define HOLECOLLECTION_H_INCLUDED_
23
+
24
+ #include <set>
25
+ #include <vector>
26
+
27
+ #include "Hole.h"
28
+
29
+ namespace MosesTraining
30
+ {
31
+
32
+ class HoleCollection
33
+ {
34
+ protected:
35
+ HoleList m_holes;
36
+ std::vector<Hole*> m_sortedSourceHoles;
37
+ std::vector<int> m_sourceHoleStartPoints;
38
+ std::vector<int> m_sourceHoleEndPoints;
39
+ std::vector<int> m_scope;
40
+ std::vector<int> m_sourcePhraseStart;
41
+ std::vector<int> m_sourcePhraseEnd;
42
+
43
+ public:
44
+ HoleCollection(int sourcePhraseStart, int sourcePhraseEnd)
45
+ : m_scope(1, 0)
46
+ , m_sourcePhraseStart(1, sourcePhraseStart)
47
+ , m_sourcePhraseEnd(1, sourcePhraseEnd) {
48
+ }
49
+
50
+ const HoleList &GetHoles() const {
51
+ return m_holes;
52
+ }
53
+
54
+ HoleList &GetHoles() {
55
+ return m_holes;
56
+ }
57
+
58
+ std::vector<Hole*> &GetSortedSourceHoles() {
59
+ return m_sortedSourceHoles;
60
+ }
61
+
62
+ void Add(int startT, int endT, int startS, int endS);
63
+
64
+ void RemoveLast();
65
+
66
+ bool OverlapSource(const Hole &sourceHole) const {
67
+ HoleList::const_iterator iter;
68
+ for (iter = m_holes.begin(); iter != m_holes.end(); ++iter) {
69
+ const Hole &currHole = *iter;
70
+ if (currHole.Overlap(sourceHole, 0))
71
+ return true;
72
+ }
73
+ return false;
74
+ }
75
+
76
+ bool ConsecSource(const Hole &sourceHole) const {
77
+ HoleList::const_iterator iter;
78
+ for (iter = m_holes.begin(); iter != m_holes.end(); ++iter) {
79
+ const Hole &currHole = *iter;
80
+ if (currHole.Neighbor(sourceHole, 0))
81
+ return true;
82
+ }
83
+ return false;
84
+ }
85
+
86
+ // Determine the scope that would result from adding the given hole.
87
+ int Scope(const Hole &proposedHole) const;
88
+
89
+ void SortSourceHoles();
90
+
91
+ };
92
+
93
+ }
94
+
95
+ #endif
mosesdecoder/phrase-extract/InputFileStream.cpp ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "InputFileStream.h"
23
+ #include "gzfilebuf.h"
24
+ #include <iostream>
25
+
26
+ using namespace std;
27
+
28
+ namespace Moses
29
+ {
30
+ InputFileStream::InputFileStream(const std::string &filePath)
31
+ : std::istream(NULL)
32
+ , m_streambuf(NULL)
33
+ {
34
+ if (filePath.size() > 3 &&
35
+ filePath.substr(filePath.size() - 3, 3) == ".gz") {
36
+ m_streambuf = new gzfilebuf(filePath.c_str());
37
+ } else {
38
+ std::filebuf* fb = new std::filebuf();
39
+ fb = fb->open(filePath.c_str(), std::ios::in);
40
+ if (! fb) {
41
+ cerr << "Can't read " << filePath.c_str() << endl;
42
+ exit(1);
43
+ }
44
+ m_streambuf = fb;
45
+ }
46
+ this->init(m_streambuf);
47
+ }
48
+
49
+ InputFileStream::~InputFileStream()
50
+ {
51
+ delete m_streambuf;
52
+ m_streambuf = NULL;
53
+ }
54
+
55
+ void InputFileStream::Close()
56
+ {
57
+ }
58
+
59
+
60
+ }
61
+
mosesdecoder/phrase-extract/InputFileStream.h ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_InputFileStream_h
23
+ #define moses_InputFileStream_h
24
+
25
+ #include <cstdlib>
26
+ #include <fstream>
27
+ #include <string>
28
+
29
+ namespace Moses
30
+ {
31
+
32
/**
 * Drop-in replacement for std::istream that reads from a file path and
 * transparently handles gzip-compressed input when the path ends in ".gz".
 */
class InputFileStream : public std::istream
{
public:
  /** Open the file at filePath for reading. */
  explicit InputFileStream(const std::string &filePath);

  /** Frees the owned stream buffer. */
  ~InputFileStream();

  /** Kept for interface symmetry with output streams. */
  void Close();

protected:
  // Buffer the istream base reads from; owned by this object.
  std::streambuf *m_streambuf;
};
45
+
46
+ }
47
+
48
+ #endif
mosesdecoder/phrase-extract/InternalStructFeature.cpp ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "InternalStructFeature.h"
2
+ #include <map>
3
+
4
+ using namespace std;
5
+
6
+ namespace MosesTraining
7
+ {
8
+
9
+ void InternalStructFeature::add(const ScoreFeatureContext& context,
10
+ std::vector<float>& denseValues,
11
+ std::map<std::string,float>& sparseValues) const
12
+ {
13
+ const std::map<std::string,float> *allTrees = context.phrasePair.GetProperty("Tree"); // our would we rather want to take the most frequent one only?
14
+ for ( std::map<std::string,float>::const_iterator iter=allTrees->begin();
15
+ iter!=allTrees->end(); ++iter ) {
16
+ add(&(iter->first), iter->second, denseValues, sparseValues);
17
+ }
18
+ }
19
+
20
+ void InternalStructFeatureDense::add(const std::string *treeFragment,
21
+ float count,
22
+ std::vector<float>& denseValues,
23
+ std::map<std::string,float>& sparseValues) const
24
+ {
25
+ //cout<<"Dense: "<<*internalStruct<<endl;
26
+ size_t start=0;
27
+ int countNP=0;
28
+ while((start = treeFragment->find("NP", start)) != string::npos) {
29
+ countNP += count;
30
+ start+=2; //length of "NP"
31
+ }
32
+ //should add e^countNP so in the decoder I get log(e^countNP)=countNP -> but is log or ln?
33
+ //should use this but don't know what it does? -> maybeLog( (bitmap == i) ? 2.718 : 1 )
34
+ denseValues.push_back(exp(countNP));
35
+
36
+ }
37
+
38
+ void InternalStructFeatureSparse::add(const std::string *treeFragment,
39
+ float count,
40
+ std::vector<float>& denseValues,
41
+ std::map<std::string,float>& sparseValues) const
42
+ {
43
+ //cout<<"Sparse: "<<*internalStruct<<endl;
44
+ if(treeFragment->find("VBZ")!=std::string::npos)
45
+ sparseValues["NTVBZ"] += count;
46
+ if(treeFragment->find("VBD")!=std::string::npos)
47
+ sparseValues["NTVBD"] += count;
48
+ if(treeFragment->find("VBP")!=std::string::npos)
49
+ sparseValues["NTVBP"] += count;
50
+ if(treeFragment->find("PP")!=std::string::npos)
51
+ sparseValues["NTPP"] += count;
52
+ if(treeFragment->find("SBAR")!=std::string::npos)
53
+ sparseValues["NTSBAR"] += count;
54
+ }
55
+
56
+
57
+ }
mosesdecoder/phrase-extract/InternalStructFeature.h ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+ #include <fstream>
3
+ #include <cassert>
4
+ #include <cstdlib>
5
+ #include <string>
6
+ #include <queue>
7
+ #include <map>
8
+ #include <cmath>
9
+
10
+ #include "ScoreFeature.h"
11
+ #include "extract-ghkm/Node.h"
12
+
13
+ namespace MosesTraining
14
+ {
15
+
16
+
17
+ class InternalStructFeature : public ScoreFeature
18
+ {
19
+ public:
20
+ InternalStructFeature() : m_type(0) {};
21
+ /** Add the values for this feature function. */
22
+ void add(const ScoreFeatureContext& context,
23
+ std::vector<float>& denseValues,
24
+ std::map<std::string,float>& sparseValues) const;
25
+
26
+
27
+ protected:
28
+ /** Overridden in subclass */
29
+ virtual void add(const std::string *treeFragment,
30
+ float count,
31
+ std::vector<float>& denseValues,
32
+ std::map<std::string,float>& sparseValues) const = 0;
33
+ int m_type;
34
+ };
35
+
36
+ class InternalStructFeatureDense : public InternalStructFeature
37
+ {
38
+ public:
39
+ InternalStructFeatureDense()
40
+ :InternalStructFeature() {
41
+ m_type=1;
42
+ } //std::cout<<"InternalStructFeatureDense: Construct "<<m_type<<"\n";}
43
+ protected:
44
+ virtual void add(const std::string *treeFragment,
45
+ float count,
46
+ std::vector<float>& denseValues,
47
+ std::map<std::string,float>& sparseValues) const;
48
+ };
49
+
50
+ class InternalStructFeatureSparse : public InternalStructFeature
51
+ {
52
+ public:
53
+ InternalStructFeatureSparse()
54
+ :InternalStructFeature() {
55
+ m_type=2;
56
+ }// std::cout<<"InternalStructFeatureSparse: Construct "<<m_type<<"\n";}
57
+ protected:
58
+ virtual void add(const std::string *treeFragment,
59
+ float count,
60
+ std::vector<float>& denseValues,
61
+ std::map<std::string,float>& sparseValues) const;
62
+ };
63
+
64
+ }
mosesdecoder/phrase-extract/Jamfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local most-deps = [ glob *.cpp : ExtractionPhrasePair.cpp *Test.cpp *-main.cpp ] ;
2
+ # Build each .o file once, with the include-path settings, so the objects can be reused by the targets below.
3
+ for local d in $(most-deps) {
4
+ obj $(d:B).o : $(d) ;
5
+ }
6
+ #and stuff them into an alias.
7
+ alias deps : $(most-deps:B).o ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ../moses//ThreadPool ../moses//Util ../util//kenutil ;
8
+
9
+ #ExtractionPhrasePair.cpp requires that main define some global variables.
10
+ #Build the mains that do not need these global variables.
11
+ for local m in [ glob *-main.cpp : score-main.cpp ] {
12
+ exe [ MATCH "(.*)-main.cpp" : $(m) ] : $(m) deps ;
13
+ }
14
+
15
+ #The side dishes that use ExtractionPhrasePair.cpp
16
+ exe score : ExtractionPhrasePair.cpp score-main.cpp deps ;
17
+
18
+ import testing ;
19
+ run ScoreFeatureTest.cpp ExtractionPhrasePair.cpp deps ..//boost_unit_test_framework ..//boost_iostreams : : test.domain ;
mosesdecoder/phrase-extract/OutputFileStream.cpp ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include <iostream>
23
+ #include <boost/algorithm/string/predicate.hpp>
24
+ #include <boost/iostreams/filter/gzip.hpp>
25
+ #include "OutputFileStream.h"
26
+ #include "gzfilebuf.h"
27
+
28
+ using namespace std;
29
+ using namespace boost::algorithm;
30
+
31
+ namespace Moses
32
+ {
33
+ OutputFileStream::OutputFileStream()
34
+ :boost::iostreams::filtering_ostream()
35
+ ,m_outFile(NULL)
36
+ ,m_open(false)
37
+ {
38
+ }
39
+
40
+ OutputFileStream::OutputFileStream(const std::string &filePath)
41
+ :m_outFile(NULL)
42
+ ,m_open(false)
43
+ {
44
+ Open(filePath);
45
+ }
46
+
47
+ OutputFileStream::~OutputFileStream()
48
+ {
49
+ Close();
50
+ }
51
+
52
+ bool OutputFileStream::Open(const std::string &filePath)
53
+ {
54
+ assert(!m_open);
55
+ if (filePath == std::string("-")) {
56
+ // Write to standard output. Leave m_outFile null.
57
+ this->push(std::cout);
58
+ } else {
59
+ m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
60
+ if (m_outFile->fail()) {
61
+ return false;
62
+ }
63
+
64
+ if (ends_with(filePath, ".gz")) {
65
+ this->push(boost::iostreams::gzip_compressor());
66
+ }
67
+ this->push(*m_outFile);
68
+ }
69
+
70
+ m_open = true;
71
+ return true;
72
+ }
73
+
74
+ void OutputFileStream::Close()
75
+ {
76
+ if (!m_open) return;
77
+ this->flush();
78
+ if (m_outFile) {
79
+ this->pop(); // file
80
+
81
+ m_outFile->close();
82
+ delete m_outFile;
83
+ m_outFile = NULL;
84
+ }
85
+ m_open = false;
86
+ }
87
+
88
+
89
+ }
90
+
mosesdecoder/phrase-extract/OutputFileStream.h ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #pragma once
23
+
24
+ #include <cstdlib>
25
+ #include <fstream>
26
+ #include <string>
27
+ #include <iostream>
28
+ #include <boost/iostreams/filtering_stream.hpp>
29
+
30
+ namespace Moses
31
+ {
32
+
33
+ /** Version of std::ostream with transparent compression.
34
+ *
35
+ * Transparently compresses output when writing to a file whose name ends in
36
+ * ".gz". Or, writes to stdout instead of a file when given a filename
37
+ * consisting of just a dash ("-").
38
+ */
39
+ class OutputFileStream : public boost::iostreams::filtering_ostream
40
+ {
41
+ private:
42
+ /** File that needs flushing & closing when we close this stream.
43
+ *
44
+ * Is NULL when no file is opened, e.g. when writing to standard output.
45
+ */
46
+ std::ofstream *m_outFile;
47
+
48
+ /// Is this stream open?
49
+ bool m_open;
50
+
51
+ public:
52
+ /** Create an unopened OutputFileStream.
53
+ *
54
+ * Until it's been opened, nothing can be done with this stream.
55
+ */
56
+ OutputFileStream();
57
+
58
+ /// Create an OutputFileStream, and open it by calling Open().
59
+ OutputFileStream(const std::string &filePath);
60
+ virtual ~OutputFileStream();
61
+
62
+ // TODO: Can we please just always throw an exception when this fails?
63
+ /** Open stream.
64
+ *
65
+ * If filePath is "-" (just a dash), this opens the stream for writing to
66
+ * standard output. Otherwise, it opens the given file. If the filename
67
+ * has the ".gz" suffix, output will be transparently compressed.
68
+ *
69
+ * Call Close() to close the file.
70
+ *
71
+ * Returns whether opening the file was successful. It may also throw an
72
+ * exception on failure.
73
+ */
74
+ bool Open(const std::string &filePath);
75
+
76
+ /// Flush and close stream. After this, the stream can be opened again.
77
+ void Close();
78
+ };
79
+
80
+ }
81
+
mosesdecoder/phrase-extract/PhraseExtractionOptions.h ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ /***********************************************************************
3
+ Moses - factored phrase-based language decoder
4
+ Copyright (C) 2010 University of Edinburgh
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
+
22
+ #include <string>
23
+ #include <vector>
24
+
25
+ namespace MosesTraining
26
+ {
27
// Reordering model variants selectable for the word/phrase/hierarchical models.
enum REO_MODEL_TYPE {
  REO_MSD,
  REO_MSLR,
  REO_MONO
};

// Reordering position labels.
enum REO_POS {
  LEFT,
  RIGHT,
  DLEFT,
  DRIGHT,
  UNKNOWN
};


/**
 * Plain option bag for phrase extraction.
 *
 * A few options are public data members; the remaining ones are private and
 * accessed through paired initXxx() setters and isXxx()/getXxx() getters.
 */
class PhraseExtractionOptions
{

public:
  int maxPhraseLength;
  int minPhraseLength;
  std::string separator;

private:
  bool allModelsOutputFlag;
  bool wordModel;
  REO_MODEL_TYPE wordType;
  bool phraseModel;
  REO_MODEL_TYPE phraseType;
  bool hierModel;
  REO_MODEL_TYPE hierType;
  bool orientationFlag;
  bool translationFlag;
  bool includeSentenceIdFlag; // include sentence id in extract file
  bool onlyOutputSpanInfo;
  bool gzOutput;
  std::string instanceWeightsFile; // weights for each sentence
  bool targetConstituentConstrainedFlag;
  bool targetConstituentBoundariesFlag;
  bool flexScoreFlag;
  bool singleWordHeuristicFlag;

public:
  std::vector<std::string> placeholders;
  bool debug;

  /**
   * Set the maximum phrase length and give every other option its default:
   * minimum phrase length 3, separator "|||", all model types REO_MSD,
   * translationFlag true, and every other flag false.
   */
  PhraseExtractionOptions(const int initmaxPhraseLength)
    : maxPhraseLength(initmaxPhraseLength)
    , minPhraseLength(3)
    , separator("|||")
    , allModelsOutputFlag(false)
    , wordModel(false)
    , wordType(REO_MSD)
    , phraseModel(false)
    , phraseType(REO_MSD)
    , hierModel(false)
    , hierType(REO_MSD)
    , orientationFlag(false)
    , translationFlag(true)
    , includeSentenceIdFlag(false)
    , onlyOutputSpanInfo(false)
    , gzOutput(false)
    , targetConstituentConstrainedFlag(false)
    , targetConstituentBoundariesFlag(false)
    , flexScoreFlag(false)
    , singleWordHeuristicFlag(false)
    , debug(false)
  {}

  // ---- setters ------------------------------------------------------------

  void initAllModelsOutputFlag(const bool initallModelsOutputFlag) {
    this->allModelsOutputFlag = initallModelsOutputFlag;
  }

  void initWordModel(const bool initwordModel) {
    this->wordModel = initwordModel;
  }

  void initWordType(REO_MODEL_TYPE initwordType ) {
    this->wordType = initwordType;
  }

  void initPhraseModel(const bool initphraseModel ) {
    this->phraseModel = initphraseModel;
  }

  void initPhraseType(REO_MODEL_TYPE initphraseType) {
    this->phraseType = initphraseType;
  }

  void initHierModel(const bool inithierModel) {
    this->hierModel = inithierModel;
  }

  void initHierType(REO_MODEL_TYPE inithierType) {
    this->hierType = inithierType;
  }

  void initOrientationFlag(const bool initorientationFlag) {
    this->orientationFlag = initorientationFlag;
  }

  void initTranslationFlag(const bool inittranslationFlag) {
    this->translationFlag = inittranslationFlag;
  }

  void initIncludeSentenceIdFlag(const bool initincludeSentenceIdFlag) {
    this->includeSentenceIdFlag = initincludeSentenceIdFlag;
  }

  void initOnlyOutputSpanInfo(const bool initonlyOutputSpanInfo) {
    this->onlyOutputSpanInfo = initonlyOutputSpanInfo;
  }

  void initGzOutput (const bool initgzOutput) {
    this->gzOutput = initgzOutput;
  }

  // The argument must be a valid C string; it is copied.
  void initInstanceWeightsFile(const char* initInstanceWeightsFile) {
    this->instanceWeightsFile = std::string(initInstanceWeightsFile);
  }

  void initTargetConstituentConstrainedFlag(const bool initTargetConstituentConstrainedFlag) {
    this->targetConstituentConstrainedFlag = initTargetConstituentConstrainedFlag;
  }

  void initTargetConstituentBoundariesFlag(const bool initTargetConstituentBoundariesFlag) {
    this->targetConstituentBoundariesFlag = initTargetConstituentBoundariesFlag;
  }

  void initFlexScoreFlag(const bool initflexScoreFlag) {
    this->flexScoreFlag = initflexScoreFlag;
  }

  void initSingleWordHeuristicFlag(const bool initSingleWordHeuristicFlag) {
    this->singleWordHeuristicFlag = initSingleWordHeuristicFlag;
  }

  // ---- getters ------------------------------------------------------------

  bool isAllModelsOutputFlag() const {
    return this->allModelsOutputFlag;
  }

  bool isWordModel() const {
    return this->wordModel;
  }

  REO_MODEL_TYPE isWordType() const {
    return this->wordType;
  }

  bool isPhraseModel() const {
    return this->phraseModel;
  }

  REO_MODEL_TYPE isPhraseType() const {
    return this->phraseType;
  }

  bool isHierModel() const {
    return this->hierModel;
  }

  REO_MODEL_TYPE isHierType() const {
    return this->hierType;
  }

  bool isOrientationFlag() const {
    return this->orientationFlag;
  }

  bool isTranslationFlag() const {
    return this->translationFlag;
  }

  bool isIncludeSentenceIdFlag() const {
    return this->includeSentenceIdFlag;
  }

  bool isOnlyOutputSpanInfo() const {
    return this->onlyOutputSpanInfo;
  }

  bool isGzOutput () const {
    return this->gzOutput;
  }

  std::string getInstanceWeightsFile() const {
    return this->instanceWeightsFile;
  }

  bool isTargetConstituentConstrainedFlag() const {
    return this->targetConstituentConstrainedFlag;
  }

  bool isTargetConstituentBoundariesFlag() const {
    return this->targetConstituentBoundariesFlag;
  }

  bool isFlexScoreFlag() const {
    return this->flexScoreFlag;
  }

  bool isSingleWordHeuristicFlag() const {
    return this->singleWordHeuristicFlag;
  }
};
191
+
192
+ }
193
+
mosesdecoder/phrase-extract/PhraseOrientation.cpp ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "PhraseOrientation.h"
21
+
22
+ #include <iostream>
23
+ #include <sstream>
24
+ #include <limits>
25
+ #include <cassert>
26
+
27
+ #include <boost/assign/list_of.hpp>
28
+
29
+ namespace MosesTraining
30
+ {
31
+
32
+ std::vector<float> PhraseOrientation::m_l2rOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0);
33
+ std::vector<float> PhraseOrientation::m_r2lOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0);
34
+
35
+ PhraseOrientation::PhraseOrientation(int sourceSize,
36
+ int targetSize,
37
+ const Alignment &alignment)
38
+ : m_countF(sourceSize)
39
+ , m_countE(targetSize)
40
+ {
41
+ // prepare data structures for alignments
42
+ std::vector<std::vector<int> > alignedToS;
43
+ for(int i=0; i<m_countF; ++i) {
44
+ std::vector< int > dummy;
45
+ alignedToS.push_back(dummy);
46
+ }
47
+ for(int i=0; i<m_countE; ++i) {
48
+ std::vector< int > dummy;
49
+ m_alignedToT.push_back(dummy);
50
+ }
51
+ std::vector<int> alignedCountS(m_countF,0);
52
+
53
+ for (Alignment::const_iterator a=alignment.begin(); a!=alignment.end(); ++a) {
54
+ alignedToS[a->first].push_back(a->second);
55
+ alignedCountS[a->first]++;
56
+ m_alignedToT[a->second].push_back(a->first);
57
+ }
58
+
59
+ Init(sourceSize, targetSize, m_alignedToT, alignedToS, alignedCountS);
60
+ }
61
+
62
+
63
+ PhraseOrientation::PhraseOrientation(int sourceSize,
64
+ int targetSize,
65
+ const Moses::AlignmentInfo &alignTerm,
66
+ const Moses::AlignmentInfo &alignNonTerm)
67
+ : m_countF(sourceSize)
68
+ , m_countE(targetSize)
69
+ {
70
+ // prepare data structures for alignments
71
+ std::vector<std::vector<int> > alignedToS;
72
+ for(int i=0; i<m_countF; ++i) {
73
+ std::vector< int > dummy;
74
+ alignedToS.push_back(dummy);
75
+ }
76
+ for(int i=0; i<m_countE; ++i) {
77
+ std::vector< int > dummy;
78
+ m_alignedToT.push_back(dummy);
79
+ }
80
+ std::vector<int> alignedCountS(m_countF,0);
81
+
82
+ for (Moses::AlignmentInfo::const_iterator it=alignTerm.begin();
83
+ it!=alignTerm.end(); ++it) {
84
+ alignedToS[it->first].push_back(it->second);
85
+ alignedCountS[it->first]++;
86
+ m_alignedToT[it->second].push_back(it->first);
87
+ }
88
+
89
+ for (Moses::AlignmentInfo::const_iterator it=alignNonTerm.begin();
90
+ it!=alignNonTerm.end(); ++it) {
91
+ alignedToS[it->first].push_back(it->second);
92
+ alignedCountS[it->first]++;
93
+ m_alignedToT[it->second].push_back(it->first);
94
+ }
95
+
96
+ Init(sourceSize, targetSize, m_alignedToT, alignedToS, alignedCountS);
97
+ }
98
+
99
+ PhraseOrientation::PhraseOrientation(int sourceSize,
100
+ int targetSize,
101
+ const std::vector<std::vector<int> > &alignedToT,
102
+ const std::vector<std::vector<int> > &alignedToS,
103
+ const std::vector<int> &alignedCountS)
104
+ : m_countF(sourceSize)
105
+ , m_countE(targetSize)
106
+ , m_alignedToT(alignedToT)
107
+ {
108
+ Init(sourceSize, targetSize, m_alignedToT, alignedToS, alignedCountS);
109
+ }
110
+
111
+
112
+ void PhraseOrientation::Init(int sourceSize,
113
+ int targetSize,
114
+ const std::vector<std::vector<int> > &alignedToT,
115
+ const std::vector<std::vector<int> > &alignedToS,
116
+ const std::vector<int> &alignedCountS)
117
+ {
118
+ for (int startF=0; startF<m_countF; ++startF) {
119
+ for (int endF=startF; endF<m_countF; ++endF) {
120
+
121
+ int minE = std::numeric_limits<int>::max();
122
+ int maxE = -1;
123
+ for (int fi=startF; fi<=endF; ++fi) {
124
+ for (size_t i=0; i<alignedToS[fi].size(); ++i) {
125
+ int ei = alignedToS[fi][i];
126
+ if (ei<minE) {
127
+ minE = ei;
128
+ }
129
+ if (ei>maxE) {
130
+ maxE = ei;
131
+ }
132
+ }
133
+ }
134
+
135
+ m_minAndMaxAlignedToSourceSpan[ std::pair<int,int>(startF,endF) ] = std::pair<int,int>(minE,maxE);
136
+ }
137
+ }
138
+
139
+ // check alignments for target phrase startE...endE
140
+ // loop over continuous phrases which are compatible with the word alignments
141
+ for (int startE=0; startE<m_countE; ++startE) {
142
+ for (int endE=startE; endE<m_countE; ++endE) {
143
+
144
+ int minF = std::numeric_limits<int>::max();
145
+ int maxF = -1;
146
+ std::vector< int > usedF = alignedCountS;
147
+ for (int ei=startE; ei<=endE; ++ei) {
148
+ for (size_t i=0; i<alignedToT[ei].size(); ++i) {
149
+ int fi = alignedToT[ei][i];
150
+ if (fi<minF) {
151
+ minF = fi;
152
+ }
153
+ if (fi>maxF) {
154
+ maxF = fi;
155
+ }
156
+ usedF[fi]--;
157
+ }
158
+ }
159
+
160
+ m_minAndMaxAlignedToTargetSpan[ std::pair<int,int>(startE,endE) ] = std::pair<int,int>(minF,maxF);
161
+
162
+ if (maxF >= 0) { // aligned to any source words at all
163
+
164
+ // check if source words are aligned to out of bounds target words
165
+ bool out_of_bounds = false;
166
+ for (int fi=minF; fi<=maxF && !out_of_bounds; ++fi)
167
+ if (usedF[fi]>0) {
168
+ // cout << "out of bounds: " << fi << "\n";
169
+ out_of_bounds = true;
170
+ }
171
+
172
+ // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
173
+ if (!out_of_bounds) {
174
+ // start point of source phrase may retreat over unaligned
175
+ for (int startF=minF;
176
+ (startF>=0 &&
177
+ (startF==minF || alignedCountS[startF]==0)); // unaligned
178
+ startF--) {
179
+ // end point of source phrase may advance over unaligned
180
+ for (int endF=maxF;
181
+ (endF<m_countF &&
182
+ (endF==maxF || alignedCountS[endF]==0)); // unaligned
183
+ endF++) { // at this point we have extracted a phrase
184
+
185
+ InsertPhraseVertices(m_topLeft, m_topRight, m_bottomLeft, m_bottomRight,
186
+ startF, startE, endF, endE);
187
+ }
188
+ }
189
+ }
190
+ }
191
+ }
192
+ }
193
+ }
194
+
195
+
196
+ void PhraseOrientation::InsertVertex( HSentenceVertices & corners, int x, int y )
197
+ {
198
+ std::set<int> tmp;
199
+ tmp.insert(x);
200
+ std::pair< HSentenceVertices::iterator, bool > ret = corners.insert( std::pair<int, std::set<int> > (y, tmp) );
201
+ if (ret.second == false) {
202
+ ret.first->second.insert(x);
203
+ }
204
+ }
205
+
206
+
207
+ void PhraseOrientation::InsertPhraseVertices(HSentenceVertices & topLeft,
208
+ HSentenceVertices & topRight,
209
+ HSentenceVertices & bottomLeft,
210
+ HSentenceVertices & bottomRight,
211
+ int startF, int startE, int endF, int endE)
212
+ {
213
+
214
+ InsertVertex(topLeft, startF, startE);
215
+ InsertVertex(topRight, endF, startE);
216
+ InsertVertex(bottomLeft, startF, endE);
217
+ InsertVertex(bottomRight, endF, endE);
218
+ }
219
+
220
+
221
+ const std::string PhraseOrientation::GetOrientationInfoString(int startF, int endF, REO_DIR direction) const
222
+ {
223
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator foundMinMax
224
+ = m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) );
225
+
226
+ if ( foundMinMax != m_minAndMaxAlignedToSourceSpan.end() ) {
227
+ int startE = (foundMinMax->second).first;
228
+ int endE = (foundMinMax->second).second;
229
+ // std::cerr << "Phrase orientation for"
230
+ // << " startF=" << startF
231
+ // << " endF=" << endF
232
+ // << " startE=" << startE
233
+ // << " endE=" << endE
234
+ // << std::endl;
235
+ return GetOrientationInfoString(startF, startE, endF, endE, direction);
236
+ } else {
237
+ std::cerr << "PhraseOrientation::GetOrientationInfoString(): Error: not able to determine phrase orientation" << std::endl;
238
+ std::exit(1);
239
+ }
240
+ }
241
+
242
+
243
+ const std::string PhraseOrientation::GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction) const
244
+ {
245
+ REO_CLASS hierPrevOrient=REO_CLASS_UNKNOWN, hierNextOrient=REO_CLASS_UNKNOWN;
246
+
247
+ if ( direction == REO_DIR_L2R || direction == REO_DIR_BIDIR )
248
+ hierPrevOrient = GetOrientationInfo(startF, startE, endF, endE, REO_DIR_L2R);
249
+
250
+ if ( direction == REO_DIR_R2L || direction == REO_DIR_BIDIR )
251
+ hierNextOrient = GetOrientationInfo(startF, startE, endF, endE, REO_DIR_R2L);
252
+
253
+ switch (direction) {
254
+ case REO_DIR_L2R:
255
+ return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR);
256
+ break;
257
+ case REO_DIR_R2L:
258
+ return GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR);
259
+ break;
260
+ case REO_DIR_BIDIR:
261
+ return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR);
262
+ break;
263
+ default:
264
+ return GetOrientationString(hierPrevOrient, REO_MODEL_TYPE_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MODEL_TYPE_MSLR);
265
+ break;
266
+ }
267
+ return "PhraseOrientationERROR";
268
+ }
269
+
270
+
271
+ PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientationInfo(int startF, int endF, REO_DIR direction) const
272
+ {
273
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator foundMinMax
274
+ = m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) );
275
+
276
+ if ( foundMinMax != m_minAndMaxAlignedToSourceSpan.end() ) {
277
+ int startE = (foundMinMax->second).first;
278
+ int endE = (foundMinMax->second).second;
279
+ // std::cerr << "Phrase orientation for"
280
+ // << " startF=" << startF
281
+ // << " endF=" << endF
282
+ // << " startE=" << startE
283
+ // << " endE=" << endE
284
+ // << std::endl;
285
+ return GetOrientationInfo(startF, startE, endF, endE, direction);
286
+ } else {
287
+ std::cerr << "PhraseOrientation::GetOrientationInfo(): Error: not able to determine phrase orientation" << std::endl;
288
+ std::exit(1);
289
+ }
290
+ }
291
+
292
+
293
+ PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const
294
+ {
295
+ if ( direction != REO_DIR_L2R && direction != REO_DIR_R2L ) {
296
+ std::cerr << "PhraseOrientation::GetOrientationInfo(): Error: direction should be either L2R or R2L" << std::endl;
297
+ std::exit(1);
298
+ }
299
+
300
+ if ( direction == REO_DIR_L2R )
301
+ return GetOrientHierModel(REO_MODEL_TYPE_MSLR,
302
+ startF, endF, startE, endE, m_countF-1, 0, 0, 1,
303
+ &ge, &le,
304
+ m_bottomRight, m_bottomLeft);
305
+
306
+ if ( direction == REO_DIR_R2L )
307
+ return GetOrientHierModel(REO_MODEL_TYPE_MSLR,
308
+ endF, startF, endE, startE, 0, m_countF-1, m_countE-1, -1,
309
+ &le, &ge,
310
+ m_topLeft, m_topRight);
311
+
312
+ return REO_CLASS_UNKNOWN;
313
+ }
314
+
315
+
316
+ // to be called with countF-1 instead of countF
317
+ PhraseOrientation::REO_CLASS PhraseOrientation::GetOrientHierModel(REO_MODEL_TYPE modelType,
318
+ int startF, int endF, int startE, int endE, int countF, int zeroF, int zeroE, int unit,
319
+ bool (*ge)(int, int), bool (*le)(int, int),
320
+ const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const
321
+ {
322
+ bool leftSourceSpanIsAligned = ( (startF != zeroF) && SourceSpanIsAligned(zeroF,startF-unit) );
323
+ bool topTargetSpanIsAligned = ( (startE != zeroE) && TargetSpanIsAligned(zeroE,startE-unit) );
324
+
325
+ if (!topTargetSpanIsAligned && !leftSourceSpanIsAligned)
326
+ return REO_CLASS_LEFT;
327
+
328
+ HSentenceVertices::const_iterator it;
329
+
330
+ if (//(connectedLeftTop && !connectedRightTop) ||
331
+ ((it = bottomRight.find(startE - unit)) != bottomRight.end() &&
332
+ it->second.find(startF-unit) != it->second.end()))
333
+ return REO_CLASS_LEFT;
334
+
335
+ if (modelType == REO_MODEL_TYPE_MONO)
336
+ return REO_CLASS_UNKNOWN;
337
+
338
+ if (//(!connectedLeftTop && connectedRightTop) ||
339
+ ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() &&
340
+ it->second.find(endF + unit) != it->second.end()))
341
+ return REO_CLASS_RIGHT;
342
+
343
+ if (modelType == REO_MODEL_TYPE_MSD)
344
+ return REO_CLASS_UNKNOWN;
345
+
346
+ for (int indexF=startF-2*unit; (*ge)(indexF, zeroF); indexF=indexF-unit) {
347
+ if ((it = bottomRight.find(startE - unit)) != bottomRight.end() &&
348
+ it->second.find(indexF) != it->second.end())
349
+ return REO_CLASS_DLEFT;
350
+ }
351
+
352
+ for (int indexF=endF+2*unit; (*le)(indexF, countF); indexF=indexF+unit) {
353
+ if ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() &&
354
+ it->second.find(indexF) != it->second.end())
355
+ return REO_CLASS_DRIGHT;
356
+ }
357
+
358
+ return REO_CLASS_UNKNOWN;
359
+ }
360
+
361
+ bool PhraseOrientation::SourceSpanIsAligned(int index1, int index2) const
362
+ {
363
+ return SpanIsAligned(index1, index2, m_minAndMaxAlignedToSourceSpan);
364
+ }
365
+
366
+ bool PhraseOrientation::TargetSpanIsAligned(int index1, int index2) const
367
+ {
368
+ return SpanIsAligned(index1, index2, m_minAndMaxAlignedToTargetSpan);
369
+ }
370
+
371
+ bool PhraseOrientation::SpanIsAligned(int index1, int index2, const boost::unordered_map< std::pair<int,int> , std::pair<int,int> > &minAndMaxAligned) const
372
+ {
373
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator itMinAndMaxAligned =
374
+ minAndMaxAligned.find(std::pair<int,int>(std::min(index1,index2),std::max(index1,index2)));
375
+
376
+ if (itMinAndMaxAligned == minAndMaxAligned.end()) {
377
+ std::cerr << "PhraseOrientation::SourceSpanIsAligned(): Error" << std::endl;
378
+ std::exit(1);
379
+ } else {
380
+ if (itMinAndMaxAligned->second.first == std::numeric_limits<int>::max()) {
381
+ return false;
382
+ }
383
+ }
384
+ return true;
385
+ }
386
+
387
+
388
+ const std::string PhraseOrientation::GetOrientationString(const REO_CLASS orient, const REO_MODEL_TYPE modelType)
389
+ {
390
+ std::ostringstream oss;
391
+ WriteOrientation(oss, orient, modelType);
392
+ return oss.str();
393
+ }
394
+
395
+
396
+ void PhraseOrientation::WriteOrientation(std::ostream& out, const REO_CLASS orient, const REO_MODEL_TYPE modelType)
397
+ {
398
+ switch(orient) {
399
+ case REO_CLASS_LEFT:
400
+ out << "mono";
401
+ break;
402
+ case REO_CLASS_RIGHT:
403
+ out << "swap";
404
+ break;
405
+ case REO_CLASS_DLEFT:
406
+ out << "dleft";
407
+ break;
408
+ case REO_CLASS_DRIGHT:
409
+ out << "dright";
410
+ break;
411
+ case REO_CLASS_UNKNOWN:
412
+ switch(modelType) {
413
+ case REO_MODEL_TYPE_MONO:
414
+ out << "nomono";
415
+ break;
416
+ case REO_MODEL_TYPE_MSD:
417
+ out << "other";
418
+ break;
419
+ case REO_MODEL_TYPE_MSLR:
420
+ out << "dleft";
421
+ break;
422
+ }
423
+ break;
424
+ }
425
+ }
426
+
427
+
428
+ bool PhraseOrientation::IsAligned(int fi, int ei) const
429
+ {
430
+ if (ei == -1 && fi == -1)
431
+ return true;
432
+
433
+ if (ei <= -1 || fi <= -1)
434
+ return false;
435
+
436
+ if (ei == m_countE && fi == m_countF)
437
+ return true;
438
+
439
+ if (ei >= m_countE || fi >= m_countF)
440
+ return false;
441
+
442
+ for (size_t i=0; i<m_alignedToT[ei].size(); ++i)
443
+ if (m_alignedToT[ei][i] == fi)
444
+ return true;
445
+
446
+ return false;
447
+ }
448
+
449
+
450
+ void PhraseOrientation::IncrementPriorCount(REO_DIR direction, REO_CLASS orient, float increment)
451
+ {
452
+ assert(direction==REO_DIR_L2R || direction==REO_DIR_R2L);
453
+ if (direction == REO_DIR_L2R) {
454
+ m_l2rOrientationPriorCounts[orient] += increment;
455
+ } else if (direction == REO_DIR_R2L) {
456
+ m_r2lOrientationPriorCounts[orient] += increment;
457
+ }
458
+ }
459
+
460
+
461
+ void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType)
462
+ {
463
+ std::map<std::string,float> l2rOrientationPriorCountsMap;
464
+ std::map<std::string,float> r2lOrientationPriorCountsMap;
465
+ for (int orient=0; orient<=REO_CLASS_UNKNOWN; ++orient) {
466
+ l2rOrientationPriorCountsMap[GetOrientationString((REO_CLASS)orient, modelType)] += m_l2rOrientationPriorCounts[orient];
467
+ }
468
+ for (int orient=0; orient<=REO_CLASS_UNKNOWN; ++orient) {
469
+ r2lOrientationPriorCountsMap[GetOrientationString((REO_CLASS)orient, modelType)] += m_r2lOrientationPriorCounts[orient];
470
+ }
471
+ for (std::map<std::string,float>::const_iterator l2rOrientationPriorCountsMapIt = l2rOrientationPriorCountsMap.begin();
472
+ l2rOrientationPriorCountsMapIt != l2rOrientationPriorCountsMap.end(); ++l2rOrientationPriorCountsMapIt) {
473
+ out << "L2R_" << l2rOrientationPriorCountsMapIt->first << " " << l2rOrientationPriorCountsMapIt->second << std::endl;
474
+ }
475
+ for (std::map<std::string,float>::const_iterator r2lOrientationPriorCountsMapIt = r2lOrientationPriorCountsMap.begin();
476
+ r2lOrientationPriorCountsMapIt != r2lOrientationPriorCountsMap.end(); ++r2lOrientationPriorCountsMapIt) {
477
+ out << "R2L_" << r2lOrientationPriorCountsMapIt->first << " " << r2lOrientationPriorCountsMapIt->second << std::endl;
478
+ }
479
+ }
480
+
481
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/PhraseOrientation.h ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include <map>
23
+ #include <set>
24
+ #include <string>
25
+ #include <vector>
26
+
27
+ #include <boost/unordered_map.hpp>
28
+
29
+ #include "moses/AlignmentInfo.h"
30
+
31
+ #include "Alignment.h"
32
+
33
+ namespace MosesTraining
34
+ {
35
+
36
+ // The key of the map is the English index and the value is a set of the source ones
37
+ typedef std::map <int, std::set<int> > HSentenceVertices;
38
+
39
+
40
+ class PhraseOrientation
41
+ {
42
+ public:
43
+
44
+ enum REO_MODEL_TYPE {REO_MODEL_TYPE_MSD, REO_MODEL_TYPE_MSLR, REO_MODEL_TYPE_MONO};
45
+ enum REO_CLASS {REO_CLASS_LEFT, REO_CLASS_RIGHT, REO_CLASS_DLEFT, REO_CLASS_DRIGHT, REO_CLASS_UNKNOWN};
46
+ enum REO_DIR {REO_DIR_L2R, REO_DIR_R2L, REO_DIR_BIDIR};
47
+
48
+ PhraseOrientation() {};
49
+
50
+ PhraseOrientation(int sourceSize,
51
+ int targetSize,
52
+ const Alignment &alignment);
53
+
54
+ PhraseOrientation(int sourceSize,
55
+ int targetSize,
56
+ const Moses::AlignmentInfo &alignTerm,
57
+ const Moses::AlignmentInfo &alignNonTerm);
58
+
59
+ PhraseOrientation(int sourceSize,
60
+ int targetSize,
61
+ const std::vector<std::vector<int> > &alignedToT,
62
+ const std::vector<std::vector<int> > &alignedToS,
63
+ const std::vector<int> &alignedCountS);
64
+
65
+ REO_CLASS GetOrientationInfo(int startF, int endF, REO_DIR direction) const;
66
+ REO_CLASS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const;
67
+ const std::string GetOrientationInfoString(int startF, int endF, REO_DIR direction=REO_DIR_BIDIR) const;
68
+ const std::string GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction=REO_DIR_BIDIR) const;
69
+ static const std::string GetOrientationString(const REO_CLASS orient, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR);
70
+ static void WriteOrientation(std::ostream& out, const REO_CLASS orient, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR);
71
+ void IncrementPriorCount(REO_DIR direction, REO_CLASS orient, float increment);
72
+ static void WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType=REO_MODEL_TYPE_MSLR);
73
+ bool SourceSpanIsAligned(int index1, int index2) const;
74
+ bool TargetSpanIsAligned(int index1, int index2) const;
75
+
76
+ private:
77
+
78
+ void Init(int sourceSize, int targetSize,
79
+ const std::vector<std::vector<int> > &alignedToT,
80
+ const std::vector<std::vector<int> > &alignedToS,
81
+ const std::vector<int> &alignedCountS);
82
+
83
+ void InsertVertex( HSentenceVertices & corners, int x, int y );
84
+
85
+ void InsertPhraseVertices(HSentenceVertices & topLeft,
86
+ HSentenceVertices & topRight,
87
+ HSentenceVertices & bottomLeft,
88
+ HSentenceVertices & bottomRight,
89
+ int startF, int startE, int endF, int endE);
90
+
91
+ REO_CLASS GetOrientHierModel(REO_MODEL_TYPE modelType,
92
+ int startF, int endF, int startE, int endE, int countF, int zeroF, int zeroE, int unit,
93
+ bool (*ge)(int, int), bool (*lt)(int, int),
94
+ const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const;
95
+
96
+ bool SpanIsAligned(int index1, int index2, const boost::unordered_map< std::pair<int,int> , std::pair<int,int> > &minAndMaxAligned) const;
97
+
98
+ bool IsAligned(int fi, int ei) const;
99
+
100
+ static bool ge(int first, int second) {
101
+ return first >= second;
102
+ };
103
+ static bool le(int first, int second) {
104
+ return first <= second;
105
+ };
106
+ static bool lt(int first, int second) {
107
+ return first < second;
108
+ };
109
+
110
+ int m_countF;
111
+ int m_countE;
112
+
113
+ std::vector<std::vector<int> > m_alignedToT;
114
+
115
+ HSentenceVertices m_topLeft;
116
+ HSentenceVertices m_topRight;
117
+ HSentenceVertices m_bottomLeft;
118
+ HSentenceVertices m_bottomRight;
119
+
120
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToSourceSpan;
121
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToTargetSpan;
122
+
123
+ static std::vector<float> m_l2rOrientationPriorCounts;
124
+ static std::vector<float> m_r2lOrientationPriorCounts;
125
+ };
126
+
127
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/PropertiesConsolidator.cpp ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "PropertiesConsolidator.h"
21
+
22
+ #include <sstream>
23
+ #include <limits>
24
+ #include <vector>
25
+
26
+ #include "moses/Util.h"
27
+ #include "phrase-extract/InputFileStream.h"
28
+ #include "phrase-extract/OutputFileStream.h"
29
+
30
+
31
+ namespace MosesTraining
32
+ {
33
+
34
+ void PropertiesConsolidator::ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile)
35
+ {
36
+ Moses::InputFileStream inFile(sourceLabelSetFile);
37
+
38
+ // read source label set
39
+ m_sourceLabels.clear();
40
+ std::string line;
41
+ while (getline(inFile, line)) {
42
+ std::istringstream tokenizer(line);
43
+ std::string label;
44
+ size_t index;
45
+ try {
46
+ tokenizer >> label >> index;
47
+ } catch (const std::exception &e) {
48
+ UTIL_THROW2("Error reading source label set file " << sourceLabelSetFile << " .");
49
+ }
50
+ std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_sourceLabels.insert( std::pair<std::string,size_t>(label,index) );
51
+ UTIL_THROW_IF2(!inserted.second,"Source label set file " << sourceLabelSetFile << " should contain each syntactic label only once.");
52
+ }
53
+
54
+ inFile.Close();
55
+
56
+ m_sourceLabelsFlag = true;
57
+ }
58
+
59
+
60
+ void PropertiesConsolidator::ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile)
61
+ {
62
+ Moses::InputFileStream inFile(partsOfSpeechFile);
63
+
64
+ // read parts-of-speech vocabulary
65
+ m_partsOfSpeechVocabulary.clear();
66
+ std::string line;
67
+ while (getline(inFile, line)) {
68
+ std::istringstream tokenizer(line);
69
+ std::string label;
70
+ size_t index;
71
+ try {
72
+ tokenizer >> label >> index;
73
+ } catch (const std::exception &e) {
74
+ UTIL_THROW2("Error reading part-of-speech vocabulary file " << partsOfSpeechFile << " .");
75
+ }
76
+ std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_partsOfSpeechVocabulary.insert( std::pair<std::string,size_t>(label,index) );
77
+ UTIL_THROW_IF2(!inserted.second,"Part-of-speech vocabulary file " << partsOfSpeechFile << " should contain each POS tag only once.");
78
+ }
79
+
80
+ inFile.Close();
81
+
82
+ m_partsOfSpeechFlag = true;
83
+ }
84
+
85
+
86
+ void PropertiesConsolidator::ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile)
87
+ {
88
+ Moses::InputFileStream inFile(targetSyntacticPreferencesLabelSetFile);
89
+
90
+ // read target syntactic preferences label set
91
+ m_targetSyntacticPreferencesLabels.clear();
92
+ std::string line;
93
+ while (getline(inFile, line)) {
94
+ std::istringstream tokenizer(line);
95
+ std::string label;
96
+ size_t index;
97
+ try {
98
+ tokenizer >> label >> index;
99
+ } catch (const std::exception &e) {
100
+ UTIL_THROW2("Error reading target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " .");
101
+ }
102
+ std::pair< std::map<std::string,size_t>::iterator, bool > inserted = m_targetSyntacticPreferencesLabels.insert( std::pair<std::string,size_t>(label,index) );
103
+ UTIL_THROW_IF2(!inserted.second,"Target syntactic preferences label set file " << targetSyntacticPreferencesLabelSetFile << " should contain each syntactic label only once.");
104
+ }
105
+
106
+ inFile.Close();
107
+
108
+ m_targetSyntacticPreferencesFlag = true;
109
+ }
110
+
111
+
112
+ void PropertiesConsolidator::ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const
113
+ {
114
+ if ( propertiesString.empty() ) {
115
+ return;
116
+ }
117
+
118
+ std::vector<std::string> toks;
119
+ Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
120
+ for (size_t i = 1; i < toks.size(); ++i) {
121
+ std::string &tok = toks[i];
122
+ if (tok.empty()) {
123
+ continue;
124
+ }
125
+ size_t endPos = tok.rfind("}");
126
+ tok = tok.substr(0, endPos - 1);
127
+ std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
128
+ assert(keyValue.size() == 2);
129
+
130
+ if ( !keyValue[0].compare("SourceLabels") ) {
131
+
132
+ if ( m_sourceLabelsFlag ) {
133
+
134
+ // SourceLabels property: replace strings with vocabulary indices
135
+ out << " {{" << keyValue[0];
136
+ ProcessSourceLabelsPropertyValue(keyValue[1], out);
137
+ out << "}}";
138
+
139
+ } else { // don't process SourceLabels property
140
+ out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
141
+ }
142
+
143
+ } else if ( !keyValue[0].compare("POS") ) {
144
+
145
+ /* DO NOTHING (property is not registered in the decoder at the moment)
146
+ if ( m_partsOfSpeechFlag ) {
147
+
148
+ // POS property: replace strings with vocabulary indices
149
+ out << " {{" << keyValue[0];
150
+ ProcessPOSPropertyValue(keyValue[1], out);
151
+ out << "}}";
152
+
153
+ } else { // don't process POS property
154
+ out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
155
+ }
156
+ */
157
+
158
+ } else if ( !keyValue[0].compare("TargetPreferences") ) {
159
+
160
+ if ( m_targetSyntacticPreferencesFlag ) {
161
+
162
+ // TargetPreferences property: replace strings with vocabulary indices
163
+ out << " {{" << keyValue[0];
164
+ ProcessTargetSyntacticPreferencesPropertyValue(keyValue[1], out);
165
+ out << "}}";
166
+
167
+ } else { // don't process TargetPreferences property
168
+ out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
169
+ }
170
+
171
+ } else {
172
+
173
+ // output other property
174
+ out << " {{" << keyValue[0] << " " << keyValue[1] << "}}";
175
+ }
176
+ }
177
+ }
178
+
179
+
180
+ void PropertiesConsolidator::ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
181
+ {
182
+ // SourceLabels property: replace strings with vocabulary indices
183
+ std::istringstream tokenizer(value);
184
+
185
+ size_t nNTs;
186
+ double totalCount;
187
+
188
+ if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
189
+ UTIL_THROW2("Not able to read number of non-terminals from SourceLabels property. "
190
+ << "Flawed SourceLabels property?");
191
+ }
192
+ assert( nNTs > 0 );
193
+ out << " " << nNTs;
194
+
195
+ if (! (tokenizer >> totalCount)) { // second token: overall rule count
196
+ UTIL_THROW2("Not able to read overall rule count from SourceLabels property. "
197
+ << "Flawed SourceLabels property?");
198
+ }
199
+ assert( totalCount > 0.0 );
200
+ out << " " << totalCount;
201
+
202
+ while (tokenizer.peek() != EOF) {
203
+ try {
204
+
205
+ size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
206
+
207
+ std::string token;
208
+
209
+ if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
210
+ for (size_t i=0; i<nNTs-1; ++i) { // RHS source non-terminal labels
211
+ tokenizer >> token; // RHS source non-terminal label
212
+ std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
213
+ UTIL_THROW_IF2(found == m_sourceLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
214
+ out << " " << found->second;
215
+ }
216
+
217
+ tokenizer >> token; // sourceLabelsRHSCount
218
+ out << " " << token;
219
+
220
+ tokenizer >> numberOfLHSsGivenRHS;
221
+ out << " " << numberOfLHSsGivenRHS;
222
+ }
223
+
224
+ for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS source non-terminal labels seen with this RHS
225
+ tokenizer >> token; // LHS source non-terminal label
226
+ std::map<std::string,size_t>::const_iterator found = m_sourceLabels.find(token);
227
+ UTIL_THROW_IF2(found == m_sourceLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
228
+ out << " " << found->second;
229
+
230
+ tokenizer >> token; // ruleSourceLabelledCount
231
+ out << " " << token;
232
+ }
233
+
234
+ } catch (const std::exception &e) {
235
+ UTIL_THROW2("Flawed item in SourceLabels property?");
236
+ }
237
+ }
238
+ }
239
+
240
+
241
+ void PropertiesConsolidator::ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
242
+ {
243
+ std::istringstream tokenizer(value);
244
+ while (tokenizer.peek() != EOF) {
245
+ std::string token;
246
+ tokenizer >> token;
247
+ std::map<std::string,size_t>::const_iterator found = m_partsOfSpeechVocabulary.find(token);
248
+ UTIL_THROW_IF2(found == m_partsOfSpeechVocabulary.end() ,"Part-of-speech \"" << token << "\" from the phrase table not found in given part-of-speech vocabulary.");
249
+ out << " " << found->second;
250
+ }
251
+ }
252
+
253
+
254
+ bool PropertiesConsolidator::GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const
255
+ {
256
+ out.clear();
257
+ if ( propertiesString.empty() ) {
258
+ return false;
259
+ }
260
+
261
+ std::vector<std::string> toks;
262
+ Moses::TokenizeMultiCharSeparator(toks, propertiesString, "{{");
263
+ for (size_t i = 1; i < toks.size(); ++i) {
264
+ std::string &tok = toks[i];
265
+ if (tok.empty()) {
266
+ continue;
267
+ }
268
+ size_t endPos = tok.rfind("}");
269
+ tok = tok.substr(0, endPos - 1);
270
+ std::vector<std::string> keyValue = Moses::TokenizeFirstOnly(tok, " ");
271
+ assert(keyValue.size() == 2);
272
+
273
+ if ( !keyValue[0].compare("POS") ) {
274
+ std::istringstream tokenizer(keyValue[1]);
275
+ while (tokenizer.peek() != EOF) {
276
+ std::string token;
277
+ tokenizer >> token;
278
+ out.push_back(token);
279
+ }
280
+ return true;
281
+ }
282
+ }
283
+
284
+ return false;
285
+ }
286
+
287
+
288
+ void PropertiesConsolidator::ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const
289
+ {
290
+ // TargetPreferences property: replace strings with vocabulary indices
291
+ std::istringstream tokenizer(value);
292
+
293
+ size_t nNTs;
294
+ double totalCount;
295
+
296
+ if (! (tokenizer >> nNTs)) { // first token: number of non-terminals (incl. left-hand side)
297
+ UTIL_THROW2("Not able to read number of non-terminals from TargetPreferences property. "
298
+ << "Flawed TargetPreferences property?");
299
+ }
300
+ assert( nNTs > 0 );
301
+ out << " " << nNTs;
302
+
303
+ if (! (tokenizer >> totalCount)) { // second token: overall rule count
304
+ UTIL_THROW2("Not able to read overall rule count from TargetPreferences property. "
305
+ << "Flawed TargetPreferences property?");
306
+ }
307
+ assert( totalCount > 0.0 );
308
+ out << " " << totalCount;
309
+
310
+ while (tokenizer.peek() != EOF) {
311
+ try {
312
+
313
+ size_t numberOfLHSsGivenRHS = std::numeric_limits<std::size_t>::max();
314
+
315
+ std::string token;
316
+
317
+ if (nNTs > 1) { // rule has right-hand side non-terminals, i.e. it's a hierarchical rule
318
+ for (size_t i=0; i<nNTs-1; ++i) { // RHS target preference non-terminal labels
319
+ tokenizer >> token; // RHS target preference non-terminal label
320
+ std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
321
+ UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end(), "Label \"" << token << "\" from the phrase table not found in given label set.");
322
+ out << " " << found->second;
323
+ }
324
+
325
+ tokenizer >> token; // targetPreferenceRHSCount
326
+ out << " " << token;
327
+
328
+ tokenizer >> numberOfLHSsGivenRHS;
329
+ out << " " << numberOfLHSsGivenRHS;
330
+ }
331
+
332
+ for (size_t i=0; i<numberOfLHSsGivenRHS && tokenizer.peek()!=EOF; ++i) { // LHS target preference non-terminal labels seen with this RHS
333
+ tokenizer >> token; // LHS target preference non-terminal label
334
+ std::map<std::string,size_t>::const_iterator found = m_targetSyntacticPreferencesLabels.find(token);
335
+ UTIL_THROW_IF2(found == m_targetSyntacticPreferencesLabels.end() ,"Label \"" << token << "\" from the phrase table not found in given label set.");
336
+ out << " " << found->second;
337
+
338
+ tokenizer >> token; // ruleTargetPreferenceLabelledCount
339
+ out << " " << token;
340
+ }
341
+
342
+ } catch (const std::exception &e) {
343
+ UTIL_THROW2("Flawed item in TargetPreferences property?");
344
+ }
345
+ }
346
+ }
347
+
348
+
349
+ } // namespace MosesTraining
350
+
mosesdecoder/phrase-extract/PropertiesConsolidator.h ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+
21
+ #pragma once
22
+
23
+ #include <string>
24
+ #include <map>
25
+ #include <vector>
26
+
27
+ #include "OutputFileStream.h"
28
+
29
+
30
+ namespace MosesTraining
31
+ {
32
+
33
+ class PropertiesConsolidator
34
+ {
35
+ public:
36
+
37
+ PropertiesConsolidator()
38
+ : m_sourceLabelsFlag(false)
39
+ , m_partsOfSpeechFlag(false)
40
+ , m_targetSyntacticPreferencesFlag(false)
41
+ {};
42
+
43
+ void ActivateSourceLabelsProcessing(const std::string &sourceLabelSetFile);
44
+ void ActivatePartsOfSpeechProcessing(const std::string &partsOfSpeechFile);
45
+ void ActivateTargetSyntacticPreferencesProcessing(const std::string &targetSyntacticPreferencesLabelSetFile);
46
+
47
+ bool GetPOSPropertyValueFromPropertiesString(const std::string &propertiesString, std::vector<std::string>& out) const;
48
+
49
+ void ProcessPropertiesString(const std::string &propertiesString, Moses::OutputFileStream& out) const;
50
+
51
+ protected:
52
+
53
+ void ProcessSourceLabelsPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
54
+ void ProcessPOSPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
55
+ void ProcessTargetSyntacticPreferencesPropertyValue(const std::string &value, Moses::OutputFileStream& out) const;
56
+
57
+ bool m_sourceLabelsFlag;
58
+ std::map<std::string,size_t> m_sourceLabels;
59
+ bool m_partsOfSpeechFlag;
60
+ std::map<std::string,size_t> m_partsOfSpeechVocabulary;
61
+ bool m_targetSyntacticPreferencesFlag;
62
+ std::map<std::string,size_t> m_targetSyntacticPreferencesLabels;
63
+
64
+ };
65
+
66
+ } // namespace MosesTraining
67
+
mosesdecoder/phrase-extract/RuleExist.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+ #ifndef RULEEXIST_H_INCLUDED_
22
+ #define RULEEXIST_H_INCLUDED_
23
+
24
+ #include <vector>
25
+
26
+ #include "Hole.h"
27
+
28
+ namespace MosesTraining
29
+ {
30
+
31
+ // repository of extracted phrase pairs
32
+ // which are potential holes in larger phrase pairs
33
+ class RuleExist
34
+ {
35
+ protected:
36
+ std::vector< std::vector<HoleList> > m_phraseExist;
37
+ // indexed by source pos. and source length
38
+ // maps to list of holes where <int, int> are target pos
39
+
40
+ public:
41
+ RuleExist(size_t size)
42
+ :m_phraseExist(size) {
43
+ // size is the length of the source sentence
44
+ for (size_t pos = 0; pos < size; ++pos) {
45
+ // create empty hole lists
46
+ std::vector<HoleList> &endVec = m_phraseExist[pos];
47
+ endVec.resize(size - pos);
48
+ }
49
+ }
50
+
51
+ void Add(int startT, int endT, int startS, int endS) {
52
+ m_phraseExist[startT][endT - startT].push_back(Hole(startS, endS, startT, endT));
53
+ }
54
+
55
+ const HoleList &GetSourceHoles(int startT, int endT) const {
56
+ const HoleList &sourceHoles = m_phraseExist[startT][endT - startT];
57
+ return sourceHoles;
58
+ }
59
+
60
+ };
61
+
62
+ }
63
+
64
+
65
+ #endif
mosesdecoder/phrase-extract/RuleExtractionOptions.h ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ namespace MosesTraining
23
+ {
24
+
25
/**
 * Plain bundle of rule-extraction settings. A default-constructed object
 * carries the default shown next to each member.
 */
struct RuleExtractionOptions {
  // numeric limits
  int maxSpan = 10;
  int minHoleSource = 2;
  int minHoleTarget = 1;
  int minWords = 1;
  int maxSymbolsTarget = 999;
  int maxSymbolsSource = 5;
  int maxNonTerm = 2;
  int maxScope = 999;

  // switches
  bool onlyDirectFlag = false;
  bool glueGrammarFlag = false;
  bool unknownWordLabelFlag = false;
  bool onlyOutputSpanInfo = false;
  bool noFileLimit = false;
  bool properConditioning = false;
  bool nonTermFirstWord = true;
  bool nonTermConsecTarget = true;
  bool nonTermConsecSource = false;
  bool requireAlignedWord = true;
  bool sourceSyntax = false;
  bool targetSyntax = false;
  bool targetSyntacticPreferences = false;
  bool duplicateRules = true;
  bool fractionalCounting = true;
  bool pcfgScore = false;
  bool gzOutput = false;
  bool unpairedExtractFormat = false;
  bool conditionOnTargetLhs = false;
  bool boundaryRules = false;
  bool flexScoreFlag = false;
  bool phraseOrientation = false;

  // Kept user-provided so the type stays a non-aggregate, as before.
  RuleExtractionOptions() {}
};
93
+
94
+ }
95
+
mosesdecoder/phrase-extract/ScoreFeature.cpp ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2012- University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <boost/algorithm/string/predicate.hpp>
21
+ #include "ScoreFeature.h"
22
+ #include "DomainFeature.h"
23
+ #include "InternalStructFeature.h"
24
+
25
+ using namespace std;
26
+ using namespace boost::algorithm;
27
+
28
+ namespace MosesTraining
29
+ {
30
+
31
+
32
+ const string& ScoreFeatureManager::usage() const
33
+ {
34
+ const static string& usage = "[--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]" ;
35
+ return usage;
36
+ }
37
+
38
+ void ScoreFeatureManager::configure(const std::vector<std::string> args)
39
+ {
40
+ bool domainAdded = false;
41
+ bool sparseDomainAdded = false;
42
+
43
+ for (size_t i = 0; i < args.size(); ++i) {
44
+ if (args[i] == "--IgnoreSentenceId") {
45
+ m_includeSentenceId = true;
46
+ } else if (starts_with(args[i], "--Domain")) {
47
+ string type = args[i].substr(8);
48
+ ++i;
49
+ UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
50
+ string domainFile = args[i];
51
+ UTIL_THROW_IF(domainAdded, ScoreFeatureArgumentException,
52
+ "Only allowed one domain feature");
53
+ if (type == "Subset") {
54
+ m_features.push_back(ScoreFeaturePtr(new SubsetDomainFeature(domainFile)));
55
+ } else if (type == "Ratio") {
56
+ m_features.push_back(ScoreFeaturePtr(new RatioDomainFeature(domainFile)));
57
+ } else if (type == "Indicator") {
58
+ m_features.push_back(ScoreFeaturePtr(new IndicatorDomainFeature(domainFile)));
59
+ } else {
60
+ UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
61
+ }
62
+ domainAdded = true;
63
+ m_includeSentenceId = true;
64
+ } else if (starts_with(args[i], "--SparseDomain")) {
65
+ string type = args[i].substr(14);
66
+ ++i;
67
+ UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
68
+ string domainFile = args[i];
69
+ UTIL_THROW_IF(sparseDomainAdded, ScoreFeatureArgumentException,
70
+ "Only allowed one sparse domain feature");
71
+ if (type == "Subset") {
72
+ m_features.push_back(ScoreFeaturePtr(new SparseSubsetDomainFeature(domainFile)));
73
+ } else if (type == "Ratio") {
74
+ m_features.push_back(ScoreFeaturePtr(new SparseRatioDomainFeature(domainFile)));
75
+ } else if (type == "Indicator") {
76
+ m_features.push_back(ScoreFeaturePtr(new SparseIndicatorDomainFeature(domainFile)));
77
+ } else {
78
+ UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
79
+ }
80
+ sparseDomainAdded = true;
81
+ m_includeSentenceId = true;
82
+ } else if(args[i] == "--TreeFeatureSparse") {
83
+ //MARIA
84
+ m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureSparse()));
85
+ } else if(args[i] == "--TreeFeatureDense") {
86
+ //MARIA
87
+ m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureDense()));
88
+ } else {
89
+ UTIL_THROW(ScoreFeatureArgumentException,"Unknown score argument " << args[i]);
90
+ }
91
+
92
+ }
93
+
94
+ }
95
+
96
+ void ScoreFeatureManager::addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
97
+ float count,
98
+ int sentenceId) const
99
+ {
100
+ for (size_t i = 0; i < m_features.size(); ++i) {
101
+ m_features[i]->addPropertiesToPhrasePair(phrasePair, count, sentenceId);
102
+ }
103
+ }
104
+
105
+ void ScoreFeatureManager::addFeatures(const ScoreFeatureContext& context,
106
+ std::vector<float>& denseValues,
107
+ std::map<std::string,float>& sparseValues) const
108
+ {
109
+ for (size_t i = 0; i < m_features.size(); ++i) {
110
+ m_features[i]->add(context, denseValues, sparseValues);
111
+ }
112
+ }
113
+ }
114
+
mosesdecoder/phrase-extract/ScoreFeature.h ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2012- University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ /**
21
+ * This contains extra features that can be added to the scorer. To add a new feature:
22
+ * 1. Implement a subclass of ScoreFeature
23
+ * 2. Update ScoreFeatureManager.configure() to configure your feature, and usage() to
24
+ * display usage info.
25
+ * 3. Write unit tests (see ScoreFeatureTest.cpp) and regression tests
26
+ **/
27
+
28
+ #pragma once
29
+
30
+ #include <string>
31
+ #include <map>
32
+ #include <vector>
33
+
34
+ #include <boost/shared_ptr.hpp>
35
+
36
+ #include "util/exception.hh"
37
+
38
+ #include "ExtractionPhrasePair.h"
39
+
40
+ namespace MosesTraining
41
+ {
42
+
43
/**
 * Functor that either passes a value through unchanged or maps it to
 * negativeLog * log(value), depending on the `useLog` switch given at
 * construction.
 */
struct MaybeLog {
  /**
   * @param useLog       true: apply the log transform; false: identity
   * @param negativeLog  factor the logarithm is multiplied with
   */
  MaybeLog(bool useLog, float negativeLog):
    m_useLog(useLog), m_negativeLog(negativeLog) {}

  /** @return m_negativeLog * log(a) when logging is enabled, otherwise a. */
  inline float operator() (float a) const {
    return m_useLog ? m_negativeLog*log(a) : a;
  }

  // This is an on/off switch initialised from a bool; it was declared as
  // float, which worked only through implicit conversions.
  bool m_useLog;
  float m_negativeLog;
};
54
+
55
+ class ScoreFeatureArgumentException : public util::Exception
56
+ {
57
+ public:
58
+ ScoreFeatureArgumentException() throw() {
59
+ *this << "Unable to configure features: ";
60
+ }
61
+ ~ScoreFeatureArgumentException() throw() {}
62
+ };
63
+
64
+ /** Passed to each feature to be used to calculate its values */
65
+ struct ScoreFeatureContext {
66
+ ScoreFeatureContext(
67
+ const ExtractionPhrasePair &thePhrasePair,
68
+ const MaybeLog& theMaybeLog
69
+ ) :
70
+ phrasePair(thePhrasePair),
71
+ maybeLog(theMaybeLog) {
72
+ }
73
+
74
+ const ExtractionPhrasePair &phrasePair;
75
+ MaybeLog maybeLog;
76
+ };
77
+
78
+ /**
79
+ * Abstract base class for extra features that can be added to the phrase table
80
+ * during scoring.
81
+ **/
82
+ class ScoreFeature
83
+ {
84
+ public:
85
+
86
+ /** Some features might need to store properties in ExtractionPhrasePair,
87
+ * e.g. to pass along external information loaded by a feature
88
+ * which may distinguish several phrase occurrences based on sentence ID */
89
+ virtual void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
90
+ float count,
91
+ int sentenceId) const {};
92
+
93
+ /** Add the values for this score feature. */
94
+ virtual void add(const ScoreFeatureContext& context,
95
+ std::vector<float>& denseValues,
96
+ std::map<std::string,float>& sparseValues) const = 0;
97
+
98
+ virtual ~ScoreFeature() {}
99
+
100
+ };
101
+
102
+ typedef boost::shared_ptr<ScoreFeature> ScoreFeaturePtr;
103
+ class ScoreFeatureManager
104
+ {
105
+ public:
106
+ ScoreFeatureManager():
107
+ m_includeSentenceId(false) {}
108
+
109
+ /** To be appended to the score usage message */
110
+ const std::string& usage() const;
111
+
112
+ /** Pass the unused command-line arguments to configure the extra features */
113
+ void configure(const std::vector<std::string> args);
114
+
115
+ /** Some features might need to store properties in ExtractionPhrasePair,
116
+ * e.g. to pass along external information loaded by a feature
117
+ * which may distinguish several phrase occurrences based on sentence ID */
118
+ void addPropertiesToPhrasePair(ExtractionPhrasePair &phrasePair,
119
+ float count,
120
+ int sentenceId) const;
121
+
122
+ /** Add all the features */
123
+ void addFeatures(const ScoreFeatureContext& context,
124
+ std::vector<float>& denseValues,
125
+ std::map<std::string,float>& sparseValues) const;
126
+
127
+ const std::vector<ScoreFeaturePtr>& getFeatures() const {
128
+ return m_features;
129
+ }
130
+
131
+ /** Do we need to include sentence ids in phrase pairs? */
132
+ bool includeSentenceId() const {
133
+ return m_includeSentenceId;
134
+ }
135
+
136
+ private:
137
+ std::vector<ScoreFeaturePtr> m_features;
138
+ bool m_includeSentenceId;
139
+ };
140
+
141
+ }
142
+
143
+
mosesdecoder/phrase-extract/ScoreFeatureTest.cpp ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2012- University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "DomainFeature.h"
21
+ #include "ScoreFeature.h"
22
+ #include "tables-core.h"
23
+
24
+ #define BOOST_TEST_MODULE MosesTrainingScoreFeature
25
+ #include <boost/test/test_tools.hpp>
26
+ #include <boost/test/unit_test.hpp>
27
+ #include <boost/assign/list_of.hpp>
28
+
29
+ #include <unordered_set>
30
+ #include <unordered_map>
31
+
32
+ using namespace MosesTraining;
33
+ using namespace std;
34
+
35
+ //pesky global variables
36
+ namespace MosesTraining
37
+ {
38
+ bool hierarchicalFlag = false;
39
+ Vocabulary vcbT;
40
+ Vocabulary vcbS;
41
+ }
42
+
43
+
44
+ const char *DomainFileLocation()
45
+ {
46
+ if (boost::unit_test::framework::master_test_suite().argc < 2) {
47
+ return "test.domain";
48
+ }
49
+ return boost::unit_test::framework::master_test_suite().argv[1];
50
+ }
51
+
52
+
53
+ BOOST_AUTO_TEST_CASE(manager_configure_domain_except)
54
+ {
55
+ //Check that configure rejects illegal domain arg combinations
56
+ ScoreFeatureManager manager;
57
+ BOOST_CHECK_THROW(
58
+ manager.configure(boost::assign::list_of("--DomainRatio")("/dev/null")("--DomainIndicator")("/dev/null")),
59
+ ScoreFeatureArgumentException);
60
+ BOOST_CHECK_THROW(
61
+ manager.configure(boost::assign::list_of("--SparseDomainSubset")("/dev/null")("--SparseDomainRatio")("/dev/null")),
62
+ ScoreFeatureArgumentException);
63
+ BOOST_CHECK_THROW(
64
+ manager.configure(boost::assign::list_of("--SparseDomainBlah")("/dev/null")),
65
+ ScoreFeatureArgumentException);
66
+ BOOST_CHECK_THROW(
67
+ manager.configure(boost::assign::list_of("--DomainSubset")),
68
+ ScoreFeatureArgumentException);
69
+ }
70
+
71
+ template <class Expected>
72
+ static void checkDomainConfigured(
73
+ const vector<string>& args)
74
+ {
75
+ ScoreFeatureManager manager;
76
+ manager.configure(args);
77
+ const std::vector<ScoreFeaturePtr>& features = manager.getFeatures();
78
+ //BOOST_REQUIRE_EQUAL(features.size(), 2);
79
+ //if I add to features this check will fail?
80
+ BOOST_REQUIRE_EQUAL(features.size(), 1); //MARIA -> what is this check and why does it fail when I add my feature?
81
+ Expected* feature = dynamic_cast<Expected*>(features[0].get());
82
+ BOOST_REQUIRE(feature);
83
+ BOOST_CHECK(manager.includeSentenceId());
84
+ }
85
+
86
// Base case of the variadic sum: a single value is its own sum.
template<typename T>
T adder(T last)
{
  return last;
}

// Recursive case: add the head to the sum of the remaining arguments.
// The result is converted to the type of the first argument.
template<typename T, typename... Args>
T adder(T head, Args... tail)
{
  auto tailSum = adder(tail...);
  return head + tailSum;
}
97
+
98
+
99
+ BOOST_AUTO_TEST_CASE(manager_config_domain)
100
+ {
101
+ checkDomainConfigured<RatioDomainFeature>
102
+ (boost::assign::list_of("--DomainRatio")("/dev/null"));
103
+ checkDomainConfigured<IndicatorDomainFeature>
104
+ (boost::assign::list_of("--DomainIndicator")("/dev/null"));
105
+ checkDomainConfigured<SubsetDomainFeature>
106
+ (boost::assign::list_of("--DomainSubset")("/dev/null"));
107
+ checkDomainConfigured<SparseRatioDomainFeature>
108
+ (boost::assign::list_of("--SparseDomainRatio")("/dev/null"));
109
+ checkDomainConfigured<SparseIndicatorDomainFeature>
110
+ (boost::assign::list_of("--SparseDomainIndicator")("/dev/null"));
111
+ checkDomainConfigured<SparseSubsetDomainFeature>
112
+ (boost::assign::list_of("--SparseDomainSubset")("/dev/null"));
113
+
114
+ // C++11 testing
115
+ unordered_set<int> s;
116
+ s.insert(4);
117
+ s.insert(7);
118
+ s.insert(4);
119
+ s.insert(1);
120
+
121
+ for (auto i: s) {
122
+ cerr << i << " ";
123
+ }
124
+
125
+ unordered_map<std::string, int> m;
126
+ m["a"] = 4;
127
+ m["ba"] = 6;
128
+ m["aabc"] = 7;
129
+
130
+ for (auto i: m) {
131
+ cerr << i.first << "=" << i.second << " ";
132
+ }
133
+
134
+ long sum = adder(1, 2, 3, 8, 7);
135
+
136
+ std::string s1 = "x", s2 = "aa", s3 = "bb", s4 = "yy";
137
+ std::string ssum = adder(s1, s2, s3, s4);
138
+
139
+ }
140
+
mosesdecoder/phrase-extract/SentenceAlignment.cpp ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "SentenceAlignment.h"
21
+
22
+ #include <map>
23
+ #include <set>
24
+ #include <string>
25
+
26
+ #include "tables-core.h"
27
+ #include "util/tokenize.hh"
28
+
29
+ using namespace std;
30
+
31
+ namespace MosesTraining
32
+ {
33
+
34
+ SentenceAlignment::~SentenceAlignment() {}
35
+
36
// Wraps a token sequence in sentence boundary markers:
// "<s>" becomes the first element and "</s>" the last.
void addBoundaryWords(std::vector<std::string> &phrase)
{
  phrase.push_back("</s>");
  phrase.insert(phrase.begin(), "<s>");
}
41
+
42
+ bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
43
+ {
44
+ target = util::tokenize(targetString);
45
+ if (boundaryRules)
46
+ addBoundaryWords(target);
47
+ return true;
48
+ }
49
+
50
+ bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
51
+ {
52
+ source = util::tokenize(sourceString);
53
+ if (boundaryRules)
54
+ addBoundaryWords(source);
55
+ return true;
56
+ }
57
+
58
+ bool SentenceAlignment::create(const char targetString[],
59
+ const char sourceString[],
60
+ const char alignmentString[],
61
+ const char weightString[],
62
+ int sentenceID, bool boundaryRules)
63
+ {
64
+ using namespace std;
65
+ this->sentenceID = sentenceID;
66
+ this->weightString = std::string(weightString);
67
+
68
+ // process sentence strings and store in target and source members.
69
+ if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
70
+ return false;
71
+ }
72
+ if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
73
+ return false;
74
+ }
75
+
76
+ // check if sentences are empty
77
+ if (target.size() == 0 || source.size() == 0) {
78
+ cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words << end insentence " << sentenceID << endl;
79
+ cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
80
+ return false;
81
+ }
82
+
83
+ // prepare data structures for alignments
84
+ for(size_t i=0; i<source.size(); i++) {
85
+ alignedCountS.push_back( 0 );
86
+ }
87
+ for(size_t i=0; i<target.size(); i++) {
88
+ vector< int > dummy;
89
+ alignedToT.push_back( dummy );
90
+ }
91
+
92
+ // reading in alignments
93
+ vector<string> alignmentSequence = util::tokenize( alignmentString );
94
+ for(size_t i=0; i<alignmentSequence.size(); i++) {
95
+ int s,t;
96
+ // cout << "scaning " << alignmentSequence[i].c_str() << endl;
97
+ if (! sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t)) {
98
+ cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
99
+ cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
100
+ return false;
101
+ }
102
+
103
+ if (boundaryRules) {
104
+ ++s;
105
+ ++t;
106
+ }
107
+
108
+ // cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
109
+ if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
110
+ cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
111
+ cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
112
+ return false;
113
+ }
114
+ alignedToT[t].push_back( s );
115
+ alignedCountS[s]++;
116
+ }
117
+
118
+ if (boundaryRules) {
119
+ alignedToT[0].push_back(0);
120
+ alignedCountS[0]++;
121
+
122
+ alignedToT.back().push_back(alignedCountS.size() - 1);
123
+ alignedCountS.back()++;
124
+
125
+ }
126
+
127
+ return true;
128
+ }
129
+
130
+ void SentenceAlignment::invertAlignment()
131
+ {
132
+ alignedToS.resize(source.size());
133
+ for (size_t targetPos = 0; targetPos < alignedToT.size(); ++targetPos) {
134
+ const std::vector<int> &vec = alignedToT[targetPos];
135
+ for (size_t i = 0; i < vec.size(); ++i) {
136
+ int sourcePos = vec[i];
137
+ alignedToS[sourcePos].push_back(targetPos);
138
+ }
139
+
140
+ }
141
+ }
142
+
143
+ }
144
+
mosesdecoder/phrase-extract/SentenceAlignment.h ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+ #ifndef SENTENCE_ALIGNMENT_H_INCLUDED_
22
+ #define SENTENCE_ALIGNMENT_H_INCLUDED_
23
+
24
+ #include <string>
25
+ #include <vector>
26
+
27
+ namespace MosesTraining
28
+ {
29
+
30
/**
 * One word-aligned sentence pair: the tokens of both sides plus the
 * alignment in several lookup forms.
 */
class SentenceAlignment
{
public:
  std::vector<std::string> target;            // target-side tokens
  std::vector<std::string> source;            // source-side tokens
  std::vector<int> alignedCountS;             // per source word: number of alignment points
  std::vector<std::vector<int> > alignedToT;  // per target word: aligned source positions
  std::vector<std::vector<int> > alignedToS;  // per source word: aligned target positions (see invertAlignment)
  int sentenceID;
  std::string weightString;

  virtual ~SentenceAlignment();

  // Sentence readers; virtual so that derived classes can parse a
  // different input format.
  virtual bool processTargetSentence(const char *, int, bool boundaryRules);

  virtual bool processSourceSentence(const char *, int, bool boundaryRules);

  /** Populate the members from the raw strings; false on bad input. */
  bool create(const char targetString[],
              const char sourceString[],
              const char alignmentString[],
              const char weightString[],
              int sentenceID, bool boundaryRules);

  /** Derive alignedToS from alignedToT. */
  void invertAlignment();

};
55
+
56
+ }
57
+
58
+
59
+ #endif
mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.cpp ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "SentenceAlignmentWithSyntax.h"
21
+
22
+ #include <map>
23
+ #include <set>
24
+ #include <string>
25
+
26
+ #include "tables-core.h"
27
+ #include "XmlException.h"
28
+ #include "XmlTree.h"
29
+ #include "util/tokenize.hh"
30
+
31
+ using namespace std;
32
+
33
+ namespace MosesTraining
34
+ {
35
+
36
+ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
37
+ {
38
+ if (!m_targetSyntax) {
39
+ return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
40
+ }
41
+
42
+ string targetStringCPP(targetString);
43
+ try {
44
+ ProcessAndStripXMLTags(targetStringCPP, targetTree,
45
+ m_targetLabelCollection,
46
+ m_targetTopLabelCollection,
47
+ false);
48
+ } catch (const XmlException & e) {
49
+ std::cerr << "WARNING: failed to process target sentence at line "
50
+ << sentenceID << ": " << e.getMsg() << std::endl;
51
+ return false;
52
+ }
53
+ target = util::tokenize(targetStringCPP);
54
+ return true;
55
+ }
56
+
57
+ bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
58
+ {
59
+ if (!m_sourceSyntax) {
60
+ return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
61
+ }
62
+
63
+ string sourceStringCPP(sourceString);
64
+ try {
65
+ ProcessAndStripXMLTags(sourceStringCPP, sourceTree,
66
+ m_sourceLabelCollection ,
67
+ m_sourceTopLabelCollection,
68
+ false);
69
+ } catch (const XmlException & e) {
70
+ std::cerr << "WARNING: failed to process source sentence at line "
71
+ << sentenceID << ": " << e.getMsg() << std::endl;
72
+ return false;
73
+ }
74
+ source = util::tokenize(sourceStringCPP);
75
+ return true;
76
+ }
77
+
78
+ } // namespace
mosesdecoder/phrase-extract/SentenceAlignmentWithSyntax.h ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include <map>
23
+ #include <set>
24
+ #include <string>
25
+ #include <vector>
26
+
27
+ #include "RuleExtractionOptions.h"
28
+ #include "SentenceAlignment.h"
29
+ #include "SyntaxNodeCollection.h"
30
+
31
+ namespace MosesTraining
32
+ {
33
+
34
+ class SentenceAlignmentWithSyntax : public SentenceAlignment // Sentence pair whose sides may carry XML syntax annotation.
35
+ {
36
+ public:
37
+ SyntaxNodeCollection targetTree; // filled by processTargetSentence() when m_targetSyntax is set
38
+ SyntaxNodeCollection sourceTree; // filled by processSourceSentence() when m_sourceSyntax is set
39
+ std::set<std::string> & m_targetLabelCollection; // caller-owned; target labels are inserted into it
40
+ std::set<std::string> & m_sourceLabelCollection; // caller-owned; source labels are inserted into it
41
+ std::map<std::string, int> & m_targetTopLabelCollection; // caller-owned; counts of target top labels
42
+ std::map<std::string, int> & m_sourceTopLabelCollection; // caller-owned; counts of source top labels
43
+ const bool m_targetSyntax, m_sourceSyntax; // when false, the corresponding side is handled by the base class
44
+
45
+ SentenceAlignmentWithSyntax(std::set<std::string> & tgtLabelColl, // the four collections are stored by reference and must outlive this object
46
+ std::set<std::string> & srcLabelColl,
47
+ std::map<std::string,int> & tgtTopLabelColl,
48
+ std::map<std::string,int> & srcTopLabelColl,
49
+ bool targetSyntax,
50
+ bool sourceSyntax)
51
+ : m_targetLabelCollection(tgtLabelColl)
52
+ , m_sourceLabelCollection(srcLabelColl)
53
+ , m_targetTopLabelCollection(tgtTopLabelColl)
54
+ , m_sourceTopLabelCollection(srcTopLabelColl)
55
+ , m_targetSyntax(targetSyntax)
56
+ , m_sourceSyntax(sourceSyntax) {
57
+ }
58
+
59
+ virtual ~SentenceAlignmentWithSyntax() {}
60
+
61
+ bool
62
+ processTargetSentence(const char *, int, bool boundaryRules); // overrides the virtual in SentenceAlignment
63
+
64
+ bool
65
+ processSourceSentence(const char *, int, bool boundaryRules); // overrides the virtual in SentenceAlignment
66
+ };
67
+
68
+ }
69
+
mosesdecoder/phrase-extract/SyntaxNode.h ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2009 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include <map>
23
+ #include <string>
24
+
25
+ namespace MosesTraining
26
+ {
27
+
28
/*! One labelled span of a syntactic structure (tree, lattice, etc.).
 *  Besides its label and its start/end positions, a SyntaxNode carries an
 *  open-ended map of name/value attributes.
 */
struct SyntaxNode {
  typedef std::map<std::string, std::string> AttributeMap;

  std::string label;       //!< node label
  int start;               //!< start position of the span
  int end;                 //!< end position of the span
  AttributeMap attributes; //!< additional name/value pairs (initially empty)

  //! Create a node with the given label and span and no attributes.
  SyntaxNode(const std::string &label_, int start_, int end_)
    : label(label_), start(start_), end(end_)
  {
  }
};
45
+
46
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/SyntaxNodeCollection.cpp ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2009 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+
21
+ #include "SyntaxNodeCollection.h"
22
+
23
+ #include <cassert>
24
+ #include <iostream>
25
+
26
+ namespace MosesTraining
27
+ {
28
+
29
+ SyntaxNodeCollection::~SyntaxNodeCollection()
30
+ {
31
+ Clear();
32
+ }
33
+
34
+ void SyntaxNodeCollection::Clear()
35
+ {
36
+ // loop through all m_nodes, delete them
37
+ for(size_t i=0; i<m_nodes.size(); i++) {
38
+ delete m_nodes[i];
39
+ }
40
+ m_nodes.clear();
41
+ m_index.clear();
42
+ }
43
+
44
+ SyntaxNode *SyntaxNodeCollection::AddNode(int startPos, int endPos,
45
+ const std::string &label)
46
+ {
47
+ SyntaxNode* newNode = new SyntaxNode(label, startPos, endPos);
48
+ m_nodes.push_back( newNode );
49
+ m_index[ startPos ][ endPos ].push_back( newNode );
50
+ m_endPositionsIndex[ endPos ].push_back( newNode );
51
+ m_startPositionsIndex[ startPos ].push_back( newNode ); // TODO: may not need this: access m_index by startPos and iterate over its InnerNodeIndex (= end positions)?
52
+ m_numWords = std::max(endPos+1, m_numWords);
53
+ return newNode;
54
+ }
55
+
56
+ bool SyntaxNodeCollection::HasNode( int startPos, int endPos ) const
57
+ {
58
+ return GetNodes( startPos, endPos).size() > 0;
59
+ }
60
+
61
+ const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodes(
62
+ int startPos, int endPos ) const
63
+ {
64
+ NodeIndex::const_iterator startIndex = m_index.find( startPos );
65
+ if (startIndex == m_index.end() )
66
+ return m_emptyNode;
67
+
68
+ InnerNodeIndex::const_iterator endIndex = startIndex->second.find( endPos );
69
+ if (endIndex == startIndex->second.end())
70
+ return m_emptyNode;
71
+
72
+ return endIndex->second;
73
+ }
74
+
75
+ bool SyntaxNodeCollection::HasNodeStartingAtPosition( int startPos ) const
76
+ {
77
+ return GetNodesByStartPosition(startPos).size() > 0;
78
+ }
79
+
80
+ const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByStartPosition(
81
+ int startPos ) const
82
+ {
83
+ InnerNodeIndex::const_iterator startIndex = m_startPositionsIndex.find( startPos );
84
+ if (startIndex == m_startPositionsIndex.end() )
85
+ return m_emptyNode;
86
+
87
+ return startIndex->second;
88
+ }
89
+
90
+ bool SyntaxNodeCollection::HasNodeEndingAtPosition( int endPos ) const
91
+ {
92
+ return GetNodesByEndPosition(endPos).size() > 0;
93
+ }
94
+
95
+ const std::vector< SyntaxNode* >& SyntaxNodeCollection::GetNodesByEndPosition(
96
+ int endPos ) const
97
+ {
98
+ InnerNodeIndex::const_iterator endIndex = m_endPositionsIndex.find( endPos );
99
+ if (endIndex == m_endPositionsIndex.end() )
100
+ return m_emptyNode;
101
+
102
+ return endIndex->second;
103
+ }
104
+
105
+ std::auto_ptr<SyntaxTree> SyntaxNodeCollection::ExtractTree()
106
+ {
107
+ std::map<SyntaxNode *, SyntaxTree *> nodeToTree;
108
+
109
+ // Create a SyntaxTree object for each SyntaxNode.
110
+ for (std::vector<SyntaxNode*>::const_iterator p = m_nodes.begin();
111
+ p != m_nodes.end(); ++p) {
112
+ nodeToTree[*p] = new SyntaxTree(**p);
113
+ }
114
+
115
+ // Connect the SyntaxTrees.
116
+ typedef NodeIndex::const_iterator OuterIterator;
117
+ typedef InnerNodeIndex::const_reverse_iterator InnerIterator;
118
+
119
+ SyntaxTree *root = 0;
120
+ SyntaxNode *prevNode = 0;
121
+ SyntaxTree *prevTree = 0;
122
+ // Iterate over all start indices from lowest to highest.
123
+ for (OuterIterator p = m_index.begin(); p != m_index.end(); ++p) {
124
+ const InnerNodeIndex &inner = p->second;
125
+ // Iterate over all end indices from highest to lowest.
126
+ for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
127
+ const std::vector<SyntaxNode*> &nodes = q->second;
128
+ // Iterate over all nodes that cover the same span in order of tree
129
+ // depth, top-most first.
130
+ for (std::vector<SyntaxNode*>::const_reverse_iterator r = nodes.rbegin();
131
+ r != nodes.rend(); ++r) {
132
+ SyntaxNode *node = *r;
133
+ SyntaxTree *tree = nodeToTree[node];
134
+ if (!prevNode) {
135
+ // node is the root.
136
+ root = tree;
137
+ tree->parent() = 0;
138
+ } else if (prevNode->start == node->start) {
139
+ // prevNode is the parent of node.
140
+ assert(prevNode->end >= node->end);
141
+ tree->parent() = prevTree;
142
+ prevTree->children().push_back(tree);
143
+ } else {
144
+ // prevNode is a descendant of node's parent. The lowest common
145
+ // ancestor of prevNode and node will be node's parent.
146
+ SyntaxTree *ancestor = prevTree->parent();
147
+ while (ancestor->value().end < tree->value().end) {
148
+ ancestor = ancestor->parent();
149
+ }
150
+ assert(ancestor);
151
+ tree->parent() = ancestor;
152
+ ancestor->children().push_back(tree);
153
+ }
154
+ prevNode = node;
155
+ prevTree = tree;
156
+ }
157
+ }
158
+ }
159
+
160
+ return std::auto_ptr<SyntaxTree>(root);
161
+ }
162
+
163
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/SyntaxNodeCollection.h ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2009 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include <map>
23
+ #include <memory>
24
+ #include <sstream>
25
+ #include <string>
26
+ #include <vector>
27
+
28
+ #include "SyntaxNode.h"
29
+ #include "SyntaxTree.h"
30
+
31
+ namespace MosesTraining
32
+ {
33
+
34
+ /** A collection of SyntaxNodes organized by start and end position.
35
+ *
36
+ */
37
+ class SyntaxNodeCollection
38
+ {
39
+ public:
40
+ SyntaxNodeCollection() : m_numWords(0) {}
41
+
42
+ ~SyntaxNodeCollection();
43
+
44
+ //! Construct and insert a new SyntaxNode.
45
+ SyntaxNode *AddNode( int startPos, int endPos, const std::string &label );
46
+
47
+ //! Return true iff there are one or more SyntaxNodes with the given span.
48
+ bool HasNode( int startPos, int endPos ) const;
49
+
50
+ //! Lookup the SyntaxNodes for a given span.
51
+ const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
52
+
53
+ bool HasNodeStartingAtPosition( int startPos ) const;
54
+ const std::vector< SyntaxNode* >& GetNodesByStartPosition( int startPos ) const;
55
+ bool HasNodeEndingAtPosition( int endPos ) const;
56
+ const std::vector< SyntaxNode* >& GetNodesByEndPosition( int endPos ) const;
57
+
58
+ //! Get a vector of pointers to all SyntaxNodes (unordered).
59
+ const std::vector< SyntaxNode* >& GetAllNodes() {
60
+ return m_nodes;
61
+ };
62
+
63
+ //! Get the number of words (defined as 1 + the max end pos of any node).
64
+ std::size_t GetNumWords() const {
65
+ return m_numWords;
66
+ }
67
+
68
+ //! Clear the container (this deletes the SyntaxNodes).
69
+ void Clear();
70
+
71
+ //! Extract a SyntaxTree (assuming the collection's nodes constitute a tree).
72
+ std::auto_ptr<SyntaxTree> ExtractTree();
73
+
74
+ private:
75
+ typedef std::map< int, std::vector< SyntaxNode* > > InnerNodeIndex;
76
+ typedef std::map< int, InnerNodeIndex > NodeIndex;
77
+
78
+ // Not copyable.
79
+ SyntaxNodeCollection(const SyntaxNodeCollection &);
80
+ SyntaxNodeCollection &operator=(const SyntaxNodeCollection &);
81
+
82
+ std::vector< SyntaxNode* > m_nodes;
83
+ NodeIndex m_index;
84
+ int m_numWords;
85
+ std::vector< SyntaxNode* > m_emptyNode;
86
+
87
+ InnerNodeIndex m_endPositionsIndex;
88
+ InnerNodeIndex m_startPositionsIndex;
89
+ };
90
+
91
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/SyntaxTree.h ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "syntax-common/tree.h"
4
+
5
+ #include "SyntaxNode.h"
6
+
7
+ namespace MosesTraining
8
+ {
9
+
10
+ typedef Syntax::Tree<SyntaxNode> SyntaxTree;
11
+
12
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/XmlException.h ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+ #pragma once
20
+ #ifndef XMLEXCEPTION_H_INCLUDED_
21
+ #define XMLEXCEPTION_H_INCLUDED_
22
+
23
+ #include <string>
24
+
25
+ namespace MosesTraining
26
+ {
27
+
28
/** Exception thrown when XML-annotated input cannot be processed.
 *  It only carries a human-readable message.
 */
class XmlException
{
private:
  std::string m_msg;

public:
  //! Create an exception carrying a copy of msg.
  XmlException(const std::string & msg) : m_msg(msg) {}

  //! The message supplied at construction.
  const std::string & getMsg() const
  {
    return m_msg;
  }
};
43
+
44
+ }
45
+
46
+ #endif
mosesdecoder/phrase-extract/XmlTree.cpp ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2006 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <cassert>
21
+ #include <vector>
22
+ #include <string>
23
+ #include <set>
24
+ #include <iostream>
25
+ #include <cstdlib>
26
+ #include <sstream>
27
+
28
+ #include "SyntaxNodeCollection.h"
29
+ #include "XmlException.h"
30
+
31
+ using namespace std;
32
+
33
+ namespace MosesTraining
34
+ {
35
+
36
/**
 * Split str into the maximal runs of characters that are not delimiters.
 * Leading, trailing and repeated delimiters never produce empty tokens.
 *
 * \param str        text to split
 * \param delimiters set of separator characters (space and tab by default)
 */
inline std::vector<std::string> Tokenize(const std::string& str,
    const std::string& delimiters = " \t")
{
  std::vector<std::string> pieces;

  // tokenBegin always points at the first character of the next token.
  std::string::size_type tokenBegin = str.find_first_not_of(delimiters, 0);
  while (tokenBegin != std::string::npos) {
    // tokenEnd is the delimiter that ends the token, or npos for the last one.
    const std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenBegin);
    pieces.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));
    tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
  }

  return pieces;
}
56
+
57
/**
 * Return a copy of str with every leading and trailing character that
 * occurs in dropChars removed (whitespace by default).
 */
std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
{
  const std::string::size_type lastKept = str.find_last_not_of(dropChars);
  if (lastKept == std::string::npos) {
    // Nothing but drop characters (or an empty string).
    return std::string();
  }
  const std::string::size_type firstKept = str.find_first_not_of(dropChars);
  return str.substr(firstKept, lastKept - firstKept + 1);
}
63
+
64
/**
 * Extract the value of one attribute from the text of an XML tag.
 *
 * The first occurrence of `attributeName="` is located and the text up to
 * the matching closing quote is returned verbatim; a quote preceded by a
 * backslash does not end the value.
 *
 * \param tag            tag contents, e.g. label="NP" span="0-1"
 * \param attributeName  name of the attribute to look up
 * \return the attribute value (possibly empty), or "" if the attribute is
 *         absent or its closing quote is missing
 */
std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName)
{
  /*TODO deal with unescaping \"*/
  std::string tagOpen = attributeName + "=\"";
  size_t contentsStart = tag.find(tagOpen);
  if (contentsStart == std::string::npos) return "";
  contentsStart += tagOpen.size();
  // Search from contentsStart itself: for an empty value (name="") the
  // closing quote is the very first character after the opening one, and
  // starting one position later would run on into the next attribute.
  size_t contentsEnd = tag.find_first_of('"',contentsStart);
  if (contentsEnd == std::string::npos) {
    std::cerr << "Malformed XML attribute: "<< tag;
    return "";
  }
  // Skip over escaped quotes. contentsEnd-1 is always a valid index because
  // the opening quote itself precedes contentsStart.
  size_t possibleEnd;
  while (tag.at(contentsEnd-1) == '\\' && (possibleEnd = tag.find_first_of('"',contentsEnd+1)) != std::string::npos) {
    contentsEnd = possibleEnd;
  }
  return tag.substr(contentsStart,contentsEnd-contentsStart);
}
82
+
83
+ // s should be a sequence of name=attribute pairs separated by whitespace.
84
+ // e.g. "label=\"S\" pcfg=\"-1.452\" foo=\"blah\\\"blah\""
85
+ void ParseXmlTagAttributes(const std::string &s,
86
+ std::map<std::string, std::string> &attributes)
87
+ {
88
+ std::size_t begin = 0;
89
+ while (true) {
90
+ std::size_t pos = s.find('=', begin);
91
+ if (pos == std::string::npos) {
92
+ return;
93
+ }
94
+ std::string name = Trim(s.substr(begin, pos-begin));
95
+ begin = s.find('"', pos+1);
96
+ if (begin == std::string::npos) {
97
+ throw XmlException("invalid tag content");
98
+ }
99
+ pos = s.find('"', begin+1);
100
+ if (pos == std::string::npos) {
101
+ throw XmlException("invalid tag content");
102
+ }
103
+ while (s[pos-1] == '\\') {
104
+ pos = s.find('"', pos+1);
105
+ if (pos == std::string::npos) {
106
+ throw XmlException("invalid tag content");
107
+ }
108
+ }
109
+ if (name != "label" && name != "span") {
110
+ attributes[name] = s.substr(begin+1, pos-begin-1);
111
+ }
112
+ begin = pos+1;
113
+ }
114
+ }
115
+
116
/**
 * Strip the enclosing "<" and ">" from an XML tag.
 *
 * Strings shorter than two characters, or not both starting with "<" and
 * ending with ">", are returned unchanged.
 *
 * \param str xml token to be stripped
 */
std::string TrimXml(const std::string& str)
{
  const std::string::size_type len = str.size();
  const bool bracketed = len >= 2 && str[0] == '<' && str[len - 1] == '>';
  return bracketed ? str.substr(1, len - 2) : str;
}
135
+
136
/**
 * Tell whether a token produced by TokenizeXml is an XML tag, which is
 * decided solely by its first character being "<".
 *
 * \param tag token to be checked
 */
bool isXmlTag(const std::string& tag)
{
  // An empty token has no first character and is therefore not a tag.
  return !tag.empty() && tag[0] == '<';
}
145
+
146
+ /**
147
+ * Unescape XML special characters.
148
+ */
149
+ string unescape(const string& str)
150
+ {
151
+ string s;
152
+ s.reserve(str.size());
153
+ string::size_type n;
154
+ string::size_type start = 0;
155
+ while ((n = str.find('&', start)) != string::npos) {
156
+ s += str.substr(start, n-start);
157
+ string::size_type end = str.find(';', n);
158
+ assert(n != string::npos);
159
+ string name = str.substr(n+1, end-n-1);
160
+ if (name == "lt") {
161
+ s += string("<");
162
+ } else if (name == "gt") {
163
+ s += string(">");
164
+ } else if (name == "#91") {
165
+ s += string("[");
166
+ } else if (name == "#93") {
167
+ s += string("]");
168
+ } else if (name == "bra") {
169
+ s += string("[");
170
+ } else if (name == "ket") {
171
+ s += string("]");
172
+ } else if (name == "bar" || name == "#124") {
173
+ s += string("|");
174
+ } else if (name == "amp") {
175
+ s += string("&");
176
+ } else if (name == "apos") {
177
+ s += string("'");
178
+ } else if (name == "quot") {
179
+ s += string("\"");
180
+ } else {
181
+ // Currently only handles the following five XML escape sequences:
182
+ // &lt; <
183
+ // &gt; >
184
+ // &amp; &
185
+ // &apos; '
186
+ // &quot; "
187
+ // Numeric character references (like &#xf6;) are not supported.
188
+ std::ostringstream msg;
189
+ msg << "unsupported XML escape sequence: &" << name << ";";
190
+ throw XmlException(msg.str());
191
+ }
192
+ if (end == str.size()-1) {
193
+ return s;
194
+ }
195
+ start = end + 1;
196
+ }
197
+ s += str.substr(start);
198
+ return s;
199
+ }
200
+
201
/**
 * Split the input string into tokens that are each either one XML tag
 * ("<" up to the next ">") or a stretch of text between tags.
 * example: this <b> is a </b> test .
 * => (this ), (<b>), ( is a ), (</b>), ( test .)
 *
 * If a "<" has no closing ">", an error is written to stderr and only the
 * tokens collected before that point are returned.
 *
 * \param str input string
 */
std::vector<std::string> TokenizeXml(const std::string& str)
{
  std::vector<std::string> pieces;
  std::string::size_type cursor = 0; // first character not yet tokenized

  // walk through the string (loop over cursor)
  while (cursor != str.size()) {
    const std::string::size_type tagOpen = str.find('<', cursor);
    if (tagOpen == std::string::npos) {
      // no more tags: the remainder is one text token
      pieces.push_back(str.substr(cursor));
      break;
    }

    const std::string::size_type tagClose = str.find('>', tagOpen);
    if (tagClose == std::string::npos) {
      // sanity check: there has to be a closing ">"
      std::cerr << "ERROR: malformed XML: " << str << std::endl;
      return pieces;
    }

    // text in front of the tag, if there is any
    if (tagOpen > cursor) {
      pieces.push_back(str.substr(cursor, tagOpen - cursor));
    }

    // the tag itself, brackets included
    pieces.push_back(str.substr(tagOpen, tagClose - tagOpen + 1));
    cursor = tagClose + 1;
  }
  return pieces;
}
246
+
247
+ /**
248
+ * Process a sentence with XML-style annotation of syntactic nodes.
249
+ *
250
+ * \param line[in,out] in: sentence, out: sentence without the XML
251
+ * \param nodeCollection[out] the collection of SyntaxNode objects for this
252
+ * sentence
253
+ * \param labelCollection[out] label values are inserted into this set
254
+ * \param topLabelCollection[out] top labels (key) and their counts (value)
255
+ * are inserted into this map
256
+ * \param unescapeSpecialChars flag indicating whether XML special characters
257
+ * should be unescaped
+ * \return true on success (including the case of a line without any "<",
+ * which is left untouched); false after an error was printed to stderr, in
+ * which case line is not modified but the output collections may already
+ * have received entries
+ * \throws XmlException from unescape() or ParseXmlTagAttributes()
258
+ */
259
+ bool ProcessAndStripXMLTags(string &line, SyntaxNodeCollection &nodeCollection,
260
+ set< string > &labelCollection,
261
+ map< string, int > &topLabelCollection,
262
+ bool unescapeSpecialChars )
263
+ {
264
+ //parse XML markup in translation line
265
+
266
+ // no xml tag? we're done.
267
+ if (line.find_first_of('<') == string::npos) {
268
+ return true;
269
+ }
270
+
271
+ // break up input into a vector of xml tags and text
272
+ // example: (this), (<b>), (is a), (</b>), (test .)
273
+ vector<string> xmlTokens = TokenizeXml(line);
274
+
275
+ // we need to store opened tags, until they are closed
276
+ // tags are stored as triples (tagname, startpos, contents)
277
+ typedef pair< string, pair< size_t, string > > OpenedTag;
278
+ vector< OpenedTag > tagStack; // stack that contains active opened tags
279
+
280
+ string cleanLine; // return string (text without xml)
281
+ size_t wordPos = 0; // position in sentence (in terms of number of words)
282
+
283
+ // loop through the tokens
284
+ for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
285
+ // not a xml tag, but regular text (may contain many words)
286
+ if(!isXmlTag(xmlTokens[xmlTokenPos])) {
287
+ // add a space at boundary, if necessary
288
+ if (cleanLine.size()>0 &&
289
+ cleanLine[cleanLine.size() - 1] != ' ' &&
290
+ xmlTokens[xmlTokenPos][0] != ' ') {
291
+ cleanLine += " ";
292
+ }
293
+ // add words to output
294
+ if (unescapeSpecialChars) {
295
+ cleanLine += unescape(xmlTokens[xmlTokenPos]);
296
+ } else {
297
+ cleanLine += xmlTokens[xmlTokenPos];
298
+ }
299
+ wordPos = Tokenize(cleanLine).size(); // count all the words (re-tokenizes the whole cleaned line so far)
300
+ }
301
+
302
+ // process xml tag
303
+ else {
304
+ // *** get essential information about tag ***
305
+
306
+ // strip extra boundary spaces and "<" and ">"
307
+ string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
308
+ // cerr << "XML TAG IS: " << tag << std::endl;
309
+
310
+ if (tag.size() == 0) {
311
+ cerr << "ERROR: empty tag name: " << line << endl;
312
+ return false;
313
+ }
314
+
315
+ // check if unary (e.g., "<wall/>")
316
+ bool isUnary = ( tag[tag.size() - 1] == '/' );
317
+
318
+ // check if closing tag (e.g. "</a>") or opening tag (e.g. "<a>")
319
+ bool isClosed = ( tag[0] == '/' );
320
+ bool isOpen = !isClosed;
321
+
322
+ if (isClosed && isUnary) {
323
+ cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
324
+ return false;
325
+ }
326
+
327
+ if (isClosed)
328
+ tag = tag.substr(1); // remove "/" at the beginning
329
+ if (isUnary)
330
+ tag = tag.substr(0,tag.size()-1); // remove "/" at the end
331
+
332
+ // find the tag name and contents
333
+ string::size_type endOfName = tag.find_first_of(' ');
334
+ string tagName = tag;
335
+ string tagContent = "";
336
+ if (endOfName != string::npos) {
337
+ tagName = tag.substr(0,endOfName);
338
+ tagContent = tag.substr(endOfName+1);
339
+ }
340
+
341
+ // *** process new tag ***
342
+
343
+ if (isOpen || isUnary) {
344
+ // put the tag on the tag stack
345
+ OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
346
+ tagStack.push_back( openedTag );
347
+ // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
348
+ }
349
+
350
+ // *** process completed tag ***
351
+
352
+ if (isClosed || isUnary) { // a unary tag was pushed just above and is popped again here
353
+ // pop last opened tag from stack;
354
+ if (tagStack.size() == 0) {
355
+ cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
356
+ return false;
357
+ }
358
+ OpenedTag openedTag = tagStack.back();
359
+ tagStack.pop_back();
360
+
361
+ // tag names have to match
362
+ if (openedTag.first != tagName) {
363
+ cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
364
+ return false;
365
+ }
366
+
367
+ // assemble remaining information about tag
368
+ size_t startPos = openedTag.second.first;
369
+ string tagContent = openedTag.second.second; // shadows the outer tagContent: attributes come from the opening tag
370
+ size_t endPos = wordPos;
371
+
372
+ // span attribute overwrites position
373
+ string span = ParseXmlTagAttribute(tagContent,"span");
374
+ if (! span.empty()) {
375
+ vector<string> ij = Tokenize(span, "-");
376
+ if (ij.size() != 1 && ij.size() != 2) {
377
+ cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
378
+ return false;
379
+ }
380
+ startPos = atoi(ij[0].c_str()); // atoi: non-numeric text is not reported as an error here
381
+ if (ij.size() == 1) endPos = startPos + 1;
382
+ else endPos = atoi(ij[1].c_str()) + 1; // endPos is exclusive; the span's j is inclusive
383
+ }
384
+
385
+ // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;
386
+
387
+ if (startPos > endPos) {
388
+ cerr << "ERROR: tag " << tagName << " startPos is bigger than endPos (" << startPos << "-" << endPos << "): " << line << endl;
389
+ return false;
390
+ } else if (startPos == endPos) {
391
+ cerr << "WARNING: tag " << tagName << ". Ignoring 0 span (" << startPos << "-" << endPos << "): " << line << endl;
392
+ continue;
393
+ }
394
+
395
+ string label = ParseXmlTagAttribute(tagContent,"label");
396
+ labelCollection.insert( label );
397
+
398
+ // report what we have processed so far
399
+ if (0) {
400
+ cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
401
+ cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
402
+ cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
403
+ }
404
+ SyntaxNode *node = nodeCollection.AddNode( startPos, endPos-1, label ); // AddNode takes an inclusive end position
405
+ ParseXmlTagAttributes(tagContent, node->attributes);
406
+ }
407
+ }
408
+ }
409
+ // we are done. check if there are tags that are still open
410
+ if (tagStack.size() > 0) {
411
+ cerr << "ERROR: some opened tags were never closed: " << line << endl;
412
+ return false;
413
+ }
414
+
415
+ // collect top labels
416
+ const vector< SyntaxNode* >& topNodes = nodeCollection.GetNodes( 0, wordPos-1 ); // top nodes are those spanning the whole sentence
417
+ for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
418
+ SyntaxNode *n = *node;
419
+ const string &label = n->label;
420
+ if (topLabelCollection.find( label ) == topLabelCollection.end())
421
+ topLabelCollection[ label ] = 0;
422
+ topLabelCollection[ label ]++;
423
+ }
424
+
425
+ // return de-xml'ed sentence in line
426
+ line = cleanLine;
427
+ return true;
428
+ }
429
+
430
+ }
mosesdecoder/phrase-extract/XmlTree.h ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2006 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include <string>
23
+ #include <vector>
24
+ #include <set>
25
+ #include <map>
26
+
27
+ #include "SyntaxNodeCollection.h"
28
+
29
+ namespace MosesTraining
30
+ {
31
+
32
+ std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName); // looks up attributeName in a tag's content string; ProcessAndStripXMLTags calls it with "label" and uses the result as the node label. Behavior for a missing attribute is not visible here - confirm in XmlTree.cpp.
33
+ std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r"); // dropChars defaults to space, tab, LF and CR; presumably these are stripped from both ends of str - TODO confirm against the definition.
34
+ std::string TrimXml(const std::string& str); // NOTE(review): presumably strips the XML tag delimiters from str - definition not shown, confirm.
35
+ bool isXmlTag(const std::string& tag); // NOTE(review): presumably true when the token is an XML tag rather than a word - confirm.
36
+ std::vector<std::string> TokenizeXml(const std::string& str); // NOTE(review): presumably splits str into XML-tag and plain-text tokens - confirm.
37
+ bool ProcessAndStripXMLTags(std::string &line, SyntaxNodeCollection &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection, bool unescape = true); // On success overwrites line with its tag-free text and returns true; returns false (after printing an ERROR to cerr) when a tag's start position exceeds its end position or when opened tags are never closed. Zero-width spans are skipped with a WARNING. Each processed tag's label is inserted into labelCollection and a node is added to the syntax node collection; topLabelCollection counts the labels of the nodes returned for span 0..wordPos-1. Note that the parameter named unescape hides the free function unescape() declared below.
38
+ std::string unescape(const std::string &str); // NOTE(review): presumably reverses XML/Moses escaping of str - definition not shown, confirm.
39
+
40
+
41
+ } // namespace MosesTraining
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ExtractionPhrasePair.o ADDED
Binary file (116 kB). View file
 
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52c08921a91130c8d12538e7afc1d9f9d47f1c6e041cd15ad2243bbd64fc7a45
3
+ size 10954640
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.o ADDED
Binary file (149 kB). View file
 
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.output ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Boost.Test WARNING: token "phrase-extract/test.domain" does not correspond to the Boost.Test argument
2
+ and should be placed after all Boost.Test arguments and the -- separator.
3
+ For example: ScoreFeatureTest --random -- phrase-extract/test.domain
4
+ Running 2 test cases...
5
+ 1 7 4 aabc=7 ba=6 a=4
6
+ *** No errors detected
7
+
8
+ EXIT STATUS: 0
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.run ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Boost.Test WARNING: token "phrase-extract/test.domain" does not correspond to the Boost.Test argument
2
+ and should be placed after all Boost.Test arguments and the -- separator.
3
+ For example: ScoreFeatureTest --random -- phrase-extract/test.domain
4
+ Running 2 test cases...
5
+ 1 7 4 aabc=7 ba=6 a=4
6
+ *** No errors detected
7
+
8
+ EXIT STATUS: 0
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ScoreFeatureTest.test ADDED
@@ -0,0 +1 @@
 
 
1
+ passed
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/ThreadPool.o ADDED
Binary file (263 kB). View file
 
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Timer.o ADDED
Binary file (118 kB). View file
 
mosesdecoder/phrase-extract/bin/ScoreFeatureTest.test/gcc-9/release/link-static/threading-multi/Util.o ADDED
Binary file (196 kB). View file