sleepyhead111 commited on
Commit
b8cda01
·
verified ·
1 Parent(s): 001fd82

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. mosesdecoder/moses/AlignmentInfoCollection.cpp +60 -0
  2. mosesdecoder/moses/AlignmentInfoCollection.h +81 -0
  3. mosesdecoder/moses/AlignmentInfoTest.cpp +71 -0
  4. mosesdecoder/moses/BaseManager.cpp +160 -0
  5. mosesdecoder/moses/Bitmaps.cpp +58 -0
  6. mosesdecoder/moses/ChartCell.h +128 -0
  7. mosesdecoder/moses/ChartCellLabel.h +89 -0
  8. mosesdecoder/moses/ChartHypothesis.h +204 -0
  9. mosesdecoder/moses/ChartKBestExtractor.cpp +332 -0
  10. mosesdecoder/moses/ChartKBestExtractor.h +132 -0
  11. mosesdecoder/moses/ChartParser.cpp +313 -0
  12. mosesdecoder/moses/ChartRuleLookupManager.cpp +9 -0
  13. mosesdecoder/moses/ChartRuleLookupManager.h +84 -0
  14. mosesdecoder/moses/ChartTranslationOptionList.cpp +219 -0
  15. mosesdecoder/moses/ChartTranslationOptions.h +104 -0
  16. mosesdecoder/moses/ConfusionNet.cpp +294 -0
  17. mosesdecoder/moses/ContextScope.h +124 -0
  18. mosesdecoder/moses/DecodeGraph.cpp +43 -0
  19. mosesdecoder/moses/DecodeStep.cpp +90 -0
  20. mosesdecoder/moses/DecodeStepGeneration.h +54 -0
  21. mosesdecoder/moses/DecodeStepTranslation.cpp +280 -0
  22. mosesdecoder/moses/DecodeStepTranslation.h +89 -0
  23. mosesdecoder/moses/Factor.cpp +48 -0
  24. mosesdecoder/moses/ForestInput.h +88 -0
  25. mosesdecoder/moses/GenerationDictionary.h +83 -0
  26. mosesdecoder/moses/HypothesisStackCubePruning.cpp +313 -0
  27. mosesdecoder/moses/HypothesisStackCubePruning.h +153 -0
  28. mosesdecoder/moses/Incremental.h +124 -0
  29. mosesdecoder/moses/Jamfile +143 -0
  30. mosesdecoder/moses/LVoc.cpp +7 -0
  31. mosesdecoder/moses/LVoc.h +93 -0
  32. mosesdecoder/moses/Manager.cpp +2016 -0
  33. mosesdecoder/moses/MockHypothesis.h +97 -0
  34. mosesdecoder/moses/OutputFileStream.h +81 -0
  35. mosesdecoder/moses/PCNTools.h +67 -0
  36. mosesdecoder/moses/PDTAimp.cpp +476 -0
  37. mosesdecoder/moses/Parameter.cpp +1690 -0
  38. mosesdecoder/moses/Parameter.h +173 -0
  39. mosesdecoder/moses/Phrase.h +244 -0
  40. mosesdecoder/moses/PrefixTree.h +339 -0
  41. mosesdecoder/moses/Range.h +107 -0
  42. mosesdecoder/moses/ReorderingConstraint.cpp +260 -0
  43. mosesdecoder/moses/ReorderingConstraint.h +113 -0
  44. mosesdecoder/moses/ScoreComponentCollectionTest.cpp +184 -0
  45. mosesdecoder/moses/Search.cpp +50 -0
  46. mosesdecoder/moses/Search.h +57 -0
  47. mosesdecoder/moses/SearchCubePruning.h +48 -0
  48. mosesdecoder/moses/SearchNormal.cpp +423 -0
  49. mosesdecoder/moses/SquareMatrix.cpp +127 -0
  50. mosesdecoder/moses/StaticData.cpp +966 -0
mosesdecoder/moses/AlignmentInfoCollection.cpp ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "AlignmentInfoCollection.h"
21
+
22
+ namespace Moses
23
+ {
24
+
25
+ AlignmentInfoCollection AlignmentInfoCollection::s_instance;
26
+
27
+ AlignmentInfoCollection::AlignmentInfoCollection()
28
+ {
29
+ std::set<std::pair<size_t,size_t> > pairs;
30
+ m_emptyAlignmentInfo = Add(pairs);
31
+ }
32
+
33
+ AlignmentInfoCollection::~AlignmentInfoCollection()
34
+ {}
35
+
36
+ const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
37
+ {
38
+ return *m_emptyAlignmentInfo;
39
+ }
40
+
41
+ AlignmentInfo const *
42
+ AlignmentInfoCollection::
43
+ Add(AlignmentInfo const& ainfo)
44
+ {
45
+ #ifdef WITH_THREADS
46
+ {
47
+ boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
48
+ AlignmentInfoSet::const_iterator i = m_collection.find(ainfo);
49
+ if (i != m_collection.end())
50
+ return &*i;
51
+ }
52
+ boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
53
+ #endif
54
+ std::pair<AlignmentInfoSet::iterator, bool> ret = m_collection.insert(ainfo);
55
+ return &(*ret.first);
56
+ }
57
+
58
+
59
+
60
+ }
mosesdecoder/moses/AlignmentInfoCollection.h ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include "AlignmentInfo.h"
23
+
24
+ #include <set>
25
+
26
+ #ifdef WITH_THREADS
27
+ #include <boost/thread/shared_mutex.hpp>
28
+ #include <boost/thread/locks.hpp>
29
+ #endif
30
+
31
+ namespace Moses
32
+ {
33
+
34
+ /** Singleton collection of all AlignmentInfo objects.
35
+ * Used as a cache of all alignment info to save space.
36
+ */
37
+ class AlignmentInfoCollection
38
+ {
39
+ public:
40
+ static AlignmentInfoCollection &Instance() {
41
+ return s_instance;
42
+ }
43
+
44
+ /** Returns a pointer to an AlignmentInfo object with the same source-target
45
+ * alignment pairs as given in the argument. If the collection already
46
+ * contains such an object then returns a pointer to it; otherwise a new
47
+ * one is inserted.
48
+ */
49
+ private:
50
+ const AlignmentInfo* Add(AlignmentInfo const& ainfo);
51
+
52
+ public:
53
+ template<typename ALNREP>
54
+ AlignmentInfo const *
55
+ Add(ALNREP const & aln) {
56
+ return this->Add(AlignmentInfo(aln));
57
+ }
58
+
59
+ //! Returns a pointer to an empty AlignmentInfo object.
60
+ const AlignmentInfo &GetEmptyAlignmentInfo() const;
61
+
62
+ private:
63
+ typedef std::set<AlignmentInfo, AlignmentInfoOrderer> AlignmentInfoSet;
64
+
65
+
66
+ //! Only a single static variable should be created.
67
+ AlignmentInfoCollection();
68
+ ~AlignmentInfoCollection();
69
+
70
+ static AlignmentInfoCollection s_instance;
71
+
72
+ #ifdef WITH_THREADS
73
+ //reader-writer lock
74
+ mutable boost::shared_mutex m_accessLock;
75
+ #endif
76
+
77
+ AlignmentInfoSet m_collection;
78
+ const AlignmentInfo *m_emptyAlignmentInfo;
79
+ };
80
+
81
+ }
mosesdecoder/moses/AlignmentInfoTest.cpp ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010- University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <boost/test/unit_test.hpp>
21
+
22
+ #include "AlignmentInfo.h"
23
+ #include "AlignmentInfoCollection.h"
24
+
25
+ using namespace Moses;
26
+ using namespace std;
27
+
28
+ BOOST_AUTO_TEST_SUITE(alignment_info)
29
+
30
+ typedef pair<size_t,size_t> IndexPair;
31
+ typedef set<pair<size_t,size_t> > IndexSet;
32
+
33
+ struct AlignmentInfoFixture {
34
+ const AlignmentInfo* ai1;
35
+ const AlignmentInfo* ai2;
36
+ const AlignmentInfo* ai3;
37
+
38
+ AlignmentInfoFixture() {
39
+ AlignmentInfoCollection& collection = AlignmentInfoCollection::Instance();
40
+ IndexSet aligns1,aligns2,aligns3;
41
+ aligns1.insert(IndexPair(1,1));
42
+ aligns1.insert(IndexPair(2,1));
43
+ aligns2.insert(IndexPair(1,1));
44
+ aligns2.insert(IndexPair(2,1));
45
+ aligns3.insert(IndexPair(1,2));
46
+ aligns3.insert(IndexPair(2,1));
47
+ ai1 = collection.Add(aligns1);
48
+ ai2 = collection.Add(aligns2);
49
+ ai3 = collection.Add(aligns3);
50
+ }
51
+
52
+ };
53
+
54
+ BOOST_FIXTURE_TEST_CASE(comparator, AlignmentInfoFixture)
55
+ {
56
+ BOOST_CHECK(*ai1 == *ai2);
57
+ BOOST_CHECK(*ai1 == *ai1);
58
+ BOOST_CHECK(*ai2 == *ai2);
59
+ BOOST_CHECK(*ai3 == *ai3);
60
+ BOOST_CHECK(!(*ai2 == *ai3));
61
+ BOOST_CHECK(!(*ai1 == *ai3));
62
+ }
63
+
64
+ BOOST_FIXTURE_TEST_CASE(hasher, AlignmentInfoFixture)
65
+ {
66
+ //simple test that same objects give same hash
67
+ AlignmentInfoHasher hash;
68
+ BOOST_CHECK_EQUAL(hash(*ai1), hash(*ai2));
69
+ }
70
+
71
+ BOOST_AUTO_TEST_SUITE_END()
mosesdecoder/moses/BaseManager.cpp ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "BaseManager.h"
2
+ #include "StaticData.h"
3
+ #include "moses/FF/StatelessFeatureFunction.h"
4
+ #include "moses/FF/StatefulFeatureFunction.h"
5
+ #include "moses/TranslationTask.h"
6
+
7
+ #include <vector>
8
+ #include <boost/algorithm/string/predicate.hpp>
9
+ #include <boost/iostreams/device/file.hpp>
10
+ #include <boost/iostreams/filter/bzip2.hpp>
11
+ #include <boost/iostreams/filter/gzip.hpp>
12
+ #include <boost/iostreams/filtering_stream.hpp>
13
+ #include <boost/filesystem.hpp>
14
+
15
+ using namespace std;
16
+
17
+ namespace Moses
18
+ {
19
+
20
+ BaseManager::BaseManager(ttasksptr const& ttask)
21
+ : m_ttask(ttask), m_source(*(ttask->GetSource().get()))
22
+ { }
23
+
24
+ const InputType&
25
+ BaseManager::GetSource() const
26
+ {
27
+ return m_source;
28
+ }
29
+
30
+ const ttasksptr
31
+ BaseManager::GetTtask() const
32
+ {
33
+ return m_ttask.lock();
34
+ }
35
+
36
+ void
37
+ BaseManager::
38
+ OutputSearchGraphAsHypergraph(std::ostream& out) const
39
+ {
40
+ // This virtual function that may not be implemented everywhere, but it should for
41
+ // derived classes that use it
42
+ UTIL_THROW2("Not implemented.");
43
+ }
44
+
45
+ void
46
+ BaseManager::
47
+ OutputSearchGraphAsHypergraph(std::string const& fname, size_t const precision) const
48
+ {
49
+ std::string odir = boost::filesystem::path(fname).parent_path().string();
50
+ if (! boost::filesystem::exists(odir))
51
+ boost::filesystem::create_directory(odir);
52
+ UTIL_THROW_IF2(!boost::filesystem::is_directory(odir),
53
+ "Cannot output hypergraphs to " << odir
54
+ << " because that path exists but is not a directory.");
55
+
56
+ // not clear why we need to output the weights every time we dump a search
57
+ // graph into a file again, but that's what the old code did.
58
+
59
+ string weightsFile = odir + "/weights";
60
+ TRACE_ERR("The weights file is " << weightsFile << "\n");
61
+ ofstream weightsOut;
62
+ weightsOut.open(weightsFile.c_str());
63
+ weightsOut.setf(std::ios::fixed);
64
+ weightsOut.precision(6);
65
+ // just temporarily, till we've implemented weight scoring in the manager
66
+ // (or the translation task)
67
+ StaticData::Instance().GetAllWeights().Save(weightsOut);
68
+ weightsOut.close();
69
+
70
+ boost::iostreams::filtering_ostream file;
71
+ if (boost::ends_with(fname, ".gz"))
72
+ file.push(boost::iostreams::gzip_compressor());
73
+ else if (boost::ends_with(fname, ".bz2"))
74
+ file.push( boost::iostreams::bzip2_compressor() );
75
+ file.push( boost::iostreams::file_sink(fname, ios_base::out) );
76
+ if (file.is_complete() && file.good()) {
77
+ file.setf(std::ios::fixed);
78
+ file.precision(precision);
79
+ this->OutputSearchGraphAsHypergraph(file);
80
+ file.flush();
81
+ } else {
82
+ TRACE_ERR("Cannot output hypergraph for line "
83
+ << this->GetSource().GetTranslationId()
84
+ << " because the output file " << fname
85
+ << " is not open or not ready for writing"
86
+ << std::endl);
87
+ }
88
+ file.pop();
89
+ }
90
+
91
+
92
+
93
+
94
+ /***
95
+ * print surface factor only for the given phrase
96
+ */
97
+ void
98
+ BaseManager::
99
+ OutputSurface(std::ostream &out, Phrase const& phrase) const
100
+ {
101
+ std::vector<FactorType> const& factor_order = options()->output.factor_order;
102
+
103
+ bool markUnknown = options()->unk.mark;
104
+ std::string const& fd = options()->output.factor_delimiter;
105
+
106
+ size_t size = phrase.GetSize();
107
+ for (size_t pos = 0 ; pos < size ; pos++) {
108
+ const Factor *factor = phrase.GetFactor(pos, factor_order[0]);
109
+ UTIL_THROW_IF2(factor == NULL, "Empty factor 0 at position " << pos);
110
+
111
+ const Word &word = phrase.GetWord(pos);
112
+ if(markUnknown && word.IsOOV()) {
113
+ out << options()->unk.prefix;
114
+ }
115
+
116
+ out << *factor;
117
+
118
+ for (size_t i = 1 ; i < factor_order.size() ; i++) {
119
+ const Factor *factor = phrase.GetFactor(pos, factor_order[i]);
120
+ UTIL_THROW_IF2(!factor, "Empty factor " << i << " at position " << pos);
121
+ out << fd << *factor;
122
+ }
123
+
124
+ if(markUnknown && word.IsOOV()) {
125
+ out << options()->unk.suffix;
126
+ }
127
+
128
+ out << " ";
129
+ }
130
+ }
131
+
132
+ // Emulates the old operator<<(ostream &, const DottedRule &) function. The
133
+ // output format is a bit odd (reverse order and double spacing between symbols)
134
+ // but there are scripts and tools that expect the output of -T to look like
135
+ // that.
136
+ void BaseManager::WriteApplicationContext(std::ostream &out,
137
+ const ApplicationContext &context) const
138
+ {
139
+ assert(!context.empty());
140
+ ApplicationContext::const_reverse_iterator p = context.rbegin();
141
+ while (true) {
142
+ out << p->second << "=" << p->first << " ";
143
+ if (++p == context.rend()) {
144
+ break;
145
+ }
146
+ out << " ";
147
+ }
148
+ }
149
+
150
+ AllOptions::ptr const&
151
+ BaseManager::
152
+ options() const
153
+ {
154
+ return GetTtask()->options();
155
+ }
156
+
157
+
158
+ } // namespace
159
+
160
+
mosesdecoder/moses/Bitmaps.cpp ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <boost/foreach.hpp>
2
+ #include "Bitmaps.h"
3
+ #include "Util.h"
4
+
5
+ using namespace std;
6
+
7
+ namespace Moses
8
+ {
9
+ Bitmaps::Bitmaps(size_t inputSize, const std::vector<bool> &initSourceCompleted)
10
+ {
11
+ m_initBitmap = new Bitmap(inputSize, initSourceCompleted);
12
+ m_coll[m_initBitmap];
13
+ }
14
+
15
+ Bitmaps::~Bitmaps()
16
+ {
17
+ BOOST_FOREACH (const Coll::value_type& myPair, m_coll) {
18
+ const Bitmap *bm = myPair.first;
19
+ delete bm;
20
+ }
21
+ }
22
+
23
+ const Bitmap &Bitmaps::GetNextBitmap(const Bitmap &bm, const Range &range)
24
+ {
25
+ Bitmap *newBM = new Bitmap(bm, range);
26
+
27
+ Coll::const_iterator iter = m_coll.find(newBM);
28
+ if (iter == m_coll.end()) {
29
+ m_coll[newBM] = NextBitmaps();
30
+ return *newBM;
31
+ } else {
32
+ delete newBM;
33
+ return *iter->first;
34
+ }
35
+ }
36
+
37
+ const Bitmap &Bitmaps::GetBitmap(const Bitmap &bm, const Range &range)
38
+ {
39
+ Coll::iterator iter = m_coll.find(&bm);
40
+ assert(iter != m_coll.end());
41
+
42
+ const Bitmap *newBM;
43
+ NextBitmaps &next = iter->second;
44
+ NextBitmaps::const_iterator iterNext = next.find(range);
45
+ if (iterNext == next.end()) {
46
+ // not seen the link yet.
47
+ newBM = &GetNextBitmap(bm, range);
48
+ next[range] = newBM;
49
+ } else {
50
+ // link exist
51
+ //std::cerr << "link exists" << endl;
52
+ newBM = iterNext->second;
53
+ }
54
+ return *newBM;
55
+ }
56
+
57
+ }
58
+
mosesdecoder/moses/ChartCell.h ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2010 Hieu Hoang
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #pragma once
23
+
24
+ #include <iostream>
25
+ #include <queue>
26
+ #include <map>
27
+ #include <vector>
28
+ #include "Word.h"
29
+ #include "Range.h"
30
+ #include "NonTerminal.h"
31
+ #include "ChartHypothesis.h"
32
+ #include "ChartHypothesisCollection.h"
33
+ #include "RuleCube.h"
34
+ #include "ChartCellLabelSet.h"
35
+
36
+ #include <boost/scoped_ptr.hpp>
37
+ #include <boost/functional/hash.hpp>
38
+ #include <boost/unordered_map.hpp>
39
+ #include <boost/version.hpp>
40
+
41
+ namespace Moses
42
+ {
43
+ class ChartSearchGraphWriter;
44
+ class ChartTranslationOptionList;
45
+ class ChartCellCollection;
46
+ class ChartManager;
47
+
48
+ class ChartCellBase
49
+ {
50
+ public:
51
+ ChartCellBase(size_t startPos, size_t endPos);
52
+
53
+ virtual ~ChartCellBase();
54
+
55
+ const ChartCellLabelSet &GetTargetLabelSet() const {
56
+ return m_targetLabelSet;
57
+ }
58
+
59
+ ChartCellLabelSet &MutableTargetLabelSet() {
60
+ return m_targetLabelSet;
61
+ }
62
+
63
+ const Range &GetCoverage() const {
64
+ return m_coverage;
65
+ }
66
+
67
+ protected:
68
+ const Range m_coverage;
69
+ ChartCellLabelSet m_targetLabelSet;
70
+ };
71
+
72
+ /** 1 cell in chart decoder.
73
+ * Doesn't directly hold hypotheses. Each cell contain a map of ChartHypothesisCollection that have different constituent labels
74
+ */
75
+ class ChartCell : public ChartCellBase
76
+ {
77
+ friend std::ostream& operator<<(std::ostream&, const ChartCell&);
78
+ public:
79
+ #if defined(BOOST_VERSION) && (BOOST_VERSION >= 104200)
80
+ typedef boost::unordered_map<Word,
81
+ ChartHypothesisCollection,
82
+ NonTerminalHasher,
83
+ NonTerminalEqualityPred
84
+ > MapType;
85
+ #else
86
+ typedef std::map<Word, ChartHypothesisCollection> MapType;
87
+ #endif
88
+
89
+ protected:
90
+ MapType m_hypoColl;
91
+
92
+ bool m_nBestIsEnabled; /**< flag to determine whether to keep track of old arcs */
93
+ ChartManager &m_manager;
94
+
95
+ public:
96
+ ChartCell(size_t startPos, size_t endPos, ChartManager &manager);
97
+ ~ChartCell();
98
+
99
+ void Decode(const ChartTranslationOptionList &transOptList
100
+ ,const ChartCellCollection &allChartCells);
101
+
102
+ //! Get all hypotheses in the cell that have the specified constituent label
103
+ const HypoList *GetSortedHypotheses(const Word &constituentLabel) const {
104
+ MapType::const_iterator p = m_hypoColl.find(constituentLabel);
105
+ return (p == m_hypoColl.end()) ? NULL : &(p->second.GetSortedHypotheses());
106
+ }
107
+
108
+ //! for n-best list
109
+ const HypoList *GetAllSortedHypotheses() const;
110
+
111
+ bool AddHypothesis(ChartHypothesis *hypo);
112
+
113
+ void SortHypotheses();
114
+ void PruneToSize();
115
+
116
+ const ChartHypothesis *GetBestHypothesis() const;
117
+
118
+ void CleanupArcList();
119
+
120
+ void OutputSizes(std::ostream &out) const;
121
+ size_t GetSize() const;
122
+
123
+ void WriteSearchGraph(const ChartSearchGraphWriter& writer, const std::map<unsigned,bool> &reachable) const;
124
+
125
+ };
126
+
127
+ }
128
+
mosesdecoder/moses/ChartCellLabel.h ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include "HypoList.h"
23
+ #include "Word.h"
24
+ #include "Range.h"
25
+ #include "ChartParserCallback.h"
26
+
27
+ namespace search
28
+ {
29
+ class Vertex;
30
+ }
31
+
32
+ namespace Moses
33
+ {
34
+
35
+ class Word;
36
+
37
+ /** Contains a range, word (non-terms?) and a vector of hypotheses.
38
+ * @todo This is probably incompatible with lattice decoding when the word that spans
39
+ * a position (or positions) can vary.
40
+ * @todo is this to hold sorted hypotheses that are in the queue for creating the next hypos?
41
+ */
42
+ class ChartCellLabel
43
+ {
44
+ public:
45
+ union Stack {
46
+ const HypoList *cube; // cube pruning
47
+ search::Vertex *incr; // incremental search after filling.
48
+ void *incr_generator; // incremental search during filling.
49
+ };
50
+
51
+
52
+ ChartCellLabel(const Range &coverage, const Word &label,
53
+ Stack stack=Stack())
54
+ : m_coverage(coverage)
55
+ , m_label(label)
56
+ , m_stack(stack)
57
+ , m_bestScore(0) {
58
+ }
59
+
60
+ const Range &GetCoverage() const {
61
+ return m_coverage;
62
+ }
63
+ const Word &GetLabel() const {
64
+ return m_label;
65
+ }
66
+ Stack GetStack() const {
67
+ return m_stack;
68
+ }
69
+ Stack &MutableStack() {
70
+ return m_stack;
71
+ }
72
+
73
+ //caching of best score on stack
74
+ float GetBestScore(const ChartParserCallback *outColl) const {
75
+ if (m_bestScore == 0) {
76
+ m_bestScore = outColl->GetBestScore(this);
77
+ }
78
+ return m_bestScore;
79
+ }
80
+
81
+ private:
82
+ const Range &m_coverage;
83
+ const Word &m_label;
84
+ //const InputPath &m_inputPath;
85
+ Stack m_stack;
86
+ mutable float m_bestScore;
87
+ };
88
+
89
+ }
mosesdecoder/moses/ChartHypothesis.h ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // vim:tabstop=2
2
+ /***********************************************************************
3
+ Moses - factored phrase-based language decoder
4
+ Copyright (C) 2010 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
+ #pragma once
22
+
23
+ #include <vector>
24
+ #include <boost/scoped_ptr.hpp>
25
+ #include "Util.h"
26
+ #include "Range.h"
27
+ #include "ScoreComponentCollection.h"
28
+ #include "Phrase.h"
29
+ #include "ChartTranslationOptions.h"
30
+ #include "ObjectPool.h"
31
+
32
+ namespace Moses
33
+ {
34
+
35
+ class ChartKBestExtractor;
36
+ class ChartHypothesis;
37
+ class ChartManager;
38
+ class RuleCubeItem;
39
+ class FFState;
40
+
41
+ typedef std::vector<ChartHypothesis*> ChartArcList;
42
+
43
+ /** a hypothesis in the hierarchical/syntax decoder.
44
+ * Contain a pointer to the current target phrase, a vector of previous hypos, and some scores
45
+ */
46
+ class ChartHypothesis
47
+ {
48
+ friend std::ostream& operator<<(std::ostream&, const ChartHypothesis&);
49
+ // friend class ChartKBestExtractor;
50
+
51
+ protected:
52
+
53
+ boost::shared_ptr<ChartTranslationOption> m_transOpt;
54
+
55
+ Range m_currSourceWordsRange;
56
+ std::vector<const FFState*> m_ffStates; /*! stateful feature function states */
57
+ /*! sum of scores of this hypothesis, and previous hypotheses. Lazily initialised. */
58
+ mutable boost::scoped_ptr<ScoreComponentCollection> m_scoreBreakdown;
59
+ mutable boost::scoped_ptr<ScoreComponentCollection> m_deltaScoreBreakdown;
60
+ ScoreComponentCollection m_currScoreBreakdown /*! scores for this hypothesis only */
61
+ ,m_lmNGram
62
+ ,m_lmPrefix;
63
+ float m_totalScore;
64
+
65
+ ChartArcList *m_arcList; /*! all arcs that end at the same trellis point as this hypothesis */
66
+ const ChartHypothesis *m_winningHypo;
67
+
68
+ std::vector<const ChartHypothesis*> m_prevHypos; // always sorted by source position?
69
+
70
+ ChartManager& m_manager;
71
+
72
+ unsigned m_id; /* pkoehn wants to log the order in which hypotheses were generated */
73
+
74
+ //! not implemented
75
+ ChartHypothesis();
76
+
77
+ //! not implemented
78
+ ChartHypothesis(const ChartHypothesis &copy);
79
+
80
+ public:
81
+ ChartHypothesis(const ChartTranslationOptions &, const RuleCubeItem &item,
82
+ ChartManager &manager);
83
+
84
+ //! only used by ChartKBestExtractor
85
+ ChartHypothesis(const ChartHypothesis &, const ChartKBestExtractor &);
86
+
87
+ ~ChartHypothesis();
88
+
89
+ unsigned GetId() const {
90
+ return m_id;
91
+ }
92
+
93
+ const ChartTranslationOption &GetTranslationOption() const {
94
+ return *m_transOpt;
95
+ }
96
+
97
+ //! Get the rule that created this hypothesis
98
+ const TargetPhrase &GetCurrTargetPhrase() const {
99
+ return m_transOpt->GetPhrase();
100
+ }
101
+
102
+ //! the source range that this hypothesis spans
103
+ const Range &GetCurrSourceRange() const {
104
+ return m_currSourceWordsRange;
105
+ }
106
+
107
+ //! the arc list when creating n-best lists
108
+ inline const ChartArcList* GetArcList() const {
109
+ return m_arcList;
110
+ }
111
+
112
+ //! the feature function states for a particular feature \param featureID
113
+ inline const FFState* GetFFState( size_t featureID ) const {
114
+ return m_ffStates[ featureID ];
115
+ }
116
+
117
+ //! reference back to the manager
118
+ inline const ChartManager& GetManager() const {
119
+ return m_manager;
120
+ }
121
+
122
+ void GetOutputPhrase(Phrase &outPhrase) const;
123
+ Phrase GetOutputPhrase() const;
124
+
125
+ // get leftmost/rightmost words only
126
+ // leftRightMost: 1=left, 2=right
127
+ void GetOutputPhrase(size_t leftRightMost, size_t numWords, Phrase &outPhrase) const;
128
+
129
+ void EvaluateWhenApplied();
130
+
131
+ void AddArc(ChartHypothesis *loserHypo);
132
+ void CleanupArcList();
133
+ void SetWinningHypo(const ChartHypothesis *hypo);
134
+
135
+ //! get the unweighted score for each feature function
136
+ const ScoreComponentCollection &GetScoreBreakdown() const {
137
+ // Note: never call this method before m_currScoreBreakdown is fully computed
138
+ if (!m_scoreBreakdown.get()) {
139
+ m_scoreBreakdown.reset(new ScoreComponentCollection());
140
+ // score breakdown from current translation rule
141
+ if (m_transOpt) {
142
+ m_scoreBreakdown->PlusEquals(GetTranslationOption().GetScores());
143
+ }
144
+ m_scoreBreakdown->PlusEquals(m_currScoreBreakdown);
145
+ // score breakdowns from prev hypos
146
+ for (std::vector<const ChartHypothesis*>::const_iterator iter = m_prevHypos.begin(); iter != m_prevHypos.end(); ++iter) {
147
+ const ChartHypothesis &prevHypo = **iter;
148
+ m_scoreBreakdown->PlusEquals(prevHypo.GetScoreBreakdown());
149
+ }
150
+ }
151
+ return *(m_scoreBreakdown.get());
152
+ }
153
+
154
+ //! get the unweighted score delta for each feature function
155
+ const ScoreComponentCollection &GetDeltaScoreBreakdown() const {
156
+ // Note: never call this method before m_currScoreBreakdown is fully computed
157
+ if (!m_deltaScoreBreakdown.get()) {
158
+ m_deltaScoreBreakdown.reset(new ScoreComponentCollection());
159
+ // score breakdown from current translation rule
160
+ if (m_transOpt) {
161
+ m_deltaScoreBreakdown->PlusEquals(GetTranslationOption().GetScores());
162
+ }
163
+ m_deltaScoreBreakdown->PlusEquals(m_currScoreBreakdown);
164
+ // delta: score breakdowns from prev hypos _not_ added
165
+ }
166
+ return *(m_deltaScoreBreakdown.get());
167
+ }
168
+
169
+ //! Get the weighted total score
170
+ float GetFutureScore() const {
171
+ // scores from current translation rule. eg. translation models & word penalty
172
+ return m_totalScore;
173
+ }
174
+
175
+ //! vector of previous hypotheses this hypo is built on
176
+ const std::vector<const ChartHypothesis*> &GetPrevHypos() const {
177
+ return m_prevHypos;
178
+ }
179
+
180
+ //! get a particular previous hypos
181
+ const ChartHypothesis* GetPrevHypo(size_t pos) const {
182
+ return m_prevHypos[pos];
183
+ }
184
+
185
+ //! get the constituency label that covers this hypo
186
+ const Word &GetTargetLHS() const {
187
+ return GetCurrTargetPhrase().GetTargetLHS();
188
+ }
189
+
190
+ //! get the best hypo in the arc list when doing n-best list creation. It's either this hypothesis, or the best hypo is this hypo is in the arc list
191
+ const ChartHypothesis* GetWinningHypothesis() const {
192
+ return m_winningHypo;
193
+ }
194
+
195
+ // for unordered_set in stack
196
+ size_t hash() const;
197
+ bool operator==(const ChartHypothesis& other) const;
198
+
199
+ TO_STRING();
200
+
201
+ }; // class ChartHypothesis
202
+
203
+ }
204
+
mosesdecoder/moses/ChartKBestExtractor.cpp ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2014 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "ChartKBestExtractor.h"
21
+
22
+ #include "ChartHypothesis.h"
23
+ #include "ScoreComponentCollection.h"
24
+ #include "StaticData.h"
25
+
26
+ #include <boost/scoped_ptr.hpp>
27
+
28
+ #include <vector>
29
+
30
+ using namespace std;
31
+
32
+ namespace Moses
33
+ {
34
+
35
// Extract the k-best list from the search graph.
// topLevelHypos must be non-empty and sorted best-first (checked below);
// the k best derivations are written to kBestList (cleared first), with
// the artificial top edge stripped from each.
void ChartKBestExtractor::Extract(
  const std::vector<const ChartHypothesis*> &topLevelHypos, std::size_t k,
  KBestVec &kBestList)
{
  kBestList.clear();
  if (topLevelHypos.empty()) {
    return;
  }

  // Create a new ChartHypothesis object, supremeHypo, that has the best
  // top-level hypothesis as its predecessor and has the same score.
  std::vector<const ChartHypothesis*>::const_iterator p = topLevelHypos.begin();
  const ChartHypothesis &bestTopLevelHypo = **p;
  boost::scoped_ptr<ChartHypothesis> supremeHypo(
    new ChartHypothesis(bestTopLevelHypo, *this));

  // Do the same for each alternative top-level hypothesis, but add the new
  // ChartHypothesis objects as arcs from supremeHypo, as if they had been
  // recombined.
  for (++p; p != topLevelHypos.end(); ++p) {
    // Check that the first item in topLevelHypos really was the best.
    UTIL_THROW_IF2((*p)->GetFutureScore() > bestTopLevelHypo.GetFutureScore(),
                   "top-level hypotheses are not correctly sorted");
    // Note: there's no need for a smart pointer here: supremeHypo will take
    // ownership of altHypo.
    ChartHypothesis *altHypo = new ChartHypothesis(**p, *this);
    supremeHypo->AddArc(altHypo);
  }

  // Create the target vertex then lazily fill its k-best list.
  boost::shared_ptr<Vertex> targetVertex = FindOrCreateVertex(*supremeHypo);
  LazyKthBest(*targetVertex, k, k);

  // Copy the k-best list from the target vertex, but drop the top edge from
  // each derivation.
  kBestList.reserve(targetVertex->kBestList.size());
  for (std::vector<boost::weak_ptr<Derivation> >::const_iterator
       q = targetVertex->kBestList.begin();
       q != targetVertex->kBestList.end(); ++q) {
    // Lock the weak_ptr; m_derivations keeps the object alive.
    const boost::shared_ptr<Derivation> d(*q);
    assert(d);
    // The supreme vertex has exactly one tail: the real top-level derivation.
    assert(d->subderivations.size() == 1);
    kBestList.push_back(d->subderivations[0]);
  }
}
81
+
82
// Generate the target-side yield of the derivation d.
// Walks the target phrase of the top edge, recursively expanding each
// non-terminal into the yield of the corresponding subderivation, and
// substitutes placeholder factors back in where configured.
Phrase ChartKBestExtractor::GetOutputPhrase(const Derivation &d)
{
  FactorType placeholderFactor = StaticData::Instance().options()->input.placeholder_factor;

  Phrase ret(ARRAY_SIZE_INCR);

  const ChartHypothesis &hypo = d.edge.head->hypothesis;
  const TargetPhrase &phrase = hypo.GetCurrTargetPhrase();
  const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
    phrase.GetAlignNonTerm().GetNonTermIndexMap();
  for (std::size_t pos = 0; pos < phrase.GetSize(); ++pos) {
    const Word &word = phrase.GetWord(pos);
    if (word.IsNonTerminal()) {
      // Splice in the yield of the subderivation plugged into this slot.
      std::size_t nonTermInd = nonTermIndexMap[pos];
      const Derivation &subderivation = *d.subderivations[nonTermInd];
      Phrase subPhrase = GetOutputPhrase(subderivation);
      ret.Append(subPhrase);
    } else {
      ret.AddWord(word);
      if (placeholderFactor == NOT_FOUND) {
        continue;
      }
      // Placeholder handling: if this target word is aligned to exactly one
      // source word, copy that source word's placeholder factor onto the
      // surface factor (factor 0) of the word just emitted.
      std::set<std::size_t> sourcePosSet =
        phrase.GetAlignTerm().GetAlignmentsForTarget(pos);
      if (sourcePosSet.size() == 1) {
        const std::vector<const Word*> *ruleSourceFromInputPath =
          hypo.GetTranslationOption().GetSourceRuleFromInputPath();
        UTIL_THROW_IF2(ruleSourceFromInputPath == NULL,
                       "Source Words in of the rules hasn't been filled out");
        std::size_t sourcePos = *sourcePosSet.begin();
        const Word *sourceWord = ruleSourceFromInputPath->at(sourcePos);
        UTIL_THROW_IF2(sourceWord == NULL,
                       "Null source word at position " << sourcePos);
        const Factor *factor = sourceWord->GetFactor(placeholderFactor);
        if (factor) {
          ret.Back()[0] = factor;
        }
      }
    }
  }

  return ret;
}
126
+
127
+ // Generate the score breakdown of the derivation d.
128
+ boost::shared_ptr<ScoreComponentCollection>
129
+ ChartKBestExtractor::GetOutputScoreBreakdown(const Derivation &d)
130
+ {
131
+ const ChartHypothesis &hypo = d.edge.head->hypothesis;
132
+ boost::shared_ptr<ScoreComponentCollection> scoreBreakdown(new ScoreComponentCollection());
133
+ scoreBreakdown->PlusEquals(hypo.GetDeltaScoreBreakdown());
134
+ const TargetPhrase &phrase = hypo.GetCurrTargetPhrase();
135
+ const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
136
+ phrase.GetAlignNonTerm().GetNonTermIndexMap();
137
+ for (std::size_t pos = 0; pos < phrase.GetSize(); ++pos) {
138
+ const Word &word = phrase.GetWord(pos);
139
+ if (word.IsNonTerminal()) {
140
+ std::size_t nonTermInd = nonTermIndexMap[pos];
141
+ const Derivation &subderivation = *d.subderivations[nonTermInd];
142
+ scoreBreakdown->PlusEquals(*GetOutputScoreBreakdown(subderivation));
143
+ }
144
+ }
145
+
146
+ return scoreBreakdown;
147
+ }
148
+
149
// Generate the target tree of the derivation d.
// Requires the "Tree" phrase property on the rule (i.e. tree output must
// have been enabled at model training/decoding time); throws otherwise.
TreePointer ChartKBestExtractor::GetOutputTree(const Derivation &d)
{
  const ChartHypothesis &hypo = d.edge.head->hypothesis;
  const TargetPhrase &phrase = hypo.GetCurrTargetPhrase();
  if (const PhraseProperty *property = phrase.GetProperty("Tree")) {
    const std::string *tree = property->GetValueString();
    TreePointer mytree (boost::make_shared<InternalTree>(*tree));

    //get subtrees (in target order)
    std::vector<TreePointer> previous_trees;
    for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
      const Word &word = phrase.GetWord(pos);
      if (word.IsNonTerminal()) {
        size_t nonTermInd = phrase.GetAlignNonTerm().GetNonTermIndexMap()[pos];
        const Derivation &subderivation = *d.subderivations[nonTermInd];
        const TreePointer prev_tree = GetOutputTree(subderivation);
        previous_trees.push_back(prev_tree);
      }
    }

    // Substitute the subtrees into this rule's tree fragment and undo any
    // binarization introduced during rule extraction.
    mytree->Combine(previous_trees);
    mytree->Unbinarize();
    return mytree;
  } else {
    UTIL_THROW2("Error: k-best tree output active, but no internal tree structure found");
  }
}
177
+
178
+ // Create an unweighted hyperarc corresponding to the given ChartHypothesis.
179
+ ChartKBestExtractor::UnweightedHyperarc ChartKBestExtractor::CreateEdge(
180
+ const ChartHypothesis &h)
181
+ {
182
+ UnweightedHyperarc edge;
183
+ edge.head = FindOrCreateVertex(h);
184
+ const std::vector<const ChartHypothesis*> &prevHypos = h.GetPrevHypos();
185
+ edge.tail.resize(prevHypos.size());
186
+ for (std::size_t i = 0; i < prevHypos.size(); ++i) {
187
+ const ChartHypothesis *prevHypo = prevHypos[i];
188
+ edge.tail[i] = FindOrCreateVertex(*prevHypo);
189
+ }
190
+ return edge;
191
+ }
192
+
193
// Look for the vertex corresponding to a given ChartHypothesis, creating
// a new one if necessary.
// A newly created vertex is seeded with the 1-best derivation along the
// hypothesis's own (best) incoming edge, recursing through GetPrevHypos().
boost::shared_ptr<ChartKBestExtractor::Vertex>
ChartKBestExtractor::FindOrCreateVertex(const ChartHypothesis &h)
{
  // Insert a placeholder first: a single hash lookup covers both the
  // "already present" and "newly created" cases.
  VertexMap::value_type element(&h, boost::shared_ptr<Vertex>());
  std::pair<VertexMap::iterator, bool> p = m_vertexMap.insert(element);
  boost::shared_ptr<Vertex> &sp = p.first->second;
  if (!p.second) {
    return sp; // Vertex was already in m_vertexMap.
  }
  sp.reset(new Vertex(h));
  // Create the 1-best derivation and add it to the vertex's kBestList.
  UnweightedHyperarc bestEdge;
  bestEdge.head = sp;
  const std::vector<const ChartHypothesis*> &prevHypos = h.GetPrevHypos();
  bestEdge.tail.resize(prevHypos.size());
  for (std::size_t i = 0; i < prevHypos.size(); ++i) {
    const ChartHypothesis *prevHypo = prevHypos[i];
    bestEdge.tail[i] = FindOrCreateVertex(*prevHypo);
  }
  boost::shared_ptr<Derivation> bestDerivation(new Derivation(bestEdge));
  // m_derivations owns all derivations; the vertex's kBestList holds only a
  // weak_ptr.  q is used solely by the assert, hence the NDEBUG guard.
#ifndef NDEBUG
  std::pair<DerivationSet::iterator, bool> q =
#endif
    m_derivations.insert(bestDerivation);
  assert(q.second);
  sp->kBestList.push_back(bestDerivation);
  return sp;
}
223
+
224
+ // Create the 1-best derivation for each edge in BS(v) (except the best one)
225
+ // and add it to v's candidate queue.
226
+ void ChartKBestExtractor::GetCandidates(Vertex &v, std::size_t k)
227
+ {
228
+ // Create derivations for all of v's incoming edges except the best. This
229
+ // means everything in v.hypothesis.GetArcList() and not the edge defined
230
+ // by v.hypothesis itself. The 1-best derivation for that edge will already
231
+ // have been created.
232
+ const ChartArcList *arcList = v.hypothesis.GetArcList();
233
+ if (arcList) {
234
+ for (std::size_t i = 0; i < arcList->size(); ++i) {
235
+ const ChartHypothesis &recombinedHypo = *(*arcList)[i];
236
+ boost::shared_ptr<Vertex> w = FindOrCreateVertex(recombinedHypo);
237
+ assert(w->kBestList.size() == 1);
238
+ v.candidates.push(w->kBestList[0]);
239
+ }
240
+ }
241
+ }
242
+
243
// Lazily fill v's k-best list.
// k is the number of derivations wanted at this vertex; globalK is the
// overall size of the k-best list being extracted (passed through when
// initializing candidate queues).
void ChartKBestExtractor::LazyKthBest(Vertex &v, std::size_t k,
                                      std::size_t globalK)
{
  // If this is the first visit to vertex v then initialize the priority queue.
  if (v.visited == false) {
    // The 1-best derivation should already be in v's k-best list.
    assert(v.kBestList.size() == 1);
    // Initialize v's priority queue.
    GetCandidates(v, globalK);
    v.visited = true;
  }
  // Add derivations to the k-best list until it contains k or there are none
  // left to add.
  while (v.kBestList.size() < k) {
    assert(!v.kBestList.empty());
    // Update the priority queue by adding the successors of the last
    // derivation (unless they've been seen before).
    // (Locking the weak_ptr is safe: m_derivations keeps it alive.)
    boost::shared_ptr<Derivation> d(v.kBestList.back());
    LazyNext(v, *d, globalK);
    // Check if there are any derivations left in the queue.
    if (v.candidates.empty()) {
      break;
    }
    // Get the next best derivation and delete it from the queue.
    boost::weak_ptr<Derivation> next = v.candidates.top();
    v.candidates.pop();
    // Add it to the k-best list.
    v.kBestList.push_back(next);
  }
}
274
+
275
// Create the neighbours of Derivation d and add them to v's candidate queue.
// A neighbour advances exactly one back pointer by one; the predecessor's
// k-best list is lazily extended first so the target entry exists.
void ChartKBestExtractor::LazyNext(Vertex &v, const Derivation &d,
                                   std::size_t globalK)
{
  for (std::size_t i = 0; i < d.edge.tail.size(); ++i) {
    Vertex &pred = *d.edge.tail[i];
    // Ensure that pred's k-best list contains enough derivations.
    std::size_t k = d.backPointers[i] + 2;
    LazyKthBest(pred, k, globalK);
    if (pred.kBestList.size() < k) {
      // pred's derivations have been exhausted.
      continue;
    }
    // Create the neighbour.
    boost::shared_ptr<Derivation> next(new Derivation(d, i));
    // Check if it has been created before.
    std::pair<DerivationSet::iterator, bool> p = m_derivations.insert(next);
    if (p.second) {
      v.candidates.push(next); // Haven't previously seen it.
    }
  }
}
297
+
298
+ // Construct the 1-best Derivation that ends at edge e.
299
+ ChartKBestExtractor::Derivation::Derivation(const UnweightedHyperarc &e)
300
+ {
301
+ edge = e;
302
+ std::size_t arity = edge.tail.size();
303
+ backPointers.resize(arity, 0);
304
+ subderivations.reserve(arity);
305
+ for (std::size_t i = 0; i < arity; ++i) {
306
+ const Vertex &pred = *edge.tail[i];
307
+ assert(pred.kBestList.size() >= 1);
308
+ boost::shared_ptr<Derivation> sub(pred.kBestList[0]);
309
+ subderivations.push_back(sub);
310
+ }
311
+ score = edge.head->hypothesis.GetFutureScore();
312
+ }
313
+
314
// Construct a Derivation that neighbours an existing Derivation.
// The neighbour is identical to d except that the i-th subderivation is
// advanced to the next entry of the corresponding tail vertex's k-best list.
// Precondition (ensured by LazyNext): that entry already exists.
ChartKBestExtractor::Derivation::Derivation(const Derivation &d, std::size_t i)
{
  edge.head = d.edge.head;
  edge.tail = d.edge.tail;
  backPointers = d.backPointers;
  subderivations = d.subderivations;
  std::size_t j = ++backPointers[i];
  score = d.score;
  // Deduct the score of the old subderivation.
  score -= subderivations[i]->score;
  // Update the subderivation pointer.
  boost::shared_ptr<Derivation> newSub(edge.tail[i]->kBestList[j]);
  subderivations[i] = newSub;
  // Add the score of the new subderivation.
  score += subderivations[i]->score;
}
331
+
332
+ } // namespace Moses
mosesdecoder/moses/ChartKBestExtractor.h ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2014 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include <cassert>
23
+ #include "ChartHypothesis.h"
24
+ #include "ScoreComponentCollection.h"
25
+ #include "FF/InternalTree.h"
26
+
27
+ #include <boost/unordered_set.hpp>
28
+ #include <boost/weak_ptr.hpp>
29
+ #include <boost/shared_ptr.hpp>
30
+
31
+ #include <queue>
32
+ #include <vector>
33
+
34
+ namespace Moses
35
+ {
36
+
37
+ // k-best list extractor that implements algorithm 3 from this paper:
38
+ //
39
+ // Liang Huang and David Chiang
40
+ // "Better k-best parsing"
41
+ // In Proceedings of IWPT 2005
42
+ //
43
class ChartKBestExtractor
{
public:
  struct Vertex;

  // A hyperarc in the search hypergraph, without an attached weight.
  // head is the vertex the arc derives; tail holds one vertex per
  // non-terminal (previous hypothesis) consumed by the rule.
  struct UnweightedHyperarc {
    boost::shared_ptr<Vertex> head;
    std::vector<boost::shared_ptr<Vertex> > tail;
  };

  // A full derivation: an edge plus, for each tail vertex, an index
  // (back pointer) into that vertex's k-best list together with the
  // corresponding subderivation.  score is the derivation's total score.
  struct Derivation {
    Derivation(const UnweightedHyperarc &);      // 1-best along an edge
    Derivation(const Derivation &, std::size_t); // neighbour: bump one back pointer

    UnweightedHyperarc edge;
    std::vector<std::size_t> backPointers;
    std::vector<boost::shared_ptr<Derivation> > subderivations;
    float score;
  };

  // Orders derivations by ascending score so that a priority_queue's top()
  // is the highest-scoring candidate.
  struct DerivationOrderer {
    bool operator()(const boost::weak_ptr<Derivation> &d1,
                    const boost::weak_ptr<Derivation> &d2) const {
      boost::shared_ptr<Derivation> s1(d1);
      boost::shared_ptr<Derivation> s2(d2);
      return s1->score < s2->score;
    }
  };

  // A hypergraph vertex: one ChartHypothesis together with its lazily grown
  // k-best list and the candidate queue of not-yet-accepted derivations.
  struct Vertex {
    typedef std::priority_queue<boost::weak_ptr<Derivation>,
            std::vector<boost::weak_ptr<Derivation> >,
            DerivationOrderer> DerivationQueue;

    Vertex(const ChartHypothesis &h) : hypothesis(h), visited(false) {}

    const ChartHypothesis &hypothesis;
    std::vector<boost::weak_ptr<Derivation> > kBestList;
    DerivationQueue candidates;
    bool visited; // true once the candidate queue has been initialized
  };

  typedef std::vector<boost::shared_ptr<Derivation> > KBestVec;

  // Extract the k-best list from the search hypergraph given the full, sorted
  // list of top-level vertices.
  void Extract(const std::vector<const ChartHypothesis*> &topHypos,
               std::size_t k, KBestVec &);

  // Helpers turning a finished derivation into output artefacts.
  static Phrase GetOutputPhrase(const Derivation &);
  static boost::shared_ptr<ScoreComponentCollection> GetOutputScoreBreakdown(const Derivation &);
  static TreePointer GetOutputTree(const Derivation &);

private:
  typedef boost::unordered_map<const ChartHypothesis *,
          boost::shared_ptr<Vertex> > VertexMap;

  // Derivations hash/compare on (head, tail, backPointers); the
  // subderivation pointers are fully determined by those fields.
  struct DerivationHasher {
    std::size_t operator()(const boost::shared_ptr<Derivation> &d) const {
      std::size_t seed = 0;
      boost::hash_combine(seed, d->edge.head);
      boost::hash_combine(seed, d->edge.tail);
      boost::hash_combine(seed, d->backPointers);
      return seed;
    }
  };

  struct DerivationEqualityPred {
    bool operator()(const boost::shared_ptr<Derivation> &d1,
                    const boost::shared_ptr<Derivation> &d2) const {
      return d1->edge.head == d2->edge.head &&
             d1->edge.tail == d2->edge.tail &&
             d1->backPointers == d2->backPointers;
    }
  };

  typedef boost::unordered_set<boost::shared_ptr<Derivation>, DerivationHasher,
          DerivationEqualityPred> DerivationSet;

  UnweightedHyperarc CreateEdge(const ChartHypothesis &);
  boost::shared_ptr<Vertex> FindOrCreateVertex(const ChartHypothesis &);
  void GetCandidates(Vertex &, std::size_t);
  void LazyKthBest(Vertex &, std::size_t, std::size_t);
  void LazyNext(Vertex &, const Derivation &, std::size_t);

  VertexMap m_vertexMap;       // hypothesis -> vertex (owns the vertices)
  DerivationSet m_derivations; // owns every derivation created so far
};
131
+
132
+ } // namespace Moses
mosesdecoder/moses/ChartParser.cpp ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2010 Hieu Hoang
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "ChartParser.h"
23
+ #include "ChartParserCallback.h"
24
+ #include "ChartRuleLookupManager.h"
25
+ #include "StaticData.h"
26
+ #include "TreeInput.h"
27
+ #include "Sentence.h"
28
+ #include "DecodeGraph.h"
29
+ #include "moses/FF/UnknownWordPenaltyProducer.h"
30
+ #include "moses/TranslationModel/PhraseDictionary.h"
31
+ #include "moses/TranslationTask.h"
32
+
33
+ using namespace std;
34
+ using namespace Moses;
35
+
36
+ namespace Moses
37
+ {
38
+
39
// Hold a weak reference to the owning translation task so per-task options
// can be read later without extending the task's lifetime.
ChartParserUnknown
::ChartParserUnknown(ttasksptr const& ttask)
  : m_ttask(ttask)
{ }
43
+
44
ChartParserUnknown::~ChartParserUnknown()
{
  // Delete the Phrase copies created for OOV source words in Process().
  RemoveAllInColl(m_unksrcs);
}
48
+
49
// Options of the owning translation task.
// Precondition: the task is still alive (m_ttask.lock() must not be NULL).
AllOptions::ptr const&
ChartParserUnknown::
options() const
{
  return m_ttask.lock()->options();
}
55
+
56
+ void
57
+ ChartParserUnknown::
58
+ Process(const Word &sourceWord, const Range &range, ChartParserCallback &to)
59
+ {
60
+ // unknown word, add as trans opt
61
+ const StaticData &staticData = StaticData::Instance();
62
+ const UnknownWordPenaltyProducer &unknownWordPenaltyProducer
63
+ = UnknownWordPenaltyProducer::Instance();
64
+
65
+ size_t isDigit = 0;
66
+ if (options()->unk.drop) {
67
+ const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
68
+ const StringPiece s = f->GetString();
69
+ isDigit = s.find_first_of("0123456789");
70
+ if (isDigit == string::npos)
71
+ isDigit = 0;
72
+ else
73
+ isDigit = 1;
74
+ // modify the starting bitmap
75
+ }
76
+
77
+ Phrase* unksrc = new Phrase(1);
78
+ unksrc->AddWord() = sourceWord;
79
+ Word &newWord = unksrc->GetWord(0);
80
+ newWord.SetIsOOV(true);
81
+
82
+ m_unksrcs.push_back(unksrc);
83
+
84
+ // hack. Once the OOV FF is a phrase table, get rid of this
85
+ PhraseDictionary *firstPt = NULL;
86
+ if (PhraseDictionary::GetColl().size() == 0) {
87
+ firstPt = PhraseDictionary::GetColl()[0];
88
+ }
89
+
90
+ //TranslationOption *transOpt;
91
+ if (! options()->unk.drop || isDigit) {
92
+ // loop
93
+ const UnknownLHSList &lhsList = options()->syntax.unknown_lhs; // staticData.GetUnknownLHS();
94
+ UnknownLHSList::const_iterator iterLHS;
95
+ for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
96
+ const string &targetLHSStr = iterLHS->first;
97
+ float prob = iterLHS->second;
98
+
99
+ // lhs
100
+ //const Word &sourceLHS = staticData.GetInputDefaultNonTerminal();
101
+ Word *targetLHS = new Word(true);
102
+
103
+ targetLHS->CreateFromString(Output, options()->output.factor_order,
104
+ targetLHSStr, true);
105
+ UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");
106
+
107
+ // add to dictionary
108
+ TargetPhrase *targetPhrase = new TargetPhrase(firstPt);
109
+ Word &targetWord = targetPhrase->AddWord();
110
+ targetWord.CreateUnknownWord(sourceWord);
111
+
112
+ // scores
113
+ float unknownScore = FloorScore(TransformScore(prob));
114
+
115
+ targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
116
+ targetPhrase->SetTargetLHS(targetLHS);
117
+ targetPhrase->SetAlignmentInfo("0-0");
118
+ targetPhrase->EvaluateInIsolation(*unksrc);
119
+
120
+ if (!options()->output.detailed_tree_transrep_filepath.empty() ||
121
+ options()->nbest.print_trees || staticData.GetTreeStructure() != NULL) {
122
+ std::string prop = "[ ";
123
+ prop += (*targetLHS)[0]->GetString().as_string() + " ";
124
+ prop += sourceWord[0]->GetString().as_string() + " ]";
125
+ targetPhrase->SetProperty("Tree", prop);
126
+ }
127
+
128
+ // chart rule
129
+ to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
130
+ } // for (iterLHS
131
+ } else {
132
+ // drop source word. create blank trans opt
133
+ float unknownScore = FloorScore(-numeric_limits<float>::infinity());
134
+
135
+ TargetPhrase *targetPhrase = new TargetPhrase(firstPt);
136
+ // loop
137
+ const UnknownLHSList &lhsList = options()->syntax.unknown_lhs;//staticData.GetUnknownLHS();
138
+ UnknownLHSList::const_iterator iterLHS;
139
+ for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
140
+ const string &targetLHSStr = iterLHS->first;
141
+ //float prob = iterLHS->second;
142
+
143
+ Word *targetLHS = new Word(true);
144
+ targetLHS->CreateFromString(Output, staticData.options()->output.factor_order,
145
+ targetLHSStr, true);
146
+ UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");
147
+
148
+ targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
149
+ targetPhrase->EvaluateInIsolation(*unksrc);
150
+
151
+ targetPhrase->SetTargetLHS(targetLHS);
152
+
153
+ // chart rule
154
+ to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
155
+ }
156
+ }
157
+ }
158
+
159
// Set up per-sentence parsing state: input paths for every source span and
// one rule lookup manager per phrase dictionary.
ChartParser
::ChartParser(ttasksptr const& ttask, ChartCellCollectionBase &cells)
  : m_ttask(ttask)
  , m_unknown(ttask)
  , m_decodeGraphList(StaticData::Instance().GetDecodeGraphs())
  , m_source(*(ttask->GetSource().get()))
{
  const StaticData &staticData = StaticData::Instance();

  staticData.InitializeForInput(ttask);
  CreateInputPaths(m_source);

  // Dictionaries and decode graphs are treated as parallel arrays
  // (asserted below); each dictionary gets its own lookup manager.
  const std::vector<PhraseDictionary*> &dictionaries = PhraseDictionary::GetColl();
  assert(dictionaries.size() == m_decodeGraphList.size());
  m_ruleLookupManagers.reserve(dictionaries.size());
  for (std::size_t i = 0; i < dictionaries.size(); ++i) {
    const PhraseDictionary *dict = dictionaries[i];
    PhraseDictionary *nonConstDict = const_cast<PhraseDictionary*>(dict);
    std::size_t maxChartSpan = m_decodeGraphList[i]->GetMaxChartSpan();
    ChartRuleLookupManager *lookupMgr = nonConstDict->CreateRuleLookupManager(*this, cells, maxChartSpan);
    m_ruleLookupManagers.push_back(lookupMgr);
  }

}
183
+
184
+ ChartParser::~ChartParser()
185
+ {
186
+ RemoveAllInColl(m_ruleLookupManagers);
187
+ StaticData::Instance().CleanUpAfterSentenceProcessing(m_ttask.lock());
188
+
189
+ InputPathMatrix::const_iterator iterOuter;
190
+ for (iterOuter = m_inputPathMatrix.begin(); iterOuter != m_inputPathMatrix.end(); ++iterOuter) {
191
+ const std::vector<InputPath*> &outer = *iterOuter;
192
+
193
+ std::vector<InputPath*>::const_iterator iterInner;
194
+ for (iterInner = outer.begin(); iterInner != outer.end(); ++iterInner) {
195
+ InputPath *path = *iterInner;
196
+ delete path;
197
+ }
198
+ }
199
+ }
200
+
201
// Look up every translation rule applicable to the given source range and
// hand the results to the callback.  For single-word spans that are not at
// the sentence edges, unknown-word options are created when no rule matched
// (or unconditionally, if so configured).
void ChartParser::Create(const Range &range, ChartParserCallback &to)
{
  assert(m_decodeGraphList.size() == m_ruleLookupManagers.size());

  std::vector <DecodeGraph*>::const_iterator iterDecodeGraph;
  std::vector <ChartRuleLookupManager*>::const_iterator iterRuleLookupManagers = m_ruleLookupManagers.begin();
  for (iterDecodeGraph = m_decodeGraphList.begin(); iterDecodeGraph != m_decodeGraphList.end(); ++iterDecodeGraph, ++iterRuleLookupManagers) {
    const DecodeGraph &decodeGraph = **iterDecodeGraph;
    assert(decodeGraph.GetSize() == 1);
    ChartRuleLookupManager &ruleLookupManager = **iterRuleLookupManagers;
    // maxSpan == 0 means "no span limit" for this decode graph.
    size_t maxSpan = decodeGraph.GetMaxChartSpan();
    size_t last = m_source.GetSize()-1;
    if (maxSpan != 0) {
      last = min(last, range.GetStartPos()+maxSpan);
    }
    if (maxSpan == 0 || range.GetNumWordsCovered() <= maxSpan) {
      const InputPath &inputPath = GetInputPath(range);
      ruleLookupManager.GetChartRuleCollection(inputPath, last, to);
    }
  }

  // NOTE(review): positions 0 and size-1 are excluded — presumably the <s>
  // and </s> boundary tokens, which must never become unknown words; confirm.
  if (range.GetNumWordsCovered() == 1
      && range.GetStartPos() != 0
      && range.GetStartPos() != m_source.GetSize()-1) {
    bool always = options()->unk.always_create_direct_transopt;
    if (to.Empty() || always) {
      // create unknown words for 1 word coverage where we don't have any trans options
      const Word &sourceWord = m_source.GetWord(range.GetStartPos());
      m_unknown.Process(sourceWord, range, to);
    }
  }
}
233
+
234
// Build the triangular matrix of InputPath objects, one per source span.
// Row index = start position; column index = span length - 1.  Each path of
// length > 1 links back to the path covering the same start with one word
// fewer.
void ChartParser::CreateInputPaths(const InputType &input)
{
  size_t size = input.GetSize();
  m_inputPathMatrix.resize(size);

  UTIL_THROW_IF2(input.GetType() != SentenceInput && input.GetType() != TreeInputType,
                 "Input must be a sentence or a tree, " <<
                 "not lattice or confusion networks");

  TranslationTask const* ttask = m_ttask.lock().get();
  // Iterate by increasing span length so the shorter prefix path already
  // exists when the longer one that refers to it is built.
  for (size_t phaseSize = 1; phaseSize <= size; ++phaseSize) {
    for (size_t startPos = 0; startPos < size - phaseSize + 1; ++startPos) {
      size_t endPos = startPos + phaseSize -1;
      vector<InputPath*> &vec = m_inputPathMatrix[startPos];

      Range range(startPos, endPos);
      Phrase subphrase(input.GetSubString(Range(startPos, endPos)));
      const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos);

      InputPath *node;
      if (range.GetNumWordsCovered() == 1) {
        node = new InputPath(ttask, subphrase, labels, range, NULL, NULL);
        vec.push_back(node);
      } else {
        const InputPath &prevNode = GetInputPath(startPos, endPos - 1);
        node = new InputPath(ttask, subphrase, labels, range, &prevNode, NULL);
        vec.push_back(node);
      }

      //m_inputPathQueue.push_back(node);
    }
  }
}
267
+
268
+ const InputPath &ChartParser::GetInputPath(const Range &range) const
269
+ {
270
+ return GetInputPath(range.GetStartPos(), range.GetEndPos());
271
+ }
272
+
273
+ const InputPath &ChartParser::GetInputPath(size_t startPos, size_t endPos) const
274
+ {
275
+ size_t offset = endPos - startPos;
276
+ UTIL_THROW_IF2(offset >= m_inputPathMatrix[startPos].size(),
277
+ "Out of bound: " << offset);
278
+ return *m_inputPathMatrix[startPos][offset];
279
+ }
280
+
281
+ InputPath &ChartParser::GetInputPath(size_t startPos, size_t endPos)
282
+ {
283
+ size_t offset = endPos - startPos;
284
+ UTIL_THROW_IF2(offset >= m_inputPathMatrix[startPos].size(),
285
+ "Out of bound: " << offset);
286
+ return *m_inputPathMatrix[startPos][offset];
287
+ }
288
+ /*
289
+ const Sentence &ChartParser::GetSentence() const {
290
+ const Sentence &sentence = static_cast<const Sentence&>(m_source);
291
+ return sentence;
292
+ }
293
+ */
294
// Number of words in the source input.
size_t ChartParser::GetSize() const
{
  return m_source.GetSize();
}
298
+
299
// Id of the input being decoded (used for logging/output).
long ChartParser::GetTranslationId() const
{
  return m_source.GetTranslationId();
}
303
+
304
+
305
// Options of the owning translation task.
// Precondition: the task is still alive (m_ttask.lock() must not be NULL).
AllOptions::ptr const&
ChartParser::
options() const
{
  return m_ttask.lock()->options();
}
311
+
312
+
313
+ } // namespace Moses
mosesdecoder/moses/ChartRuleLookupManager.cpp ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ #include "ChartRuleLookupManager.h"
2
+ #include "ChartParser.h"
3
+
4
+ namespace Moses
5
+ {
6
// Empty destructor defined out of line — presumably to anchor the class's
// vtable in this translation unit; TODO confirm.
ChartRuleLookupManager::~ChartRuleLookupManager()
{}
8
+ } // namespace Moses
9
+
mosesdecoder/moses/ChartRuleLookupManager.h ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+ #ifndef moses_ChartRuleLookupManager_h
22
+ #define moses_ChartRuleLookupManager_h
23
+
24
+ #include "ChartCellCollection.h"
25
+ #include "InputType.h"
26
+
27
+ namespace Moses
28
+ {
29
+ class ChartParser;
30
+ class ChartParserCallback;
31
+ class Range;
32
+ class Sentence;
33
+
34
/** Defines an interface for looking up rules in a rule table. Concrete
 * implementation classes should correspond to specific PhraseDictionary
 * subclasses (memory or on-disk). Since a ChartRuleLookupManager object
 * maintains sentence-specific state, exactly one should be created for
 * each sentence that is to be decoded.
 */
class ChartRuleLookupManager
{
public:
  /** \param parser   the chart parser this lookup manager serves
   *  \param cellColl chart cells holding the hypotheses built so far
   */
  ChartRuleLookupManager(const ChartParser &parser,
                         const ChartCellCollectionBase &cellColl)
    : m_parser(parser)
    , m_cellCollection(cellColl) {}

  virtual ~ChartRuleLookupManager();

  //! Set of target-side labels available for the source span [begin, end].
  const ChartCellLabelSet &GetTargetLabelSet(size_t begin, size_t end) const {
    return m_cellCollection.GetBase(Range(begin, end)).GetTargetLabelSet();
  }

  //! The chart parser this lookup manager was created for.
  const ChartParser &GetParser() const {
    return m_parser;
  }
  //const Sentence &GetSentence() const;

  //! Label of the single source word at position \p at.
  const ChartCellLabel &GetSourceAt(size_t at) const {
    return m_cellCollection.GetSourceWordLabel(at);
  }

  /** abstract function. Return a vector of translation options for given a range in the input sentence
   * \param inputPath source range for which you want the translation options
   * \param lastPos last position to consider if using lookahead
   * \param outColl return argument
   */
  virtual void GetChartRuleCollection(
    const InputPath &inputPath,
    size_t lastPos, // last position to consider if using lookahead
    ChartParserCallback &outColl) = 0;

private:
  //! Non-copyable: copy constructor and assignment operator not implemented.
  ChartRuleLookupManager(const ChartRuleLookupManager &);
  //! Non-copyable: copy constructor and assignment operator not implemented.
  ChartRuleLookupManager &operator=(const ChartRuleLookupManager &);

  const ChartParser &m_parser;          //!< parser being served (not owned)
  const ChartCellCollectionBase &m_cellCollection; //!< chart cells (not owned)
};
81
+
82
+ } // namespace Moses
83
+
84
+ #endif
mosesdecoder/moses/ChartTranslationOptionList.cpp ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 Hieu Hoang
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <algorithm>
21
+ #include <iostream>
22
+ #include <vector>
23
+ #include "StaticData.h"
24
+ #include "ChartTranslationOptionList.h"
25
+ #include "ChartTranslationOptions.h"
26
+ #include "ChartCellCollection.h"
27
+ #include "Range.h"
28
+ #include "InputType.h"
29
+ #include "InputPath.h"
30
+
31
+ using namespace std;
32
+
33
+ namespace Moses
34
+ {
35
+
36
// Construct an empty option list with the given rule limit.
// NOTE: the `input` parameter is currently unused; kept for interface
// compatibility with callers.
ChartTranslationOptionList::
ChartTranslationOptionList(size_t ruleLimit, const InputType &input)
  : m_size(0)
  , m_ruleLimit(ruleLimit)
{
  // +infinity means "no option rejected yet"; tightened as options are added.
  m_scoreThreshold = std::numeric_limits<float>::infinity();
}
43
+
44
// Deletes all owned ChartTranslationOptions objects (including the "unused"
// ones beyond m_size that are kept around for reuse).
ChartTranslationOptionList::~ChartTranslationOptionList()
{
  RemoveAllInColl(m_collection);
}
48
+
49
// Logically empty the list without freeing memory: the objects already in
// m_collection stay allocated and are overwritten by subsequent Add() calls.
void ChartTranslationOptionList::Clear()
{
  m_size = 0;
  m_scoreThreshold = std::numeric_limits<float>::infinity();
}
54
+
55
// Comparator ordering options by descending estimate-of-best-score
// (best first); used with NTH_ELEMENT4 for pruning.
class ChartTranslationOptionOrderer
{
public:
  bool operator()(const ChartTranslationOptions* itemA, const ChartTranslationOptions* itemB) const {
    return itemA->GetEstimateOfBestScore() > itemB->GetEstimateOfBestScore();
  }
};
62
+
63
// Add a translation option (target phrase collection + the hypothesis stacks
// backing its non-terminals) for the given source range. Keeps the list within
// the rule limit via lazy pruning: options accumulate until 2 * m_ruleLimit,
// then the list is cut back to the best m_ruleLimit entries.
void ChartTranslationOptionList::Add(const TargetPhraseCollection &tpc,
                                     const StackVec &stackVec,
                                     const Range &range)
{
  if (tpc.IsEmpty()) {
    return;
  }

  // A rule is unusable if any of its non-terminals has an empty hypothesis
  // stack: there is nothing to substitute for that non-terminal.
  for (size_t i = 0; i < stackVec.size(); ++i) {
    const ChartCellLabel &chartCellLabel = *stackVec[i];
    size_t numHypos = chartCellLabel.GetStack().cube->size();
    if (numHypos == 0) {
      return; // empty stack. These rules can't be used
    }
  }

  // Estimate of the best achievable score: the first (best) target phrase's
  // future score plus the best hypothesis score of each sub-derivation stack.
  const TargetPhrase &targetPhrase = **(tpc.begin());
  float score = targetPhrase.GetFutureScore();
  for (StackVec::const_iterator p = stackVec.begin(); p != stackVec.end(); ++p) {
    score += (*p)->GetBestScore(this);
  }

  // If the rule limit has already been reached then don't add the option
  // unless it is better than at least one existing option.
  if (m_ruleLimit && m_size > m_ruleLimit && score < m_scoreThreshold) {
    return;
  }

  // Add the option to the list.
  if (m_size == m_collection.size()) {
    // m_collection has reached capacity: create a new object.
    m_collection.push_back(new ChartTranslationOptions(tpc, stackVec,
                           range, score));
  } else {
    // Overwrite an unused object (left over from an earlier Clear()).
    *(m_collection[m_size]) = ChartTranslationOptions(tpc, stackVec,
                              range, score);
  }
  ++m_size;

  // If the rule limit hasn't been exceeded then update the threshold.
  if (!m_ruleLimit || m_size <= m_ruleLimit) {
    m_scoreThreshold = (score < m_scoreThreshold) ? score : m_scoreThreshold;
  }

  // Prune if bursting: partially sort so the best m_ruleLimit options come
  // first, then tighten the threshold to the worst surviving estimate.
  if (m_ruleLimit && m_size == m_ruleLimit * 2) {
    NTH_ELEMENT4(m_collection.begin(),
                 m_collection.begin() + m_ruleLimit - 1,
                 m_collection.begin() + m_size,
                 ChartTranslationOptionOrderer());
    m_scoreThreshold = m_collection[m_ruleLimit-1]->GetEstimateOfBestScore();
    m_size = m_ruleLimit;
  }
}
118
+
119
// Add a single out-of-vocabulary target phrase for the given range.
// The freshly created single-phrase collection is parked in `waste_memory`
// so it outlives this call (Add() stores a reference to it, not a copy).
void
ChartTranslationOptionList::
AddPhraseOOV(TargetPhrase &phrase,
             std::list<TargetPhraseCollection::shared_ptr > &waste_memory,
             const Range &range)
{
  TargetPhraseCollection::shared_ptr tpc(new TargetPhraseCollection);
  tpc->Add(&phrase);
  waste_memory.push_back(tpc);
  StackVec empty; // an OOV phrase has no non-terminals, hence no stacks
  Add(*tpc, empty, range);
}
131
+
132
// Final pruning pass: first enforce the rule limit, then drop every option
// whose estimated best score is below (best score + threshold).
// `threshold` is expected to be a non-positive margin relative to the best.
void ChartTranslationOptionList::ApplyThreshold(float const threshold)
{
  if (m_ruleLimit && m_size > m_ruleLimit) {
    // Something's gone wrong if the list has grown to m_ruleLimit * 2
    // without being pruned.
    assert(m_size < m_ruleLimit * 2);
    // Reduce the list to the best m_ruleLimit options. The remaining
    // options can be overwritten on subsequent calls to Add().
    NTH_ELEMENT4(m_collection.begin(),
                 m_collection.begin()+m_ruleLimit,
                 m_collection.begin()+m_size,
                 ChartTranslationOptionOrderer());
    m_size = m_ruleLimit;
  }

  // keep only those over best + threshold

  // Find the best estimate among the live options.
  float scoreThreshold = -std::numeric_limits<float>::infinity();

  CollType::const_iterator iter;
  for (iter = m_collection.begin(); iter != m_collection.begin()+m_size; ++iter) {
    const ChartTranslationOptions *transOpt = *iter;
    float score = transOpt->GetEstimateOfBestScore();
    scoreThreshold = (score > scoreThreshold) ? score : scoreThreshold;
  }

  scoreThreshold += threshold; // StaticData::Instance().GetTranslationOptionThreshold();

  // Partition so surviving options come first; order within partitions is
  // unspecified, which is fine since the list is re-scored downstream.
  CollType::iterator bound = std::partition(m_collection.begin(),
                             m_collection.begin()+m_size,
                             ScoreThresholdPred(scoreThreshold));

  m_size = std::distance(m_collection.begin(), bound);
}
166
+
167
// Future score of the best hypothesis in the given chart cell's stack.
// The stack is assumed sorted best-first (front element is the best).
float ChartTranslationOptionList::GetBestScore(const ChartCellLabel *chartCell) const
{
  const HypoList *stack = chartCell->GetStack().cube;
  assert(stack);
  assert(!stack->empty());
  const ChartHypothesis &bestHypo = **(stack->begin());
  return bestHypo.GetFutureScore();
}
175
+
176
// Run source-context feature evaluation on each live option, then compact the
// list by swapping options that became empty to the tail. Only the first
// m_size entries are live; entries beyond that are recycled storage.
void ChartTranslationOptionList::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
{
  // NEVER iterate over ALL of the collection. Just over the first m_size
  CollType::iterator iter;
  for (iter = m_collection.begin(); iter != m_collection.begin() + m_size; ++iter) {
    ChartTranslationOptions &transOpts = **iter;
    transOpts.EvaluateWithSourceContext(input, inputPath);
  }

  // get rid of empty trans opts: shift survivors left over the discarded
  // slots (the discarded objects are kept at the tail for reuse, not freed).
  size_t numDiscard = 0;
  for (size_t i = 0; i < m_size; ++i) {
    ChartTranslationOptions *transOpts = m_collection[i];
    if (transOpts->GetSize() == 0) {
      //delete transOpts;
      ++numDiscard;
    } else if (numDiscard) {
      SwapTranslationOptions(i - numDiscard, i);
      //m_collection[] = transOpts;
    }
  }

  size_t newSize = m_size - numDiscard;
  m_size = newSize;
}
201
+
202
+ void ChartTranslationOptionList::SwapTranslationOptions(size_t a, size_t b)
203
+ {
204
+ ChartTranslationOptions *transOptsA = m_collection[a];
205
+ ChartTranslationOptions *transOptsB = m_collection[b];
206
+ m_collection[a] = transOptsB;
207
+ m_collection[b] = transOptsA;
208
+ }
209
+
210
// Debug printer. Note: deliberately dumps the WHOLE collection (including
// recycled entries past m_size), one option set per line.
std::ostream& operator<<(std::ostream &out, const ChartTranslationOptionList &obj)
{
  for (size_t i = 0; i < obj.m_collection.size(); ++i) {
    const ChartTranslationOptions &transOpts = *obj.m_collection[i];
    out << transOpts << endl;
  }
  return out;
}
218
+
219
+ }
mosesdecoder/moses/ChartTranslationOptions.h ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 Hieu Hoang
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include "StackVec.h"
23
+ #include "TargetPhrase.h"
24
+ #include "TargetPhraseCollection.h"
25
+ #include "Range.h"
26
+
27
+ #include <vector>
28
+ #include <boost/shared_ptr.hpp>
29
+ #include "ChartTranslationOption.h"
30
+
31
+ namespace Moses
32
+ {
33
+ class ChartTranslationOption;
34
+ class InputPath;
35
+ class InputType;
36
+
37
/** Similar to a DottedRule, but contains a direct reference to a list
 * of translations and provides an estimate of the best score. For a specific
 * range in the input sentence.
 */
class ChartTranslationOptions
{
  friend std::ostream& operator<<(std::ostream&, const ChartTranslationOptions&);

public:
  typedef std::vector<boost::shared_ptr<ChartTranslationOption> > CollType;

  /** Constructor
      \param targetPhraseColl the target phrases this option translates to
      \param stackVec hypothesis stacks backing each source non-terminal
      \param range the range in the source sentence this translation option covers
      \param score precomputed estimate of the best achievable score
  */
  ChartTranslationOptions(const TargetPhraseCollection &targetPhraseColl,
                          const StackVec &stackVec,
                          const Range &range,
                          float score);
  ~ChartTranslationOptions();

  //! Best-score estimate: top target phrase plus best hypo of each stack.
  static float CalcEstimateOfBestScore(const TargetPhraseCollection &,
                                       const StackVec &);

  size_t GetSize() const {
    return m_collection.size();
  }

  //! Hypothesis stacks (one per source non-terminal) this option draws from.
  const StackVec &GetStackVec() const {
    return m_stackVec;
  }

  //! All candidate target phrases sharing this source side / stack vector.
  const CollType &GetTargetPhrases() const {
    return m_collection;
  }

  //! the range in the source sentence this translation option covers
  const Range &GetSourceWordsRange() const {
    return *m_wordsRange;
  }

  /** return an estimate of the best score possible with this translation option.
   * the estimate is the sum of the top target phrase's estimated score plus the
   * scores of the best child hypotheses.
   */
  inline float GetEstimateOfBestScore() const {
    return m_estimateOfBestScore;
  }

  //! Re-evaluate contained phrases with source-context features (may empty this option).
  void EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath);

  void SetInputPath(const InputPath *inputPath);

  void CreateSourceRuleFromInputPath();

private:

  StackVec m_stackVec; //! vector of hypothesis list!
  CollType m_collection; //! shared-ptr wrapped ChartTranslationOption objects

  const Range *m_wordsRange; //! covered source range (not owned)
  float m_estimateOfBestScore;
};
103
+
104
+ }
mosesdecoder/moses/ConfusionNet.cpp ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
2
+ // $Id$
3
+
4
+ #include "ConfusionNet.h"
5
+ #include <sstream>
6
+
7
+ #include "FactorCollection.h"
8
+ #include "Util.h"
9
+ #include "TranslationOptionCollectionConfusionNet.h"
10
+ #include "StaticData.h"
11
+ #include "Sentence.h"
12
+ #include "moses/FF/InputFeature.h"
13
+ #include "util/exception.hh"
14
+ #include "moses/TranslationTask.h"
15
+ namespace Moses
16
+ {
17
// Process-wide confusion-net usage counters; the (static) instance prints a
// summary to stderr from its destructor at program exit.
struct CNStats {
  size_t created,destr,read,colls,words;

  CNStats() : created(0),destr(0),read(0),colls(0),words(0) {}
  ~CNStats() {
    print(std::cerr);
  }

  //! record construction of a ConfusionNet
  void createOne() {
    ++created;
  }
  //! record destruction of a ConfusionNet
  void destroyOne() {
    ++destr;
  }

  //! accumulate column/word counts from a successfully read net
  void collect(const ConfusionNet& cn) {
    ++read;
    colls+=cn.GetSize();
    for(size_t i=0; i<cn.GetSize(); ++i)
      words+=cn[i].size();
  }
  //! dump the counters (only if at least one net was ever created)
  void print(std::ostream& out) const {
    if(created>0) {
      out<<"confusion net statistics:\n"
         " created:\t"<<created<<"\n"
         " destroyed:\t"<<destr<<"\n"
         " succ. read:\t"<<read<<"\n"
         " columns:\t"<<colls<<"\n"
         " words:\t"<<words<<"\n"
         " avg. word/column:\t"<<words/(1.0*colls)<<"\n"
         " avg. cols/sent:\t"<<colls/(1.0*read)<<"\n"
         "\n\n";
    }
  }
};
52
+
53
+ CNStats stats;
54
+
55
// Distance to the next column; always 1 for a confusion net (unlike a
// general word lattice, where arcs may skip columns).
size_t
ConfusionNet::
GetColumnIncrement(size_t i, size_t j) const
{
  (void) i; // parameters kept to satisfy the InputType interface
  (void) j;
  return 1;
}
63
+
64
// Construct an empty confusion net bound to decoder options.
// For syntax-based search, seed the default non-terminal label set.
// Requires the global InputFeature to be configured (scores per arc).
ConfusionNet::
ConfusionNet(AllOptions::ptr const& opts) : InputType(opts)
{
  stats.createOne();

  if (is_syntax(opts->search.algo)) {
    m_defaultLabelSet.insert(opts->syntax.input_default_non_terminal);
  }
  UTIL_THROW_IF2(InputFeature::InstancePtr() == NULL, "Input feature must be specified");
}
74
+
75
// Destructor only updates the global usage counters.
ConfusionNet::
~ConfusionNet()
{
  stats.destroyOne();
}
80
+
81
+ ConfusionNet::
82
+ ConfusionNet(Sentence const& s) : InputType(s.options())
83
+ {
84
+ data.resize(s.GetSize());
85
+ for(size_t i=0; i<s.GetSize(); ++i) {
86
+ ScorePair scorePair;
87
+ std::pair<Word, ScorePair > temp = std::make_pair(s.GetWord(i), scorePair);
88
+ data[i].push_back(temp);
89
+ }
90
+ }
91
+
92
// Dispatch reading to the requested on-disk format.
// format 0: one column per line, "word p1 p2 ..." tokens (ReadFormat0).
// format 1: header-based format (ReadFormat1).
// Returns false (after logging) for unknown formats or parse failure.
bool
ConfusionNet::
ReadF(std::istream& in, int format)
{
  VERBOSE(2, "read confusion net with format "<<format<<"\n");
  switch(format) {
  case 0:
    return ReadFormat0(in);
  case 1:
    return ReadFormat1(in);
  default:
    std::cerr << "ERROR: unknown format '"<<format
              <<"' in ConfusionNet::Read";
  }
  return false;
}
108
+
109
// InputType entry point: read one net in format 0 and update the global
// statistics on success. Returns the bool result widened to int (0/1).
int
ConfusionNet::
Read(std::istream& in)
{
  int rv=ReadF(in,0);
  if(rv) stats.collect(*this);
  return rv;
}
117
+
118
// Parse format 0: each line is one column; tokens alternate word and its
// input scores ("word p1 .. pN word p1 .. pN ..."). Reading stops at the
// first empty column (blank line) or EOF. Probabilities are clamped to
// [0,1], converted to log scores, and clipped at LOWEST_SCORE.
bool
ConfusionNet::
ReadFormat0(std::istream& in)
{
  Clear();
  const std::vector<FactorType>& factorOrder = m_options->input.factor_order;

  const InputFeature *inputFeature = InputFeature::InstancePtr();
  size_t numInputScores = inputFeature->GetNumInputScores();
  size_t numRealWordCount = inputFeature->GetNumRealWordsInInput();

  size_t totalCount = numInputScores + numRealWordCount;
  bool addRealWordCount = (numRealWordCount > 0);

  std::string line;
  while(getline(in,line)) {
    std::istringstream is(line);
    std::string word;

    Column col;
    while(is>>word) {
      Word w;
      w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
      std::vector<float> probs(totalCount, 0.0);
      for(size_t i=0; i < numInputScores; i++) {
        double prob;
        if (!(is>>prob)) {
          TRACE_ERR("ERROR: unable to parse CN input - bad link probability, "
                    << "or wrong number of scores\n");
          return false;
        }
        // clamp out-of-range probabilities rather than failing
        if(prob<0.0) {
          VERBOSE(1, "WARN: negative prob: "<<prob<<" ->set to 0.0\n");
          prob=0.0;
        } else if (prob>1.0) {
          VERBOSE(1, "WARN: prob > 1.0 : "<<prob<<" -> set to 1.0\n");
          prob=1.0;
        }
        probs[i] = (std::max(static_cast<float>(log(prob)),LOWEST_SCORE));

      }
      // store 'real' word count in last feature if we have one more
      // weight than we do arc scores and not epsilon
      if (addRealWordCount && word!=EPSILON && word!="")
        probs.back() = -1.0;

      ScorePair scorePair(probs);

      col.push_back(std::make_pair(w,scorePair));
    }
    if(col.size()) {
      data.push_back(col);
      ShrinkToFit(data.back()); // trim excess capacity per column
    } else break; // blank line terminates the net
  }
  return !data.empty();
}
175
+
176
+ bool
177
+ ConfusionNet::
178
+ ReadFormat1(std::istream& in)
179
+ {
180
+ Clear();
181
+ const std::vector<FactorType>& factorOrder = m_options->input.factor_order;
182
+ std::string line;
183
+ if(!getline(in,line)) return 0;
184
+ size_t s;
185
+ if(getline(in,line)) s=atoi(line.c_str());
186
+ else return 0;
187
+ data.resize(s);
188
+ for(size_t i=0; i<data.size(); ++i) {
189
+ if(!getline(in,line)) return 0;
190
+ std::istringstream is(line);
191
+ if(!(is>>s)) return 0;
192
+ std::string word;
193
+ double prob;
194
+ data[i].resize(s);
195
+ for(size_t j=0; j<s; ++j)
196
+ if(is>>word>>prob) {
197
+ //TODO: we are only reading one prob from this input format, should read many... but this function is unused anyway. -JS
198
+ data[i][j].second.denseScores = std::vector<float> (1);
199
+ data[i][j].second.denseScores.push_back((float) log(prob));
200
+ if(data[i][j].second.denseScores[0]<0) {
201
+ VERBOSE(1, "WARN: neg costs: "<<data[i][j].second.denseScores[0]<<" -> set to 0\n");
202
+ data[i][j].second.denseScores[0]=0.0;
203
+ }
204
+ // String2Word(word,data[i][j].first,factorOrder);
205
+ Word& w = data[i][j].first;
206
+ w.CreateFromString(Input,factorOrder,StringPiece(word),false,false);
207
+ } else return 0;
208
+ }
209
+ return !data.empty();
210
+ }
211
+
212
// Debug dump: one line per column, each arc printed as
// "(word, , dense1, dense2, ..., sparseKey=val, ...)".
void ConfusionNet::Print(std::ostream& out) const
{
  out<<"conf net: "<<data.size()<<"\n";
  for(size_t i=0; i<data.size(); ++i) {
    out<<i<<" -- ";
    for(size_t j=0; j<data[i].size(); ++j) {
      out<<"("<<data[i][j].first.ToString()<<", ";

      // dense
      std::vector<float>::const_iterator iterDense;
      for(iterDense = data[i][j].second.denseScores.begin();
          iterDense < data[i][j].second.denseScores.end();
          ++iterDense) {
        out<<", "<<*iterDense;
      }

      // sparse
      std::map<StringPiece, float>::const_iterator iterSparse;
      for(iterSparse = data[i][j].second.sparseScores.begin();
          iterSparse != data[i][j].second.sparseScores.end();
          ++iterSparse) {
        out << ", " << iterSparse->first << "=" << iterSparse->second;
      }

      out<<") ";
    }
    out<<"\n";
  }
  out<<"\n\n";
}
242
+
243
+ #ifdef _WIN32
244
+ #pragma warning(disable:4716)
245
+ #endif
246
// Not meaningful for a confusion net (a span has many word sequences);
// always throws. Required by the InputType interface.
Phrase
ConfusionNet::
GetSubString(const Range&) const
{
  UTIL_THROW2("ERROR: call to ConfusionNet::GetSubString\n");
  //return Phrase(Input);
}
253
+
254
// Not well defined for a confusion net; logs an error and returns "".
std::string
ConfusionNet::
GetStringRep(const std::vector<FactorType> /* factorsToPrint */) const //not well defined yet
{
  TRACE_ERR("ERROR: call to ConfusionNet::GeStringRep\n");
  return "";
}
261
+ #ifdef _WIN32
262
+ #pragma warning(disable:4716)
263
+ #endif
264
// Not meaningful for a confusion net (a position holds several words);
// always throws. Required by the InputType interface.
const Word& ConfusionNet::GetWord(size_t) const
{
  UTIL_THROW2("ERROR: call to ConfusionNet::GetFactorArray\n");
}
268
+ #ifdef _WIN32
269
+ #pragma warning(default:4716)
270
+ #endif
271
// Stream insertion: delegates to ConfusionNet::Print.
std::ostream& operator<<(std::ostream& out,const ConfusionNet& cn)
{
  cn.Print(out);
  return out;
}
276
+
277
// Factory for the confusion-net-specific translation option collection.
// Ownership of the returned object passes to the caller.
TranslationOptionCollection*
ConfusionNet::
CreateTranslationOptionCollection(ttasksptr const& ttask) const
{
  // size_t maxNoTransOptPerCoverage
  // = ttask->options()->search.max_trans_opt_per_cov;
  // float translationOptionThreshold
  // = ttask->options()->search.trans_opt_threshold;
  TranslationOptionCollection *rv
  = new TranslationOptionCollectionConfusionNet(ttask, *this);
  //, maxNoTransOptPerCoverage, translationOptionThreshold);
  assert(rv);
  return rv;
}
291
+
292
+ }
293
+
294
+
mosesdecoder/moses/ContextScope.h ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
2
+ // A class to store "local" information (such as task-specific caches).
3
+ // The idea is for each translation task to have a scope, which stores
4
+ // shared pointers to task-specific objects such as caches and priors.
5
+ // Since these objects are referenced via shared pointers, sopes can
6
+ // share information.
7
+ #pragma once
8
+
9
+ #ifdef WITH_THREADS
10
+ #include <boost/thread/shared_mutex.hpp>
11
+ #include <boost/thread/locks.hpp>
12
+ #include <boost/foreach.hpp>
13
+ #endif
14
+
15
+ // for some reason, the xmlrpc_c headers must be included AFTER the
16
+ // boost thread-related ones ...
17
+ #include "xmlrpc-c.h"
18
+
19
+ #include <map>
20
+ #include <boost/shared_ptr.hpp>
21
+ #include "TypeDef.h"
22
+ #include "Util.h"
23
+
24
+ namespace Moses
25
+ {
26
+ class ContextScope
27
+ {
28
+ protected:
29
+ typedef std::map<void const*, boost::shared_ptr<void> > scratchpad_t;
30
+ typedef scratchpad_t::iterator iter_t;
31
+ typedef scratchpad_t::value_type entry_t;
32
+ typedef scratchpad_t::const_iterator const_iter_t;
33
+ scratchpad_t m_scratchpad;
34
+ #ifdef WITH_THREADS
35
+ mutable boost::shared_mutex m_lock;
36
+ #endif
37
+ SPTR<std::map<std::string,float> const> m_context_weights;
38
+ public:
39
+ typedef boost::shared_ptr<ContextScope> ptr;
40
+ template<typename T>
41
+ boost::shared_ptr<void> const&
42
+ set(void const* const key, boost::shared_ptr<T> const& val) {
43
+ #ifdef WITH_THREADS
44
+ boost::unique_lock<boost::shared_mutex> lock(m_lock);
45
+ #endif
46
+ return (m_scratchpad[key] = val);
47
+ }
48
+
49
+ template<typename T>
50
+ boost::shared_ptr<T> const
51
+ get(void const* key, bool CreateNewIfNecessary=false) {
52
+ #ifdef WITH_THREADS
53
+ using boost::shared_mutex;
54
+ using boost::upgrade_lock;
55
+ // T const* key = reinterpret_cast<T const*>(xkey);
56
+ upgrade_lock<shared_mutex> lock(m_lock);
57
+ #endif
58
+ iter_t m = m_scratchpad.find(key);
59
+ boost::shared_ptr< T > ret;
60
+ if (m != m_scratchpad.end()) {
61
+ if (m->second == NULL && CreateNewIfNecessary) {
62
+ #ifdef WITH_THREADS
63
+ boost::upgrade_to_unique_lock<shared_mutex> xlock(lock);
64
+ #endif
65
+ m->second.reset(new T);
66
+ }
67
+ ret = boost::static_pointer_cast< T >(m->second);
68
+ return ret;
69
+ }
70
+ if (!CreateNewIfNecessary) return ret;
71
+ #ifdef WITH_THREADS
72
+ boost::upgrade_to_unique_lock<shared_mutex> xlock(lock);
73
+ #endif
74
+ ret.reset(new T);
75
+ m_scratchpad[key] = ret;
76
+ return ret;
77
+ }
78
+
79
+ ContextScope() { }
80
+
81
+ ContextScope(ContextScope const& other) {
82
+ #ifdef WITH_THREADS
83
+ boost::unique_lock<boost::shared_mutex> lock1(this->m_lock);
84
+ boost::unique_lock<boost::shared_mutex> lock2(other.m_lock);
85
+ #endif
86
+ m_scratchpad = other.m_scratchpad;
87
+ }
88
+
89
+ SPTR<std::map<std::string,float> const>
90
+ GetContextWeights() {
91
+ return m_context_weights;
92
+ }
93
+
94
+ bool
95
+ SetContextWeights(std::string const& spec) {
96
+ if (m_context_weights) return false;
97
+ boost::unique_lock<boost::shared_mutex> lock(m_lock);
98
+ SPTR<std::map<std::string,float> > M(new std::map<std::string, float>);
99
+
100
+ // TO DO; This needs to be done with StringPiece.find, not Tokenize
101
+ // PRIORITY: low
102
+ std::vector<std::string> tokens = Tokenize(spec,":");
103
+ for (std::vector<std::string>::iterator it = tokens.begin();
104
+ it != tokens.end(); it++) {
105
+ std::vector<std::string> key_and_value = Tokenize(*it, ",");
106
+ (*M)[key_and_value[0]] = atof(key_and_value[1].c_str());
107
+ }
108
+ m_context_weights = M;
109
+ return true;
110
+ }
111
+
112
+ bool
113
+ SetContextWeights(SPTR<std::map<std::string,float> const> const& w) {
114
+ if (m_context_weights) return false;
115
+ #ifdef WITH_THREADS
116
+ boost::unique_lock<boost::shared_mutex> lock(m_lock);
117
+ #endif
118
+ m_context_weights = w;
119
+ return true;
120
+ }
121
+
122
+ };
123
+
124
+ };
mosesdecoder/moses/DecodeGraph.cpp ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+
4
+ /***********************************************************************
5
+ Moses - factored phrase-based language decoder
6
+ Copyright (C) 2006 University of Edinburgh
7
+
8
+ This library is free software; you can redistribute it and/or
9
+ modify it under the terms of the GNU Lesser General Public
10
+ License as published by the Free Software Foundation; either
11
+ version 2.1 of the License, or (at your option) any later version.
12
+
13
+ This library is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ Lesser General Public License for more details.
17
+
18
+ You should have received a copy of the GNU Lesser General Public
19
+ License along with this library; if not, write to the Free Software
20
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
+ ***********************************************************************/
22
+
23
+ #include "DecodeGraph.h"
24
+ #include "DecodeStep.h"
25
+ #include "TypeDef.h"
26
+ #include "Util.h"
27
+
28
+ namespace Moses
29
+ {
30
// The graph owns its DecodeStep objects; free them all.
DecodeGraph::~DecodeGraph()
{
  RemoveAllInColl(m_steps);
}
34
+
35
//! Add another decode step to the graph (takes ownership of decodeStep
//! and back-links it to this graph).
void DecodeGraph::Add(DecodeStep *decodeStep)
{
  m_steps.push_back(decodeStep);
  decodeStep->SetContainer(this);
}
41
+
42
+ }
43
+
mosesdecoder/moses/DecodeStep.cpp ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "DecodeStep.h"
23
+ #include "GenerationDictionary.h"
24
+ #include "StaticData.h"
25
+ #include "moses/TranslationModel/PhraseDictionary.h"
26
+
27
+ namespace Moses
28
+ {
29
// Build a decode step around `decodeFeature` (translation or generation
// dictionary), chained after `prev`.
// Factor bookkeeping:
//  - m_outputFactors    = prev's output factors | this feature's outputs
//  - conflict factors   = factors produced both before and by this step
//  - new output factors = factors first produced by this step (xor of masks)
// Feature functions usable once m_outputFactors are available are recorded
// in m_featuresToApply; the rest go to m_featuresRemaining for later steps.
DecodeStep::DecodeStep(DecodeFeature *decodeFeature,
                       const DecodeStep* prev,
                       const std::vector<FeatureFunction*> &features)
  : m_decodeFeature(decodeFeature)
{
  FactorMask prevOutputFactors;
  if (prev) prevOutputFactors = prev->m_outputFactors;
  m_outputFactors = prevOutputFactors;
  FactorMask conflictMask = (m_outputFactors & decodeFeature->GetOutputFactorMask());
  m_outputFactors |= decodeFeature->GetOutputFactorMask();
  FactorMask newOutputFactorMask = m_outputFactors ^ prevOutputFactors; //xor
  m_newOutputFactors.resize(newOutputFactorMask.count());
  m_conflictFactors.resize(conflictMask.count());
  // expand the bit masks into explicit factor-index lists
  size_t j=0, k=0;
  for (size_t i = 0; i < MAX_NUM_FACTORS; i++) {
    if (newOutputFactorMask[i]) m_newOutputFactors[j++] = i;
    if (conflictMask[i]) m_conflictFactors[k++] = i;
  }
  VERBOSE(2,"DecodeStep():\n\toutputFactors=" << m_outputFactors
          << "\n\tconflictFactors=" << conflictMask
          << "\n\tnewOutputFactors=" << newOutputFactorMask << std::endl);

  // find out which feature function can be applied in this decode step
  for (size_t i = 0; i < features.size(); ++i) {
    FeatureFunction *feature = features[i];
    if (feature->IsUseable(m_outputFactors)) {
      m_featuresToApply.push_back(feature);
    } else {
      m_featuresRemaining.push_back(feature);
    }
  }

  decodeFeature->SetContainer(this);
}
63
+
64
// Out-of-line empty destructor (m_decodeFeature is not owned here).
DecodeStep::~DecodeStep() {}
65
+
66
/** returns phrase feature (dictionary) for translation step;
 *  null if this step's feature is not a PhraseDictionary */
const PhraseDictionary* DecodeStep::GetPhraseDictionaryFeature() const
{
  return dynamic_cast<const PhraseDictionary*>(m_decodeFeature);
}
71
+
72
/** returns generation feature (dictionary) for generation step;
 *  null if this step's feature is not a GenerationDictionary */
const GenerationDictionary* DecodeStep::GetGenerationDictionaryFeature() const
{
  return dynamic_cast<const GenerationDictionary*>(m_decodeFeature);
}
77
+
78
+ void DecodeStep::RemoveFeature(const FeatureFunction *ff)
79
+ {
80
+ for (size_t i = 0; i < m_featuresToApply.size(); ++i) {
81
+ if (ff == m_featuresToApply[i]) {
82
+ m_featuresToApply.erase(m_featuresToApply.begin() + i);
83
+ return;
84
+ }
85
+ }
86
+ }
87
+
88
+ }
89
+
90
+
mosesdecoder/moses/DecodeStepGeneration.h ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_DecodeStepGeneration_h
23
+ #define moses_DecodeStepGeneration_h
24
+
25
+ #include "DecodeStep.h"
26
+
27
+ namespace Moses
28
+ {
29
+
30
+ class GenerationDictionary;
31
+ class Phrase;
32
+ class ScoreComponentCollection;
33
+
34
+ //! subclass of DecodeStep for generation step
35
+ class DecodeStepGeneration : public DecodeStep
36
+ {
37
+ public:
38
+ DecodeStepGeneration(GenerationDictionary* dict,
39
+ const DecodeStep* prev,
40
+ const std::vector<FeatureFunction*> &features);
41
+
42
+
43
+ void Process(const TranslationOption &inputPartialTranslOpt
44
+ , const DecodeStep &decodeStep
45
+ , PartialTranslOptColl &outputPartialTranslOptColl
46
+ , TranslationOptionCollection *toc
47
+ , bool adhereTableLimit) const;
48
+
49
+ private:
50
+ };
51
+
52
+
53
+ }
54
+ #endif
mosesdecoder/moses/DecodeStepTranslation.cpp ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "DecodeStepTranslation.h"
23
+ #include "TranslationOption.h"
24
+ #include "TranslationOptionCollection.h"
25
+ #include "PartialTranslOptColl.h"
26
+ #include "FactorCollection.h"
27
+ #include "util/exception.hh"
28
+
29
+ using namespace std;
30
+
31
+ namespace Moses
32
+ {
33
+ DecodeStepTranslation::DecodeStepTranslation(PhraseDictionary* pdf,
34
+ const DecodeStep* prev,
35
+ const std::vector<FeatureFunction*> &features)
36
+ : DecodeStep(pdf, prev, features)
37
+ {
38
+ // don't apply feature functions that are from current phrase table.It should already have been
39
+ // dont by the phrase table.
40
+ const std::vector<FeatureFunction*> &pdfFeatures = pdf->GetFeaturesToApply();
41
+ for (size_t i = 0; i < pdfFeatures.size(); ++i) {
42
+ FeatureFunction *ff = pdfFeatures[i];
43
+ RemoveFeature(ff);
44
+ }
45
+ }
46
+
47
+ void DecodeStepTranslation::Process(const TranslationOption &inputPartialTranslOpt
48
+ , const DecodeStep &decodeStep
49
+ , PartialTranslOptColl &outputPartialTranslOptColl
50
+ , TranslationOptionCollection *toc
51
+ , bool adhereTableLimit
52
+ , TargetPhraseCollection::shared_ptr phraseColl) const
53
+ {
54
+ if (inputPartialTranslOpt.GetTargetPhrase().GetSize() == 0) {
55
+ // word deletion
56
+ outputPartialTranslOptColl.Add(new TranslationOption(inputPartialTranslOpt));
57
+ return;
58
+ }
59
+
60
+ // normal trans step
61
+ const Range &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange();
62
+ const InputPath &inputPath = inputPartialTranslOpt.GetInputPath();
63
+ const PhraseDictionary* phraseDictionary =
64
+ decodeStep.GetPhraseDictionaryFeature();
65
+ const TargetPhrase &inPhrase = inputPartialTranslOpt.GetTargetPhrase();
66
+ const size_t currSize = inPhrase.GetSize();
67
+ const size_t tableLimit = phraseDictionary->GetTableLimit();
68
+
69
+ if (phraseColl != NULL) {
70
+ TargetPhraseCollection::const_iterator iterTargetPhrase, iterEnd;
71
+ iterEnd = (!adhereTableLimit || tableLimit == 0 || phraseColl->GetSize() < tableLimit) ? phraseColl->end() : phraseColl->begin() + tableLimit;
72
+
73
+ for (iterTargetPhrase = phraseColl->begin(); iterTargetPhrase != iterEnd; ++iterTargetPhrase) {
74
+ const TargetPhrase& targetPhrase = **iterTargetPhrase;
75
+ // const ScoreComponentCollection &transScores = targetPhrase.GetScoreBreakdown();
76
+ // skip if the
77
+ if (targetPhrase.GetSize() != currSize) continue;
78
+
79
+ TargetPhrase outPhrase(inPhrase);
80
+
81
+ if (IsFilteringStep()) {
82
+ if (!inputPartialTranslOpt.IsCompatible(targetPhrase, m_conflictFactors))
83
+ continue;
84
+ }
85
+
86
+ outPhrase.Merge(targetPhrase, m_newOutputFactors);
87
+ outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
88
+
89
+ TranslationOption *newTransOpt = new TranslationOption(sourceWordsRange, outPhrase);
90
+ assert(newTransOpt != NULL);
91
+
92
+ newTransOpt->SetInputPath(inputPath);
93
+
94
+ outputPartialTranslOptColl.Add(newTransOpt );
95
+
96
+ }
97
+ } else if (sourceWordsRange.GetNumWordsCovered() == 1) {
98
+ // unknown handler
99
+ //toc->ProcessUnknownWord(sourceWordsRange.GetStartPos(), factorCollection);
100
+ }
101
+ }
102
+
103
+ void
104
+ DecodeStepTranslation::
105
+ ProcessInitialTranslation(InputType const& source,
106
+ PartialTranslOptColl &outputPartialTranslOptColl,
107
+ size_t startPos, size_t endPos,
108
+ bool adhereTableLimit,
109
+ InputPath const& inputPath,
110
+ TargetPhraseCollection::shared_ptr phraseColl) const
111
+ {
112
+ const PhraseDictionary* phraseDictionary = GetPhraseDictionaryFeature();
113
+ const size_t tableLimit = phraseDictionary->GetTableLimit();
114
+
115
+ const Range range(startPos, endPos);
116
+
117
+ if (phraseColl != NULL) {
118
+ IFVERBOSE(3) {
119
+ if(source.GetType() == SentenceInput)
120
+ TRACE_ERR("[" << source.GetSubString(range) << "; "
121
+ << startPos << "-" << endPos << "]\n");
122
+ else
123
+ TRACE_ERR("[" << startPos << "-" << endPos << "]" << std::endl);
124
+ }
125
+
126
+ TargetPhraseCollection::const_iterator iterTargetPhrase, iterEnd;
127
+ iterEnd = (!adhereTableLimit || tableLimit == 0 || phraseColl->GetSize() < tableLimit) ? phraseColl->end() : phraseColl->begin() + tableLimit;
128
+
129
+ for (iterTargetPhrase = phraseColl->begin() ; iterTargetPhrase != iterEnd ; ++iterTargetPhrase) {
130
+ const TargetPhrase &targetPhrase = **iterTargetPhrase;
131
+ TranslationOption *transOpt = new TranslationOption(range, targetPhrase);
132
+
133
+ transOpt->SetInputPath(inputPath);
134
+
135
+ outputPartialTranslOptColl.Add (transOpt);
136
+
137
+ VERBOSE(3,"\t" << targetPhrase << "\n");
138
+ }
139
+ VERBOSE(3,std::endl);
140
+ }
141
+ }
142
+
143
+ void
144
+ DecodeStepTranslation::
145
+ ProcessInitialTransLEGACY(InputType const& source,
146
+ PartialTranslOptColl &outputPartialTranslOptColl,
147
+ size_t startPos, size_t endPos,
148
+ bool adhereTableLimit,
149
+ InputPathList const& inputPathList) const
150
+ {
151
+ const PhraseDictionary* phraseDictionary = GetPhraseDictionaryFeature();
152
+ const size_t tableLimit = phraseDictionary->GetTableLimit();
153
+
154
+ const Range range(startPos, endPos);
155
+ TargetPhraseCollectionWithSourcePhrase::shared_ptr phraseColl
156
+ = phraseDictionary->GetTargetPhraseCollectionLEGACY(source,range);
157
+
158
+ if (phraseColl != NULL) {
159
+ IFVERBOSE(3) {
160
+ if(source.GetType() == SentenceInput)
161
+ TRACE_ERR("[" << source.GetSubString(range) << "; "
162
+ << startPos << "-" << endPos << "]\n");
163
+ else
164
+ TRACE_ERR("[" << startPos << "-" << endPos << "]" << std::endl);
165
+ }
166
+
167
+ const std::vector<Phrase> &sourcePhrases = phraseColl->GetSourcePhrases();
168
+
169
+ TargetPhraseCollection::const_iterator iterTargetPhrase, iterEnd;
170
+ std::vector<Phrase>::const_iterator iterSourcePhrase;
171
+ iterEnd = (!adhereTableLimit || tableLimit == 0 || phraseColl->GetSize() < tableLimit) ? phraseColl->end() : phraseColl->begin() + tableLimit;
172
+
173
+ for (iterTargetPhrase = phraseColl->begin(), iterSourcePhrase = sourcePhrases.begin()
174
+ ; iterTargetPhrase != iterEnd
175
+ ; ++iterTargetPhrase, ++iterSourcePhrase) {
176
+ assert(iterSourcePhrase != sourcePhrases.end());
177
+
178
+ const TargetPhrase &targetPhrase = **iterTargetPhrase;
179
+ const Phrase &sourcePhrase = *iterSourcePhrase;
180
+
181
+ const InputPath &inputPath = GetInputPathLEGACY(targetPhrase, sourcePhrase, inputPathList);
182
+
183
+ TranslationOption *transOpt = new TranslationOption(range, targetPhrase);
184
+ transOpt->SetInputPath(inputPath);
185
+
186
+ outputPartialTranslOptColl.Add (transOpt);
187
+
188
+ VERBOSE(3,"\t" << targetPhrase << "\n");
189
+ }
190
+ VERBOSE(3,std::endl);
191
+ }
192
+ }
193
+
194
+ const InputPath &DecodeStepTranslation::GetInputPathLEGACY(
195
+ const TargetPhrase targetPhrase,
196
+ const Phrase sourcePhrase,
197
+ const InputPathList &inputPathList) const
198
+ {
199
+ const Word &wordFromPt = sourcePhrase.GetWord(0);
200
+
201
+ InputPathList::const_iterator iter;
202
+ for (iter = inputPathList.begin(); iter != inputPathList.end(); ++iter) {
203
+ const InputPath &inputPath = **iter;
204
+ const Phrase &phraseFromIP = inputPath.GetPhrase();
205
+
206
+ const Word *wordIP = NULL;
207
+ for (size_t i = 0; i < phraseFromIP.GetSize(); ++i) {
208
+ const Word &tempWord = phraseFromIP.GetWord(i);
209
+ if (!tempWord.IsEpsilon()) {
210
+ wordIP = &tempWord;
211
+ break;
212
+ }
213
+ }
214
+
215
+ // const Range &range = inputPath.GetWordsRange();
216
+
217
+ if (wordIP && *wordIP == wordFromPt) {
218
+ return inputPath;
219
+ }
220
+ }
221
+
222
+ UTIL_THROW(util::Exception, "Input path not found");
223
+ }
224
+
225
+ void
226
+ DecodeStepTranslation::
227
+ ProcessLEGACY(TranslationOption const& in,
228
+ DecodeStep const& decodeStep,
229
+ PartialTranslOptColl &out,
230
+ TranslationOptionCollection *toc,
231
+ bool adhereTableLimit) const
232
+ {
233
+ if (in.GetTargetPhrase().GetSize() == 0) {
234
+ // word deletion
235
+ out.Add(new TranslationOption(in));
236
+ return;
237
+ }
238
+
239
+ // normal trans step
240
+ Range const& srcRange = in.GetSourceWordsRange();
241
+ InputPath const& inputPath = in.GetInputPath();
242
+ PhraseDictionary const* pdict = decodeStep.GetPhraseDictionaryFeature();
243
+ TargetPhrase const& inPhrase = in.GetTargetPhrase();
244
+ size_t const currSize = inPhrase.GetSize();
245
+ size_t const tableLimit = pdict->GetTableLimit();
246
+
247
+ TargetPhraseCollectionWithSourcePhrase::shared_ptr phraseColl
248
+ = pdict->GetTargetPhraseCollectionLEGACY(toc->GetSource(),srcRange);
249
+
250
+ if (phraseColl != NULL) {
251
+ TargetPhraseCollection::const_iterator iterTargetPhrase, iterEnd;
252
+ iterEnd = ((adhereTableLimit && tableLimit && phraseColl->GetSize() >= tableLimit)
253
+ ? phraseColl->begin() + tableLimit : phraseColl->end());
254
+
255
+ for (iterTargetPhrase = phraseColl->begin();
256
+ iterTargetPhrase != iterEnd;
257
+ ++iterTargetPhrase) {
258
+ TargetPhrase const& targetPhrase = **iterTargetPhrase;
259
+ if (targetPhrase.GetSize() != currSize ||
260
+ (IsFilteringStep() && !in.IsCompatible(targetPhrase, m_conflictFactors)))
261
+ continue;
262
+
263
+ TargetPhrase outPhrase(inPhrase);
264
+ outPhrase.Merge(targetPhrase, m_newOutputFactors);
265
+ outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply); // need to do this as all non-transcores would be screwed up
266
+
267
+ TranslationOption *newTransOpt = new TranslationOption(srcRange, outPhrase);
268
+ assert(newTransOpt != NULL);
269
+
270
+ newTransOpt->SetInputPath(inputPath);
271
+
272
+ out.Add(newTransOpt);
273
+
274
+ }
275
+ }
276
+ }
277
+ }
278
+
279
+
280
+
mosesdecoder/moses/DecodeStepTranslation.h ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_DecodeStepTranslation_h
23
+ #define moses_DecodeStepTranslation_h
24
+
25
+ #include "DecodeStep.h"
26
+ #include "moses/TranslationModel/PhraseDictionary.h"
27
+ #include "InputPath.h"
28
+
29
+ namespace Moses
30
+ {
31
+
32
+ class PhraseDictionary;
33
+ class TargetPhrase;
34
+ class InputPath;
35
+
36
+ //! subclass of DecodeStep for translation step
37
+ class DecodeStepTranslation : public DecodeStep
38
+ {
39
+ public:
40
+ DecodeStepTranslation(); //! not implemented
41
+ DecodeStepTranslation(PhraseDictionary* phraseFeature,
42
+ const DecodeStep* prev,
43
+ const std::vector<FeatureFunction*> &features);
44
+
45
+
46
+ virtual void Process(const TranslationOption &inputPartialTranslOpt
47
+ , const DecodeStep &decodeStep
48
+ , PartialTranslOptColl &outputPartialTranslOptColl
49
+ , TranslationOptionCollection *toc
50
+ , bool adhereTableLimit
51
+ , TargetPhraseCollection::shared_ptr phraseColl) const;
52
+
53
+
54
+ /*! initialize list of partial translation options by applying the first translation step
55
+ * Ideally, this function should be in DecodeStepTranslation class
56
+ */
57
+ void ProcessInitialTranslation(const InputType &source
58
+ , PartialTranslOptColl &outputPartialTranslOptColl
59
+ , size_t startPos, size_t endPos, bool adhereTableLimit
60
+ , const InputPath &inputPath
61
+ , TargetPhraseCollection::shared_ptr phraseColl) const;
62
+
63
+ // legacy
64
+ void
65
+ ProcessInitialTransLEGACY(InputType const& source,
66
+ PartialTranslOptColl &outputPartialTranslOptColl,
67
+ size_t startPos, size_t endPos,
68
+ bool adhereTableLimit,
69
+ InputPathList const& inputPathList) const;
70
+
71
+ void ProcessLEGACY(const TranslationOption &inputPartialTranslOpt
72
+ , const DecodeStep &decodeStep
73
+ , PartialTranslOptColl &outputPartialTranslOptColl
74
+ , TranslationOptionCollection *toc
75
+ , bool adhereTableLimit) const;
76
+
77
+ private:
78
+ // I'm not sure whether this actually works or not for binary phrase table.
79
+ // The source phrase only appears to contain the 1st word, therefore, this function
80
+ // only compares the 1st word
81
+ const InputPath &GetInputPathLEGACY(const TargetPhrase targetPhrase,
82
+ const Phrase sourcePhrase,
83
+ const InputPathList &inputPathList) const;
84
+
85
+ };
86
+
87
+
88
+ }
89
+ #endif
mosesdecoder/moses/Factor.cpp ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "Factor.h"
23
+
24
+ #include <boost/functional/hash.hpp>
25
+
26
+ using namespace std;
27
+
28
+ namespace Moses
29
+ {
30
+
31
+ TO_STRING_BODY(Factor)
32
+
33
+ // friend
34
+ ostream& operator<<(ostream& out, const Factor& factor)
35
+ {
36
+ out << factor.GetString();
37
+ return out;
38
+ }
39
+
40
+ size_t hash_value(const Factor& f)
41
+ {
42
+ boost::hash<size_t> hasher;
43
+ return hasher(f.GetId());
44
+ }
45
+
46
+ }
47
+
48
+
mosesdecoder/moses/ForestInput.h ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- c++ -*-
2
+ #ifndef moses_ForestInput_h
3
+ #define moses_ForestInput_h
4
+
5
+ #include <string>
6
+ #include <vector>
7
+
8
+ #include <boost/shared_ptr.hpp>
9
+
10
+ #include <util/string_piece.hh>
11
+
12
+ #include "moses/Syntax/F2S/Forest.h"
13
+
14
+ #include "Sentence.h"
15
+
16
+ namespace Moses
17
+ {
18
+ class TranslationTask;
19
+ class ForestInput : public Sentence
20
+ {
21
+ public:
22
+ friend std::ostream &operator<<(std::ostream&, const ForestInput &);
23
+
24
+ ForestInput(AllOptions::ptr const& opts) : Sentence(opts), m_rootVertex(NULL) {}
25
+
26
+ InputTypeEnum GetType() const {
27
+ return ForestInputType;
28
+ }
29
+
30
+ //! populate this InputType with data from in stream
31
+ virtual int
32
+ Read(std::istream& in);
33
+
34
+ //! Output debugging info to stream out
35
+ virtual void Print(std::ostream&) const;
36
+
37
+ //! create trans options specific to this InputType
38
+ virtual TranslationOptionCollection*
39
+ CreateTranslationOptionCollection() const;
40
+
41
+ boost::shared_ptr<const Syntax::F2S::Forest> GetForest() const {
42
+ return m_forest;
43
+ }
44
+
45
+ const Syntax::F2S::Forest::Vertex *GetRootVertex() const {
46
+ return m_rootVertex;
47
+ }
48
+
49
+ private:
50
+ typedef Syntax::F2S::Forest Forest;
51
+
52
+ struct VertexSetHash {
53
+ std::size_t operator()(const Forest::Vertex *v) const {
54
+ std::size_t seed = 0;
55
+ boost::hash_combine(seed, v->pvertex.symbol);
56
+ boost::hash_combine(seed, v->pvertex.span.GetStartPos());
57
+ boost::hash_combine(seed, v->pvertex.span.GetEndPos());
58
+ return seed;
59
+ }
60
+ };
61
+
62
+ struct VertexSetPred {
63
+ bool operator()(const Forest::Vertex *v, const Forest::Vertex *w) const {
64
+ return v->pvertex == w->pvertex;
65
+ }
66
+ };
67
+
68
+ typedef boost::unordered_set<Forest::Vertex *, VertexSetHash,
69
+ VertexSetPred> VertexSet;
70
+
71
+ Forest::Vertex *AddOrDeleteVertex(Forest::Vertex *);
72
+
73
+ std::size_t FindMaxEnd(const Forest &);
74
+
75
+ void FindTopVertices(Forest &, std::vector<Forest::Vertex *> &);
76
+
77
+ void ParseHyperedgeLine(const std::string &);
78
+
79
+ Forest::Vertex *ParseVertex(const StringPiece &);
80
+
81
+ boost::shared_ptr<Forest> m_forest;
82
+ Forest::Vertex *m_rootVertex;
83
+ VertexSet m_vertexSet;
84
+ };
85
+
86
+ } // namespace Moses
87
+
88
+ #endif
mosesdecoder/moses/GenerationDictionary.h ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_GenerationDictionary_h
23
+ #define moses_GenerationDictionary_h
24
+
25
+ #include <list>
26
+ #include <stdexcept>
27
+ #include <vector>
28
+ #include <boost/unordered_map.hpp>
29
+ #include "ScoreComponentCollection.h"
30
+ #include "Phrase.h"
31
+ #include "TypeDef.h"
32
+ #include "moses/FF/DecodeFeature.h"
33
+
34
+ namespace Moses
35
+ {
36
+
37
+ class FactorCollection;
38
+
39
+ typedef boost::unordered_map < Word , ScoreComponentCollection > OutputWordCollection;
40
+ // 1st = output phrase
41
+ // 2nd = log probability (score)
42
+
43
+ /** Implementation of a generation table in a trie.
44
+ */
45
+ class GenerationDictionary : public DecodeFeature
46
+ {
47
+ typedef boost::unordered_map<const Word* , OutputWordCollection, UnorderedComparer<Word>, UnorderedComparer<Word> > Collection;
48
+ protected:
49
+ static std::vector<GenerationDictionary*> s_staticColl;
50
+
51
+ Collection m_collection;
52
+ // 1st = source
53
+ // 2nd = target
54
+ std::string m_filePath;
55
+
56
+ public:
57
+ static const std::vector<GenerationDictionary*>& GetColl() {
58
+ return s_staticColl;
59
+ }
60
+
61
+ GenerationDictionary(const std::string &line);
62
+ virtual ~GenerationDictionary();
63
+
64
+ //! load data file
65
+ void Load(AllOptions::ptr const& opts);
66
+
67
+ /** number of unique input entries in the generation table.
68
+ * NOT the number of lines in the generation table
69
+ */
70
+ size_t GetSize() const {
71
+ return m_collection.size();
72
+ }
73
+ /** returns a bag of output words, OutputWordCollection, for a particular input word.
74
+ * Or NULL if the input word isn't found. The search function used is the WordComparer functor
75
+ */
76
+ const OutputWordCollection *FindWord(const Word &word) const;
77
+ void SetParameter(const std::string& key, const std::string& value);
78
+
79
+ };
80
+
81
+
82
+ }
83
+ #endif
mosesdecoder/moses/HypothesisStackCubePruning.cpp ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include <algorithm>
23
+ #include <set>
24
+ #include <queue>
25
+ #include "HypothesisStackCubePruning.h"
26
+ #include "TypeDef.h"
27
+ #include "Util.h"
28
+ #include "StaticData.h"
29
+ #include "Manager.h"
30
+ #include "util/exception.hh"
31
+
32
+ using namespace std;
33
+
34
+ namespace Moses
35
+ {
36
+ HypothesisStackCubePruning::HypothesisStackCubePruning(Manager& manager) :
37
+ HypothesisStack(manager)
38
+ {
39
+ m_nBestIsEnabled = manager.options()->nbest.enabled;
40
+ m_bestScore = -std::numeric_limits<float>::infinity();
41
+ m_worstScore = -std::numeric_limits<float>::infinity();
42
+ m_deterministic = manager.options()->cube.deterministic_search;
43
+ }
44
+
45
+ /** remove all hypotheses from the collection */
46
+ void HypothesisStackCubePruning::RemoveAll()
47
+ {
48
+ // delete all bitmap accessors;
49
+ _BMType::iterator iter;
50
+ for (iter = m_bitmapAccessor.begin(); iter != m_bitmapAccessor.end(); ++iter) {
51
+ delete iter->second;
52
+ }
53
+ }
54
+
55
+ pair<HypothesisStackCubePruning::iterator, bool> HypothesisStackCubePruning::Add(Hypothesis *hypo)
56
+ {
57
+ std::pair<iterator, bool> ret = m_hypos.insert(hypo);
58
+
59
+ if (ret.second) {
60
+ // equiv hypo doesn't exists
61
+ VERBOSE(3,"added hyp to stack");
62
+
63
+ // Update best score, if this hypothesis is new best
64
+ if (hypo->GetFutureScore() > m_bestScore) {
65
+ VERBOSE(3,", best on stack");
66
+ m_bestScore = hypo->GetFutureScore();
67
+ // this may also affect the worst score
68
+ if ( m_bestScore + m_beamWidth > m_worstScore )
69
+ m_worstScore = m_bestScore + m_beamWidth;
70
+ }
71
+
72
+ // Prune only if stack is twice as big as needed (lazy pruning)
73
+ VERBOSE(3,", now size " << m_hypos.size());
74
+ if (m_hypos.size() > 2*m_maxHypoStackSize-1) {
75
+ PruneToSize(m_maxHypoStackSize);
76
+ } else {
77
+ VERBOSE(3,std::endl);
78
+ }
79
+ }
80
+
81
+ return ret;
82
+ }
83
+
84
+ bool HypothesisStackCubePruning::AddPrune(Hypothesis *hypo)
85
+ {
86
+ if (hypo->GetFutureScore() == - std::numeric_limits<float>::infinity()) {
87
+ m_manager.GetSentenceStats().AddDiscarded();
88
+ VERBOSE(3,"discarded, constraint" << std::endl);
89
+ delete hypo;
90
+ return false;
91
+ }
92
+
93
+ if (hypo->GetFutureScore() < m_worstScore) {
94
+ // too bad for stack. don't bother adding hypo into collection
95
+ m_manager.GetSentenceStats().AddDiscarded();
96
+ VERBOSE(3,"discarded, too bad for stack" << std::endl);
97
+ delete hypo;
98
+ return false;
99
+ }
100
+
101
+ // over threshold, try to add to collection
102
+ std::pair<iterator, bool> addRet = Add(hypo);
103
+ if (addRet.second) {
104
+ // nothing found. add to collection
105
+ return true;
106
+ }
107
+
108
+ // equiv hypo exists, recombine with other hypo
109
+ iterator &iterExisting = addRet.first;
110
+ assert(iterExisting != m_hypos.end());
111
+ Hypothesis *hypoExisting = *iterExisting;
112
+
113
+ m_manager.GetSentenceStats().AddRecombination(*hypo, **iterExisting);
114
+
115
+ // found existing hypo with same target ending.
116
+ // keep the best 1
117
+ if (hypo->GetFutureScore() > hypoExisting->GetFutureScore()) {
118
+ // incoming hypo is better than the one we have
119
+ VERBOSE(3,"better than matching hyp " << hypoExisting->GetId() << ", recombining, ");
120
+ if (m_nBestIsEnabled) {
121
+ hypo->AddArc(hypoExisting);
122
+ Detach(iterExisting);
123
+ } else {
124
+ Remove(iterExisting);
125
+ }
126
+
127
+ bool added = Add(hypo).second;
128
+ if (!added) {
129
+ iterExisting = m_hypos.find(hypo);
130
+ UTIL_THROW(util::Exception, "Should have added hypothesis " << **iterExisting);
131
+ }
132
+ return false;
133
+ } else {
134
+ // already storing the best hypo. discard current hypo
135
+ VERBOSE(3,"worse than matching hyp " << hypoExisting->GetId() << ", recombining" << std::endl)
136
+ if (m_nBestIsEnabled) {
137
+ hypoExisting->AddArc(hypo);
138
+ } else {
139
+ delete hypo;
140
+ }
141
+ return false;
142
+ }
143
+ }
144
+
145
+ void HypothesisStackCubePruning::AddInitial(Hypothesis *hypo)
146
+ {
147
+ std::pair<iterator, bool> addRet = Add(hypo);
148
+ UTIL_THROW_IF2(!addRet.second,
149
+ "Should have added hypothesis " << *hypo);
150
+
151
+ const Bitmap &bitmap = hypo->GetWordsBitmap();
152
+ AddBitmapContainer(bitmap, *this);
153
+ }
154
+
155
+ void HypothesisStackCubePruning::PruneToSize(size_t newSize)
156
+ {
157
+ if ( newSize == 0) return; // no limit
158
+
159
+ if (m_hypos.size() > newSize) { // ok, if not over the limit
160
+ priority_queue<float> bestScores;
161
+
162
+ // push all scores to a heap
163
+ // (but never push scores below m_bestScore+m_beamWidth)
164
+ iterator iter = m_hypos.begin();
165
+ float score = 0;
166
+ while (iter != m_hypos.end()) {
167
+ Hypothesis *hypo = *iter;
168
+ score = hypo->GetFutureScore();
169
+ if (score > m_bestScore+m_beamWidth) {
170
+ bestScores.push(score);
171
+ }
172
+ ++iter;
173
+ }
174
+
175
+ // pop the top newSize scores (and ignore them, these are the scores of hyps that will remain)
176
+ // ensure to never pop beyond heap size
177
+ size_t minNewSizeHeapSize = newSize > bestScores.size() ? bestScores.size() : newSize;
178
+ for (size_t i = 1 ; i < minNewSizeHeapSize ; i++)
179
+ bestScores.pop();
180
+
181
+ // and remember the threshold
182
+ float scoreThreshold = bestScores.top();
183
+
184
+ // delete all hypos under score threshold
185
+ iter = m_hypos.begin();
186
+ while (iter != m_hypos.end()) {
187
+ Hypothesis *hypo = *iter;
188
+ float score = hypo->GetFutureScore();
189
+ if (score < scoreThreshold) {
190
+ iterator iterRemove = iter++;
191
+ Remove(iterRemove);
192
+ m_manager.GetSentenceStats().AddPruning();
193
+ } else {
194
+ ++iter;
195
+ }
196
+ }
197
+ VERBOSE(3,", pruned to size " << size() << endl);
198
+
199
+ IFVERBOSE(3) {
200
+ TRACE_ERR("stack now contains: ");
201
+ for(iter = m_hypos.begin(); iter != m_hypos.end(); iter++) {
202
+ Hypothesis *hypo = *iter;
203
+ TRACE_ERR( hypo->GetId() << " (" << hypo->GetFutureScore() << ") ");
204
+ }
205
+ TRACE_ERR( endl);
206
+ }
207
+
208
+ // set the worstScore, so that newly generated hypotheses will not be added if worse than the worst in the stack
209
+ m_worstScore = scoreThreshold;
210
+ }
211
+ }
212
+
213
+ const Hypothesis *HypothesisStackCubePruning::GetBestHypothesis() const
214
+ {
215
+ if (!m_hypos.empty()) {
216
+ const_iterator iter = m_hypos.begin();
217
+ Hypothesis *bestHypo = *iter;
218
+ while (++iter != m_hypos.end()) {
219
+ Hypothesis *hypo = *iter;
220
+ if (hypo->GetFutureScore() > bestHypo->GetFutureScore())
221
+ bestHypo = hypo;
222
+ }
223
+ return bestHypo;
224
+ }
225
+ return NULL;
226
+ }
227
+
228
+ vector<const Hypothesis*> HypothesisStackCubePruning::GetSortedList() const
229
+ {
230
+ vector<const Hypothesis*> ret;
231
+ ret.reserve(m_hypos.size());
232
+ std::copy(m_hypos.begin(), m_hypos.end(), std::inserter(ret, ret.end()));
233
+ sort(ret.begin(), ret.end(), CompareHypothesisTotalScore());
234
+
235
+ return ret;
236
+ }
237
+
238
+
239
+ void HypothesisStackCubePruning::CleanupArcList()
240
+ {
241
+ // only necessary if n-best calculations are enabled
242
+ if (!m_nBestIsEnabled) return;
243
+
244
+ iterator iter;
245
+ for (iter = m_hypos.begin() ; iter != m_hypos.end() ; ++iter) {
246
+ Hypothesis *mainHypo = *iter;
247
+ mainHypo->CleanupArcList(this->m_manager.options()->nbest.nbest_size, this->m_manager.options()->NBestDistinct());
248
+ }
249
+ }
250
+
251
+ void HypothesisStackCubePruning::SetBitmapAccessor(const Bitmap &newBitmap
252
+ , HypothesisStackCubePruning &stack
253
+ , const Range &/*range*/
254
+ , BitmapContainer &bitmapContainer
255
+ , const SquareMatrix &estimatedScores
256
+ , const TranslationOptionList &transOptList)
257
+ {
258
+ BitmapContainer *bmContainer = AddBitmapContainer(newBitmap, stack);
259
+ BackwardsEdge *edge = new BackwardsEdge(bitmapContainer
260
+ , *bmContainer
261
+ , transOptList
262
+ , estimatedScores
263
+ , m_manager.GetSource()
264
+ , m_deterministic);
265
+ bmContainer->AddBackwardsEdge(edge);
266
+ }
267
+
268
+
269
+ TO_STRING_BODY(HypothesisStackCubePruning);
270
+
271
+
272
+ // friend
273
+ std::ostream& operator<<(std::ostream& out, const HypothesisStackCubePruning& hypoColl)
274
+ {
275
+ HypothesisStackCubePruning::const_iterator iter;
276
+
277
+ for (iter = hypoColl.begin() ; iter != hypoColl.end() ; ++iter) {
278
+ const Hypothesis &hypo = **iter;
279
+ out << hypo << endl;
280
+
281
+ }
282
+ return out;
283
+ }
284
+
285
+ void
286
+ HypothesisStackCubePruning::AddHypothesesToBitmapContainers()
287
+ {
288
+ HypothesisStackCubePruning::const_iterator iter;
289
+ for (iter = m_hypos.begin() ; iter != m_hypos.end() ; ++iter) {
290
+ Hypothesis *h = *iter;
291
+ const Bitmap &bitmap = h->GetWordsBitmap();
292
+ BitmapContainer *container = m_bitmapAccessor[&bitmap];
293
+ container->AddHypothesis(h);
294
+ }
295
+ }
296
+
297
+ BitmapContainer *HypothesisStackCubePruning::AddBitmapContainer(const Bitmap &bitmap, HypothesisStackCubePruning &stack)
298
+ {
299
+ _BMType::iterator iter = m_bitmapAccessor.find(&bitmap);
300
+
301
+ BitmapContainer *bmContainer;
302
+ if (iter == m_bitmapAccessor.end()) {
303
+ bmContainer = new BitmapContainer(bitmap, stack, m_deterministic);
304
+ m_bitmapAccessor[&bitmap] = bmContainer;
305
+ } else {
306
+ bmContainer = iter->second;
307
+ }
308
+
309
+ return bmContainer;
310
+ }
311
+
312
+ }
313
+
mosesdecoder/moses/HypothesisStackCubePruning.h ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_HypothesisStackCubePruning_h
23
+ #define moses_HypothesisStackCubePruning_h
24
+
25
+ #include <limits>
26
+ #include <set>
27
+ #include <boost/unordered_map.hpp>
28
+ #include "Hypothesis.h"
29
+ #include "BitmapContainer.h"
30
+ #include "HypothesisStack.h"
31
+ #include "Util.h"
32
+
33
+ namespace Moses
34
+ {
35
+
36
+ class BitmapContainer;
37
+ class TranslationOptionList;
38
+ class Manager;
39
+
40
+ //typedef boost::unordered_map<Bitmap, BitmapContainer*, UnorderedComparer<Bitmap>, UnorderedComparer<Bitmap> > _BMType;
41
+ typedef boost::unordered_map<const Bitmap*, BitmapContainer*> _BMType;
42
+ // can compare Bitmap* 'cos all bitmaps are created from bitmaps factory class. MUST ensure this is the case
43
+
44
/** A stack for phrase-based decoding with cube-pruning.
 *  Besides the hypothesis collection inherited from HypothesisStack, it keeps
 *  one BitmapContainer per coverage bitmap (m_bitmapAccessor); those
 *  containers hold the backwards edges that cube pruning pops candidates from.
 */
class HypothesisStackCubePruning : public HypothesisStack
{
public:
  friend std::ostream& operator<<(std::ostream&, const HypothesisStackCubePruning&);

protected:
  _BMType m_bitmapAccessor; // coverage bitmap -> its BitmapContainer

  float m_bestScore; /**< score of the best hypothesis in collection */
  float m_worstScore; /**< score of the worst hypothesis in collection */
  float m_beamWidth; /**< minimum score due to threshold pruning */
  size_t m_maxHypoStackSize; /**< maximum number of hypotheses allowed in this stack */
  bool m_nBestIsEnabled; /**< flag to determine whether to keep track of old arcs */
  bool m_deterministic; /**< flag to determine whether to sort hypotheses deterministically */

  /** add hypothesis to stack. Prune if necessary.
   * Returns false if equiv hypo exists in collection, otherwise returns true
   */
  std::pair<HypothesisStackCubePruning::iterator, bool> Add(Hypothesis *hypothesis);

  /** destroy all instances of Hypothesis in this collection */
  void RemoveAll();

  /** fetch the container registered for this bitmap, creating it on first use */
  BitmapContainer *AddBitmapContainer(const Bitmap &bitmap, HypothesisStackCubePruning &stack);

public:
  HypothesisStackCubePruning(Manager& manager);
  // NOTE(review): the BitmapContainer* values in m_bitmapAccessor are not
  // deleted here -- presumably freed elsewhere; verify to rule out a leak.
  ~HypothesisStackCubePruning() {
    RemoveAll();
    m_bitmapAccessor.clear();
  }

  /** adds the hypo, but only if within thresholds (beamThr, stackSize).
   * This function will recombine hypotheses silently! There is no record
   * (could affect n-best list generation...TODO)
   * Call stack for adding hypothesis is
   AddPrune()
   Add()
   AddNoPrune()
  */
  bool AddPrune(Hypothesis *hypothesis);

  /** seed the stack with the given (initial) hypothesis */
  void AddInitial(Hypothesis *hypo);

  /** set maximum number of hypotheses in the collection
   * \param maxHypoStackSize maximum number (typical number: 100)
   */
  inline void SetMaxHypoStackSize(size_t maxHypoStackSize) {
    m_maxHypoStackSize = maxHypoStackSize;
  }

  inline size_t GetMaxHypoStackSize() const {
    return m_maxHypoStackSize;
  }

  /** set beam threshold, hypotheses in the stack must not be worse than
   * this factor times the best score to be allowed in the stack
   * \param beamThreshold minimum factor (typical number: 0.03)
   */
  inline void SetBeamWidth(float beamWidth) {
    m_beamWidth = beamWidth;
  }

  /** return score of the best hypothesis in the stack */
  inline float GetBestScore() const {
    return m_bestScore;
  }

  /** return worst score allowed for the stack */
  inline float GetWorstScore() const {
    return m_worstScore;
  }

  /** distribute the stack's hypotheses into their bitmap containers */
  void AddHypothesesToBitmapContainers();

  const _BMType& GetBitmapAccessor() const {
    return m_bitmapAccessor;
  }

  /** register a container for newBitmap and connect it to bitmapContainer
   *  with a BackwardsEdge built from transOptList and estimatedScores */
  void SetBitmapAccessor(const Bitmap &newBitmap
                         , HypothesisStackCubePruning &stack
                         , const Range &range
                         , BitmapContainer &bitmapContainer
                         , const SquareMatrix &estimatedScores
                         , const TranslationOptionList &transOptList);

  /** pruning, if too large.
   * Pruning algorithm: find a threshold and delete all hypothesis below it.
   * The threshold is chosen so that exactly newSize top items remain on the
   * stack in fact, in situations where some of the hypothesis fell below
   * m_beamWidth, the stack will contain less items.
   * \param newSize maximum size */
  void PruneToSize(size_t newSize);

  //! return the hypothesis with best score. Used to get the translated at end of decoding
  const Hypothesis *GetBestHypothesis() const;
  //! return all hypothesis, sorted by descending score. Used in creation of N best list
  std::vector<const Hypothesis*> GetSortedList() const;

  /** make all arcs in point to the equiv hypothesis that contains them.
   * Ie update doubly linked list be hypo & arcs
   */
  void CleanupArcList();

  TO_STRING();
};
151
+
152
+ }
153
+ #endif
mosesdecoder/moses/Incremental.h ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- c++ -*-
2
+ #pragma once
3
+
4
+ #include "lm/word_index.hh"
5
+ #include "search/applied.hh"
6
+ #include "search/nbest.hh"
7
+
8
+ #include "moses/ChartCellCollection.h"
9
+ #include "moses/ChartParser.h"
10
+
11
+ #include "BaseManager.h"
12
+
13
+ #include <vector>
14
+ #include <string>
15
+
16
+ namespace Moses
17
+ {
18
+ class ScoreComponentCollection;
19
+ class InputType;
20
+ class LanguageModel;
21
+
22
+ namespace Incremental
23
+ {
24
+
25
/** Manager for the incremental decoder: parses the input into a chart with
 *  ChartParser/ChartCellCollectionBase and searches it via the external
 *  'search' library (search::SingleBest / search::NBest).
 */
class Manager : public BaseManager
{
public:
  Manager(ttasksptr const& ttask);

  ~Manager();

  /** callback invoked with the language model and the sentence's word-index
   *  mapping; drives PopulateBest for the concrete model type */
  template <class Model> void LMCallback(const Model &model, const std::vector<lm::WordIndex> &words);

  void Decode();

  const std::vector<search::Applied> &GetNBest() const;

  // Call to get the same value as ProcessSentence returned.
  const std::vector<search::Applied> &Completed() const {
    return *completed_nbest_;
  }

  // output
  void OutputBest(OutputCollector *collector) const;
  void OutputNBest(OutputCollector *collector) const;
  void OutputDetailedTranslationReport(OutputCollector *collector) const;
  void OutputNBestList(OutputCollector *collector, const std::vector<search::Applied> &nbest, long translationId) const;
  // The following overrides are empty: these output formats produce nothing
  // for the incremental manager.
  void OutputLatticeSamples(OutputCollector *collector) const {
  }
  void OutputAlignment(OutputCollector *collector) const {
  }
  void OutputDetailedTreeFragmentsTranslationReport(OutputCollector *collector) const;
  void OutputWordGraph(OutputCollector *collector) const {
  }
  void OutputSearchGraph(OutputCollector *collector) const {
  }
  void OutputSearchGraphSLF() const {
  }

  void
  OutputSearchGraphAsHypergraph
  ( std::string const& fname, size_t const precision ) const
  { }


private:
  template <class Model, class Best> search::History PopulateBest(const Model &model, const std::vector<lm::WordIndex> &words, Best &out);

  ChartCellCollectionBase cells_;
  ChartParser parser_;

  // Only one of single_best_ or n_best_ will be used, but it was easier to do this than a template.
  search::SingleBest single_best_;
  // ProcessSentence returns a reference to a vector. ProcessSentence
  // doesn't have one, so this is populated and returned.
  std::vector<search::Applied> backing_for_single_;

  search::NBest n_best_;

  // Result of the last decode; returned by Completed(). Non-owning.
  const std::vector<search::Applied> *completed_nbest_;

  // outputs
  void OutputDetailedTranslationReport(
    OutputCollector *collector,
    const search::Applied *applied,
    const Sentence &sentence,
    long translationId) const;
  void OutputTranslationOptions(std::ostream &out,
                                ApplicationContext &applicationContext,
                                const search::Applied *applied,
                                const Sentence &sentence,
                                long translationId) const;
  void OutputTranslationOption(std::ostream &out,
                               ApplicationContext &applicationContext,
                               const search::Applied *applied,
                               const Sentence &sentence,
                               long translationId) const;
  void ReconstructApplicationContext(const search::Applied *applied,
                                     const Sentence &sentence,
                                     ApplicationContext &context) const;
  void OutputTreeFragmentsTranslationOptions(std::ostream &out,
      ApplicationContext &applicationContext,
      const search::Applied *applied,
      const Sentence &sentence,
      long translationId) const;
  void OutputBestHypo(OutputCollector *collector, search::Applied applied, long translationId) const;
  void OutputBestNone(OutputCollector *collector, long translationId) const;

  void OutputUnknowns(OutputCollector *collector) const {
  }
  void CalcDecoderStatistics() const {
  }

};

// Just get the phrase.
// NOTE(review): the parameter is named 'final' (a C++11 contextual keyword)
// and is passed by const value -- legal but easy to misread.
void ToPhrase(const search::Applied final, Phrase &out);
// Get the phrase and the features.
void PhraseAndFeatures(const search::Applied final, Phrase &phrase, ScoreComponentCollection &features);
120
+
121
+
122
+ } // namespace Incremental
123
+ } // namespace Moses
124
+
mosesdecoder/moses/Jamfile ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- jam -*-
# Build script for the core moses library, optional add-ons and unit tests.

# Number of word factors compiled in; a change forces a rebuild via FACTOR-LOG.
max-factors = [ option.get "max-factors" : 4 : 4 ] ;
path-constant FACTOR-LOG : bin/factor.log ;
update-if-changed $(FACTOR-LOG) $(max-factors) ;
max-factors = <define>MAX_NUM_FACTORS=$(max-factors) <dependency>$(FACTOR-LOG) ;

# Optional dlib support (--with-dlib=<path>).
with-dlib = [ option.get "with-dlib" ] ;
if $(with-dlib) {
  dlib = <define>WITH_DLIB <include>$(with-dlib) ;
} else {
  dlib = ;
}

# Optional OxLM language model support (--with-oxlm=<path>).
with-oxlm = [ option.get "with-oxlm" ] ;
if $(with-oxlm) {
  oxlm = <cxxflags>-std=c++0x <define>LM_OXLM <include>$(with-oxlm)/src <include>$(with-oxlm)/third_party/eigen ;
} else {
  oxlm = ;
}

# Optional Vowpal Wabbit classifier support (--with-vw).
local classifier = ;
if [ option.get "with-vw" ] {
  classifier += ..//vw//classifier ;
}

alias headers : ../util//kenutil $(classifier) : : : $(max-factors) $(dlib) $(oxlm) ;
alias ThreadPool : ThreadPool.cpp ;
alias Util : Util.cpp Timer.cpp ;

# Optional syntactic language model (--with-synlm).
if [ option.get "with-synlm" : no : yes ] = yes
{
  lib m ;
  obj SyntacticLanguageModel.o : SyntacticLanguageModel.cpp headers : <include>$(TOP)/synlm/hhmm/rvtl/include <include>$(TOP)/synlm/hhmm/wsjparse/include ;
  alias synlm : SyntacticLanguageModel.o m : : : <define>HAVE_SYNLM ;
} else {
  alias synlm ;
}

#This is a kludge to force rebuilding if different --with options are passed.
#Could have used features like <srilm>on but getting these to apply only to
#linking was ugly and it still didn't trigger an install (since the install
#path doesn't encode features). It stores a file lm.log with the previous
#options and forces a rebuild if the current options differ.
local current = ;
for local i in srilm irstlm randlm {
  local optval = [ option.get "with-$(i)" ] ;
  if $(optval) {
    current += "--with-$(i)=$(optval)" ;
  }
}
current = $(current:J=" ") ;
current ?= "" ;
path-constant LM-LOG : bin/lm.log ;
update-if-changed $(LM-LOG) $(current) ;

obj FF_Factory.o : FF/Factory.cpp LM//macros headers ../lm//kenlm mmlib : <dependency>$(LM-LOG) ;

# check if we have xmlrpc-c's abyss server available
# if yes, include server capabilities in the moses executable
# include $(TOP)/jam-files/server.jam ;

if [ xmlrpc ]
{
  echo "BUILDING MOSES SERVER!" ;
  alias mserver : [ glob server/*.cpp ] ;
}
else
{
  echo "NOT BUILDING MOSES SERVER!" ;
  alias mserver ;
}

# Optional suffix-array phrase table support (--with-mm).
if [ option.get "with-mm" : no : yes ] = yes
{
  alias mmlib :
    $(TOP)/moses/TranslationModel/UG//mmsapt
    $(TOP)/moses/TranslationModel/UG/generic//generic
    $(TOP)/moses/TranslationModel/UG/mm//mm
  ;
} else {
  alias mmlib ;
}

local with-vw = [ option.get "with-vw" ] ;
if $(with-vw) {
  alias vwfiles : [ glob FF/VW/*.cpp ] ;
} else {
  alias vwfiles ;
}

# The core library: everything except tests, mocks and the files built above.
lib moses :
  [ glob
    *.cpp
    parameters/*.cpp
    Syntax/*.cpp
    Syntax/F2S/*.cpp
    Syntax/S2T/*.cpp
    Syntax/S2T/Parsers/*.cpp
    Syntax/S2T/Parsers/RecursiveCYKPlusParser/*.cpp
    Syntax/S2T/Parsers/Scope3Parser/*.cpp
    Syntax/T2S/*.cpp
    TranslationModel/*.cpp
    TranslationModel/fuzzy-match/*.cpp
    TranslationModel/DynSAInclude/*.cpp
    TranslationModel/RuleTable/*.cpp
    TranslationModel/Scope3Parser/*.cpp
    TranslationModel/CYKPlusParser/*.cpp
    ../phrase-extract/PhraseOrientation.cpp
    FF/*.cpp
    FF/bilingual-lm/*.cpp
    FF/OSM-Feature/*.cpp
    FF/Dsg-Feature/*.cpp
    FF/LexicalReordering/*.cpp
    PP/*.cpp
    : #exceptions
    ThreadPool.cpp
    SyntacticLanguageModel.cpp
    *Test.cpp Mock*.cpp FF/*Test.cpp
    FF/Factory.cpp
  ]
  vwfiles synlm mmlib mserver headers
  FF_Factory.o
  LM//LM
  TranslationModel/CompactPT//CompactPT
  ThreadPool
  ..//search
  ../util/double-conversion//double-conversion
  ../probingpt//probingpt
  ..//z
  ../OnDiskPt//OnDiskPt
  $(TOP)//boost_filesystem
  $(TOP)//boost_iostreams
  :
  <threading>single:<source>../util//rt
;


alias headers-to-install : [ glob-tree *.h ] ;

import testing ;

unit-test moses_test : [ glob *Test.cpp Mock*.cpp FF/*Test.cpp ] ..//boost_filesystem moses headers ..//z ../OnDiskPt//OnDiskPt ../probingpt//probingpt ..//boost_unit_test_framework ;
143
+
mosesdecoder/moses/LVoc.cpp ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
#include<limits>
#include "LVoc.h"

// Rather pointless file because LVoc is a template; all we need here are the
// definitions of the extern consts declared in LVoc.h.

// Sentinel returned by LVoc::index() for unknown keys.
const LabelId InvalidLabelId = std::numeric_limits<LabelId>::max();
// Reserved label id just below the invalid sentinel.
const LabelId Epsilon = InvalidLabelId-1;
mosesdecoder/moses/LVoc.h ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#ifndef moses_LVoc_h
#define moses_LVoc_h

#include <cassert>  // assert() is used below; was previously only available transitively
#include <map>
#include <vector>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>

typedef unsigned LabelId;
extern const LabelId InvalidLabelId; // sentinel returned by index() for unknown keys
extern const LabelId Epsilon;        // reserved label id (InvalidLabelId-1)

typedef std::vector<LabelId> IPhrase;

/** Bidirectional mapping between symbols and dense integer ids, used in the
 * phrase-based binary phrase-table.
 * @todo vocab?
 * A = type of things to numberize, ie, std::string
 * B = map type to use, might consider using hash_map for better performance
 */
template<typename A,typename B=std::map<A,LabelId> >
class LVoc
{
  typedef A Key;
  typedef B M;
  typedef std::vector<Key> V;
  M m;      // key -> id
  V data;   // id -> key (ids are indices into this vector)
public:
  LVoc() {}

  /** true iff the key has already been assigned an id */
  bool isKnown(const Key& k) const {
    return m.find(k)!=m.end();
  }
  /** id of the key, or InvalidLabelId if it was never add()ed */
  LabelId index(const Key& k) const {
    typename M::const_iterator i=m.find(k);
    return i!=m.end()? i->second : InvalidLabelId;
  }
  /** id of the key, assigning the next free id if the key is new */
  LabelId add(const Key& k) {
    std::pair<typename M::iterator,bool> p
    =m.insert(std::make_pair(k,data.size()));
    if(p.second) data.push_back(k);
    assert(static_cast<size_t>(p.first->second)<data.size());
    return p.first->second;
  }
  /** key for the given id; the id must be in range */
  Key const& symbol(LabelId i) const {
    assert(static_cast<size_t>(i)<data.size());
    return data[i];
  }

  typedef typename V::const_iterator const_iterator;
  const_iterator begin() const {
    return data.begin();
  }
  const_iterator end() const {
    return data.end();
  }

  /** write "id key" lines to the named file, throwing on I/O failure */
  void Write(const std::string& fname) const {
    std::ofstream out(fname.c_str());
    // Little-known fact: ofstream tracks failures but does not, by default,
    // report them. You have to tell it to, or check for errors yourself.
    // (Use the shared ios_base flags; the old code spelled them via
    // std::ifstream, which was misleading though equivalent.)
    out.exceptions(std::ios_base::failbit | std::ios_base::badbit);
    Write(out);
    // Make sure the file is flushed, so that any errors are reported. If we
    // flush implicitly in the destructor, it won't be able to throw
    // exceptions.
    out.close();
  }
  /** write "id key" lines, highest id first */
  void Write(std::ostream& out) const {
    for(int i=data.size()-1; i>=0; --i)
      out<<i<<' '<<data[i]<<'\n';
  }
  void Read(const std::string& fname) {
    std::ifstream in(fname.c_str());
    Read(in);
  }
  /** read "id key" lines in any order, growing the table as needed;
   *  lines that do not parse as "<number> <key>" are silently skipped */
  void Read(std::istream& in) {
    Key k;
    size_t i;
    std::string line;
    while(getline(in,line)) {
      std::istringstream is(line);
      if(is>>i>>k) {
        if(i>=data.size()) data.resize(i+1);
        data[i]=k;
        m[k]=i;
      }
    }
  }
};

#endif
mosesdecoder/moses/Manager.cpp ADDED
@@ -0,0 +1,2016 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+ #ifdef WIN32
22
+ #include <hash_set>
23
+ #else
24
+ // #include <ext/hash_set>
25
+ #endif
26
+
27
+ #include <algorithm>
28
+ #include <cmath>
29
+ #include <limits>
30
+ #include <map>
31
+ #include <set>
32
+ #include "Manager.h"
33
+ #include "TypeDef.h"
34
+ #include "Util.h"
35
+ #include "TargetPhrase.h"
36
+ #include "TrellisPath.h"
37
+ #include "TrellisPathCollection.h"
38
+ #include "TranslationOption.h"
39
+ #include "TranslationOptionCollection.h"
40
+ #include "Timer.h"
41
+ #include "moses/OutputCollector.h"
42
+ #include "moses/FF/DistortionScoreProducer.h"
43
+ #include "moses/LM/Base.h"
44
+ #include "moses/TranslationModel/PhraseDictionary.h"
45
+ #include "moses/TranslationAnalysis.h"
46
+ #include "moses/TranslationTask.h"
47
+ #include "moses/HypergraphOutput.h"
48
+ #include "moses/mbr.h"
49
+ #include "moses/LatticeMBR.h"
50
+ #include "moses/SearchNormal.h"
51
+ #include "moses/SearchCubePruning.h"
52
+ #include <boost/foreach.hpp>
53
+
54
+ #ifdef HAVE_PROTOBUF
55
+ #include "hypergraph.pb.h"
56
+ #include "rule.pb.h"
57
+ #endif
58
+
59
+ #include "util/exception.hh"
60
+ #include "util/random.hh"
61
+ #include "util/string_stream.hh"
62
+
63
+ using namespace std;
64
+
65
+ namespace Moses
66
+ {
67
+
68
/** Set up per-sentence decoding state: build the translation-option
 *  collection for the input and instantiate the configured search algorithm.
 */
Manager::Manager(ttasksptr const& ttask)
  : BaseManager(ttask)
  , interrupted_flag(0)
  , m_hypoId(0)
{
  boost::shared_ptr<InputType> source = ttask->GetSource();
  // The input object knows which flavour of translation-option collection
  // matches its type (sentence, confusion net, ...).
  m_transOptColl = source->CreateTranslationOptionCollection(ttask);

  switch(options()->search.algo) {
  case Normal:
    m_search = new SearchNormal(*this, *m_transOptColl);
    break;
  case CubePruning:
    m_search = new SearchCubePruning(*this, *m_transOptColl);
    break;
  default:
    UTIL_THROW2("ERROR: search. Aborting\n");
  }

  // Per-input initialisation; paired with CleanUpAfterSentenceProcessing
  // in the destructor.
  StaticData::Instance().InitializeForInput(ttask);
}
89
+
90
/** Release the search objects and undo the per-input initialisation
 *  performed in the constructor. */
Manager::~Manager()
{
  delete m_transOptColl;
  delete m_search;
  // Counterpart of InitializeForInput() in the constructor.
  StaticData::Instance().CleanUpAfterSentenceProcessing(m_ttask.lock());
}
96
+
97
+ const InputType&
98
+ Manager::GetSource() const
99
+ {
100
+ return m_source ;
101
+ }
102
+
103
/**
 * Main decoder loop that translates a sentence by expanding
 * hypotheses stack by stack, until the end of the sentence.
 * Collects translation options, runs the configured search, and records
 * timing statistics at the verbosity levels indicated below.
 */
void Manager::Decode()
{

  //std::cerr << options().nbest.nbest_size << " "
  // << options().nbest.enabled << " " << std::endl;

  // initialize statistics
  ResetSentenceStats(m_source);
  IFVERBOSE(2) {
    GetSentenceStats().StartTimeTotal();
  }

  // check if alternate weight setting is used
  // this is not thread safe! it changes StaticData
  if (StaticData::Instance().GetHasAlternateWeightSettings()) {
    if (m_source.GetSpecifiesWeightSetting()) {
      StaticData::Instance().SetWeightSetting(m_source.GetWeightSetting());
    } else {
      // input did not name a weight setting: fall back to the default one
      StaticData::Instance().SetWeightSetting("default");
    }
  }

  // get translation options
  IFVERBOSE(1) {
    GetSentenceStats().StartTimeCollectOpts();
  }
  m_transOptColl->CreateTranslationOptions();

  // some reporting on how long this took
  IFVERBOSE(1) {
    GetSentenceStats().StopTimeCollectOpts();
    TRACE_ERR("Line "<< m_source.GetTranslationId()
              << ": Collecting options took "
              << GetSentenceStats().GetTimeCollectOpts() << " seconds at "
              << __FILE__ << " Line " << __LINE__ << endl);
  }

  // search for best translation with the specified algorithm
  Timer searchTime;
  searchTime.start();
  m_search->Decode();
  VERBOSE(1, "Line " << m_source.GetTranslationId()
          << ": Search took " << searchTime << " seconds" << endl);
  IFVERBOSE(2) {
    GetSentenceStats().StopTimeTotal();
    TRACE_ERR(GetSentenceStats());
  }
}
155
+
156
/**
 * Print all derivations in search graph. Note: The number of derivations
 * is exponential in the sentence length
 * \param translationId id printed as the first field of each output line
 * \param outputStream sink for the "id ||| surface ||| score" lines
 */
void Manager::PrintAllDerivations(long translationId, ostream& outputStream) const
{
  const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();

  // Complete hypotheses live on the last stack.
  vector<const Hypothesis*> sortedPureHypo = hypoStackColl.back()->GetSortedList();

  if (sortedPureHypo.size() == 0)
    return;

  // Nothing follows a complete hypothesis, so the path "tail" starts empty.
  float remainingScore = 0;
  vector<const TargetPhrase*> remainingPhrases;

  // add all pure paths
  vector<const Hypothesis*>::const_iterator iterBestHypo;
  for (iterBestHypo = sortedPureHypo.begin()
       ; iterBestHypo != sortedPureHypo.end()
       ; ++iterBestHypo) {
    printThisHypothesis(translationId, *iterBestHypo, remainingPhrases, remainingScore, outputStream);
    printDivergentHypothesis(translationId, *iterBestHypo, remainingPhrases, remainingScore, outputStream);
  }
}
182
+
183
// Expose the per-sentence translation options. The collection is owned by
// this Manager (deleted in the destructor) -- callers must not free it.
const TranslationOptionCollection* Manager::getSntTranslationOptions()
{
  return m_transOptColl;
}
187
+
188
+ void Manager::printDivergentHypothesis(long translationId, const Hypothesis* hypo, const vector <const TargetPhrase*> & remainingPhrases, float remainingScore , ostream& outputStream ) const
189
+ {
190
+ //Backtrack from the predecessor
191
+ if (hypo->GetId() > 0) {
192
+ vector <const TargetPhrase*> followingPhrases;
193
+ followingPhrases.push_back(& (hypo->GetCurrTargetPhrase()));
194
+ ///((Phrase) hypo->GetPrevHypo()->GetTargetPhrase());
195
+ followingPhrases.insert(followingPhrases.end()--, remainingPhrases.begin(), remainingPhrases.end());
196
+ printDivergentHypothesis(translationId, hypo->GetPrevHypo(), followingPhrases , remainingScore + hypo->GetScore() - hypo->GetPrevHypo()->GetScore(), outputStream);
197
+ }
198
+
199
+ //Process the arcs
200
+ const ArcList *pAL = hypo->GetArcList();
201
+ if (pAL) {
202
+ const ArcList &arcList = *pAL;
203
+ // every possible Arc to replace this edge
204
+ ArcList::const_iterator iterArc;
205
+ for (iterArc = arcList.begin() ; iterArc != arcList.end() ; ++iterArc) {
206
+ const Hypothesis *loserHypo = *iterArc;
207
+ const Hypothesis* loserPrevHypo = loserHypo->GetPrevHypo();
208
+ float arcScore = loserHypo->GetScore() - loserPrevHypo->GetScore();
209
+ vector <const TargetPhrase* > followingPhrases;
210
+ followingPhrases.push_back(&(loserHypo->GetCurrTargetPhrase()));
211
+ followingPhrases.insert(followingPhrases.end()--, remainingPhrases.begin(), remainingPhrases.end());
212
+ printThisHypothesis(translationId, loserPrevHypo, followingPhrases, remainingScore + arcScore, outputStream);
213
+ printDivergentHypothesis(translationId, loserPrevHypo, followingPhrases, remainingScore + arcScore, outputStream);
214
+ }
215
+ }
216
+ }
217
+
218
+
219
/** Print one derivation line: "id ||| surface-of-hypo tail-phrases ||| score".
 *  \param hypo hypothesis whose yield starts the line
 *  \param remainingPhrases target phrases that follow hypo on this path
 *  \param remainingScore score mass of the tail, added to hypo's score
 */
void
Manager::
printThisHypothesis(long translationId, const Hypothesis* hypo,
                    const vector <const TargetPhrase*> & remainingPhrases,
                    float remainingScore, ostream& outputStream) const
{

  outputStream << translationId << " ||| ";

  //Yield of this hypothesis
  hypo->ToStream(outputStream);
  // Then the tail phrases, surface factor (factor 0) only.
  for (size_t p = 0; p < remainingPhrases.size(); ++p) {
    const TargetPhrase * phrase = remainingPhrases[p];
    size_t size = phrase->GetSize();
    for (size_t pos = 0 ; pos < size ; pos++) {
      const Factor *factor = phrase->GetFactor(pos, 0);
      outputStream << *factor;
      outputStream << " ";
    }
  }

  // Total path score = score up to hypo + score mass of the tail.
  outputStream << "||| " << hypo->GetScore() + remainingScore;
  outputStream << endl;
}
243
+
244
+
245
+
246
+
247
/**
 * After decoding, the hypotheses in the stacks and additional arcs
 * form a search graph that can be mined for n-best lists.
 * The heavy lifting is done in the TrellisPath and TrellisPathCollection
 * this function controls this for one sentence.
 *
 * \param count the number of n-best translations to produce
 * \param ret holds the n-best list that was calculated
 * \param onlyDistinct if true, keep only the first path per distinct surface string
 */
void Manager::CalcNBest(size_t count, TrellisPathList &ret, bool onlyDistinct) const
{
  // count is unsigned, so this is effectively a check for count == 0
  if (count <= 0)
    return;

  const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();

  // Complete hypotheses live on the last stack.
  vector<const Hypothesis*> sortedPureHypo = hypoStackColl.back()->GetSortedList();

  if (sortedPureHypo.size() == 0)
    return;

  TrellisPathCollection contenders;

  set<Phrase> distinctHyps;

  // add all pure paths
  vector<const Hypothesis*>::const_iterator iterBestHypo;
  for (iterBestHypo = sortedPureHypo.begin()
       ; iterBestHypo != sortedPureHypo.end()
       ; ++iterBestHypo) {
    contenders.Add(new TrellisPath(*iterBestHypo));
  }

  // factor defines stopping point for distinct n-best list if too
  // many candidates identical
  size_t nBestFactor = options()->nbest.factor;
  if (nBestFactor < 1) nBestFactor = 1000; // 0 = unlimited

  // MAIN loop
  for (size_t iteration = 0 ; (onlyDistinct ? distinctHyps.size() : ret.GetSize()) < count && contenders.GetSize() > 0 && (iteration < count * nBestFactor) ; iteration++) {
    // get next best from list of contenders
    TrellisPath *path = contenders.pop();
    UTIL_THROW_IF2(path == NULL, "path is NULL");
    // create deviations from current best
    path->CreateDeviantPaths(contenders);
    if(onlyDistinct) {
      // keep the path only if its surface string is new; ret takes ownership
      Phrase tgtPhrase = path->GetSurfacePhrase();
      if (distinctHyps.insert(tgtPhrase).second) {
        ret.Add(path);
      } else {
        delete path;
        path = NULL;
      }
    } else {
      ret.Add(path);
    }


    if(onlyDistinct) {
      // NOTE(review): this inner nBestFactor shadows the outer one and keeps
      // the raw option value (no 0 -> 1000 substitution), so factor 0 means
      // "never prune" here but bounds the loop above at count*1000 -- confirm
      // this asymmetry is intended.
      const size_t nBestFactor = options()->nbest.factor;
      if (nBestFactor > 0)
        contenders.Prune(count * nBestFactor);
    } else {
      contenders.Prune(count);
    }
  }
}
314
+
315
+ struct SGNReverseCompare {
316
+ bool operator() (const SearchGraphNode& s1, const SearchGraphNode& s2) const {
317
+ return s1.hypo->GetId() > s2.hypo->GetId();
318
+ }
319
+ };
320
+
321
+ /**
322
+ * Implements lattice sampling, as in Chatterjee & Cancedda, emnlp 2010
323
+ **/
324
+ void Manager::CalcLatticeSamples(size_t count, TrellisPathList &ret) const
325
+ {
326
+
327
+ vector<SearchGraphNode> searchGraph;
328
+ GetSearchGraph(searchGraph);
329
+
330
+ //Calculation of the sigmas of each hypothesis and edge. In C&C notation this is
331
+ //the "log of the cumulative unnormalized probability of all the paths in the
332
+ // lattice for the hypothesis to a final node"
333
+ typedef pair<int, int> Edge;
334
+ map<const Hypothesis*, float> sigmas;
335
+ map<Edge, float> edgeScores;
336
+ map<const Hypothesis*, set<const Hypothesis*> > outgoingHyps;
337
+ map<int,const Hypothesis*> idToHyp;
338
+ map<int,float> fscores;
339
+
340
+ //Iterating through the hypos in reverse order of id gives a reverse
341
+ //topological order. We rely on the fact that hypo ids are given out
342
+ //sequentially, as the search proceeds.
343
+ //NB: Could just sort by stack.
344
+ sort(searchGraph.begin(), searchGraph.end(), SGNReverseCompare());
345
+
346
+ //first task is to fill in the outgoing hypos and edge scores.
347
+ for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
348
+ i != searchGraph.end(); ++i) {
349
+ const Hypothesis* hypo = i->hypo;
350
+ idToHyp[hypo->GetId()] = hypo;
351
+ fscores[hypo->GetId()] = i->fscore;
352
+ if (hypo->GetId()) {
353
+ //back to current
354
+ const Hypothesis* prevHypo = i->hypo->GetPrevHypo();
355
+ outgoingHyps[prevHypo].insert(hypo);
356
+ edgeScores[Edge(prevHypo->GetId(),hypo->GetId())] =
357
+ hypo->GetScore() - prevHypo->GetScore();
358
+ }
359
+ //forward from current
360
+ if (i->forward >= 0) {
361
+ map<int,const Hypothesis*>::const_iterator idToHypIter = idToHyp.find(i->forward);
362
+ UTIL_THROW_IF2(idToHypIter == idToHyp.end(),
363
+ "Couldn't find hypothesis " << i->forward);
364
+ const Hypothesis* nextHypo = idToHypIter->second;
365
+ outgoingHyps[hypo].insert(nextHypo);
366
+ map<int,float>::const_iterator fscoreIter = fscores.find(nextHypo->GetId());
367
+ UTIL_THROW_IF2(fscoreIter == fscores.end(),
368
+ "Couldn't find scores for hypothsis " << nextHypo->GetId());
369
+ edgeScores[Edge(hypo->GetId(),nextHypo->GetId())] =
370
+ i->fscore - fscoreIter->second;
371
+ }
372
+ }
373
+
374
+
375
+ //then run through again to calculate sigmas
376
+ for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin();
377
+ i != searchGraph.end(); ++i) {
378
+
379
+ if (i->forward == -1) {
380
+ sigmas[i->hypo] = 0;
381
+ } else {
382
+ map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
383
+ outgoingHyps.find(i->hypo);
384
+
385
+ UTIL_THROW_IF2(outIter == outgoingHyps.end(),
386
+ "Couldn't find hypothesis " << i->hypo->GetId());
387
+ float sigma = 0;
388
+ for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
389
+ j != outIter->second.end(); ++j) {
390
+ map<const Hypothesis*, float>::const_iterator succIter = sigmas.find(*j);
391
+ UTIL_THROW_IF2(succIter == sigmas.end(),
392
+ "Couldn't find hypothesis " << (*j)->GetId());
393
+ map<Edge,float>::const_iterator edgeScoreIter =
394
+ edgeScores.find(Edge(i->hypo->GetId(),(*j)->GetId()));
395
+ UTIL_THROW_IF2(edgeScoreIter == edgeScores.end(),
396
+ "Couldn't find edge for hypothesis " << (*j)->GetId());
397
+ float term = edgeScoreIter->second + succIter->second; // Add sigma(*j)
398
+ if (sigma == 0) {
399
+ sigma = term;
400
+ } else {
401
+ sigma = log_sum(sigma,term);
402
+ }
403
+ }
404
+ sigmas[i->hypo] = sigma;
405
+ }
406
+ }
407
+
408
+ //The actual sampling!
409
+ const Hypothesis* startHypo = searchGraph.back().hypo;
410
+ UTIL_THROW_IF2(startHypo->GetId() != 0, "Expecting the start hypothesis ");
411
+ for (size_t i = 0; i < count; ++i) {
412
+ vector<const Hypothesis*> path;
413
+ path.push_back(startHypo);
414
+ while(1) {
415
+ map<const Hypothesis*, set<const Hypothesis*> >::const_iterator outIter =
416
+ outgoingHyps.find(path.back());
417
+ if (outIter == outgoingHyps.end() || !outIter->second.size()) {
418
+ //end of the path
419
+ break;
420
+ }
421
+ //score the possibles
422
+ vector<const Hypothesis*> candidates;
423
+ vector<float> candidateScores;
424
+ float scoreTotal = 0;
425
+ for (set<const Hypothesis*>::const_iterator j = outIter->second.begin();
426
+ j != outIter->second.end(); ++j) {
427
+ candidates.push_back(*j);
428
+ UTIL_THROW_IF2(sigmas.find(*j) == sigmas.end(),
429
+ "Hypothesis " << (*j)->GetId() << " not found");
430
+ Edge edge(path.back()->GetId(),(*j)->GetId());
431
+ UTIL_THROW_IF2(edgeScores.find(edge) == edgeScores.end(),
432
+ "Edge not found");
433
+ candidateScores.push_back(sigmas[*j] + edgeScores[edge]);
434
+ if (scoreTotal == 0) {
435
+ scoreTotal = candidateScores.back();
436
+ } else {
437
+ scoreTotal = log_sum(candidateScores.back(), scoreTotal);
438
+ }
439
+ }
440
+
441
+ //normalise
442
+ transform(candidateScores.begin(), candidateScores.end(), candidateScores.begin(), bind2nd(minus<float>(),scoreTotal));
443
+ //copy(candidateScores.begin(),candidateScores.end(),ostream_iterator<float>(cerr," "));
444
+ //cerr << endl;
445
+
446
+ //draw the sample
447
+ const float frandom = log(util::rand_incl(0.0f, 1.0f));
448
+ size_t position = 1;
449
+ float sum = candidateScores[0];
450
+ for (; position < candidateScores.size() && sum < frandom; ++position) {
451
+ sum = log_sum(sum,candidateScores[position]);
452
+ }
453
+ //cerr << "Random: " << frandom << " Chose " << position-1 << endl;
454
+ const Hypothesis* chosen = candidates[position-1];
455
+ path.push_back(chosen);
456
+ }
457
+ //cerr << "Path: " << endl;
458
+ //for (size_t j = 0; j < path.size(); ++j) {
459
+ // cerr << path[j]->GetId() << " " << path[j]->GetScoreBreakdown() << endl;
460
+ //}
461
+ //cerr << endl;
462
+
463
+ //Convert the hypos to TrellisPath
464
+ ret.Add(new TrellisPath(path));
465
+ //cerr << ret.at(ret.GetSize()-1).GetScoreBreakdown() << endl;
466
+ }
467
+
468
+ }
469
+
470
+
471
+
472
+ void Manager::CalcDecoderStatistics() const
473
+ {
474
+ const Hypothesis *hypo = GetBestHypothesis();
475
+ if (hypo != NULL) {
476
+ GetSentenceStats().CalcFinalStats(*hypo);
477
+ IFVERBOSE(2) {
478
+ if (hypo != NULL) {
479
+ string buff;
480
+ string buff2;
481
+ TRACE_ERR( "Source and Target Units:"
482
+ << hypo->GetInput());
483
+ buff2.insert(0,"] ");
484
+ buff2.insert(0,(hypo->GetCurrTargetPhrase()).ToString());
485
+ buff2.insert(0,":");
486
+ buff2.insert(0,(hypo->GetCurrSourceWordsRange()).ToString());
487
+ buff2.insert(0,"[");
488
+
489
+ hypo = hypo->GetPrevHypo();
490
+ while (hypo != NULL) {
491
+ //dont print out the empty final hypo
492
+ buff.insert(0,buff2);
493
+ buff2.clear();
494
+ buff2.insert(0,"] ");
495
+ buff2.insert(0,(hypo->GetCurrTargetPhrase()).ToString());
496
+ buff2.insert(0,":");
497
+ buff2.insert(0,(hypo->GetCurrSourceWordsRange()).ToString());
498
+ buff2.insert(0,"[");
499
+ hypo = hypo->GetPrevHypo();
500
+ }
501
+ TRACE_ERR( buff << endl);
502
+ }
503
+ }
504
+ }
505
+ }
506
+
507
+ void Manager::OutputWordGraph(std::ostream &outputWordGraphStream, const Hypothesis *hypo, size_t &linkId) const
508
+ {
509
+
510
+ const Hypothesis *prevHypo = hypo->GetPrevHypo();
511
+
512
+
513
+ outputWordGraphStream << "J=" << linkId++
514
+ << "\tS=" << prevHypo->GetId()
515
+ << "\tE=" << hypo->GetId()
516
+ << "\ta=";
517
+
518
+ // phrase table scores
519
+ const std::vector<PhraseDictionary*> &phraseTables = PhraseDictionary::GetColl();
520
+ std::vector<PhraseDictionary*>::const_iterator iterPhraseTable;
521
+ for (iterPhraseTable = phraseTables.begin() ; iterPhraseTable != phraseTables.end() ; ++iterPhraseTable) {
522
+ const PhraseDictionary *phraseTable = *iterPhraseTable;
523
+ vector<float> scores = hypo->GetScoreBreakdown().GetScoresForProducer(phraseTable);
524
+
525
+ outputWordGraphStream << scores[0];
526
+ vector<float>::const_iterator iterScore;
527
+ for (iterScore = ++scores.begin() ; iterScore != scores.end() ; ++iterScore) {
528
+ outputWordGraphStream << ", " << *iterScore;
529
+ }
530
+ }
531
+
532
+ // language model scores
533
+ outputWordGraphStream << "\tl=";
534
+
535
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
536
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
537
+ const StatefulFeatureFunction *ff = statefulFFs[i];
538
+ const LanguageModel *lm = static_cast<const LanguageModel*>(ff);
539
+
540
+ vector<float> scores = hypo->GetScoreBreakdown().GetScoresForProducer(lm);
541
+
542
+ outputWordGraphStream << scores[0];
543
+ vector<float>::const_iterator iterScore;
544
+ for (iterScore = ++scores.begin() ; iterScore != scores.end() ; ++iterScore) {
545
+ outputWordGraphStream << ", " << *iterScore;
546
+ }
547
+ }
548
+
549
+ // re-ordering
550
+ outputWordGraphStream << "\tr=";
551
+
552
+ const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
553
+ std::vector<FeatureFunction*>::const_iterator iter;
554
+ for (iter = ffs.begin(); iter != ffs.end(); ++iter) {
555
+ const FeatureFunction *ff = *iter;
556
+
557
+ const DistortionScoreProducer *model = dynamic_cast<const DistortionScoreProducer*>(ff);
558
+ if (model) {
559
+ outputWordGraphStream << hypo->GetScoreBreakdown().GetScoreForProducer(model);
560
+ }
561
+ }
562
+
563
+ // output both source and target phrases in the word graph
564
+ outputWordGraphStream << "\tw=" << hypo->GetSourcePhraseStringRep()
565
+ << "|" << hypo->GetCurrTargetPhrase();
566
+
567
+ outputWordGraphStream << endl;
568
+ }
569
+
570
+ // VN put back of OutputPassthroughInformation
571
+ void Manager::OutputPassthroughInformation(std::ostream &out, const Hypothesis *hypo) const
572
+ {
573
+ const std::string passthrough = hypo->GetManager().GetSource().GetPassthroughInformation();
574
+ out << passthrough;
575
+ }
576
+ // end of put back
577
+
578
+ void Manager::GetOutputLanguageModelOrder( std::ostream &out, const Hypothesis *hypo ) const
579
+ {
580
+ Phrase translation;
581
+ hypo->GetOutputPhrase(translation);
582
+ const std::vector<const StatefulFeatureFunction*> &statefulFFs = StatefulFeatureFunction::GetStatefulFeatureFunctions();
583
+ for (size_t i = 0; i < statefulFFs.size(); ++i) {
584
+ const StatefulFeatureFunction *ff = statefulFFs[i];
585
+ if (const LanguageModel *lm = dynamic_cast<const LanguageModel*>(ff)) {
586
+ lm->ReportHistoryOrder(out, translation);
587
+ }
588
+ }
589
+ }
590
+
591
+ void Manager::GetWordGraph(long translationId, std::ostream &outputWordGraphStream) const
592
+ {
593
+ const StaticData &staticData = StaticData::Instance();
594
+ const PARAM_VEC *params;
595
+
596
+ string fileName;
597
+ bool outputNBest = false;
598
+ params = staticData.GetParameter().GetParam("output-word-graph");
599
+ if (params && params->size()) {
600
+ fileName = params->at(0);
601
+
602
+ if (params->size() == 2) {
603
+ outputNBest = Scan<bool>(params->at(1));
604
+ }
605
+ }
606
+
607
+ const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
608
+
609
+ outputWordGraphStream << "VERSION=1.0" << endl
610
+ << "UTTERANCE=" << translationId << endl;
611
+
612
+ size_t linkId = 0;
613
+ std::vector < HypothesisStack* >::const_iterator iterStack;
614
+ for (iterStack = ++hypoStackColl.begin() ; iterStack != hypoStackColl.end() ; ++iterStack) {
615
+ const HypothesisStack &stack = **iterStack;
616
+ HypothesisStack::const_iterator iterHypo;
617
+ for (iterHypo = stack.begin() ; iterHypo != stack.end() ; ++iterHypo) {
618
+ const Hypothesis *hypo = *iterHypo;
619
+ OutputWordGraph(outputWordGraphStream, hypo, linkId);
620
+
621
+ if (outputNBest) {
622
+ const ArcList *arcList = hypo->GetArcList();
623
+ if (arcList != NULL) {
624
+ ArcList::const_iterator iterArcList;
625
+ for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
626
+ const Hypothesis *loserHypo = *iterArcList;
627
+ OutputWordGraph(outputWordGraphStream, loserHypo, linkId);
628
+ }
629
+ }
630
+ } //if (outputNBest)
631
+ } //for (iterHypo
632
+ } // for (iterStack
633
+ }
634
+
635
+ void Manager::GetSearchGraph(vector<SearchGraphNode>& searchGraph) const
636
+ {
637
+ std::map < int, bool > connected;
638
+ std::map < int, int > forward;
639
+ std::map < int, double > forwardScore;
640
+
641
+ // *** find connected hypotheses ***
642
+ std::vector< const Hypothesis *> connectedList;
643
+ GetConnectedGraph(&connected, &connectedList);
644
+
645
+ // ** compute best forward path for each hypothesis *** //
646
+
647
+ // forward cost of hypotheses on final stack is 0
648
+ const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
649
+ const HypothesisStack &finalStack = *hypoStackColl.back();
650
+ HypothesisStack::const_iterator iterHypo;
651
+ for (iterHypo = finalStack.begin() ; iterHypo != finalStack.end() ; ++iterHypo) {
652
+ const Hypothesis *hypo = *iterHypo;
653
+ forwardScore[ hypo->GetId() ] = 0.0f;
654
+ forward[ hypo->GetId() ] = -1;
655
+ }
656
+
657
+ // compete for best forward score of previous hypothesis
658
+ std::vector < HypothesisStack* >::const_iterator iterStack;
659
+ for (iterStack = --hypoStackColl.end() ; iterStack != hypoStackColl.begin() ; --iterStack) {
660
+ const HypothesisStack &stack = **iterStack;
661
+ HypothesisStack::const_iterator iterHypo;
662
+ for (iterHypo = stack.begin() ; iterHypo != stack.end() ; ++iterHypo) {
663
+ const Hypothesis *hypo = *iterHypo;
664
+ if (connected.find( hypo->GetId() ) != connected.end()) {
665
+ // make a play for previous hypothesis
666
+ const Hypothesis *prevHypo = hypo->GetPrevHypo();
667
+ double fscore = forwardScore[ hypo->GetId() ] +
668
+ hypo->GetScore() - prevHypo->GetScore();
669
+ if (forwardScore.find( prevHypo->GetId() ) == forwardScore.end()
670
+ || forwardScore.find( prevHypo->GetId() )->second < fscore) {
671
+ forwardScore[ prevHypo->GetId() ] = fscore;
672
+ forward[ prevHypo->GetId() ] = hypo->GetId();
673
+ }
674
+ // all arcs also make a play
675
+ const ArcList *arcList = hypo->GetArcList();
676
+ if (arcList != NULL) {
677
+ ArcList::const_iterator iterArcList;
678
+ for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
679
+ const Hypothesis *loserHypo = *iterArcList;
680
+ // make a play
681
+ const Hypothesis *loserPrevHypo = loserHypo->GetPrevHypo();
682
+ double fscore = forwardScore[ hypo->GetId() ] +
683
+ loserHypo->GetScore() - loserPrevHypo->GetScore();
684
+ if (forwardScore.find( loserPrevHypo->GetId() ) == forwardScore.end()
685
+ || forwardScore.find( loserPrevHypo->GetId() )->second < fscore) {
686
+ forwardScore[ loserPrevHypo->GetId() ] = fscore;
687
+ forward[ loserPrevHypo->GetId() ] = loserHypo->GetId();
688
+ }
689
+ } // end for arc list
690
+ } // end if arc list empty
691
+ } // end if hypo connected
692
+ } // end for hypo
693
+ } // end for stack
694
+
695
+ // *** output all connected hypotheses *** //
696
+
697
+ connected[ 0 ] = true;
698
+ for (iterStack = hypoStackColl.begin() ; iterStack != hypoStackColl.end() ; ++iterStack) {
699
+ const HypothesisStack &stack = **iterStack;
700
+ HypothesisStack::const_iterator iterHypo;
701
+ for (iterHypo = stack.begin() ; iterHypo != stack.end() ; ++iterHypo) {
702
+ const Hypothesis *hypo = *iterHypo;
703
+ if (connected.find( hypo->GetId() ) != connected.end()) {
704
+ searchGraph.push_back(SearchGraphNode(hypo,NULL,forward[hypo->GetId()],
705
+ forwardScore[hypo->GetId()]));
706
+
707
+ const ArcList *arcList = hypo->GetArcList();
708
+ if (arcList != NULL) {
709
+ ArcList::const_iterator iterArcList;
710
+ for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
711
+ const Hypothesis *loserHypo = *iterArcList;
712
+ searchGraph.push_back(SearchGraphNode(loserHypo,hypo,
713
+ forward[hypo->GetId()], forwardScore[hypo->GetId()]));
714
+ }
715
+ } // end if arcList empty
716
+ } // end if connected
717
+ } // end for iterHypo
718
+ } // end for iterStack
719
+
720
+ }
721
+
722
+ void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const
723
+ {
724
+ outputSearchGraphStream.setf(std::ios::fixed);
725
+ outputSearchGraphStream.precision(6);
726
+
727
+ const vector<const StatelessFeatureFunction*>& slf = StatelessFeatureFunction::GetStatelessFeatureFunctions();
728
+ const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
729
+ size_t featureIndex = 1;
730
+ for (size_t i = 0; i < sff.size(); ++i) {
731
+ featureIndex = OutputFeatureWeightsForSLF(featureIndex, sff[i], outputSearchGraphStream);
732
+ }
733
+ for (size_t i = 0; i < slf.size(); ++i) {
734
+ /*
735
+ if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
736
+ slf[i]->GetScoreProducerWeightShortName() != "tm" &&
737
+ slf[i]->GetScoreProducerWeightShortName() != "I" &&
738
+ slf[i]->GetScoreProducerWeightShortName() != "g")
739
+ */
740
+ {
741
+ featureIndex = OutputFeatureWeightsForSLF(featureIndex, slf[i], outputSearchGraphStream);
742
+ }
743
+ }
744
+ const vector<PhraseDictionary*>& pds = PhraseDictionary::GetColl();
745
+ for( size_t i=0; i<pds.size(); i++ ) {
746
+ featureIndex = OutputFeatureWeightsForSLF(featureIndex, pds[i], outputSearchGraphStream);
747
+ }
748
+ const vector<GenerationDictionary*>& gds = GenerationDictionary::GetColl();
749
+ for( size_t i=0; i<gds.size(); i++ ) {
750
+ featureIndex = OutputFeatureWeightsForSLF(featureIndex, gds[i], outputSearchGraphStream);
751
+ }
752
+ }
753
+
754
+ void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const
755
+ {
756
+ outputSearchGraphStream.setf(std::ios::fixed);
757
+ outputSearchGraphStream.precision(6);
758
+
759
+ // outputSearchGraphStream << endl;
760
+ // outputSearchGraphStream << (*hypo) << endl;
761
+ // const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
762
+ // outputSearchGraphStream << scoreCollection << endl;
763
+
764
+ const vector<const StatelessFeatureFunction*>& slf =StatelessFeatureFunction::GetStatelessFeatureFunctions();
765
+ const vector<const StatefulFeatureFunction*>& sff = StatefulFeatureFunction::GetStatefulFeatureFunctions();
766
+ size_t featureIndex = 1;
767
+ for (size_t i = 0; i < sff.size(); ++i) {
768
+ featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, sff[i], outputSearchGraphStream);
769
+ }
770
+ for (size_t i = 0; i < slf.size(); ++i) {
771
+ /*
772
+ if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
773
+ slf[i]->GetScoreProducerWeightShortName() != "tm" &&
774
+ slf[i]->GetScoreProducerWeightShortName() != "I" &&
775
+ slf[i]->GetScoreProducerWeightShortName() != "g")
776
+ */
777
+ {
778
+ featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, slf[i], outputSearchGraphStream);
779
+ }
780
+ }
781
+ const vector<PhraseDictionary*>& pds = PhraseDictionary::GetColl();
782
+ for( size_t i=0; i<pds.size(); i++ ) {
783
+ featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, pds[i], outputSearchGraphStream);
784
+ }
785
+ const vector<GenerationDictionary*>& gds = GenerationDictionary::GetColl();
786
+ for( size_t i=0; i<gds.size(); i++ ) {
787
+ featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, gds[i], outputSearchGraphStream);
788
+ }
789
+
790
+ }
791
+
792
+ void Manager::OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const
793
+ {
794
+ outputSearchGraphStream.setf(std::ios::fixed);
795
+ outputSearchGraphStream.precision(6);
796
+ ScoreComponentCollection scores = hypo->GetScoreBreakdown();
797
+ const Hypothesis *prevHypo = hypo->GetPrevHypo();
798
+ if (prevHypo) {
799
+ scores.MinusEquals(prevHypo->GetScoreBreakdown());
800
+ }
801
+ scores.Save(outputSearchGraphStream, false);
802
+ }
803
+
804
+
805
+ size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
806
+ {
807
+ size_t numScoreComps = ff->GetNumScoreComponents();
808
+ if (numScoreComps != 0) {
809
+ vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
810
+ for (size_t i = 0; i < numScoreComps; ++i) {
811
+ outputSearchGraphStream << "# " << ff->GetScoreProducerDescription()
812
+ << " " << ff->GetScoreProducerDescription()
813
+ << " " << (i+1) << " of " << numScoreComps << endl
814
+ << "x" << (index+i) << "scale=" << values[i] << endl;
815
+ }
816
+ return index+numScoreComps;
817
+ } else {
818
+ cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
819
+ assert(false);
820
+ return 0;
821
+ }
822
+ }
823
+
824
+ size_t
825
+ Manager::
826
+ OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo,
827
+ const FeatureFunction* ff, std::ostream &out) const
828
+ {
829
+ const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown();
830
+ vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
831
+ size_t numScoreComps = featureValues.size();
832
+ for (size_t i = 0; i < numScoreComps; ++i) {
833
+ out << "x" << (index+i) << "=" << ((zeros) ? 0.0 : featureValues[i]) << " ";
834
+ }
835
+ return index + numScoreComps;
836
+ }
837
+
838
+ /**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */
839
+ void
840
+ Manager::
841
+ OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const
842
+ {
843
+
844
+ VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << m_source.GetTranslationId() << std::endl)
845
+
846
+ vector<SearchGraphNode> searchGraph;
847
+ GetSearchGraph(searchGraph);
848
+
849
+
850
+ map<int,int> mosesIDToHypergraphID;
851
+ // map<int,int> hypergraphIDToMosesID;
852
+ set<int> terminalNodes;
853
+ multimap<int,int> hypergraphIDToArcs;
854
+
855
+ VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << m_source.GetTranslationId() << std::endl)
856
+
857
+ long numNodes = 0;
858
+ long endNode = 0;
859
+ {
860
+ long hypergraphHypothesisID = 0;
861
+ for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
862
+
863
+ // Get an id number for the previous hypothesis
864
+ const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo();
865
+ if (prevHypo!=NULL) {
866
+ int mosesPrevHypothesisID = prevHypo->GetId();
867
+ if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) {
868
+ mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID;
869
+ // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID;
870
+ hypergraphHypothesisID += 1;
871
+ }
872
+ }
873
+
874
+ // Get an id number for this hypothesis
875
+ int mosesHypothesisID;
876
+ if (searchGraph[arcNumber].recombinationHypo) {
877
+ mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
878
+ } else {
879
+ mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
880
+ }
881
+
882
+ if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
883
+
884
+ mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
885
+ // hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID;
886
+
887
+ bool terminalNode = (searchGraph[arcNumber].forward == -1);
888
+ if (terminalNode) {
889
+ // Final arc to end node, representing the end of the sentence </s>
890
+ terminalNodes.insert(hypergraphHypothesisID);
891
+ }
892
+
893
+ hypergraphHypothesisID += 1;
894
+ }
895
+
896
+ // Record that this arc ends at this node
897
+ hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber));
898
+
899
+ }
900
+
901
+ // Unique end node
902
+ endNode = hypergraphHypothesisID;
903
+ // mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID;
904
+ numNodes = endNode + 1;
905
+
906
+ }
907
+
908
+
909
+ long numArcs = searchGraph.size() + terminalNodes.size();
910
+
911
+ //Header
912
+ outputSearchGraphStream << "# target ||| features ||| source-covered" << endl;
913
+
914
+ // Print number of nodes and arcs
915
+ outputSearchGraphStream << numNodes << " " << numArcs << endl;
916
+
917
+ VERBOSE(2,"Search graph to output as hypergraph for sentence " << m_source.GetTranslationId()
918
+ << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl)
919
+
920
+ VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << m_source.GetTranslationId() << std::endl)
921
+
922
+
923
+ for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) {
924
+ if (hypergraphHypothesisID % 100000 == 0) {
925
+ VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << m_source.GetTranslationId() << std::endl);
926
+ }
927
+ // int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID];
928
+ size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID);
929
+ // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has " << count << " incoming arcs" << std::endl)
930
+ if (count > 0) {
931
+ outputSearchGraphStream << "# node " << hypergraphHypothesisID << endl;
932
+ outputSearchGraphStream << count << "\n";
933
+
934
+ pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range =
935
+ hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
936
+ for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
937
+ int lineNumber = (*it).second;
938
+ const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
939
+ int mosesHypothesisID;// = thisHypo->GetId();
940
+ if (searchGraph[lineNumber].recombinationHypo) {
941
+ mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
942
+ } else {
943
+ mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
944
+ }
945
+ // int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
946
+ UTIL_THROW_IF2(
947
+ (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
948
+ "Error while writing search lattice as hypergraph for sentence " << m_source.GetTranslationId() << ". " <<
949
+ "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
950
+ ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] <<
951
+ ". There are " << numNodes << " nodes in the search lattice."
952
+ );
953
+
954
+ const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
955
+ if (prevHypo==NULL) {
956
+ // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl)
957
+ outputSearchGraphStream << "<s> ||| ||| 0\n";
958
+ } else {
959
+ int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
960
+ // VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl)
961
+ UTIL_THROW_IF2(
962
+ (startNode >= hypergraphHypothesisID),
963
+ "Error while writing search lattice as hypergraph for sentence" << m_source.GetTranslationId() << ". " <<
964
+ "The nodes must be output in topological order. The code attempted to violate this restriction."
965
+ );
966
+
967
+ const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
968
+ int targetWordCount = targetPhrase.GetSize();
969
+
970
+ outputSearchGraphStream << "[" << startNode << "] ";
971
+ for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
972
+ outputSearchGraphStream << targetPhrase.GetWord(targetWordIndex)[0]->GetString() << " ";
973
+ }
974
+ outputSearchGraphStream << " ||| ";
975
+ OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
976
+ outputSearchGraphStream << " ||| " << thisHypo->GetWordsBitmap().GetNumWordsCovered();
977
+ outputSearchGraphStream << "\n";
978
+ }
979
+ }
980
+ }
981
+ }
982
+
983
+ // Print node and arc(s) for end of sentence </s>
984
+ outputSearchGraphStream << "# node " << endNode << endl;
985
+ outputSearchGraphStream << terminalNodes.size() << "\n";
986
+ for (set<int>::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) {
987
+ outputSearchGraphStream << "[" << (*it) << "] </s> ||| ||| " << GetSource().GetSize() << "\n";
988
+ }
989
+
990
+ }
991
+
992
+
993
+ /**! Output search graph in HTK standard lattice format (SLF) */
994
+ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const
995
+ {
996
+
997
+ vector<SearchGraphNode> searchGraph;
998
+ GetSearchGraph(searchGraph);
999
+
1000
+ long numArcs = 0;
1001
+ long numNodes = 0;
1002
+
1003
+ map<int,int> nodes;
1004
+ set<int> terminalNodes;
1005
+
1006
+ // Unique start node
1007
+ nodes[0] = 0;
1008
+
1009
+ for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) {
1010
+
1011
+ int targetWordCount = searchGraph[arcNumber].hypo->GetCurrTargetPhrase().GetSize();
1012
+ numArcs += targetWordCount;
1013
+
1014
+ int hypothesisID = searchGraph[arcNumber].hypo->GetId();
1015
+ if (nodes.count(hypothesisID) == 0) {
1016
+
1017
+ numNodes += targetWordCount;
1018
+ nodes[hypothesisID] = numNodes;
1019
+ //numNodes += 1;
1020
+
1021
+ bool terminalNode = (searchGraph[arcNumber].forward == -1);
1022
+ if (terminalNode) {
1023
+ numArcs += 1;
1024
+ }
1025
+ }
1026
+
1027
+ }
1028
+ numNodes += 1;
1029
+
1030
+ // Unique end node
1031
+ nodes[numNodes] = numNodes;
1032
+
1033
+ outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl;
1034
+ outputSearchGraphStream << "VERSION=1.1" << endl;
1035
+ outputSearchGraphStream << "base=2.71828182845905" << endl;
1036
+ outputSearchGraphStream << "NODES=" << (numNodes+1) << endl;
1037
+ outputSearchGraphStream << "LINKS=" << numArcs << endl;
1038
+
1039
+ OutputFeatureWeightsForSLF(outputSearchGraphStream);
1040
+
1041
+ for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) {
1042
+ const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
1043
+ const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
1044
+ if (prevHypo) {
1045
+
1046
+ int startNode = nodes[prevHypo->GetId()];
1047
+ int endNode = nodes[thisHypo->GetId()];
1048
+ bool terminalNode = (searchGraph[lineNumber].forward == -1);
1049
+ const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
1050
+ int targetWordCount = targetPhrase.GetSize();
1051
+
1052
+ for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
1053
+ int x = (targetWordCount-targetWordIndex);
1054
+
1055
+ outputSearchGraphStream << "J=" << arcNumber;
1056
+
1057
+ if (targetWordIndex==0) {
1058
+ outputSearchGraphStream << " S=" << startNode;
1059
+ } else {
1060
+ outputSearchGraphStream << " S=" << endNode - x;
1061
+ }
1062
+
1063
+ outputSearchGraphStream << " E=" << endNode - (x-1)
1064
+ << " W=" << targetPhrase.GetWord(targetWordIndex);
1065
+
1066
+ OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);
1067
+
1068
+ outputSearchGraphStream << endl;
1069
+
1070
+ arcNumber += 1;
1071
+ }
1072
+
1073
+ if (terminalNode && terminalNodes.count(endNode) == 0) {
1074
+ terminalNodes.insert(endNode);
1075
+ outputSearchGraphStream << "J=" << arcNumber
1076
+ << " S=" << endNode
1077
+ << " E=" << numNodes
1078
+ << endl;
1079
+ arcNumber += 1;
1080
+ }
1081
+ }
1082
+ }
1083
+
1084
+ }
1085
+
1086
+
1087
+ void
1088
+ OutputSearchNode(AllOptions const& opts, long translationId,
1089
+ std::ostream &out,
1090
+ SearchGraphNode const& searchNode)
1091
+ {
1092
+ const vector<FactorType> &outputFactorOrder = opts.output.factor_order;
1093
+ bool extendedFormat = opts.output.SearchGraphExtended.size();
1094
+ out << translationId;
1095
+
1096
+ // special case: initial hypothesis
1097
+ if ( searchNode.hypo->GetId() == 0 ) {
1098
+ out << " hyp=0 stack=0";
1099
+ if (extendedFormat) {
1100
+ out << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore;
1101
+ }
1102
+ out << endl;
1103
+ return;
1104
+ }
1105
+
1106
+ const Hypothesis *prevHypo = searchNode.hypo->GetPrevHypo();
1107
+
1108
+ // output in traditional format
1109
+ if (!extendedFormat) {
1110
+ out << " hyp=" << searchNode.hypo->GetId()
1111
+ << " stack=" << searchNode.hypo->GetWordsBitmap().GetNumWordsCovered()
1112
+ << " back=" << prevHypo->GetId()
1113
+ << " score=" << searchNode.hypo->GetScore()
1114
+ << " transition=" << (searchNode.hypo->GetScore() - prevHypo->GetScore());
1115
+
1116
+ if (searchNode.recombinationHypo != NULL)
1117
+ out << " recombined=" << searchNode.recombinationHypo->GetId();
1118
+
1119
+ out << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore
1120
+ << " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
1121
+ << "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos()
1122
+ << " out=" << searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder)
1123
+ << endl;
1124
+ return;
1125
+ }
1126
+
1127
+ out << " hyp=" << searchNode.hypo->GetId();
1128
+ out << " stack=" << searchNode.hypo->GetWordsBitmap().GetNumWordsCovered()
1129
+ << " back=" << prevHypo->GetId()
1130
+ << " score=" << searchNode.hypo->GetScore()
1131
+ << " transition=" << (searchNode.hypo->GetScore() - prevHypo->GetScore());
1132
+
1133
+ if (searchNode.recombinationHypo != NULL)
1134
+ out << " recombined=" << searchNode.recombinationHypo->GetId();
1135
+
1136
+ out << " forward=" << searchNode.forward << " fscore=" << searchNode.fscore
1137
+ << " covered=" << searchNode.hypo->GetCurrSourceWordsRange().GetStartPos()
1138
+ << "-" << searchNode.hypo->GetCurrSourceWordsRange().GetEndPos();
1139
+
1140
+ // Modified so that -osgx is a superset of -osg (GST Oct 2011)
1141
+ ScoreComponentCollection scoreBreakdown = searchNode.hypo->GetScoreBreakdown();
1142
+ scoreBreakdown.MinusEquals( prevHypo->GetScoreBreakdown() );
1143
+ out << " scores=\"" << scoreBreakdown << "\""
1144
+ << " out=\"" << searchNode.hypo->GetSourcePhraseStringRep()
1145
+ << "|" << searchNode.hypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << "\"" << endl;
1146
+ }
1147
+
1148
+ void Manager::GetConnectedGraph(
1149
+ std::map< int, bool >* pConnected,
1150
+ std::vector< const Hypothesis* >* pConnectedList) const
1151
+ {
1152
+ std::map < int, bool >& connected = *pConnected;
1153
+ std::vector< const Hypothesis *>& connectedList = *pConnectedList;
1154
+
1155
+ // start with the ones in the final stack
1156
+ const std::vector < HypothesisStack* > &hypoStackColl
1157
+ = m_search->GetHypothesisStacks();
1158
+ const HypothesisStack &finalStack = *hypoStackColl.back();
1159
+ HypothesisStack::const_iterator iterHypo;
1160
+ for (iterHypo = finalStack.begin() ; iterHypo != finalStack.end() ; ++iterHypo) {
1161
+ const Hypothesis *hypo = *iterHypo;
1162
+ connected[ hypo->GetId() ] = true;
1163
+ connectedList.push_back( hypo );
1164
+ }
1165
+ // move back from known connected hypotheses
1166
+ for(size_t i=0; i<connectedList.size(); i++) {
1167
+ const Hypothesis *hypo = connectedList[i];
1168
+
1169
+ // add back pointer
1170
+ const Hypothesis *prevHypo = hypo->GetPrevHypo();
1171
+ if (prevHypo && prevHypo->GetId() > 0 // don't add empty hypothesis
1172
+ && connected.find( prevHypo->GetId() ) == connected.end()) { // don't add already added
1173
+ connected[ prevHypo->GetId() ] = true;
1174
+ connectedList.push_back( prevHypo );
1175
+ }
1176
+
1177
+ // add arcs
1178
+ const ArcList *arcList = hypo->GetArcList();
1179
+ if (arcList != NULL) {
1180
+ ArcList::const_iterator iterArcList;
1181
+ for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
1182
+ const Hypothesis *loserHypo = *iterArcList;
1183
+ if (connected.find( loserHypo->GetId() ) == connected.end()) { // don't add already added
1184
+ connected[ loserHypo->GetId() ] = true;
1185
+ connectedList.push_back( loserHypo );
1186
+ }
1187
+ }
1188
+ }
1189
+ }
1190
+ }
1191
+
1192
+ void Manager::GetWinnerConnectedGraph(
1193
+ std::map< int, bool >* pConnected,
1194
+ std::vector< const Hypothesis* >* pConnectedList) const
1195
+ {
1196
+ std::map < int, bool >& connected = *pConnected;
1197
+ std::vector< const Hypothesis *>& connectedList = *pConnectedList;
1198
+
1199
+ // start with the ones in the final stack
1200
+ const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
1201
+ const HypothesisStack &finalStack = *hypoStackColl.back();
1202
+ HypothesisStack::const_iterator iterHypo;
1203
+ for (iterHypo = finalStack.begin() ; iterHypo != finalStack.end() ; ++iterHypo) {
1204
+ const Hypothesis *hypo = *iterHypo;
1205
+ connected[ hypo->GetId() ] = true;
1206
+ connectedList.push_back( hypo );
1207
+ }
1208
+
1209
+ // move back from known connected hypotheses
1210
+ for(size_t i=0; i<connectedList.size(); i++) {
1211
+ const Hypothesis *hypo = connectedList[i];
1212
+
1213
+ // add back pointer
1214
+ const Hypothesis *prevHypo = hypo->GetPrevHypo();
1215
+ if (prevHypo->GetId() > 0 // don't add empty hypothesis
1216
+ && connected.find( prevHypo->GetId() ) == connected.end()) { // don't add already added
1217
+ connected[ prevHypo->GetId() ] = true;
1218
+ connectedList.push_back( prevHypo );
1219
+ }
1220
+
1221
+ // add arcs
1222
+ const ArcList *arcList = hypo->GetArcList();
1223
+ if (arcList != NULL) {
1224
+ ArcList::const_iterator iterArcList;
1225
+ for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
1226
+ const Hypothesis *loserHypo = *iterArcList;
1227
+ if (connected.find( loserHypo->GetPrevHypo()->GetId() ) == connected.end() && loserHypo->GetPrevHypo()->GetId() > 0) { // don't add already added & don't add hyp 0
1228
+ connected[ loserHypo->GetPrevHypo()->GetId() ] = true;
1229
+ connectedList.push_back( loserHypo->GetPrevHypo() );
1230
+ }
1231
+ }
1232
+ }
1233
+ }
1234
+ }
1235
+
1236
+
1237
+ #ifdef HAVE_PROTOBUF
1238
+
1239
+ void SerializeEdgeInfo(const Hypothesis* hypo, hgmert::Hypergraph_Edge* edge)
1240
+ {
1241
+ hgmert::Rule* rule = edge->mutable_rule();
1242
+ hypo->GetCurrTargetPhrase().WriteToRulePB(rule);
1243
+ const Hypothesis* prev = hypo->GetPrevHypo();
1244
+ // if the feature values are empty, they default to 0
1245
+ if (!prev) return;
1246
+ // score breakdown is an aggregate (forward) quantity, but the exported
1247
+ // graph object just wants the feature values on the edges
1248
+ const ScoreComponentCollection& scores = hypo->GetScoreBreakdown();
1249
+ const ScoreComponentCollection& pscores = prev->GetScoreBreakdown();
1250
+ for (unsigned int i = 0; i < scores.size(); ++i)
1251
+ edge->add_feature_values((scores[i] - pscores[i]) * -1.0);
1252
+ }
1253
+
1254
+ hgmert::Hypergraph_Node* GetHGNode(
1255
+ const Hypothesis* hypo,
1256
+ std::map< int, int>* i2hgnode,
1257
+ hgmert::Hypergraph* hg,
1258
+ int* hgNodeIdx)
1259
+ {
1260
+ hgmert::Hypergraph_Node* hgnode;
1261
+ std::map < int, int >::iterator idxi = i2hgnode->find(hypo->GetId());
1262
+ if (idxi == i2hgnode->end()) {
1263
+ *hgNodeIdx = ((*i2hgnode)[hypo->GetId()] = hg->nodes_size());
1264
+ hgnode = hg->add_nodes();
1265
+ } else {
1266
+ *hgNodeIdx = idxi->second;
1267
+ hgnode = hg->mutable_nodes(*hgNodeIdx);
1268
+ }
1269
+ return hgnode;
1270
+ }
1271
+
1272
+ void Manager::SerializeSearchGraphPB(
1273
+ long translationId,
1274
+ std::ostream& outputStream) const
1275
+ {
1276
+ using namespace hgmert;
1277
+ std::map < int, bool > connected;
1278
+ std::map < int, int > i2hgnode;
1279
+ std::vector< const Hypothesis *> connectedList;
1280
+ GetConnectedGraph(&connected, &connectedList);
1281
+ connected[ 0 ] = true;
1282
+ Hypergraph hg;
1283
+ hg.set_is_sorted(false);
1284
+ int num_feats = (*m_search->GetHypothesisStacks().back()->begin())->GetScoreBreakdown().size();
1285
+ hg.set_num_features(num_feats);
1286
+ StaticData::Instance().GetScoreIndexManager().SerializeFeatureNamesToPB(&hg);
1287
+ Hypergraph_Node* goal = hg.add_nodes(); // idx=0 goal node must have idx 0
1288
+ Hypergraph_Node* source = hg.add_nodes(); // idx=1
1289
+ i2hgnode[-1] = 1; // source node
1290
+ const std::vector < HypothesisStack* > &hypoStackColl = m_search->GetHypothesisStacks();
1291
+ const HypothesisStack &finalStack = *hypoStackColl.back();
1292
+ for (std::vector < HypothesisStack* >::const_iterator iterStack = hypoStackColl.begin();
1293
+ iterStack != hypoStackColl.end() ; ++iterStack) {
1294
+ const HypothesisStack &stack = **iterStack;
1295
+ HypothesisStack::const_iterator iterHypo;
1296
+
1297
+ for (iterHypo = stack.begin() ; iterHypo != stack.end() ; ++iterHypo) {
1298
+ const Hypothesis *hypo = *iterHypo;
1299
+ bool is_goal = hypo->GetWordsBitmap().IsComplete();
1300
+ if (connected.find( hypo->GetId() ) != connected.end()) {
1301
+ int headNodeIdx;
1302
+ Hypergraph_Node* headNode = GetHGNode(hypo, &i2hgnode, &hg, &headNodeIdx);
1303
+ if (is_goal) {
1304
+ Hypergraph_Edge* ge = hg.add_edges();
1305
+ ge->set_head_node(0); // goal
1306
+ ge->add_tail_nodes(headNodeIdx);
1307
+ ge->mutable_rule()->add_trg_words("[X,1]");
1308
+ }
1309
+ Hypergraph_Edge* edge = hg.add_edges();
1310
+ SerializeEdgeInfo(hypo, edge);
1311
+ edge->set_head_node(headNodeIdx);
1312
+ const Hypothesis* prev = hypo->GetPrevHypo();
1313
+ int tailNodeIdx = 1; // source
1314
+ if (prev)
1315
+ tailNodeIdx = i2hgnode.find(prev->GetId())->second;
1316
+ edge->add_tail_nodes(tailNodeIdx);
1317
+
1318
+ const ArcList *arcList = hypo->GetArcList();
1319
+ if (arcList != NULL) {
1320
+ ArcList::const_iterator iterArcList;
1321
+ for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
1322
+ const Hypothesis *loserHypo = *iterArcList;
1323
+ UTIL_THROW_IF2(!connected[loserHypo->GetId()],
1324
+ "Hypothesis " << loserHypo->GetId() << " is not connected");
1325
+ Hypergraph_Edge* edge = hg.add_edges();
1326
+ SerializeEdgeInfo(loserHypo, edge);
1327
+ edge->set_head_node(headNodeIdx);
1328
+ tailNodeIdx = i2hgnode.find(loserHypo->GetPrevHypo()->GetId())->second;
1329
+ edge->add_tail_nodes(tailNodeIdx);
1330
+ }
1331
+ } // end if arcList empty
1332
+ } // end if connected
1333
+ } // end for iterHypo
1334
+ } // end for iterStack
1335
+ hg.SerializeToOstream(&outputStream);
1336
+ }
1337
+ #endif
1338
+
1339
+ void
1340
+ Manager::
1341
+ OutputSearchGraph(long translationId, std::ostream &out) const
1342
+ {
1343
+ vector<SearchGraphNode> searchGraph;
1344
+ GetSearchGraph(searchGraph);
1345
+ for (size_t i = 0; i < searchGraph.size(); ++i) {
1346
+ OutputSearchNode(*options(),translationId,out,searchGraph[i]);
1347
+ }
1348
+ }
1349
+
1350
+ void
1351
+ Manager::
1352
+ GetForwardBackwardSearchGraph
1353
+ ( std::map< int, bool >* pConnected,
1354
+ std::vector<Hypothesis const* >* pConnectedList,
1355
+ std::map<Hypothesis const*, set<Hypothesis const*> >* pOutgoingHyps,
1356
+ vector< float>* pFwdBwdScores) const
1357
+ {
1358
+ std::map < int, bool > &connected = *pConnected;
1359
+ std::vector< const Hypothesis *>& connectedList = *pConnectedList;
1360
+ std::map < int, int > forward;
1361
+ std::map < int, double > forwardScore;
1362
+
1363
+ std::map < const Hypothesis*, set <const Hypothesis*> > & outgoingHyps
1364
+ = *pOutgoingHyps;
1365
+ vector< float> & estimatedScores = *pFwdBwdScores;
1366
+
1367
+ // *** find connected hypotheses ***
1368
+ GetWinnerConnectedGraph(&connected, &connectedList);
1369
+
1370
+ // ** compute best forward path for each hypothesis *** //
1371
+
1372
+ // forward cost of hypotheses on final stack is 0
1373
+ const std::vector < HypothesisStack* > &hypoStackColl
1374
+ = m_search->GetHypothesisStacks();
1375
+ const HypothesisStack &finalStack = *hypoStackColl.back();
1376
+ HypothesisStack::const_iterator iterHypo;
1377
+ for (iterHypo = finalStack.begin() ; iterHypo != finalStack.end() ; ++iterHypo) {
1378
+ const Hypothesis *hypo = *iterHypo;
1379
+ forwardScore[ hypo->GetId() ] = 0.0f;
1380
+ forward[ hypo->GetId() ] = -1;
1381
+ }
1382
+
1383
+ // compete for best forward score of previous hypothesis
1384
+ std::vector < HypothesisStack* >::const_iterator iterStack;
1385
+ for (iterStack = --hypoStackColl.end() ; iterStack != hypoStackColl.begin() ; --iterStack) {
1386
+ const HypothesisStack &stack = **iterStack;
1387
+ HypothesisStack::const_iterator iterHypo;
1388
+ for (iterHypo = stack.begin() ; iterHypo != stack.end() ; ++iterHypo) {
1389
+ const Hypothesis *hypo = *iterHypo;
1390
+ if (connected.find( hypo->GetId() ) != connected.end()) {
1391
+ // make a play for previous hypothesis
1392
+ const Hypothesis *prevHypo = hypo->GetPrevHypo();
1393
+ double fscore = forwardScore[ hypo->GetId() ] +
1394
+ hypo->GetScore() - prevHypo->GetScore();
1395
+ if (forwardScore.find( prevHypo->GetId() ) == forwardScore.end()
1396
+ || forwardScore.find( prevHypo->GetId() )->second < fscore) {
1397
+ forwardScore[ prevHypo->GetId() ] = fscore;
1398
+ forward[ prevHypo->GetId() ] = hypo->GetId();
1399
+ }
1400
+ //store outgoing info
1401
+ outgoingHyps[prevHypo].insert(hypo);
1402
+
1403
+ // all arcs also make a play
1404
+ const ArcList *arcList = hypo->GetArcList();
1405
+ if (arcList != NULL) {
1406
+ ArcList::const_iterator iterArcList;
1407
+ for (iterArcList = arcList->begin() ; iterArcList != arcList->end() ; ++iterArcList) {
1408
+ const Hypothesis *loserHypo = *iterArcList;
1409
+ // make a play
1410
+ const Hypothesis *loserPrevHypo = loserHypo->GetPrevHypo();
1411
+ double fscore = forwardScore[ hypo->GetId() ] +
1412
+ loserHypo->GetScore() - loserPrevHypo->GetScore();
1413
+ if (forwardScore.find( loserPrevHypo->GetId() ) == forwardScore.end()
1414
+ || forwardScore.find( loserPrevHypo->GetId() )->second < fscore) {
1415
+ forwardScore[ loserPrevHypo->GetId() ] = fscore;
1416
+ forward[ loserPrevHypo->GetId() ] = loserHypo->GetId();
1417
+ }
1418
+ //store outgoing info
1419
+ outgoingHyps[loserPrevHypo].insert(hypo);
1420
+
1421
+
1422
+ } // end for arc list
1423
+ } // end if arc list empty
1424
+ } // end if hypo connected
1425
+ } // end for hypo
1426
+ } // end for stack
1427
+
1428
+ for (std::vector< const Hypothesis *>::iterator it = connectedList.begin(); it != connectedList.end(); ++it) {
1429
+ float estimatedScore = (*it)->GetScore() + forwardScore[(*it)->GetId()];
1430
+ estimatedScores.push_back(estimatedScore);
1431
+ }
1432
+ }
1433
+
1434
+
1435
+ const Hypothesis *Manager::GetBestHypothesis() const
1436
+ {
1437
+ return m_search->GetBestHypothesis();
1438
+ }
1439
+
1440
+ int Manager::GetNextHypoId()
1441
+ {
1442
+ GetSentenceStats().AddCreated(); // count created hypotheses
1443
+ return m_hypoId++;
1444
+ }
1445
+
1446
+ void Manager::ResetSentenceStats(const InputType& source)
1447
+ {
1448
+ m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source));
1449
+ }
1450
+ SentenceStats& Manager::GetSentenceStats() const
1451
+ {
1452
+ return *m_sentenceStats;
1453
+
1454
+ }
1455
+
1456
+ void Manager::OutputBest(OutputCollector *collector) const
1457
+ {
1458
+ long translationId = m_source.GetTranslationId();
1459
+
1460
+ Timer additionalReportingTime;
1461
+
1462
+ // apply decision rule and output best translation(s)
1463
+ if (collector) {
1464
+ ostringstream out;
1465
+ ostringstream debug;
1466
+ FixPrecision(debug,PRECISION);
1467
+
1468
+ // all derivations - send them to debug stream
1469
+ if (options()->output.PrintAllDerivations) {
1470
+ additionalReportingTime.start();
1471
+ PrintAllDerivations(translationId, debug);
1472
+ additionalReportingTime.stop();
1473
+ }
1474
+
1475
+ Timer decisionRuleTime;
1476
+ decisionRuleTime.start();
1477
+
1478
+ // MAP decoding: best hypothesis
1479
+ const Hypothesis* bestHypo = NULL;
1480
+ if (!options()->mbr.enabled) {
1481
+ bestHypo = GetBestHypothesis();
1482
+ if (bestHypo) {
1483
+ if (options()->output.ReportHypoScore) {
1484
+ out << bestHypo->GetFutureScore() << ' ';
1485
+ }
1486
+ if (options()->output.RecoverPath) {
1487
+ bestHypo->OutputInput(out);
1488
+ out << "||| ";
1489
+ }
1490
+
1491
+ if (options()->output.PrintID) {
1492
+ out << translationId << " ";
1493
+ }
1494
+
1495
+ // VN : I put back the code for OutputPassthroughInformation
1496
+ if (options()->output.PrintPassThrough) {
1497
+ OutputPassthroughInformation(out, bestHypo);
1498
+ }
1499
+ // end of add back
1500
+
1501
+ if (options()->output.ReportSegmentation == 2) {
1502
+ GetOutputLanguageModelOrder(out, bestHypo);
1503
+ }
1504
+ OutputSurface(out,*bestHypo, true);
1505
+ if (options()->output.PrintAlignmentInfo) {
1506
+ out << "||| ";
1507
+ bestHypo->OutputAlignment(out, true);
1508
+ }
1509
+
1510
+ IFVERBOSE(1) {
1511
+ debug << "BEST TRANSLATION: " << *bestHypo << endl;
1512
+ }
1513
+ } else {
1514
+ VERBOSE(1, "NO BEST TRANSLATION" << endl);
1515
+ }
1516
+
1517
+ out << endl;
1518
+ } // if (!staticData.UseMBR())
1519
+
1520
+ // MBR decoding (n-best MBR, lattice MBR, consensus)
1521
+ else {
1522
+ // we first need the n-best translations
1523
+ size_t nBestSize = options()->mbr.size;
1524
+ if (nBestSize <= 0) {
1525
+ cerr << "ERROR: negative size for number of MBR candidate translations not allowed (option mbr-size)" << endl;
1526
+ exit(1);
1527
+ }
1528
+ TrellisPathList nBestList;
1529
+ CalcNBest(nBestSize, nBestList, true);
1530
+ VERBOSE(2,"size of n-best: " << nBestList.GetSize() << " (" << nBestSize << ")" << endl);
1531
+ IFVERBOSE(2) {
1532
+ PrintUserTime("calculated n-best list for (L)MBR decoding");
1533
+ }
1534
+
1535
+ // lattice MBR
1536
+ if (options()->lmbr.enabled) {
1537
+ if (options()->nbest.enabled) {
1538
+ //lattice mbr nbest
1539
+ vector<LatticeMBRSolution> solutions;
1540
+ size_t n = min(nBestSize, options()->nbest.nbest_size);
1541
+ getLatticeMBRNBest(*this,nBestList,solutions,n);
1542
+ OutputLatticeMBRNBest(m_latticeNBestOut, solutions, translationId);
1543
+ } else {
1544
+ //Lattice MBR decoding
1545
+ vector<Word> mbrBestHypo = doLatticeMBR(*this,nBestList);
1546
+ OutputBestHypo(mbrBestHypo, out);
1547
+ IFVERBOSE(2) {
1548
+ PrintUserTime("finished Lattice MBR decoding");
1549
+ }
1550
+ }
1551
+ }
1552
+
1553
+ // consensus decoding
1554
+ else if (options()->search.consensus) {
1555
+ const TrellisPath &conBestHypo = doConsensusDecoding(*this,nBestList);
1556
+ OutputBestHypo(conBestHypo, out);
1557
+ OutputAlignment(m_alignmentOut, conBestHypo);
1558
+ IFVERBOSE(2) {
1559
+ PrintUserTime("finished Consensus decoding");
1560
+ }
1561
+ }
1562
+
1563
+ // n-best MBR decoding
1564
+ else {
1565
+ const TrellisPath &mbrBestHypo = doMBR(nBestList, *options());
1566
+ OutputBestHypo(mbrBestHypo, out);
1567
+ OutputAlignment(m_alignmentOut, mbrBestHypo);
1568
+ IFVERBOSE(2) {
1569
+ PrintUserTime("finished MBR decoding");
1570
+ }
1571
+ }
1572
+ }
1573
+
1574
+ // report best translation to output collector
1575
+ collector->Write(translationId,out.str(),debug.str());
1576
+
1577
+ decisionRuleTime.stop();
1578
+ VERBOSE(1, "Line " << translationId << ": Decision rule took " << decisionRuleTime << " seconds total" << endl);
1579
+ } // if (m_ioWrapper.GetSingleBestOutputCollector())
1580
+
1581
+ }
1582
+
1583
+ void Manager::OutputNBest(OutputCollector *collector) const
1584
+ {
1585
+ if (collector == NULL) {
1586
+ return;
1587
+ }
1588
+
1589
+ if (options()->lmbr.enabled) {
1590
+ if (options()->nbest.enabled) {
1591
+ collector->Write(m_source.GetTranslationId(), m_latticeNBestOut.str());
1592
+ }
1593
+ } else {
1594
+ TrellisPathList nBestList;
1595
+ ostringstream out;
1596
+ NBestOptions const& nbo = options()->nbest;
1597
+ CalcNBest(nbo.nbest_size, nBestList, nbo.only_distinct);
1598
+ OutputNBest(out, nBestList);
1599
+ collector->Write(m_source.GetTranslationId(), out.str());
1600
+ }
1601
+
1602
+ }
1603
+
1604
+ void
1605
+ Manager::
1606
+ OutputNBest(std::ostream& out, Moses::TrellisPathList const& nBestList) const
1607
+ {
1608
+ NBestOptions const& nbo = options()->nbest;
1609
+ bool reportAllFactors = nbo.include_all_factors;
1610
+ bool includeSegmentation = nbo.include_segmentation;
1611
+ bool includeWordAlignment = nbo.include_alignment_info;
1612
+
1613
+ TrellisPathList::const_iterator iter;
1614
+ for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
1615
+ const TrellisPath &path = **iter;
1616
+ const std::vector<const Hypothesis *> &edges = path.GetEdges();
1617
+
1618
+ // print the surface factor of the translation
1619
+ out << m_source.GetTranslationId() << " ||| ";
1620
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
1621
+ const Hypothesis &edge = *edges[currEdge];
1622
+ OutputSurface(out, edge);
1623
+ }
1624
+ out << " |||";
1625
+
1626
+ // print scores with feature names
1627
+ bool with_labels = options()->nbest.include_feature_labels;
1628
+ path.GetScoreBreakdown()->OutputAllFeatureScores(out, with_labels);
1629
+
1630
+ // total
1631
+ out << " ||| " << path.GetFutureScore();
1632
+
1633
+ //phrase-to-phrase segmentation
1634
+ if (includeSegmentation) {
1635
+ out << " |||";
1636
+ for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
1637
+ const Hypothesis &edge = *edges[currEdge];
1638
+ const Range &sourceRange = edge.GetCurrSourceWordsRange();
1639
+ Range targetRange = path.GetTargetWordsRange(edge);
1640
+ out << " " << sourceRange.GetStartPos();
1641
+ if (sourceRange.GetStartPos() < sourceRange.GetEndPos()) {
1642
+ out << "-" << sourceRange.GetEndPos();
1643
+ }
1644
+ out<< "=" << targetRange.GetStartPos();
1645
+ if (targetRange.GetStartPos() < targetRange.GetEndPos()) {
1646
+ out<< "-" << targetRange.GetEndPos();
1647
+ }
1648
+ }
1649
+ }
1650
+
1651
+ if (includeWordAlignment) {
1652
+ out << " ||| ";
1653
+ for (int currEdge = (int)edges.size() - 2 ; currEdge >= 0 ; currEdge--) {
1654
+ const Hypothesis &edge = *edges[currEdge];
1655
+ const Range &sourceRange = edge.GetCurrSourceWordsRange();
1656
+ Range targetRange = path.GetTargetWordsRange(edge);
1657
+ const int sourceOffset = sourceRange.GetStartPos();
1658
+ const int targetOffset = targetRange.GetStartPos();
1659
+ const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
1660
+
1661
+ OutputAlignment(out, ai, sourceOffset, targetOffset);
1662
+
1663
+ }
1664
+ }
1665
+
1666
+ if (options()->output.RecoverPath) {
1667
+ out << " ||| ";
1668
+ OutputInput(out, edges[0]);
1669
+ }
1670
+
1671
+ out << endl;
1672
+ }
1673
+
1674
+ out << std::flush;
1675
+ }
1676
+
1677
+ //////////////////////////////////////////////////////////////////////////
1678
+ /***
1679
+ * print surface factor only for the given phrase
1680
+ */
1681
+ void
1682
+ Manager::
1683
+ OutputSurface(std::ostream &out, Hypothesis const& edge, bool const recursive) const
1684
+ {
1685
+ if (recursive && edge.GetPrevHypo()) {
1686
+ OutputSurface(out,*edge.GetPrevHypo(), true);
1687
+ }
1688
+
1689
+ std::vector<FactorType> outputFactorOrder = options()->output.factor_order;
1690
+ UTIL_THROW_IF2(outputFactorOrder.size() == 0,
1691
+ "Must specific at least 1 output factor");
1692
+
1693
+ FactorType placeholderFactor = options()->input.placeholder_factor;
1694
+ std::map<size_t, const Factor*> placeholders;
1695
+ if (placeholderFactor != NOT_FOUND) {
1696
+ // creates map of target position -> factor for placeholders
1697
+ placeholders = GetPlaceholders(edge, placeholderFactor);
1698
+ }
1699
+
1700
+ bool markUnknown = options()->unk.mark;
1701
+ std::string const& fd = options()->output.factor_delimiter;
1702
+
1703
+ TargetPhrase const& phrase = edge.GetCurrTargetPhrase();
1704
+ size_t size = phrase.GetSize();
1705
+ for (size_t pos = 0 ; pos < size ; pos++) {
1706
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[0]);
1707
+ if (placeholders.size()) {
1708
+ // do placeholders
1709
+ std::map<size_t, const Factor*>::const_iterator iter = placeholders.find(pos);
1710
+ if (iter != placeholders.end()) {
1711
+ factor = iter->second;
1712
+ }
1713
+ }
1714
+
1715
+ UTIL_THROW_IF2(factor == NULL, "No factor 0 at position " << pos);
1716
+
1717
+ //preface surface form with UNK if marking unknowns
1718
+ const Word &word = phrase.GetWord(pos);
1719
+ if(markUnknown && word.IsOOV()) {
1720
+ out << options()->unk.prefix;
1721
+ }
1722
+
1723
+ out << *factor;
1724
+ for (size_t i = 1 ; i < outputFactorOrder.size() ; i++) {
1725
+ const Factor *factor = phrase.GetFactor(pos, outputFactorOrder[i]);
1726
+ if (factor) out << fd << *factor;
1727
+ //else out << fd << UNKNOWN_FACTOR;
1728
+ }
1729
+
1730
+ if(markUnknown && word.IsOOV()) {
1731
+ out << options()->unk.suffix;
1732
+ }
1733
+
1734
+ out << " ";
1735
+
1736
+ }
1737
+
1738
+ // trace ("report segmentation") option "-t" / "-tt"
1739
+ int reportSegmentation = options()->output.ReportSegmentation;
1740
+ if (reportSegmentation > 0 && phrase.GetSize() > 0) {
1741
+ const Range &sourceRange = edge.GetCurrSourceWordsRange();
1742
+ const int sourceStart = sourceRange.GetStartPos();
1743
+ const int sourceEnd = sourceRange.GetEndPos();
1744
+ out << "|" << sourceStart << "-" << sourceEnd; // enriched "-tt"
1745
+ if (reportSegmentation == 2) {
1746
+ out << ",wa=";
1747
+ const AlignmentInfo &ai = edge.GetCurrTargetPhrase().GetAlignTerm();
1748
+ OutputAlignment(out, ai, 0, 0);
1749
+ out << ",total=";
1750
+ out << edge.GetScore() - edge.GetPrevHypo()->GetScore();
1751
+ out << ",";
1752
+ ScoreComponentCollection scoreBreakdown(edge.GetScoreBreakdown());
1753
+ scoreBreakdown.MinusEquals(edge.GetPrevHypo()->GetScoreBreakdown());
1754
+ bool with_labels = options()->nbest.include_feature_labels;
1755
+ scoreBreakdown.OutputAllFeatureScores(out, with_labels);
1756
+ }
1757
+ out << "| ";
1758
+ }
1759
+ }
1760
+
1761
+ void
1762
+ Manager::
1763
+ OutputAlignment(ostream &out, const AlignmentInfo &ai,
1764
+ size_t sourceOffset, size_t targetOffset) const
1765
+ {
1766
+ typedef std::vector< const std::pair<size_t,size_t>* > AlignVec;
1767
+ AlignVec alignments = ai.GetSortedAlignments(options()->output.WA_SortOrder);
1768
+
1769
+ AlignVec::const_iterator it;
1770
+ for (it = alignments.begin(); it != alignments.end(); ++it) {
1771
+ const std::pair<size_t,size_t> &alignment = **it;
1772
+ out << alignment.first + sourceOffset << "-"
1773
+ << alignment.second + targetOffset << " ";
1774
+ }
1775
+
1776
+ }
1777
+
1778
+ void
1779
+ Manager::
1780
+ OutputInput(std::ostream& os, const Hypothesis* hypo) const
1781
+ {
1782
+ size_t len = hypo->GetInput().GetSize();
1783
+ std::vector<const Phrase*> inp_phrases(len, 0);
1784
+ OutputInput(inp_phrases, hypo);
1785
+ for (size_t i=0; i<len; ++i)
1786
+ if (inp_phrases[i]) os << *inp_phrases[i];
1787
+ }
1788
+
1789
+ void Manager::OutputInput(std::vector<const Phrase*>& map, const Hypothesis* hypo) const
1790
+ {
1791
+ if (hypo->GetPrevHypo()) {
1792
+ OutputInput(map, hypo->GetPrevHypo());
1793
+ map[hypo->GetCurrSourceWordsRange().GetStartPos()] = &hypo->GetTranslationOption().GetInputPath().GetPhrase();
1794
+ }
1795
+ }
1796
+
1797
+ std::map<size_t, const Factor*> Manager::GetPlaceholders(const Hypothesis &hypo, FactorType placeholderFactor) const
1798
+ {
1799
+ const InputPath &inputPath = hypo.GetTranslationOption().GetInputPath();
1800
+ const Phrase &inputPhrase = inputPath.GetPhrase();
1801
+
1802
+ std::map<size_t, const Factor*> ret;
1803
+
1804
+ for (size_t sourcePos = 0; sourcePos < inputPhrase.GetSize(); ++sourcePos) {
1805
+ const Factor *factor = inputPhrase.GetFactor(sourcePos, placeholderFactor);
1806
+ if (factor) {
1807
+ TargetPhrase const& tp = hypo.GetTranslationOption().GetTargetPhrase();
1808
+ std::set<size_t> targetPos = tp.GetAlignTerm().GetAlignmentsForSource(sourcePos);
1809
+ UTIL_THROW_IF2(targetPos.size() != 1,
1810
+ "Placeholder should be aligned to 1, and only 1, word");
1811
+ ret[*targetPos.begin()] = factor;
1812
+ }
1813
+ }
1814
+
1815
+ return ret;
1816
+ }
1817
+
1818
+ void Manager::OutputLatticeSamples(OutputCollector *collector) const
1819
+ {
1820
+ if (collector) {
1821
+ TrellisPathList latticeSamples;
1822
+ ostringstream out;
1823
+ CalcLatticeSamples(options()->output.lattice_sample_size, latticeSamples);
1824
+ OutputNBest(out,latticeSamples);
1825
+ collector->Write(m_source.GetTranslationId(), out.str());
1826
+ }
1827
+
1828
+ }
1829
+
1830
+ void Manager::OutputAlignment(OutputCollector *collector) const
1831
+ {
1832
+ if (collector == NULL) {
1833
+ return;
1834
+ }
1835
+
1836
+ if (!m_alignmentOut.str().empty()) {
1837
+ collector->Write(m_source.GetTranslationId(), m_alignmentOut.str());
1838
+ } else {
1839
+ std::vector<const Hypothesis *> edges;
1840
+ const Hypothesis *currentHypo = GetBestHypothesis();
1841
+ while (currentHypo) {
1842
+ edges.push_back(currentHypo);
1843
+ currentHypo = currentHypo->GetPrevHypo();
1844
+ }
1845
+ ostringstream out;
1846
+ size_t targetOffset = 0;
1847
+ BOOST_REVERSE_FOREACH(Hypothesis const* e, edges) {
1848
+ const TargetPhrase &tp = e->GetCurrTargetPhrase();
1849
+ size_t sourceOffset = e->GetCurrSourceWordsRange().GetStartPos();
1850
+ OutputAlignment(out, tp.GetAlignTerm(), sourceOffset, targetOffset);
1851
+ targetOffset += tp.GetSize();
1852
+ }
1853
+ out << std::endl; // Used by --alignment-output-file so requires endl
1854
+ collector->Write(m_source.GetTranslationId(), out.str());
1855
+
1856
+ }
1857
+ }
1858
+
1859
+ void
1860
+ Manager::
1861
+ OutputDetailedTranslationReport(OutputCollector *collector) const
1862
+ {
1863
+ if (collector) {
1864
+ ostringstream out;
1865
+ FixPrecision(out,PRECISION);
1866
+ TranslationAnalysis::PrintTranslationAnalysis(out, GetBestHypothesis());
1867
+ collector->Write(m_source.GetTranslationId(),out.str());
1868
+ }
1869
+
1870
+ }
1871
+
1872
+ void
1873
+ Manager::
1874
+ OutputUnknowns(OutputCollector *collector) const
1875
+ {
1876
+ if (collector) {
1877
+ long translationId = m_source.GetTranslationId();
1878
+ const vector<const Phrase*>& unknowns = m_transOptColl->GetUnknownSources();
1879
+ ostringstream out;
1880
+ for (size_t i = 0; i < unknowns.size(); ++i) {
1881
+ out << *(unknowns[i]);
1882
+ }
1883
+ out << endl;
1884
+ collector->Write(translationId, out.str());
1885
+ }
1886
+
1887
+ }
1888
+
1889
+ void
1890
+ Manager::
1891
+ OutputWordGraph(OutputCollector *collector) const
1892
+ {
1893
+ if (collector) {
1894
+ long translationId = m_source.GetTranslationId();
1895
+ ostringstream out;
1896
+ FixPrecision(out,PRECISION);
1897
+ GetWordGraph(translationId, out);
1898
+ collector->Write(translationId, out.str());
1899
+ }
1900
+ }
1901
+
1902
/** Write the search graph for this sentence to the collector.
 *  When built with protobuf support and requested via
 *  "output-search-graph-pb", additionally serialize the graph as a
 *  protobuf file <dir>/<translationId>.pb.
 */
void
Manager::
OutputSearchGraph(OutputCollector *collector) const
{
  if (collector) {
    long translationId = m_source.GetTranslationId();
    ostringstream out;
    FixPrecision(out,PRECISION);
    // Delegates to the (translationId, ostream) overload.
    OutputSearchGraph(translationId, out);
    collector->Write(translationId, out.str());

#ifdef HAVE_PROTOBUF
    const StaticData &staticData = StaticData::Instance();
    if (staticData.GetOutputSearchGraphPB()) {
      // One binary file per sentence in the configured directory.
      ostringstream sfn;
      sfn << staticData.GetParam("output-search-graph-pb")[0] << '/' << translationId << ".pb" << ends;
      string fn = sfn.str();
      VERBOSE(2, "Writing search graph to " << fn << endl);
      fstream output(fn.c_str(), ios::trunc | ios::binary | ios::out);
      // NOTE(review): open failure is not checked here -- the serializer
      // is called regardless; confirm whether that is intentional.
      SerializeSearchGraphPB(translationId, output);
    }
#endif
  }

}
1927
+
1928
+ void Manager::OutputSearchGraphSLF() const
1929
+ {
1930
+ // const StaticData &staticData = StaticData::Instance();
1931
+ long translationId = m_source.GetTranslationId();
1932
+
1933
+ // Output search graph in HTK standard lattice format (SLF)
1934
+ std::string const& slf = options()->output.SearchGraphSLF;
1935
+ if (slf.size()) {
1936
+ util::StringStream fileName;
1937
+ fileName << slf << "/" << translationId << ".slf";
1938
+ ofstream *file = new ofstream;
1939
+ file->open(fileName.str().c_str());
1940
+ if (file->is_open() && file->good()) {
1941
+ ostringstream out;
1942
+ FixPrecision(out,PRECISION);
1943
+ OutputSearchGraphAsSLF(translationId, out);
1944
+ *file << out.str();
1945
+ file -> flush();
1946
+ } else {
1947
+ TRACE_ERR("Cannot output HTK standard lattice for line " << translationId << " because the output file is not open or not ready for writing" << endl);
1948
+ }
1949
+ delete file;
1950
+ }
1951
+
1952
+ }
1953
+
1954
+ void Manager::OutputLatticeMBRNBest(std::ostream& out, const vector<LatticeMBRSolution>& solutions,long translationId) const
1955
+ {
1956
+ for (vector<LatticeMBRSolution>::const_iterator si = solutions.begin(); si != solutions.end(); ++si) {
1957
+ out << translationId;
1958
+ out << " |||";
1959
+ const vector<Word> mbrHypo = si->GetWords();
1960
+ for (size_t i = 0 ; i < mbrHypo.size() ; i++) {
1961
+ const Factor *factor = mbrHypo[i].GetFactor(options()->output.factor_order[0]);
1962
+ if (i>0) out << " " << *factor;
1963
+ else out << *factor;
1964
+ }
1965
+ out << " |||";
1966
+ out << " map: " << si->GetMapScore();
1967
+ out << " w: " << mbrHypo.size();
1968
+ const vector<float>& ngramScores = si->GetNgramScores();
1969
+ for (size_t i = 0; i < ngramScores.size(); ++i) {
1970
+ out << " " << ngramScores[i];
1971
+ }
1972
+ out << " ||| " << si->GetScore();
1973
+
1974
+ out << endl;
1975
+ }
1976
+ }
1977
+
1978
+ void
1979
+ Manager::
1980
+ OutputBestHypo(const std::vector<Word>& mbrBestHypo, ostream& out) const
1981
+ {
1982
+ FactorType f = options()->output.factor_order[0];
1983
+ for (size_t i = 0 ; i < mbrBestHypo.size() ; i++) {
1984
+ const Factor *factor = mbrBestHypo[i].GetFactor(f);
1985
+ UTIL_THROW_IF2(factor == NULL, "No factor " << f << " at position " << i);
1986
+ if (i) out << " ";
1987
+ out << *factor;
1988
+ }
1989
+ out << endl;
1990
+ }
1991
+
1992
+ void
1993
+ Manager::
1994
+ OutputBestHypo(const Moses::TrellisPath &path, std::ostream &out) const
1995
+ {
1996
+ std::vector<const Hypothesis *> const& edges = path.GetEdges();
1997
+ for (int currEdge = (int)edges.size() - 1 ; currEdge >= 0 ; currEdge--) {
1998
+ Hypothesis const& edge = *edges[currEdge];
1999
+ OutputSurface(out, edge);
2000
+ }
2001
+ out << endl;
2002
+ }
2003
+
2004
+ void
2005
+ Manager::
2006
+ OutputAlignment(std::ostringstream &out, const TrellisPath &path) const
2007
+ {
2008
+ WordAlignmentSort waso = options()->output.WA_SortOrder;
2009
+ BOOST_REVERSE_FOREACH(Hypothesis const* e, path.GetEdges())
2010
+ e->OutputAlignment(out, false);
2011
+ // Hypothesis::OutputAlignment(out, path.GetEdges(), waso);
2012
+ // Used by --alignment-output-file so requires endl
2013
+ out << std::endl;
2014
+ }
2015
+
2016
+ } // namespace
mosesdecoder/moses/MockHypothesis.h ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- c++ -*-
2
+ /***********************************************************************
3
+ Moses - factored phrase-based language decoder
4
+ Copyright (C) 2010 University of Edinburgh
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
#ifndef _MOCK_HYPOTHESIS_
#define _MOCK_HYPOTHESIS_

#include <memory>
#include <vector>

#include "moses/FF/UnknownWordPenaltyProducer.h"
#include "moses/FF/DistortionScoreProducer.h"
#include "moses/FF/WordPenaltyProducer.h"
#include "Hypothesis.h"
#include "Manager.h"
#include "TranslationOption.h"

namespace MosesTest
{

//
// Construct a hypothesis with arbitrary source and target phrase
// sequences. Useful for testing feature functions.
//

typedef std::pair<size_t,size_t> Alignment; //(first,last) in source

/** RAII owner of a phrase-based test hypothesis chain.
 *  Builds the hypothesis in the constructor and tears the whole chain
 *  down in the destructor, so tests need no manual cleanup.
 */
class MockHypothesisGuard
{
public:
  /** Creates a phrase-based hypothesis.
   *  @param sourceSentence the raw source text
   *  @param alignments     one (first,last) source span per target segment
   *  @param targetSegments the target phrase strings, in order
   */
  MockHypothesisGuard
  ( const std::string& sourceSentence,
    const std::vector<Alignment>& alignments,
    const std::vector<std::string>& targetSegments);

  /// Access the constructed hypothesis (owned by this guard).
  Moses::Hypothesis* operator*() const {
    return m_hypothesis;
  }

  /** Destroy the hypothesis chain */
  ~MockHypothesisGuard();

private:
  // Feature functions and decoding scaffolding kept alive for the
  // lifetime of the hypothesis they score.
  Moses::TranslationOption m_initialTransOpt;
  boost::shared_ptr<Moses::Sentence> m_sentence;
  Moses::WordPenaltyProducer m_wp;
  Moses::UnknownWordPenaltyProducer m_uwp;
  Moses::DistortionScoreProducer m_dist;
  boost::shared_ptr<Moses::Manager> m_manager;
  boost::shared_ptr<Moses::TranslationTask> m_ttask;
  Moses::Hypothesis* m_hypothesis;  // tail of the chain; see operator*
  std::vector<Moses::TargetPhrase> m_targetPhrases;
  std::vector<Moses::TranslationOption*> m_toptions;
};

/** Test fixture providing three canned hypotheses: an empty one, a
 *  partially expanded one, and a fully expanded one.
 */
class HypothesisFixture
{
public:
  HypothesisFixture();
  const Moses::Hypothesis* empty() {
    return **m_empty;
  }
  const Moses::Hypothesis* partial() {
    return **m_partial;
  }
  const Moses::Hypothesis* full() {
    return **m_full;
  }

private:
  // NOTE(review): std::auto_ptr is deprecated (removed in C++17);
  // consider std::unique_ptr / boost::scoped_ptr when the codebase
  // moves past C++03.
  std::auto_ptr<MockHypothesisGuard> m_empty;
  std::auto_ptr<MockHypothesisGuard> m_partial;
  std::auto_ptr<MockHypothesisGuard> m_full;
};


}

#endif
mosesdecoder/moses/OutputFileStream.h ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #pragma once
23
+
24
+ #include <cstdlib>
25
+ #include <fstream>
26
+ #include <string>
27
+ #include <iostream>
28
+ #include <boost/iostreams/filtering_stream.hpp>
29
+
30
+ namespace Moses
31
+ {
32
+
33
+ /** Version of std::ostream with transparent compression.
34
+ *
35
+ * Transparently compresses output when writing to a file whose name ends in
36
+ * ".gz". Or, writes to stdout instead of a file when given a filename
37
+ * consisting of just a dash ("-").
38
+ */
39
class OutputFileStream : public boost::iostreams::filtering_ostream
{
private:
  /** File that needs flushing & closing when we close this stream.
   *
   * Is NULL when no file is opened, e.g. when writing to standard output.
   */
  std::ofstream *m_outFile;

  /// Is this stream open? Tracked manually; toggled by Open()/Close().
  bool m_open;

public:
  /** Create an unopened OutputFileStream.
   *
   * Until it's been opened, nothing can be done with this stream.
   */
  OutputFileStream();

  /// Create an OutputFileStream, and open it by calling Open().
  OutputFileStream(const std::string &filePath);
  virtual ~OutputFileStream();

  // TODO: Can we please just always throw an exception when this fails?
  /** Open stream.
   *
   * If filePath is "-" (just a dash), this opens the stream for writing to
   * standard output. Otherwise, it opens the given file. If the filename
   * has the ".gz" suffix, output will be transparently compressed.
   *
   * Call Close() to close the file.
   *
   * Returns whether opening the file was successful. It may also throw an
   * exception on failure.
   */
  bool Open(const std::string &filePath);

  /// Flush and close stream. After this, the stream can be opened again.
  void Close();
};
79
+
80
+ }
81
+
mosesdecoder/moses/PCNTools.h ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_PCNTools
23
+ #define moses_PCNTools
24
+
25
+ #include <vector>
26
+ #include <map>
27
+ #include <string>
28
+ #include <utility>
29
+ #include <cstdlib>
30
+
31
+ /** A couple of utilities to read .pcn files. A python-compatible format
32
+ * for encoding confusion networks and word lattices.
33
+ */
34
+ namespace PCN
35
+ {
36
+
37
/** One alternative (arc) in a confusion-network column, as read from a
 *  .pcn file: a word plus its dense/sparse arc features and a jump to
 *  the next node.
 */
struct CNAlt {
  CNAlt() {
  }
  CNAlt(const std::string &word,
        const std::vector<float> &denseFeatures,
        const std::map<std::string, float> &sparseFeatures,
        size_t next)
    :m_word(word)
    ,m_denseFeatures(denseFeatures)
    ,m_sparseFeatures(sparseFeatures)
    ,m_next(next) {
  }

  std::string m_word;                            // surface token of this arc
  std::vector<float> m_denseFeatures;            // dense arc scores, in input order
  std::map<std::string, float> m_sparseFeatures; // named (sparse) arc features
  // NOTE(review): presumably the column offset/node this arc jumps to
  // (1 for a normal arc) -- confirm against parsePCN's implementation.
  size_t m_next;
};
55
+
56
+ //typedef std::pair<std::pair<std::string, std::vector<float> >, size_t> CNAlt;
57
+ typedef std::vector<CNAlt> CNCol;
58
+ typedef std::vector<CNCol> CN;
59
+
60
+ /** Given a string ((('foo',0.1,1),('bar',0.9,2)),...) representation of a
61
+ * word lattice in PCN format, return a CN object representing the lattice
62
+ */
63
+ CN parsePCN(const std::string& in);
64
+
65
+ };
66
+
67
+ #endif
mosesdecoder/moses/PDTAimp.cpp ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "PDTAimp.h"
2
+
3
+ namespace Moses
4
+ {
5
+
6
/** Construct the implementation object behind a
 *  PhraseDictionaryTreeAdaptor. The dictionary itself is created later
 *  in Create(); here only bookkeeping is initialised.
 *  @param p the adaptor that owns this object (stored as m_obj)
 */
PDTAimp::PDTAimp(PhraseDictionaryTreeAdaptor *p)
  : m_dict(0),
    m_obj(p),
    useCache(1),
    totalE(0),
    distinctE(0)
{
  m_numInputScores = 0;
  m_inputFeature = InputFeature::InstancePtr();

  if (m_inputFeature) {
    // Input scores (lattice/CN link scores) are only consumed by the
    // FIRST phrase table; later tables must not count them again.
    const PhraseDictionary *firstPt = PhraseDictionary::GetColl()[0];
    if (firstPt == m_obj) {
      m_numInputScores = m_inputFeature->GetNumScoreComponents();
    }
  }
}
23
+
24
/** Release caches and the binary dictionary; at verbosity >= 2, print
 *  accumulated candidate and path statistics to stderr first.
 */
PDTAimp::~PDTAimp()
{
  CleanUp();
  delete m_dict;

  if (StaticData::Instance().GetVerboseLevel() >= 2) {

    TRACE_ERR("tgt candidates stats: total="<<totalE<<"; distinct="
              <<distinctE<<" ("<<distinctE/(0.01*totalE)<<"); duplicates="
              <<totalE-distinctE<<" ("<<(totalE-distinctE)/(0.01*totalE)
              <<")\n");

    TRACE_ERR("\npath statistics\n");

    // The stats vectors are indexed by span length; slot 0 is unused,
    // hence begin()+1 below.
    if(path1Best.size()) {
      TRACE_ERR("1-best: ");
      std::copy(path1Best.begin()+1,path1Best.end(),
                std::ostream_iterator<size_t>(std::cerr," \t"));
      TRACE_ERR("\n");
    }
    if(pathCN.size()) {
      // pathCN stores log-scale counts; Exp converts them for display.
      TRACE_ERR("CN (full): ");
      std::transform(pathCN.begin()+1
                     ,pathCN.end()
                     ,std::ostream_iterator<double>(std::cerr," \t")
                     ,Exp);
      TRACE_ERR("\n");
    }
    if(pathExplored.size()) {
      TRACE_ERR("CN (explored): ");
      std::copy(pathExplored.begin()+1,pathExplored.end(),
                std::ostream_iterator<size_t>(std::cerr," \t"));
      TRACE_ERR("\n");
    }
  }

}
61
+
62
/** Drop all per-sentence caches and release the dictionary's working
 *  memory. Called between sentences and from the destructor.
 */
void PDTAimp::CleanUp()
{
  assert(m_dict);
  m_dict->FreeMemory();
  // for(size_t i=0; i<m_tgtColls.size(); ++i) m_tgtColls[i].reset();
  m_tgtColls.clear();
  m_cache.clear();
  m_rangeCache.clear();
  uniqSrcPhr.clear();
}
72
+
73
/** Look up (and optionally cache) the target-phrase candidates for a
 *  source phrase.
 *  @param src source phrase to translate
 *  @return shared_ptr to the pruned candidate collection, or an empty
 *          shared_ptr when src is empty or has no translations.
 */
TargetPhraseCollectionWithSourcePhrase::shared_ptr
PDTAimp::GetTargetPhraseCollection(Phrase const &src) const
{

  assert(m_dict);

  TargetPhraseCollectionWithSourcePhrase::shared_ptr ret;
  if(src.GetSize()==0) return ret;

  // Cache protocol: insert an empty entry first; if the key already
  // existed we return the cached value (which may itself be empty,
  // memoising a known miss). The entry is filled in at the end.
  std::pair<MapSrc2Tgt::iterator,bool> piter;
  if(useCache) {
    piter=m_cache.insert(std::make_pair(src, ret));
    if(!piter.second) return piter.first->second;
  } else if (m_cache.size()) {
    MapSrc2Tgt::const_iterator i=m_cache.find(src);
    return (i!=m_cache.end() ? i->second : ret);
  }

  std::vector<std::string> srcString(src.GetSize());
  // convert source Phrase into vector of strings
  for(size_t i=0; i<srcString.size(); ++i) {
    Factors2String(src.GetWord(i),srcString[i]);
  }

  // get target phrases in string representation
  std::vector<StringTgtCand> cands;
  std::vector<std::string> wacands;
  m_dict->GetTargetCandidates(srcString,cands,wacands);
  if(cands.empty()) {
    return ret;
  }

  //TODO: Multiple models broken here
  std::vector<float> weights = StaticData::Instance().GetWeights(m_obj);

  std::vector<TargetPhrase> tCands;
  tCands.reserve(cands.size());

  // (negated future score, index) pairs for pruning below.
  std::vector<std::pair<float,size_t> > costs;
  costs.reserve(cands.size());

  std::vector<Phrase> sourcePhrases;
  sourcePhrases.reserve(cands.size());


  // convert into TargetPhrases
  std::string fd = m_obj->options()->output.factor_delimiter;
  for(size_t i=0; i<cands.size(); ++i) {
    TargetPhrase targetPhrase(m_obj);

    StringTgtCand::Tokens const& factorStrings=cands[i].tokens;
    Scores const& probVector=cands[i].scores;

    // Probabilities -> floored log scores.
    std::vector<float> scoreVector(probVector.size());
    std::transform(probVector.begin(),probVector.end(),scoreVector.begin(),
                   TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),
                   FloorScore);

    //sparse features.
    //These are already in log-space
    for (size_t j = 0; j < cands[i].fnames.size(); ++j) {
      targetPhrase.GetScoreBreakdown().Assign(m_obj, *cands[i].fnames[j], cands[i].fvalues[j]);
    }

    // NOTE(review): assumes wacands has one entry per candidate --
    // confirm GetTargetCandidates always fills it in step with cands.
    CreateTargetPhrase(targetPhrase,factorStrings, fd, scoreVector, Scores(0),
                       &wacands[i], &src);

    costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
    tCands.push_back(targetPhrase);

    sourcePhrases.push_back(src);
  }

  ret = PruneTargetCandidates(tCands,costs, sourcePhrases);
  if(ret->IsEmpty()) {
    ret.reset();
  } else {
    // Fill the cache slot reserved above and keep the collection alive.
    if(useCache) piter.first->second = ret;
    m_tgtColls.push_back(ret);
  }
  return ret;

}
157
+
158
/** Initialise the adaptor: remember the factor configuration and load
 *  the binary phrase table.
 *  @param input    source-side factors used for lookup
 *  @param output   target-side factors produced
 *  @param filePath path prefix of the binary table (expects .binphr.idx)
 *  @param weight   unused here; weights are fetched from StaticData
 *  Throws if the binary table is missing; exits on a failed read.
 */
void PDTAimp::Create(const std::vector<FactorType> &input
                     , const std::vector<FactorType> &output
                     , const std::string &filePath
                     , const std::vector<float> &weight
                    )
{

  // set my members
  m_dict=new PhraseDictionaryTree();
  m_input=input;
  m_output=output;

  const StaticData &staticData = StaticData::Instance();
  m_dict->NeedAlignmentInfo(staticData.NeedAlignmentInfo());

  std::string binFname=filePath+".binphr.idx";
  if(!FileExists(binFname.c_str())) {
    UTIL_THROW2( "bin ttable does not exist");
    //TRACE_ERR( "bin ttable does not exist -> create it\n");
    //InputFileStream in(filePath);
    //m_dict->Create(in,filePath);
  }
  VERBOSE(1,"reading bin ttable\n");
  // m_dict->Read(filePath);
  bool res=m_dict->Read(filePath);
  if (!res) {
    std::cerr << "bin ttable was read in a wrong way\n";
    exit(1);
  }
}
188
+
189
+
190
/** Pre-compute translation options for every source span of a
 *  confusion network. Performs a depth-first traversal of the CN
 *  against the prefix-tree dictionary, collects the best-scoring
 *  candidate per (span, target-token-sequence), prunes them, and fills
 *  m_rangeCache[start][end-1]. Also accumulates global path statistics.
 */
void PDTAimp::CacheSource(ConfusionNet const& src)
{
  assert(m_dict);
  const size_t srcSize=src.GetSize();

  std::vector<size_t> exploredPaths(srcSize+1,0);
  std::vector<double> exPathsD(srcSize+1,-1.0);

  // collect some statistics
  std::vector<size_t> cnDepths(srcSize,0);
  for(size_t i=0; i<srcSize; ++i) cnDepths[i]=src[i].size();

  // Count (in log space) the number of distinct paths per span length.
  for(size_t len=1; len<=srcSize; ++len)
    for(size_t i=0; i<=srcSize-len; ++i) {
      double pd=0.0;
      for(size_t k=i; k<i+len; ++k) pd+=log(1.0*cnDepths[k]);
      exPathsD[len]=(exPathsD[len]>=0.0 ? addLogScale(pd,exPathsD[len]) : pd);
    }

  // update global statistics
  if(pathCN.size()<=srcSize) pathCN.resize(srcSize+1,-1.0);
  for(size_t len=1; len<=srcSize; ++len)
    pathCN[len]=pathCN[len]>=0.0 ? addLogScale(pathCN[len],exPathsD[len]) : exPathsD[len];

  if(path1Best.size()<=srcSize) path1Best.resize(srcSize+1,0);
  for(size_t len=1; len<=srcSize; ++len) path1Best[len]+=srcSize-len+1;


  if (StaticData::Instance().GetVerboseLevel() >= 2 && exPathsD.size()) {
    TRACE_ERR("path stats for current CN: \nCN (full): ");
    std::transform(exPathsD.begin()+1
                   ,exPathsD.end()
                   ,std::ostream_iterator<double>(std::cerr," ")
                   ,Exp);
    TRACE_ERR("\n");
  }

  // Best candidates per span, keyed by target token sequence.
  typedef std::map<StringTgtCand::Tokens,TScores> E2Costs;

  std::map<Range,E2Costs> cov2cand;
  // DFS stack, seeded with an empty state at every start position.
  std::vector<State> stack;
  for(Position i=0 ; i < srcSize ; ++i)
    stack.push_back(State(i, i, m_dict->GetRoot(), std::vector<float>(m_numInputScores,0.0)));

  std::vector<float> weightTrans = StaticData::Instance().GetWeights(m_obj);
  std::vector<float> weightInput = StaticData::Instance().GetWeights(m_inputFeature);
  float weightWP = StaticData::Instance().GetWeightWordPenalty();

  while(!stack.empty()) {
    State curr(stack.back());
    stack.pop_back();

    UTIL_THROW_IF2(curr.end() >= srcSize, "Error");
    const ConfusionNet::Column &currCol=src[curr.end()];
    // in a given column, loop over all possibilities
    for(size_t colidx=0; colidx<currCol.size(); ++colidx) {
      const Word& w=currCol[colidx].first; // w=the i^th possibility in column colidx
      std::string s;
      Factors2String(w,s);
      bool isEpsilon=(s=="" || s==EPSILON);

      //assert that we have the right number of link params in this CN option
      UTIL_THROW_IF2(currCol[colidx].second.denseScores.size() < m_numInputScores,
                     "Incorrect number of input scores");

      // do not start with epsilon (except at first position)
      if(isEpsilon && curr.begin()==curr.end() && curr.begin()>0) continue;

      // At a given node in the prefix tree, look to see if w defines an edge to
      // another node (Extend). Stay at the same node if w==EPSILON
      PPtr nextP = (isEpsilon ? curr.ptr : m_dict->Extend(curr.ptr,s));

      if(nextP) { // w is a word that should be considered
        Range newRange(curr.begin(),curr.end()+src.GetColumnIncrement(curr.end(),colidx));

        //add together the link scores from the current state and the new arc
        float inputScoreSum = 0;
        std::vector<float> newInputScores(m_numInputScores,0.0);
        if (m_numInputScores) {
          std::transform(currCol[colidx].second.denseScores.begin(), currCol[colidx].second.denseScores.end(),
                         curr.GetScores().begin(),
                         newInputScores.begin(),
                         std::plus<float>());


          //we need to sum up link weights (excluding realWordCount, which isn't in numLinkParams)
          //if the sum is too low, then we won't expand this.
          //TODO: dodgy! shouldn't we consider weights here? what about zero-weight params?
          inputScoreSum = std::accumulate(newInputScores.begin(),newInputScores.begin()+m_numInputScores,0.0);
        }

        Phrase newSrc(curr.src);
        if(!isEpsilon) newSrc.AddWord(w);
        if(newRange.second<srcSize && inputScoreSum>LOWEST_SCORE) {
          // if there is more room to grow, add a new state onto the queue
          // to be explored that represents [begin, curEnd+)
          stack.push_back(State(newRange,nextP,newInputScores));
          stack.back().src=newSrc;
        }

        std::vector<StringTgtCand> tcands;
        // now, look up the target candidates (aprx. TargetPhraseCollection) for
        // the current path through the CN
        m_dict->GetTargetCandidates(nextP,tcands);

        if(newRange.second>=exploredPaths.size()+newRange.first)
          exploredPaths.resize(newRange.second-newRange.first+1,0);
        ++exploredPaths[newRange.second-newRange.first];

        totalE+=tcands.size();

        if(tcands.size()) {
          E2Costs& e2costs=cov2cand[newRange];
          Phrase const* srcPtr=uniqSrcPhr(newSrc);
          for(size_t i=0; i<tcands.size(); ++i) {
            //put input scores in first - already logged, just drop in directly
            std::vector<float> transcores(m_obj->GetNumScoreComponents());
            UTIL_THROW_IF2(transcores.size() != weightTrans.size(),
                           "Incorrect number of translation scores");

            //put in phrase table scores, logging as we insert
            std::transform(tcands[i].scores.begin()
                           ,tcands[i].scores.end()
                           ,transcores.begin()
                           ,TransformScore);


            //tally up
            float score=std::inner_product(transcores.begin(), transcores.end(), weightTrans.begin(), 0.0f);

            // input feature
            score +=std::inner_product(newInputScores.begin(), newInputScores.end(), weightInput.begin(), 0.0f);

            //count word penalty
            score-=tcands[i].tokens.size() * weightWP;

            std::pair<E2Costs::iterator,bool> p=e2costs.insert(std::make_pair(tcands[i].tokens,TScores()));

            if(p.second) ++distinctE;

            // Keep only the highest-scoring variant per token sequence.
            TScores & scores=p.first->second;
            if(p.second || scores.total<score) {
              scores.total=score;
              scores.transScore=transcores;
              scores.inputScores=newInputScores;
              scores.src=srcPtr;
            }
          }
        }
      }
    }
  } // end while(!stack.empty())


  if (StaticData::Instance().GetVerboseLevel() >= 2 && exploredPaths.size()) {
    TRACE_ERR("CN (explored): ");
    std::copy(exploredPaths.begin()+1,exploredPaths.end(),
              std::ostream_iterator<size_t>(std::cerr," "));
    TRACE_ERR("\n");
  }

  if(pathExplored.size()<exploredPaths.size())
    pathExplored.resize(exploredPaths.size(),0);
  for(size_t len=1; len<=srcSize; ++len)
    pathExplored[len]+=exploredPaths[len];


  // m_rangeCache.resize(src.GetSize(),vTPC(src.GetSize(),0));
  m_rangeCache.resize(src.GetSize(),vTPC(src.GetSize()));

  // Convert the per-span winners into pruned TargetPhraseCollections
  // and store them in the range cache (indexed [start][end-1]).
  for(std::map<Range,E2Costs>::const_iterator i=cov2cand.begin(); i!=cov2cand.end(); ++i) {
    assert(i->first.first<m_rangeCache.size());
    assert(i->first.second>0);
    assert(static_cast<size_t>(i->first.second-1)<m_rangeCache[i->first.first].size());
    assert(m_rangeCache[i->first.first][i->first.second-1]==0);

    std::vector<TargetPhrase> tCands;
    tCands.reserve(i->second.size());

    std::vector<std::pair<float,size_t> > costs;
    costs.reserve(i->second.size());

    std::vector<Phrase> sourcePhrases;
    sourcePhrases.reserve(i->second.size());

    for(E2Costs::const_iterator j=i->second.begin(); j!=i->second.end(); ++j) {
      TScores const & scores=j->second;
      TargetPhrase targetPhrase(m_obj);
      CreateTargetPhrase(targetPhrase
                         , j ->first
                         , m_obj->options()->output.factor_delimiter
                         , scores.transScore
                         , scores.inputScores
                         , NULL
                         , scores.src);
      costs.push_back(std::make_pair(-targetPhrase.GetFutureScore(),tCands.size()));
      tCands.push_back(targetPhrase);

      sourcePhrases.push_back(*scores.src);

      //std::cerr << i->first.first << "-" << i->first.second << ": " << targetPhrase << std::endl;
    }

    TargetPhraseCollectionWithSourcePhrase::shared_ptr
    rv = PruneTargetCandidates(tCands, costs, sourcePhrases);

    if(rv->IsEmpty())
      rv.reset();
    else {
      m_rangeCache[i->first.first][i->first.second-1]=rv;
      m_tgtColls.push_back(rv);
    }
  }
  // free memory
  m_dict->FreeMemory();
}
406
+
407
/** Build a TargetPhrase from its string-token representation.
 *  @param targetPhrase    phrase to fill (words, alignment, scores)
 *  @param factorStrings   one delimiter-joined factor string per word
 *  @param factorDelimiter separator between factors within a token
 *  @param transVector     translation-model scores (already log-space)
 *  @param inputVector     input-feature scores (CN/lattice links)
 *  @param alignmentString optional word-alignment string, may be NULL
 *  @param srcPtr          source phrase used to evaluate the features
 */
void PDTAimp::CreateTargetPhrase(TargetPhrase& targetPhrase,
                                 StringTgtCand::Tokens const& factorStrings,
                                 std::string const& factorDelimiter,
                                 Scores const& transVector,
                                 Scores const& inputVector,
                                 const std::string *alignmentString,
                                 Phrase const* srcPtr) const
{
  FactorCollection &factorCollection = FactorCollection::Instance();

  // Split each token on the factor delimiter and intern every factor.
  for(size_t k=0; k<factorStrings.size(); ++k) {
    util::TokenIter<util::MultiCharacter, false>
    word(*factorStrings[k], factorDelimiter);
    Word& w=targetPhrase.AddWord();
    for(size_t l=0; l<m_output.size(); ++l, ++word) {
      w[m_output[l]]= factorCollection.AddFactor(*word);
    }
  }

  if (alignmentString) {
    targetPhrase.SetAlignmentInfo(*alignmentString);
  }

  // Input scores only apply when this is the first phrase table.
  if (m_numInputScores) {
    targetPhrase.GetScoreBreakdown().Assign(m_inputFeature, inputVector);
  }

  targetPhrase.GetScoreBreakdown().Assign(m_obj, transVector);
  targetPhrase.EvaluateInIsolation(*srcPtr, m_obj->GetFeaturesToApply());
}
437
+
438
/** Keep the best m_tableLimit candidates (all of them when the limit
 *  is 0) by future cost and package them with their source phrases.
 *  @param tCands        candidate target phrases
 *  @param costs         (negated future score, index into tCands) pairs;
 *                       partially reordered in place by NTH_ELEMENT3
 *  @param sourcePhrases source phrase per candidate (same indexing)
 *  @return a new collection with the selected candidates (may be empty)
 */
TargetPhraseCollectionWithSourcePhrase::shared_ptr
PDTAimp::PruneTargetCandidates
(const std::vector<TargetPhrase> & tCands,
 std::vector<std::pair<float,size_t> >& costs,
 const std::vector<Phrase> &sourcePhrases) const
{
  // convert into TargetPhraseCollection
  UTIL_THROW_IF2(tCands.size() != sourcePhrases.size(),
                 "Number of target phrases must equal number of source phrases");

  TargetPhraseCollectionWithSourcePhrase::shared_ptr rv;
  rv.reset(new TargetPhraseCollectionWithSourcePhrase);


  // set limit to tableLimit or actual size, whatever is smaller
  std::vector<std::pair<float,size_t> >::iterator nth =
    costs.begin() + ((m_obj->m_tableLimit>0 && // 0 indicates no limit
                      m_obj->m_tableLimit < costs.size()) ?
                     m_obj->m_tableLimit : costs.size());

  // find the nth phrase according to future cost
  // (partial selection only -- cheaper than a full sort)
  NTH_ELEMENT3(costs.begin(),nth ,costs.end());

  // add n top phrases to the return list
  for(std::vector<std::pair<float,size_t> >::iterator
      it = costs.begin(); it != nth; ++it) {
    size_t ind = it->second;
    // The collection takes ownership of the copied TargetPhrase.
    TargetPhrase *targetPhrase = new TargetPhrase(tCands[ind]);
    const Phrase &sourcePhrase = sourcePhrases[ind];
    rv->Add(targetPhrase, sourcePhrase);

  }

  return rv;
}
473
+
474
+ }
475
+
476
+
mosesdecoder/moses/Parameter.cpp ADDED
@@ -0,0 +1,1690 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include <ctime>
23
+ #include <iostream>
24
+ #include <iterator>
25
+ #include <fstream>
26
+ #include <sstream>
27
+ #include <algorithm>
28
+ #include <boost/algorithm/string/predicate.hpp>
29
+ #include "Parameter.h"
30
+ #include "Util.h"
31
+ #include "InputFileStream.h"
32
+ #include "StaticData.h"
33
+ #include "util/string_stream.hh"
34
+ #include "util/exception.hh"
35
+ #include "util/random.hh"
36
+ #include <boost/program_options.hpp>
37
+
38
+ #ifdef HAVE_XMLRPC_C
39
+ #include <xmlrpc_server.h>
40
+ #endif
41
+
42
+ using namespace std;
43
+ using namespace boost::algorithm;
44
+ namespace po = boost::program_options;
45
+
46
+ namespace Moses
47
+ {
48
+
49
+ /** define allowed parameters */
50
+ Parameter::Parameter()
51
+ {
52
+ ///////////////////////////////////////////////////////////////////////////////////////
53
+ // general options
54
+ po::options_description main_opts("Main Options");
55
+ AddParam(main_opts,"config", "f", "location of the configuration file");
56
+ AddParam(main_opts,"input-file", "i", "location of the input file to be translated");
57
+
58
+ AddParam(main_opts,"verbose", "v", "verbosity level of the logging");
59
+ AddParam(main_opts,"version", "show version of Moses and libraries used");
60
+ AddParam(main_opts,"show-weights", "print feature weights and exit");
61
+ AddParam(main_opts,"time-out", "seconds after which is interrupted (-1=no time-out, default is -1)");
62
+ AddParam(main_opts,"segment-time-out", "seconds for single segment after which is interrupted (-1=no time-out, default is -1)");
63
+
64
+ ///////////////////////////////////////////////////////////////////////////////////////
65
+ // factorization options
66
+ po::options_description factor_opts("General Factorization Options");
67
+ AddParam(factor_opts,"factor-delimiter", "fd", "specify a different factor delimiter than the default");
68
+ // one should be able to specify different factor delimiters for intput and output
69
+ AddParam(factor_opts,"mapping", "description of decoding steps"); // whatever that means ...
70
+ AddParam(factor_opts,"placeholder-factor", "Which source factor to use to store the original text for placeholders. The factor must not be used by a translation or gen model");
71
+
72
+ ///////////////////////////////////////////////////////////////////////////////////////
73
+ // general search options
74
+ po::options_description search_opts("Search Options");
75
+ string desc = "Which search algorithm to use.\n";
76
+ desc += "0=normal stack (default)\n";
77
+ desc += "1=cube pruning\n";
78
+ desc += "3=chart (with cube pruning)\n";
79
+ desc += "4=stack with batched lm requests\n";
80
+ desc += "5=chart (with incremental search)\n";
81
+ desc += "6=string-to-tree\n";
82
+ desc += "7=tree-to-string\n";
83
+ desc += "8=tree-to-string (SCFG-based)\n";
84
+ desc += "9=forest-to-string";
85
+ AddParam(search_opts,"search-algorithm", desc);
86
+ AddParam(search_opts,"beam-threshold", "b", "threshold for threshold pruning");
87
+ AddParam(search_opts,"early-discarding-threshold", "edt", "threshold for constructing hypotheses based on estimate cost");
88
+ AddParam(search_opts,"stack", "s", "maximum stack size for histogram pruning. 0 = unlimited stack size");
89
+ AddParam(search_opts,"stack-diversity", "sd", "minimum number of hypothesis of each coverage in stack (default 0)");
90
+
91
+ // feature weight-related options
92
+ AddParam(search_opts,"weight-file", "wf", "feature weights file. Do *not* put weights for 'core' features in here - they go in moses.ini");
93
+ AddParam(search_opts,"weight", "weights for ALL models, 1 per line 'WeightName value'. Weight names can be repeated");
94
+
95
+ AddParam(search_opts,"feature-overwrite", "Override arguments in a particular feature function with a particular key. Format: -feature-overwrite \"FeatureName key=value\"");
96
+
97
+ po::options_description tune_opts("Options used in tuning.");
98
+ AddParam(tune_opts,"weight-overwrite", "special parameter for mert. All on 1 line. Overrides weights specified in 'weights' argument");
99
+ AddParam(tune_opts,"feature-add", "Add a feature function on the command line. Used by mira to add BLEU feature");
100
+ AddParam(tune_opts,"weight-add", "Add weight for FF if it doesn't exist, i.e weights here are added 1st, and can be override by the ini file or on the command line. Used to specify initial weights for FF that was also specified on the copmmand line");
101
+
102
+ // phrase table limitations:
103
+ AddParam(search_opts,"max-partial-trans-opt", "maximum number of partial translation options per input span (during mapping steps)");
104
+ AddParam(search_opts,"max-trans-opt-per-coverage", "maximum number of translation options per input span (after applying mapping steps)");
105
+ AddParam(search_opts,"max-phrase-length", "maximum phrase length (default 20)");
106
+ AddParam(search_opts,"translation-option-threshold", "tot", "threshold for translation options relative to best for input phrase");
107
+
108
+ // miscellaneous search options
109
+ AddParam(search_opts,"disable-discarding", "dd", "disable hypothesis discarding"); // ??? memory management? UG
110
+ AddParam(search_opts,"phrase-drop-allowed", "da", "if present, allow dropping of source words"); //da = drop any (word); see -du for comparison
111
+ AddParam(search_opts,"threads","th", "number of threads to use in decoding (defaults to single-threaded)");
112
+
113
+ // distortion options
114
+ po::options_description disto_opts("Distortion options");
115
+ AddParam(disto_opts,"distortion-limit", "dl", "distortion (reordering) limit in maximum number of words (0 = monotone, -1 = unlimited)");
116
+ AddParam(disto_opts,"monotone-at-punctuation", "mp", "do not reorder over punctuation");
117
+ AddParam(disto_opts,"early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no");
118
+ AddParam(disto_opts,"distortion", "configurations for each factorized/lexicalized reordering model."); // zombie parameter?
119
+
120
+ // cube pruning
121
+ po::options_description cube_opts("Cube pruning options.");
122
+ AddParam(cube_opts,"cube-pruning-pop-limit", "cbp", "How many hypotheses should be popped for each stack. (default = 1000)");
123
+ AddParam(cube_opts,"cube-pruning-diversity", "cbd", "How many hypotheses should be created for each coverage. (default = 0)");
124
+ AddParam(cube_opts,"cube-pruning-lazy-scoring", "cbls", "Don't fully score a hypothesis until it is popped");
125
+ AddParam(cube_opts,"cube-pruning-deterministic-search", "cbds", "Break ties deterministically during search");
126
+
127
+ ///////////////////////////////////////////////////////////////////////////////////////
128
+ // minimum bayes risk decoding
129
+ po::options_description mbr_opts("Minimum Bayes Risk (MBR), Lattice MBR, and Consensus decoding");
130
+
131
+ AddParam(mbr_opts,"minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation");
132
+ AddParam(mbr_opts,"mbr-size", "number of translation candidates considered in MBR decoding (default 200)");
133
+ AddParam(mbr_opts,"mbr-scale", "scaling factor to convert log linear score probability in MBR decoding (default 1.0)");
134
+
135
+ AddParam(mbr_opts,"lminimum-bayes-risk", "lmbr", "use lattice miminum Bayes risk to determine best translation");
136
+ AddParam(mbr_opts,"consensus-decoding", "con", "use consensus decoding (De Nero et. al. 2009)");
137
+
138
+ po::options_description lmbr_opts("Options specific to Lattic MBR");
139
+ AddParam(lmbr_opts,"lmbr-p", "unigram precision value for lattice mbr");
140
+ AddParam(lmbr_opts,"lmbr-r", "ngram precision decay value for lattice mbr");
141
+ AddParam(lmbr_opts,"lmbr-thetas", "theta(s) for lattice mbr calculation");
142
+ AddParam(mbr_opts,"lmbr-map-weight", "weight given to map solution when doing lattice MBR (default 0)");
143
+ AddParam(mbr_opts,"lmbr-pruning-factor", "average number of nodes/word wanted in pruned lattice");
144
+ AddParam(mbr_opts,"lattice-hypo-set", "to use lattice as hypo set during lattice MBR");
145
+
146
+ ///////////////////////////////////////////////////////////////////////////////////////
147
+ // OOV handling options
148
+ po::options_description oov_opts("OOV Handling Options");
149
+ AddParam(oov_opts,"drop-unknown", "du", "drop unknown words instead of copying them");
150
+ AddParam(oov_opts,"mark-unknown", "mu", "mark unknown words in output");
151
+ AddParam(oov_opts,"unknown-word-prefix", "prefix to unknwon word when marked (default: 'UNK')");
152
+ AddParam(oov_opts,"unknown-word-suffix", "suffix to unknwon word when marked (default: '')");
153
+ AddParam(oov_opts,"lmodel-oov-feature", "add language model oov feature, one per model");
154
+ AddParam(oov_opts,"output-unknowns", "Output the unknown (OOV) words to the given file, one line per sentence");
155
+ AddParam(oov_opts,"always-create-direct-transopt", "Always create a translation that translates the source word ad-verbatim");
156
+
157
+ ///////////////////////////////////////////////////////////////////////////////////////
158
+ // input options
159
+ po::options_description input_opts("Input Format Options");
160
+ AddParam(input_opts,"input-factors", "list of factors in the input");
161
+ AddParam(input_opts,"inputtype", "text (0), confusion network (1), word lattice (2), tree (3) (default = 0)");
162
+ AddParam(input_opts,"xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'constraint', 'ignore'");
163
+ AddParam(input_opts,"xml-brackets", "xb", "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode" );
164
+ AddParam(input_opts,"start-translation-id", "Id of 1st input. Default = 0");
165
+ AddParam(input_opts,"alternate-weight-setting", "aws", "alternate set of weights to used per xml specification");
166
+
167
+ ///////////////////////////////////////////////////////////////////////////////////////
168
+ // output options
169
+ po::options_description output_opts("Output Options");
170
+ AddParam(output_opts,"report-all-factors", "report all factors in output, not just first");
171
+ AddParam(output_opts,"output-factors", "list if factors in the output");
172
+ AddParam(output_opts,"print-id", "prefix translations with id. Default if false");
173
+ AddParam(output_opts,"print-passthrough", "output the sgml tag <passthrough> without any computation on that. Default is false");
174
+ AddParam(output_opts,"print-passthrough-in-n-best", "output the sgml tag <passthrough> without any computation on that in each entry of the n-best-list. Default is false");
175
+ AddParam(output_opts,"output-factors", "list of factors in the output");
176
+ AddParam(output_opts,"print-all-derivations", "to print all derivations in search graph");
177
+ AddParam(output_opts,"translation-details", "T", "for each best hypothesis, report translation details to the given file");
178
+
179
+ AddParam(output_opts,"output-hypo-score", "Output the hypo score to stdout with the output string. For search error analysis. Default is false");
180
+ AddParam(output_opts,"output-word-graph", "owg", "Output stack info as word graph. Takes filename, 0=only hypos in stack, 1=stack + nbest hypos");
181
+ AddParam(output_opts,"tree-translation-details", "Ttree", "for each hypothesis, report translation details with tree fragment info to given file");
182
+ AddParam(output_opts,"print-alignment-info", "Output word-to-word alignment to standard out, separated from translation by |||. Word-to-word alignments are takne from the phrase table if any. Default is false");
183
+ AddParam(output_opts,"alignment-output-file", "print output word alignments into given file");
184
+ AddParam(output_opts,"sort-word-alignment", "Sort word alignments for more consistent display. 0=no sort (default), 1=target order");
185
+ AddParam(output_opts,"report-segmentation", "t", "report phrase segmentation in the output");
186
+ AddParam(output_opts,"report-segmentation-enriched", "tt", "report phrase segmentation in the output with additional information");
187
+
188
+ // translation-all-details was introduced in the context of DIMwid: Decoder Inspection for Moses (using Widgets)
189
+ // see here: https://ufal.mff.cuni.cz/pbml/100/art-kurtz-seemann-braune-maletti.pdf
190
+ AddParam(output_opts,"translation-all-details", "Tall", "for all hypotheses, report translation details to the given file");
191
+
192
+ po::options_description osg_opts("Options for outputting search graphs");
193
+ AddParam(osg_opts,"output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
194
+ AddParam(osg_opts,"output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
195
+ AddParam(osg_opts,"unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
196
+ AddParam(osg_opts,"output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF) - the flag should be followed by a directory name, which must exist");
197
+ AddParam(output_opts,"include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
198
+ #ifdef HAVE_PROTOBUF
199
+ AddParam(osg_opts,"output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
200
+ #endif
201
+ AddParam(osg_opts,"output-search-graph-hypergraph", "DEPRECATED! Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder). This flag is followed by 3 values: 'true (gz|txt|bz) directory-name'");
202
+
203
+ ///////////////////////////////////////////////////////////////////////////////////////
204
+ // nbest-options
205
+ po::options_description nbest_opts("N-best Options");
206
+ AddParam(nbest_opts,"n-best-list", "file and size of n-best-list to be generated; specify - as the file in order to write to STDOUT");
207
+ // AddParam(nbest_opts,"n-best-list-file", "file of n-best-list to be generated; specify - as the file in order to write to STDOUT");
208
+ // AddParam(nbest_opts,"n-best-list-size", "size of n-best-list to be generated; specify - as the file in order to write to STDOUT");
209
+ AddParam(nbest_opts,"labeled-n-best-list", "print out labels for each weight type in n-best list. default is true");
210
+ AddParam(nbest_opts,"n-best-trees", "Write n-best target-side trees to n-best-list");
211
+ AddParam(nbest_opts,"n-best-factor", "factor to compute the maximum number of contenders (=factor*nbest-size). value 0 means infinity, i.e. no threshold. default is 0");
212
+ AddParam(nbest_opts,"report-all-factors-in-n-best", "Report all factors in n-best-lists. Default is false");
213
+ AddParam(nbest_opts,"lattice-samples", "generate samples from lattice, in same format as nbest list. Uses the file and size arguments, as in n-best-list");
214
+ AddParam(nbest_opts,"include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false");
215
+ AddParam(nbest_opts,"print-alignment-info-in-n-best",
216
+ "Include word-to-word alignment in the n-best list. Word-to-word alignments are taken from the phrase table if any. Default is false");
217
+
218
+ ///////////////////////////////////////////////////////////////////////////////////////
219
+ // server options
220
+ po::options_description server_opts("Moses Server Options");
221
+ AddParam(server_opts,"server", "Run moses as a translation server.");
222
+ AddParam(server_opts,"daemon", "Run moses as a translation server in the background.");
223
+ AddParam(server_opts,"server-port", "Port for moses server");
224
+ AddParam(server_opts,"server-log", "Log destination for moses server");
225
+ AddParam(server_opts,"serial", "Run server in serial mode, processing only one request at a time.");
226
+
227
+ AddParam(server_opts,"server-maxconn",
228
+ "Max. No of simultaneous HTTP transactions allowed by the server.");
229
+ AddParam(server_opts,"server-maxconn-backlog",
230
+ "Max. No. of requests the OS will queue if the server is busy.");
231
+ AddParam(server_opts,"server-keepalive-maxconn",
232
+ "Max. No. of requests the server will accept on a single TCP connection.");
233
+ AddParam(server_opts,"server-keepalive-timeout",
234
+ "Max. number of seconds the server will keep a persistent connection alive.");
235
+ AddParam(server_opts,"server-timeout",
236
+ "Max. number of seconds the server will wait for a client to submit a request once a connection has been established.");
237
+ // session timeout and session cache size are for moses translation session handling
238
+ // they have nothing to do with the abyss server (but relate to the moses server)
239
+ AddParam(server_opts,"session-timeout",
240
+ "Timeout for sessions, e.g. '2h30m' or 1d (=24h)");
241
+ AddParam(server_opts,"session-cache-size", string("Max. number of sessions cached.")
242
+ +"Least recently used session is dumped first.");
243
+
244
+ po::options_description irstlm_opts("IRSTLM Options");
245
+ AddParam(irstlm_opts,"clean-lm-cache",
246
+ "clean language model caches after N translations (default N=1)");
247
+
248
+ po::options_description chart_opts("Chart Decoding Options");
249
+ AddParam(chart_opts,"max-chart-span", "maximum num. of source word chart rules can consume (default 10)");
250
+ AddParam(chart_opts,"non-terminals", "list of non-term symbols, space separated");
251
+ AddParam(chart_opts,"rule-limit", "a little like table limit. But for chart decoding rules. Default is DEFAULT_MAX_TRANS_OPT_SIZE");
252
+ AddParam(chart_opts,"source-label-overlap", "What happens if a span already has a label. 0=add more. 1=replace. 2=discard. Default is 0");
253
+ AddParam(chart_opts,"unknown-lhs", "file containing target lhs of unknown words. 1 per line: LHS prob");
254
+
255
+ po::options_description misc_opts("Miscellaneous Options");
256
+ AddParam(misc_opts,"mira", "do mira training");
257
+ AddParam(misc_opts,"description", "Source language, target language, description");
258
+ AddParam(misc_opts,"no-cache", "Disable all phrase-table caching. Default = false (ie. enable caching)");
259
+ AddParam(misc_opts,"default-non-term-for-empty-range-only", "Don't add [X] to all ranges, just ranges where there isn't a source non-term. Default = false (ie. add [X] everywhere)");
260
+ AddParam(misc_opts,"s2t-parsing-algorithm", "Which S2T parsing algorithm to use. 0=recursive CYK+, 1=scope-3 (default = 0)");
261
+
262
+ //AddParam(o,"continue-partial-translation", "cpt", "start from nonempty hypothesis");
263
+ AddParam(misc_opts,"decoding-graph-backoff", "dpb", "only use subsequent decoding paths for unknown spans of given length");
264
+ AddParam(misc_opts,"references", "Reference file(s) - used for bleu score feature");
265
+ AddParam(misc_opts,"recover-input-path", "r", "(conf net/word lattice only) - recover input path corresponding to the best translation");
266
+ AddParam(misc_opts,"link-param-count", "Number of parameters on word links when using confusion networks or lattices (default = 1)");
267
+ AddParam(misc_opts,"feature-name-overwrite", "Override feature name (NOT arguments). Eg. SRILM-->KENLM, PhraseDictionaryMemory-->PhraseDictionaryScope3");
268
+
269
+ AddParam(misc_opts,"feature", "All the feature functions should be here");
270
+ AddParam(misc_opts,"context-string",
271
+ "A (tokenized) string containing context words for context-sensitive translation.");
272
+ AddParam(misc_opts,"context-weights", "A key-value map for context-sensitive translation.");
273
+ AddParam(misc_opts,"context-window",
274
+ "Context window (in words) for context-sensitive translation: {+|-|+-}<number>.");
275
+
276
+ // Compact phrase table and reordering table.
277
+ po::options_description cpt_opts("Options when using compact phrase and reordering tables.");
278
+ AddParam(cpt_opts,"minphr-memory", "Load phrase table in minphr format into memory");
279
+ AddParam(cpt_opts,"minlexr-memory", "Load lexical reordering table in minlexr format into memory");
280
+
281
+ po::options_description spe_opts("Simulated Post-editing Options");
282
+ AddParam(spe_opts,"spe-src", "Simulated post-editing. Source filename");
283
+ AddParam(spe_opts,"spe-trg", "Simulated post-editing. Target filename");
284
+ AddParam(spe_opts,"spe-aln", "Simulated post-editing. Alignment filename");
285
+
286
+ ///////////////////////////////////////////////////////////////////////////////////////
287
+ // DEPRECATED options
288
+ po::options_description deprec_opts("Deprecated Options");
289
+ AddParam(deprec_opts,"link-param-count", "DEPRECATED. DO NOT USE. Number of parameters on word links when using confusion networks or lattices (default = 1)");
290
+ AddParam(deprec_opts,"weight-slm", "slm", "DEPRECATED. DO NOT USE. weight(s) for syntactic language model");
291
+ AddParam(deprec_opts,"weight-bl", "bl", "DEPRECATED. DO NOT USE. weight for bleu score feature");
292
+ AddParam(deprec_opts,"weight-d", "d", "DEPRECATED. DO NOT USE. weight(s) for distortion (reordering components)");
293
+ AddParam(deprec_opts,"weight-dlm", "dlm", "DEPRECATED. DO NOT USE. weight for discriminative LM feature function (on top of sparse weights)");
294
+ AddParam(deprec_opts,"weight-lr", "lr", "DEPRECATED. DO NOT USE. weight(s) for lexicalized reordering, if not included in weight-d");
295
+ AddParam(deprec_opts,"weight-generation", "g", "DEPRECATED. DO NOT USE. weight(s) for generation components");
296
+ AddParam(deprec_opts,"weight-i", "I", "DEPRECATED. DO NOT USE. weight(s) for word insertion - used for parameters from confusion network and lattice input links");
297
+ AddParam(deprec_opts,"weight-l", "lm", "DEPRECATED. DO NOT USE. weight(s) for language models");
298
+ AddParam(deprec_opts,"weight-lex", "lex", "DEPRECATED. DO NOT USE. weight for global lexical model");
299
+ AddParam(deprec_opts,"weight-glm", "glm", "DEPRECATED. DO NOT USE. weight for global lexical feature, sparse producer");
300
+ AddParam(deprec_opts,"weight-wt", "wt", "DEPRECATED. DO NOT USE. weight for word translation feature");
301
+ AddParam(deprec_opts,"weight-pp", "pp", "DEPRECATED. DO NOT USE. weight for phrase pair feature");
302
+ AddParam(deprec_opts,"weight-pb", "pb", "DEPRECATED. DO NOT USE. weight for phrase boundary feature");
303
+ AddParam(deprec_opts,"weight-t", "tm", "DEPRECATED. DO NOT USE. weights for translation model components");
304
+ AddParam(deprec_opts,"weight-p", "w", "DEPRECATED. DO NOT USE. weight for phrase penalty");
305
+ AddParam(deprec_opts,"weight-w", "w", "DEPRECATED. DO NOT USE. weight for word penalty");
306
+ AddParam(deprec_opts,"weight-u", "u", "DEPRECATED. DO NOT USE. weight for unknown word penalty");
307
+ AddParam(deprec_opts,"weight-e", "e", "DEPRECATED. DO NOT USE. weight for word deletion");
308
+ AddParam(deprec_opts,"text-type", "DEPRECATED. DO NOT USE. should be one of dev/devtest/test, used for domain adaptation features");
309
+ AddParam(deprec_opts,"input-scores", "DEPRECATED. DO NOT USE. 2 numbers on 2 lines - [1] of scores on each edge of a confusion network or lattice input (default=1). [2] Number of 'real' word scores (0 or 1. default=0)");
310
+ AddParam(deprec_opts,"dlm-model", "DEPRECATED. DO NOT USE. Order, factor and vocabulary file for discriminative LM. Use * for filename to indicate unlimited vocabulary.");
311
+ AddParam(deprec_opts,"generation-file", "DEPRECATED. DO NOT USE. location and properties of the generation table");
312
+ AddParam(deprec_opts,"global-lexical-file", "gl", "DEPRECATED. DO NOT USE. discriminatively trained global lexical translation model file");
313
+ AddParam(deprec_opts,"glm-feature", "DEPRECATED. DO NOT USE. discriminatively trained global lexical translation feature, sparse producer");
314
+ AddParam(deprec_opts,"lmodel-file", "DEPRECATED. DO NOT USE. location and properties of the language models");
315
+ AddParam(deprec_opts,"lmodel-dub", "DEPRECATED. DO NOT USE. dictionary upper bounds of language models");
316
+ #ifdef HAVE_SYNLM
317
+ AddParam(deprec_opts,"slmodel-file", "DEPRECATED. DO NOT USE. location of the syntactic language model file(s)");
318
+ AddParam(deprec_opts,"slmodel-factor", "DEPRECATED. DO NOT USE. factor to use with syntactic language model");
319
+ AddParam(deprec_opts,"slmodel-beam", "DEPRECATED. DO NOT USE. beam width to use with syntactic language model's parser");
320
+ #endif
321
+ AddParam(deprec_opts,"ttable-file", "DEPRECATED. DO NOT USE. location and properties of the translation tables");
322
+ AddParam(deprec_opts,"phrase-pair-feature", "DEPRECATED. DO NOT USE. Source and target factors for phrase pair feature");
323
+ AddParam(deprec_opts,"phrase-boundary-source-feature", "DEPRECATED. DO NOT USE. Source factors for phrase boundary feature");
324
+ AddParam(deprec_opts,"phrase-boundary-target-feature", "DEPRECATED. DO NOT USE. Target factors for phrase boundary feature");
325
+ AddParam(deprec_opts,"phrase-length-feature", "DEPRECATED. DO NOT USE. Count features for source length, target length, both of each phrase");
326
+ AddParam(deprec_opts,"target-word-insertion-feature", "DEPRECATED. DO NOT USE. Count feature for each unaligned target word");
327
+ AddParam(deprec_opts,"source-word-deletion-feature", "DEPRECATED. DO NOT USE. Count feature for each unaligned source word");
328
+ AddParam(deprec_opts,"word-translation-feature", "DEPRECATED. DO NOT USE. Count feature for word translation according to word alignment");
329
+
330
+ po::options_description zombie_opts("Zombie Options");
331
+ AddParam(zombie_opts,"distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables");
332
+
333
+
334
+ mbr_opts.add(lmbr_opts);
335
+ search_opts.add(cube_opts);
336
+ search_opts.add(mbr_opts);
337
+ search_opts.add(disto_opts);
338
+ search_opts.add(chart_opts);
339
+
340
+ input_opts.add(spe_opts);
341
+
342
+ output_opts.add(nbest_opts);
343
+ output_opts.add(osg_opts);
344
+
345
+ m_options.add(main_opts);
346
+ m_options.add(server_opts);
347
+ m_options.add(input_opts);
348
+ m_options.add(search_opts);
349
+ m_options.add(output_opts);
350
+ m_options.add(oov_opts);
351
+ m_options.add(factor_opts);
352
+ m_options.add(cpt_opts);
353
+ m_options.add(irstlm_opts);
354
+ m_options.add(tune_opts);
355
+ m_options.add(misc_opts);
356
+ m_options.add(deprec_opts);
357
+ m_options.add(zombie_opts);
358
+
359
+ }
360
+
361
+ Parameter::~Parameter()
362
+ {
363
+ }
364
+
365
+ const PARAM_VEC *Parameter::GetParam(const std::string &paramName) const
366
+ {
367
+ PARAM_MAP::const_iterator iter = m_setting.find( paramName );
368
+ if (iter == m_setting.end()) {
369
+ return NULL;
370
+ } else {
371
+ return &iter->second;
372
+ }
373
+
374
+ }
375
+
376
+ /** initialize a parameter, sub of constructor */
377
+ void
378
+ Parameter::
379
+ AddParam(po::options_description& optgroup,
380
+ string const& paramName,
381
+ string const& description)
382
+ {
383
+ m_valid[paramName] = true;
384
+ m_description[paramName] = description;
385
+ optgroup.add_options()(paramName.c_str(), description.c_str());
386
+ }
387
+
388
+ /** initialize a parameter (including abbreviation), sub of constructor */
389
+ void
390
+ Parameter::
391
+ AddParam(po::options_description& optgroup,
392
+ string const& paramName,
393
+ string const& abbrevName,
394
+ string const& description)
395
+ {
396
+ m_valid[paramName] = true;
397
+ m_valid[abbrevName] = true;
398
+ m_abbreviation[paramName] = abbrevName;
399
+ m_fullname[abbrevName] = paramName;
400
+ m_description[paramName] = description;
401
+ string optname = paramName;
402
+ if (abbrevName.size() == 1) {
403
+ optname += string(",")+abbrevName;
404
+ // m_confusable[abbrevName[0]].insert(paramName);
405
+ }
406
+ optgroup.add_options()(optname.c_str(),description.c_str());
407
+ }
408
+
409
+ /** print descriptions of all parameters */
410
+ void
411
+ Parameter::
412
+ Explain()
413
+ {
414
+ cerr << "Usage:" << endl;
415
+ cerr << m_options << endl;
416
+ // for(PARAM_STRING::const_iterator iterParam = m_description.begin();
417
+ // iterParam != m_description.end(); iterParam++)
418
+ // {
419
+ // const string paramName = iterParam->first;
420
+ // const string paramDescription = iterParam->second;
421
+ // cerr << "\t-" << paramName;
422
+ // PARAM_STRING::const_iterator iterAbbr = m_abbreviation.find( paramName );
423
+ // if ( iterAbbr != m_abbreviation.end() )
424
+ // cerr << " (" << iterAbbr->second << ")";
425
+ // cerr << ": " << paramDescription << endl;
426
+ // }
427
+ }
428
+
429
+ /** check whether an item on the command line is a switch or a value
430
+ * \param token token on the command line to checked **/
431
+
432
+ bool
433
+ Parameter::
434
+ isOption(const char* token)
435
+ {
436
+ if (! token) return false;
437
+ std::string tokenString(token);
438
+ size_t length = tokenString.size();
439
+ if (length <= 1) return false;
440
+ if (!starts_with(tokenString, "-")) return false;
441
+ if (tokenString.substr(1,1).find_first_not_of("0123456789") == 0) return true;
442
+ return false;
443
+ }
444
+
445
+ /** load all parameters from the configuration file and the command line switches */
446
+ bool
447
+ Parameter::
448
+ LoadParam(const string &filePath)
449
+ {
450
+ const char *argv[] = {"executable", "-f", filePath.c_str() };
451
+ return LoadParam(3, (char const**) argv);
452
+ }
453
+
454
/// Print out version information about the things that went into this
/// executable: the Moses commit id plus the versions of the libraries it
/// was built against (availability of each is decided at compile time).
void show_version()
{
  std::cout << "\nMoses code version (git tag or commit hash):\n "
            << MOSES_VERSION_ID << std::endl
            << "Libraries used:" << std::endl
            << " Boost version "
            // BOOST_VERSION encodes major*100000 + minor*100 + patch
            << BOOST_VERSION / 100000 << "." // major version
            << BOOST_VERSION / 100 % 1000 << "." // minor version
            << BOOST_VERSION % 100 // patch level
            << std::endl;
#ifdef HAVE_XMLRPC_C
  unsigned int major, minor, point;
  xmlrpc_server_version(&major, &minor, &point);
  std::cout << " Xmlrpc-c version "
            << major << "." << minor << "." << point << std::endl;
#endif
#ifdef HAVE_CMPH
  // there's no easy way to determine the cmph version at compile time
  std::cout << " CMPH (version unknown)" << std::endl;
#endif

#ifdef MMT_VERSION_ID
  std::cout << string(20,'-')
            << "\nMMT extras version: " << MMT_VERSION_ID << std::endl;
#endif
}
482
+
483
/** load all parameters from the configuration file and the command line switches.
 *  Order matters: read config, apply switch overrides, convert legacy weight
 *  arguments, build the weights map, then validate.
 *  \return false on missing/unreadable config, illegal switches, or failed validation */
bool
Parameter::
LoadParam(int argc, char const* xargv[])
{
  // legacy parameter handling: all parameters are expected
  // to start with a single dash
  // NOTE(review): variable-length array is a compiler extension, not
  // standard C++ — TODO consider std::vector here
  char const* argv[argc+1];
  for (int i = 0; i < argc; ++i) {
    argv[i] = xargv[i];
    // strip one dash from "--opt" so it matches the single-dash convention
    if (strlen(argv[i]) > 2 && argv[i][0] == '-' && argv[i][1] == '-')
      ++argv[i];
    if (!strcmp(argv[i],"-version")) {
      show_version();
      exit(0);
    }
  }

  // config file (-f) arg mandatory
  string configPath;
  if ( (configPath = FindParam("-f", argc, argv)) == ""
       && (configPath = FindParam("-config", argc, argv)) == "") {
    PrintCredit();
    Explain();
    PrintFF();

    cerr << endl;
    cerr << "No configuration file was specified. Use -config or -f";
    cerr << endl;
    return false;
  } else {
    if (!ReadConfigFile(configPath)) {
      std::cerr << "Could not read " << configPath;
      return false;
    }
  }

  // overwrite parameters with values from switches
  for(PARAM_STRING::const_iterator iterParam = m_description.begin();
      iterParam != m_description.end(); iterParam++) {
    const string paramName = iterParam->first;
    OverwriteParam("-" + paramName, paramName, argc, argv);
  }

  // ... also shortcuts (abbreviated switch overwrites the full parameter)
  for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin();
      iterParam != m_abbreviation.end(); iterParam++) {
    const string paramName = iterParam->first;
    const string paramShortName = iterParam->second;
    OverwriteParam("-" + paramShortName, paramName, argc, argv);
  }

  // fold any "feature-add" entries into the regular feature list
  AddFeaturesCmd();

  // logging of parameters that were set in either config or switch
  int verbose = 1;
  if (m_setting.find("verbose") != m_setting.end() &&
      m_setting["verbose"].size() > 0)
    verbose = Scan<int>(m_setting["verbose"][0]);
  if (verbose >= 1) { // only if verbose
    TRACE_ERR( "Defined parameters (per moses.ini or switch):" << endl);
    for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ;
        iterParam != m_setting.end(); iterParam++) {
      TRACE_ERR( "\t" << iterParam->first << ": ");
      for ( size_t i = 0; i < iterParam->second.size(); i++ )
        TRACE_ERR( iterParam->second[i] << " ");
      TRACE_ERR( endl);
    }
  }

  // don't mix old and new format: presence of any legacy weight/model
  // option alongside the new [feature]/[weight] sections is fatal
  if ((GetParam("feature") || GetParam("weight"))
      && (GetParam("weight-slm") || GetParam("weight-bl") || GetParam("weight-d") ||
          GetParam("weight-dlm") || GetParam("weight-lrl") || GetParam("weight-generation") ||
          GetParam("weight-i") || GetParam("weight-l") || GetParam("weight-lex") ||
          GetParam("weight-glm") || GetParam("weight-wt") || GetParam("weight-pp") ||
          GetParam("weight-pb") || GetParam("weight-t") || GetParam("weight-w") ||
          GetParam("weight-p") ||
          GetParam("weight-u") || GetParam("weight-e") ||
          GetParam("dlm-mode") || GetParam("generation-file") || GetParam("global-lexical-file") ||
          GetParam("glm-feature") || GetParam("lmodel-file") || GetParam("lmodel-dub") ||
          GetParam("slmodel-file") || GetParam("slmodel-factor") ||
          GetParam("slmodel-beam") || GetParam("ttable-file") || GetParam("phrase-pair-feature") ||
          GetParam("phrase-boundary-source-feature") || GetParam("phrase-boundary-target-feature") || GetParam("phrase-length-feature") ||
          GetParam("target-word-insertion-feature") || GetParam("source-word-deletion-feature") || GetParam("word-translation-feature")
         )
     ) {
    UTIL_THROW(util::Exception, "Don't mix old and new ini file format");
  }

  // convert old weights args to new format
  if (GetParam("feature") == NULL) {
    ConvertWeightArgs();
  }
  CreateWeightsMap();
  WeightOverwrite();

  // check for illegal parameters
  bool noErrorFlag = true;
  for (int i = 0 ; i < argc ; i++) {
    if (isOption(argv[i])) {
      string paramSwitch = (string) argv[i];
      string paramName = paramSwitch.substr(1);
      if (m_valid.find(paramName) == m_valid.end()) {
        std::cerr << "illegal switch: " << paramSwitch;
        noErrorFlag = false;
      }
    }
  }

  //Save("/tmp/moses.ini.new");

  // check if parameters make sense
  return Validate() && noErrorFlag;
}
600
+
601
+ void
602
+ Parameter::
603
+ AddFeaturesCmd()
604
+ {
605
+ const PARAM_VEC *params = GetParam("feature-add");
606
+ if (params) {
607
+ PARAM_VEC::const_iterator iter;
608
+ for (iter = params->begin(); iter != params->end(); ++iter) {
609
+ const string &line = *iter;
610
+ AddFeature(line);
611
+ }
612
+
613
+ m_setting.erase("feature-add");
614
+ }
615
+ }
616
+
617
+ std::vector<float>
618
+ Parameter::
619
+ GetWeights(const std::string &name)
620
+ {
621
+ std::vector<float> ret = m_weights[name];
622
+
623
+ // cerr << "WEIGHT " << name << "=";
624
+ // for (size_t i = 0; i < ret.size(); ++i) {
625
+ // cerr << ret[i] << ",";
626
+ // }
627
+ // cerr << endl;
628
+ return ret;
629
+ }
630
+
631
+ void
632
+ Parameter::
633
+ SetWeight(const std::string &name, size_t ind, float weight)
634
+ {
635
+ PARAM_VEC &newWeights = m_setting["weight"];
636
+ string line = name + SPrint(ind) + "= " + SPrint(weight);
637
+ newWeights.push_back(line);
638
+ }
639
+
640
+ void Parameter::SetWeight(const std::string &name, size_t ind, const vector<float> &weights)
641
+ {
642
+ PARAM_VEC &newWeights = m_setting["weight"];
643
+ string line = name + SPrint(ind) + "=";
644
+
645
+ for (size_t i = 0; i < weights.size(); ++i) {
646
+ line += " " + SPrint(weights[i]);
647
+ }
648
+ newWeights.push_back(line);
649
+ }
650
+
651
+ void
652
+ Parameter::
653
+ AddWeight(const std::string &name, size_t ind,
654
+ const std::vector<float> &weights)
655
+ {
656
+ PARAM_VEC &newWeights = m_setting["weight"];
657
+
658
+ string sought = name + SPrint(ind) + "=";
659
+ for (size_t i = 0; i < newWeights.size(); ++i) {
660
+ string &line = newWeights[i];
661
+ if (line.find(sought) == 0) {
662
+ // found existing weight, most likely to be input weights. Append to this line
663
+ for (size_t i = 0; i < weights.size(); ++i) {
664
+ line += " " + SPrint(weights[i]);
665
+ }
666
+ return;
667
+ }
668
+ }
669
+
670
+ // nothing found. Just set
671
+ SetWeight(name, ind, weights);
672
+ }
673
+
674
+ void
675
+ Parameter::
676
+ ConvertWeightArgsSingleWeight(const string &oldWeightName, const string &newWeightName)
677
+ {
678
+ size_t ind = 0;
679
+ PARAM_MAP::iterator iterMap;
680
+
681
+ iterMap = m_setting.find(oldWeightName);
682
+ if (iterMap != m_setting.end()) {
683
+ const PARAM_VEC &weights = iterMap->second;
684
+ for (size_t i = 0; i < weights.size(); ++i) {
685
+ SetWeight(newWeightName, ind, Scan<float>(weights[i]));
686
+ }
687
+
688
+ m_setting.erase(iterMap);
689
+ }
690
+ }
691
+
692
/** Convert old-style phrase-table options ("ttable-file", "ttable-limit",
 *  "weight-i" and the given old weight switch) into new-style [feature]
 *  and [weight] entries, one per translation table. The old parameters are
 *  erased afterwards. */
void
Parameter::
ConvertWeightArgsPhraseModel(const string &oldWeightName)
{
  const PARAM_VEC *params;

  // process input weights 1st
  params = GetParam("weight-i");
  if (params) {
    vector<float> inputWeights = Scan<float>(*params);
    PARAM_VEC &numInputScores = m_setting["input-scores"];
    // 1 weight = dense input score only; 2 = score + real-word count
    if (inputWeights.size() == 1) {
      UTIL_THROW_IF2(numInputScores.size() != 0, "No [input-scores] section allowed");
      numInputScores.push_back("1");
      numInputScores.push_back("0");
    } else if (inputWeights.size() == 2) {
      UTIL_THROW_IF2(numInputScores.size() != 0, "No [input-scores] section allowed");
      numInputScores.push_back("1");
      numInputScores.push_back("1");
    }

    SetWeight("PhraseDictionaryBinary", 0, inputWeights);
  }

  // convert actually pt feature
  VERBOSE(2,"Creating phrase table features" << endl);

  size_t numInputScores = 0;
  size_t numRealWordsInInput = 0;
  map<string, size_t> ptIndices;   // per-type running instance index

  params = GetParam("input-scores");
  if (params) {
    numInputScores = Scan<size_t>(params->at(0));

    if (params->size() > 1) {
      numRealWordsInInput = Scan<size_t>(params->at(1));
    }
  }

  // load phrase translation tables
  params = GetParam("ttable-file");
  if (params) {
    // weights
    const vector<string> translationVector = *params;

    vector<size_t> maxTargetPhrase;
    params = GetParam("ttable-limit");
    if (params) {
      maxTargetPhrase = Scan<size_t>(*params);
    }

    // a single ttable-limit is replicated to every table
    if(maxTargetPhrase.size() == 1 && translationVector.size() > 1) {
      VERBOSE(1, "Using uniform ttable-limit of " << maxTargetPhrase[0] << " for all translation tables." << endl);
      for(size_t i = 1; i < translationVector.size(); i++)
        maxTargetPhrase.push_back(maxTargetPhrase[0]);
    } else if(maxTargetPhrase.size() != 1 && maxTargetPhrase.size() < translationVector.size()) {
      std::cerr << "You specified " << translationVector.size() << " translation tables, but only " << maxTargetPhrase.size() << " ttable-limits.";
      return;
    }

    // MAIN LOOP
    const PARAM_VEC &oldWeights = m_setting[oldWeightName];

    // consume old weights sequentially across tables
    size_t currOldInd = 0;
    for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) {
      util::StringStream ptLine;

      vector<string> token = Tokenize(translationVector[currDict]);

      if(currDict == 0 && token.size() == 4) {
        std::cerr << "Phrase table specification in old 4-field format. No longer supported";
        return;
      }
      UTIL_THROW_IF2(token.size() < 5, "Phrase table must have at least 5 scores");

      int implementation = Scan<int>(token[0]);

      // map the legacy numeric implementation id to the feature class name
      string ptType;
      switch (implementation) {
      case 0: // Memory
        ptType = "PhraseDictionaryMemory";
        break;
      case 1: // Binary
        ptType = "PhraseDictionaryBinary";
        break;
      case 2: // OnDisk
        ptType = "PhraseDictionaryOnDisk";
        break;
      case 6: // SCFG
        ptType = "PhraseDictionaryMemory";
        break;
      case 12: // Compact
        ptType = "PhraseDictionaryCompact";
        break;
      case 8: // SuffixArray
        ptType = "PhraseDictionarySuffixArray";
        break;
      case 14: // DSuffixArray
        ptType = "PhraseDictionaryDynSuffixArray";
        break;
      case 15: // DCacheBased:
        ptType = "PhraseDictionaryDynamicCacheBased";
        break;
      case 16: // CachePT:
        ptType = "PhraseDictionaryCache";
        break;
      default:
        break;
      }

      // instance index per feature type (0 for the first of each type)
      size_t ptInd;
      if (ptIndices.find(ptType) == ptIndices.end()) {
        ptIndices[ptType] = 0;
        ptInd = 0;
      } else {
        ptInd = ++ptIndices[ptType];
      }

      // weights
      size_t numFFInd = (token.size() == 4) ? 2 : 3;
      size_t numFF = Scan<size_t>(token[numFFInd]);

      vector<float> weights(numFF);
      for (size_t currFF = 0; currFF < numFF; ++currFF) {
        UTIL_THROW_IF2(currOldInd >= oldWeights.size(),
                       "Errors converting old phrase-table weights to new weights");
        float weight = Scan<float>(oldWeights[currOldInd]);
        weights[currFF] = weight;

        ++currOldInd;
      }

      AddWeight(ptType, ptInd, weights);

      // actual pt
      ptLine << ptType << " ";
      ptLine << "input-factor=" << token[1] << " ";
      ptLine << "output-factor=" << token[2] << " ";
      ptLine << "path=" << token[4] << " ";

      //characteristics of the phrase table

      vector<FactorType> input = Tokenize<FactorType>(token[1], ",")
                                 ,output = Tokenize<FactorType>(token[2], ",");
      size_t numScoreComponent = Scan<size_t>(token[3]);
      string filePath= token[4];

      if(currDict==0) {
        // only the 1st pt. THis is shit
        // TODO. find what the assumptions made by confusion network about phrase table output which makes
        // it only work with binary file. This is a hack
        numScoreComponent += numInputScores + numRealWordsInInput;
      }

      ptLine << "num-features=" << numScoreComponent << " ";
      ptLine << "table-limit=" << maxTargetPhrase[currDict] << " ";

      // suffix-array variants carry two extra path fields
      if (implementation == 8 || implementation == 14) {
        ptLine << "target-path=" << token[5] << " ";
        ptLine << "alignment-path=" << token[6] << " ";
      }

      AddFeature(ptLine.str());
    } // for(size_t currDict = 0 ; currDict < translationVector.size(); currDict++) {
  } // if (GetParam("ttable-file").size() > 0) {

  m_setting.erase("weight-i");
  m_setting.erase(oldWeightName);
  m_setting.erase("ttable-file");
  m_setting.erase("ttable-limit");

}
867
+
868
+ void
869
+ Parameter::
870
+ AddFeature(const std::string &line)
871
+ {
872
+ PARAM_VEC &features = m_setting["feature"];
873
+ features.push_back(line);
874
+ }
875
+
876
/** Convert old-style distortion options to the new format: the first
 *  "weight-d" value becomes the distance-based Distortion feature weight
 *  (phrase-based search only), and each "distortion-file" entry becomes a
 *  LexicalReordering feature consuming the following "weight-d" values. */
void
Parameter::
ConvertWeightArgsDistortion()
{
  const string oldWeightName = "weight-d";
  const string oldLexReordingName = "distortion-file";

  // distortion / lex distortion
  const PARAM_VEC *oldWeights = GetParam(oldWeightName);

  if (oldWeights) {
    const PARAM_VEC *searchAlgo = GetParam("search-algorithm");
    // search-algorithm 0/1 (or unset) means phrase-based decoding
    if (searchAlgo == NULL ||
        (searchAlgo->size() > 0
         && (Trim(searchAlgo->at(0)) == "0" || Trim(searchAlgo->at(0)) == "1")
        )
       ) {
      // phrase-based. Add distance distortion to list of features
      AddFeature("Distortion");
      SetWeight("Distortion", 0, Scan<float>(oldWeights->at(0)));
    }

    // everything but the last is lex reordering model

    size_t currOldInd = 1;   // index 0 was the distance-distortion weight
    const PARAM_VEC *lextable = GetParam(oldLexReordingName);

    for (size_t indTable = 0; lextable && indTable < lextable->size(); ++indTable) {
      // each entry: factors, type, num-features, path
      const string &line = lextable->at(indTable);
      vector<string> toks = Tokenize(line);

      size_t numFF = Scan<size_t>(toks[2]);

      vector<float> weights(numFF);
      for (size_t currFF = 0; currFF < numFF; ++currFF) {
        UTIL_THROW_IF2(oldWeights && currOldInd >= oldWeights->size(),
                       "Errors converting old distortion weights to new weights");
        float weight = Scan<float>(oldWeights->at(currOldInd));
        weights[currFF] = weight;

        ++currOldInd;
      }
      SetWeight("LexicalReordering", indTable, weights);

      util::StringStream strme;
      strme << "LexicalReordering "
            << "type=" << toks[1] << " ";

      // old factor spec is "input-output", e.g. "0-0"
      vector<FactorType> factors = Tokenize<FactorType>(toks[0], "-");
      UTIL_THROW_IF2(factors.size() != 2,
                     "Error in old factor specification for lexicalized reordering model: "
                     << toks[0]);
      strme << "input-factor=" << factors[0]
            << " output-factor=" << factors[1] << " ";

      strme << "num-features=" << toks[2] << " ";
      strme << "path=" << toks[3];

      AddFeature(strme.str());
    }
  }

  m_setting.erase(oldWeightName);
  m_setting.erase(oldLexReordingName);

}
942
+
943
/** Convert old-style language-model options ("lmodel-file", "weight-l",
 *  "lmodel-oov-feature") into new-style LM features and weights. For chart
 *  decoding the weights are rescaled via UntransformLMScore. */
void
Parameter::
ConvertWeightArgsLM()
{
  const string oldWeightName = "weight-l";
  const string oldFeatureName = "lmodel-file";
  const PARAM_VEC *params;

  bool isChartDecoding = true;

  // search-algorithm 0/1 (or unset) means phrase-based, not chart
  params = GetParam("search-algorithm");
  if (params == NULL ||
      (params->size() > 0
       && (Trim(params->at(0)) == "0" || Trim(params->at(0)) == "1")
      )
     ) {
    isChartDecoding = false;
  }

  // per-LM flag: does this LM carry an extra OOV feature?
  vector<int> oovWeights;
  params = GetParam("lmodel-oov-feature");
  if (params) {
    oovWeights = Scan<int>(*params);
  }

  PARAM_MAP::iterator iterMap;

  iterMap = m_setting.find(oldWeightName);
  if (iterMap != m_setting.end()) {

    // old weights are consumed sequentially across all LMs
    size_t currOldInd = 0;
    const PARAM_VEC &weights = iterMap->second;
    const PARAM_VEC &models = m_setting[oldFeatureName];
    for (size_t lmIndex = 0; lmIndex < models.size(); ++lmIndex) {
      // each entry: type factor order path
      const string &line = models[lmIndex];
      vector<string> modelToks = Tokenize(line);

      int lmType = Scan<int>(modelToks[0]);

      // map the legacy numeric LM type to the feature class name
      string newFeatureName;
      switch (lmType) {
      case 0:
        newFeatureName = "SRILM";
        break;
      case 1:
        newFeatureName = "IRSTLM";
        break;
      case 8:
      case 9:
        newFeatureName = "KENLM";
        break;
      default:
        UTIL_THROW2("Unkown language model type id:" << lmType);
      }

      // one dense score, plus one OOV feature if configured for this LM
      size_t numFF = 1;
      if (oovWeights.size() > lmIndex)
        numFF += oovWeights[lmIndex];

      vector<float> weightsLM(numFF);
      for (size_t currFF = 0; currFF < numFF; ++currFF) {
        UTIL_THROW_IF2(currOldInd >= weights.size(),
                       "Errors converting old LM weights to new weights");
        weightsLM[currFF] = Scan<float>(weights[currOldInd]);
        if (isChartDecoding) {
          weightsLM[currFF] = UntransformLMScore(weightsLM[currFF]);
        }

        ++currOldInd;
      }

      SetWeight(newFeatureName, lmIndex, weightsLM);

      string featureLine = newFeatureName + " "
                           + "factor=" + modelToks[1] + " " // factor
                           + "order=" + modelToks[2] + " " // order
                           + "num-features=" + SPrint(numFF) + " ";
      // type 9 is the lazy-loading KenLM variant
      if (lmType == 9) {
        featureLine += "load=lazy ";
      }

      if(oovWeights.size() > lmIndex)
        featureLine += "oov-feature=1 ";

      featureLine += "path=" + modelToks[3]; // file

      AddFeature(featureLine);
    } // for (size_t lmIndex = 0; lmIndex < models.size(); ++lmIndex) {

    m_setting.erase(iterMap);
  }

  m_setting.erase(oldFeatureName);
}
1037
+
1038
/** Convert old-style generation options ("generation-file" plus the given
 *  weight switch) into new-style Generation features and weights; the old
 *  parameters are erased afterwards. */
void
Parameter::
ConvertWeightArgsGeneration(const std::string &oldWeightName, const std::string &newWeightName)
{
  string oldFeatureName = "generation-file";

  // distortion / lex distortion
  PARAM_VEC &oldWeights = m_setting[oldWeightName];

  if (oldWeights.size() > 0) {
    // old weights are consumed sequentially across all generation tables
    size_t currOldInd = 0;
    PARAM_VEC &models = m_setting[oldFeatureName];

    for (size_t indTable = 0; indTable < models.size(); ++indTable) {
      // each entry: input-factor output-factor num-features path
      string &line = models[indTable];
      vector<string> modelToks = Tokenize(line);

      size_t numFF = Scan<size_t>(modelToks[2]);

      vector<float> weights(numFF);
      for (size_t currFF = 0; currFF < numFF; ++currFF) {
        UTIL_THROW_IF2(currOldInd >= oldWeights.size(),
                       "Errors converting old generation weights to new weights");
        float weight = Scan<float>(oldWeights[currOldInd]);
        weights[currFF] = weight;

        ++currOldInd;
      }
      SetWeight(newWeightName, indTable, weights);

      util::StringStream strme;
      strme << "Generation "
            << "input-factor=" << modelToks[0] << " "
            << "output-factor=" << modelToks[1] << " "
            << "num-features=" << modelToks[2] << " "
            << "path=" << modelToks[3];
      AddFeature(strme.str());
    }
  }

  m_setting.erase(oldWeightName);
  m_setting.erase(oldFeatureName);
}
1081
+
1082
+ void
1083
+ Parameter::
1084
+ ConvertWeightArgsWordPenalty()
1085
+ {
1086
+ const std::string oldWeightName = "weight-w";
1087
+ const std::string newWeightName = "WordPenalty";
1088
+
1089
+ bool isChartDecoding = true;
1090
+ const PARAM_VEC *searchAlgo = GetParam("search-algorithm");
1091
+ if (searchAlgo == NULL ||
1092
+ (searchAlgo->size() > 0
1093
+ && (Trim(searchAlgo->at(0)) == "0" || Trim(searchAlgo->at(0)) == "1")
1094
+ )
1095
+ ) {
1096
+ isChartDecoding = false;
1097
+ }
1098
+
1099
+ PARAM_MAP::iterator iterMap;
1100
+
1101
+ iterMap = m_setting.find(oldWeightName);
1102
+ if (iterMap != m_setting.end()) {
1103
+ const PARAM_VEC &weights = iterMap->second;
1104
+ for (size_t i = 0; i < weights.size(); ++i) {
1105
+ float weight = Scan<float>(weights[i]);
1106
+ if (isChartDecoding) {
1107
+ weight *= 0.434294482;
1108
+ }
1109
+ SetWeight(newWeightName, i, weight);
1110
+ }
1111
+
1112
+ m_setting.erase(iterMap);
1113
+ }
1114
+
1115
+ }
1116
+
1117
+ void
1118
+ Parameter::
1119
+ ConvertPhrasePenalty()
1120
+ {
1121
+ string oldWeightName = "weight-p";
1122
+ const PARAM_VEC *params = GetParam(oldWeightName);
1123
+ if (params) {
1124
+ UTIL_THROW_IF2(params->size() != 1,
1125
+ "There should be only 1 phrase-penalty weight");
1126
+ float weight = Scan<float>(params->at(0));
1127
+ AddFeature("PhrasePenalty");
1128
+ SetWeight("PhrasePenalty", 0, weight);
1129
+
1130
+ m_setting.erase(oldWeightName);
1131
+ }
1132
+ }
1133
+
1134
/** Drive the conversion of all old-style weight switches to the new
 *  [feature]/[weight] format. Call order is significant: AddFeature
 *  appends to the [feature] list, so it determines the order of the
 *  generated feature lines. */
void
Parameter::
ConvertWeightArgs()
{
  // can't handle discr LM. must do it manually 'cos of bigram/n-gram split
  UTIL_THROW_IF2( m_setting.count("weight-dlm") != 0,
                  "Can't handle discr LM. must do it manually 'cos of bigram/n-gram split");

  // check that old & new format aren't mixed
  if (m_setting.count("weight") &&
      (m_setting.count("weight-i") || m_setting.count("weight-t") || m_setting.count("weight-w") ||
       m_setting.count("weight-l") || m_setting.count("weight-u") || m_setting.count("weight-lex") ||
       m_setting.count("weight-generation") || m_setting.count("weight-lr") || m_setting.count("weight-d")
      )) {
    // NOTE(review): this only warns, unlike the hard throw in LoadParam
    cerr << "Do not mix old and new format for specify weights";
  }

  ConvertWeightArgsWordPenalty();
  ConvertWeightArgsLM();
  ConvertWeightArgsSingleWeight("weight-slm", "SyntacticLM");
  ConvertWeightArgsSingleWeight("weight-u", "UnknownWordPenalty");
  ConvertWeightArgsGeneration("weight-generation", "Generation");
  ConvertWeightArgsDistortion();

  // don't know or can't be bothered converting these weights
  ConvertWeightArgsSingleWeight("weight-lr", "LexicalReordering");
  ConvertWeightArgsSingleWeight("weight-bl", "BleuScoreFeature");
  ConvertWeightArgsSingleWeight("weight-glm", "GlobalLexicalModel");
  ConvertWeightArgsSingleWeight("weight-wt", "WordTranslationFeature");
  ConvertWeightArgsSingleWeight("weight-pp", "PhrasePairFeature");
  ConvertWeightArgsSingleWeight("weight-pb", "PhraseBoundaryFeature");

  ConvertWeightArgsSingleWeight("weight-e", "WordDeletion"); // TODO Can't find real name
  ConvertWeightArgsSingleWeight("weight-lex", "GlobalLexicalReordering"); // TODO Can't find real name

  ConvertPhrasePenalty();

  // these two features are always present in the converted config
  AddFeature("WordPenalty");
  AddFeature("UnknownWordPenalty");

  ConvertWeightArgsPhraseModel("weight-t");

}
1177
+
1178
+ void
1179
+ Parameter::
1180
+ CreateWeightsMap()
1181
+ {
1182
+ CreateWeightsMap(m_setting["weight-add"]);
1183
+ CreateWeightsMap(m_setting["weight"]);
1184
+ }
1185
+
1186
+ void
1187
+ Parameter::
1188
+ CreateWeightsMap(const PARAM_VEC &vec)
1189
+ {
1190
+ for (size_t i = 0; i < vec.size(); ++i) {
1191
+ const string &line = vec[i];
1192
+ vector<string> toks = Tokenize(line);
1193
+ UTIL_THROW_IF2(toks.size() < 2,
1194
+ "Error in format of weights: " << line);
1195
+
1196
+ string name = toks[0];
1197
+ name = name.substr(0, name.size() - 1);
1198
+
1199
+ vector<float> weights(toks.size() - 1);
1200
+ for (size_t i = 1; i < toks.size(); ++i) {
1201
+ float weight = Scan<float>(toks[i]);
1202
+ weights[i - 1] = weight;
1203
+ }
1204
+ m_weights[name] = weights;
1205
+ }
1206
+ }
1207
+
1208
/** Apply the single-line "weight-overwrite" parameter on top of m_weights.
 *  The line is a token stream "Name0= w1 w2 Name1= w3 ..."; a token ending
 *  in '=' starts a new feature, other tokens are weights. The special
 *  token "x" keeps the existing weight at that position. */
void
Parameter::
WeightOverwrite()
{
  PARAM_VEC &vec = m_setting["weight-overwrite"];

  if (vec.size() == 0)
    return;

  // should only be on 1 line
  UTIL_THROW_IF2(vec.size() != 1,
                 "weight-overwrite should only be on 1 line");

  string name("");              // feature currently being collected
  vector<float> weights;        // its new weights
  vector<string> toks = Tokenize(vec[0]);
  size_t cnt = 0;               // position within the current feature's weights
  const std::vector<float>* oldWeights = NULL;  // previous weights, for "x"
  for (size_t i = 0; i < toks.size(); ++i) {
    const string &tok = toks[i];

    if (ends_with(tok, "=")) {
      // start of new feature

      if (name != "") {
        // save previous ff
        m_weights[name] = weights;
        weights.clear();
      }

      name = tok.substr(0, tok.size() - 1);
      std::map<std::string, std::vector<float> >::const_iterator found = m_weights.find(name);
      if (found!=m_weights.end()) {
        oldWeights = &(found->second);
      } else {
        oldWeights = NULL;
      }
      cnt = 0;
    } else {
      // a weight for curr ff
      if (toks[i] == "x") {
        // "x" means: keep the previous weight at this position
        UTIL_THROW_IF2(!oldWeights || cnt>=oldWeights->size(),
                       "Keeping previous weight failed in weight-overwrite");
        weights.push_back(oldWeights->at(cnt));
      } else {
        float weight = Scan<float>(toks[i]);
        weights.push_back(weight);
      }
      ++cnt;
    }
  }

  // flush the last feature collected
  if (name != "") {
    m_weights[name] = weights;
  }

}
1265
+
1266
/** check that parameter settings make sense: no unknown parameters,
 *  lmodel-dub count matches lmodel-file count, and the referenced input /
 *  generation / distortion files exist.
 *  NOTE(review): the m_setting["..."] lookups below use map operator[],
 *  which inserts empty entries for absent keys (assuming PARAM_MAP is a
 *  std::map) — presumably harmless, but worth confirming.
 *  \return true iff all checks pass */
bool
Parameter::
Validate()
{
  bool noErrorFlag = true;

  // every configured parameter must have been registered via AddParam
  PARAM_MAP::const_iterator iterParams;
  for (iterParams = m_setting.begin(); iterParams != m_setting.end(); ++iterParams) {
    const std::string &key = iterParams->first;

    if (m_valid.find(key) == m_valid.end()) {
      std::cerr << "Unknown parameter " << key;
      noErrorFlag = false;
    }
  }

  if (m_setting["lmodel-dub"].size() > 0) {
    if (m_setting["lmodel-file"].size() != m_setting["lmodel-dub"].size()) {
      std::cerr << "Config and parameters specify "
                << static_cast<int>(m_setting["lmodel-file"].size())
                << " language model files (lmodel-file), but "
                << static_cast<int>(m_setting["lmodel-dub"].size())
                << " LM upperbounds (lmodel-dub)"
                << endl;
      noErrorFlag = false;
    }
  }

  // do files exist?

  // input file
  if (noErrorFlag && m_setting["input-file"].size() == 1) {
    noErrorFlag = FileExists(m_setting["input-file"][0]);
    if (!noErrorFlag) {
      std::cerr << endl << "Input file " << m_setting["input-file"][0] << " does not exist";
    }
  }
  // generation tables
  if (noErrorFlag) {
    std::vector<std::string> ext;
    //raw tables in either un compressed or compressed form
    ext.push_back("");
    ext.push_back(".gz");
    noErrorFlag = FilesExist("generation-file", 3, ext);
  }
  // distortion
  if (noErrorFlag) {
    std::vector<std::string> ext;
    //raw tables in either un compressed or compressed form
    ext.push_back("");
    ext.push_back(".gz");
    //prefix tree format
    ext.push_back(".binlexr.idx");
    //prefix tree format
    ext.push_back(".minlexr");
    noErrorFlag = FilesExist("distortion-file", 3, ext);
  }
  return noErrorFlag;
}
1326
+
1327
+ /** check whether a file exists */
1328
+ bool
1329
+ Parameter::
1330
+ FilesExist(const string &paramName, int fieldNo,
1331
+ std::vector<std::string> const& extensions)
1332
+ {
1333
+ typedef std::vector<std::string> StringVec;
1334
+ StringVec::const_iterator iter;
1335
+
1336
+ PARAM_MAP::const_iterator iterParam = m_setting.find(paramName);
1337
+ if (iterParam == m_setting.end()) {
1338
+ // no param. therefore nothing to check
1339
+ return true;
1340
+ }
1341
+ const StringVec &pathVec = (*iterParam).second;
1342
+ for (iter = pathVec.begin() ; iter != pathVec.end() ; ++iter) {
1343
+ StringVec vec = Tokenize(*iter);
1344
+
1345
+ size_t tokenizeIndex;
1346
+ if (fieldNo == -1)
1347
+ tokenizeIndex = vec.size() - 1;
1348
+ else
1349
+ tokenizeIndex = static_cast<size_t>(fieldNo);
1350
+
1351
+ if (tokenizeIndex >= vec.size()) {
1352
+ std::cerr << "Expected at least " << (tokenizeIndex+1) << " tokens per entry in '"
1353
+ << paramName << "', but only found "
1354
+ << vec.size();
1355
+ return false;
1356
+ }
1357
+ const string &pathStr = vec[tokenizeIndex];
1358
+
1359
+ bool fileFound=0;
1360
+ for(size_t i=0; i<extensions.size() && !fileFound; ++i) {
1361
+ fileFound|=FileExists(pathStr + extensions[i]);
1362
+ }
1363
+ if(!fileFound) {
1364
+ std::cerr << "File " << pathStr << " does not exist";
1365
+ return false;
1366
+ }
1367
+ }
1368
+ return true;
1369
+ }
1370
+
1371
+ /** look for a switch in arg, update parameter */
1372
+ // TODO arg parsing like this does not belong in the library, it belongs
1373
+ // in moses-cmd
1374
+ string
1375
+ Parameter::
1376
+ FindParam(const string &paramSwitch, int argc, char const* argv[])
1377
+ {
1378
+ for (int i = 0 ; i < argc ; i++) {
1379
+ if (string(argv[i]) == paramSwitch) {
1380
+ if (i+1 < argc) {
1381
+ return argv[i+1];
1382
+ } else {
1383
+ std::cerr << "Option " << paramSwitch << " requires a parameter!";
1384
+ // TODO return some sort of error, not the empty string
1385
+ }
1386
+ }
1387
+ }
1388
+ return "";
1389
+ }
1390
+
1391
+ /** update parameter settings with command line switches
1392
+ * \param paramSwitch (potentially short) name of switch
1393
+ * \param paramName full name of parameter
1394
+ * \param argc number of arguments on command line
1395
+ * \param argv values of paramters on command line */
1396
+ void
1397
+ Parameter::
1398
+ OverwriteParam(const string &paramSwitch, const string &paramName,
1399
+ int argc, char const* argv[])
1400
+ {
1401
+ int startPos = -1;
1402
+ for (int i = 0 ; i < argc ; i++) {
1403
+ if (string(argv[i]) == paramSwitch) {
1404
+ startPos = i+1;
1405
+ break;
1406
+ }
1407
+ }
1408
+ if (startPos < 0)
1409
+ return;
1410
+
1411
+ int index = 0;
1412
+ m_setting[paramName]; // defines the parameter, important for boolean switches
1413
+ while (startPos < argc && (!isOption(argv[startPos]))) {
1414
+ if (m_setting[paramName].size() > (size_t)index)
1415
+ m_setting[paramName][index] = argv[startPos];
1416
+ else
1417
+ m_setting[paramName].push_back(argv[startPos]);
1418
+ index++;
1419
+ startPos++;
1420
+ }
1421
+ }
1422
+
1423
+
1424
+ /** read parameters from a configuration file */
1425
+ bool
1426
+ Parameter::
1427
+ ReadConfigFile(const string &filePath )
1428
+ {
1429
+ InputFileStream inFile(filePath);
1430
+ string line, paramName;
1431
+ while(getline(inFile, line)) {
1432
+ // comments
1433
+ size_t comPos = line.find_first_of("#");
1434
+ if (comPos != string::npos)
1435
+ line = line.substr(0, comPos);
1436
+ // trim leading and trailing spaces/tabs
1437
+ line = Trim(line);
1438
+
1439
+ if (line.size() == 0) {
1440
+ // blank line. do nothing.
1441
+ } else if (line[0]=='[') {
1442
+ // new parameter
1443
+ for (size_t currPos = 0 ; currPos < line.size() ; currPos++) {
1444
+ if (line[currPos] == ']') {
1445
+ paramName = line.substr(1, currPos - 1);
1446
+ break;
1447
+ }
1448
+ }
1449
+ } else {
1450
+ // add value to parameter
1451
+ m_setting[paramName].push_back(line);
1452
+ }
1453
+ }
1454
+ return true;
1455
+ }
1456
+
1457
+ struct Credit {
1458
+ string name, contact, currentPursuits, areaResponsibility;
1459
+ int sortId;
1460
+
1461
+ Credit(string name, string contact, string currentPursuits, string areaResponsibility) {
1462
+ this->name = name ;
1463
+ this->contact = contact ;
1464
+ this->currentPursuits = currentPursuits ;
1465
+ this->areaResponsibility = areaResponsibility;
1466
+ this->sortId = util::rand_excl(1000);
1467
+ }
1468
+
1469
+ bool operator<(const Credit &other) const {
1470
+ /*
1471
+ if (areaResponsibility.size() != 0 && other.areaResponsibility.size() ==0)
1472
+ return true;
1473
+ if (areaResponsibility.size() == 0 && other.areaResponsibility.size() !=0)
1474
+ return false;
1475
+
1476
+ return name < other.name;
1477
+ */
1478
+ return sortId < other.sortId;
1479
+ }
1480
+
1481
+ };
1482
+
1483
+ std::ostream& operator<<(std::ostream &os, const Credit &credit)
1484
+ {
1485
+ os << credit.name;
1486
+ if (credit.contact != "")
1487
+ os << "\t contact: " << credit.contact;
1488
+ if (credit.currentPursuits != "")
1489
+ os << " " << credit.currentPursuits;
1490
+ if (credit.areaResponsibility != "")
1491
+ os << " I'll answer question on: " << credit.areaResponsibility;
1492
+ return os;
1493
+ }
1494
+
1495
+ void
1496
+ Parameter::
1497
+ PrintCredit()
1498
+ {
1499
+ vector<Credit> everyone;
1500
+ srand ( time(NULL) );
1501
+
1502
+ everyone.push_back(Credit("Nicola Bertoldi"
1503
+ , "911"
1504
+ , ""
1505
+ , "scripts & other stuff"));
1506
+ everyone.push_back(Credit("Ondrej Bojar"
1507
+ , ""
1508
+ , "czech this out!"
1509
+ , ""));
1510
+ everyone.push_back(Credit("Chris Callison-Burch"
1511
+ , "anytime, anywhere"
1512
+ , "international playboy"
1513
+ , ""));
1514
+ everyone.push_back(Credit("Alexandra Constantin"
1515
+ , ""
1516
+ , "eu sunt varza"
1517
+ , ""));
1518
+ everyone.push_back(Credit("Brooke Cowan"
1519
+ , "brooke@csail.mit.edu"
1520
+ , "if you're going to san francisco, be sure to wear a flower in your hair"
1521
+ , ""));
1522
+ everyone.push_back(Credit("Chris Dyer"
1523
+ , "can't. i'll be out driving my mustang"
1524
+ , "driving my mustang"
1525
+ , ""));
1526
+ everyone.push_back(Credit("Marcello Federico"
1527
+ , "federico at itc at it"
1528
+ , "Researcher at ITC-irst, Trento, Italy"
1529
+ , "IRST language model"));
1530
+ everyone.push_back(Credit("Evan Herbst"
1531
+ , "Small college in upstate New York"
1532
+ , ""
1533
+ , ""));
1534
+ everyone.push_back(Credit("Philipp Koehn"
1535
+ , "only between 2 and 4am"
1536
+ , ""
1537
+ , "Nothing fazes this dude"));
1538
+ everyone.push_back(Credit("Christine Moran"
1539
+ , "weird building at MIT"
1540
+ , ""
1541
+ , ""));
1542
+ everyone.push_back(Credit("Wade Shen"
1543
+ , "via morse code"
1544
+ , "buying another laptop"
1545
+ , ""));
1546
+ everyone.push_back(Credit("Richard Zens"
1547
+ , "richard at aachen dot de"
1548
+ , ""
1549
+ , "ambiguous source input, confusion networks, confusing source code"));
1550
+ everyone.push_back(Credit("Hieu Hoang", "http://www.hoang.co.uk/hieu/"
1551
+ , "phd student at Edinburgh Uni. Original Moses developer"
1552
+ , "general queries/ flames on Moses."));
1553
+
1554
+ sort(everyone.begin(), everyone.end());
1555
+
1556
+
1557
+ cerr << "Moses - A beam search decoder for phrase-based statistical machine translation models" << endl
1558
+ << "Copyright (C) 2006 University of Edinburgh" << endl << endl
1559
+
1560
+ << "This library is free software; you can redistribute it and/or" << endl
1561
+ << "modify it under the terms of the GNU Lesser General Public" << endl
1562
+ << "License as published by the Free Software Foundation; either" << endl
1563
+ << "version 2.1 of the License, or (at your option) any later version." << endl << endl
1564
+
1565
+ << "This library is distributed in the hope that it will be useful," << endl
1566
+ << "but WITHOUT ANY WARRANTY; without even the implied warranty of" << endl
1567
+ << "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU" << endl
1568
+ << "Lesser General Public License for more details." << endl << endl
1569
+
1570
+ << "You should have received a copy of the GNU Lesser General Public" << endl
1571
+ << "License along with this library; if not, write to the Free Software" << endl
1572
+ << "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA" << endl << endl
1573
+ << "***********************************************************************" << endl << endl
1574
+ << "Built on " << __DATE__ << " at " __TIME__ << endl << endl
1575
+ << "WHO'S FAULT IS THIS GODDAM SOFTWARE:" << endl;
1576
+
1577
+ ostream_iterator<Credit> out(cerr, "\n");
1578
+ copy(everyone.begin(), everyone.end(), out);
1579
+ cerr << endl << endl;
1580
+ }
1581
+
1582
+ /** update parameter settings with command line switches
1583
+ * \param paramName full name of parameter
1584
+ * \param values inew values for paramName */
1585
+ void
1586
+ Parameter::
1587
+ OverwriteParam(const string &paramName, PARAM_VEC values)
1588
+ {
1589
+ VERBOSE(2,"Overwriting parameter " << paramName);
1590
+
1591
+ m_setting[paramName]; // defines the parameter, important for boolean switches
1592
+ if (m_setting[paramName].size() > 1) {
1593
+ VERBOSE(2," (the parameter had " << m_setting[paramName].size() << " previous values)");
1594
+ UTIL_THROW_IF2(m_setting[paramName].size() != values.size(),
1595
+ "Number of weight override for " << paramName
1596
+ << " is not the same as the original number of weights");
1597
+ } else {
1598
+ VERBOSE(2," (the parameter does not have previous values)");
1599
+ m_setting[paramName].resize(values.size());
1600
+ }
1601
+ VERBOSE(2," with the following values:");
1602
+ int i=0;
1603
+ for (PARAM_VEC::iterator iter = values.begin(); iter != values.end() ; iter++, i++) {
1604
+ m_setting[paramName][i] = *iter;
1605
+ VERBOSE(2, " " << *iter);
1606
+ }
1607
+ VERBOSE(2, std::endl);
1608
+ }
1609
+
1610
+ void
1611
+ Parameter::
1612
+ PrintFF() const
1613
+ {
1614
+ StaticData::Instance().GetFeatureRegistry().PrintFF();
1615
+ }
1616
+
1617
+ std::set<std::string>
1618
+ Parameter::
1619
+ GetWeightNames() const
1620
+ {
1621
+ std::set<std::string> ret;
1622
+ std::map<std::string, std::vector<float> >::const_iterator iter;
1623
+ for (iter = m_weights.begin(); iter != m_weights.end(); ++iter) {
1624
+ const string &key = iter->first;
1625
+ ret.insert(key);
1626
+ }
1627
+ return ret;
1628
+ }
1629
+
1630
+ void
1631
+ Parameter::
1632
+ Save(const std::string path)
1633
+ {
1634
+ ofstream file;
1635
+ file.open(path.c_str());
1636
+
1637
+ PARAM_MAP::const_iterator iterOuter;
1638
+ for (iterOuter = m_setting.begin(); iterOuter != m_setting.end(); ++iterOuter) {
1639
+ const std::string &sectionName = iterOuter->first;
1640
+ file << "[" << sectionName << "]" << endl;
1641
+
1642
+ const PARAM_VEC &values = iterOuter->second;
1643
+
1644
+ PARAM_VEC::const_iterator iterInner;
1645
+ for (iterInner = values.begin(); iterInner != values.end(); ++iterInner) {
1646
+ const std::string &value = *iterInner;
1647
+ file << value << endl;
1648
+ }
1649
+
1650
+ file << endl;
1651
+ }
1652
+
1653
+
1654
+ file.close();
1655
+ }
1656
+
1657
+ template<>
1658
+ void
1659
+ Parameter::
1660
+ SetParameter<bool>(bool &parameter, std::string const& parameterName,
1661
+ bool const& defaultValue) const
1662
+ {
1663
+ const PARAM_VEC *params = GetParam(parameterName);
1664
+
1665
+ // default value if nothing is specified
1666
+ parameter = defaultValue;
1667
+ if (params == NULL) {
1668
+ return;
1669
+ }
1670
+
1671
+ // if parameter is just specified as, e.g. "-parameter" set it true
1672
+ if (params->size() == 0) {
1673
+ parameter = true;
1674
+ }
1675
+ // if paramter is specified "-parameter true" or "-parameter false"
1676
+ else if (params->size() == 1) {
1677
+ parameter = Scan<bool>( params->at(0));
1678
+ }
1679
+ }
1680
+
1681
+ void
1682
+ Parameter::
1683
+ SetParameter(bool& var, std::string const& name)
1684
+ {
1685
+ SetParameter(var,name,false);
1686
+ }
1687
+
1688
+ } // namespace
1689
+
1690
+
mosesdecoder/moses/Parameter.h ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /// $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_Parameter_h
23
+ #define moses_Parameter_h
24
+
25
+ #include <string>
26
+ #include <set>
27
+ #include <map>
28
+ #include <vector>
29
+ #include "TypeDef.h"
30
+ #include "Util.h"
31
+ #include <boost/program_options.hpp>
32
+
33
+ namespace Moses
34
+ {
35
+ typedef std::vector<std::string> PARAM_VEC;
36
+ typedef std::map<std::string, PARAM_VEC > PARAM_MAP;
37
+ typedef std::map<std::string, bool> PARAM_BOOL;
38
+ typedef std::map<std::string, std::string > PARAM_STRING;
39
+
40
+ /** Handles parameter values set in config file or on command line.
41
+ * Process raw parameter data (names and values as strings) for StaticData
42
+ * to parse; to get useful values, see StaticData.
43
+ */
44
+ class Parameter
45
+ {
46
+ typedef boost::program_options::options_description options_description;
47
+ typedef boost::program_options::value_semantic value_semantic;
48
+ protected:
49
+ PARAM_MAP m_setting;
50
+ PARAM_BOOL m_valid;
51
+ PARAM_STRING m_abbreviation;
52
+ PARAM_STRING m_description;
53
+ PARAM_STRING m_fullname;
54
+ // std::map<char,std::set<std::string> > m_confusable;
55
+ // stores long parameter names that start with a letter that is also a short option.
56
+ options_description m_options;
57
+
58
+ std::map<std::string, std::vector<float> > m_weights;
59
+
60
+ std::string FindParam(const std::string &paramSwitch, int argc, char const* argv[]);
61
+ void OverwriteParam(const std::string &paramSwitch, const std::string &paramName,
62
+ int argc, char const* argv[]);
63
+ bool ReadConfigFile(const std::string &filePath );
64
+ bool FilesExist(const std::string &paramName, int fieldNo, std::vector<std::string> const& fileExtension=std::vector<std::string>(1,""));
65
+ bool isOption(const char* token);
66
+ bool Validate();
67
+
68
+ void
69
+ AddParam(options_description& optgroup,
70
+ value_semantic const* optvalue,
71
+ std::string const& paramName,
72
+ std::string const& description);
73
+
74
+ void
75
+ AddParam(options_description& optgroup,
76
+ std::string const &paramName,
77
+ std::string const &description);
78
+
79
+ void
80
+ AddParam(options_description& optgroup,
81
+ value_semantic const* optvalue,
82
+ std::string const& paramName,
83
+ std::string const& abbrevName,
84
+ std::string const& description);
85
+
86
+ void
87
+ AddParam(options_description& optgroup,
88
+ std::string const& paramName,
89
+ std::string const& abbrevName,
90
+ std::string const& description);
91
+
92
+ void PrintCredit();
93
+ void PrintFF() const;
94
+
95
+ void SetWeight(const std::string &name, size_t ind, float weight);
96
+ void SetWeight(const std::string &name, size_t ind, const std::vector<float> &weights);
97
+ void AddWeight(const std::string &name, size_t ind, const std::vector<float> &weights);
98
+ void ConvertWeightArgs();
99
+ void ConvertWeightArgsSingleWeight(const std::string &oldWeightName, const std::string &newWeightName);
100
+ void ConvertWeightArgsPhraseModel(const std::string &oldWeightName);
101
+ void ConvertWeightArgsLM();
102
+ void ConvertWeightArgsDistortion();
103
+ void ConvertWeightArgsGeneration(const std::string &oldWeightName, const std::string &newWeightName);
104
+ void ConvertWeightArgsPhrasePenalty();
105
+ void ConvertWeightArgsWordPenalty();
106
+ void ConvertPhrasePenalty();
107
+ void CreateWeightsMap();
108
+ void CreateWeightsMap(const PARAM_VEC &vec);
109
+ void WeightOverwrite();
110
+ void AddFeature(const std::string &line);
111
+ void AddFeaturesCmd();
112
+
113
+
114
+ public:
115
+ Parameter();
116
+ ~Parameter();
117
+ bool LoadParam(int argc, char const* argv[]);
118
+ bool LoadParam(const std::string &filePath);
119
+ void Explain();
120
+
121
+ /** return a vector of strings holding the whitespace-delimited values on the ini-file line corresponding to the given parameter name */
122
+ const PARAM_VEC *GetParam(const std::string &paramName) const;
123
+
124
+ /** check if parameter is defined (either in moses.ini or as switch) */
125
+ bool isParamSpecified(const std::string &paramName) const {
126
+ return m_setting.find( paramName ) != m_setting.end();
127
+ }
128
+
129
+ void OverwriteParam(const std::string &paramName, PARAM_VEC values);
130
+
131
+ std::vector<float> GetWeights(const std::string &name);
132
+ const std::map<std::string, std::vector<float> > &GetAllWeights() const {
133
+ return m_weights;
134
+ }
135
+ std::set<std::string> GetWeightNames() const;
136
+
137
+ const PARAM_MAP &GetParams() const {
138
+ return m_setting;
139
+ }
140
+
141
+ void Save(const std::string path);
142
+
143
+ template<typename T>
144
+ void SetParameter(T &var, const std::string &name, const T &defaultValue) const {
145
+ const PARAM_VEC *params = GetParam(name);
146
+ if (params && params->size()) {
147
+ var = Scan<T>( params->at(0));
148
+ } else {
149
+ var = defaultValue;
150
+ }
151
+ }
152
+
153
+ void SetParameter(bool& var, std::string const& name);
154
+
155
+ bool SetBooleanSwitch(bool& val, std::string const name) {
156
+ // issues a warning if format is wrong
157
+ const PARAM_VEC *params = GetParam(name);
158
+ val = (params && params->size());
159
+ if (val && params->size() != 1) {
160
+ TRACE_ERR("ERROR: wrong format for switch -" << name);
161
+ return false;
162
+ }
163
+ return true;
164
+ }
165
+
166
+ };
167
+
168
+ template<>
169
+ void Parameter::SetParameter<bool>(bool &var, const std::string &name, const bool &defaultValue) const;
170
+
171
+ }
172
+
173
+ #endif
mosesdecoder/moses/Phrase.h ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ // vim:tabstop=2
3
+
4
+ /***********************************************************************
5
+ Moses - factored phrase-based language decoder
6
+ Copyright (C) 2006 University of Edinburgh
7
+
8
+ This library is free software; you can redistribute it and/or
9
+ modify it under the terms of the GNU Lesser General Public
10
+ License as published by the Free Software Foundation; either
11
+ version 2.1 of the License, or (at your option) any later version.
12
+
13
+ This library is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ Lesser General Public License for more details.
17
+
18
+ You should have received a copy of the GNU Lesser General Public
19
+ License along with this library; if not, write to the Free Software
20
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
+ ***********************************************************************/
22
+
23
+ #ifndef moses_Phrase_h
24
+ #define moses_Phrase_h
25
+
26
+ #include <iostream>
27
+ #include <vector>
28
+ #include <list>
29
+ #include <string>
30
+
31
+ #include <boost/functional/hash.hpp>
32
+
33
+ #include "Word.h"
34
+ #include "Util.h"
35
+
36
+ #include "util/string_piece.hh"
37
+ #include "util/exception.hh"
38
+ #include "parameters/AllOptions.h"
39
+
40
+ namespace Moses
41
+ {
42
+ class FactorMask;
43
+ class Range;
44
+ class ContextScope;
45
+
46
+ /** Representation of a phrase, ie. a contiguous number of words.
47
+ * Wrapper for vector of words
48
+ */
49
+ class Phrase
50
+ {
51
+ friend std::ostream& operator<<(std::ostream&, const Phrase&);
52
+ // private:
53
+ protected:
54
+ std::vector<Word> m_words;
55
+
56
+ public:
57
+
58
+ virtual bool HasScope() const {
59
+ return false;
60
+ }
61
+
62
+ virtual SPTR<ContextScope> GetScope() const {
63
+ return SPTR<ContextScope>();
64
+ }
65
+
66
+
67
+ /** No longer does anything as not using mem pool for Phrase class anymore */
68
+ static void InitializeMemPool();
69
+ static void FinalizeMemPool();
70
+
71
+ /** create empty phrase
72
+ */
73
+ Phrase();
74
+ explicit Phrase(size_t reserveSize);
75
+ /** create phrase from vectors of words */
76
+ explicit Phrase(const std::vector< const Word* > &mergeWords);
77
+
78
+ /* This isn't a swap function because classes inherit from Phrase and might
79
+ * not override swap, which would be bad.
80
+ */
81
+ void SwapWords(Phrase &other) {
82
+ swap(m_words, other.m_words);
83
+ }
84
+
85
+ /** destructor */
86
+ virtual ~Phrase();
87
+
88
+ /**
89
+ * Fills phrase with words from format string, typically from phrase table or sentence input
90
+ *
91
+ * \param factorOrder factor types of each element in 2D string vector
92
+ * \param phraseString formatted input string to parse
93
+ * \param lhs returns the non-terminal Word for the left-hand side of an SCFG rule, may be NULL for phrase-based
94
+ */
95
+ void CreateFromString(FactorDirection direction,
96
+ const std::vector<FactorType> &factorOrder,
97
+ const StringPiece &phraseString,
98
+ Word **lhs);
99
+
100
+ /** copy factors from the other phrase to this phrase.
101
+ IsCompatible() must be run beforehand to ensure incompatible factors aren't overwritten
102
+ */
103
+ void MergeFactors(const Phrase &copy);
104
+ //! copy a single factor (specified by factorType)
105
+ void MergeFactors(const Phrase &copy, FactorType factorType);
106
+ //! copy all factors specified in factorVec and none others
107
+ void MergeFactors(const Phrase &copy, const std::vector<FactorType>& factorVec);
108
+
109
+ /** compare 2 phrases to ensure no factors are lost if the phrases are merged
110
+ * must run IsCompatible() to ensure incompatible factors aren't being overwritten
111
+ */
112
+ bool IsCompatible(const Phrase &inputPhrase) const;
113
+ bool IsCompatible(const Phrase &inputPhrase, FactorType factorType) const;
114
+ bool IsCompatible(const Phrase &inputPhrase, const std::vector<FactorType>& factorVec) const;
115
+
116
+ //! number of words
117
+ inline size_t GetSize() const {
118
+ return m_words.size();
119
+ }
120
+
121
+ //! word at a particular position
122
+ inline const Word &GetWord(size_t pos) const {
123
+ return m_words[pos];
124
+ }
125
+ inline Word &GetWord(size_t pos) {
126
+ return m_words[pos];
127
+ }
128
+
129
+ inline Word &Front() {
130
+ return m_words[0];
131
+ }
132
+
133
+ inline Word &Back() {
134
+ return m_words[GetSize() - 1];
135
+ }
136
+
137
+ inline const Word &Front() const {
138
+ return m_words[0];
139
+ }
140
+
141
+ inline const Word &Back() const {
142
+ return m_words[GetSize() - 1];
143
+ }
144
+
145
+ //! particular factor at a particular position
146
+ inline const Factor *GetFactor(size_t pos, FactorType factorType) const {
147
+ const Word &ptr = m_words[pos];
148
+ return ptr[factorType];
149
+ }
150
+ inline void SetFactor(size_t pos, FactorType factorType, const Factor *factor) {
151
+ Word &ptr = m_words[pos];
152
+ ptr[factorType] = factor;
153
+ }
154
+
155
+ size_t GetNumTerminals() const;
156
+ size_t GetNumNonTerminals() const {
157
+ return GetSize() - GetNumTerminals();
158
+ }
159
+
160
+ //! whether the 2D vector is a substring of this phrase
161
+ bool Contains(const std::vector< std::vector<std::string> > &subPhraseVector
162
+ , const std::vector<FactorType> &inputFactor) const;
163
+
164
+ size_t Find(const Phrase &sought, int maxUnknown) const;
165
+
166
+ //! create an empty word at the end of the phrase
167
+ Word &AddWord();
168
+ //! create copy of input word at the end of the phrase
169
+ void AddWord(const Word &newWord) {
170
+ AddWord() = newWord;
171
+ }
172
+
173
+ /** appends a phrase at the end of current phrase **/
174
+ void Append(const Phrase &endPhrase);
175
+ void PrependWord(const Word &newWord);
176
+
177
+ void Clear() {
178
+ m_words.clear();
179
+ }
180
+
181
+ void RemoveWord(size_t pos) {
182
+ UTIL_THROW_IF2(pos >= m_words.size(),
183
+ "Referencing position " << pos << " out of bound");
184
+ m_words.erase(m_words.begin() + pos);
185
+ }
186
+
187
+ void InitStartEndWord();
188
+
189
+ //! create new phrase class that is a substring of this phrase
190
+ Phrase GetSubString(const Range &range) const;
191
+ Phrase GetSubString(const Range &range, FactorType factorType) const;
192
+
193
+ //! return a string rep of the phrase;
194
+ // w/ factors delimited by FactorDelimiter
195
+ std::string
196
+ GetStringRep(std::vector<FactorType> const& factorsToPrint,
197
+ AllOptions const* opts=NULL) const;
198
+
199
+ TO_STRING();
200
+
201
+
202
+ int Compare(const Phrase &other) const;
203
+
204
+ /** transitive comparison between 2 phrases
205
+ * used to insert & find phrase in dictionary
206
+ */
207
+ bool operator< (const Phrase &compare) const {
208
+ return Compare(compare) < 0;
209
+ }
210
+
211
+ size_t hash() const;
212
+
213
+ bool operator==(const Phrase &compare) const;
214
+ bool operator!=(const Phrase &compare) const {
215
+ return ! (*this == compare);
216
+ }
217
+
218
+ void OnlyTheseFactors(const FactorMask &factors);
219
+
220
+ };
221
+
222
+ inline size_t hash_value(const Phrase& phrase)
223
+ {
224
+ return phrase.hash();
225
+ }
226
+
227
+ struct PhrasePtrComparator {
228
+ inline bool operator()(const Phrase* lhs, const Phrase* rhs) const {
229
+ return *lhs == *rhs;
230
+ }
231
+ };
232
+
233
+ struct PhrasePtrHasher {
234
+ inline size_t operator()(const Phrase* phrase) const {
235
+ size_t seed = 0;
236
+ boost::hash_combine(seed,*phrase);
237
+ return seed;
238
+ }
239
+
240
+ };
241
+
242
+ }
243
+
244
+ #endif
mosesdecoder/moses/PrefixTree.h ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /* ---------------------------------------------------------------- */
4
+ /* Copyright 2005 (c) by RWTH Aachen - Lehrstuhl fuer Informatik VI */
5
+ /* Richard Zens */
6
+ /* ---------------------------------------------------------------- */
7
+ #ifndef moses_PrefixTree_h
8
+ #define moses_PrefixTree_h
9
+
10
+ #include <vector>
11
+ #include <algorithm>
12
+ #include <deque>
13
+ #include "Util.h"
14
+ #include "FilePtr.h"
15
+ #include "File.h"
16
+
17
+ namespace Moses
18
+ {
19
+
20
+ /** @todo How is this used in the pb binary phrase table?
21
+ */
22
+ template<typename T,typename D>
23
+ class PrefixTreeSA
24
+ {
25
+ public:
26
+ typedef T Key;
27
+ typedef D Data;
28
+
29
+ typedef PrefixTreeSA<T,D> Self;
30
+ typedef std::vector<T> VT;
31
+ typedef std::vector<Self*> VP;
32
+ typedef std::vector<D> VD;
33
+
34
+ VT keys;
35
+ VP ptr;
36
+ VD data;
37
+
38
+ static Data def;
39
+
40
+ public:
41
+ PrefixTreeSA() {}
42
+
43
+ ~PrefixTreeSA() {
44
+ for(size_t i=0; i<ptr.size(); ++i) delete ptr[i];
45
+ }
46
+
47
+ static const Data& getDefault() {
48
+ return def;
49
+ }
50
+ static void setDefault(const Data& x) {
51
+ def=x;
52
+ }
53
+
54
+
55
+ // insert sequence
56
+ template<typename fwiter> Data& insert(fwiter b,fwiter e) {
57
+ typename VT::iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
58
+ typename VT::iterator kb=keys.begin();
59
+ size_t pos=std::distance(kb,i);
60
+
61
+ if(i==keys.end() || *i!=*b) {
62
+ keys.insert(i,*b);
63
+ data.insert(data.begin()+pos,def);
64
+
65
+ Self *self = NULL;
66
+ ptr.insert(ptr.begin()+pos, self);
67
+ }
68
+ if(++b!=e) {
69
+ if(!ptr[pos]) ptr[pos]=new Self;
70
+ return ptr[pos]->insert(b,e);
71
+ } else return data[pos];
72
+ }
73
+ // insert container
74
+ template<typename cont> Data& insert(const cont& c) {
75
+ return insert(c.begin(),c.end());
76
+ }
77
+
78
+ size_t size() const {
79
+ return keys.size();
80
+ }
81
+ const Key& getKey(size_t i) const {
82
+ return keys[i];
83
+ }
84
+ const Data& getData(size_t i) const {
85
+ return data[i];
86
+ }
87
+ const Self* getPtr(size_t i) const {
88
+ return ptr[i];
89
+ }
90
+
91
+ size_t findKey(const Key& k) const {
92
+ typename VT::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
93
+ if(i==keys.end() || *i!=k) return keys.size();
94
+ return std::distance(keys.begin(),i);
95
+ }
96
+
97
+ // find sequence
98
+ template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
99
+ size_t pos=findKey(*b);
100
+ if(pos==keys.size()) return 0;
101
+ if(++b==e) return &data[pos];
102
+ if(ptr[pos]) return ptr[pos]->findPtr(b,e);
103
+ else return 0;
104
+ }
105
+ // find container
106
+ template<typename cont> const Data* findPtr(const cont& c) const {
107
+ return findPtr(c.begin(),c.end());
108
+ }
109
+
110
+
111
+ // find sequence
112
+ template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
113
+ if(const Data* p=findPtr(b,e)) return *p;
114
+ else return def;
115
+ }
116
+
117
+ // find container
118
+ template<typename cont> const Data& find(const cont& c) const {
119
+ return find(c.begin(),c.end());
120
+ }
121
+
122
+ void shrink() {
123
+ ShrinkToFit(keys);
124
+ ShrinkToFit(ptr);
125
+ ShrinkToFit(data);
126
+ }
127
+
128
+ };
129
+ template<typename T,typename D> D PrefixTreeSA<T,D>::def;
130
+
131
+ /////////////////////////////////////////////////////////////////////////////
132
+
133
+ /** @todo How is this used in the pb binary phrase table?
134
+ */
135
+ template<typename T,typename D>
136
+ class PrefixTreeF
137
+ {
138
+ public:
139
+ typedef T Key;
140
+ typedef D Data;
141
+ private:
142
+ typedef PrefixTreeF<Key,Data> Self;
143
+ public:
144
+ typedef FilePtr<Self> Ptr;
145
+ private:
146
+ typedef std::vector<Key> VK;
147
+ typedef std::vector<Data> VD;
148
+ typedef std::vector<Ptr> VP;
149
+
150
+ VK keys;
151
+ VD data;
152
+ VP ptr;
153
+
154
+ static Data def;
155
+
156
+ OFF_T startPos;
157
+ FILE* f;
158
+ public:
159
+
160
+ PrefixTreeF(FILE* f_=0) : f(f_) {
161
+ if(f) read();
162
+ }
163
+
164
+ ~PrefixTreeF() {
165
+ free();
166
+ }
167
+
168
+ void read() {
169
+ startPos=fTell(f);
170
+ fReadVector(f,keys);
171
+ fReadVector(f,data);
172
+ ptr.clear();
173
+ ptr.resize(keys.size());
174
+ std::vector<OFF_T> rawOffs(keys.size());
175
+ size_t bytes_read = fread(&rawOffs[0], sizeof(OFF_T), keys.size(), f);
176
+ UTIL_THROW_IF2(bytes_read != keys.size(), "Read error at " << HERE);
177
+ for(size_t i=0; i<ptr.size(); ++i)
178
+ if (rawOffs[i]) ptr[i].set(f, rawOffs[i]);
179
+ }
180
+
181
+ void free() {
182
+ for(typename VP::iterator i=ptr.begin(); i!=ptr.end(); ++i) i->free();
183
+ }
184
+
185
+ void reserve(size_t s) {
186
+ keys.reserve(s);
187
+ data.reserve(s);
188
+ ptr.reserve(s);
189
+ }
190
+
191
+ template<typename fwiter>
192
+ void changeData(fwiter b,fwiter e,const Data& d) {
193
+ typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
194
+ if(i==keys.end() || *i!=*b) {
195
+ TRACE_ERR("ERROR: key not found in changeData!\n");
196
+ return;
197
+ }
198
+ typename VK::const_iterator kb=keys.begin();
199
+ size_t pos=std::distance(kb,i);
200
+ if(++b==e) {
201
+ OFF_T p=startPos+keys.size()*sizeof(Key)+2*sizeof(unsigned)+pos*sizeof(Data);
202
+ TRACE_ERR("elem found at pos "<<p<<" old val: "<<data[pos]<<" startpos: "<<startPos<<"\n");
203
+ if(data[pos]!=d) {
204
+ data[pos]=d;
205
+ fSeek(f,p);
206
+ fWrite(f,d);
207
+ }
208
+ return;
209
+ }
210
+ if(ptr[pos]) ptr[pos]->changeData(b,e,d);
211
+ else {
212
+ TRACE_ERR("ERROR: seg not found!in changeData\n");
213
+ }
214
+ }
215
+
216
+
217
+ void create(const PrefixTreeSA<Key,Data>& psa,const std::string& fname) {
218
+ FILE* f=fOpen(fname.c_str(),"wb");
219
+ create(psa,f);
220
+ fclose(f);
221
+ }
222
+
223
+ void create(const PrefixTreeSA<Key,Data>& psa,FILE* f,int verbose=0) {
224
+ setDefault(psa.getDefault());
225
+
226
+ typedef std::pair<const PrefixTreeSA<Key,Data>*,OFF_T> P;
227
+ typedef std::deque<P> Queue;
228
+
229
+ Queue queue;
230
+
231
+ queue.push_back(P(&psa,fTell(f)));
232
+ bool isFirst=1;
233
+ size_t ns=1;
234
+ while(queue.size()) {
235
+ if(verbose && queue.size()>ns) {
236
+ TRACE_ERR("stack size in PF create: "<<queue.size()<<"\n");
237
+ while(ns<queue.size()) ns*=2;
238
+ }
239
+ const P& pp=queue.back();
240
+ const PrefixTreeSA<Key,Data>& p=*pp.first;
241
+ OFF_T pos=pp.second;
242
+ queue.pop_back();
243
+
244
+ if(!isFirst) {
245
+ OFF_T curr=fTell(f);
246
+ fSeek(f,pos);
247
+ fWrite(f,curr);
248
+ fSeek(f,curr);
249
+ } else isFirst=0;
250
+
251
+ size_t s=0;
252
+ s+=fWriteVector(f,p.keys);
253
+ s+=fWriteVector(f,p.data);
254
+
255
+ for(size_t i=0; i<p.ptr.size(); ++i) {
256
+ if(p.ptr[i])
257
+ queue.push_back(P(p.ptr[i],fTell(f)));
258
+ OFF_T ppos=0;
259
+ s+=fWrite(f,ppos);
260
+ }
261
+ }
262
+ }
263
+
264
+ size_t size() const {
265
+ return keys.size();
266
+ }
267
+ const Key& getKey(size_t i) const {
268
+ return keys[i];
269
+ }
270
+ const Data& getData(size_t i) const {
271
+ return data[i];
272
+ }
273
+ const Self* getPtr(size_t i) const {
274
+ return ptr[i];
275
+ }
276
+
277
+ size_t findKey(const Key& k) const {
278
+ typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),k);
279
+ if(i==keys.end() || *i!=k) return keys.size();
280
+ return std::distance(keys.begin(),i);
281
+ }
282
+
283
+ Ptr const* findKeyPtr(const Key& k) const {
284
+ size_t pos=findKey(k);
285
+ return (pos<keys.size() ? &ptr[pos] : 0);
286
+ }
287
+
288
+ // find sequence
289
+ template<typename fwiter> const Data* findPtr(fwiter b,fwiter e) const {
290
+ typename VK::const_iterator i=std::lower_bound(keys.begin(),keys.end(),*b);
291
+ if(i==keys.end() || *i!=*b) return 0;
292
+ size_t pos=std::distance(keys.begin(),i);
293
+ if(++b==e) return &data[pos];
294
+ if(ptr[pos]) return ptr[pos]->findPtr(b,e);
295
+ else return 0;
296
+ }
297
+ // find container
298
+ template<typename cont> const Data* findPtr(const cont& c) const {
299
+ return findPtr(c.begin(),c.end());
300
+ }
301
+
302
+
303
+ // find sequence
304
+ template<typename fwiter> const Data& find(fwiter b,fwiter e) const {
305
+ if(const Data* p=findPtr(b,e)) return *p;
306
+ else return def;
307
+ } //return (p?*p:def);}
308
+
309
+ // find container
310
+ template<typename cont> const Data& find(const cont& c) const {
311
+ return find(c.begin(),c.end());
312
+ }
313
+
314
+ static void setDefault(const Data& d) {
315
+ def=d;
316
+ }
317
+ static const Data& getDefault() {
318
+ return def;
319
+ }
320
+
321
+
322
+ void print(std::ostream& out,const std::string s="") const {
323
+
324
+ out<<s<<"startpos: "<<startPos<<" size: "<<keys.size()<<"\n";
325
+ for(size_t i=0; i<keys.size(); ++i) {
326
+ out<<s<<i<<" - "<<keys[i]<<" "<<data[i]<<"\n";
327
+ }
328
+ for(size_t i=0; i<ptr.size(); ++i)
329
+ if(ptr[i])
330
+ ptr[i]->print(out,s+" ");
331
+ }
332
+
333
+
334
+ };
335
+ template<typename T,typename D> D PrefixTreeF<T,D>::def;
336
+
337
+ }
338
+
339
+ #endif
mosesdecoder/moses/Range.h ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_WordsRange_h
23
+ #define moses_WordsRange_h
24
+
25
+ #include <iostream>
26
+ #include <boost/functional/hash.hpp>
27
+ #include "TypeDef.h"
28
+ #include "Util.h"
29
+ #include "util/exception.hh"
30
+
31
+ #ifdef WIN32
32
+ #undef max
33
+ #endif
34
+
35
+ namespace Moses
36
+ {
37
+
38
+ /***
39
+ * Efficient version of Bitmap for contiguous ranges
40
+ */
41
class Range
{
  friend std::ostream& operator << (std::ostream& out, const Range& range);

  // m_endPos is inclusive; both positions are 0-based source-word indices
  size_t m_startPos, m_endPos;
public:
  // NOTE(review): the default constructor leaves m_startPos/m_endPos
  // uninitialized — callers must assign before reading either position.
  inline explicit Range() {}
  inline Range(size_t startPos, size_t endPos) : m_startPos(startPos), m_endPos(endPos) {}
  // Copy constructor is equivalent to the compiler-generated one.
  inline Range(const Range &copy)
    : m_startPos(copy.GetStartPos())
    , m_endPos(copy.GetEndPos()) {
  }

  inline size_t GetStartPos() const {
    return m_startPos;
  }
  inline size_t GetEndPos() const {
    return m_endPos;
  }

  //! count of words translated; 0 for the sentinel NOT_FOUND start position
  inline size_t GetNumWordsCovered() const {
    return (m_startPos == NOT_FOUND) ? 0 : m_endPos - m_startPos + 1;
  }

  //! transitive comparison: lexicographic on (start, end), suitable for std::map/set keys
  inline bool operator<(const Range& x) const {
    return (m_startPos<x.m_startPos
            || (m_startPos==x.m_startPos && m_endPos<x.m_endPos));
  }

  // equality operator: both endpoints must match
  inline bool operator==(const Range& x) const {
    return (m_startPos==x.m_startPos && m_endPos==x.m_endPos);
  }
  // Whether two word ranges overlap or not (bounds inclusive, so touching ranges overlap)
  inline bool Overlap(const Range& x) const {

    if ( x.m_endPos < m_startPos || x.m_startPos > m_endPos) return false;

    return true;
  }

  // Number of words strictly between the two ranges; throws (UTIL_THROW_IF2)
  // when the ranges overlap, since the gap is then undefined.
  inline size_t GetNumWordsBetween(const Range& x) const {
    UTIL_THROW_IF2(Overlap(x), "Overlapping ranges");

    if (x.m_endPos < m_startPos) {
      return m_startPos - x.m_endPos - 1;
    }

    return x.m_startPos - m_endPos - 1;
  }


  TO_STRING();
};
98
+
99
+ inline size_t hash_value(const Range& range)
100
+ {
101
+ size_t seed = range.GetStartPos();
102
+ boost::hash_combine(seed, range.GetEndPos());
103
+ return seed;
104
+ }
105
+
106
+ }
107
+ #endif
mosesdecoder/moses/ReorderingConstraint.cpp ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+
4
+ /***********************************************************************
5
+ Moses - factored phrase-based language decoder
6
+ Copyright (C) 2008 University of Edinburgh
7
+
8
+ This library is free software; you can redistribute it and/or
9
+ modify it under the terms of the GNU Lesser General Public
10
+ License as published by the Free Software Foundation; either
11
+ version 2.1 of the License, or (at your option) any later version.
12
+
13
+ This library is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ Lesser General Public License for more details.
17
+
18
+ You should have received a copy of the GNU Lesser General Public
19
+ License along with this library; if not, write to the Free Software
20
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
+ ***********************************************************************/
22
+
23
+ #include "ReorderingConstraint.h"
24
+ #include "InputType.h"
25
+ #include "StaticData.h"
26
+ #include "Bitmap.h"
27
+
28
+ namespace Moses
29
+ {
30
+
31
+ //! allocate memory for reordering walls
32
+ void ReorderingConstraint::InitializeWalls(size_t size)
33
+ {
34
+ m_size = size;
35
+ m_wall = (bool*) malloc(sizeof(bool) * size);
36
+ m_localWall = (size_t*) malloc(sizeof(size_t) * size);
37
+
38
+ for (size_t pos = 0 ; pos < m_size ; pos++) {
39
+ m_wall[pos] = false;
40
+ m_localWall[pos] = NOT_A_ZONE;
41
+ }
42
+ }
43
+
44
+
45
+ //! set value at a particular position
46
+ void ReorderingConstraint::SetWall( size_t pos, bool value )
47
+ {
48
+ VERBOSE(3,"SETTING reordering wall at position " << pos << std::endl);
49
+ m_wall[pos] = value;
50
+ m_active = true;
51
+ }
52
+
53
+ //! has to be called to localized walls
54
void ReorderingConstraint::FinalizeWalls()
{
  // Convert every wall that lies inside a zone into a wall local to that
  // zone. Must run after all SetWall()/SetZone() calls and before Check().
  for(size_t z = 0; z < m_zone.size(); z++ ) {
    const size_t startZone = m_zone[z].first;
    const size_t endZone = m_zone[z].second;// note: wall after endZone is not local
    for( size_t pos = startZone; pos < endZone; pos++ ) {
      if (m_wall[ pos ]) {
        // move the global wall into this zone: record the zone id and
        // clear the global flag so it is not seen by later zones
        m_localWall[ pos ] = z;
        m_wall[ pos ] = false;
        VERBOSE(3,"SETTING local wall " << pos << std::endl);
      }
      // enforce that local walls only apply to innermost zone:
      // if the position was already claimed by a zone that strictly
      // contains this one, re-assign it to the inner zone z
      else if (m_localWall[ pos ] != NOT_A_ZONE) {
        size_t assigned_z = m_localWall[ pos ];
        if ((m_zone[assigned_z].first < startZone) ||
            (m_zone[assigned_z].second > endZone)) {
          m_localWall[ pos ] = z;
        }
      }
    }
  }
}
76
+
77
+ //! set walls based on "-monotone-at-punctuation" flag
78
+ void ReorderingConstraint::SetMonotoneAtPunctuation( const Phrase &sentence )
79
+ {
80
+ for( size_t i=0; i<sentence.GetSize(); i++ ) {
81
+ const Word& word = sentence.GetWord(i);
82
+ if (word[0]->GetString() == "," ||
83
+ word[0]->GetString() == "." ||
84
+ word[0]->GetString() == "!" ||
85
+ word[0]->GetString() == "?" ||
86
+ word[0]->GetString() == ":" ||
87
+ word[0]->GetString() == ";" ||
88
+ word[0]->GetString() == "\"") {
89
+ // set wall before and after punc, but not at sentence start, end
90
+ if (i>0 && i<m_size-1) SetWall( i, true );
91
+ if (i>1) SetWall( i-1, true );
92
+ }
93
+ }
94
+ }
95
+
96
+ //! set a reordering zone (once entered, need to finish)
97
+ void ReorderingConstraint::SetZone( size_t startPos, size_t endPos )
98
+ {
99
+ VERBOSE(3,"SETTING zone " << startPos << "-" << endPos << std::endl);
100
+ std::pair<size_t,size_t> newZone;
101
+ newZone.first = startPos;
102
+ newZone.second = endPos;
103
+ m_zone.push_back( newZone );
104
+ m_active = true;
105
+ }
106
+
107
+ //! check if the current hypothesis extension violates reordering constraints
108
// Returns true iff extending the current coverage `bitmap` with the phrase
// spanning [startPos,endPos] violates no wall, local wall, or zone
// constraint. Early-returns on the first violation found; statement order
// (walls first, then the monotone shortcut, then per-zone checks) matters.
bool ReorderingConstraint::Check( const Bitmap &bitmap, size_t startPos, size_t endPos ) const
{
  // nothing to be checked, we are done
  if (! IsActive() ) return true;

  VERBOSE(3,"Check " << bitmap << " " << startPos << "-" << endPos);

  // check walls
  size_t firstGapPos = bitmap.GetFirstGapPos();
  // filling first gap -> no wall violation possible
  if (firstGapPos != startPos) {
    // if there is a wall before the last word,
    // we created a gap while moving through wall
    // -> violation
    for( size_t pos = firstGapPos; pos < endPos; pos++ ) {
      if( GetWall( pos ) ) {
        VERBOSE(3," hitting wall " << pos << std::endl);
        return false;
      }
    }
  }

  // monotone -> no violation possible
  size_t lastPos = bitmap.GetLastPos();
  if ((lastPos == NOT_FOUND && startPos == 0) || // nothing translated
      (firstGapPos > lastPos && // no gaps
       firstGapPos == startPos)) { // translating first empty word
    VERBOSE(3," montone, fine." << std::endl);
    return true;
  }

  // check zones
  for(size_t z = 0; z < m_zone.size(); z++ ) {
    const size_t startZone = m_zone[z].first;
    const size_t endZone = m_zone[z].second;

    // fine, if translation has not reached zone yet and phrase outside zone
    if (lastPos < startZone && ( endPos < startZone || startPos > endZone ) ) {
      continue;
    }

    // already completely translated zone, no violations possible
    if (firstGapPos > endZone) {
      continue;
    }

    // some words are translated beyond the start
    // let's look closer if some are in the zone
    size_t numWordsInZoneTranslated = 0;
    if (lastPos >= startZone) {
      for(size_t pos = startZone; pos <= endZone; pos++ ) {
        if( bitmap.GetValue( pos ) ) {
          numWordsInZoneTranslated++;
        }
      }
    }

    // all words in zone translated, no violation possible
    if (numWordsInZoneTranslated == endZone-startZone+1) {
      continue;
    }

    // flag if this is an active zone (partially, but not fully, translated)
    bool activeZone = (numWordsInZoneTranslated > 0);

    // fine, if zone completely untranslated and phrase outside zone
    if (!activeZone && ( endPos < startZone || startPos > endZone ) ) {
      continue;
    }

    // violation, if phrase completely outside active zone
    if (activeZone && ( endPos < startZone || startPos > endZone ) ) {
      VERBOSE(3," outside active zone" << std::endl);
      return false;
    }

    // ok, this is what we know now:
    // * the phrase is in the zone (at least partially)
    // * either zone is already active, or it becomes active now


    // check, if we are setting us up for a dead end due to distortion limits:
    // the jump back to the first gap would already exceed the limit

    // size_t distortionLimit = (size_t)StaticData::Instance().GetMaxDistortion();
    size_t distortionLimit = m_max_distortion;
    if (startPos != firstGapPos && endZone-firstGapPos >= distortionLimit) {
      VERBOSE(3," dead end due to distortion limit" << std::endl);
      return false;
    }

    // let us check on phrases that are partially outside

    // phrase overlaps at the beginning, always ok
    if (startPos <= startZone) {
      continue;
    }

    // phrase goes beyond end, has to fill zone completely
    if (endPos > endZone) {
      if (endZone-startPos+1 < // num. words filled in by phrase
          endZone-startZone+1-numWordsInZoneTranslated) { // num. untranslated
        VERBOSE(3," overlap end, but not completing" << std::endl);
        return false;
      } else {
        continue;
      }
    }

    // now we are down to phrases that are completely inside the zone
    // we have to check local walls
    bool seenUntranslatedBeforeStartPos = false;
    for(size_t pos = startZone; pos < endZone && pos < endPos; pos++ ) {
      // be careful when there is a gap before phrase
      if( !bitmap.GetValue( pos ) // untranslated word
          && pos < startPos ) { // before startPos
        seenUntranslatedBeforeStartPos = true;
      }
      // crossing a local wall of this zone while leaving a gap behind
      if( seenUntranslatedBeforeStartPos && GetLocalWall( pos, z ) ) {
        VERBOSE(3," local wall violation" << std::endl);
        return false;
      }
    }

    // passed all checks for this zone, on to the next one
  }

  // passed all checks, no violations
  VERBOSE(3," fine." << std::endl);
  return true;
}
238
+
239
+ std::ostream& operator<<(std::ostream& out, const ReorderingConstraint &obj)
240
+ {
241
+ out << "Zones:";
242
+ for (size_t i = 0; i < obj.m_zone.size(); ++i) {
243
+ const std::pair<size_t,size_t> &zone1 = obj.m_zone[i];
244
+ out << zone1.first << "-" << zone1.second << " ";
245
+ }
246
+
247
+ out << "Walls:";
248
+ for (size_t i = 0; i < obj.m_size; ++i) {
249
+ out << obj.m_wall[i];
250
+ }
251
+
252
+ out << " Local walls:";
253
+ for (size_t i = 0; i < obj.m_size; ++i) {
254
+ out << obj.m_localWall[i] << " ";
255
+ }
256
+
257
+ return out;
258
+ }
259
+
260
+ }
mosesdecoder/moses/ReorderingConstraint.h ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
2
+ // $Id$
3
+ // vim:tabstop=2
4
+
5
+ /***********************************************************************
6
+ Moses - factored phrase-based language decoder
7
+ Copyright (C) 2008 University of Edinburgh
8
+
9
+ This library is free software; you can redistribute it and/or
10
+ modify it under the terms of the GNU Lesser General Public
11
+ License as published by the Free Software Foundation; either
12
+ version 2.1 of the License, or (at your option) any later version.
13
+
14
+ This library is distributed in the hope that it will be useful,
15
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
+ Lesser General Public License for more details.
18
+
19
+ You should have received a copy of the GNU Lesser General Public
20
+ License along with this library; if not, write to the Free Software
21
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
+ ***********************************************************************/
23
+
24
+ #ifndef moses_ReorderingConstraint_h
25
+ #define moses_ReorderingConstraint_h
26
+
27
+ //#include <malloc.h>
28
+ #include <limits>
29
+ #include <vector>
30
+ #include <iostream>
31
+ #include <cstring>
32
+ #include <cmath>
33
+ #include "TypeDef.h"
34
+ #include "Word.h"
35
+ #include "Phrase.h"
36
+
37
+ namespace Moses
38
+ {
39
+
40
+ class InputType;
41
+ class Bitmap;
42
+
43
+ #define NOT_A_ZONE 999999999
44
+ /** A list of zones and walls to limit which reordering can occur
45
+ */
46
class ReorderingConstraint
{
  friend std::ostream& operator<<(std::ostream& out, const ReorderingConstraint &obj);
protected:
  // const size_t m_size; /**< number of words in sentence */
  size_t m_size; /**< number of words in sentence; set by InitializeWalls() */
  bool *m_wall; /**< flag for each word if it is a wall */
  size_t *m_localWall; /**< for each word: zone id of its local wall, or NOT_A_ZONE */
  std::vector< std::pair<size_t,size_t> > m_zone; /** zones that limit reordering */
  bool m_active; /**< flag indicating, if there are any active constraints */
  int m_max_distortion; /**< distortion limit used by Check() for dead-end detection */
public:

  //! create ReorderingConstraint of length size and initialise to zero
  // NOTE(review): m_size is uninitialized until InitializeWalls() runs.
  // NOTE(review): the class owns raw malloc'd buffers and has no copy
  // constructor/assignment — a shallow copy would double-free; confirm
  // instances are never copied.
  ReorderingConstraint(int max_distortion)
    : m_wall(NULL)
    , m_localWall(NULL)
    , m_active(false)
    , m_max_distortion(max_distortion)
  {}

  //! destructer
  ~ReorderingConstraint() {
    if (m_wall != NULL) free(m_wall);
    if (m_localWall != NULL) free(m_localWall);
  }

  //! allocate memory for memory for a sentence of a given size
  void InitializeWalls(size_t size);

  //! changes walls in zones into local walls
  void FinalizeWalls();

  //! set value at a particular position
  void SetWall( size_t pos, bool value );

  //! whether there is a (global) reordering wall at a particular position
  bool GetWall(size_t pos) const {
    return m_wall[pos];
  }

  //! whether position pos carries a wall local to the given zone
  bool GetLocalWall(size_t pos, size_t zone ) const {
    return (m_localWall[pos] == zone);
  }

  //! set a zone
  void SetZone( size_t startPos, size_t endPos );

  //! returns the vector of zones
  std::vector< std::pair<size_t,size_t> > & GetZones() {
    return m_zone;
  }

  //! set the reordering walls based on punctuation in the sentence
  void SetMonotoneAtPunctuation( const Phrase & sentence );

  //! check if all constraints are fulfilled -> all find
  bool Check( const Bitmap &bitmap, size_t start, size_t end ) const;

  //! checks if reordering constraints will be enforced
  bool IsActive() const {
    return m_active;
  }
};
111
+
112
+ }
113
+ #endif
mosesdecoder/moses/ScoreComponentCollectionTest.cpp ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include <stdexcept>
21
+
22
+ #include <boost/test/unit_test.hpp>
23
+
24
+ #include "moses/FF/StatelessFeatureFunction.h"
25
+ #include "ScoreComponentCollection.h"
26
+
27
+ using namespace Moses;
28
+ using namespace std;
29
+
30
+ BOOST_AUTO_TEST_SUITE(scc)
31
+
32
// Base class for the mock features below: a stateless feature function
// whose evaluation hooks are all no-ops, so the tests can exercise
// ScoreComponentCollection bookkeeping without any real scoring logic.
class MockStatelessFeatureFunction : public StatelessFeatureFunction
{
public:
  // n: number of dense score components; line: config line (used as the name)
  MockStatelessFeatureFunction(size_t n, const string &line) :
    StatelessFeatureFunction(n, line) {}
  void EvaluateWhenApplied(const Hypothesis&, ScoreComponentCollection*) const {}
  void EvaluateWhenApplied(const ChartHypothesis&, ScoreComponentCollection*) const {}
  void EvaluateWithSourceContext(const InputType &input
                                 , const InputPath &inputPath
                                 , const TargetPhrase &targetPhrase
                                 , const StackVec *stackVec
                                 , ScoreComponentCollection &scoreBreakdown
                                 , ScoreComponentCollection *estimatedScores) const {
  }

  void EvaluateTranslationOptionListWithSourceContext(const InputType &input
      , const TranslationOptionList &translationOptionList) const {
  }
  void EvaluateInIsolation(const Phrase &source
                           , const TargetPhrase &targetPhrase
                           , ScoreComponentCollection &scoreBreakdown
                           , ScoreComponentCollection &estimatedScores) const {
  }

};
57
+
58
// Mock dense feature with exactly one score component.
class MockSingleFeature : public MockStatelessFeatureFunction
{
public:
  MockSingleFeature(): MockStatelessFeatureFunction(1, "MockSingle") {}

  bool IsUseable(const FactorMask &mask) const {
    return true;
  }
};
67
+
68
// Mock dense feature with five score components.
class MockMultiFeature : public MockStatelessFeatureFunction
{
public:
  MockMultiFeature(): MockStatelessFeatureFunction(5, "MockMulti") {}

  bool IsUseable(const FactorMask &mask) const {
    return true;
  }

};
78
+
79
// Mock sparse feature: zero dense components, scores are assigned by name.
class MockSparseFeature : public MockStatelessFeatureFunction
{
public:
  MockSparseFeature(): MockStatelessFeatureFunction(0, "MockSparse") {}

  bool IsUseable(const FactorMask &mask) const {
    return true;
  }
};
88
+
89
+
90
+
91
// Test fixture: registers one single-valued, one multi-valued and one
// sparse mock feature with the global FeatureFunction registry so the
// test cases below can score against them.
struct MockProducers {
  MockProducers() {
    FeatureFunction::Register(&single);
    FeatureFunction::Register(&multi);
    FeatureFunction::Register(&sparse);
  }

  MockSingleFeature single;
  MockMultiFeature multi;
  MockSparseFeature sparse;
};
102
+
103
+ BOOST_FIXTURE_TEST_CASE(ctor, MockProducers)
104
+ {
105
+ ScoreComponentCollection scc;
106
+ BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single),0);
107
+ float expected[] = {0,0,0,0,0};
108
+ std::vector<float> actual= scc.GetScoresForProducer(&multi);
109
+ BOOST_CHECK_EQUAL_COLLECTIONS(expected, expected+5, actual.begin(), actual.begin()+5);
110
+ }
111
+
112
+ BOOST_FIXTURE_TEST_CASE(plusequals, MockProducers)
113
+ {
114
+ float arr1[] = {1,2,3,4,5};
115
+ float arr2[] = {2,4,6,8,10};
116
+ std::vector<float> vec1(arr1,arr1+5);
117
+ std::vector<float> vec2(arr2,arr2+5);
118
+
119
+ ScoreComponentCollection scc;
120
+ scc.PlusEquals(&single, 3.4f);
121
+ BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single), 3.4f);
122
+ scc.PlusEquals(&multi,vec1);
123
+ std::vector<float> actual = scc.GetScoresForProducer(&multi);
124
+ BOOST_CHECK_EQUAL_COLLECTIONS(vec1.begin(),vec1.end()
125
+ ,actual.begin(), actual.end());
126
+ scc.PlusEquals(&multi,vec1);
127
+ actual = scc.GetScoresForProducer(&multi);
128
+ BOOST_CHECK_EQUAL_COLLECTIONS(vec2.begin(),vec2.end(),
129
+ actual.begin(), actual.end());
130
+
131
+ BOOST_CHECK_EQUAL(scc.GetScoreForProducer(&single), 3.4f);
132
+ }
133
+
134
+ BOOST_FIXTURE_TEST_CASE(sparse_feature, MockProducers)
135
+ {
136
+ ScoreComponentCollection scc;
137
+ scc.Assign(&sparse, "first", 1.3f);
138
+ scc.Assign(&sparse, "second", 2.1f);
139
+ BOOST_CHECK_EQUAL( scc.GetScoreForProducer(&sparse,"first"), 1.3f);
140
+ BOOST_CHECK_EQUAL( scc.GetScoreForProducer(&sparse,"second"), 2.1f);
141
+ BOOST_CHECK_EQUAL( scc.GetScoreForProducer(&sparse,"third"), 0.0f);
142
+ scc.Assign(&sparse, "first", -1.9f);
143
+ BOOST_CHECK_EQUAL( scc.GetScoreForProducer(&sparse,"first"), -1.9f);
144
+ scc.PlusEquals(&sparse, StringPiece("first"), -1.9f);
145
+ BOOST_CHECK_EQUAL( scc.GetScoreForProducer(&sparse,"first"), -3.8f);
146
+ }
147
+
148
+ /*
149
+ Doesn't work because of the static registration of ScoreProducers
150
+ in ScoreComponentCollection.
151
+ BOOST_FIXTURE_TEST_CASE(save, MockProducers)
152
+ {
153
+ ScoreComponentCollection scc;
154
+ scc.Assign(&sparse, "first", 1.1f);
155
+ scc.Assign(&single, 0.25f);
156
+ float arr[] = {1,2.1,3,4,5};
157
+ std::vector<float> vec1(arr,arr+5);
158
+ scc.Assign(&multi,vec1);
159
+ ostringstream out;
160
+ scc.Save(out);
161
+ cerr << out.str() << endl;
162
+ istringstream in (out.str());
163
+ string line;
164
+ getline(in,line);
165
+ BOOST_CHECK_EQUAL(line, "MockSingle:4_1 0.25");
166
+ getline(in,line);
167
+ BOOST_CHECK_EQUAL(line, "MockMulti:4_1 1");
168
+ getline(in,line);
169
+ BOOST_CHECK_EQUAL(line, "MockMulti:4_2 2.1");
170
+ getline(in,line);
171
+ BOOST_CHECK_EQUAL(line, "MockMulti:4_3 3");
172
+ getline(in,line);
173
+ BOOST_CHECK_EQUAL(line, "MockMulti:4_4 4");
174
+ getline(in,line);
175
+ BOOST_CHECK_EQUAL(line, "MockMulti:4_5 5");
176
+ getline(in,line);
177
+ BOOST_CHECK_EQUAL(line,"MockSparse:4_first 1.1");
178
+ BOOST_CHECK(!getline(in,line));
179
+ }
180
+ */
181
+
182
+
183
+ BOOST_AUTO_TEST_SUITE_END()
184
+
mosesdecoder/moses/Search.cpp ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "Manager.h"
2
+ #include "SearchCubePruning.h"
3
+ #include "SearchNormal.h"
4
+ #include "InputType.h"
5
+ #include "util/exception.hh"
6
+
7
+ namespace Moses
8
+ {
9
+
10
/** Common setup for all search algorithms: cache references to the manager,
 *  its input sentence and decoder options, build the bitmap collection used
 *  for coverage tracking, and start the per-sentence timer.
 */
Search::Search(Manager& manager)
  : m_manager(manager)
  , m_source(manager.GetSource())
  , m_options(*manager.options())
  , m_inputPath()
  , m_initialTransOpt()
  , m_bitmaps(manager.GetSource().GetSize(), manager.GetSource().m_sourceCompleted)
  , interrupted_flag(0)
{
  // the empty seed translation option is anchored to an empty input path
  m_initialTransOpt.SetInputPath(m_inputPath);
  m_timer.start();
}
22
+
23
+ bool
24
+ Search::
25
+ out_of_time()
26
+ {
27
+ int const& timelimit = m_options.search.timeout;
28
+ if (timelimit > 0) {
29
+ double elapsed_time = GetUserTime();
30
+ if (elapsed_time > timelimit) {
31
+ VERBOSE(1,"Decoding is out of time (" << elapsed_time << ","
32
+ << timelimit << ")" << std::endl);
33
+ interrupted_flag = 1;
34
+ return true;
35
+ }
36
+ }
37
+ int const& segment_timelimit = m_options.search.segment_timeout;
38
+ if (segment_timelimit > 0) {
39
+ double elapsed_time = m_timer.get_elapsed_time();
40
+ if (elapsed_time > segment_timelimit) {
41
+ VERBOSE(1,"Decoding for segment is out of time (" << elapsed_time << ","
42
+ << segment_timelimit << ")" << std::endl);
43
+ interrupted_flag = 1;
44
+ return true;
45
+ }
46
+ }
47
+ return false;
48
+ }
49
+
50
+ }
mosesdecoder/moses/Search.h ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef moses_Search_h
2
+ #define moses_Search_h
3
+
4
+ #include <vector>
5
+ #include "TypeDef.h"
6
+ #include "TranslationOption.h"
7
+ #include "Phrase.h"
8
+ #include "InputPath.h"
9
+ #include "Bitmaps.h"
10
+ #include "Timer.h"
11
+
12
+ namespace Moses
13
+ {
14
+
15
+ class HypothesisStack;
16
+ class Hypothesis;
17
+ class InputType;
18
+ class TranslationOptionCollection;
19
+ class Manager;
20
+ class Phrase;
21
+
22
+ /** Base search class used in the phrase-based decoder.
23
+ *
24
+ * Actual search class that implement the cube pruning algorithm (SearchCubePruning)
25
+ * or standard beam search (SearchNormal) should inherits from this class, and
26
+ * override pure virtual functions.
27
+ */
28
class Search
{
public:
  //! stacks of partial hypotheses built so far (one per coverage size)
  virtual const std::vector<HypothesisStack*>& GetHypothesisStacks() const = 0;
  //! best complete hypothesis found, or NULL-equivalent per implementation
  virtual const Hypothesis *GetBestHypothesis() const = 0;

  //! Decode the sentence according to the specified search algorithm.
  virtual void Decode() = 0;

  explicit Search(Manager& manager);
  virtual ~Search() {}

protected:
  Manager& m_manager;            // owning manager for this sentence
  const InputType &m_source;     // input being translated
  AllOptions const& m_options;   // decoder options (search limits etc.)

  InputPath m_inputPath; // for initial hypo
  TranslationOption m_initialTransOpt; /**< used to seed 1st hypo */
  Bitmaps m_bitmaps;     // shared coverage-bitmap collection

  /** flag indicating that decoder ran out of time (see switch -time-out) */
  size_t interrupted_flag;

  Timer m_timer;         // per-sentence timer, started in the constructor
  bool out_of_time();    // true once a time limit is exceeded (sets interrupted_flag)
};
55
+
56
+ }
57
+ #endif
mosesdecoder/moses/SearchCubePruning.h ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef moses_SearchCubePruning_h
2
+ #define moses_SearchCubePruning_h
3
+
4
+ #include <vector>
5
+ #include "Search.h"
6
+ #include "HypothesisStackCubePruning.h"
7
+ #include "SentenceStats.h"
8
+
9
+ namespace Moses
10
+ {
11
+
12
+ class InputType;
13
+ class TranslationOptionCollection;
14
+
15
+ /** Functions and variables you need to decoder an input using the phrase-based decoder with cube-pruning
16
+ * Instantiated by the Manager class
17
+ */
18
class SearchCubePruning: public Search
{
protected:
  std::vector < HypothesisStack* > m_hypoStackColl; /**< stacks to store hypotheses (partial translations) */
  // no of elements = no of words in source + 1
  const TranslationOptionCollection &m_transOptColl; /**< pre-computed list of translation options for the phrases in this sentence */

  //! go thru all bitmaps in 1 stack & create backpointers to bitmaps in the stack
  void CreateForwardTodos(HypothesisStackCubePruning &stack);
  //! create a back pointer to this bitmap, with edge that has this words range translation
  void CreateForwardTodos(const Bitmap &bitmap, const Range &range, BitmapContainer &bitmapContainer);
  //! whether translating `range` from coverage `bitmap` respects the distortion limit
  bool CheckDistortion(const Bitmap &bitmap, const Range &range) const;

  // debugging aid: dump the bitmap-container back-pointer graph
  void PrintBitmapContainerGraph();

public:
  SearchCubePruning(Manager& manager, const TranslationOptionCollection &transOptColl);
  ~SearchCubePruning();

  //! run cube-pruning decoding over the input sentence
  void Decode();

  // stack-size diagnostics (verbose output)
  void OutputHypoStackSize();
  void OutputHypoStack(int stack);

  virtual const std::vector < HypothesisStack* >& GetHypothesisStacks() const;
  virtual const Hypothesis *GetBestHypothesis() const;
};
45
+
46
+
47
+ }
48
+ #endif
mosesdecoder/moses/SearchNormal.cpp ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "Manager.h"
2
+ #include "Timer.h"
3
+ #include "SearchNormal.h"
4
+ #include "SentenceStats.h"
5
+
6
+ #include <boost/foreach.hpp>
7
+
8
+ using namespace std;
9
+
10
+ namespace Moses
11
+ {
12
+ /**
13
+ * Organizing main function
14
+ *
15
+ * /param source input sentence
16
+ * /param transOptColl collection of translation options to be used for this sentence
17
+ */
18
+ SearchNormal::
19
+ SearchNormal(Manager& manager, const TranslationOptionCollection &transOptColl)
20
+ : Search(manager)
21
+ , m_hypoStackColl(manager.GetSource().GetSize() + 1)
22
+ , m_transOptColl(transOptColl)
23
+ {
24
+ VERBOSE(1, "Translating: " << m_source << endl);
25
+
26
+ // initialize the stacks: create data structure and set limits
27
+ std::vector < HypothesisStackNormal >::iterator iterStack;
28
+ for (size_t ind = 0 ; ind < m_hypoStackColl.size() ; ++ind) {
29
+ HypothesisStackNormal *sourceHypoColl = new HypothesisStackNormal(m_manager);
30
+ sourceHypoColl->SetMaxHypoStackSize(this->m_options.search.stack_size,
31
+ this->m_options.search.stack_diversity);
32
+ sourceHypoColl->SetBeamWidth(this->m_options.search.beam_width);
33
+ m_hypoStackColl[ind] = sourceHypoColl;
34
+ }
35
+ }
36
+
37
SearchNormal::~SearchNormal()
{
  // delete the stack objects this search allocated in the constructor
  RemoveAllInColl(m_hypoStackColl);
}
41
+
42
+
43
+ bool
44
+ SearchNormal::
45
+ ProcessOneStack(HypothesisStack* hstack)
46
+ {
47
+ if (this->out_of_time()) return false;
48
+ SentenceStats &stats = m_manager.GetSentenceStats();
49
+ HypothesisStackNormal &sourceHypoColl
50
+ = *static_cast<HypothesisStackNormal*>(hstack);
51
+
52
+ // the stack is pruned before processing (lazy pruning):
53
+ VERBOSE(3,"processing hypothesis from next stack");
54
+ IFVERBOSE(2) stats.StartTimeStack();
55
+ sourceHypoColl.PruneToSize(m_options.search.stack_size);
56
+ VERBOSE(3,std::endl);
57
+ sourceHypoColl.CleanupArcList();
58
+ IFVERBOSE(2) stats.StopTimeStack();
59
+
60
+ // go through each hypothesis on the stack and try to expand it
61
+ // BOOST_FOREACH(Hypothesis* h, sourceHypoColl)
62
+ HypothesisStackNormal::const_iterator h;
63
+ for (h = sourceHypoColl.begin(); h != sourceHypoColl.end(); ++h)
64
+ ProcessOneHypothesis(**h);
65
+ return true;
66
+ }
67
+
68
+
69
+ /**
70
+ * Main decoder loop that translates a sentence by expanding
71
+ * hypotheses stack by stack, until the end of the sentence.
72
+ */
73
+ void SearchNormal::Decode()
74
+ {
75
+ // initial seed hypothesis: nothing translated, no words produced
76
+ const Bitmap &initBitmap = m_bitmaps.GetInitialBitmap();
77
+ Hypothesis *hypo = new Hypothesis(m_manager, m_source, m_initialTransOpt, initBitmap, m_manager.GetNextHypoId());
78
+
79
+ m_hypoStackColl[0]->AddPrune(hypo);
80
+
81
+ // go through each stack
82
+ BOOST_FOREACH(HypothesisStack* hstack, m_hypoStackColl) {
83
+ if (!ProcessOneStack(hstack)) return;
84
+ IFVERBOSE(2) OutputHypoStackSize();
85
+ actual_hypoStack = static_cast<HypothesisStackNormal*>(hstack);
86
+ }
87
+ }
88
+
89
+
90
/** Find all translation options to expand one hypothesis, trigger expansion
 * this is mostly a check for overlap with already covered words, and for
 * violation of reordering limits.
 * \param hypothesis hypothesis to be expanded upon
 */
void
SearchNormal::
ProcessOneHypothesis(const Hypothesis &hypothesis)
{
  // since we check for reordering limits, its good to have that limit handy
  bool isWordLattice = m_source.GetType() == WordLatticeInput;

  const Bitmap &hypoBitmap = hypothesis.GetWordsBitmap();
  const size_t hypoFirstGapPos = hypoBitmap.GetFirstGapPos();
  size_t const sourceSize = m_source.GetSize();

  ReorderingConstraint const&
  ReoConstraint = m_source.GetReorderingConstraint();

  // no limit of reordering: only check for overlap
  if (m_options.reordering.max_distortion < 0) {

    // enumerate every span [startPos, endPos] beginning at or after the first gap
    for (size_t startPos = hypoFirstGapPos ; startPos < sourceSize ; ++startPos) {
      TranslationOptionList const* tol;
      size_t endPos = startPos;
      // grow endPos one word at a time while options exist for the span
      for (tol = m_transOptColl.GetTranslationOptionList(startPos, endPos);
           tol && endPos < sourceSize;
           tol = m_transOptColl.GetTranslationOptionList(startPos, ++endPos)) {
        // skip spans with no options, spans overlapping covered words,
        // and spans forbidden by hard reordering constraints (walls/zones)
        if (tol->size() == 0
            || hypoBitmap.Overlap(Range(startPos, endPos))
            || !ReoConstraint.Check(hypoBitmap, startPos, endPos)) {
          continue;
        }

        //TODO: does this method include incompatible WordLattice hypotheses?
        ExpandAllHypotheses(hypothesis, startPos, endPos);
      }
    }
    return; // done with special case (no reordering limit)
  }

  // There are reordering limits. Make sure they are not violated.

  Range prevRange = hypothesis.GetCurrSourceWordsRange();
  for (size_t startPos = hypoFirstGapPos ; startPos < sourceSize ; ++startPos) {

    // don't bother expanding phrases if the first position is already taken
    if(hypoBitmap.GetValue(startPos)) continue;

    // cap candidate span length by both remaining input and max phrase length
    size_t maxSize = sourceSize - startPos;
    size_t maxSizePhrase = m_options.search.max_phrase_length;
    maxSize = (maxSize < maxSizePhrase) ? maxSize : maxSizePhrase;
    size_t closestLeft = hypoBitmap.GetEdgeToTheLeftOf(startPos);

    if (isWordLattice) {
      // first question: is there a path from the closest translated word to the left
      // of the hypothesized extension to the start of the hypothesized extension?
      // long version:
      // - is there anything to our left?
      // - is it farther left than where we're starting anyway?
      // - can we get to it?

      // closestLeft is exclusive: a value of 3 means 2 is covered, our
      // arc is currently ENDING at 3 and can start at 3 implicitly
      if (closestLeft != 0 && closestLeft != startPos
          && !m_source.CanIGetFromAToB(closestLeft, startPos))
        continue;

      // also require lattice connectivity back to the previous phrase's start
      // when we are jumping leftwards
      if (prevRange.GetStartPos() != NOT_FOUND &&
          prevRange.GetStartPos() > startPos &&
          !m_source.CanIGetFromAToB(startPos, prevRange.GetStartPos()))
        continue;
    }

    // reject the start position outright if the jump from the previous phrase
    // already exceeds the distortion limit
    Range currentStartRange(startPos, startPos);
    if(m_source.ComputeDistortionDistance(prevRange, currentStartRange)
        > m_options.reordering.max_distortion)
      continue;

    TranslationOptionList const* tol;
    size_t endPos = startPos;
    for (tol = m_transOptColl.GetTranslationOptionList(startPos, endPos);
         tol && endPos < sourceSize;
         tol = m_transOptColl.GetTranslationOptionList(startPos, ++endPos)) {
      Range extRange(startPos, endPos);
      if (tol->size() == 0
          || hypoBitmap.Overlap(extRange)
          || !ReoConstraint.Check(hypoBitmap, startPos, endPos)
          || (isWordLattice && !m_source.IsCoveragePossible(extRange))) {
        continue;
      }

      // ask second question here: we already know we can get to our
      // starting point from the closest thing to the left. We now ask the
      // follow up: can we get from our end to the closest thing on the
      // right?
      //
      // long version: is anything to our right? is it farther
      // right than our (inclusive) end? can our end reach it?
      bool isLeftMostEdge = (hypoFirstGapPos == startPos);

      size_t closestRight = hypoBitmap.GetEdgeToTheRightOf(endPos);
      if (isWordLattice) {
        if (closestRight != endPos
            && ((closestRight + 1) < sourceSize)
            && !m_source.CanIGetFromAToB(endPos + 1, closestRight + 1)) {
          continue;
        }
      }

      if (isLeftMostEdge) {
        // any length extension is okay if starting at left-most edge
        ExpandAllHypotheses(hypothesis, startPos, endPos);
      } else { // starting somewhere other than left-most edge, use caution
        // the basic idea is this: we would like to translate a phrase
        // starting from a position further right than the left-most
        // open gap. The distortion penalty for the following phrase
        // will be computed relative to the ending position of the
        // current extension, so we ask now what its maximum value will
        // be (which will always be the value of the hypothesis starting
        // at the left-most edge). If this value is less than the
        // distortion limit, we don't allow this extension to be made.
        Range bestNextExtension(hypoFirstGapPos, hypoFirstGapPos);

        if (m_source.ComputeDistortionDistance(extRange, bestNextExtension)
            > m_options.reordering.max_distortion) continue;

        // everything is fine, we're good to go
        ExpandAllHypotheses(hypothesis, startPos, endPos);
      }
    }
  }
}
223
+
224
+
225
+ /**
226
+ * Expand a hypothesis given a list of translation options
227
+ * \param hypothesis hypothesis to be expanded upon
228
+ * \param startPos first word position of span covered
229
+ * \param endPos last word position of span covered
230
+ */
231
+
232
+ void
233
+ SearchNormal::
234
+ ExpandAllHypotheses(const Hypothesis &hypothesis, size_t startPos, size_t endPos)
235
+ {
236
+ // early discarding: check if hypothesis is too bad to build
237
+ // this idea is explained in (Moore&Quirk, MT Summit 2007)
238
+ float expectedScore = 0.0f;
239
+
240
+ const Bitmap &sourceCompleted = hypothesis.GetWordsBitmap();
241
+ float estimatedScore = m_transOptColl.GetEstimatedScores().CalcEstimatedScore( sourceCompleted, startPos, endPos );
242
+
243
+ const Range &hypoRange = hypothesis.GetCurrSourceWordsRange();
244
+ //cerr << "DOING " << sourceCompleted << " [" << hypoRange.GetStartPos() << " " << hypoRange.GetEndPos() << "]"
245
+ // " [" << startPos << " " << endPos << "]" << endl;
246
+
247
+ if (m_options.search.UseEarlyDiscarding()) {
248
+ // expected score is based on score of current hypothesis
249
+ expectedScore = hypothesis.GetScore();
250
+
251
+ // add new future score estimate
252
+ expectedScore += estimatedScore;
253
+ }
254
+
255
+ // loop through all translation options
256
+ const TranslationOptionList* tol
257
+ = m_transOptColl.GetTranslationOptionList(startPos, endPos);
258
+ if (!tol || tol->size() == 0) return;
259
+
260
+ // Create new bitmap
261
+ const TranslationOption &transOpt = **tol->begin();
262
+ const Range &nextRange = transOpt.GetSourceWordsRange();
263
+ const Bitmap &nextBitmap = m_bitmaps.GetBitmap(sourceCompleted, nextRange);
264
+
265
+ TranslationOptionList::const_iterator iter;
266
+ for (iter = tol->begin() ; iter != tol->end() ; ++iter) {
267
+ const TranslationOption &transOpt = **iter;
268
+ ExpandHypothesis(hypothesis, transOpt, expectedScore, estimatedScore, nextBitmap);
269
+ }
270
+ }
271
+
272
/**
 * Expand one hypothesis with a translation option.
 * this involves initial creation, scoring and adding it to the proper stack
 * \param hypothesis hypothesis to be expanded upon
 * \param transOpt translation option (phrase translation)
 *        that is applied to create the new hypothesis
 * \param expectedScore base score for early discarding
 *        (base hypothesis score plus future score estimation)
 * \param estimatedScore future score estimate for the coverage after applying transOpt
 * \param bitmap coverage bitmap of the resulting hypothesis
 */
void SearchNormal::ExpandHypothesis(const Hypothesis &hypothesis,
                                    const TranslationOption &transOpt,
                                    float expectedScore,
                                    float estimatedScore,
                                    const Bitmap &bitmap)
{
  SentenceStats &stats = m_manager.GetSentenceStats();

  Hypothesis *newHypo;
  if (! m_options.search.UseEarlyDiscarding()) {
    // simple build, no questions asked
    IFVERBOSE(2) {
      stats.StartTimeBuildHyp();
    }
    newHypo = new Hypothesis(hypothesis, transOpt, bitmap, m_manager.GetNextHypoId());
    IFVERBOSE(2) {
      stats.StopTimeBuildHyp();
    }
    if (newHypo==NULL) return;

    IFVERBOSE(2) {
      m_manager.GetSentenceStats().StartTimeOtherScore();
    }
    // full scoring of the freshly built hypothesis
    newHypo->EvaluateWhenApplied(estimatedScore);
    IFVERBOSE(2) {
      m_manager.GetSentenceStats().StopTimeOtherScore();

      // TODO: these have been meaningless for a while.
      // At least since commit 67fb5c
      // should now be measured in SearchNormal.cpp:254 instead, around CalcFutureScore2()
      // CalcFutureScore2() also called in BackwardsEdge::Initialize().
      //
      // however, CalcFutureScore2() should be quick
      // since it uses dynamic programming results in SquareMatrix
      m_manager.GetSentenceStats().StartTimeEstimateScore();
      m_manager.GetSentenceStats().StopTimeEstimateScore();
    }
  } else
    // early discarding: check if hypothesis is too bad to build
  {
    // worst possible score may have changed -> recompute
    size_t wordsTranslated = hypothesis.GetWordsBitmap().GetNumWordsCovered() + transOpt.GetSize();
    float allowedScore = m_hypoStackColl[wordsTranslated]->GetWorstScore();
    if (m_options.search.stack_diversity) {
      // with stack diversity, the per-coverage-bitmap worst score may be tighter
      WordsBitmapID id = hypothesis.GetWordsBitmap().GetIDPlus(transOpt.GetStartPos(), transOpt.GetEndPos());
      float allowedScoreForBitmap = m_hypoStackColl[wordsTranslated]->GetWorstScoreForBitmap( id );
      allowedScore = std::min( allowedScore, allowedScoreForBitmap );
    }
    allowedScore += m_options.search.early_discarding_threshold;

    // add expected score of translation option
    expectedScore += transOpt.GetFutureScore();

    // check if transOpt score push it already below limit
    if (expectedScore < allowedScore) {
      IFVERBOSE(2) {
        stats.AddNotBuilt();
      }
      return;
    }

    // build the hypothesis without scoring
    // NOTE(review): in this branch the new hypothesis is never passed through
    // EvaluateWhenApplied here — presumably scoring happens later via the
    // stack/arc machinery; confirm before relying on its score at this point
    IFVERBOSE(2) {
      stats.StartTimeBuildHyp();
    }
    newHypo = new Hypothesis(hypothesis, transOpt, bitmap, m_manager.GetNextHypoId());
    if (newHypo==NULL) return;
    IFVERBOSE(2) {
      stats.StopTimeBuildHyp();
    }

    // ... and check if that is below the limit
    if (expectedScore < allowedScore) {
      IFVERBOSE(2) {
        stats.AddEarlyDiscarded();
      }
      delete newHypo;
      return;
    }

  }

  // logging for the curious
  IFVERBOSE(3) {
    newHypo->PrintHypothesis();
  }

  // add to hypothesis stack (indexed by number of covered source words)
  size_t wordsTranslated = newHypo->GetWordsBitmap().GetNumWordsCovered();
  IFVERBOSE(2) {
    stats.StartTimeStack();
  }
  m_hypoStackColl[wordsTranslated]->AddPrune(newHypo);
  IFVERBOSE(2) {
    stats.StopTimeStack();
  }
}
378
+
379
/** Accessor for the full stack collection (one stack per coverage size). */
const std::vector < HypothesisStack* >& SearchNormal::GetHypothesisStacks() const
{
  return m_hypoStackColl;
}
383
+
384
+ /**
385
+ * Find best hypothesis on the last stack.
386
+ * This is the end point of the best translation, which can be traced back from here
387
+ */
388
+ const Hypothesis *SearchNormal::GetBestHypothesis() const
389
+ {
390
+ if (interrupted_flag == 0) {
391
+ const HypothesisStackNormal &hypoColl = *static_cast<HypothesisStackNormal*>(m_hypoStackColl.back());
392
+ return hypoColl.GetBestHypothesis();
393
+ } else {
394
+ const HypothesisStackNormal &hypoColl = *actual_hypoStack;
395
+ return hypoColl.GetBestHypothesis();
396
+ }
397
+ }
398
+
399
+ /**
400
+ * Logging of hypothesis stack sizes
401
+ */
402
+ void SearchNormal::OutputHypoStackSize()
403
+ {
404
+ std::vector < HypothesisStack* >::const_iterator iterStack = m_hypoStackColl.begin();
405
+ TRACE_ERR( "Stack sizes: " << (int)(*iterStack)->size());
406
+ for (++iterStack; iterStack != m_hypoStackColl.end() ; ++iterStack) {
407
+ TRACE_ERR( ", " << (int)(*iterStack)->size());
408
+ }
409
+ TRACE_ERR( endl);
410
+ }
411
+
412
+ void SearchNormal::OutputHypoStack()
413
+ {
414
+ // all stacks
415
+ int i = 0;
416
+ vector < HypothesisStack* >::iterator iterStack;
417
+ for (iterStack = m_hypoStackColl.begin() ; iterStack != m_hypoStackColl.end() ; ++iterStack) {
418
+ HypothesisStackNormal &hypoColl = *static_cast<HypothesisStackNormal*>(*iterStack);
419
+ TRACE_ERR( "Stack " << i++ << ": " << endl << hypoColl << endl);
420
+ }
421
+ }
422
+
423
+ }
mosesdecoder/moses/SquareMatrix.cpp ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+
4
+ /***********************************************************************
5
+ Moses - factored phrase-based language decoder
6
+ Copyright (C) 2006 University of Edinburgh
7
+
8
+ This library is free software; you can redistribute it and/or
9
+ modify it under the terms of the GNU Lesser General Public
10
+ License as published by the Free Software Foundation; either
11
+ version 2.1 of the License, or (at your option) any later version.
12
+
13
+ This library is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ Lesser General Public License for more details.
17
+
18
+ You should have received a copy of the GNU Lesser General Public
19
+ License along with this library; if not, write to the Free Software
20
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
+ ***********************************************************************/
22
+
23
+ #include <string>
24
+ #include <iostream>
25
+ #include "SquareMatrix.h"
26
+ #include "TypeDef.h"
27
+ #include "Util.h"
28
+
29
+ using namespace std;
30
+
31
+ namespace Moses
32
+ {
33
+ void SquareMatrix::InitTriangle(float val)
34
+ {
35
+ for(size_t row=0; row < m_size; row++) {
36
+ for(size_t col=row; col<m_size; col++) {
37
+ SetScore(row, col, -numeric_limits<float>::infinity());
38
+ }
39
+ }
40
+ }
41
+
42
/**
 * Calculate future score estimate for a given coverage bitmap
 *
 * Sums the precomputed span scores for every maximal uncovered gap
 * in the bitmap.
 *
 * \param bitmap coverage bitmap
 */
float SquareMatrix::CalcEstimatedScore( Bitmap const &bitmap ) const
{
  // sentinel meaning "currently not inside a gap"
  const size_t notInGap= numeric_limits<size_t>::max();
  size_t startGap = notInGap;
  float estimatedScore = 0.0f;
  for(size_t currPos = 0 ; currPos < bitmap.GetSize() ; currPos++) {
    // start of a new gap?
    if(bitmap.GetValue(currPos) == false && startGap == notInGap) {
      startGap = currPos;
    }
    // end of a gap?
    else if(bitmap.GetValue(currPos) == true && startGap != notInGap) {
      estimatedScore += GetScore(startGap, currPos - 1);
      startGap = notInGap;
    }
  }
  // coverage ending with gap?
  if (startGap != notInGap) {
    estimatedScore += GetScore(startGap, bitmap.GetSize() - 1);
  }

  return estimatedScore;
}
71
+
72
/**
 * Calculate future score estimate for a given coverage bitmap
 * and an additional span that is also covered. This function is used
 * to compute future score estimates for hypotheses that we may want
 * build, but first want to check.
 *
 * Note: this function is implemented a bit more complex than
 * the basic one (w/o additional phrase) for speed reasons,
 * which is probably overkill.
 *
 * \param bitmap coverage bitmap
 * \param startPos start of the span that is added to the coverage
 * \param endPos end of the span that is added to the coverage
 */
float SquareMatrix::CalcEstimatedScore( Bitmap const &bitmap, size_t startPos, size_t endPos ) const
{
  // sentinel meaning "currently not inside a gap"
  const size_t notInGap= numeric_limits<size_t>::max();
  float estimatedScore = 0.0f;
  size_t startGap = bitmap.GetFirstGapPos();
  if (startGap == NOT_FOUND) return estimatedScore; // everything filled

  // start loop at first gap
  size_t startLoop = startGap+1;
  if (startPos == startGap) { // unless covered by phrase
    startGap = notInGap;
    startLoop = endPos+1; // -> postpone start
  }

  // iterate only up to the last position that is covered by either the
  // bitmap or the hypothetical new span; everything beyond is one big gap
  size_t lastCovered = bitmap.GetLastPos();
  if (endPos > lastCovered || lastCovered == NOT_FOUND) lastCovered = endPos;

  for(size_t currPos = startLoop; currPos <= lastCovered ; currPos++) {
    // start of a new gap? (a position counts as covered if it is set in the
    // bitmap OR lies inside [startPos, endPos])
    if(startGap == notInGap && bitmap.GetValue(currPos) == false && (currPos < startPos || currPos > endPos)) {
      startGap = currPos;
    }
    // end of a gap?
    else if(startGap != notInGap && (bitmap.GetValue(currPos) == true || (startPos <= currPos && currPos <= endPos))) {
      estimatedScore += GetScore(startGap, currPos - 1);
      startGap = notInGap;
    }
  }
  // coverage ending with gap?
  if (lastCovered != bitmap.GetSize() - 1) {
    estimatedScore += GetScore(lastCovered+1, bitmap.GetSize() - 1);
  }

  return estimatedScore;
}
122
+
123
// macro instantiation — presumably generates a ToString()/debug-print body
// for SquareMatrix (see TO_STRING_BODY's definition in the project's Util
// header) — TODO confirm
TO_STRING_BODY(SquareMatrix);
124
+
125
+ }
126
+
127
+
mosesdecoder/moses/StaticData.cpp ADDED
@@ -0,0 +1,966 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
2
+ // $Id$
3
+ // vim:tabstop=2
4
+
5
+ /***********************************************************************
6
+ Moses - factored phrase-based language decoder
7
+ Copyright (C) 2006 University of Edinburgh
8
+
9
+ This library is free software; you can redistribute it and/or
10
+ modify it under the terms of the GNU Lesser General Public
11
+ License as published by the Free Software Foundation; either
12
+ version 2.1 of the License, or (at your option) any later version.
13
+
14
+ This library is distributed in the hope that it will be useful,
15
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17
+ Lesser General Public License for more details.
18
+
19
+ You should have received a copy of the GNU Lesser General Public
20
+ License along with this library; if not, write to the Free Software
21
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
+ ***********************************************************************/
23
+
24
+ #include <string>
25
+ #include <boost/algorithm/string/predicate.hpp>
26
+
27
+ #include "moses/FF/Factory.h"
28
+ #include "TypeDef.h"
29
+ #include "moses/FF/WordPenaltyProducer.h"
30
+ #include "moses/FF/UnknownWordPenaltyProducer.h"
31
+ #include "moses/FF/InputFeature.h"
32
+ #include "moses/FF/DynamicCacheBasedLanguageModel.h"
33
+ #include "moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h"
34
+
35
+ #include "DecodeStepTranslation.h"
36
+ #include "DecodeStepGeneration.h"
37
+ #include "GenerationDictionary.h"
38
+ #include "StaticData.h"
39
+ #include "Util.h"
40
+ #include "FactorCollection.h"
41
+ #include "Timer.h"
42
+ #include "TranslationOption.h"
43
+ #include "DecodeGraph.h"
44
+ #include "InputFileStream.h"
45
+ #include "ScoreComponentCollection.h"
46
+ #include "DecodeGraph.h"
47
+ #include "TranslationModel/PhraseDictionary.h"
48
+ #include "TranslationModel/PhraseDictionaryTreeAdaptor.h"
49
+
50
+ #ifdef WITH_THREADS
51
+ #include <boost/thread.hpp>
52
+ #endif
53
+ #ifdef HAVE_CMPH
54
+ #include "moses/TranslationModel/CompactPT/PhraseDictionaryCompact.h"
55
+ #endif
56
+ #if defined HAVE_CMPH
57
+ #include "moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h"
58
+ #endif
59
+
60
+ using namespace std;
61
+ using namespace boost::algorithm;
62
+
63
+ namespace Moses
64
+ {
65
+ StaticData StaticData::s_instance;
66
+
67
+ StaticData::StaticData()
68
+ : m_options(new AllOptions)
69
+ , m_requireSortingAfterSourceContext(false)
70
+ , m_currentWeightSetting("default")
71
+ , m_treeStructure(NULL)
72
+ , m_coordSpaceNextID(1)
73
+ {
74
+ Phrase::InitializeMemPool();
75
+ }
76
+
77
+ StaticData::~StaticData()
78
+ {
79
+ RemoveAllInColl(m_decodeGraphs);
80
+ Phrase::FinalizeMemPool();
81
+ }
82
+
83
+ bool StaticData::LoadDataStatic(Parameter *parameter, const std::string &execPath)
84
+ {
85
+ s_instance.SetExecPath(execPath);
86
+ return s_instance.LoadData(parameter);
87
+ }
88
+
89
+ void
90
+ StaticData
91
+ ::initialize_features()
92
+ {
93
+ std::map<std::string, std::string> featureNameOverride = OverrideFeatureNames();
94
+ // all features
95
+ map<string, int> featureIndexMap;
96
+
97
+ const PARAM_VEC* params = m_parameter->GetParam("feature");
98
+ for (size_t i = 0; params && i < params->size(); ++i) {
99
+ const string &line = Trim(params->at(i));
100
+ VERBOSE(1,"line=" << line << endl);
101
+ if (line.empty())
102
+ continue;
103
+
104
+ vector<string> toks = Tokenize(line);
105
+
106
+ string &feature = toks[0];
107
+ std::map<std::string, std::string>::const_iterator iter
108
+ = featureNameOverride.find(feature);
109
+ if (iter == featureNameOverride.end()) {
110
+ // feature name not override
111
+ m_registry.Construct(feature, line);
112
+ } else {
113
+ // replace feature name with new name
114
+ string newName = iter->second;
115
+ feature = newName;
116
+ string newLine = Join(" ", toks);
117
+ m_registry.Construct(newName, newLine);
118
+ }
119
+ }
120
+
121
+ NoCache();
122
+ OverrideFeatures();
123
+
124
+ }
125
+
126
+ bool
127
+ StaticData
128
+ ::ini_output_options()
129
+ {
130
+ // verbose level
131
+ m_parameter->SetParameter(m_verboseLevel, "verbose", (size_t) 1);
132
+ m_parameter->SetParameter<string>(m_outputUnknownsFile,
133
+ "output-unknowns", "");
134
+ return true;
135
+ }
136
+
137
+ // threads, timeouts, etc.
138
+ bool
139
+ StaticData
140
+ ::ini_performance_options()
141
+ {
142
+ const PARAM_VEC *params;
143
+
144
+ m_threadCount = 1;
145
+ params = m_parameter->GetParam("threads");
146
+ if (params && params->size()) {
147
+ if (params->at(0) == "all") {
148
+ #ifdef WITH_THREADS
149
+ m_threadCount = boost::thread::hardware_concurrency();
150
+ if (!m_threadCount) {
151
+ std::cerr << "-threads all specified but Boost doesn't know how many cores there are";
152
+ return false;
153
+ }
154
+ #else
155
+ std::cerr << "-threads all specified but moses not built with thread support";
156
+ return false;
157
+ #endif
158
+ } else {
159
+ m_threadCount = Scan<int>(params->at(0));
160
+ if (m_threadCount < 1) {
161
+ std::cerr << "Specify at least one thread.";
162
+ return false;
163
+ }
164
+ #ifndef WITH_THREADS
165
+ if (m_threadCount > 1) {
166
+ std::cerr << "Error: Thread count of " << params->at(0)
167
+ << " but moses not built with thread support";
168
+ return false;
169
+ }
170
+ #endif
171
+ }
172
+ }
173
+ return true;
174
+ }
175
+
176
+ bool StaticData::LoadData(Parameter *parameter)
177
+ {
178
+ m_parameter = parameter;
179
+
180
+ const PARAM_VEC *params;
181
+
182
+ m_options->init(*parameter);
183
+ if (is_syntax(m_options->search.algo))
184
+ m_options->syntax.LoadNonTerminals(*parameter, FactorCollection::Instance());
185
+
186
+ if (is_syntax(m_options->search.algo))
187
+ LoadChartDecodingParameters();
188
+
189
+ // ORDER HERE MATTERS, SO DON'T CHANGE IT UNLESS YOU KNOW WHAT YOU ARE DOING!
190
+ // input, output
191
+
192
+ m_parameter->SetParameter<string>(m_factorDelimiter, "factor-delimiter", "|");
193
+ m_parameter->SetParameter<size_t>(m_lmcache_cleanup_threshold, "clean-lm-cache", 1);
194
+
195
+ m_bookkeeping_options.init(*parameter);
196
+ if (!ini_output_options()) return false;
197
+
198
+ // threading etc.
199
+ if (!ini_performance_options()) return false;
200
+
201
+ // FEATURE FUNCTION INITIALIZATION HAPPENS HERE ===============================
202
+
203
+ // set class-specific default parameters
204
+ #if defined HAVE_CMPH
205
+ LexicalReorderingTableCompact::SetStaticDefaultParameters(*parameter);
206
+ PhraseDictionaryCompact::SetStaticDefaultParameters(*parameter);
207
+ #endif
208
+
209
+ initialize_features();
210
+
211
+ if (m_parameter->GetParam("show-weights") == NULL)
212
+ LoadFeatureFunctions();
213
+
214
+ LoadDecodeGraphs();
215
+
216
+ // sanity check that there are no weights without an associated FF
217
+ if (!CheckWeights()) return false;
218
+
219
+ //Load extra feature weights
220
+ string weightFile;
221
+ m_parameter->SetParameter<string>(weightFile, "weight-file", "");
222
+ if (!weightFile.empty()) {
223
+ ScoreComponentCollection extraWeights;
224
+ if (!extraWeights.Load(weightFile)) {
225
+ std::cerr << "Unable to load weights from " << weightFile;
226
+ return false;
227
+ }
228
+ m_allWeights.PlusEquals(extraWeights);
229
+ }
230
+
231
+ //Load sparse features from config (overrules weight file)
232
+ LoadSparseWeightsFromConfig();
233
+
234
+ // load alternate weight settings
235
+ //
236
+ // When and where are these used??? [UG]
237
+ //
238
+ // Update: Just checked the manual. The config file is NOT the right
239
+ // place to do this. [UG]
240
+ //
241
+ // <TODO>
242
+ // * Eliminate alternate-weight-setting. Alternate weight settings should
243
+ // be provided with the input, not in the config file.
244
+ // </TODO>
245
+ params = m_parameter->GetParam("alternate-weight-setting");
246
+ if (params && params->size() && !LoadAlternateWeightSettings())
247
+ return false;
248
+
249
+ return true;
250
+ }
251
+
252
+ void StaticData::SetWeight(const FeatureFunction* sp, float weight)
253
+ {
254
+ m_allWeights.Resize();
255
+ m_allWeights.Assign(sp,weight);
256
+ }
257
+
258
+ void StaticData::SetWeights(const FeatureFunction* sp,
259
+ const std::vector<float>& weights)
260
+ {
261
+ m_allWeights.Resize();
262
+ m_allWeights.Assign(sp,weights);
263
+ }
264
+
265
+ void StaticData::LoadNonTerminals()
266
+ {
267
+ string defaultNonTerminals;
268
+ m_parameter->SetParameter<string>(defaultNonTerminals, "non-terminals", "X");
269
+
270
+ FactorCollection &factorCollection = FactorCollection::Instance();
271
+
272
+ m_inputDefaultNonTerminal.SetIsNonTerminal(true);
273
+ const Factor *sourceFactor = factorCollection.AddFactor(Input, 0, defaultNonTerminals, true);
274
+ m_inputDefaultNonTerminal.SetFactor(0, sourceFactor);
275
+
276
+ m_outputDefaultNonTerminal.SetIsNonTerminal(true);
277
+ const Factor *targetFactor = factorCollection.AddFactor(Output, 0, defaultNonTerminals, true);
278
+ m_outputDefaultNonTerminal.SetFactor(0, targetFactor);
279
+
280
+ // for unknown words
281
+ const PARAM_VEC *params = m_parameter->GetParam("unknown-lhs");
282
+ if (params == NULL || params->size() == 0) {
283
+ UnknownLHSEntry entry(defaultNonTerminals, 0.0f);
284
+ m_unknownLHS.push_back(entry);
285
+ } else {
286
+ const string &filePath = params->at(0);
287
+
288
+ InputFileStream inStream(filePath);
289
+ string line;
290
+ while(getline(inStream, line)) {
291
+ vector<string> tokens = Tokenize(line);
292
+ UTIL_THROW_IF2(tokens.size() != 2,
293
+ "Incorrect unknown LHS format: " << line);
294
+ UnknownLHSEntry entry(tokens[0], Scan<float>(tokens[1]));
295
+ m_unknownLHS.push_back(entry);
296
+ // const Factor *targetFactor =
297
+ factorCollection.AddFactor(Output, 0, tokens[0], true);
298
+ }
299
+
300
+ }
301
+
302
+ }
303
+
304
+ void StaticData::LoadChartDecodingParameters()
305
+ {
306
+ LoadNonTerminals();
307
+
308
+ // source label overlap
309
+ m_parameter->SetParameter(m_sourceLabelOverlap, "source-label-overlap",
310
+ SourceLabelOverlapAdd);
311
+
312
+ }
313
+
314
+ void StaticData::LoadDecodeGraphs()
315
+ {
316
+ vector<string> mappingVector;
317
+ vector<size_t> maxChartSpans;
318
+
319
+ const PARAM_VEC *params;
320
+
321
+ params = m_parameter->GetParam("mapping");
322
+ if (params && params->size()) {
323
+ mappingVector = *params;
324
+ } else {
325
+ mappingVector.assign(1,"0 T 0");
326
+ }
327
+
328
+ params = m_parameter->GetParam("max-chart-span");
329
+ if (params && params->size()) {
330
+ maxChartSpans = Scan<size_t>(*params);
331
+ }
332
+
333
+ vector<string> toks = Tokenize(mappingVector[0]);
334
+ if (toks.size() == 3) {
335
+ // eg 0 T 0
336
+ LoadDecodeGraphsOld(mappingVector, maxChartSpans);
337
+ } else if (toks.size() == 2) {
338
+ if (toks[0] == "T" || toks[0] == "G") {
339
+ // eg. T 0
340
+ LoadDecodeGraphsOld(mappingVector, maxChartSpans);
341
+ } else {
342
+ // eg. 0 TM1
343
+ LoadDecodeGraphsNew(mappingVector, maxChartSpans);
344
+ }
345
+ } else {
346
+ UTIL_THROW(util::Exception, "Malformed mapping");
347
+ }
348
+ }
349
+
350
+ void
351
+ StaticData::
352
+ LoadDecodeGraphsOld(const vector<string> &mappingVector,
353
+ const vector<size_t> &maxChartSpans)
354
+ {
355
+ const vector<PhraseDictionary*>& pts = PhraseDictionary::GetColl();
356
+ const vector<GenerationDictionary*>& gens = GenerationDictionary::GetColl();
357
+
358
+ const std::vector<FeatureFunction*> *featuresRemaining
359
+ = &FeatureFunction::GetFeatureFunctions();
360
+ DecodeStep *prev = 0;
361
+ size_t prevDecodeGraphInd = 0;
362
+
363
+ for(size_t i=0; i<mappingVector.size(); i++) {
364
+ vector<string> token = Tokenize(mappingVector[i]);
365
+ size_t decodeGraphInd;
366
+ DecodeType decodeType;
367
+ size_t index;
368
+ if (token.size() == 2) {
369
+ // eg. T 0
370
+ decodeGraphInd = 0;
371
+ decodeType = token[0] == "T" ? Translate : Generate;
372
+ index = Scan<size_t>(token[1]);
373
+ } else if (token.size() == 3) {
374
+ // eg. 0 T 0
375
+ // For specifying multiple translation model
376
+ decodeGraphInd = Scan<size_t>(token[0]);
377
+ //the vectorList index can only increment by one
378
+ UTIL_THROW_IF2(decodeGraphInd != prevDecodeGraphInd
379
+ && decodeGraphInd != prevDecodeGraphInd + 1,
380
+ "Malformed mapping");
381
+ if (decodeGraphInd > prevDecodeGraphInd) {
382
+ prev = NULL;
383
+ }
384
+
385
+ if (prevDecodeGraphInd < decodeGraphInd) {
386
+ featuresRemaining = &FeatureFunction::GetFeatureFunctions();
387
+ }
388
+
389
+ decodeType = token[1] == "T" ? Translate : Generate;
390
+ index = Scan<size_t>(token[2]);
391
+ } else {
392
+ UTIL_THROW(util::Exception, "Malformed mapping");
393
+ }
394
+
395
+ DecodeStep* decodeStep = NULL;
396
+ switch (decodeType) {
397
+ case Translate:
398
+ if(index>=pts.size()) {
399
+ util::StringStream strme;
400
+ strme << "No phrase dictionary with index "
401
+ << index << " available!";
402
+ UTIL_THROW(util::Exception, strme.str());
403
+ }
404
+ decodeStep = new DecodeStepTranslation(pts[index], prev, *featuresRemaining);
405
+ break;
406
+ case Generate:
407
+ if(index>=gens.size()) {
408
+ util::StringStream strme;
409
+ strme << "No generation dictionary with index "
410
+ << index << " available!";
411
+ UTIL_THROW(util::Exception, strme.str());
412
+ }
413
+ decodeStep = new DecodeStepGeneration(gens[index], prev, *featuresRemaining);
414
+ break;
415
+ default:
416
+ UTIL_THROW(util::Exception, "Unknown decode step");
417
+ break;
418
+ }
419
+
420
+ featuresRemaining = &decodeStep->GetFeaturesRemaining();
421
+
422
+ UTIL_THROW_IF2(decodeStep == NULL, "Null decode step");
423
+ if (m_decodeGraphs.size() < decodeGraphInd + 1) {
424
+ DecodeGraph *decodeGraph;
425
+ if (is_syntax(m_options->search.algo)) {
426
+ size_t maxChartSpan;
427
+ if (decodeGraphInd < maxChartSpans.size()) {
428
+ maxChartSpan = maxChartSpans[decodeGraphInd];
429
+ VERBOSE(1,"max-chart-span: " << maxChartSpans[decodeGraphInd] << endl);
430
+ } else {
431
+ maxChartSpan = DEFAULT_MAX_CHART_SPAN;
432
+ }
433
+ decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
434
+ } else {
435
+ decodeGraph = new DecodeGraph(m_decodeGraphs.size());
436
+ }
437
+
438
+ m_decodeGraphs.push_back(decodeGraph); // TODO max chart span
439
+ }
440
+
441
+ m_decodeGraphs[decodeGraphInd]->Add(decodeStep);
442
+ prev = decodeStep;
443
+ prevDecodeGraphInd = decodeGraphInd;
444
+ }
445
+
446
+ // set maximum n-gram size for backoff approach to decoding paths
447
+ // default is always use subsequent paths (value = 0)
448
+ // if specified, record maxmimum unseen n-gram size
449
+ const vector<string> *backoffVector = m_parameter->GetParam("decoding-graph-backoff");
450
+ for(size_t i=0; i<m_decodeGraphs.size() && backoffVector && i<backoffVector->size(); i++) {
451
+ DecodeGraph &decodeGraph = *m_decodeGraphs[i];
452
+
453
+ if (i < backoffVector->size()) {
454
+ decodeGraph.SetBackoff(Scan<size_t>(backoffVector->at(i)));
455
+ }
456
+ }
457
+ }
458
+
459
+ void StaticData::LoadDecodeGraphsNew(const std::vector<std::string> &mappingVector, const std::vector<size_t> &maxChartSpans)
460
+ {
461
+ const std::vector<FeatureFunction*> *featuresRemaining = &FeatureFunction::GetFeatureFunctions();
462
+ DecodeStep *prev = 0;
463
+ size_t prevDecodeGraphInd = 0;
464
+
465
+ for(size_t i=0; i<mappingVector.size(); i++) {
466
+ vector<string> token = Tokenize(mappingVector[i]);
467
+ size_t decodeGraphInd;
468
+
469
+ decodeGraphInd = Scan<size_t>(token[0]);
470
+ //the vectorList index can only increment by one
471
+ UTIL_THROW_IF2(decodeGraphInd != prevDecodeGraphInd
472
+ && decodeGraphInd != prevDecodeGraphInd + 1,
473
+ "Malformed mapping");
474
+ if (decodeGraphInd > prevDecodeGraphInd) {
475
+ prev = NULL;
476
+ }
477
+
478
+ if (prevDecodeGraphInd < decodeGraphInd) {
479
+ featuresRemaining = &FeatureFunction::GetFeatureFunctions();
480
+ }
481
+
482
+ FeatureFunction &ff = FeatureFunction::FindFeatureFunction(token[1]);
483
+
484
+ DecodeStep* decodeStep = NULL;
485
+ if (typeid(ff) == typeid(PhraseDictionary)) {
486
+ decodeStep = new DecodeStepTranslation(&static_cast<PhraseDictionary&>(ff), prev, *featuresRemaining);
487
+ } else if (typeid(ff) == typeid(GenerationDictionary)) {
488
+ decodeStep = new DecodeStepGeneration(&static_cast<GenerationDictionary&>(ff), prev, *featuresRemaining);
489
+ } else {
490
+ UTIL_THROW(util::Exception, "Unknown decode step");
491
+ }
492
+
493
+ featuresRemaining = &decodeStep->GetFeaturesRemaining();
494
+
495
+ UTIL_THROW_IF2(decodeStep == NULL, "Null decode step");
496
+ if (m_decodeGraphs.size() < decodeGraphInd + 1) {
497
+ DecodeGraph *decodeGraph;
498
+ if (is_syntax(m_options->search.algo)) {
499
+ size_t maxChartSpan = (decodeGraphInd < maxChartSpans.size()) ? maxChartSpans[decodeGraphInd] : DEFAULT_MAX_CHART_SPAN;
500
+ VERBOSE(1,"max-chart-span: " << maxChartSpans[decodeGraphInd] << endl);
501
+ decodeGraph = new DecodeGraph(m_decodeGraphs.size(), maxChartSpan);
502
+ } else {
503
+ decodeGraph = new DecodeGraph(m_decodeGraphs.size());
504
+ }
505
+
506
+ m_decodeGraphs.push_back(decodeGraph); // TODO max chart span
507
+ }
508
+
509
+ m_decodeGraphs[decodeGraphInd]->Add(decodeStep);
510
+ prev = decodeStep;
511
+ prevDecodeGraphInd = decodeGraphInd;
512
+ }
513
+
514
+ // set maximum n-gram size for backoff approach to decoding paths
515
+ // default is always use subsequent paths (value = 0)
516
+ // if specified, record maxmimum unseen n-gram size
517
+ const vector<string> *backoffVector = m_parameter->GetParam("decoding-graph-backoff");
518
+ for(size_t i=0; i<m_decodeGraphs.size() && backoffVector && i<backoffVector->size(); i++) {
519
+ DecodeGraph &decodeGraph = *m_decodeGraphs[i];
520
+
521
+ if (i < backoffVector->size()) {
522
+ decodeGraph.SetBackoff(Scan<size_t>(backoffVector->at(i)));
523
+ }
524
+ }
525
+
526
+ }
527
+
528
+ void StaticData::ReLoadBleuScoreFeatureParameter(float weight)
529
+ {
530
+ //loop over ScoreProducers to update weights of BleuScoreFeature
531
+ const std::vector<FeatureFunction*> &producers = FeatureFunction::GetFeatureFunctions();
532
+ for(size_t i=0; i<producers.size(); ++i) {
533
+ FeatureFunction *ff = producers[i];
534
+ std::string ffName = ff->GetScoreProducerDescription();
535
+
536
+ if (ffName == "BleuScoreFeature") {
537
+ SetWeight(ff, weight);
538
+ break;
539
+ }
540
+ }
541
+ }
542
+
543
+ // ScoreComponentCollection StaticData::GetAllWeightsScoreComponentCollection() const {}
544
+ // in ScoreComponentCollection.h
545
+
546
+ void StaticData::SetExecPath(const std::string &path)
547
+ {
548
+ // NOT TESTED
549
+ size_t pos = path.rfind("/");
550
+ if (pos != string::npos) {
551
+ m_binPath = path.substr(0, pos);
552
+ }
553
+ VERBOSE(1,m_binPath << endl);
554
+ }
555
+
556
+ const string &StaticData::GetBinDirectory() const
557
+ {
558
+ return m_binPath;
559
+ }
560
+
561
+ float StaticData::GetWeightWordPenalty() const
562
+ {
563
+ float weightWP = GetWeight(&WordPenaltyProducer::Instance());
564
+ return weightWP;
565
+ }
566
+
567
+ void
568
+ StaticData::
569
+ InitializeForInput(ttasksptr const& ttask) const
570
+ {
571
+ const std::vector<FeatureFunction*> &producers
572
+ = FeatureFunction::GetFeatureFunctions();
573
+ for(size_t i=0; i<producers.size(); ++i) {
574
+ FeatureFunction &ff = *producers[i];
575
+ if (! IsFeatureFunctionIgnored(ff)) {
576
+ Timer iTime;
577
+ iTime.start();
578
+ ff.InitializeForInput(ttask);
579
+ VERBOSE(3,"InitializeForInput( " << ff.GetScoreProducerDescription()
580
+ << " )" << "= " << iTime << endl);
581
+ }
582
+ }
583
+ }
584
+
585
+ void
586
+ StaticData::
587
+ CleanUpAfterSentenceProcessing(ttasksptr const& ttask) const
588
+ {
589
+ const std::vector<FeatureFunction*> &producers
590
+ = FeatureFunction::GetFeatureFunctions();
591
+ for(size_t i=0; i<producers.size(); ++i) {
592
+ FeatureFunction &ff = *producers[i];
593
+ if (! IsFeatureFunctionIgnored(ff)) {
594
+ ff.CleanUpAfterSentenceProcessing(ttask);
595
+ }
596
+ }
597
+ }
598
+
599
+ void StaticData::LoadFeatureFunctions()
600
+ {
601
+ const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
602
+ std::vector<FeatureFunction*>::const_iterator iter;
603
+ for (iter = ffs.begin(); iter != ffs.end(); ++iter) {
604
+ FeatureFunction *ff = *iter;
605
+ bool doLoad = true;
606
+
607
+ if (ff->RequireSortingAfterSourceContext()) {
608
+ m_requireSortingAfterSourceContext = true;
609
+ }
610
+
611
+ if (dynamic_cast<PhraseDictionary*>(ff)) {
612
+ doLoad = false;
613
+ }
614
+
615
+ if (doLoad) {
616
+ VERBOSE(1, "Loading " << ff->GetScoreProducerDescription() << endl);
617
+ ff->Load(options());
618
+ }
619
+ }
620
+
621
+ const std::vector<PhraseDictionary*> &pts = PhraseDictionary::GetColl();
622
+ for (size_t i = 0; i < pts.size(); ++i) {
623
+ PhraseDictionary *pt = pts[i];
624
+ VERBOSE(1, "Loading " << pt->GetScoreProducerDescription() << endl);
625
+ pt->Load(options());
626
+ }
627
+
628
+ CheckLEGACYPT();
629
+ }
630
+
631
+ bool StaticData::CheckWeights() const
632
+ {
633
+ set<string> weightNames = m_parameter->GetWeightNames();
634
+ set<string> featureNames;
635
+
636
+ const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
637
+ for (size_t i = 0; i < ffs.size(); ++i) {
638
+ const FeatureFunction &ff = *ffs[i];
639
+ const string &descr = ff.GetScoreProducerDescription();
640
+ featureNames.insert(descr);
641
+
642
+ set<string>::iterator iter = weightNames.find(descr);
643
+ if (iter == weightNames.end()) {
644
+ cerr << "Can't find weights for feature function " << descr << endl;
645
+ } else {
646
+ weightNames.erase(iter);
647
+ }
648
+ }
649
+
650
+ //sparse features
651
+ if (!weightNames.empty()) {
652
+ set<string>::iterator iter;
653
+ for (iter = weightNames.begin(); iter != weightNames.end(); ) {
654
+ string fname = (*iter).substr(0, (*iter).find("_"));
655
+ VERBOSE(1,fname << "\n");
656
+ if (featureNames.find(fname) != featureNames.end()) {
657
+ weightNames.erase(iter++);
658
+ } else {
659
+ ++iter;
660
+ }
661
+ }
662
+ }
663
+
664
+ if (!weightNames.empty()) {
665
+ cerr << "The following weights have no feature function. "
666
+ << "Maybe incorrectly spelt weights: ";
667
+ set<string>::iterator iter;
668
+ for (iter = weightNames.begin(); iter != weightNames.end(); ++iter) {
669
+ cerr << *iter << ",";
670
+ }
671
+ return false;
672
+ }
673
+
674
+ return true;
675
+ }
676
+
677
+
678
+ void StaticData::LoadSparseWeightsFromConfig()
679
+ {
680
+ set<string> featureNames;
681
+ const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
682
+ for (size_t i = 0; i < ffs.size(); ++i) {
683
+ const FeatureFunction &ff = *ffs[i];
684
+ const string &descr = ff.GetScoreProducerDescription();
685
+ featureNames.insert(descr);
686
+ }
687
+
688
+ const std::map<std::string, std::vector<float> > &weights = m_parameter->GetAllWeights();
689
+ std::map<std::string, std::vector<float> >::const_iterator iter;
690
+ for (iter = weights.begin(); iter != weights.end(); ++iter) {
691
+ // this indicates that it is sparse feature
692
+ if (featureNames.find(iter->first) == featureNames.end()) {
693
+ UTIL_THROW_IF2(iter->second.size() != 1, "ERROR: only one weight per sparse feature allowed: " << iter->first);
694
+ m_allWeights.Assign(iter->first, iter->second[0]);
695
+ }
696
+ }
697
+
698
+ }
699
+
700
+
701
+ /**! Read in settings for alternative weights */
702
+ bool StaticData::LoadAlternateWeightSettings()
703
+ {
704
+ if (m_threadCount > 1) {
705
+ cerr << "ERROR: alternative weight settings currently not supported with multi-threading.";
706
+ return false;
707
+ }
708
+
709
+ vector<string> weightSpecification;
710
+ const PARAM_VEC *params = m_parameter->GetParam("alternate-weight-setting");
711
+ if (params && params->size()) {
712
+ weightSpecification = *params;
713
+ }
714
+
715
+ // get mapping from feature names to feature functions
716
+ map<string,FeatureFunction*> nameToFF;
717
+ const std::vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
718
+ for (size_t i = 0; i < ffs.size(); ++i) {
719
+ nameToFF[ ffs[i]->GetScoreProducerDescription() ] = ffs[i];
720
+ }
721
+
722
+ // copy main weight setting as default
723
+ m_weightSetting["default"] = new ScoreComponentCollection( m_allWeights );
724
+
725
+ // go through specification in config file
726
+ string currentId = "";
727
+ bool hasErrors = false;
728
+ for (size_t i=0; i<weightSpecification.size(); ++i) {
729
+
730
+ // identifier line (with optional additional specifications)
731
+ if (weightSpecification[i].find("id=") == 0) {
732
+ vector<string> tokens = Tokenize(weightSpecification[i]);
733
+ vector<string> args = Tokenize(tokens[0], "=");
734
+ currentId = args[1];
735
+ VERBOSE(1,"alternate weight setting " << currentId << endl);
736
+ UTIL_THROW_IF2(m_weightSetting.find(currentId) != m_weightSetting.end(),
737
+ "Duplicate alternate weight id: " << currentId);
738
+ m_weightSetting[ currentId ] = new ScoreComponentCollection;
739
+
740
+ // other specifications
741
+ for(size_t j=1; j<tokens.size(); j++) {
742
+ vector<string> args = Tokenize(tokens[j], "=");
743
+ // sparse weights
744
+ if (args[0] == "weight-file") {
745
+ if (args.size() != 2) {
746
+ std::cerr << "One argument should be supplied for weight-file";
747
+ return false;
748
+ }
749
+ ScoreComponentCollection extraWeights;
750
+ if (!extraWeights.Load(args[1])) {
751
+ std::cerr << "Unable to load weights from " << args[1];
752
+ return false;
753
+ }
754
+ m_weightSetting[ currentId ]->PlusEquals(extraWeights);
755
+ }
756
+ // ignore feature functions
757
+ else if (args[0] == "ignore-ff") {
758
+ set< string > *ffNameSet = new set< string >;
759
+ m_weightSettingIgnoreFF[ currentId ] = *ffNameSet;
760
+ vector<string> featureFunctionName = Tokenize(args[1], ",");
761
+ for(size_t k=0; k<featureFunctionName.size(); k++) {
762
+ // check if a valid nane
763
+ map<string,FeatureFunction*>::iterator ffLookUp = nameToFF.find(featureFunctionName[k]);
764
+ if (ffLookUp == nameToFF.end()) {
765
+ cerr << "ERROR: alternate weight setting " << currentId
766
+ << " specifies to ignore feature function " << featureFunctionName[k]
767
+ << " but there is no such feature function" << endl;
768
+ hasErrors = true;
769
+ } else {
770
+ m_weightSettingIgnoreFF[ currentId ].insert( featureFunctionName[k] );
771
+ }
772
+ }
773
+ }
774
+ }
775
+ }
776
+
777
+ // weight lines
778
+ else {
779
+ UTIL_THROW_IF2(currentId.empty(), "No alternative weights specified");
780
+ vector<string> tokens = Tokenize(weightSpecification[i]);
781
+ UTIL_THROW_IF2(tokens.size() < 2
782
+ , "Incorrect format for alternate weights: " << weightSpecification[i]);
783
+
784
+ // get name and weight values
785
+ string name = tokens[0];
786
+ name = name.substr(0, name.size() - 1); // remove trailing "="
787
+ vector<float> weights(tokens.size() - 1);
788
+ for (size_t i = 1; i < tokens.size(); ++i) {
789
+ float weight = Scan<float>(tokens[i]);
790
+ weights[i - 1] = weight;
791
+ }
792
+
793
+ // check if a valid nane
794
+ map<string,FeatureFunction*>::iterator ffLookUp = nameToFF.find(name);
795
+ if (ffLookUp == nameToFF.end()) {
796
+ cerr << "ERROR: alternate weight setting " << currentId
797
+ << " specifies weight(s) for " << name
798
+ << " but there is no such feature function" << endl;
799
+ hasErrors = true;
800
+ } else {
801
+ m_weightSetting[ currentId ]->Assign( nameToFF[name], weights);
802
+ }
803
+ }
804
+ }
805
+ UTIL_THROW_IF2(hasErrors, "Errors loading alternate weights");
806
+ return true;
807
+ }
808
+
809
+ void StaticData::NoCache()
810
+ {
811
+ bool noCache;
812
+ m_parameter->SetParameter(noCache, "no-cache", false );
813
+
814
+ if (noCache) {
815
+ const std::vector<PhraseDictionary*> &pts = PhraseDictionary::GetColl();
816
+ for (size_t i = 0; i < pts.size(); ++i) {
817
+ PhraseDictionary &pt = *pts[i];
818
+ pt.SetParameter("cache-size", "0");
819
+ }
820
+ }
821
+ }
822
+
823
+ std::map<std::string, std::string>
824
+ StaticData
825
+ ::OverrideFeatureNames()
826
+ {
827
+ std::map<std::string, std::string> ret;
828
+
829
+ const PARAM_VEC *params = m_parameter->GetParam("feature-name-overwrite");
830
+ if (params && params->size()) {
831
+ UTIL_THROW_IF2(params->size() != 1, "Only provide 1 line in the section [feature-name-overwrite]");
832
+ vector<string> toks = Tokenize(params->at(0));
833
+ UTIL_THROW_IF2(toks.size() % 2 != 0, "Format of -feature-name-overwrite must be [old-name new-name]*");
834
+
835
+ for (size_t i = 0; i < toks.size(); i += 2) {
836
+ const string &oldName = toks[i];
837
+ const string &newName = toks[i+1];
838
+ ret[oldName] = newName;
839
+ }
840
+ }
841
+
842
+ // FIXME Does this make sense for F2S? Perhaps it should be changed once
843
+ // FIXME the pipeline uses RuleTable consistently.
844
+ SearchAlgorithm algo = m_options->search.algo;
845
+ if (algo == SyntaxS2T || algo == SyntaxT2S ||
846
+ algo == SyntaxT2S_SCFG || algo == SyntaxF2S) {
847
+ // Automatically override PhraseDictionary{Memory,Scope3}. This will
848
+ // have to change if the FF parameters diverge too much in the future,
849
+ // but for now it makes switching between the old and new decoders much
850
+ // more convenient.
851
+ ret["PhraseDictionaryMemory"] = "RuleTable";
852
+ ret["PhraseDictionaryScope3"] = "RuleTable";
853
+ }
854
+
855
+ return ret;
856
+ }
857
+
858
+ void StaticData::OverrideFeatures()
859
+ {
860
+ const PARAM_VEC *params = m_parameter->GetParam("feature-overwrite");
861
+ for (size_t i = 0; params && i < params->size(); ++i) {
862
+ const string &str = params->at(i);
863
+ vector<string> toks = Tokenize(str);
864
+ UTIL_THROW_IF2(toks.size() <= 1, "Incorrect format for feature override: " << str);
865
+
866
+ FeatureFunction &ff = FeatureFunction::FindFeatureFunction(toks[0]);
867
+
868
+ for (size_t j = 1; j < toks.size(); ++j) {
869
+ const string &keyValStr = toks[j];
870
+ vector<string> keyVal = Tokenize(keyValStr, "=");
871
+ UTIL_THROW_IF2(keyVal.size() != 2, "Incorrect format for parameter override: " << keyValStr);
872
+
873
+ VERBOSE(1, "Override " << ff.GetScoreProducerDescription() << " "
874
+ << keyVal[0] << "=" << keyVal[1] << endl);
875
+
876
+ ff.SetParameter(keyVal[0], keyVal[1]);
877
+
878
+ }
879
+ }
880
+
881
+ }
882
+
883
+ void StaticData::CheckLEGACYPT()
884
+ {
885
+ const std::vector<PhraseDictionary*> &pts = PhraseDictionary::GetColl();
886
+ for (size_t i = 0; i < pts.size(); ++i) {
887
+ const PhraseDictionary *phraseDictionary = pts[i];
888
+ if (dynamic_cast<const PhraseDictionaryTreeAdaptor*>(phraseDictionary) != NULL) {
889
+ m_useLegacyPT = true;
890
+ return;
891
+ }
892
+ }
893
+
894
+ m_useLegacyPT = false;
895
+ }
896
+
897
+
898
+ void StaticData::ResetWeights(const std::string &denseWeights, const std::string &sparseFile)
899
+ {
900
+ m_allWeights = ScoreComponentCollection();
901
+
902
+ // dense weights
903
+ string name("");
904
+ vector<float> weights;
905
+ vector<string> toks = Tokenize(denseWeights);
906
+ for (size_t i = 0; i < toks.size(); ++i) {
907
+ const string &tok = toks[i];
908
+
909
+ if (ends_with(tok, "=")) {
910
+ // start of new feature
911
+
912
+ if (name != "") {
913
+ // save previous ff
914
+ const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(name);
915
+ m_allWeights.Assign(&ff, weights);
916
+ weights.clear();
917
+ }
918
+
919
+ name = tok.substr(0, tok.size() - 1);
920
+ } else {
921
+ // a weight for curr ff
922
+ float weight = Scan<float>(toks[i]);
923
+ weights.push_back(weight);
924
+ }
925
+ }
926
+
927
+ const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(name);
928
+ m_allWeights.Assign(&ff, weights);
929
+
930
+ // sparse weights
931
+ InputFileStream sparseStrme(sparseFile);
932
+ string line;
933
+ while (getline(sparseStrme, line)) {
934
+ vector<string> toks = Tokenize(line);
935
+ UTIL_THROW_IF2(toks.size() != 2, "Incorrect sparse weight format. Should be FFName_spareseName weight");
936
+
937
+ vector<string> names = Tokenize(toks[0], "_");
938
+ UTIL_THROW_IF2(names.size() != 2, "Incorrect sparse weight name. Should be FFName_spareseName");
939
+
940
+ const FeatureFunction &ff = FeatureFunction::FindFeatureFunction(names[0]);
941
+ m_allWeights.Assign(&ff, names[1], Scan<float>(toks[1]));
942
+ }
943
+ }
944
+
945
+ size_t StaticData::GetCoordSpace(string space) const
946
+ {
947
+ map<string const, size_t>::const_iterator m = m_coordSpaceMap.find(space);
948
+ if(m == m_coordSpaceMap.end()) {
949
+ return 0;
950
+ }
951
+ return m->second;
952
+ }
953
+
954
+ size_t StaticData::MapCoordSpace(string space)
955
+ {
956
+ map<string const, size_t>::const_iterator m = m_coordSpaceMap.find(space);
957
+ if (m != m_coordSpaceMap.end()) {
958
+ return m->second;
959
+ }
960
+ size_t id = m_coordSpaceNextID;
961
+ m_coordSpaceNextID += 1;
962
+ m_coordSpaceMap[space] = id;
963
+ return id;
964
+ }
965
+
966
+ } // namespace