sleepyhead111 committed on
Commit
76efa37
·
verified ·
1 Parent(s): 1747e32

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp +94 -0
  2. mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h +105 -0
  3. mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp +195 -0
  4. mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h +94 -0
  5. mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h +143 -0
  6. mosesdecoder/moses/TranslationModel/CompactPT/ListCoders.h +387 -0
  7. mosesdecoder/moses/TranslationModel/CompactPT/PackedArray.h +187 -0
  8. mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp +194 -0
  9. mosesdecoder/moses/TranslationModel/CompactPT/StringVectorTemp.h +430 -0
  10. mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h +163 -0
  11. mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp +30 -0
  12. mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.h +31 -0
  13. mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.cpp +65 -0
  14. mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.cpp +33 -0
  15. mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.cpp +260 -0
  16. mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp +417 -0
  17. mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp +398 -0
  18. mosesdecoder/moses/TranslationModel/RuleTable/Trie.h +63 -0
  19. mosesdecoder/moses/TranslationModel/RuleTable/UTrie.cpp +96 -0
  20. mosesdecoder/moses/TranslationModel/RuleTable/UTrie.h +73 -0
  21. mosesdecoder/moses/TranslationModel/fuzzy-match/Alignments.h +20 -0
  22. mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp +1029 -0
  23. mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h +91 -0
  24. mosesdecoder/moses/TranslationModel/fuzzy-match/Match.h +34 -0
  25. mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.cpp +25 -0
  26. mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.h +43 -0
  27. mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.cpp +240 -0
  28. mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.h +69 -0
  29. mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.cpp +71 -0
  30. mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.h +46 -0
  31. mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.cpp +387 -0
  32. mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.h +5 -0
  33. mosesdecoder/moses/server/Hypothesis_4server.cpp +37 -0
  34. mosesdecoder/moses/server/Optimizer.h +17 -0
  35. mosesdecoder/moses/server/PackScores.cpp +45 -0
  36. mosesdecoder/moses/server/PackScores.h +10 -0
  37. mosesdecoder/moses/server/Server.h +46 -0
  38. mosesdecoder/moses/server/Session.h +75 -0
  39. mosesdecoder/moses/server/TranslationRequest.cpp +524 -0
  40. mosesdecoder/moses/server/Updater.cpp +58 -0
  41. mosesdecoder/moses/server/Updater.h +44 -0
  42. mosesdecoder/util/bit_packing_test.cc +59 -0
  43. mosesdecoder/util/ersatz_progress.hh +57 -0
  44. mosesdecoder/util/exception.hh +165 -0
  45. mosesdecoder/util/fake_ostream.hh +111 -0
  46. mosesdecoder/util/file_piece.hh +175 -0
  47. mosesdecoder/util/file_piece_test.cc +154 -0
  48. mosesdecoder/util/generator.hh +34 -0
  49. mosesdecoder/util/getopt.c +78 -0
  50. mosesdecoder/util/integer_to_string_test.cc +81 -0
mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifdef HAVE_CMPH
23
+
24
+ #include "CmphStringVectorAdapter.h"
25
+
26
+ namespace Moses
27
+ {
28
+
29
+ void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
30
+ {
31
+ delete[] key;
32
+ }
33
+
34
+ void CmphStringVectorAdapterRewind(void *data)
35
+ {
36
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
37
+ cmph_vector->position = 0;
38
+ }
39
+
40
+ //************************************************************************//
41
+
42
+ cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v)
43
+ {
44
+ cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
45
+ cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
46
+ assert(key_source);
47
+ assert(cmph_vector);
48
+
49
+ cmph_vector->vector = (void *)&v;
50
+ cmph_vector->position = 0;
51
+ key_source->data = (void *)cmph_vector;
52
+ key_source->nkeys = v.size();
53
+
54
+ return key_source;
55
+ }
56
+
57
+ int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
58
+ {
59
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
60
+ std::vector<std::string>* v = (std::vector<std::string>*)cmph_vector->vector;
61
+ size_t size;
62
+ *keylen = (*v)[cmph_vector->position].size();
63
+ size = *keylen;
64
+ *key = new char[size + 1];
65
+ std::string temp = (*v)[cmph_vector->position];
66
+ strcpy(*key, temp.c_str());
67
+ cmph_vector->position = cmph_vector->position + 1;
68
+ return (int)(*keylen);
69
+ }
70
+
71
+ void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
72
+ {
73
+ delete[] key;
74
+ }
75
+
76
+ void CmphVectorAdapterRewind(void *data)
77
+ {
78
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
79
+ cmph_vector->position = 0;
80
+ }
81
+
82
+ cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v)
83
+ {
84
+ cmph_io_adapter_t * key_source = CmphVectorAdapterNew(v);
85
+
86
+ key_source->read = CmphVectorAdapterRead;
87
+ key_source->dispose = CmphVectorAdapterDispose;
88
+ key_source->rewind = CmphVectorAdapterRewind;
89
+ return key_source;
90
+ }
91
+
92
+ }
93
+
94
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_CmphStringVectorAdapterNew_h
23
+ #define moses_CmphStringVectorAdapterNew_h
24
+
25
+ #include <cassert>
26
+ #include <cstring>
27
+
28
+ #ifdef HAVE_CMPH
29
+ #include "cmph.h"
30
+
31
+ #include "StringVector.h"
32
+
33
+ namespace Moses
34
+ {
35
+
36
+ typedef struct {
37
+ void *vector;
38
+ cmph_uint32 position;
39
+ }
40
+ cmph_vector_t;
41
+
42
+
43
+ template <typename ValueT, typename PosT, template <typename> class Allocator>
44
+ cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector<ValueT, PosT, Allocator>& sv)
45
+ {
46
+ cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t));
47
+ cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t));
48
+ assert(key_source);
49
+ assert(cmph_vector);
50
+
51
+ cmph_vector->vector = (void *)&sv;
52
+ cmph_vector->position = 0;
53
+ key_source->data = (void *)cmph_vector;
54
+ key_source->nkeys = sv.size();
55
+
56
+ return key_source;
57
+ }
58
+
59
+ template <typename ValueT, typename PosT, template <typename> class Allocator>
60
+ int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
61
+ {
62
+ cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
63
+ StringVector<ValueT, PosT, Allocator>* sv = (StringVector<ValueT, PosT, Allocator>*)cmph_vector->vector;
64
+ size_t size;
65
+ *keylen = (*sv)[cmph_vector->position].size();
66
+ size = *keylen;
67
+ *key = new char[size + 1];
68
+ std::string temp = (*sv)[cmph_vector->position];
69
+ std::strcpy(*key, temp.c_str());
70
+ cmph_vector->position = cmph_vector->position + 1;
71
+ return (int)(*keylen);
72
+ }
73
+
74
+ void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
75
+
76
+ void CmphStringVectorAdapterRewind(void *data);
77
+
78
+ template <typename ValueT, typename PosT, template <typename> class Allocator>
79
+ cmph_io_adapter_t* CmphStringVectorAdapter(StringVector<ValueT, PosT, Allocator>& sv)
80
+ {
81
+ cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv);
82
+
83
+ key_source->read = CmphStringVectorAdapterRead<ValueT, PosT, Allocator>;
84
+ key_source->dispose = CmphStringVectorAdapterDispose;
85
+ key_source->rewind = CmphStringVectorAdapterRewind;
86
+ return key_source;
87
+ }
88
+
89
+ //************************************************************************//
90
+
91
+ cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v);
92
+
93
+ int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen);
94
+
95
+ void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen);
96
+
97
+ void CmphVectorAdapterRewind(void *data);
98
+
99
+ cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v);
100
+
101
+ }
102
+
103
+ #endif
104
+
105
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- c++ -*-
2
+ // vim:tabstop=2
3
+ // $Id$
4
+ /***********************************************************************
5
+ Moses - factored phrase-based language decoder
6
+ Copyright (C) 2006 University of Edinburgh
7
+
8
+ This library is free software; you can redistribute it and/or
9
+ modify it under the terms of the GNU Lesser General Public
10
+ License as published by the Free Software Foundation; either
11
+ version 2.1 of the License, or (at your option) any later version.
12
+
13
+ This library is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ Lesser General Public License for more details.
17
+
18
+ You should have received a copy of the GNU Lesser General Public
19
+ License along with this library; if not, write to the Free Software
20
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
+ ***********************************************************************/
22
+
23
+ #include "LexicalReorderingTableCompact.h"
24
+ #include "moses/parameters/OOVHandlingOptions.h"
25
+
26
+ namespace Moses
27
+ {
28
+ bool LexicalReorderingTableCompact::s_inMemoryByDefault = false; // initial m_inMemory of new tables; set from the "minlexr-memory" parameter in SetStaticDefaultParameters()
29
+
30
+ LexicalReorderingTableCompact::
31
+ LexicalReorderingTableCompact(const std::string& filePath,
32
+ const std::vector<FactorType>& f_factors,
33
+ const std::vector<FactorType>& e_factors,
34
+ const std::vector<FactorType>& c_factors)
35
+ : LexicalReorderingTable(f_factors, e_factors, c_factors)
36
+ , m_inMemory(s_inMemoryByDefault)
37
+ , m_numScoreComponent(6)
38
+ , m_multipleScoreTrees(true)
39
+ , m_hash(10, 16)
40
+ , m_scoreTrees(1)
41
+ {
42
+ Load(filePath);
43
+ }
44
+
45
+ LexicalReorderingTableCompact::
46
+ LexicalReorderingTableCompact(const std::vector<FactorType>& f_factors,
47
+ const std::vector<FactorType>& e_factors,
48
+ const std::vector<FactorType>& c_factors)
49
+ : LexicalReorderingTable(f_factors, e_factors, c_factors)
50
+ , m_inMemory(s_inMemoryByDefault)
51
+ , m_numScoreComponent(6)
52
+ , m_multipleScoreTrees(true)
53
+ , m_hash(10, 16)
54
+ , m_scoreTrees(1)
55
+ { }
56
+
57
+ LexicalReorderingTableCompact::
58
+ ~LexicalReorderingTableCompact()
59
+ {
60
+ for(size_t i = 0; i < m_scoreTrees.size(); i++)
61
+ delete m_scoreTrees[i];
62
+ }
63
+
64
+ std::vector<float>
65
+ LexicalReorderingTableCompact::
66
+ GetScore(const Phrase& f, const Phrase& e, const Phrase& c)
67
+ {
68
+ std::string key;
69
+ Scores scores;
70
+
71
+ if(0 == c.GetSize())
72
+ key = MakeKey(f, e, c);
73
+ else
74
+ for(size_t i = 0; i <= c.GetSize(); ++i) {
75
+ Phrase sub_c(c.GetSubString(Range(i,c.GetSize()-1)));
76
+ key = MakeKey(f,e,sub_c);
77
+ }
78
+
79
+ size_t index = m_hash[key];
80
+ if(m_hash.GetSize() != index) {
81
+ std::string scoresString;
82
+ if(m_inMemory)
83
+ scoresString = m_scoresMemory[index].str();
84
+ else
85
+ scoresString = m_scoresMapped[index].str();
86
+
87
+ BitWrapper<> bitStream(scoresString);
88
+ for(size_t i = 0; i < m_numScoreComponent; i++)
89
+ scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream));
90
+
91
+ return scores;
92
+ }
93
+
94
+ return Scores();
95
+ }
96
+
97
+ std::string
98
+ LexicalReorderingTableCompact::
99
+ MakeKey(const Phrase& f,
100
+ const Phrase& e,
101
+ const Phrase& c) const
102
+ {
103
+ return MakeKey(Trim(f.GetStringRep(m_FactorsF)),
104
+ Trim(e.GetStringRep(m_FactorsE)),
105
+ Trim(c.GetStringRep(m_FactorsC)));
106
+ }
107
+
108
+ std::string
109
+ LexicalReorderingTableCompact::
110
+ MakeKey(const std::string& f,
111
+ const std::string& e,
112
+ const std::string& c) const
113
+ {
114
+ std::string key;
115
+ if(!f.empty()) key += f;
116
+ if(!m_FactorsE.empty()) {
117
+ if(!key.empty()) key += " ||| ";
118
+ key += e;
119
+ }
120
+ if(!m_FactorsC.empty()) {
121
+ if(!key.empty()) key += " ||| ";
122
+ key += c;
123
+ }
124
+ key += " ||| ";
125
+ return key;
126
+ }
127
+
128
+ LexicalReorderingTable*
129
+ LexicalReorderingTableCompact::
130
+ CheckAndLoad
131
+ (const std::string& filePath,
132
+ const std::vector<FactorType>& f_factors,
133
+ const std::vector<FactorType>& e_factors,
134
+ const std::vector<FactorType>& c_factors)
135
+ {
136
+ #ifdef HAVE_CMPH
137
+ std::string minlexr = ".minlexr";
138
+ // file name is specified without suffix
139
+ if(FileExists(filePath + minlexr)) {
140
+ //there exists a compact binary version use that
141
+ VERBOSE(2,"Using compact lexical reordering table" << std::endl);
142
+ return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors);
143
+ }
144
+ // file name is specified with suffix
145
+ if(filePath.substr(filePath.length() - minlexr.length(), minlexr.length()) == minlexr
146
+ && FileExists(filePath)) {
147
+ //there exists a compact binary version use that
148
+ VERBOSE(2,"Using compact lexical reordering table" << std::endl);
149
+ return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors);
150
+ }
151
+ #endif
152
+ return 0;
153
+ }
154
+
155
+ void
156
+ LexicalReorderingTableCompact::
157
+ Load(std::string filePath)
158
+ {
159
+ std::FILE* pFile = std::fopen(filePath.c_str(), "r");
160
+ UTIL_THROW_IF2(pFile == NULL, "File " << filePath << " could not be opened");
161
+
162
+ //if(m_inMemory)
163
+ m_hash.Load(pFile);
164
+ //else
165
+ //m_hash.LoadIndex(pFile);
166
+
167
+ size_t read = 0;
168
+ read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile);
169
+ read += std::fread(&m_multipleScoreTrees,
170
+ sizeof(m_multipleScoreTrees), 1, pFile);
171
+
172
+ if(m_multipleScoreTrees) {
173
+ m_scoreTrees.resize(m_numScoreComponent);
174
+ for(size_t i = 0; i < m_numScoreComponent; i++)
175
+ m_scoreTrees[i] = new CanonicalHuffman<float>(pFile);
176
+ } else {
177
+ m_scoreTrees.resize(1);
178
+ m_scoreTrees[0] = new CanonicalHuffman<float>(pFile);
179
+ }
180
+
181
+ if(m_inMemory)
182
+ m_scoresMemory.load(pFile, false);
183
+ else
184
+ m_scoresMapped.load(pFile, true);
185
+ }
186
+
187
+ void
188
+ LexicalReorderingTableCompact::
189
+ SetStaticDefaultParameters(Parameter const& param)
190
+ {
191
+ param.SetParameter(s_inMemoryByDefault, "minlexr-memory", false);
192
+ }
193
+
194
+
195
+ }
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_LexicalReorderingTableCompact_h
23
+ #define moses_LexicalReorderingTableCompact_h
24
+
25
+ #include "moses/FF/LexicalReordering/LexicalReorderingTable.h"
26
+ #include "moses/StaticData.h"
27
+ #include "moses/TranslationModel/PhraseDictionary.h"
28
+ #include "moses/GenerationDictionary.h"
29
+ #include "moses/TargetPhrase.h"
30
+ #include "moses/TargetPhraseCollection.h"
31
+
32
+ #include "BlockHashIndex.h"
33
+ #include "CanonicalHuffman.h"
34
+ #include "StringVector.h"
35
+
36
+ namespace Moses
37
+ {
38
+
39
+ class LexicalReorderingTableCompact:
40
+ public LexicalReorderingTable
41
+ {
42
+ private:
43
+ static bool s_inMemoryByDefault;
44
+ bool m_inMemory;
45
+
46
+ size_t m_numScoreComponent;
47
+ bool m_multipleScoreTrees;
48
+
49
+ BlockHashIndex m_hash;
50
+
51
+ typedef CanonicalHuffman<float> ScoreTree;
52
+ std::vector<ScoreTree*> m_scoreTrees;
53
+
54
+ StringVector<unsigned char, unsigned long, MmapAllocator> m_scoresMapped;
55
+ StringVector<unsigned char, unsigned long, std::allocator> m_scoresMemory;
56
+
57
+ std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const;
58
+ std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const;
59
+
60
+ public:
61
+ LexicalReorderingTableCompact(const std::string& filePath,
62
+ const std::vector<FactorType>& f_factors,
63
+ const std::vector<FactorType>& e_factors,
64
+ const std::vector<FactorType>& c_factors);
65
+
66
+ LexicalReorderingTableCompact(const std::vector<FactorType>& f_factors,
67
+ const std::vector<FactorType>& e_factors,
68
+ const std::vector<FactorType>& c_factors);
69
+
70
+ virtual
71
+ ~LexicalReorderingTableCompact();
72
+
73
+ virtual
74
+ std::vector<float>
75
+ GetScore(const Phrase& f, const Phrase& e, const Phrase& c);
76
+
77
+ static
78
+ LexicalReorderingTable*
79
+ CheckAndLoad(const std::string& filePath,
80
+ const std::vector<FactorType>& f_factors,
81
+ const std::vector<FactorType>& e_factors,
82
+ const std::vector<FactorType>& c_factors);
83
+
84
+ void
85
+ Load(std::string filePath);
86
+
87
+ static void
88
+ SetStaticDefaultParameters(Parameter const& param);
89
+
90
+ };
91
+
92
+ }
93
+
94
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_LexicalReorderingTableCreator_h
23
+ #define moses_LexicalReorderingTableCreator_h
24
+
25
+ #include "PhraseTableCreator.h"
26
+
27
+ namespace Moses
28
+ {
29
+
30
+ class LexicalReorderingTableCreator
31
+ {
32
+ private:
33
+ std::string m_inPath;
34
+ std::string m_outPath;
35
+ std::string m_tempfilePath;
36
+
37
+ std::FILE* m_outFile;
38
+
39
+ size_t m_orderBits;
40
+ size_t m_fingerPrintBits;
41
+
42
+ size_t m_numScoreComponent;
43
+
44
+ bool m_multipleScoreTrees;
45
+ bool m_quantize;
46
+
47
+ std::string m_separator;
48
+
49
+ BlockHashIndex m_hash;
50
+
51
+ typedef Counter<float> ScoreCounter;
52
+ typedef CanonicalHuffman<float> ScoreTree;
53
+
54
+ std::vector<ScoreCounter*> m_scoreCounters;
55
+ std::vector<ScoreTree*> m_scoreTrees;
56
+
57
+ StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores;
58
+ StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores;
59
+
60
+ std::priority_queue<PackedItem> m_queue;
61
+ long m_lastFlushedLine;
62
+ long m_lastFlushedSourceNum;
63
+ std::string m_lastFlushedSourcePhrase;
64
+ std::vector<std::string> m_lastRange;
65
+
66
+ #ifdef WITH_THREADS
67
+ size_t m_threads;
68
+ #endif
69
+
70
+ void PrintInfo();
71
+
72
+ void EncodeScores();
73
+ void CalcHuffmanCodes();
74
+ void CompressScores();
75
+ void Save();
76
+
77
+ std::string MakeSourceTargetKey(std::string&, std::string&);
78
+
79
+ std::string EncodeLine(std::vector<std::string>& tokens);
80
+ void AddEncodedLine(PackedItem& pi);
81
+ void FlushEncodedQueue(bool force = false);
82
+
83
+ std::string CompressEncodedScores(std::string &encodedScores);
84
+ void AddCompressedScores(PackedItem& pi);
85
+ void FlushCompressedQueue(bool force = false);
86
+
87
+ public:
88
+ LexicalReorderingTableCreator(std::string inPath,
89
+ std::string outPath,
90
+ std::string tempfilePath,
91
+ size_t orderBits = 10,
92
+ size_t fingerPrintBits = 16,
93
+ bool multipleScoreTrees = true,
94
+ size_t quantize = 0
95
+ #ifdef WITH_THREADS
96
+ , size_t threads = 2
97
+ #endif
98
+ );
99
+
100
+ ~LexicalReorderingTableCreator();
101
+
102
+ friend class EncodingTaskReordering;
103
+ friend class CompressionTaskReordering;
104
+ };
105
+
106
+ class EncodingTaskReordering
107
+ {
108
+ private:
109
+ #ifdef WITH_THREADS
110
+ static boost::mutex m_mutex;
111
+ static boost::mutex m_fileMutex;
112
+ #endif
113
+ static size_t m_lineNum;
114
+ static size_t m_sourcePhraseNum;
115
+ static std::string m_lastSourcePhrase;
116
+
117
+ InputFileStream& m_inFile;
118
+ LexicalReorderingTableCreator& m_creator;
119
+
120
+ public:
121
+ EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator);
122
+ void operator()();
123
+ };
124
+
125
+ class CompressionTaskReordering
126
+ {
127
+ private:
128
+ #ifdef WITH_THREADS
129
+ static boost::mutex m_mutex;
130
+ #endif
131
+ static size_t m_scoresNum;
132
+ StringVector<unsigned char, unsigned long, MmapAllocator> &m_encodedScores;
133
+ LexicalReorderingTableCreator &m_creator;
134
+
135
+ public:
136
+ CompressionTaskReordering(StringVector<unsigned char, unsigned long, MmapAllocator>&
137
+ m_encodedScores, LexicalReorderingTableCreator& creator);
138
+ void operator()();
139
+ };
140
+
141
+ }
142
+
143
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/ListCoders.h ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_ListCoders_h
23
+ #define moses_ListCoders_h
24
+
25
+ #include <cmath>
26
+ #include <cassert>
27
+
28
+ namespace Moses
29
+ {
30
+
31
/**
 * Variable-length integer coder using code units of type T.
 *
 * Every code unit carries sizeof(T)*8-1 payload bits; its most significant
 * bit is set when another unit of the same number follows. Units are written
 * least-significant group first.
 *
 * Changes over the previous version:
 *  - EncodeSymbol takes the output iterator by reference, so Encode() also
 *    works with iterators that have to be advanced (raw pointers, container
 *    iterators), not only with insert iterators.
 *  - The continuation bit is built from T(1), avoiding a signed-int shift
 *    into the sign bit for 32-bit T.
 *  - Code units are converted to T before masking, so decoding from a
 *    sequence of (signed) char no longer sign-extends.
 *  - A truncated final number no longer dereferences `end` when asserts are
 *    compiled out.
 *
 * Values to encode must be non-negative: the encoding loop shifts the value
 * right until it becomes zero.
 */
template <typename T = unsigned int>
class VarIntType
{
private:
  /// Writes one number as one or more code units and advances `output`.
  template <typename IntType, typename OutIt>
  static void EncodeSymbol(IntType input, OutIt &output) {
    if(input == 0) {
      *output = 0;
      ++output;
      return;
    }

    const T msb = static_cast<T>(static_cast<T>(1) << (sizeof(T)*8-1));
    const T payload = static_cast<T>(~msb);
    const IntType shift = (sizeof(T)*8-1);

    while(input) {
      T res = static_cast<T>(input & payload);
      input >>= shift;
      if(input)
        res |= msb;  // more groups follow
      *output = res;
      ++output;
    }
  }

  /// Reads one number starting at `it`, leaving `it` behind its last unit.
  template <typename InIt, typename IntType>
  static void DecodeSymbol(InIt &it, InIt end, IntType &output) {
    const T msb = static_cast<T>(static_cast<T>(1) << (sizeof(T)*8-1));
    const T payload = static_cast<T>(~msb);
    const IntType shift = (sizeof(T)*8-1);

    output = 0;
    size_t i = 0;
    while(it != end && (static_cast<T>(*it) & msb)) {
      IntType temp = static_cast<T>(*it) & payload;
      temp <<= shift*i;
      output |= temp;
      ++it;
      ++i;
    }
    assert(it != end);
    if(it == end)
      return;  // truncated input: keep what was decoded so far

    IntType temp = static_cast<T>(*it);
    temp <<= shift*i;
    output |= temp;
    ++it;
  }

public:

  /// Encodes every number in [it, end) to outIt.
  template <typename InIt, typename OutIt>
  static void Encode(InIt it, InIt end, OutIt outIt) {
    while(it != end) {
      EncodeSymbol(*it, outIt);
      it++;
    }
  }

  /// Decodes numbers until `it` reaches `end`, writing each one to outIt.
  template <typename InIt, typename OutIt>
  static void Decode(InIt &it, InIt end, OutIt outIt) {
    while(it != end) {
      size_t output;
      DecodeSymbol(it, end, output);
      *outIt = output;
      outIt++;
    }
  }

  /// Decodes at most `num` numbers and returns their sum; `it` is advanced
  /// past the units that were consumed.
  template <typename InIt>
  static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
    size_t sum = 0;
    size_t curr = 0;

    while(it != end && curr < num) {
      size_t output;
      DecodeSymbol(it, end, output);
      sum += output;
      curr++;
    }

    return sum;
  }

};
115
+
116
+ typedef VarIntType<unsigned char> VarByte; // variable-length coder with unsigned char code units
117
+
118
+ typedef VarByte VarInt8; // alias of VarByte
119
+ typedef VarIntType<unsigned short> VarInt16; // same coder with unsigned short code units
120
+ typedef VarIntType<unsigned int> VarInt32; // same coder with unsigned int code units
121
+
122
/**
 * Simple9 coder: packs several small unsigned integers into one 32-bit word.
 *
 * The top 4 bits of a word hold a selector (1..9); the remaining 28 bits
 * hold 1, 2, 3, 4, 5, 7, 9, 14 or 28 values of 28, 14, 9, 7, 5, 4, 3, 2 or
 * 1 bits each, first value in the most significant slot. Values above
 * 268435455 (2^28-1) cannot be encoded.
 *
 * Changes over the previous version:
 *  - Encode() computes the bit width of a value with integer arithmetic.
 *    The old `ceil(log(v+1)/log(2))` gave 0 for v == 0, which led to an
 *    integer division by zero, and depended on floating-point rounding.
 *  - Encode() rejects out-of-range values before computing the slot width
 *    (the width computation would otherwise divide by zero for them).
 *  - DecodeSymbol takes the output iterator by reference, so Decode() also
 *    works with iterators that have to be advanced (e.g. raw pointers).
 *  - The selector table shared by both decoders lives in one helper.
 */
class Simple9
{
private:
  typedef unsigned int uint;

  /// Bit layout of one word for a given selector.
  struct Layout {
    uint bitlen;  ///< width of one packed value
    uint shift;   ///< shift of the first (most significant) value
    uint mask;    ///< (1 << bitlen) - 1
  };

  /// Returns the layout for a selector. Selector 0 and selectors above 9
  /// map to an all-zero layout, which decodes as a single 0.
  static inline Layout LayoutFor(uint type) {
    static const Layout layouts[10] = {
      {  0,  0,         0 },
      { 28,  0, 268435455 },
      { 14, 14,     16383 },
      {  9, 18,       511 },
      {  7, 21,       127 },
      {  5, 20,        31 },
      {  4, 24,        15 },
      {  3, 24,         7 },
      {  2, 26,         3 },
      {  1, 27,         1 }
    };
    return type < 10 ? layouts[type] : layouts[0];
  }

  /// Number of bits needed to store `value`; 0 still occupies one bit.
  static inline uint RequiredBits(uint value) {
    uint bits = 1;
    while(value >>= 1)
      ++bits;
    return bits;
  }

  /// Packs the values in [it, end) into one word. The number of values
  /// selects the layout and is expected to be one of 1, 2, 3, 4, 5, 7, 9,
  /// 14, 28.
  template <typename InIt>
  inline static void EncodeSymbol(uint &output, InIt it, InIt end) {
    uint length = end - it;

    uint type = 0;
    uint bitlength = 0;

    switch(length) {
    case 1:
      type = 1;
      bitlength = 28;
      break;
    case 2:
      type = 2;
      bitlength = 14;
      break;
    case 3:
      type = 3;
      bitlength = 9;
      break;
    case 4:
      type = 4;
      bitlength = 7;
      break;
    case 5:
      type = 5;
      bitlength = 5;
      break;
    case 7:
      type = 6;
      bitlength = 4;
      break;
    case 9:
      type = 7;
      bitlength = 3;
      break;
    case 14:
      type = 8;
      bitlength = 2;
      break;
    case 28:
      type = 9;
      bitlength = 1;
      break;
    }

    output = 0;
    output |= (type << 28);

    uint i = 0;
    while(it != end) {
      UTIL_THROW_IF2(*it > 268435455, "You are trying to encode " << *it
                     << " with Simple9. Cannot encode numbers larger than 268435455 (2^28-1)");

      // The first value goes into the most significant slot.
      uint l = bitlength * (length-i-1);
      output |= *it << l;
      it++;
      i++;
    }
  }

  /// Unpacks every value of one word to outIt and advances outIt.
  template <typename OutIt>
  static inline void DecodeSymbol(uint input, OutIt &outIt) {
    const Layout layout = LayoutFor(input >> 28);

    for(uint shift = layout.shift; shift > 0; shift -= layout.bitlen) {
      *outIt = (input >> shift) & layout.mask;
      ++outIt;
    }
    // The last value sits in the lowest bits.
    *outIt = input & layout.mask;
    ++outIt;
  }

  /// Sums the values of one word, stopping early once `curr` reaches `num`.
  static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) {
    const Layout layout = LayoutFor(input >> 28);

    size_t sum = 0;
    for(uint shift = layout.shift; shift > 0; shift -= layout.bitlen) {
      sum += (input >> shift) & layout.mask;
      if(++curr == num)
        return sum;
    }
    sum += input & layout.mask;
    curr++;
    return sum;
  }

public:
  /// Encodes [it, end) into 32-bit words written to outIt. Each step picks
  /// the largest group size from `parts` whose values all fit into the
  /// corresponding slot width. Requires random-access input iterators.
  template <typename InIt, typename OutIt>
  static void Encode(InIt it, InIt end, OutIt outIt) {
    uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };

    uint buffer[28];
    for(InIt i = it; i < end; i++) {
      uint lastbit = 1;  // widest slot needed by the values seen so far
      uint lastpos = 0;  // look-ahead position within the current group
      uint lastyes = 0;  // last position that completed a valid group
      uint j = 0;        // index into parts

      while(j < 9 && lastpos < 28 && (i+lastpos) < end) {
        if(lastpos >= parts[j])
          j++;

        // Reject values that no layout can hold before deriving slot widths.
        UTIL_THROW_IF2(*(i + lastpos) > 268435455, "You are trying to encode " << *(i + lastpos)
                       << " with Simple9. Cannot encode numbers larger than 268435455 (2^28-1)");

        buffer[lastpos] = *(i + lastpos);

        uint reqbit = RequiredBits(buffer[lastpos]);
        assert(reqbit <= 28);

        // Smallest slot width of any layout that has at least reqbit bits.
        uint bit = 28 / (28 / reqbit);
        if(lastbit < bit)
          lastbit = bit;

        if(parts[j] > 28/lastbit)
          break;
        else if(lastpos == parts[j]-1)
          lastyes = lastpos;

        lastpos++;
      }
      i += lastyes;

      uint length = lastyes + 1;
      uint output;
      EncodeSymbol(output, buffer, buffer + length);

      *outIt = output;
      outIt++;
    }
  }

  /// Decodes every word in [it, end) to outIt.
  template <typename InIt, typename OutIt>
  static void Decode(InIt &it, InIt end, OutIt outIt) {
    while(it != end) {
      DecodeSymbol(*it, outIt);
      it++;
    }
  }

  /// Returns the sum of the first `num` decoded values; `it` is advanced
  /// past every word that was touched.
  template <typename InIt>
  static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
    size_t sum = 0;
    size_t curr = 0;
    while(it != end && curr < num) {
      sum += DecodeAndSumSymbol(*it, num, curr);
      it++;
    }
    assert(curr == num);
    return sum;
  }
};
384
+
385
+ }
386
+
387
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/PackedArray.h ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_PackedArray_h
23
+ #define moses_PackedArray_h
24
+
25
+ #include <vector>
26
+ #include <cmath>
27
+ #include <cstring>
28
+ #include <cstdio>
29
+
30
+ #include "ThrowingFwrite.h"
31
+
32
+ namespace Moses
33
+ {
34
+
35
+ template <typename T = size_t, typename D = unsigned char>
36
+ class PackedArray
37
+ {
38
+ protected:
39
+ static size_t m_dataBits;
40
+
41
+ size_t m_size;
42
+ size_t m_storageSize;
43
+ D* m_storage;
44
+
45
+ public:
46
+ PackedArray() {
47
+ m_size = 0;
48
+ m_storageSize = 0;
49
+ m_storage = new D[0];
50
+ }
51
+
52
+ PackedArray(size_t size, size_t bits) : m_size(size) {
53
+ m_storageSize = ceil(float(bits * size) / float(m_dataBits));
54
+ m_storage = new D[m_storageSize];
55
+ }
56
+
57
+ PackedArray(const PackedArray<T, D> &c) {
58
+ m_size = c.m_size;
59
+
60
+ m_storageSize = c.m_storageSize;
61
+ m_storage = new D[m_storageSize];
62
+
63
+ std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
64
+ }
65
+
66
+ virtual ~PackedArray() {
67
+ delete [] m_storage;
68
+ m_size = 0;
69
+ m_storageSize = 0;
70
+ m_storage = 0;
71
+ }
72
+
73
+ T Get(size_t i, size_t bits) const {
74
+ T out = 0;
75
+
76
+ size_t bitstart = (i * bits);
77
+ size_t bitpos = bitstart;
78
+
79
+ size_t zero = ((1ul << (bits)) - 1);
80
+
81
+ while(bitpos - bitstart < bits) {
82
+ size_t pos = bitpos / m_dataBits;
83
+ size_t off = bitpos % m_dataBits;
84
+
85
+ out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off;
86
+
87
+ bitpos += (m_dataBits - off);
88
+ }
89
+
90
+ out &= zero;
91
+ return out;
92
+ }
93
+
94
+ void Set(size_t i, T v, size_t bits) {
95
+ size_t bitstart = (i * bits);
96
+ size_t bitpos = bitstart;
97
+
98
+ while(bitpos - bitstart < bits) {
99
+ size_t pos = bitpos / m_dataBits;
100
+ size_t off = bitpos % m_dataBits;
101
+
102
+ size_t rest = bits - (bitpos - bitstart);
103
+ D zero = ~((1ul << (rest + off)) - 1) | ((1ul << off) - 1);
104
+
105
+ m_storage[pos] &= zero;
106
+ m_storage[pos] |= v << off;
107
+ v = v >> (m_dataBits - off);
108
+ bitpos += (m_dataBits - off);
109
+ }
110
+ }
111
+
112
+ virtual D*& GetStorage() {
113
+ return m_storage;
114
+ }
115
+
116
+ virtual size_t GetStorageSize() const {
117
+ return m_storageSize;
118
+ }
119
+
120
+ virtual size_t Size() const {
121
+ return m_size;
122
+ }
123
+
124
+ virtual size_t Load(std::FILE* in) {
125
+ size_t a1 = std::ftell(in);
126
+
127
+ size_t read = 0;
128
+ read += std::fread(&m_size, sizeof(m_size), 1, in);
129
+ read += std::fread(&m_storageSize, sizeof(m_storageSize), 1, in);
130
+ delete [] m_storage;
131
+ m_storage = new D[m_storageSize];
132
+ read += std::fread(m_storage, sizeof(D), m_storageSize, in);
133
+
134
+ size_t a2 = std::ftell(in);
135
+ return a2 - a1;
136
+ }
137
+
138
+ virtual size_t Save(std::FILE* out) {
139
+ size_t a1 = std::ftell(out);
140
+
141
+ ThrowingFwrite(&m_size, sizeof(m_size), 1, out);
142
+ ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out);
143
+ ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out);
144
+
145
+ size_t a2 = std::ftell(out);
146
+ return a2 - a1;
147
+ }
148
+
149
+ };
150
+
151
+ template <typename T, typename D>
152
+ size_t PackedArray<T, D>::m_dataBits = sizeof(D)*8;
153
+
154
+ /**************************************************************************/
155
+
156
+ template <typename T = size_t, typename D = unsigned char>
157
+ class PairedPackedArray : public PackedArray<T,D>
158
+ {
159
+ public:
160
+ PairedPackedArray() : PackedArray<T,D>() {}
161
+
162
+ PairedPackedArray(size_t size, size_t bits1, size_t bits2)
163
+ : PackedArray<T, D>(size, bits1 + bits2) { }
164
+
165
+ void Set(size_t i, T a, T b, size_t bits1, size_t bits2) {
166
+ T c = 0;
167
+ c = a | (b << bits1);
168
+ PackedArray<T,D>::Set(i, c, bits1 + bits2);
169
+ }
170
+
171
+ void Set(size_t i, std::pair<T,T> p, size_t bits1, size_t bits2) {
172
+ T c = 0;
173
+ c = p.second | (p.first << bits1);
174
+ PackedArray<T, D>::Set(i, c);
175
+ }
176
+
177
+ std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2) {
178
+ T v = PackedArray<T, D>::Get(i, bits1 + bits2);
179
+ T a = v & ((1 << bits1) - 1);
180
+ T b = v >> bits1;
181
+ return std::pair<T, T>(a, b);
182
+ }
183
+ };
184
+
185
+ }
186
+
187
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include <fstream>
23
+ #include <string>
24
+ #include <iterator>
25
+ #include <queue>
26
+ #include <algorithm>
27
+ #include <sys/stat.h>
28
+ #include <boost/algorithm/string/predicate.hpp>
29
+ #include <boost/thread/tss.hpp>
30
+
31
+ #include "PhraseDictionaryCompact.h"
32
+ #include "moses/FactorCollection.h"
33
+ #include "moses/Word.h"
34
+ #include "moses/Util.h"
35
+ #include "moses/InputFileStream.h"
36
+ #include "moses/StaticData.h"
37
+ #include "moses/Range.h"
38
+ #include "moses/ThreadPool.h"
39
+ #include "util/exception.hh"
40
+
41
+ using namespace std;
42
+ using namespace boost::algorithm;
43
+
44
+ namespace Moses
45
+ {
46
+
47
+ PhraseDictionaryCompact::SentenceCache PhraseDictionaryCompact::m_sentenceCache;
48
+
49
+ PhraseDictionaryCompact::PhraseDictionaryCompact(const std::string &line)
50
+ :PhraseDictionary(line, true)
51
+ ,m_inMemory(s_inMemoryByDefault)
52
+ ,m_useAlignmentInfo(true)
53
+ ,m_hash(10, 16)
54
+ ,m_phraseDecoder(0)
55
+ {
56
+ ReadParameters();
57
+ }
58
+
59
+ void PhraseDictionaryCompact::Load(AllOptions::ptr const& opts)
60
+ {
61
+ m_options = opts;
62
+ const StaticData &staticData = StaticData::Instance();
63
+
64
+ SetFeaturesToApply();
65
+
66
+ std::string tFilePath = m_filePath;
67
+
68
+ std::string suffix = ".minphr";
69
+ if (!ends_with(tFilePath, suffix)) tFilePath += suffix;
70
+ if (!FileExists(tFilePath))
71
+ throw runtime_error("Error: File " + tFilePath + " does not exist.");
72
+
73
+ m_phraseDecoder
74
+ = new PhraseDecoder(*this, &m_input, &m_output, m_numScoreComponents);
75
+
76
+ std::FILE* pFile = std::fopen(tFilePath.c_str() , "r");
77
+
78
+ size_t indexSize;
79
+ //if(m_inMemory)
80
+ // Load source phrase index into memory
81
+ indexSize = m_hash.Load(pFile);
82
+ // else
83
+ // Keep source phrase index on disk
84
+ //indexSize = m_hash.LoadIndex(pFile);
85
+
86
+ size_t coderSize = m_phraseDecoder->Load(pFile);
87
+
88
+ size_t phraseSize;
89
+ if(m_inMemory)
90
+ // Load target phrase collections into memory
91
+ phraseSize = m_targetPhrasesMemory.load(pFile, false);
92
+ else
93
+ // Keep target phrase collections on disk
94
+ phraseSize = m_targetPhrasesMapped.load(pFile, true);
95
+
96
+ UTIL_THROW_IF2(indexSize == 0 || coderSize == 0 || phraseSize == 0,
97
+ "Not successfully loaded");
98
+ }
99
+
100
+ TargetPhraseCollection::shared_ptr
101
+ PhraseDictionaryCompact::
102
+ GetTargetPhraseCollectionNonCacheLEGACY(const Phrase &sourcePhrase) const
103
+ {
104
+ //cerr << "sourcePhrase=" << sourcePhrase << endl;
105
+
106
+ TargetPhraseCollection::shared_ptr ret;
107
+ // There is no souch source phrase if source phrase is longer than longest
108
+ // observed source phrase during compilation
109
+ if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
110
+ return ret;
111
+
112
+ // Retrieve target phrase collection from phrase table
113
+ TargetPhraseVectorPtr decodedPhraseColl
114
+ = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, true);
115
+
116
+ if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
117
+ TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));
118
+ TargetPhraseCollection::shared_ptr phraseColl(new TargetPhraseCollection);
119
+
120
+ // Score phrases and if possible apply ttable_limit
121
+ TargetPhraseVector::iterator nth =
122
+ (m_tableLimit == 0 || tpv->size() < m_tableLimit) ?
123
+ tpv->end() : tpv->begin() + m_tableLimit;
124
+ NTH_ELEMENT4(tpv->begin(), nth, tpv->end(), CompareTargetPhrase());
125
+ for(TargetPhraseVector::iterator it = tpv->begin(); it != nth; it++) {
126
+ TargetPhrase *tp = new TargetPhrase(*it);
127
+ phraseColl->Add(tp);
128
+ }
129
+
130
+ // Cache phrase pair for clean-up or retrieval with PREnc
131
+ const_cast<PhraseDictionaryCompact*>(this)->CacheForCleanup(phraseColl);
132
+
133
+ return phraseColl;
134
+ } else
135
+ return ret;
136
+ }
137
+
138
+ TargetPhraseVectorPtr
139
+ PhraseDictionaryCompact::
140
+ GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase) const
141
+ {
142
+
143
+ // There is no such source phrase if source phrase is longer than longest
144
+ // observed source phrase during compilation
145
+ if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
146
+ return TargetPhraseVectorPtr();
147
+
148
+ // Retrieve target phrase collection from phrase table
149
+ return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, false);
150
+ }
151
+
152
+ PhraseDictionaryCompact::
153
+ ~PhraseDictionaryCompact()
154
+ {
155
+ if(m_phraseDecoder)
156
+ delete m_phraseDecoder;
157
+ }
158
+
159
+ void
160
+ PhraseDictionaryCompact::
161
+ CacheForCleanup(TargetPhraseCollection::shared_ptr tpc)
162
+ {
163
+ if(!m_sentenceCache.get())
164
+ m_sentenceCache.reset(new PhraseCache());
165
+ m_sentenceCache->push_back(tpc);
166
+ }
167
+
168
+ void
169
+ PhraseDictionaryCompact::
170
+ AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase)
171
+ { }
172
+
173
+ void
174
+ PhraseDictionaryCompact::
175
+ CleanUpAfterSentenceProcessing(const InputType &source)
176
+ {
177
+ if(!m_sentenceCache.get())
178
+ m_sentenceCache.reset(new PhraseCache());
179
+
180
+ m_phraseDecoder->PruneCache();
181
+ m_sentenceCache->clear();
182
+
183
+ ReduceCache();
184
+ }
185
+
186
+ bool PhraseDictionaryCompact::s_inMemoryByDefault = false;
187
+ void
188
+ PhraseDictionaryCompact::
189
+ SetStaticDefaultParameters(Parameter const& param)
190
+ {
191
+ param.SetParameter(s_inMemoryByDefault, "minphr-memory", false);
192
+ }
193
+ }
194
+
mosesdecoder/moses/TranslationModel/CompactPT/StringVectorTemp.h ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_StringVectorTemp_h
23
+ #define moses_StringVectorTemp_h
24
+
25
+ #include <vector>
26
+ #include <algorithm>
27
+ #include <string>
28
+ #include <iterator>
29
+ #include <cstdio>
30
+ #include <cassert>
31
+
32
+ #include <boost/iterator/iterator_facade.hpp>
33
+
34
+ #include "ThrowingFwrite.h"
35
+ #include "StringVector.h"
36
+
37
+ #include "MmapAllocator.h"
38
+
39
+ namespace Moses
40
+ {
41
+
42
+
43
+ // ********** StringVectorTemp **********
44
+
45
+ template <typename ValueT = unsigned char, typename PosT = unsigned int,
46
+ template <typename> class Allocator = std::allocator>
47
+ class StringVectorTemp
48
+ {
49
+ protected:
50
+ bool m_sorted;
51
+ bool m_memoryMapped;
52
+
53
+ std::vector<ValueT, Allocator<ValueT> >* m_charArray;
54
+ std::vector<PosT> m_positions;
55
+
56
+ virtual const ValueT* value_ptr(PosT i) const;
57
+
58
+ public:
59
+ //typedef ValueIteratorRange<typename std::vector<ValueT, Allocator<ValueT> >::const_iterator> range;
60
+ typedef ValueIteratorRange<const ValueT *> range;
61
+
62
+ // ********** RangeIterator **********
63
+
64
+ class RangeIterator : public boost::iterator_facade<RangeIterator,
65
+ range, std::random_access_iterator_tag, range, PosT>
66
+ {
67
+
68
+ private:
69
+ PosT m_index;
70
+ StringVectorTemp<ValueT, PosT, Allocator>* m_container;
71
+
72
+ public:
73
+ RangeIterator();
74
+ RangeIterator(StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index=0);
75
+
76
+ PosT get_index();
77
+
78
+ private:
79
+ friend class boost::iterator_core_access;
80
+
81
+ range dereference() const;
82
+ bool equal(RangeIterator const& other) const;
83
+ void increment();
84
+ void decrement();
85
+ void advance(PosT n);
86
+
87
+ PosT distance_to(RangeIterator const& other) const;
88
+ };
89
+
90
+ // ********** StringIterator **********
91
+
92
+ class StringIterator : public boost::iterator_facade<StringIterator,
93
+ std::string, std::random_access_iterator_tag, const std::string, PosT>
94
+ {
95
+
96
+ private:
97
+ PosT m_index;
98
+ StringVectorTemp<ValueT, PosT, Allocator>* m_container;
99
+
100
+ public:
101
+ StringIterator();
102
+ StringIterator(StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index=0);
103
+
104
+ PosT get_index();
105
+
106
+ private:
107
+ friend class boost::iterator_core_access;
108
+
109
+ const std::string dereference() const;
110
+ bool equal(StringIterator const& other) const;
111
+ void increment();
112
+ void decrement();
113
+ void advance(PosT n);
114
+ PosT distance_to(StringIterator const& other) const;
115
+ };
116
+
117
+ typedef RangeIterator iterator;
118
+ typedef StringIterator string_iterator;
119
+
120
+ StringVectorTemp();
121
+ StringVectorTemp(Allocator<ValueT> alloc);
122
+
123
+ virtual ~StringVectorTemp() {
124
+ delete m_charArray;
125
+ }
126
+
127
+ void swap(StringVectorTemp<ValueT, PosT, Allocator> &c) {
128
+ m_positions.swap(c.m_positions);
129
+ m_charArray->swap(*c.m_charArray);
130
+
131
+ bool temp = m_sorted;
132
+ m_sorted = c.m_sorted;
133
+ c.m_sorted = temp;
134
+ }
135
+
136
+ bool is_sorted() const;
137
+ PosT size() const;
138
+ virtual PosT size2() const;
139
+
140
+ template<class Iterator> Iterator begin() const;
141
+ template<class Iterator> Iterator end() const;
142
+
143
+ iterator begin() const;
144
+ iterator end() const;
145
+
146
+ PosT length(PosT i) const;
147
+ //typename std::vector<ValueT, Allocator<ValueT> >::const_iterator begin(PosT i) const;
148
+ //typename std::vector<ValueT, Allocator<ValueT> >::const_iterator end(PosT i) const;
149
+ const ValueT* begin(PosT i) const;
150
+ const ValueT* end(PosT i) const;
151
+
152
+ void clear() {
153
+ m_charArray->clear();
154
+ m_sorted = true;
155
+ m_positions.clear();
156
+ }
157
+
158
+ range at(PosT i) const;
159
+ range operator[](PosT i) const;
160
+ range back() const;
161
+
162
+ template <typename StringT>
163
+ void push_back(StringT s);
164
+ void push_back(const char* c);
165
+
166
+ template <typename StringT>
167
+ PosT find(StringT &s) const;
168
+ PosT find(const char* c) const;
169
+ };
170
+
171
+ // ********** Implementation **********
172
+
173
+ // StringVectorTemp
174
+
175
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
176
+ StringVectorTemp<ValueT, PosT, Allocator>::StringVectorTemp()
177
+ : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >()) { }
178
+
179
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
180
+ StringVectorTemp<ValueT, PosT, Allocator>::StringVectorTemp(Allocator<ValueT> alloc)
181
+ : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc)) { }
182
+
183
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
184
+ template <typename StringT>
185
+ void StringVectorTemp<ValueT, PosT, Allocator>::push_back(StringT s)
186
+ {
187
+ if(is_sorted() && size() && !(back() < s))
188
+ m_sorted = false;
189
+
190
+ m_positions.push_back(size2());
191
+ std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray));
192
+ }
193
+
194
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
195
+ void StringVectorTemp<ValueT, PosT, Allocator>::push_back(const char* c)
196
+ {
197
+ std::string dummy(c);
198
+ push_back(dummy);
199
+ }
200
+
201
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
202
+ template <typename Iterator>
203
+ Iterator StringVectorTemp<ValueT, PosT, Allocator>::begin() const
204
+ {
205
+ return Iterator(const_cast<StringVectorTemp<ValueT, PosT, Allocator>&>(*this), 0);
206
+ }
207
+
208
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
209
+ template <typename Iterator>
210
+ Iterator StringVectorTemp<ValueT, PosT, Allocator>::end() const
211
+ {
212
+ return Iterator(const_cast<StringVectorTemp<ValueT, PosT, Allocator>&>(*this), size());
213
+ }
214
+
215
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
216
+ typename StringVectorTemp<ValueT, PosT, Allocator>::iterator StringVectorTemp<ValueT, PosT, Allocator>::begin() const
217
+ {
218
+ return begin<iterator>();
219
+ };
220
+
221
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
222
+ typename StringVectorTemp<ValueT, PosT, Allocator>::iterator StringVectorTemp<ValueT, PosT, Allocator>::end() const
223
+ {
224
+ return end<iterator>();
225
+ };
226
+
227
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
228
+ bool StringVectorTemp<ValueT, PosT, Allocator>::is_sorted() const
229
+ {
230
+ return m_sorted;
231
+ }
232
+
233
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
234
+ PosT StringVectorTemp<ValueT, PosT, Allocator>::size() const
235
+ {
236
+ return m_positions.size();
237
+ }
238
+
239
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
240
+ PosT StringVectorTemp<ValueT, PosT, Allocator>::size2() const
241
+ {
242
+ return m_charArray->size();
243
+ }
244
+
245
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
246
+ typename StringVectorTemp<ValueT, PosT, Allocator>::range StringVectorTemp<ValueT, PosT, Allocator>::at(PosT i) const
247
+ {
248
+ return range(begin(i), end(i));
249
+ }
250
+
251
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
252
+ typename StringVectorTemp<ValueT, PosT, Allocator>::range StringVectorTemp<ValueT, PosT, Allocator>::operator[](PosT i) const
253
+ {
254
+ return at(i);
255
+ }
256
+
257
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
258
+ typename StringVectorTemp<ValueT, PosT, Allocator>::range StringVectorTemp<ValueT, PosT, Allocator>::back() const
259
+ {
260
+ return at(size()-1);
261
+ }
262
+
263
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
264
+ PosT StringVectorTemp<ValueT, PosT, Allocator>::length(PosT i) const
265
+ {
266
+ if(i+1 < size())
267
+ return m_positions[i+1] - m_positions[i];
268
+ else
269
+ return size2() - m_positions[i];
270
+ }
271
+
272
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
273
+ const ValueT* StringVectorTemp<ValueT, PosT, Allocator>::value_ptr(PosT i) const
274
+ {
275
+ return &(*m_charArray)[m_positions[i]];
276
+ }
277
+
278
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
279
+ //typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVectorTemp<ValueT, PosT, Allocator>::begin(PosT i) const
280
+ const ValueT* StringVectorTemp<ValueT, PosT, Allocator>::begin(PosT i) const
281
+ {
282
+ //return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i));
283
+ return value_ptr(i);
284
+ }
285
+
286
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
287
+ //typename std::vector<ValueT, Allocator<ValueT> >::const_iterator StringVectorTemp<ValueT, PosT, Allocator>::end(PosT i) const
288
+ const ValueT* StringVectorTemp<ValueT, PosT, Allocator>::end(PosT i) const
289
+ {
290
+ //return typename std::vector<ValueT, Allocator<ValueT> >::const_iterator(value_ptr(i) + length(i));
291
+ return value_ptr(i) + length(i);
292
+ }
293
+
294
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
295
+ template <typename StringT>
296
+ PosT StringVectorTemp<ValueT, PosT, Allocator>::find(StringT &s) const
297
+ {
298
+ if(m_sorted)
299
+ return std::distance(begin(), std::lower_bound(begin(), end(), s));
300
+ return std::distance(begin(), std::find(begin(), end(), s));
301
+ }
302
+
303
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
304
+ PosT StringVectorTemp<ValueT, PosT, Allocator>::find(const char* c) const
305
+ {
306
+ std::string s(c);
307
+ return find(s);
308
+ }
309
+
310
+ // RangeIterator
311
+
312
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
313
+ StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::RangeIterator() : m_index(0), m_container(0) { }
314
+
315
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
316
+ StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::RangeIterator(StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index)
317
+ : m_index(index), m_container(&sv) { }
318
+
319
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
320
+ PosT StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::get_index()
321
+ {
322
+ return m_index;
323
+ }
324
+
325
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
326
+ typename StringVectorTemp<ValueT, PosT, Allocator>::range
327
+ StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::dereference() const
328
+ {
329
+ return typename StringVectorTemp<ValueT, PosT, Allocator>::range(
330
+ m_container->begin(m_index),
331
+ m_container->end(m_index)
332
+ );
333
+ }
334
+
335
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
336
+ bool StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::equal(
337
+ StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator const& other) const
338
+ {
339
+ return m_index == other.m_index && m_container == other.m_container;
340
+ }
341
+
342
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
343
+ void StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::increment()
344
+ {
345
+ m_index++;
346
+ }
347
+
348
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
349
+ void StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::decrement()
350
+ {
351
+ m_index--;
352
+ }
353
+
354
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
355
+ void StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::advance(PosT n)
356
+ {
357
+ m_index += n;
358
+ }
359
+
360
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
361
+ PosT StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::distance_to(
362
+ StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator const& other) const
363
+ {
364
+ return other.m_index - m_index;
365
+ }
366
+
367
+ // StringIterator
368
+
369
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
370
+ StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::StringIterator()
371
+ : m_index(0), m_container(0) { }
372
+
373
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
374
+ StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::StringIterator(
375
+ StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index) : m_index(index),
376
+ m_container(&sv) { }
377
+
378
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
379
+ PosT StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::get_index()
380
+ {
381
+ return m_index;
382
+ }
383
+
384
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
385
+ const std::string StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::dereference() const
386
+ {
387
+ return StringVectorTemp<ValueT, PosT, Allocator>::range(m_container->begin(m_index),
388
+ m_container->end(m_index)).str();
389
+ }
390
+
391
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
392
+ bool StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::equal(
393
+ StringVectorTemp<ValueT, PosT, Allocator>::StringIterator const& other) const
394
+ {
395
+ return m_index == other.m_index && m_container == other.m_container;
396
+ }
397
+
398
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
399
+ void StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::increment()
400
+ {
401
+ m_index++;
402
+ }
403
+
404
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
405
+ void StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::decrement()
406
+ {
407
+ m_index--;
408
+ }
409
+
410
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
411
+ void StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::advance(PosT n)
412
+ {
413
+ m_index += n;
414
+ }
415
+
416
+ template<typename ValueT, typename PosT, template <typename> class Allocator>
417
+ PosT StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::distance_to(
418
+ StringVectorTemp<ValueT, PosT, Allocator>::StringIterator const& other) const
419
+ {
420
+ return other.m_index - m_index;
421
+ }
422
+
423
+ // ********** Some typedefs **********
424
+
425
+ typedef StringVectorTemp<unsigned char, unsigned int> MediumStringVectorTemp;
426
+ typedef StringVectorTemp<unsigned char, unsigned long> LongStringVectorTemp;
427
+
428
+ }
429
+
430
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_TargetPhraseCollectionCache_h
23
+ #define moses_TargetPhraseCollectionCache_h
24
+
25
+ #include <map>
26
+ #include <set>
27
+ #include <vector>
28
+
29
+ #include <boost/thread/tss.hpp>
30
+ #include <boost/shared_ptr.hpp>
31
+
32
+ #include "moses/Phrase.h"
33
+ #include "moses/TargetPhraseCollection.h"
34
+
35
+ namespace Moses
36
+ {
37
+
38
+ // Avoid using new due to locking
39
+ typedef std::vector<TargetPhrase> TargetPhraseVector;
40
+ typedef boost::shared_ptr<TargetPhraseVector> TargetPhraseVectorPtr;
41
+
42
+ /** Implementation of Persistent Cache **/
43
+ class TargetPhraseCollectionCache
44
+ {
45
+ private:
46
+ size_t m_max;
47
+ float m_tolerance;
48
+
49
+ struct LastUsed {
50
+ clock_t m_clock;
51
+ TargetPhraseVectorPtr m_tpv;
52
+ size_t m_bitsLeft;
53
+
54
+ LastUsed() : m_clock(0), m_bitsLeft(0) {}
55
+
56
+ LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0)
57
+ : m_clock(clock), m_tpv(tpv), m_bitsLeft(bitsLeft) {}
58
+ };
59
+
60
+ typedef std::map<Phrase, LastUsed> CacheMap;
61
+ static boost::thread_specific_ptr<CacheMap> m_phraseCache;
62
+
63
+ public:
64
+
65
+ typedef CacheMap::iterator iterator;
66
+ typedef CacheMap::const_iterator const_iterator;
67
+
68
+ TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2)
69
+ : m_max(max), m_tolerance(tolerance) {
70
+ }
71
+
72
+ iterator Begin() {
73
+ if(!m_phraseCache.get())
74
+ m_phraseCache.reset(new CacheMap());
75
+ return m_phraseCache->begin();
76
+ }
77
+
78
+ const_iterator Begin() const {
79
+ if(!m_phraseCache.get())
80
+ m_phraseCache.reset(new CacheMap());
81
+ return m_phraseCache->begin();
82
+ }
83
+
84
+ iterator End() {
85
+ if(!m_phraseCache.get())
86
+ m_phraseCache.reset(new CacheMap());
87
+ return m_phraseCache->end();
88
+ }
89
+
90
+ const_iterator End() const {
91
+ if(!m_phraseCache.get())
92
+ m_phraseCache.reset(new CacheMap());
93
+ return m_phraseCache->end();
94
+ }
95
+
96
+ /** retrieve translations for source phrase from persistent cache **/
97
+ void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv,
98
+ size_t bitsLeft = 0, size_t maxRank = 0) {
99
+ if(!m_phraseCache.get())
100
+ m_phraseCache.reset(new CacheMap());
101
+ // check if source phrase is already in cache
102
+ iterator it = m_phraseCache->find(sourcePhrase);
103
+ if(it != m_phraseCache->end())
104
+ // if found, just update clock
105
+ it->second.m_clock = clock();
106
+ else {
107
+ // else, add to cache
108
+ if(maxRank && tpv->size() > maxRank) {
109
+ TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector());
110
+ tpv_temp->resize(maxRank);
111
+ std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin());
112
+ (*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft);
113
+ } else
114
+ (*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft);
115
+ }
116
+ }
117
+
118
+ std::pair<TargetPhraseVectorPtr, size_t> Retrieve(const Phrase &sourcePhrase) {
119
+ if(!m_phraseCache.get())
120
+ m_phraseCache.reset(new CacheMap());
121
+ iterator it = m_phraseCache->find(sourcePhrase);
122
+ if(it != m_phraseCache->end()) {
123
+ LastUsed &lu = it->second;
124
+ lu.m_clock = clock();
125
+ return std::make_pair(lu.m_tpv, lu.m_bitsLeft);
126
+ } else
127
+ return std::make_pair(TargetPhraseVectorPtr(), 0);
128
+ }
129
+
130
+ // if cache full, reduce
131
+ void Prune() {
132
+ if(!m_phraseCache.get())
133
+ m_phraseCache.reset(new CacheMap());
134
+ if(m_phraseCache->size() > m_max * (1 + m_tolerance)) {
135
+ typedef std::set<std::pair<clock_t, Phrase> > Cands;
136
+ Cands cands;
137
+ for(CacheMap::iterator it = m_phraseCache->begin();
138
+ it != m_phraseCache->end(); it++) {
139
+ LastUsed &lu = it->second;
140
+ cands.insert(std::make_pair(lu.m_clock, it->first));
141
+ }
142
+
143
+ for(Cands::iterator it = cands.begin(); it != cands.end(); it++) {
144
+ const Phrase& p = it->second;
145
+ m_phraseCache->erase(p);
146
+
147
+ if(m_phraseCache->size() < (m_max * (1 - m_tolerance)))
148
+ break;
149
+ }
150
+ }
151
+ }
152
+
153
+ void CleanUp() {
154
+ if(!m_phraseCache.get())
155
+ m_phraseCache.reset(new CacheMap());
156
+ m_phraseCache->clear();
157
+ }
158
+
159
+ };
160
+
161
+ }
162
+
163
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "ThrowingFwrite.h"
23
+
24
+ size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream)
25
+ {
26
+ assert(size);
27
+ size_t returnValue = std::fwrite(ptr, size, count, stream);
28
+ UTIL_THROW_IF2(count != returnValue, "Short fwrite; requested size " << size);
29
+ return returnValue;
30
+ }
mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.h ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_ThrowingFwrite_h
23
+ #define moses_ThrowingFwrite_h
24
+
25
+ #include <cassert>
26
+ #include <cstdio>
27
+ #include "util/exception.hh"
28
+
29
+ size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream);
30
+
31
+ #endif
mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.cpp ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "LoaderFactory.h"
21
+
22
+ #include "moses/Util.h"
23
+ #include "moses/InputFileStream.h"
24
+ #include "LoaderCompact.h"
25
+ #include "LoaderHiero.h"
26
+ #include "LoaderStandard.h"
27
+
28
+ #include <sstream>
29
+ #include <iostream>
30
+
31
+ using namespace std;
32
+
33
+ namespace Moses
34
+ {
35
+
36
+ // Determines the rule table type by peeking inside the file then creates
37
+ // a suitable RuleTableLoader object.
38
+ std::auto_ptr<RuleTableLoader>
39
+ RuleTableLoaderFactory::
40
+ Create(const std::string &path)
41
+ {
42
+ InputFileStream input(path);
43
+ std::string line;
44
+
45
+ if (std::getline(input, line)) {
46
+ std::vector<std::string> tokens;
47
+ Tokenize(tokens, line);
48
+ if (tokens.size() == 1) {
49
+ if (tokens[0] == "1") {
50
+ return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderCompact());
51
+ }
52
+ std::cerr << "Unsupported compact rule table format: " << tokens[0];
53
+ return std::auto_ptr<RuleTableLoader>();
54
+ } else if (tokens[0] == "[X]" && tokens[1] == "|||") {
55
+ return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderHiero());
56
+ }
57
+
58
+ return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderStandard());
59
+ } else {
60
+ // empty phrase table
61
+ return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderStandard());
62
+ }
63
+ }
64
+
65
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.cpp ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // RuleTableLoaderHiero.cpp
3
+ // moses
4
+ //
5
+ // Created by Hieu Hoang on 04/11/2011.
6
+ // Copyright 2011 __MyCompanyName__. All rights reserved.
7
+ //
8
+
9
+ #include <iostream>
10
+ #include "LoaderHiero.h"
11
+
12
+ using namespace std;
13
+
14
+ namespace Moses
15
+ {
16
+
17
+ bool RuleTableLoaderHiero::Load(AllOptions const& opts,
18
+ const std::vector<FactorType> &input,
19
+ const std::vector<FactorType> &output,
20
+ const std::string &inFile,
21
+ size_t tableLimit,
22
+ RuleTableTrie &ruleTable)
23
+ {
24
+ bool ret = RuleTableLoaderStandard::Load(opts, HieroFormat
25
+ ,input, output
26
+ ,inFile
27
+ ,tableLimit
28
+ ,ruleTable);
29
+ return ret;
30
+ }
31
+
32
+ }
33
+
mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.cpp ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2011 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "LoaderStandard.h"
21
+
22
+ #include <fstream>
23
+ #include <string>
24
+ #include <iterator>
25
+ #include <algorithm>
26
+ #include <iostream>
27
+ #include <sys/stat.h>
28
+ #include <cstdlib>
29
+ #include <boost/algorithm/string/predicate.hpp>
30
+ #include "Trie.h"
31
+ #include "moses/FactorCollection.h"
32
+ #include "moses/Word.h"
33
+ #include "moses/Util.h"
34
+ #include "moses/InputFileStream.h"
35
+ #include "moses/StaticData.h"
36
+ #include "moses/Range.h"
37
+ #include "moses/ChartTranslationOptionList.h"
38
+ #include "moses/FactorCollection.h"
39
+ #include "util/file_piece.hh"
40
+ #include "util/string_piece.hh"
41
+ #include "util/tokenize_piece.hh"
42
+ #include "util/double-conversion/double-conversion.h"
43
+ #include "util/exception.hh"
44
+
45
+ using namespace std;
46
+ using namespace boost::algorithm;
47
+
48
+ namespace Moses
49
+ {
50
+
51
+ bool
52
+ RuleTableLoaderStandard::
53
+ Load(AllOptions const& opts
54
+ , const std::vector<FactorType> &input
55
+ , const std::vector<FactorType> &output
56
+ , const std::string &inFile
57
+ , size_t tableLimit
58
+ , RuleTableTrie &ruleTable)
59
+ {
60
+ return Load(opts, MosesFormat,input, output ,inFile ,tableLimit ,ruleTable);
61
+ }
62
+
63
+ void ReformatHieroRule(int sourceTarget, string &phrase, map<size_t, pair<size_t, size_t> > &ntAlign)
64
+ {
65
+ vector<string> toks;
66
+ Tokenize(toks, phrase, " ");
67
+
68
+ for (size_t i = 0; i < toks.size(); ++i) {
69
+ string &tok = toks[i];
70
+ if (starts_with(tok, "[") && ends_with(tok, "]")) {
71
+ // no-term
72
+ vector<string> split = Tokenize(tok, ",");
73
+ UTIL_THROW_IF2(split.size() != 2,
74
+ "Incorrectly formmatted non-terminal: " << tok);
75
+
76
+ tok = "[X]" + split[0] + "]";
77
+ size_t coIndex = Scan<size_t>(split[1]);
78
+
79
+ pair<size_t, size_t> &alignPoint = ntAlign[coIndex];
80
+ if (sourceTarget == 0) {
81
+ alignPoint.first = i;
82
+ } else {
83
+ alignPoint.second = i;
84
+ }
85
+ }
86
+ }
87
+
88
+ phrase = Join(" ", toks) + " [X]";
89
+
90
+ }
91
+
92
+ void ReformateHieroScore(string &scoreString)
93
+ {
94
+ vector<string> toks;
95
+ Tokenize(toks, scoreString, " ");
96
+
97
+ for (size_t i = 0; i < toks.size(); ++i) {
98
+ string &tok = toks[i];
99
+ vector<string> nameValue = Tokenize(tok, "=");
100
+ UTIL_THROW_IF2(nameValue.size() != 2,
101
+ "Incorrectly formatted score: " << tok);
102
+
103
+ float score = Scan<float>(nameValue[1]);
104
+ score = exp(-score);
105
+ tok = SPrint(score);
106
+ }
107
+
108
+ scoreString = Join(" ", toks);
109
+ }
110
+
111
+ void ReformatHieroRule(const string &lineOrig, string &out)
112
+ {
113
+ vector<string> tokens;
114
+ vector<float> scoreVector;
115
+
116
+ TokenizeMultiCharSeparator(tokens, lineOrig, "|||" );
117
+
118
+ string &sourcePhraseString = tokens[1]
119
+ , &targetPhraseString = tokens[2]
120
+ , &scoreString = tokens[3];
121
+
122
+ map<size_t, pair<size_t, size_t> > ntAlign;
123
+ ReformatHieroRule(0, sourcePhraseString, ntAlign);
124
+ ReformatHieroRule(1, targetPhraseString, ntAlign);
125
+ ReformateHieroScore(scoreString);
126
+
127
+ util::StringStream align;
128
+ map<size_t, pair<size_t, size_t> >::const_iterator iterAlign;
129
+ for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign) {
130
+ const pair<size_t, size_t> &alignPoint = iterAlign->second;
131
+ align << alignPoint.first << "-" << alignPoint.second << " ";
132
+ }
133
+
134
+ util::StringStream ret;
135
+ ret << sourcePhraseString << " ||| "
136
+ << targetPhraseString << " ||| "
137
+ << scoreString << " ||| "
138
+ << align.str();
139
+
140
+ out = ret.str();
141
+ }
142
+
143
+ bool RuleTableLoaderStandard::Load(AllOptions const& opts, FormatType format
144
+ , const std::vector<FactorType> &input
145
+ , const std::vector<FactorType> &output
146
+ , const std::string &inFile
147
+ , size_t /* tableLimit */
148
+ , RuleTableTrie &ruleTable)
149
+ {
150
+ PrintUserTime(string("Start loading text phrase table. ") + (format==MosesFormat?"Moses":"Hiero") + " format");
151
+
152
+ // const StaticData &staticData = StaticData::Instance();
153
+
154
+ string lineOrig;
155
+ size_t count = 0;
156
+
157
+ std::ostream *progress = NULL;
158
+ IFVERBOSE(1) progress = &std::cerr;
159
+ util::FilePiece in(inFile.c_str(), progress);
160
+
161
+ // reused variables
162
+ vector<float> scoreVector;
163
+ StringPiece line;
164
+ std::string hiero_before, hiero_after;
165
+
166
+ double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
167
+
168
+ while(true) {
169
+ try {
170
+ line = in.ReadLine();
171
+ } catch (const util::EndOfFileException &e) {
172
+ break;
173
+ }
174
+
175
+ if (format == HieroFormat) { // inefficiently reformat line
176
+ hiero_before.assign(line.data(), line.size());
177
+ ReformatHieroRule(hiero_before, hiero_after);
178
+ line = hiero_after;
179
+ }
180
+
181
+ util::TokenIter<util::MultiCharacter> pipes(line, "|||");
182
+ StringPiece sourcePhraseString(*pipes);
183
+ StringPiece targetPhraseString(*++pipes);
184
+ StringPiece scoreString(*++pipes);
185
+
186
+ StringPiece alignString;
187
+ if (++pipes) {
188
+ StringPiece temp(*pipes);
189
+ alignString = temp;
190
+ }
191
+
192
+ bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
193
+ if (isLHSEmpty && !opts.unk.word_deletion_enabled) {
194
+ TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
195
+ continue;
196
+ }
197
+
198
+ scoreVector.clear();
199
+ for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
200
+ int processed;
201
+ float score = converter.StringToFloat(s->data(), s->length(), &processed);
202
+ UTIL_THROW_IF2(isnan(score), "Bad score " << *s << " on line " << count);
203
+ scoreVector.push_back(FloorScore(TransformScore(score)));
204
+ }
205
+ const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
206
+ if (scoreVector.size() != numScoreComponents) {
207
+ UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
208
+ << numScoreComponents << ") of score components on line " << count);
209
+ }
210
+
211
+ // parse source & find pt node
212
+
213
+ // constituent labels
214
+ Word *sourceLHS = NULL;
215
+ Word *targetLHS;
216
+
217
+ // create target phrase obj
218
+ TargetPhrase *targetPhrase = new TargetPhrase(&ruleTable);
219
+ targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
220
+ // source
221
+ Phrase sourcePhrase;
222
+ sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);
223
+
224
+ // rest of target phrase
225
+ targetPhrase->SetAlignmentInfo(alignString);
226
+ targetPhrase->SetTargetLHS(targetLHS);
227
+
228
+ ++pipes; // skip over counts field
229
+
230
+ if (++pipes) {
231
+ StringPiece sparseString(*pipes);
232
+ targetPhrase->SetSparseScore(&ruleTable, sparseString);
233
+ }
234
+
235
+ if (++pipes) {
236
+ StringPiece propertiesString(*pipes);
237
+ targetPhrase->SetProperties(propertiesString);
238
+ }
239
+
240
+ targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
241
+ targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());
242
+
243
+ TargetPhraseCollection::shared_ptr phraseColl
244
+ = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase,
245
+ *targetPhrase, sourceLHS);
246
+ phraseColl->Add(targetPhrase);
247
+
248
+ // not implemented correctly in memory pt. just delete it for now
249
+ delete sourceLHS;
250
+
251
+ count++;
252
+ }
253
+
254
+ // sort and prune each target phrase collection
255
+ SortAndPrune(ruleTable);
256
+
257
+ return true;
258
+ }
259
+
260
+ }
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp ADDED
@@ -0,0 +1,417 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // vim:tabstop=2
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include <cstdio>
23
+ #include <cstdlib>
24
+ #include <cstring>
25
+ #include <climits>
26
+ #include <sys/types.h>
27
+ #include <unistd.h>
28
+ #include <dirent.h>
29
+
30
+ #include <fstream>
31
+ #include <string>
32
+ #include <iterator>
33
+ #include <algorithm>
34
+ #include "Loader.h"
35
+ #include "LoaderFactory.h"
36
+ #include "PhraseDictionaryFuzzyMatch.h"
37
+ #include "moses/FactorCollection.h"
38
+ #include "moses/Word.h"
39
+ #include "moses/Util.h"
40
+ #include "moses/InputFileStream.h"
41
+ #include "moses/StaticData.h"
42
+ #include "moses/Range.h"
43
+ #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h"
44
+ #include "moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h"
45
+ #include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h"
46
+ #include "moses/TranslationTask.h"
47
+ #include "util/file.hh"
48
+ #include "util/exception.hh"
49
+ #include "util/random.hh"
50
+
51
+ using namespace std;
52
+
53
+ #if defined __MINGW32__ && !defined mkdtemp
54
+ #include <windows.h>
55
+ #include <cerrno>
56
+ char *mkdtemp(char *tempbuf)
57
+ {
58
+ int rand_value = 0;
59
+ char* tempbase = NULL;
60
+ char tempbasebuf[MAX_PATH] = "";
61
+
62
+ if (strcmp(&tempbuf[strlen(tempbuf)-6], "XXXXXX")) {
63
+ errno = EINVAL;
64
+ return NULL;
65
+ }
66
+
67
+ util::rand_init();
68
+ rand_value = util::rand_excl(1e6);
69
+ tempbase = strrchr(tempbuf, '/');
70
+ tempbase = tempbase ? tempbase+1 : tempbuf;
71
+ strcpy(tempbasebuf, tempbase);
72
+ sprintf(&tempbasebuf[strlen(tempbasebuf)-6], "%d", rand_value);
73
+ ::GetTempPath(MAX_PATH, tempbuf);
74
+ strcat(tempbuf, tempbasebuf);
75
+ ::CreateDirectory(tempbuf, NULL);
76
+ return tempbuf;
77
+ }
78
+ #endif
79
+
80
+ namespace Moses
81
+ {
82
+
83
+ PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line)
84
+ :PhraseDictionary(line, true)
85
+ ,m_config(3)
86
+ ,m_FuzzyMatchWrapper(NULL)
87
+ {
88
+ ReadParameters();
89
+ }
90
+
91
+ PhraseDictionaryFuzzyMatch::~PhraseDictionaryFuzzyMatch()
92
+ {
93
+ delete m_FuzzyMatchWrapper;
94
+ }
95
+
96
+ void PhraseDictionaryFuzzyMatch::Load(AllOptions::ptr const& opts)
97
+ {
98
+ m_options = opts;
99
+ SetFeaturesToApply();
100
+
101
+ m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]);
102
+ }
103
+
104
+ ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
105
+ const ChartParser &parser,
106
+ const ChartCellCollectionBase &cellCollection,
107
+ std::size_t /*maxChartSpan*/)
108
+ {
109
+ return new ChartRuleLookupManagerMemoryPerSentence(parser, cellCollection, *this);
110
+ }
111
+
112
+ void
113
+ PhraseDictionaryFuzzyMatch::
114
+ SetParameter(const std::string& key, const std::string& value)
115
+ {
116
+ if (key == "source") {
117
+ m_config[0] = value;
118
+ } else if (key == "target") {
119
+ m_config[1] = value;
120
+ } else if (key == "alignment") {
121
+ m_config[2] = value;
122
+ } else {
123
+ PhraseDictionary::SetParameter(key, value);
124
+ }
125
+ }
126
+
127
// Deletes the directory `dirname` together with everything below it.
// Returns 0 when `dirname` cannot be opened as a directory (the reason is
// printed with perror), 1 otherwise. Failures to delete individual entries
// are not reported. On MinGW builds nothing is deleted and 1 is returned.
int removedirectoryrecursively(const char *dirname)
{
#if defined __MINGW32__
  //TODO(jie): replace this function with boost implementation
#else
  DIR *dir = opendir(dirname);
  if (dir == NULL) {
    perror("Error opendir()");
    return 0;
  }

  char path[PATH_MAX];
  struct dirent *entry;
  while ((entry = readdir(dir)) != NULL) {
    if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) {
      continue;
    }

    const int written = snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name);
    if (written < 0 || written >= PATH_MAX) {
      // a truncated path may name a different file - never delete that
      continue;
    }

    if (entry->d_type == DT_DIR) {
      // the recursive call also removes the emptied subdirectory itself
      removedirectoryrecursively(path);
    } else if (remove(path) != 0 && entry->d_type == DT_UNKNOWN) {
      // the filesystem did not report the entry type and a plain remove()
      // failed: the entry may be a non-empty directory, so descend into it
      removedirectoryrecursively(path);
    }
  }
  closedir(dir);

  // the directory is empty now (as far as deletion succeeded); remove it
  rmdir(dirname);
#endif
  return 1;
}
176
+
177
+ void PhraseDictionaryFuzzyMatch::InitializeForInput(ttasksptr const& ttask)
178
+ {
179
+ InputType const& inputSentence = *ttask->GetSource();
180
+ #if defined __MINGW32__
181
+ char dirName[] = "moses.XXXXXX";
182
+ #else
183
+ char dirName[] = "/tmp/moses.XXXXXX";
184
+ #endif // defined
185
+ char *temp = mkdtemp(dirName);
186
+ UTIL_THROW_IF2(temp == NULL,
187
+ "Couldn't create temporary directory " << dirName);
188
+
189
+ string dirNameStr(dirName);
190
+
191
+ string inFileName(dirNameStr + "/in");
192
+
193
+ ofstream inFile(inFileName.c_str());
194
+
195
+ for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) {
196
+ inFile << inputSentence.GetWord(i);
197
+ }
198
+ inFile << endl;
199
+ inFile.close();
200
+
201
+ long translationId = inputSentence.GetTranslationId();
202
+ string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);
203
+
204
+ // populate with rules for this sentence
205
+ PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
206
+ FormatType format = MosesFormat;
207
+
208
+ // data from file
209
+ InputFileStream inStream(ptFileName);
210
+
211
+ // copied from class LoaderStandard
212
+ PrintUserTime("Start loading fuzzy-match phrase model");
213
+
214
+ const StaticData &staticData = StaticData::Instance();
215
+
216
+
217
+ string lineOrig;
218
+ size_t count = 0;
219
+
220
+ while(getline(inStream, lineOrig)) {
221
+ const string *line;
222
+ if (format == HieroFormat) { // reformat line
223
+ UTIL_THROW(util::Exception, "Cannot be Hiero format");
224
+ //line = ReformatHieroRule(lineOrig);
225
+ } else {
226
+ // do nothing to format of line
227
+ line = &lineOrig;
228
+ }
229
+
230
+ vector<string> tokens;
231
+ vector<float> scoreVector;
232
+
233
+ TokenizeMultiCharSeparator(tokens, *line , "|||" );
234
+
235
+ if (tokens.size() != 4 && tokens.size() != 5) {
236
+ UTIL_THROW2("Syntax error at " << ptFileName << ":" << count);
237
+ }
238
+
239
+ const string &sourcePhraseString = tokens[0]
240
+ , &targetPhraseString = tokens[1]
241
+ , &scoreString = tokens[2]
242
+ , &alignString = tokens[3];
243
+
244
+ bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
245
+ if (isLHSEmpty && !ttask->options()->unk.word_deletion_enabled) {
246
+ TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
247
+ continue;
248
+ }
249
+
250
+ Tokenize<float>(scoreVector, scoreString);
251
+ const size_t numScoreComponents = GetNumScoreComponents();
252
+ if (scoreVector.size() != numScoreComponents) {
253
+ UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
254
+ << numScoreComponents << ") of score components on line " << count);
255
+ }
256
+
257
+ UTIL_THROW_IF2(scoreVector.size() != numScoreComponents,
258
+ "Number of scores incorrectly specified");
259
+
260
+ // parse source & find pt node
261
+
262
+ // constituent labels
263
+ Word *sourceLHS;
264
+ Word *targetLHS;
265
+
266
+ // source
267
+ Phrase sourcePhrase( 0);
268
+ sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS);
269
+
270
+ // create target phrase obj
271
+ TargetPhrase *targetPhrase = new TargetPhrase(this);
272
+ targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS);
273
+
274
+ // rest of target phrase
275
+ targetPhrase->SetAlignmentInfo(alignString);
276
+ targetPhrase->SetTargetLHS(targetLHS);
277
+ //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
278
+
279
+ // component score, for n-best output
280
+ std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
281
+ std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
282
+
283
+ targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
284
+ targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
285
+
286
+ TargetPhraseCollection::shared_ptr phraseColl
287
+ = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase,
288
+ *targetPhrase, sourceLHS);
289
+ phraseColl->Add(targetPhrase);
290
+
291
+ count++;
292
+
293
+ if (format == HieroFormat) { // reformat line
294
+ delete line;
295
+ } else {
296
+ // do nothing
297
+ }
298
+
299
+ }
300
+
301
+ // sort and prune each target phrase collection
302
+ SortAndPrune(rootNode);
303
+
304
+ //removedirectoryrecursively(dirName);
305
+ }
306
+
307
+ TargetPhraseCollection::shared_ptr
308
+ PhraseDictionaryFuzzyMatch::
309
+ GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
310
+ , const Phrase &source
311
+ , const TargetPhrase &target
312
+ , const Word *sourceLHS)
313
+ {
314
+ PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS);
315
+ return currNode.GetTargetPhraseCollection();
316
+ }
317
+
318
+ PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
319
+ , const Phrase &source
320
+ , const TargetPhrase &target
321
+ , const Word *sourceLHS)
322
+ {
323
+ cerr << source << endl << target << endl;
324
+ const size_t size = source.GetSize();
325
+
326
+ const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
327
+ AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
328
+
329
+ PhraseDictionaryNodeMemory *currNode = &rootNode;
330
+ for (size_t pos = 0 ; pos < size ; ++pos) {
331
+ const Word& word = source.GetWord(pos);
332
+
333
+ if (word.IsNonTerminal()) {
334
+ // indexed by source label 1st
335
+ const Word &sourceNonTerm = word;
336
+
337
+ UTIL_THROW_IF2(iterAlign == alignmentInfo.end(),
338
+ "No alignment for non-term at position " << pos);
339
+ UTIL_THROW_IF2(iterAlign->first != pos,
340
+ "Alignment info incorrect at position " << pos);
341
+
342
+ size_t targetNonTermInd = iterAlign->second;
343
+ ++iterAlign;
344
+ const Word &targetNonTerm = target.GetWord(targetNonTermInd);
345
+
346
+ #if defined(UNLABELLED_SOURCE)
347
+ currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm);
348
+ #else
349
+ currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm);
350
+ #endif
351
+ } else {
352
+ currNode = currNode->GetOrCreateChild(word);
353
+ }
354
+
355
+ UTIL_THROW_IF2(currNode == NULL,
356
+ "Node not found at position " << pos);
357
+
358
+ }
359
+
360
+ // finally, the source LHS
361
+ //currNode = currNode->GetOrCreateChild(sourceLHS);
362
+
363
+ return *currNode;
364
+ }
365
+
366
+ void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode)
367
+ {
368
+ if (GetTableLimit()) {
369
+ rootNode.Sort(GetTableLimit());
370
+ }
371
+ }
372
+
373
+ void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source)
374
+ {
375
+ m_collection.erase(source.GetTranslationId());
376
+ }
377
+
378
+ const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(long translationId) const
379
+ {
380
+ std::map<long, PhraseDictionaryNodeMemory>::const_iterator iter = m_collection.find(translationId);
381
+ UTIL_THROW_IF2(iter == m_collection.end(),
382
+ "Couldn't find root node for input: " << translationId);
383
+ return iter->second;
384
+ }
385
+ PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)
386
+ {
387
+ long transId = source.GetTranslationId();
388
+ std::map<long, PhraseDictionaryNodeMemory>::iterator iter = m_collection.find(transId);
389
+ UTIL_THROW_IF2(iter == m_collection.end(),
390
+ "Couldn't find root node for input: " << transId);
391
+ return iter->second;
392
+ }
393
+
394
+ TO_STRING_BODY(PhraseDictionaryFuzzyMatch);
395
+
396
+ // friend
397
+ ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)
398
+ {
399
+ /*
400
+ typedef PhraseDictionaryNodeMemory::TerminalMap TermMap;
401
+ typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap;
402
+
403
+ const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection;
404
+ for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) {
405
+ const Word &sourceNonTerm = p->first.first;
406
+ out << sourceNonTerm;
407
+ }
408
+ for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) {
409
+ const Word &sourceTerm = p->first;
410
+ out << sourceTerm;
411
+ }
412
+ */
413
+
414
+ return out;
415
+ }
416
+
417
+ }
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // vim:tabstop=2
2
+ /***********************************************************************
3
+ Moses - factored phrase-based language decoder
4
+ Copyright (C) 2010 Hieu Hoang
5
+
6
+ This library is free software; you can redistribute it and/or
7
+ modify it under the terms of the GNU Lesser General Public
8
+ License as published by the Free Software Foundation; either
9
+ version 2.1 of the License, or (at your option) any later version.
10
+
11
+ This library is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ Lesser General Public License for more details.
15
+
16
+ You should have received a copy of the GNU Lesser General Public
17
+ License along with this library; if not, write to the Free Software
18
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
+ ***********************************************************************/
20
+
21
+ #include "PhraseDictionaryOnDisk.h"
22
+ #include "moses/InputFileStream.h"
23
+ #include "moses/StaticData.h"
24
+ #include "moses/TargetPhraseCollection.h"
25
+ #include "moses/InputPath.h"
26
+ #include "moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h"
27
+ #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h"
28
+ #include "moses/TranslationTask.h"
29
+
30
+ #include "OnDiskPt/OnDiskWrapper.h"
31
+ #include "OnDiskPt/Word.h"
32
+
33
+ #include "util/tokenize_piece.hh"
34
+
35
+ using namespace std;
36
+
37
+
38
+ namespace Moses
39
+ {
40
+ PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line)
41
+ : MyBase(line, true)
42
+ , m_maxSpanDefault(NOT_FOUND)
43
+ , m_maxSpanLabelled(NOT_FOUND)
44
+ {
45
+ ReadParameters();
46
+ }
47
+
48
+ PhraseDictionaryOnDisk::~PhraseDictionaryOnDisk()
49
+ {
50
+ }
51
+
52
+ void PhraseDictionaryOnDisk::Load(AllOptions::ptr const& opts)
53
+ {
54
+ m_options = opts;
55
+ SetFeaturesToApply();
56
+ }
57
+
58
+ ChartRuleLookupManager *PhraseDictionaryOnDisk::CreateRuleLookupManager(
59
+ const ChartParser &parser,
60
+ const ChartCellCollectionBase &cellCollection,
61
+ std::size_t /*maxChartSpan*/)
62
+ {
63
+ return new ChartRuleLookupManagerOnDisk(parser, cellCollection, *this,
64
+ GetImplementation(),
65
+ m_input,
66
+ m_output);
67
+ }
68
+
69
+ OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation()
70
+ {
71
+ OnDiskPt::OnDiskWrapper* dict;
72
+ dict = m_implementation.get();
73
+ UTIL_THROW_IF2(dict == NULL, "Dictionary object not yet created for this thread");
74
+ return *dict;
75
+ }
76
+
77
+ const OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation() const
78
+ {
79
+ OnDiskPt::OnDiskWrapper* dict;
80
+ dict = m_implementation.get();
81
+ UTIL_THROW_IF2(dict == NULL, "Dictionary object not yet created for this thread");
82
+ return *dict;
83
+ }
84
+
85
+ void PhraseDictionaryOnDisk::InitializeForInput(ttasksptr const& ttask)
86
+ {
87
+ InputType const& source = *ttask->GetSource();
88
+ ReduceCache();
89
+
90
+ OnDiskPt::OnDiskWrapper *obj = new OnDiskPt::OnDiskWrapper();
91
+ obj->BeginLoad(m_filePath);
92
+
93
+ UTIL_THROW_IF2(obj->GetMisc("Version") != OnDiskPt::OnDiskWrapper::VERSION_NUM,
94
+ "On-disk phrase table is version " << obj->GetMisc("Version")
95
+ << ". It is not compatible with version " << OnDiskPt::OnDiskWrapper::VERSION_NUM);
96
+
97
+ UTIL_THROW_IF2(obj->GetMisc("NumSourceFactors") != m_input.size(),
98
+ "On-disk phrase table has " << obj->GetMisc("NumSourceFactors") << " source factors."
99
+ << ". The ini file specified " << m_input.size() << " source factors");
100
+
101
+ UTIL_THROW_IF2(obj->GetMisc("NumTargetFactors") != m_output.size(),
102
+ "On-disk phrase table has " << obj->GetMisc("NumTargetFactors") << " target factors."
103
+ << ". The ini file specified " << m_output.size() << " target factors");
104
+
105
+ UTIL_THROW_IF2(obj->GetMisc("NumScores") != m_numScoreComponents,
106
+ "On-disk phrase table has " << obj->GetMisc("NumScores") << " scores."
107
+ << ". The ini file specified " << m_numScoreComponents << " scores");
108
+
109
+ m_implementation.reset(obj);
110
+ }
111
+
112
+ void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
113
+ {
114
+ InputPathList::const_iterator iter;
115
+ for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
116
+ InputPath &inputPath = **iter;
117
+ GetTargetPhraseCollectionBatch(inputPath);
118
+ }
119
+
120
+ // delete nodes that's been saved
121
+ for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) {
122
+ InputPath &inputPath = **iter;
123
+ const OnDiskPt::PhraseNode *ptNode = static_cast<const OnDiskPt::PhraseNode*>(inputPath.GetPtNode(*this));
124
+ delete ptNode;
125
+ }
126
+
127
+ }
128
+
129
+ void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(InputPath &inputPath) const
130
+ {
131
+ OnDiskPt::OnDiskWrapper &wrapper = const_cast<OnDiskPt::OnDiskWrapper&>(GetImplementation());
132
+ const Phrase &phrase = inputPath.GetPhrase();
133
+ const InputPath *prevInputPath = inputPath.GetPrevPath();
134
+
135
+ const OnDiskPt::PhraseNode *prevPtNode = NULL;
136
+
137
+ if (prevInputPath) {
138
+ prevPtNode = static_cast<const OnDiskPt::PhraseNode*>(prevInputPath->GetPtNode(*this));
139
+ } else {
140
+ // Starting subphrase.
141
+ assert(phrase.GetSize() == 1);
142
+ prevPtNode = &wrapper.GetRootSourceNode();
143
+ }
144
+
145
+ // backoff
146
+ if (!SatisfyBackoff(inputPath)) {
147
+ return;
148
+ }
149
+
150
+ if (prevPtNode) {
151
+ Word lastWord = phrase.GetWord(phrase.GetSize() - 1);
152
+ lastWord.OnlyTheseFactors(m_inputFactors);
153
+ OnDiskPt::Word *lastWordOnDisk = ConvertFromMoses(wrapper, m_input, lastWord);
154
+
155
+ TargetPhraseCollection::shared_ptr tpc;
156
+ if (lastWordOnDisk == NULL) {
157
+ // OOV according to this phrase table. Not possible to extend
158
+ inputPath.SetTargetPhrases(*this, tpc, NULL);
159
+ } else {
160
+ OnDiskPt::PhraseNode const* ptNode;
161
+ ptNode = prevPtNode->GetChild(*lastWordOnDisk, wrapper);
162
+ if (ptNode) tpc = GetTargetPhraseCollection(ptNode);
163
+ inputPath.SetTargetPhrases(*this, tpc, ptNode);
164
+
165
+ delete lastWordOnDisk;
166
+ }
167
+ }
168
+ }
169
+
170
+ TargetPhraseCollection::shared_ptr
171
+ PhraseDictionaryOnDisk::
172
+ GetTargetPhraseCollection(const OnDiskPt::PhraseNode *ptNode) const
173
+ {
174
+ TargetPhraseCollection::shared_ptr ret;
175
+
176
+ CacheColl &cache = GetCache();
177
+ size_t hash = (size_t) ptNode->GetFilePos();
178
+
179
+ CacheColl::iterator iter;
180
+
181
+ iter = cache.find(hash);
182
+
183
+ if (iter == cache.end()) {
184
+ // not in cache, need to look up from phrase table
185
+ ret = GetTargetPhraseCollectionNonCache(ptNode);
186
+
187
+ std::pair<TargetPhraseCollection::shared_ptr , clock_t> value(ret, clock());
188
+ cache[hash] = value;
189
+ } else {
190
+ // in cache. just use it
191
+ iter->second.second = clock();
192
+ ret = iter->second.first;
193
+ }
194
+
195
+ return ret;
196
+ }
197
+
198
+ TargetPhraseCollection::shared_ptr
199
+ PhraseDictionaryOnDisk::
200
+ GetTargetPhraseCollectionNonCache(const OnDiskPt::PhraseNode *ptNode) const
201
+ {
202
+ OnDiskPt::OnDiskWrapper& wrapper
203
+ = const_cast<OnDiskPt::OnDiskWrapper&>(GetImplementation());
204
+
205
+ vector<float> weightT = StaticData::Instance().GetWeights(this);
206
+ OnDiskPt::Vocab &vocab = wrapper.GetVocab();
207
+
208
+ OnDiskPt::TargetPhraseCollection::shared_ptr targetPhrasesOnDisk
209
+ = ptNode->GetTargetPhraseCollection(m_tableLimit, wrapper);
210
+ TargetPhraseCollection::shared_ptr targetPhrases
211
+ = ConvertToMoses(targetPhrasesOnDisk, m_input, m_output, *this,
212
+ weightT, vocab, false);
213
+
214
+ // delete targetPhrasesOnDisk;
215
+
216
+ return targetPhrases;
217
+ }
218
+
219
+ Moses::TargetPhraseCollection::shared_ptr
220
+ PhraseDictionaryOnDisk::ConvertToMoses(
221
+ const OnDiskPt::TargetPhraseCollection::shared_ptr targetPhrasesOnDisk
222
+ , const std::vector<Moses::FactorType> &inputFactors
223
+ , const std::vector<Moses::FactorType> &outputFactors
224
+ , const Moses::PhraseDictionary &phraseDict
225
+ , const std::vector<float> &weightT
226
+ , OnDiskPt::Vocab &vocab
227
+ , bool isSyntax) const
228
+ {
229
+ Moses::TargetPhraseCollection::shared_ptr ret;
230
+ ret.reset(new Moses::TargetPhraseCollection);
231
+
232
+ for (size_t i = 0; i < targetPhrasesOnDisk->GetSize(); ++i) {
233
+ const OnDiskPt::TargetPhrase &tp = targetPhrasesOnDisk->GetTargetPhrase(i);
234
+ Moses::TargetPhrase *mosesPhrase
235
+ = ConvertToMoses(tp, inputFactors, outputFactors, vocab,
236
+ phraseDict, weightT, isSyntax);
237
+
238
+ /*
239
+ // debugging output
240
+ stringstream strme;
241
+ strme << filePath << " " << *mosesPhrase;
242
+ mosesPhrase->SetDebugOutput(strme.str());
243
+ */
244
+
245
+ ret->Add(mosesPhrase);
246
+ }
247
+
248
+ ret->Sort(true, phraseDict.GetTableLimit());
249
+
250
+ return ret;
251
+ }
252
+
253
+ Moses::TargetPhrase *PhraseDictionaryOnDisk::ConvertToMoses(const OnDiskPt::TargetPhrase &targetPhraseOnDisk
254
+ , const std::vector<Moses::FactorType> &inputFactors
255
+ , const std::vector<Moses::FactorType> &outputFactors
256
+ , const OnDiskPt::Vocab &vocab
257
+ , const Moses::PhraseDictionary &phraseDict
258
+ , const std::vector<float> &weightT
259
+ , bool isSyntax) const
260
+ {
261
+ Moses::TargetPhrase *ret = new Moses::TargetPhrase(&phraseDict);
262
+
263
+ // words
264
+ size_t phraseSize = targetPhraseOnDisk.GetSize();
265
+ UTIL_THROW_IF2(phraseSize == 0, "Target phrase cannot be empty"); // last word is lhs
266
+ if (isSyntax) {
267
+ --phraseSize;
268
+ }
269
+
270
+ for (size_t pos = 0; pos < phraseSize; ++pos) {
271
+ const OnDiskPt::Word &wordOnDisk = targetPhraseOnDisk.GetWord(pos);
272
+ ConvertToMoses(wordOnDisk, outputFactors, vocab, ret->AddWord());
273
+ }
274
+
275
+ // alignments
276
+ // int index = 0;
277
+ Moses::AlignmentInfo::CollType alignTerm, alignNonTerm;
278
+ std::set<std::pair<size_t, size_t> > alignmentInfo;
279
+ const OnDiskPt::PhrasePtr sp = targetPhraseOnDisk.GetSourcePhrase();
280
+ for (size_t ind = 0; ind < targetPhraseOnDisk.GetAlign().size(); ++ind) {
281
+ const std::pair<size_t, size_t> &entry = targetPhraseOnDisk.GetAlign()[ind];
282
+ alignmentInfo.insert(entry);
283
+ size_t sourcePos = entry.first;
284
+ size_t targetPos = entry.second;
285
+
286
+ if (targetPhraseOnDisk.GetWord(targetPos).IsNonTerminal()) {
287
+ alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
288
+ } else {
289
+ alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
290
+ }
291
+
292
+ }
293
+ ret->SetAlignTerm(alignTerm);
294
+ ret->SetAlignNonTerm(alignNonTerm);
295
+
296
+ if (isSyntax) {
297
+ Moses::Word *lhsTarget = new Moses::Word(true);
298
+ const OnDiskPt::Word &lhsOnDisk = targetPhraseOnDisk.GetWord(targetPhraseOnDisk.GetSize() - 1);
299
+ ConvertToMoses(lhsOnDisk, outputFactors, vocab, *lhsTarget);
300
+ ret->SetTargetLHS(lhsTarget);
301
+ }
302
+
303
+ // set source phrase
304
+ Moses::Phrase mosesSP(Moses::Input);
305
+ for (size_t pos = 0; pos < sp->GetSize(); ++pos) {
306
+ ConvertToMoses(sp->GetWord(pos), inputFactors, vocab, mosesSP.AddWord());
307
+ }
308
+
309
+ // scores
310
+ ret->GetScoreBreakdown().Assign(&phraseDict, targetPhraseOnDisk.GetScores());
311
+
312
+ // sparse features
313
+ ret->GetScoreBreakdown().Assign(&phraseDict, targetPhraseOnDisk.GetSparseFeatures());
314
+
315
+ // property
316
+ ret->SetProperties(targetPhraseOnDisk.GetProperty());
317
+
318
+ ret->EvaluateInIsolation(mosesSP, phraseDict.GetFeaturesToApply());
319
+
320
+ return ret;
321
+ }
322
+
323
+ void PhraseDictionaryOnDisk::ConvertToMoses(
324
+ const OnDiskPt::Word &wordOnDisk,
325
+ const std::vector<Moses::FactorType> &outputFactorsVec,
326
+ const OnDiskPt::Vocab &vocab,
327
+ Moses::Word &overwrite) const
328
+ {
329
+ Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
330
+ overwrite = Moses::Word(wordOnDisk.IsNonTerminal());
331
+
332
+ if (wordOnDisk.IsNonTerminal()) {
333
+ const std::string &tok = vocab.GetString(wordOnDisk.GetVocabId());
334
+ overwrite.SetFactor(0, factorColl.AddFactor(tok, wordOnDisk.IsNonTerminal()));
335
+ } else {
336
+ // TODO: this conversion should have been done at load time.
337
+ util::TokenIter<util::SingleCharacter> tok(vocab.GetString(wordOnDisk.GetVocabId()), '|');
338
+
339
+ for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) {
340
+ UTIL_THROW_IF2(!tok, "Too few factors in \"" << vocab.GetString(wordOnDisk.GetVocabId()) << "\"; was expecting " << outputFactorsVec.size());
341
+ overwrite.SetFactor(*t, factorColl.AddFactor(*tok, wordOnDisk.IsNonTerminal()));
342
+ }
343
+ UTIL_THROW_IF2(tok, "Too many factors in \"" << vocab.GetString(wordOnDisk.GetVocabId()) << "\"; was expecting " << outputFactorsVec.size());
344
+ }
345
+ }
346
+
347
+ OnDiskPt::Word *PhraseDictionaryOnDisk::ConvertFromMoses(OnDiskPt::OnDiskWrapper &wrapper, const std::vector<Moses::FactorType> &factorsVec
348
+ , const Moses::Word &origWord) const
349
+ {
350
+ bool isNonTerminal = origWord.IsNonTerminal();
351
+ OnDiskPt::Word *newWord = new OnDiskPt::Word(isNonTerminal);
352
+
353
+ util::StringStream strme;
354
+
355
+ size_t factorType = factorsVec[0];
356
+ const Moses::Factor *factor = origWord.GetFactor(factorType);
357
+ UTIL_THROW_IF2(factor == NULL, "Expecting factor " << factorType);
358
+ strme << factor->GetString();
359
+
360
+ for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
361
+ size_t factorType = factorsVec[ind];
362
+ const Moses::Factor *factor = origWord.GetFactor(factorType);
363
+ if (factor == NULL) {
364
+ // can have less factors than factorType.size()
365
+ break;
366
+ }
367
+ UTIL_THROW_IF2(factor == NULL,
368
+ "Expecting factor " << factorType << " at position " << ind);
369
+ strme << "|" << factor->GetString();
370
+ } // for (size_t factorType
371
+
372
+ bool found;
373
+ uint64_t vocabId = wrapper.GetVocab().GetVocabId(strme.str(), found);
374
+ if (!found) {
375
+ // factor not in phrase table -> phrse definately not in. exit
376
+ delete newWord;
377
+ return NULL;
378
+ } else {
379
+ newWord->SetVocabId(vocabId);
380
+ return newWord;
381
+ }
382
+
383
+ }
384
+
385
+ void PhraseDictionaryOnDisk::SetParameter(const std::string& key, const std::string& value)
386
+ {
387
+ if (key == "max-span-default") {
388
+ m_maxSpanDefault = Scan<size_t>(value);
389
+ } else if (key == "max-span-labelled") {
390
+ m_maxSpanLabelled = Scan<size_t>(value);
391
+ } else {
392
+ PhraseDictionary::SetParameter(key, value);
393
+ }
394
+ }
395
+
396
+
397
+ } // namespace
398
+
mosesdecoder/moses/TranslationModel/RuleTable/Trie.h ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2012 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include "moses/TranslationModel/PhraseDictionary.h"
23
+ #include "moses/TypeDef.h"
24
+
25
+ #include <string>
26
+ #include <vector>
27
+
28
+ namespace Moses
29
+ {
30
+
31
+ class Phrase;
32
+ class TargetPhrase;
33
+ class TargetPhraseCollection;
34
+ class Word;
35
+
36
+ /*** Implementation of a SCFG rule table in a trie. Looking up a rule of
37
+ * length n symbols requires n look-ups to find the TargetPhraseCollection.
38
+ * @todo why need this and PhraseDictionaryMemory?
39
+ */
40
+ class RuleTableTrie : public PhraseDictionary
41
+ {
42
+ public:
43
+ RuleTableTrie(const std::string &line)
44
+ : PhraseDictionary(line, true) {
45
+ }
46
+
47
+ virtual ~RuleTableTrie();
48
+
49
+ void Load(AllOptions::ptr const& opts);
50
+
51
+ private:
52
+ friend class RuleTableLoader;
53
+
54
+ virtual TargetPhraseCollection::shared_ptr
55
+ GetOrCreateTargetPhraseCollection(const Phrase &source,
56
+ const TargetPhrase &target,
57
+ const Word *sourceLHS) = 0;
58
+
59
+ virtual void SortAndPrune() = 0;
60
+
61
+ };
62
+
63
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/RuleTable/UTrie.cpp ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2012 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "moses/NonTerminal.h"
21
+ #include "moses/TranslationModel/Scope3Parser/Parser.h"
22
+ #include "moses/StaticData.h"
23
+ #include "moses/TargetPhrase.h"
24
+ #include "moses/TargetPhraseCollection.h"
25
+ #include "moses/Util.h"
26
+ #include "moses/Word.h"
27
+ #include "UTrie.h"
28
+ #include "Trie.h"
29
+ #include "UTrieNode.h"
30
+
31
+ #include <boost/functional/hash.hpp>
32
+ #include <boost/unordered_map.hpp>
33
+ #include <boost/version.hpp>
34
+
35
+ #include <map>
36
+ #include <vector>
37
+
38
+ namespace Moses
39
+ {
40
+
41
+ TargetPhraseCollection::shared_ptr
42
+ RuleTableUTrie::
43
+ GetOrCreateTargetPhraseCollection(const Phrase &source,
44
+ const TargetPhrase &target,
45
+ const Word *sourceLHS)
46
+ {
47
+ UTrieNode &currNode = GetOrCreateNode(source, target, sourceLHS);
48
+ return currNode.GetOrCreateTargetPhraseCollection(target);
49
+ }
50
+
51
+ UTrieNode &RuleTableUTrie::GetOrCreateNode(const Phrase &source,
52
+ const TargetPhrase &target,
53
+ const Word */*sourceLHS*/)
54
+ {
55
+ const size_t size = source.GetSize();
56
+
57
+ const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
58
+ AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
59
+
60
+ UTrieNode *currNode = &m_root;
61
+ for (size_t pos = 0 ; pos < size ; ++pos) {
62
+ const Word &word = source.GetWord(pos);
63
+
64
+ if (word.IsNonTerminal()) {
65
+ assert(iterAlign != alignmentInfo.end());
66
+ assert(iterAlign->first == pos);
67
+ size_t targetNonTermInd = iterAlign->second;
68
+ ++iterAlign;
69
+ const Word &targetNonTerm = target.GetWord(targetNonTermInd);
70
+ currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm);
71
+ } else {
72
+ currNode = currNode->GetOrCreateTerminalChild(word);
73
+ }
74
+
75
+ assert(currNode != NULL);
76
+ }
77
+
78
+ return *currNode;
79
+ }
80
+
81
+ ChartRuleLookupManager *RuleTableUTrie::CreateRuleLookupManager(
82
+ const ChartParser &parser,
83
+ const ChartCellCollectionBase &cellCollection,
84
+ std::size_t maxChartSpan)
85
+ {
86
+ return new Scope3Parser(parser, cellCollection, *this, maxChartSpan);
87
+ }
88
+
89
+ void RuleTableUTrie::SortAndPrune()
90
+ {
91
+ if (GetTableLimit()) {
92
+ m_root.Sort(GetTableLimit());
93
+ }
94
+ }
95
+
96
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/RuleTable/UTrie.h ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2012 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #pragma once
21
+
22
+ #include "Trie.h"
23
+ #include "UTrieNode.h"
24
+ #include "moses/TargetPhraseCollection.h"
25
+
26
+ namespace Moses
27
+ {
28
+
29
+ class Phrase;
30
+ class TargetPhrase;
31
+ class Word;
32
+ class ChartParser;
33
+
34
+ /** Implementation of RuleTableTrie. A RuleTableUTrie is designed to store
35
+ * string-to-tree SCFG grammars only (i.e. rules can have distinct labels on
36
+ * the target side, but only a generic non-terminal on the source side).
37
+ * A key is the source RHS (one symbol per edge) of a rule and a mapped value
38
+ * is the collection of grammar rules that share the same source RHS.
39
+ *
40
+ * (The 'U' in UTrie stands for 'unlabelled' -- the keys are unlabelled and
41
+ * the target labels are stored on the node values, as opposed to the grammar
42
+ * being a monolingual projection with target labels projected onto the source
43
+ * side.)
44
+ */
45
+ class RuleTableUTrie : public RuleTableTrie
46
+ {
47
+ public:
48
+ RuleTableUTrie(const std::string &line)
49
+ : RuleTableTrie(line) {
50
+ }
51
+
52
+ const UTrieNode &GetRootNode() const {
53
+ return m_root;
54
+ }
55
+
56
+ ChartRuleLookupManager *CreateRuleLookupManager(const ChartParser &,
57
+ const ChartCellCollectionBase &, std::size_t);
58
+
59
+ private:
60
+ TargetPhraseCollection::shared_ptr
61
+ GetOrCreateTargetPhraseCollection(const Phrase &source,
62
+ const TargetPhrase &target,
63
+ const Word *sourceLHS);
64
+
65
+ UTrieNode &GetOrCreateNode(const Phrase &source, const TargetPhrase &target,
66
+ const Word *sourceLHS);
67
+
68
+ void SortAndPrune();
69
+
70
+ UTrieNode m_root;
71
+ };
72
+
73
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/fuzzy-match/Alignments.h ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+ #include <vector>
5
+ #include <map>
6
+
7
/** Word alignment between a source and a target sentence, kept in both
 *  directions.
 *
 *  NOTE(review): the constructor is defined elsewhere; the format of the
 *  `align` string and the exact meaning of the map keys/values cannot be
 *  confirmed from this header. Presumably each vector is indexed by word
 *  position on its own side -- verify against the implementation.
 */
class Alignments
{
public:
  /// Source-to-target and target-to-source alignment tables.
  std::vector< std::map<int, int> > m_alignS2T, m_alignT2S;

  /** @param align       textual alignment description
   *  @param sourceSize  number of source words
   *  @param targetSize  number of target words
   */
  Alignments(const std::string &align, size_t sourceSize, size_t targetSize);
};
18
+
19
+
20
+
mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp ADDED
@@ -0,0 +1,1029 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // FuzzyMatchWrapper.cpp
3
+ // moses
4
+ //
5
+ // Created by Hieu Hoang on 26/07/2012.
6
+ // Copyright 2012 __MyCompanyName__. All rights reserved.
7
+ //
8
+
9
+ #include <iostream>
10
+ #include "FuzzyMatchWrapper.h"
11
+ #include "SentenceAlignment.h"
12
+ #include "Match.h"
13
+ #include "create_xml.h"
14
+ #include "moses/Util.h"
15
+ #include "moses/StaticData.h"
16
+ #include "util/file.hh"
17
+
18
+ using namespace std;
19
+
20
+ namespace tmmt
21
+ {
22
+
23
+ FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath)
24
+ :basic_flag(false)
25
+ ,lsed_flag(true)
26
+ ,refined_flag(true)
27
+ ,length_filter_flag(true)
28
+ ,parse_flag(true)
29
+ ,min_match(70)
30
+ ,multiple_flag(true)
31
+ ,multiple_slack(0)
32
+ ,multiple_max(100)
33
+ {
34
+ cerr << "creating suffix array" << endl;
35
+ suffixArray = new tmmt::SuffixArray( sourcePath );
36
+
37
+ //cerr << "loading source data" << endl;
38
+ //load_corpus(sourcePath, source);
39
+
40
+ cerr << "loading target data" << endl;
41
+ load_target(targetPath, targetAndAlignment);
42
+
43
+ cerr << "loading alignment" << endl;
44
+ load_alignment(alignmentPath, targetAndAlignment);
45
+
46
+ // create suffix array
47
+ //load_corpus(m_config[0], input);
48
+
49
+ cerr << "loading completed" << endl;
50
+ }
51
+
52
+ string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
53
+ {
54
+ const Moses::StaticData &staticData = Moses::StaticData::Instance();
55
+
56
+ WordIndex wordIndex;
57
+
58
+ string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);
59
+
60
+ // create extrac files
61
+ create_xml(fuzzyMatchFile);
62
+
63
+ // create phrase table with usual Moses scoring and consolidate programs
64
+ string cmd;
65
+ cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > "
66
+ + fuzzyMatchFile + ".extract.sorted.gz";
67
+ system(cmd.c_str());
68
+ cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > "
69
+ + fuzzyMatchFile + ".extract.inv.sorted.gz";
70
+ system(cmd.c_str());
71
+
72
+ #ifdef IS_XCODE
73
+ cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin";
74
+ #elif IS_ECLIPSE
75
+ cmd = "/home/hieu/workspace/github/moses-smt/bin";
76
+ #else
77
+ cmd = staticData.GetBinDirectory();
78
+ #endif
79
+
80
+ cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ")
81
+ + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" "
82
+ + " -phrase-translation-table " + fuzzyMatchFile + ".pt";
83
+ system(cmd.c_str());
84
+
85
+
86
+ return fuzzyMatchFile + ".pt.gz";
87
+ }
88
+
89
+ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
90
+ {
91
+ const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();
92
+
93
+ string inputPath = dirNameStr + "/in";
94
+ string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile";
95
+ ofstream fuzzyMatchStream(fuzzyMatchFile.c_str());
96
+
97
+ vector< vector< WORD_ID > > input;
98
+ load_corpus(inputPath, input);
99
+
100
+ assert(input.size() == 1);
101
+ size_t sentenceInd = 0;
102
+
103
+ clock_t start_clock = clock();
104
+ // if (i % 10 == 0) cerr << ".";
105
+
106
+ // establish some basic statistics
107
+
108
+ // int input_length = compute_length( input[i] );
109
+ int input_length = input[sentenceInd].size();
110
+ int best_cost = input_length * (100-min_match) / 100 + 1;
111
+
112
+ int match_count = 0; // how many substring matches to be considered
113
+ //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl;
114
+
115
+ // find match ranges in suffix array
116
+ vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
117
+ for(int start=0; start<input[sentenceInd].size(); start++) {
118
+ SuffixArray::INDEX prior_first_match = 0;
119
+ SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
120
+ vector< string > substring;
121
+ bool stillMatched = true;
122
+ vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
123
+ //cerr << "start: " << start;
124
+ for(size_t word=start; stillMatched && word<input[sentenceInd].size(); word++) {
125
+ substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );
126
+
127
+ // only look up, if needed (i.e. no unnecessary short gram lookups)
128
+ // if (! word-start+1 <= short_match_max_length( input_length ) )
129
+ // {
130
+ SuffixArray::INDEX first_match, last_match;
131
+ stillMatched = false;
132
+ if (suffixArray->FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) {
133
+ stillMatched = true;
134
+ matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
135
+ //cerr << " (" << first_match << "," << last_match << ")";
136
+ //cerr << " " << ( last_match - first_match + 1 );
137
+ prior_first_match = first_match;
138
+ prior_last_match = last_match;
139
+ }
140
+ //}
141
+ }
142
+ //cerr << endl;
143
+ match_range.push_back( matchedAtThisStart );
144
+ }
145
+
146
+ clock_t clock_range = clock();
147
+
148
+ map< int, vector< Match > > sentence_match;
149
+ map< int, int > sentence_match_word_count;
150
+
151
+ // go through all matches, longest first
152
+ for(int length = input[sentenceInd].size(); length >= 1; length--) {
153
+ // do not create matches, if these are handled by the short match function
154
+ if (length <= short_match_max_length( input_length ) ) {
155
+ continue;
156
+ }
157
+
158
+ unsigned int count = 0;
159
+ for(int start = 0; start <= input[sentenceInd].size() - length; start++) {
160
+ if (match_range[start].size() >= length) {
161
+ pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];
162
+ // cerr << " (" << range.first << "," << range.second << ")";
163
+ count += range.second - range.first + 1;
164
+
165
+ for(SuffixArray::INDEX i=range.first; i<=range.second; i++) {
166
+ size_t position = suffixArray->GetPosition( i );
167
+
168
+ // sentence length mismatch
169
+ size_t sentence_id = suffixArray->GetSentence( position );
170
+ int sentence_length = suffixArray->GetSentenceLength( sentence_id );
171
+ int diff = abs( (int)sentence_length - (int)input_length );
172
+ // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length;
173
+ //if (length <= 2 && input_length>=5 &&
174
+ // sentence_match.find( sentence_id ) == sentence_match.end())
175
+ // continue;
176
+
177
+ if (diff > best_cost)
178
+ continue;
179
+
180
+ // compute minimal cost
181
+ int start_pos = suffixArray->GetWordInSentence( position );
182
+ int end_pos = start_pos + length-1;
183
+ // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. "
184
+ // << start << "-" << (start+length-1) << " (" << input_length << ")";
185
+ // different number of prior words -> cost is at least diff
186
+ int min_cost = abs( start - start_pos );
187
+
188
+ // same number of words, but not sent. start -> cost is at least 1
189
+ if (start == start_pos && start>0)
190
+ min_cost++;
191
+
192
+ // different number of remaining words -> cost is at least diff
193
+ min_cost += abs( ( sentence_length-1 - end_pos ) -
194
+ ( input_length-1 - (start+length-1) ) );
195
+
196
+ // same number of words, but not sent. end -> cost is at least 1
197
+ if ( sentence_length-1 - end_pos ==
198
+ input_length-1 - (start+length-1)
199
+ && end_pos != sentence_length-1 )
200
+ min_cost++;
201
+
202
+ // cerr << " -> min_cost " << min_cost;
203
+ if (min_cost > best_cost)
204
+ continue;
205
+
206
+ // valid match
207
+ match_count++;
208
+
209
+ // compute maximal cost
210
+ int max_cost = max( start, start_pos )
211
+ + max( sentence_length-1 - end_pos,
212
+ input_length-1 - (start+length-1) );
213
+ // cerr << ", max_cost " << max_cost;
214
+
215
+ Match m = Match( start, start+length-1,
216
+ start_pos, start_pos+length-1,
217
+ min_cost, max_cost, 0);
218
+ sentence_match[ sentence_id ].push_back( m );
219
+ sentence_match_word_count[ sentence_id ] += length;
220
+
221
+ if (max_cost < best_cost) {
222
+ best_cost = max_cost;
223
+ if (best_cost == 0) break;
224
+ }
225
+ //if (match_count >= MAX_MATCH_COUNT) break;
226
+ }
227
+ }
228
+ // cerr << endl;
229
+ if (best_cost == 0) break;
230
+ //if (match_count >= MAX_MATCH_COUNT) break;
231
+ }
232
+ // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl;
233
+
234
+ if (best_cost == 0) break;
235
+ //if (match_count >= MAX_MATCH_COUNT) break;
236
+ }
237
+ cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;
238
+
239
+ clock_t clock_matches = clock();
240
+
241
+ // consider each sentence for which we have matches
242
+ int old_best_cost = best_cost;
243
+ int tm_count_word_match = 0;
244
+ int tm_count_word_match2 = 0;
245
+ int pruned_match_count = 0;
246
+ if (short_match_max_length( input_length )) {
247
+ init_short_matches(wordIndex, translationId, input[sentenceInd] );
248
+ }
249
+ vector< int > best_tm;
250
+ typedef map< int, vector< Match > >::iterator I;
251
+
252
+ clock_t clock_validation_sum = 0;
253
+
254
+ for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) {
255
+ int tmID = tm->first;
256
+ int tm_length = suffixArray->GetSentenceLength(tmID);
257
+ vector< Match > &match = tm->second;
258
+ add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );
259
+
260
+ //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl;
261
+
262
+ // quick look: how many words are matched
263
+ int words_matched = 0;
264
+ for(size_t m=0; m<match.size(); m++) {
265
+
266
+ if (match[m].min_cost <= best_cost) // makes no difference
267
+ words_matched += match[m].input_end - match[m].input_start + 1;
268
+ }
269
+ if (max(input_length,tm_length) - words_matched > best_cost) {
270
+ if (length_filter_flag) continue;
271
+ }
272
+ tm_count_word_match++;
273
+
274
+ // prune, check again how many words are matched
275
+ vector< Match > pruned = prune_matches( match, best_cost );
276
+ words_matched = 0;
277
+ for(size_t p=0; p<pruned.size(); p++) {
278
+ words_matched += pruned[p].input_end - pruned[p].input_start + 1;
279
+ }
280
+ if (max(input_length,tm_length) - words_matched > best_cost) {
281
+ if (length_filter_flag) continue;
282
+ }
283
+ tm_count_word_match2++;
284
+
285
+ pruned_match_count += pruned.size();
286
+ int prior_best_cost = best_cost;
287
+ int cost;
288
+
289
+ clock_t clock_validation_start = clock();
290
+ if (! parse_flag ||
291
+ pruned.size()>=10) { // to prevent worst cases
292
+ string path;
293
+ cost = sed( input[sentenceInd], source[tmID], path, false );
294
+ if (cost < best_cost) {
295
+ best_cost = cost;
296
+ }
297
+ }
298
+
299
+ else {
300
+ cost = parse_matches( pruned, input_length, tm_length, best_cost );
301
+ if (prior_best_cost != best_cost) {
302
+ best_tm.clear();
303
+ }
304
+ }
305
+ clock_validation_sum += clock() - clock_validation_start;
306
+ if (cost == best_cost) {
307
+ best_tm.push_back( tmID );
308
+ }
309
+ }
310
+ cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
311
+ cerr << "tm considered: " << sentence_match.size()
312
+ << " word-matched: " << tm_count_word_match
313
+ << " word-matched2: " << tm_count_word_match2
314
+ << " best: " << best_tm.size() << endl;
315
+
316
+ cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;
317
+
318
+ // create xml and extract files
319
+ string inputStr, sourceStr;
320
+ for (size_t pos = 0; pos < input_length; ++pos) {
321
+ inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " ";
322
+ }
323
+
324
+ // do not try to find the best ... report multiple matches
325
+ if (multiple_flag) {
326
+ for(size_t si=0; si<best_tm.size(); si++) {
327
+ int s = best_tm[si];
328
+ string path;
329
+ sed( input[sentenceInd], source[s], path, true );
330
+ const vector<WORD_ID> &sourceSentence = source[s];
331
+ vector<SentenceAlignment> &targets = targetAndAlignment[s];
332
+ create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);
333
+
334
+ }
335
+ } // if (multiple_flag)
336
+ else {
337
+
338
+ // find the best matches according to letter sed
339
+ string best_path = "";
340
+ int best_match = -1;
341
+ unsigned int best_letter_cost;
342
+ if (lsed_flag) {
343
+ best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
344
+ for(size_t si=0; si<best_tm.size(); si++) {
345
+ int s = best_tm[si];
346
+ string path;
347
+ unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
348
+ if (letter_cost < best_letter_cost) {
349
+ best_letter_cost = letter_cost;
350
+ best_path = path;
351
+ best_match = s;
352
+ }
353
+ }
354
+ }
355
+ // if letter sed turned off, just compute path for first match
356
+ else {
357
+ if (best_tm.size() > 0) {
358
+ string path;
359
+ sed( input[sentenceInd], source[best_tm[0]], path, false );
360
+ best_path = path;
361
+ best_match = best_tm[0];
362
+ }
363
+ }
364
+ cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
365
+ << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
366
+ << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
367
+ << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
368
+ << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
369
+ << " )" << endl;
370
+ if (lsed_flag) {
371
+ //cout << best_letter_cost << "/" << compute_length( input[sentenceInd] ) << " (";
372
+ }
373
+ //cout << best_cost <<"/" << input_length;
374
+ if (lsed_flag) {
375
+ //cout << ")";
376
+ }
377
+ //cout << " ||| " << best_match << " ||| " << best_path << endl;
378
+
379
+ if (best_match == -1) {
380
+ UTIL_THROW_IF2(source.size() == 0, "Empty source phrase");
381
+ best_match = 0;
382
+ }
383
+
384
+ // creat xml & extracts
385
+ const vector<WORD_ID> &sourceSentence = source[best_match];
386
+ vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
387
+ create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream);
388
+
389
+ } // else if (multiple_flag)
390
+
391
+ fuzzyMatchStream.close();
392
+
393
+ return fuzzyMatchFile;
394
+ }
395
+
396
+ void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus )
397
+ {
398
+ // source
399
+ ifstream fileStream;
400
+ fileStream.open(fileName.c_str());
401
+ if (!fileStream) {
402
+ cerr << "file not found: " << fileName << endl;
403
+ exit(1);
404
+ }
405
+ cerr << "loading " << fileName << endl;
406
+
407
+ istream *fileStreamP = &fileStream;
408
+
409
+ string line;
410
+ while(getline(*fileStreamP, line)) {
411
+ corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) );
412
+ }
413
+ }
414
+
415
+ void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus)
416
+ {
417
+ ifstream fileStream;
418
+ fileStream.open(fileName.c_str());
419
+ if (!fileStream) {
420
+ cerr << "file not found: " << fileName << endl;
421
+ exit(1);
422
+ }
423
+ cerr << "loading " << fileName << endl;
424
+
425
+ istream *fileStreamP = &fileStream;
426
+
427
+ WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
428
+
429
+ int lineNum = 0;
430
+ string line;
431
+ while(getline(*fileStreamP, line)) {
432
+ vector<WORD_ID> toks = GetVocabulary().Tokenize( line.c_str() );
433
+
434
+ corpus.push_back(vector< SentenceAlignment >());
435
+ vector< SentenceAlignment > &vec = corpus.back();
436
+
437
+ vec.push_back(SentenceAlignment());
438
+ SentenceAlignment *sentence = &vec.back();
439
+
440
+ const WORD &countStr = GetVocabulary().GetWord(toks[0]);
441
+ sentence->count = atoi(countStr.c_str());
442
+
443
+ for (size_t i = 1; i < toks.size(); ++i) {
444
+ WORD_ID wordId = toks[i];
445
+
446
+ if (wordId == delimiter) {
447
+ // target and alignments can have multiple sentences.
448
+ vec.push_back(SentenceAlignment());
449
+ sentence = &vec.back();
450
+
451
+ // count
452
+ ++i;
453
+
454
+ const WORD &countStr = GetVocabulary().GetWord(toks[i]);
455
+ sentence->count = atoi(countStr.c_str());
456
+ } else {
457
+ // just a normal word, add
458
+ sentence->target.push_back(wordId);
459
+ }
460
+ }
461
+
462
+ ++lineNum;
463
+
464
+ }
465
+
466
+ }
467
+
468
+
469
+ void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus )
470
+ {
471
+ ifstream fileStream;
472
+ fileStream.open(fileName.c_str());
473
+ if (!fileStream) {
474
+ cerr << "file not found: " << fileName << endl;
475
+ exit(1);
476
+ }
477
+ cerr << "loading " << fileName << endl;
478
+
479
+ istream *fileStreamP = &fileStream;
480
+
481
+ string delimiter = "|||";
482
+
483
+ int lineNum = 0;
484
+ string line;
485
+ while(getline(*fileStreamP, line)) {
486
+ vector< SentenceAlignment > &vec = corpus[lineNum];
487
+ size_t targetInd = 0;
488
+ SentenceAlignment *sentence = &vec[targetInd];
489
+
490
+ vector<string> toks = Moses::Tokenize(line);
491
+
492
+ for (size_t i = 0; i < toks.size(); ++i) {
493
+ string &tok = toks[i];
494
+
495
+ if (tok == delimiter) {
496
+ // target and alignments can have multiple sentences.
497
+ ++targetInd;
498
+ sentence = &vec[targetInd];
499
+
500
+ ++i;
501
+ } else {
502
+ // just a normal alignment, add
503
+ vector<int> alignPoint = Moses::Tokenize<int>(tok, "-");
504
+ assert(alignPoint.size() == 2);
505
+ sentence->alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
506
+ }
507
+ }
508
+
509
+ ++lineNum;
510
+
511
+ }
512
+ }
513
+
514
+ bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
515
+ {
516
+ #ifdef WITH_THREADS
517
+ boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
518
+ #endif
519
+ map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
520
+ if (lookup != m_lsed.end()) {
521
+ value = lookup->second;
522
+ return true;
523
+ }
524
+
525
+ return false;
526
+ }
527
+
528
+ void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
529
+ {
530
+ #ifdef WITH_THREADS
531
+ boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
532
+ #endif
533
+ m_lsed[ key ] = value;
534
+ }
535
+
536
+ /* Letter string edit distance, e.g. sub 'their' to 'there' costs 2 */
537
+
538
+ unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
539
+ {
540
+ // check if already computed -> lookup in cache
541
+ pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
542
+ unsigned int value;
543
+ bool ret = GetLSEDCache(pIdx, value);
544
+ if (ret) {
545
+ return value;
546
+ }
547
+
548
+ // get surface strings for word indices
549
+ const string &a = GetVocabulary().GetWord( aIdx );
550
+ const string &b = GetVocabulary().GetWord( bIdx );
551
+
552
+ // initialize cost matrix
553
+ unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
554
+ for( unsigned int i=0; i<=a.size(); i++ ) {
555
+ cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
556
+ cost[i][0] = i;
557
+ }
558
+ for( unsigned int j=0; j<=b.size(); j++ ) {
559
+ cost[0][j] = j;
560
+ }
561
+
562
+ // core string edit distance loop
563
+ for( unsigned int i=1; i<=a.size(); i++ ) {
564
+ for( unsigned int j=1; j<=b.size(); j++ ) {
565
+
566
+ unsigned int ins = cost[i-1][j] + 1;
567
+ unsigned int del = cost[i][j-1] + 1;
568
+ bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0);
569
+ unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1);
570
+
571
+ unsigned int min = (ins < del) ? ins : del;
572
+ min = (diag < min) ? diag : min;
573
+
574
+ cost[i][j] = min;
575
+ }
576
+ }
577
+
578
+ // clear out memory
579
+ unsigned int final = cost[a.size()][b.size()];
580
+ for( unsigned int i=0; i<=a.size(); i++ ) {
581
+ free( cost[i] );
582
+ }
583
+ free( cost );
584
+
585
+ // cache and return result
586
+ SetLSEDCache(pIdx, final);
587
+ return final;
588
+ }
589
+
590
+ /* string edit distance implementation */
591
+
592
+ unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed )
593
+ {
594
+
595
+ // initialize cost and path matrices
596
+ unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 );
597
+ char **path = (char**) calloc( sizeof( char* ), a.size()+1 );
598
+
599
+ for( unsigned int i=0; i<=a.size(); i++ ) {
600
+ cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 );
601
+ path[i] = (char*) calloc( sizeof(char), b.size()+1 );
602
+ if (i>0) {
603
+ cost[i][0] = cost[i-1][0];
604
+ if (use_letter_sed) {
605
+ cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
606
+ } else {
607
+ cost[i][0]++;
608
+ }
609
+ } else {
610
+ cost[i][0] = 0;
611
+ }
612
+ path[i][0] = 'I';
613
+ }
614
+
615
+ for( unsigned int j=0; j<=b.size(); j++ ) {
616
+ if (j>0) {
617
+ cost[0][j] = cost[0][j-1];
618
+ if (use_letter_sed) {
619
+ cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
620
+ } else {
621
+ cost[0][j]++;
622
+ }
623
+ } else {
624
+ cost[0][j] = 0;
625
+ }
626
+ path[0][j] = 'D';
627
+ }
628
+
629
+ // core string edit distance algorithm
630
+ for( unsigned int i=1; i<=a.size(); i++ ) {
631
+ for( unsigned int j=1; j<=b.size(); j++ ) {
632
+ unsigned int ins = cost[i-1][j];
633
+ unsigned int del = cost[i][j-1];
634
+ unsigned int match;
635
+ if (use_letter_sed) {
636
+ ins += GetVocabulary().GetWord( a[i-1] ).size();
637
+ del += GetVocabulary().GetWord( b[j-1] ).size();
638
+ match = letter_sed( a[i-1], b[j-1] );
639
+ } else {
640
+ ins++;
641
+ del++;
642
+ match = ( a[i-1] == b[j-1] ) ? 0 : 1;
643
+ }
644
+ unsigned int diag = cost[i-1][j-1] + match;
645
+
646
+ char action = (ins < del) ? 'I' : 'D';
647
+ unsigned int min = (ins < del) ? ins : del;
648
+ if (diag < min) {
649
+ action = (match>0) ? 'S' : 'M';
650
+ min = diag;
651
+ }
652
+
653
+ cost[i][j] = min;
654
+ path[i][j] = action;
655
+ }
656
+ }
657
+
658
+ // construct string for best path
659
+ unsigned int i = a.size();
660
+ unsigned int j = b.size();
661
+ best_path = "";
662
+ while( i>0 || j>0 ) {
663
+ best_path = path[i][j] + best_path;
664
+ if (path[i][j] == 'I') {
665
+ i--;
666
+ } else if (path[i][j] == 'D') {
667
+ j--;
668
+ } else {
669
+ i--;
670
+ j--;
671
+ }
672
+ }
673
+
674
+
675
+ // clear out memory
676
+ unsigned int final = cost[a.size()][b.size()];
677
+
678
+ for( unsigned int i=0; i<=a.size(); i++ ) {
679
+ free( cost[i] );
680
+ free( path[i] );
681
+ }
682
+ free( cost );
683
+ free( path );
684
+
685
+ // return result
686
+ return final;
687
+ }
688
+
689
+ /* utlility function: compute length of sentence in characters
690
+ (spaces do not count) */
691
+
692
+ unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentence )
693
+ {
694
+ unsigned int length = 0;
695
+ for( unsigned int i=0; i<sentence.size(); i++ ) {
696
+ length += GetVocabulary().GetWord( sentence[i] ).size();
697
+ }
698
+ return length;
699
+ }
700
+
701
+ /* brute force method: compare input to all corpus sentences */
702
+
703
+ void FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
704
+ vector< vector< WORD_ID > > input )
705
+ {
706
+ // go through input set...
707
+ for(unsigned int i=0; i<input.size(); i++) {
708
+ bool use_letter_sed = false;
709
+
710
+ // compute sentence length and worst allowed cost
711
+ unsigned int input_length;
712
+ if (use_letter_sed) {
713
+ input_length = compute_length( input[i] );
714
+ } else {
715
+ input_length = input[i].size();
716
+ }
717
+ unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
718
+ string best_path = "";
719
+ //int best_match = -1;
720
+
721
+ // go through all corpus sentences
722
+ for(unsigned int s=0; s<source.size(); s++) {
723
+ int source_length;
724
+ if (use_letter_sed) {
725
+ source_length = compute_length( source[s] );
726
+ } else {
727
+ source_length = source[s].size();
728
+ }
729
+ int diff = abs((int)source_length - (int)input_length);
730
+ if (length_filter_flag && (diff >= best_cost)) {
731
+ continue;
732
+ }
733
+
734
+ // compute string edit distance
735
+ string path;
736
+ unsigned int cost = sed( input[i], source[s], path, use_letter_sed );
737
+
738
+ // update if new best
739
+ if (cost < best_cost) {
740
+ best_cost = cost;
741
+ best_path = path;
742
+ //best_match = s;
743
+ }
744
+ }
745
+ //cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
746
+ }
747
+ }
748
+
749
+ /* definition of short matches
750
+ very short n-gram matches (1-grams) will not be looked up in
751
+ the suffix array, since there are too many matches
752
+ and for longer sentences, at least one 2-gram match must occur */
753
+
754
+ int FuzzyMatchWrapper::short_match_max_length( int input_length )
755
+ {
756
+ if ( ! refined_flag )
757
+ return 0;
758
+ if ( input_length >= 5 )
759
+ return 1;
760
+ return 0;
761
+ }
762
+
763
+
764
+ /* if we have non-short matches in a sentence, we need to
765
+ take a closer look at it.
766
+ this function creates a hash map for all input words and their positions
767
+ (to be used by the next function)
768
+ (done here, because this has be done only once for an input sentence) */
769
+
770
+ void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input )
771
+ {
772
+ int max_length = short_match_max_length( input.size() );
773
+ if (max_length == 0)
774
+ return;
775
+
776
+ wordIndex.clear();
777
+
778
+ // store input words and their positions in hash map
779
+ for(size_t i=0; i<input.size(); i++) {
780
+ if (wordIndex.find( input[i] ) == wordIndex.end()) {
781
+ vector< int > position_vector;
782
+ wordIndex[ input[i] ] = position_vector;
783
+ }
784
+ wordIndex[ input[i] ].push_back( i );
785
+ }
786
+ }
787
+
788
+ /* add all short matches to list of matches for a sentence */
789
+
790
+ void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
791
+ {
792
+ int max_length = short_match_max_length( input_length );
793
+ if (max_length == 0)
794
+ return;
795
+
796
+ int tm_length = tm.size();
797
+ map< WORD_ID,vector< int > >::iterator input_word_hit;
798
+ for(int t_pos=0; t_pos<tm.size(); t_pos++) {
799
+ input_word_hit = wordIndex.find( tm[t_pos] );
800
+ if (input_word_hit != wordIndex.end()) {
801
+ vector< int > &position_vector = input_word_hit->second;
802
+ for(size_t j=0; j<position_vector.size(); j++) {
803
+ int &i_pos = position_vector[j];
804
+
805
+ // before match
806
+ int max_cost = max( i_pos , t_pos );
807
+ int min_cost = abs( i_pos - t_pos );
808
+ if ( i_pos>0 && i_pos == t_pos )
809
+ min_cost++;
810
+
811
+ // after match
812
+ max_cost += max( (input_length-i_pos) , (tm_length-t_pos));
813
+ min_cost += abs( (input_length-i_pos) - (tm_length-t_pos));
814
+ if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos))
815
+ min_cost++;
816
+
817
+ if (min_cost <= best_cost) {
818
+ Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 );
819
+ match.push_back( new_match );
820
+ }
821
+ }
822
+ }
823
+ }
824
+ }
825
+
826
+ /* remove matches that are subsumed by a larger match */
827
+
828
+ vector< Match > FuzzyMatchWrapper::prune_matches( const vector< Match > &match, int best_cost )
829
+ {
830
+ //cerr << "\tpruning";
831
+ vector< Match > pruned;
832
+ for(int i=match.size()-1; i>=0; i--) {
833
+ //cerr << " (" << match[i].input_start << "," << match[i].input_end
834
+ // << " ; " << match[i].tm_start << "," << match[i].tm_end
835
+ // << " * " << match[i].min_cost << ")";
836
+
837
+ //if (match[i].min_cost > best_cost)
838
+ // continue;
839
+
840
+ bool subsumed = false;
841
+ for(int j=match.size()-1; j>=0; j--) {
842
+ if (i!=j // do not compare match with itself
843
+ && ( match[i].input_end - match[i].input_start <=
844
+ match[j].input_end - match[j].input_start ) // i shorter than j
845
+ && ((match[i].input_start == match[j].input_start &&
846
+ match[i].tm_start == match[j].tm_start ) ||
847
+ (match[i].input_end == match[j].input_end &&
848
+ match[i].tm_end == match[j].tm_end) ) ) {
849
+ subsumed = true;
850
+ }
851
+ }
852
+ if (! subsumed && match[i].min_cost <= best_cost) {
853
+ //cerr << "*";
854
+ pruned.push_back( match[i] );
855
+ }
856
+ }
857
+ //cerr << endl;
858
+ return pruned;
859
+ }
860
+
861
+ /* A* parsing method to compute string edit distance */
862
+
863
+ int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
864
+ {
865
+ // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl;
866
+
867
+ if (match.size() == 1)
868
+ return match[0].max_cost;
869
+ if (match.size() == 0)
870
+ return input_length+tm_length;
871
+
872
+ int this_best_cost = input_length + tm_length;
873
+ for(size_t i=0; i<match.size(); i++) {
874
+ this_best_cost = min( this_best_cost, match[i].max_cost );
875
+ }
876
+ // cerr << "\tthis best cost: " << this_best_cost << endl;
877
+
878
+ // bottom up combination of spans
879
+ vector< vector< Match > > multi_match;
880
+ multi_match.push_back( match );
881
+
882
+ int match_level = 1;
883
+ while(multi_match[ match_level-1 ].size()>0) {
884
+ // init vector
885
+ vector< Match > empty;
886
+ multi_match.push_back( empty );
887
+
888
+ for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) {
889
+ int second_level = match_level - first_level -1;
890
+ //cerr << "\tcombining level " << first_level << " and " << second_level << endl;
891
+
892
+ vector< Match > &first_match = multi_match[ first_level ];
893
+ vector< Match > &second_match = multi_match[ second_level ];
894
+
895
+ for(size_t i1 = 0; i1 < first_match.size(); i1++) {
896
+ for(size_t i2 = 0; i2 < second_match.size(); i2++) {
897
+
898
+ // do not combine the same pair twice
899
+ if (first_level == second_level && i2 <= i1) {
900
+ continue;
901
+ }
902
+
903
+ // get sorted matches (first is before second)
904
+ Match *first, *second;
905
+ if (first_match[i1].input_start < second_match[i2].input_start ) {
906
+ first = &first_match[i1];
907
+ second = &second_match[i2];
908
+ } else {
909
+ second = &first_match[i1];
910
+ first = &second_match[i2];
911
+ }
912
+
913
+ //cerr << "\tcombining "
914
+ // << "(" << first->input_start << "," << first->input_end << "), "
915
+ // << first->tm_start << " [" << first->internal_cost << "]"
916
+ // << " with "
917
+ // << "(" << second->input_start << "," << second->input_end << "), "
918
+ // << second->tm_start<< " [" << second->internal_cost << "]"
919
+ // << endl;
920
+
921
+ // do not process overlapping matches
922
+ if (first->input_end >= second->input_start) {
923
+ continue;
924
+ }
925
+
926
+ // no overlap / mismatch in tm
927
+ if (first->tm_end >= second->tm_start) {
928
+ continue;
929
+ }
930
+
931
+ // compute cost
932
+ int min_cost = 0;
933
+ int max_cost = 0;
934
+
935
+ // initial
936
+ min_cost += abs( first->input_start - first->tm_start );
937
+ max_cost += max( first->input_start, first->tm_start );
938
+
939
+ // same number of words, but not sent. start -> cost is at least 1
940
+ if (first->input_start == first->tm_start && first->input_start > 0) {
941
+ min_cost++;
942
+ }
943
+
944
+ // in-between
945
+ int skipped_words = second->input_start - first->input_end -1;
946
+ int skipped_words_tm = second->tm_start - first->tm_end -1;
947
+ int internal_cost = max( skipped_words, skipped_words_tm );
948
+ internal_cost += first->internal_cost + second->internal_cost;
949
+ min_cost += internal_cost;
950
+ max_cost += internal_cost;
951
+
952
+ // final
953
+ min_cost += abs( (tm_length-1 - second->tm_end) -
954
+ (input_length-1 - second->input_end) );
955
+ max_cost += max( (tm_length-1 - second->tm_end),
956
+ (input_length-1 - second->input_end) );
957
+
958
+ // same number of words, but not sent. end -> cost is at least 1
959
+ if ( ( input_length-1 - second->input_end
960
+ == tm_length-1 - second->tm_end )
961
+ && input_length-1 != second->input_end ) {
962
+ min_cost++;
963
+ }
964
+
965
+ // cerr << "\tcost: " << min_cost << "-" << max_cost << endl;
966
+
967
+ // if worst than best cost, forget it
968
+ if (min_cost > best_cost) {
969
+ continue;
970
+ }
971
+
972
+ // add match
973
+ Match new_match( first->input_start,
974
+ second->input_end,
975
+ first->tm_start,
976
+ second->tm_end,
977
+ min_cost,
978
+ max_cost,
979
+ internal_cost);
980
+ multi_match[ match_level ].push_back( new_match );
981
+ // cerr << "\tstored\n";
982
+
983
+ // possibly updating this_best_cost
984
+ if (max_cost < this_best_cost) {
985
+ // cerr << "\tupdating this best cost to " << max_cost << "\n";
986
+ this_best_cost = max_cost;
987
+
988
+ // possibly updating best_cost
989
+ if (max_cost < best_cost) {
990
+ // cerr << "\tupdating best cost to " << max_cost << "\n";
991
+ best_cost = max_cost;
992
+ }
993
+ }
994
+ }
995
+ }
996
+ }
997
+ match_level++;
998
+ }
999
+ return this_best_cost;
1000
+ }
1001
+
1002
+
1003
+ void FuzzyMatchWrapper::create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string &path, ofstream &outputFile)
1004
+ {
1005
+ string sourceStr;
1006
+ for (size_t pos = 0; pos < sourceSentence.size(); ++pos) {
1007
+ WORD_ID wordId = sourceSentence[pos];
1008
+ sourceStr += GetVocabulary().GetWord(wordId) + " ";
1009
+ }
1010
+
1011
+ for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) {
1012
+ const SentenceAlignment &sentenceAlignment = targets[targetInd];
1013
+ string targetStr = sentenceAlignment.getTargetString(GetVocabulary());
1014
+ string alignStr = sentenceAlignment.getAlignmentString();
1015
+
1016
+ outputFile
1017
+ << sentenceInd << endl
1018
+ << cost << endl
1019
+ << sourceStr << endl
1020
+ << inputStr << endl
1021
+ << targetStr << endl
1022
+ << alignStr << endl
1023
+ << path << endl
1024
+ << sentenceAlignment.count << endl;
1025
+
1026
+ }
1027
+ }
1028
+
1029
+ } // namespace
mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // FuzzyMatchWrapper.h
3
+ // moses
4
+ //
5
+ // Created by Hieu Hoang on 26/07/2012.
6
+ // Copyright 2012 __MyCompanyName__. All rights reserved.
7
+ //
8
+
9
+ #ifndef moses_FuzzyMatchWrapper_h
10
+ #define moses_FuzzyMatchWrapper_h
11
+
12
+ #ifdef WITH_THREADS
13
+ #include <boost/thread/shared_mutex.hpp>
14
+ #endif
15
+
16
+ #include <fstream>
17
+ #include <string>
18
+ #include "SuffixArray.h"
19
+ #include "Vocabulary.h"
20
+ #include "Match.h"
21
+ #include "moses/InputType.h"
22
+
23
+ namespace tmmt
24
+ {
25
+ class Match;
26
+ struct SentenceAlignment;
27
+
28
+ class FuzzyMatchWrapper
29
+ {
30
+ public:
31
+ FuzzyMatchWrapper(const std::string &source, const std::string &target, const std::string &alignment);
32
+
33
+ std::string Extract(long translationId, const std::string &dirNameStr);
34
+
35
+ protected:
36
+ // tm-mt
37
+ std::vector< std::vector< tmmt::SentenceAlignment > > targetAndAlignment;
38
+ tmmt::SuffixArray *suffixArray;
39
+ int basic_flag;
40
+ int lsed_flag;
41
+ int refined_flag;
42
+ int length_filter_flag;
43
+ int parse_flag;
44
+ int min_match;
45
+ int multiple_flag;
46
+ int multiple_slack;
47
+ int multiple_max;
48
+
49
+ typedef std::map< WORD_ID,std::vector< int > > WordIndex;
50
+
51
+ // global cache for word pairs
52
+ std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > m_lsed;
53
+ #ifdef WITH_THREADS
54
+ //reader-writer lock
55
+ mutable boost::shared_mutex m_accessLock;
56
+ #endif
57
+
58
+ void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus );
59
+ void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus);
60
+ void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );
61
+
62
+ /** brute force method: compare input to all corpus sentences */
63
+ void basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
64
+ std::vector< std::vector< tmmt::WORD_ID > > input ) ;
65
+
66
+ /** utlility function: compute length of sentence in characters
67
+ (spaces do not count) */
68
+ unsigned int compute_length( const std::vector< tmmt::WORD_ID > &sentence );
69
+ unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx );
70
+ unsigned int sed( const std::vector< WORD_ID > &a, const std::vector< WORD_ID > &b, std::string &best_path, bool use_letter_sed );
71
+ void init_short_matches(WordIndex &wordIndex, long translationId, const std::vector< WORD_ID > &input );
72
+ int short_match_max_length( int input_length );
73
+ void add_short_matches(WordIndex &wordIndex, long translationId, std::vector< Match > &match, const std::vector< WORD_ID > &tm, int input_length, int best_cost );
74
+ std::vector< Match > prune_matches( const std::vector< Match > &match, int best_cost );
75
+ int parse_matches( std::vector< Match > &match, int input_length, int tm_length, int &best_cost );
76
+
77
+ void create_extract(int sentenceInd, int cost, const std::vector< WORD_ID > &sourceSentence, const std::vector<SentenceAlignment> &targets, const std::string &inputStr, const std::string &path, std::ofstream &outputFile);
78
+
79
+ std::string ExtractTM(WordIndex &wordIndex, long translationId, const std::string &inputPath);
80
+ Vocabulary &GetVocabulary() {
81
+ return suffixArray->GetVocabulary();
82
+ }
83
+
84
+ bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const;
85
+ void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value);
86
+
87
+ };
88
+
89
+ }
90
+
91
+ #endif
mosesdecoder/moses/TranslationModel/fuzzy-match/Match.h ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // Match.h
3
+ // fuzzy-match
4
+ //
5
+ // Created by Hieu Hoang on 25/07/2012.
6
+ // Copyright 2012 __MyCompanyName__. All rights reserved.
7
+ //
8
+
9
+ #ifndef fuzzy_match_Match_h
10
+ #define fuzzy_match_Match_h
11
+
12
+ namespace tmmt
13
+ {
14
+
15
+ /* data structure for n-gram match between input and corpus */
16
+
17
/* Plain record describing one n-gram match between the input and a
   translation-memory (tm) sentence, together with its cost figures.
   All fields are public and are set once by the constructor. */
class Match
{
public:
  int input_start;    // span boundary in the input
  int input_end;      // span boundary in the input
  int tm_start;       // span boundary in the tm sentence
  int tm_end;         // span boundary in the tm sentence
  int min_cost;
  int max_cost;
  int internal_cost;

  /* Arguments are stored verbatim, in declaration order of the fields:
     is/ie -> input span, ts/te -> tm span, min/max -> cost bounds,
     i -> internal cost. */
  Match( int is, int ie, int ts, int te, int min, int max, int i )
    : input_start(is)
    , input_end(ie)
    , tm_start(ts)
    , tm_end(te)
    , min_cost(min)
    , max_cost(max)
    , internal_cost(i)
  {}
};
31
+
32
+ }
33
+
34
+ #endif
mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.cpp ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // SentenceAlignment.cpp
3
+ // moses
4
+ //
5
+ // Created by Hieu Hoang on 26/07/2012.
6
+ // Copyright 2012 __MyCompanyName__. All rights reserved.
7
+ //
8
+
9
+ #include <iostream>
10
+ #include "util/string_stream.hh"
11
+ #include "SentenceAlignment.h"
12
+
13
+ namespace tmmt
14
+ {
15
+ std::string SentenceAlignment::getTargetString(const Vocabulary &vocab) const
16
+ {
17
+ util::StringStream strme;
18
+ for (size_t i = 0; i < target.size(); ++i) {
19
+ const WORD &word = vocab.GetWord(target[i]);
20
+ strme << word << " ";
21
+ }
22
+ return strme.str();
23
+ }
24
+
25
+ }
mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.h ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // SentenceAlignment.h
3
+ // fuzzy-match
4
+ //
5
+ // Created by Hieu Hoang on 25/07/2012.
6
+ // Copyright 2012 __MyCompanyName__. All rights reserved.
7
+ //
8
+
9
+ #ifndef fuzzy_match_SentenceAlignment_h
10
+ #define fuzzy_match_SentenceAlignment_h
11
+
12
+ #include <sstream>
13
+ #include <vector>
14
+ #include "Vocabulary.h"
15
+ #include "util/string_stream.hh"
16
+
17
+ namespace tmmt
18
+ {
19
+
20
+ struct SentenceAlignment {
21
+ int count;
22
+ std::vector< WORD_ID > target;
23
+ std::vector< std::pair<int,int> > alignment;
24
+
25
+ SentenceAlignment() {
26
+ }
27
+
28
+ std::string getTargetString(const Vocabulary &vocab) const;
29
+
30
+ std::string getAlignmentString() const {
31
+ util::StringStream strme;
32
+ for (size_t i = 0; i < alignment.size(); ++i) {
33
+ const std::pair<int,int> &alignPair = alignment[i];
34
+ strme << alignPair.first << "-" << alignPair.second << " ";
35
+ }
36
+ return strme.str();
37
+ }
38
+
39
+ };
40
+
41
+ }
42
+
43
+ #endif
mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.cpp ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "SuffixArray.h"
2
+ #include <string>
3
+ #include <stdlib.h>
4
+ #include <cstring>
5
+
6
+ using namespace std;
7
+
8
+ namespace tmmt
9
+ {
10
+
11
+ SuffixArray::SuffixArray( string fileName )
12
+ {
13
+ m_vcb.StoreIfNew( "<uNk>" );
14
+ m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
15
+
16
+ ifstream extractFile;
17
+
18
+ // count the number of words first;
19
+ extractFile.open(fileName.c_str());
20
+ istream *fileP = &extractFile;
21
+ m_size = 0;
22
+ size_t sentenceCount = 0;
23
+ string line;
24
+ while(getline(*fileP, line)) {
25
+
26
+ vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
27
+ m_size += words.size() + 1;
28
+ sentenceCount++;
29
+ }
30
+ extractFile.close();
31
+ cerr << m_size << " words (incl. sentence boundaries)" << endl;
32
+
33
+ // allocate memory
34
+ m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
35
+ m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
36
+ m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
37
+ m_sentence = (size_t*) calloc( sizeof( size_t ), m_size );
38
+ m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount );
39
+
40
+ // fill the array
41
+ int wordIndex = 0;
42
+ int sentenceId = 0;
43
+ extractFile.open(fileName.c_str());
44
+ fileP = &extractFile;
45
+ while(getline(*fileP, line)) {
46
+ vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
47
+
48
+ // add to corpus vector
49
+ corpus.push_back(words);
50
+
51
+ // create SA
52
+
53
+ vector< WORD_ID >::const_iterator i;
54
+ for( i=words.begin(); i!=words.end(); i++) {
55
+ m_index[ wordIndex ] = wordIndex;
56
+ m_sentence[ wordIndex ] = sentenceId;
57
+ m_wordInSentence[ wordIndex ] = i-words.begin();
58
+ m_array[ wordIndex++ ] = *i;
59
+ }
60
+ m_index[ wordIndex ] = wordIndex;
61
+ m_array[ wordIndex++ ] = m_endOfSentence;
62
+ m_sentenceLength[ sentenceId++ ] = words.size();
63
+ }
64
+ extractFile.close();
65
+ cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
66
+ // List(0,9);
67
+
68
+ // sort
69
+ m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
70
+ Sort( 0, m_size-1 );
71
+ free( m_buffer );
72
+ cerr << "done sorting" << endl;
73
+ }
74
+
75
+ // good ol' merge sort (recursive, merging through m_buffer)
76
+ void SuffixArray::Sort(INDEX start, INDEX end)
77
+ {
78
+ if (start == end) return;
79
+ INDEX mid = (start+end+1)/2;
80
+ Sort( start, mid-1 );
81
+ Sort( mid, end );
82
+
83
+ // merge
84
+ size_t i = start;
85
+ size_t j = mid;
86
+ size_t k = 0;
87
+ size_t length = end-start+1;
88
+ while( k<length ) {
89
+ if (i == mid ) {
90
+ m_buffer[ k++ ] = m_index[ j++ ];
91
+ } else if (j > end ) {
92
+ m_buffer[ k++ ] = m_index[ i++ ];
93
+ } else {
94
+ if (CompareIndex( m_index[i], m_index[j] ) < 0) {
95
+ m_buffer[ k++ ] = m_index[ i++ ];
96
+ } else {
97
+ m_buffer[ k++ ] = m_index[ j++ ];
98
+ }
99
+ }
100
+ }
101
+
102
+ memcpy( ((char*)m_index) + sizeof( INDEX ) * start,
103
+ ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) );
104
+ }
105
+
106
+ SuffixArray::~SuffixArray()
107
+ {
108
+ free(m_index);
109
+ free(m_array);
110
+ }
111
+
112
+ int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
113
+ {
114
+ // skip over identical words
115
+ INDEX offset = 0;
116
+ while( a+offset < m_size &&
117
+ b+offset < m_size &&
118
+ m_array[ a+offset ] == m_array[ b+offset ] ) {
119
+ offset++;
120
+ }
121
+
122
+ if( a+offset == m_size ) return -1;
123
+ if( b+offset == m_size ) return 1;
124
+ return CompareWord( m_array[ a+offset ], m_array[ b+offset ] );
125
+ }
126
+
127
+ inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
128
+ {
129
+ // cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl;
130
+ return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) );
131
+ }
132
+
133
+ int SuffixArray::Count( const vector< WORD > &phrase )
134
+ {
135
+ INDEX dummy;
136
+ return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 );
137
+ }
138
+
139
+ bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
140
+ {
141
+ INDEX dummy;
142
+ return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min;
143
+ }
144
+
145
+ bool SuffixArray::Exists( const vector< WORD > &phrase )
146
+ {
147
+ INDEX dummy;
148
+ return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1;
149
+ }
150
+
151
+ int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
152
+ {
153
+ return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end );
154
+ }
155
+
156
+ int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
157
+ {
158
+ // cerr << "FindFirst\n";
159
+ INDEX start = search_start;
160
+ INDEX end = (search_end == -1) ? (m_size-1) : search_end;
161
+ INDEX mid = FindFirst( phrase, start, end );
162
+ // cerr << "done\n";
163
+ if (mid == m_size) return 0; // no matches
164
+ if (min == 1) return 1; // only existance check
165
+
166
+ int matchCount = 1;
167
+
168
+ //cerr << "before...\n";
169
+ firstMatch = FindLast( phrase, mid, start, -1 );
170
+ matchCount += mid - firstMatch;
171
+
172
+ //cerr << "after...\n";
173
+ lastMatch = FindLast( phrase, mid, end, 1 );
174
+ matchCount += lastMatch - mid;
175
+
176
+ return matchCount;
177
+ }
178
+
179
+ SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
180
+ {
181
+ end += direction;
182
+ while(true) {
183
+ INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;
184
+
185
+ int match = Match( phrase, mid );
186
+ int matchNext = Match( phrase, mid+direction );
187
+ //cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl;
188
+
189
+ if (match == 0 && matchNext != 0) return mid;
190
+
191
+ if (match == 0) // mid point is a match
192
+ start = mid;
193
+ else
194
+ end = mid;
195
+ }
196
+ }
197
+
198
+ SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
199
+ {
200
+ while(true) {
201
+ INDEX mid = ( start + end + 1 )/2;
202
+ //cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n";
203
+ int match = Match( phrase, mid );
204
+
205
+ if (match == 0) return mid;
206
+ if (start >= end && match != 0 ) return m_size;
207
+
208
+ if (match > 0)
209
+ start = mid+1;
210
+ else
211
+ end = mid-1;
212
+ }
213
+ }
214
+
215
+ int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
216
+ {
217
+ INDEX pos = m_index[ index ];
218
+ for(INDEX i=0; i<phrase.size() && i+pos<m_size; i++) {
219
+ int match = CompareWord( m_vcb.GetWordID( phrase[i] ), m_array[ pos+i ] );
220
+ // cerr << "{" << index << "+" << i << "," << pos+i << ":" << match << "}" << endl;
221
+ if (match != 0)
222
+ return match;
223
+ }
224
+ return 0;
225
+ }
226
+
227
+ void SuffixArray::List(INDEX start, INDEX end)
228
+ {
229
+ for(INDEX i=start; i<=end; i++) {
230
+ INDEX pos = m_index[ i ];
231
+ // cerr << i << ":" << pos << "\t";
232
+ for(int j=0; j<5 && j+pos<m_size; j++) {
233
+ //cout << " " << m_vcb.GetWord( m_array[ pos+j ] );
234
+ }
235
+ // cerr << "\n";
236
+ }
237
+ }
238
+
239
+ }
240
+
mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.h ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "Vocabulary.h"
2
+
3
+ #pragma once
4
+
5
+ #define LINE_MAX_LENGTH 10000
6
+
7
+ namespace tmmt
8
+ {
9
+
10
+ class SuffixArray
11
+ {
12
+ public:
13
+ typedef unsigned int INDEX;
14
+
15
+ private:
16
+ std::vector< std::vector< WORD_ID > > corpus;
17
+
18
+ WORD_ID *m_array;
19
+ INDEX *m_index;
20
+ INDEX *m_buffer;
21
+ char *m_wordInSentence;
22
+ size_t *m_sentence;
23
+ char *m_sentenceLength;
24
+ WORD_ID m_endOfSentence;
25
+ Vocabulary m_vcb;
26
+ INDEX m_size;
27
+
28
+ public:
29
+ SuffixArray( std::string fileName );
30
+ ~SuffixArray();
31
+
32
+ void Sort(INDEX start, INDEX end);
33
+ int CompareIndex( INDEX a, INDEX b ) const;
34
+ inline int CompareWord( WORD_ID a, WORD_ID b ) const;
35
+ int Count( const std::vector< WORD > &phrase );
36
+ bool MinCount( const std::vector< WORD > &phrase, INDEX min );
37
+ bool Exists( const std::vector< WORD > &phrase );
38
+ int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
39
+ int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 );
40
+ INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
41
+ INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
42
+ int Match( const std::vector< WORD > &phrase, INDEX index );
43
+ void List( INDEX start, INDEX end );
44
+ inline INDEX GetPosition( INDEX index ) {
45
+ return m_index[ index ];
46
+ }
47
+ inline size_t GetSentence( INDEX position ) {
48
+ return m_sentence[position];
49
+ }
50
+ inline char GetWordInSentence( INDEX position ) {
51
+ return m_wordInSentence[position];
52
+ }
53
+ inline char GetSentenceLength( size_t sentenceId ) {
54
+ return m_sentenceLength[sentenceId];
55
+ }
56
+ inline INDEX GetSize() {
57
+ return m_size;
58
+ }
59
+
60
+ Vocabulary &GetVocabulary() {
61
+ return m_vcb;
62
+ }
63
+ const std::vector< std::vector< WORD_ID > > &GetCorpus() const {
64
+ return corpus;
65
+ }
66
+ };
67
+
68
+ }
69
+
mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.cpp ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
2
+ #include "Vocabulary.h"
3
+ #ifdef WITH_THREADS
4
+ #include <boost/thread/locks.hpp>
5
+ #endif
6
+
7
+ using namespace std;
8
+
9
+ namespace tmmt
10
+ {
11
+
12
+ // as in beamdecoder/tables.cpp
13
+ vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
14
+ {
15
+ vector< WORD_ID > token;
16
+ bool betweenWords = true;
17
+ int start=0;
18
+ int i=0;
19
+ for(; input[i] != '\0'; i++) {
20
+ bool isSpace = (input[i] == ' ' || input[i] == '\t');
21
+
22
+ if (!isSpace && betweenWords) {
23
+ start = i;
24
+ betweenWords = false;
25
+ } else if (isSpace && !betweenWords) {
26
+ token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
27
+ betweenWords = true;
28
+ }
29
+ }
30
+ if (!betweenWords)
31
+ token.push_back( StoreIfNew ( string( input+start, i-start ) ) );
32
+ return token;
33
+ }
34
+
35
+ WORD_ID Vocabulary::StoreIfNew( const WORD& word )
36
+ {
37
+
38
+ {
39
+ // read=lock scope
40
+ #ifdef WITH_THREADS
41
+ boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
42
+ #endif
43
+ map<WORD, WORD_ID>::iterator i = lookup.find( word );
44
+
45
+ if( i != lookup.end() )
46
+ return i->second;
47
+ }
48
+
49
+ #ifdef WITH_THREADS
50
+ boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
51
+ #endif
52
+ WORD_ID id = vocab.size();
53
+ vocab.push_back( word );
54
+ lookup[ word ] = id;
55
+ return id;
56
+ }
57
+
58
+ WORD_ID Vocabulary::GetWordID( const WORD &word )
59
+ {
60
+ #ifdef WITH_THREADS
61
+ boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
62
+ #endif
63
+ map<WORD, WORD_ID>::iterator i = lookup.find( word );
64
+ if( i == lookup.end() )
65
+ return 0;
66
+ WORD_ID w= (WORD_ID) i->second;
67
+ return w;
68
+ }
69
+
70
+ }
71
+
mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.h ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $
2
+
3
+ #pragma once
4
+
5
+ #include <iostream>
6
+ #include <fstream>
7
+ #include <cassert>
8
+ #include <cstdlib>
9
+ #include <string>
10
+ #include <queue>
11
+ #include <map>
12
+ #include <cmath>
13
+
14
+ #ifdef WITH_THREADS
15
+ #include <boost/thread/shared_mutex.hpp>
16
+ #endif
17
+
18
namespace tmmt
{
typedef std::string WORD;        // surface form of a word
typedef unsigned int WORD_ID;    // numeric id of a word (index into Vocabulary::vocab)

/* Bidirectional word <-> id mapping.
   `lookup` maps a word to its id, `vocab` maps an id back to the word. */
class Vocabulary
{
public:
  std::map<WORD, WORD_ID> lookup;
  std::vector< WORD > vocab;

  WORD_ID StoreIfNew( const WORD& );
  WORD_ID GetWordID( const WORD& );
  std::vector<WORD_ID> Tokenize( const char[] );

  /* Word stored under `id`. The index is not range-checked. Although the
     method is const, the returned reference is mutable (constness is cast
     away). */
  inline WORD &GetWord( WORD_ID id ) const {
    return const_cast<WORD&>( vocab[ id ] );
  }

protected:
#ifdef WITH_THREADS
  //reader-writer lock
  mutable boost::shared_mutex m_accessLock;
#endif

};

}
46
+
mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.cpp ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #include <iostream>
3
+ #include <fstream>
4
+ #include <cassert>
5
+ #include <vector>
6
+ #include <string>
7
+ #include "moses/Util.h"
8
+ #include "Alignments.h"
9
+
10
+ using namespace std;
11
+ using namespace Moses;
12
+
13
// Returns a copy of `str` with every leading and trailing character that
// occurs in `dropChars` removed. A string made up only of such characters
// (or an empty string) yields "".
inline const std::string TrimInternal(const std::string& str, const std::string dropChars = " \t\n\r")
{
  const std::string::size_type lastKept = str.find_last_not_of(dropChars);
  if (lastKept == std::string::npos) {
    return std::string();   // nothing but droppable characters
  }
  const std::string::size_type firstKept = str.find_first_not_of(dropChars);
  return str.substr(firstKept, lastKept - firstKept + 1);
}
19
+
20
// Bundle of strings produced by createXML() for one rule.
class CreateXMLRetValues
{
public:
  std::string frame;             // XML frame (written to stderr by createXML)
  std::string ruleS;             // source side of the rule
  std::string ruleT;             // target side of the rule
  std::string ruleAlignment;     // alignment points, source-target
  std::string ruleAlignmentInv;  // the same points with both sides swapped
};
25
+
26
+ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &input, const string &target, const string &align, const string &path );
27
+
28
+ void create_xml(const string &inPath)
29
+ {
30
+ ifstream inStrme(inPath.c_str());
31
+ ofstream rule((inPath + ".extract").c_str());
32
+ ofstream ruleInv((inPath + ".extract.inv").c_str());
33
+
34
+ // int setenceId;
35
+ // float score;
36
+ string source, target, align, path;
37
+ string *input = NULL;
38
+ int count;
39
+
40
+ int lineCount = 1;
41
+ int ruleCount = 1;
42
+ string inLine;
43
+
44
+ int step = 0;
45
+ while (!inStrme.eof()) {
46
+ getline(inStrme, inLine);
47
+ //cout << inLine << endl;
48
+ switch (step) {
49
+ case 0:
50
+ /*setenceId = */
51
+ Scan<int>(inLine);
52
+ ++step;
53
+ break;
54
+ case 1:
55
+ /*score = */
56
+ Scan<float>(inLine);
57
+ ++step;
58
+ break;
59
+ case 2:
60
+ source = inLine;
61
+ ++step;
62
+ break;
63
+ case 3:
64
+ if (input == NULL) {
65
+ input = new string(inLine);
66
+ } else {
67
+ assert(inLine == *input);
68
+ }
69
+ ++step;
70
+ break;
71
+ case 4:
72
+ target = inLine;
73
+ ++step;
74
+ break;
75
+ case 5:
76
+ align = inLine;
77
+ ++step;
78
+ break;
79
+ case 6:
80
+ path = inLine + "X";
81
+ ++step;
82
+ break;
83
+ case 7:
84
+ count = Scan<int>(inLine);
85
+ CreateXMLRetValues ret = createXML(ruleCount, source, *input, target, align, path);
86
+
87
+ //print STDOUT $frame."\n";
88
+ rule << ret.ruleS << " [X] ||| " << ret.ruleT << " [X] ||| " << ret.ruleAlignment
89
+ << " ||| " << count << endl;
90
+ ruleInv << ret.ruleT << " [X] ||| " << ret.ruleS << " [X] ||| " << ret.ruleAlignmentInv
91
+ << " ||| " << count << endl;
92
+
93
+ //print STDOUT "$sentenceInd ||| $score ||| $count\n";
94
+ ++ruleCount;
95
+ step = 0;
96
+ break;
97
+ }
98
+
99
+ ++lineCount;
100
+ }
101
+
102
+ delete input;
103
+ ruleInv.close();
104
+ rule.close();
105
+ inStrme.close();
106
+
107
+ }
108
+
109
+
110
+ CreateXMLRetValues createXML(int ruleCount, const string &source, const string &input, const string &target, const string &align, const string &path)
111
+ {
112
+ CreateXMLRetValues ret;
113
+ vector<string> sourceToks = Tokenize(source, " ")
114
+ ,inputToks = Tokenize(input, " ")
115
+ ,targetsToks = Tokenize(target, " ");
116
+ Alignments alignments(align, sourceToks.size(), targetsToks.size());
117
+ map<int, string> frameInput;
118
+ map<int, int> alignI2S;
119
+ vector< map<string, int> > nonTerms;
120
+ vector<bool> targetBitmap(targetsToks.size(), true);
121
+ vector<bool> inputBitmap;
122
+
123
+ // STEP 1: FIND MISMATCHES
124
+ int s = 0, i = 0;
125
+ bool currently_matching = false;
126
+ int start_s = 0, start_i = 0;
127
+
128
+ //cerr << input << endl << source << endl << target << endl << path << endl;
129
+ for ( int p = 0 ; p < int(path.length()) ; p++ ) {
130
+ string action = path.substr(p, 1);
131
+
132
+ // beginning of a mismatch
133
+ if ( currently_matching && action != "M" && action != "X" ) {
134
+ start_i = i;
135
+ start_s = s;
136
+ currently_matching = 0;
137
+ } // if ( currently_matching
138
+ // end of a mismatch
139
+ else if ( !currently_matching && ( action == "M" || action == "X" ) ) {
140
+
141
+ // remove use of affected target words
142
+ for ( int ss = start_s ; ss < s ; ss++ ) {
143
+ const std::map<int, int> &targets = alignments.m_alignS2T[ss];
144
+
145
+ std::map<int, int>::const_iterator iter;
146
+ for (iter = targets.begin(); iter != targets.end(); ++iter) {
147
+ int tt = iter->first;
148
+ targetBitmap[tt] = 0;
149
+ }
150
+
151
+ // also remove enclosed unaligned words?
152
+ } //for ( int ss = start_s ; ss < s ; ss++ ) {
153
+
154
+ // are there input words that need to be inserted ?
155
+ //cerr << start_i << "<" << i << "?" << endl;
156
+ if (start_i < i ) {
157
+
158
+ // take note of input words to be inserted
159
+ string insertion = "";
160
+ for (int ii = start_i ; ii < i ; ii++ ) {
161
+ insertion += inputToks[ii] + " ";
162
+ }
163
+
164
+ // find position for inserted input words
165
+
166
+ // find first removed target word
167
+ int start_t = 1000;
168
+ for ( int ss = start_s ; ss < s ; ss++ ) {
169
+ const std::map<int, int> &targets = alignments.m_alignS2T[ss];
170
+
171
+ std::map<int, int>::const_iterator iter;
172
+ for (iter = targets.begin(); iter != targets.end(); ++iter) {
173
+ int tt = iter->first;
174
+ if (tt < start_t) {
175
+ start_t = tt;
176
+ }
177
+ }
178
+ }
179
+
180
+ // end of sentence? add to end
181
+ if ( start_t == 1000 && i > int(inputToks.size()) - 1 ) {
182
+ start_t = targetsToks.size() - 1;
183
+ }
184
+
185
+ // backtrack to previous words if unaligned
186
+ if ( start_t == 1000 ) {
187
+ start_t = -1;
188
+ for ( int ss = s - 1 ; start_t == -1 && ss >= 0 ; ss-- ) {
189
+ const std::map<int, int> &targets = alignments.m_alignS2T[ss];
190
+
191
+ std::map<int, int>::const_iterator iter;
192
+ for (iter = targets.begin(); iter != targets.end(); ++iter) {
193
+ int tt = iter->first;
194
+ if (tt > start_t) {
195
+ start_t = tt;
196
+ }
197
+ }
198
+ }
199
+ } // if ( start_t == 1000 ) {
200
+
201
+ frameInput[start_t] += insertion;
202
+ map<string, int> nt;
203
+ nt["start_t"] = start_t;
204
+ nt["start_i"] = start_i;
205
+ nonTerms.push_back(nt);
206
+
207
+ } // if (start_i < i ) {
208
+
209
+ currently_matching = 1;
210
+ } // else if ( !currently_matching
211
+
212
+ /*
213
+ cerr << action << " " << s << " " << i
214
+ << "(" << start_s << " " << start_i << ")"
215
+ << currently_matching;
216
+ */
217
+
218
+ if ( action != "I" ) {
219
+ //cerr << " ->";
220
+
221
+ if (s < int(alignments.m_alignS2T.size())) {
222
+ const std::map<int, int> &targets = alignments.m_alignS2T[s];
223
+ //cerr << "s=" << s << endl;
224
+
225
+ std::map<int, int>::const_iterator iter;
226
+ for (iter = targets.begin(); iter != targets.end(); ++iter) {
227
+ // int tt = iter->first;
228
+ //cerr << " " << tt;
229
+ }
230
+ }
231
+ }
232
+ //cerr << endl;
233
+
234
+ if (action != "I")
235
+ s++;
236
+ if (action != "D") {
237
+ i++;
238
+ alignI2S[i] = s;
239
+ }
240
+
241
+ if (action == "M") {
242
+ inputBitmap.push_back(1);
243
+ } else if (action == "I" || action == "S") {
244
+ inputBitmap.push_back(0);
245
+ }
246
+
247
+ } // for ( int p = 0
248
+
249
+ //cerr << target << endl;
250
+ for (size_t i = 0; i < targetBitmap.size(); ++i) {
251
+ //cerr << targetBitmap[i];
252
+ }
253
+ //cerr << endl;
254
+
255
+ for (map<int, string>::const_iterator iter = frameInput.begin(); iter != frameInput.end(); ++iter) {
256
+ //cerr << iter->first << ":" <<iter->second << endl;
257
+ }
258
+
259
+ // STEP 2: BUILD RULE AND FRAME
260
+
261
+ // hierarchical rule
262
+ int rule_pos_s = 0;
263
+ map<int, int> ruleAlignS;
264
+
265
+ for (int i = 0 ; i < int(inputBitmap.size()) ; ++i ) {
266
+ if ( inputBitmap[i] ) {
267
+ ret.ruleS += inputToks[i] + " ";
268
+ ruleAlignS[ alignI2S[i] ] = rule_pos_s++;
269
+ }
270
+
271
+ for (size_t j = 0; j < nonTerms.size(); ++j) {
272
+ map<string, int> &nt = nonTerms[j];
273
+ if (i == nt["start_i"]) {
274
+ ret.ruleS += "[X][X] ";
275
+ nt["rule_pos_s"] = rule_pos_s++;
276
+ }
277
+ }
278
+ }
279
+
280
+ int rule_pos_t = 0;
281
+ map<int, int> ruleAlignT;
282
+
283
+ for (int t = -1 ; t < (int) targetBitmap.size(); t++ ) {
284
+ if (t >= 0 && targetBitmap[t]) {
285
+ ret.ruleT += targetsToks[t] + " ";
286
+ ruleAlignT[t] = rule_pos_t++;
287
+ }
288
+
289
+ for (size_t i = 0; i < nonTerms.size(); ++i) {
290
+ map<string, int> &nt = nonTerms[i];
291
+
292
+ if (t == nt["start_t"]) {
293
+ ret.ruleT += "[X][X] ";
294
+ nt["rule_pos_t"] = rule_pos_t++;
295
+ }
296
+ }
297
+ }
298
+
299
+ int numAlign = 0;
300
+ ret.ruleAlignment = "";
301
+
302
+ for (map<int, int>::const_iterator iter = ruleAlignS.begin(); iter != ruleAlignS.end(); ++iter) {
303
+ int s = iter->first;
304
+
305
+ if (s < int(alignments.m_alignS2T.size())) {
306
+ const std::map<int, int> &targets = alignments.m_alignS2T[s];
307
+
308
+ std::map<int, int>::const_iterator iter;
309
+ for (iter = targets.begin(); iter != targets.end(); ++iter) {
310
+ int t =iter->first;
311
+ if (ruleAlignT.find(t) == ruleAlignT.end())
312
+ continue;
313
+ ret.ruleAlignment += SPrint(ruleAlignS[s]) + "-" + SPrint(ruleAlignT[t]) + " ";
314
+ ++numAlign;
315
+ }
316
+ }
317
+ }
318
+
319
+ //cerr << "numAlign=" << numAlign << endl;
320
+
321
+ for (size_t i = 0; i < nonTerms.size(); ++i) {
322
+ map<string, int> &nt = nonTerms[i];
323
+ ret.ruleAlignment += SPrint(nt["rule_pos_s"]) + "-" + SPrint(nt["rule_pos_t"]) + " ";
324
+ ++numAlign;
325
+ }
326
+
327
+ //cerr << "numAlign=" << numAlign << endl;
328
+
329
+ ret.ruleS = TrimInternal(ret.ruleS);
330
+ ret.ruleT = TrimInternal(ret.ruleT);
331
+ ret.ruleAlignment = TrimInternal(ret.ruleAlignment);
332
+
333
+ vector<string> ruleAlignmentToks = Tokenize(ret.ruleAlignment);
334
+ for (size_t i = 0; i < ruleAlignmentToks.size(); ++i) {
335
+ const string &alignPoint = ruleAlignmentToks[i];
336
+ vector<string> toks = Tokenize(alignPoint, "-");
337
+ assert(toks.size() == 2);
338
+ ret.ruleAlignmentInv += toks[1] + "-" +toks[0];
339
+ }
340
+ ret.ruleAlignmentInv = TrimInternal(ret.ruleAlignmentInv);
341
+
342
+ // frame
343
+ // ret.frame;
344
+ if (frameInput.find(-1) == frameInput.end())
345
+ ret.frame = frameInput[-1];
346
+
347
+ int currently_included = 0;
348
+ int start_t = -1;
349
+ targetBitmap.push_back(0);
350
+
351
+ for (size_t t = 0 ; t <= targetsToks.size() ; t++ ) {
352
+ // beginning of tm target inclusion
353
+ if ( !currently_included && targetBitmap[t] ) {
354
+ start_t = t;
355
+ currently_included = 1;
356
+ }
357
+ // end of tm target inclusion (not included word or inserted input)
358
+ else if (currently_included
359
+ && ( targetBitmap[t] || frameInput.find(t) != frameInput.end() )
360
+ ) {
361
+ // add xml (unless change is at the beginning of the sentence
362
+ if ( start_t >= 0 ) {
363
+ string target = "";
364
+ //cerr << "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n";
365
+ for (size_t tt = start_t ; tt < t + targetBitmap[t] ; tt++ ) {
366
+ target += targetsToks[tt] + " ";
367
+ }
368
+ // target = Trim(target); TODO
369
+ ret.frame += "<xml translation=\"" + target + "\"> x </xml> ";
370
+ }
371
+ currently_included = 0;
372
+ }
373
+
374
+ if (frameInput.find(t) != frameInput.end())
375
+ ret.frame += frameInput[t];
376
+ //cerr << targetBitmap[t] << " " << t << " " << "(" << start_t << ")"
377
+ // << currently_included << endl;
378
+
379
+ } //for (int t = 0
380
+
381
+ cerr << ret.frame << "\n-------------------------------------\n";
382
+ return ret;
383
+
384
+ }
385
+
386
+
387
+
mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.h ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+
5
+ void create_xml(const std::string &inPath);
mosesdecoder/moses/server/Hypothesis_4server.cpp ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ #include "moses/Hypothesis.h"
3
+ #include "moses/Manager.h"
4
+ #include <boost/foreach.hpp>
5
+ namespace Moses {
6
+ void
7
+ Hypothesis::
8
+ OutputLocalWordAlignment(std::vector<xmlrpc_c::value>& dest) const
9
+ {
10
+ using namespace std;
11
+ Range const& src = this->GetCurrSourceWordsRange();
12
+ Range const& trg = this->GetCurrTargetWordsRange();
13
+
14
+ WordAlignmentSort waso = m_manager.options()->output.WA_SortOrder;
15
+ vector<pair<size_t,size_t> const* > a
16
+ = this->GetCurrTargetPhrase().GetAlignTerm().GetSortedAlignments(waso);
17
+ typedef pair<size_t,size_t> item;
18
+ BOOST_FOREACH(item const* p, a) {
19
+ map<string, xmlrpc_c::value> M;
20
+ M["source-word"] = xmlrpc_c::value_int(src.GetStartPos() + p->first);
21
+ M["target-word"] = xmlrpc_c::value_int(trg.GetStartPos() + p->second);
22
+ dest.push_back(xmlrpc_c::value_struct(M));
23
+ }
24
+ }
25
+
26
+ void
27
+ Hypothesis::
28
+ OutputWordAlignment(std::vector<xmlrpc_c::value>& out) const
29
+ {
30
+ std::vector<Hypothesis const*> tmp;
31
+ for (Hypothesis const* h = this; h; h = h->GetPrevHypo())
32
+ tmp.push_back(h);
33
+ for (size_t i = tmp.size(); i-- > 0;)
34
+ tmp[i]->OutputLocalWordAlignment(out);
35
+ }
36
+
37
+ }
mosesdecoder/moses/server/Optimizer.h ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- c++ -*-
2
+
3
+ #include <xmlrpc-c/base.hpp>
4
+ #include <xmlrpc-c/registry.hpp>
5
+ #include <xmlrpc-c/server_abyss.hpp>
6
+
7
+ namespace MosesServer
8
+ {
9
+ class
10
+ Optimizer : public xmlrpc_c::method
11
+ {
12
+ public:
13
+ Optimizer();
14
+ void execute(xmlrpc_c::paramList const& paramList,
15
+ xmlrpc_c::value * const retvalP);
16
+ };
17
+ }
mosesdecoder/moses/server/PackScores.cpp ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ #include "PackScores.h"
3
+ #include "moses/FF/StatefulFeatureFunction.h"
4
+ #include "moses/FF/StatelessFeatureFunction.h"
5
+ #include <boost/foreach.hpp>
6
+ namespace Moses {
7
+
8
+ void
9
+ PackScores(FeatureFunction const& ff, FVector const& S,
10
+ std::map<std::string, xmlrpc_c::value>& M)
11
+ {
12
+ std::vector<xmlrpc_c::value> v;
13
+ size_t N = ff.GetNumScoreComponents();
14
+
15
+ std::vector<xmlrpc_c::value> dense;
16
+ dense.reserve(N);
17
+ size_t o = ff.GetIndex();
18
+ for (size_t i = 0; i < N; ++i)
19
+ if (ff.IsTuneableComponent(i))
20
+ dense.push_back(xmlrpc_c::value_double(S[o+i]));
21
+ v.push_back(xmlrpc_c::value_array(dense));
22
+
23
+ std::map<std::string,xmlrpc_c::value> sparse;
24
+ typedef FVector::FNVmap::const_iterator iter;
25
+ for(iter m = S.cbegin(); m != S.cend(); ++m)
26
+ sparse[m->first.name()] = xmlrpc_c::value_double(m->second);
27
+ v.push_back(xmlrpc_c::value_struct(sparse));
28
+ M[ff.GetScoreProducerDescription()] = xmlrpc_c::value_array(v);
29
+ }
30
+
31
+ xmlrpc_c::value
32
+ PackScores(ScoreComponentCollection const& S)
33
+ {
34
+ std::map<std::string, xmlrpc_c::value> M;
35
+ typedef StatefulFeatureFunction SFFF;
36
+ typedef StatelessFeatureFunction SLFF;
37
+ BOOST_FOREACH(SFFF const* ff, SFFF::GetStatefulFeatureFunctions())
38
+ if (ff->IsTuneable())
39
+ PackScores(*ff, S.GetScoresVector(), M);
40
+ BOOST_FOREACH(SLFF const* ff, SLFF::GetStatelessFeatureFunctions())
41
+ if (ff->IsTuneable())
42
+ PackScores(*ff, S.GetScoresVector(), M);
43
+ return xmlrpc_c::value_struct(M);
44
+ }
45
+ }
mosesdecoder/moses/server/PackScores.h ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ #pragma once
3
+ #include <xmlrpc-c/base.hpp>
4
+ #include "moses/FF/FeatureFunction.h"
5
+ #include "moses/ScoreComponentCollection.h"
6
+
7
+ namespace Moses {
8
+ xmlrpc_c::value
9
+ PackScores(ScoreComponentCollection const& S);
10
+ }
mosesdecoder/moses/server/Server.h ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
2
+ #pragma once
3
+ #include "moses/TypeDef.h"
4
+
5
+ #ifdef WITH_THREADS
6
+ #include <boost/thread.hpp>
7
+ #include "moses/ThreadPool.h"
8
+ #endif
9
+
10
+ #include <xmlrpc-c/base.hpp>
11
+ #include <xmlrpc-c/registry.hpp>
12
+ #include <xmlrpc-c/server_abyss.hpp>
13
+ #include "Translator.h"
14
+ #include "Optimizer.h"
15
+ #include "Updater.h"
16
+ #include "CloseSession.h"
17
+ #include "Session.h"
18
+ #include "moses/parameters/ServerOptions.h"
19
+ #include <string>
20
+
21
+ namespace MosesServer
22
+ {
23
+ class Server
24
+ {
25
+ Moses::ServerOptions m_server_options;
26
+ SessionCache m_session_cache;
27
+ xmlrpc_c::registry m_registry;
28
+ xmlrpc_c::methodPtr const m_updater;
29
+ xmlrpc_c::methodPtr const m_optimizer;
30
+ xmlrpc_c::methodPtr const m_translator;
31
+ xmlrpc_c::methodPtr const m_close_session;
32
+ std::string m_pidfile;
33
+ public:
34
+ Server(Moses::Parameter& params);
35
+ ~Server();
36
+ int run();
37
+ void delete_session(uint64_t const session_id);
38
+
39
+ Moses::ServerOptions const&
40
+ options() const;
41
+
42
+ Session const&
43
+ get_session(uint64_t session_id);
44
+
45
+ };
46
+ }
mosesdecoder/moses/server/Session.h ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
2
+ #pragma once
3
+ #include "moses/Util.h"
4
+ #include "moses/ContextScope.h"
5
+ #include "moses/parameters/AllOptions.h"
6
+ #include <sys/time.h>
7
+ #include <boost/unordered_map.hpp>
8
+
9
+ #ifdef WITH_THREADS
10
+ #include <boost/thread/shared_mutex.hpp>
11
+ #include <boost/thread/locks.hpp>
12
+ #endif
13
+ namespace MosesServer{
14
+
15
+ struct Session
16
+ {
17
+ uint64_t const id;
18
+ time_t start_time;
19
+ time_t last_access;
20
+ boost::shared_ptr<Moses::ContextScope> const scope; // stores local info
21
+ SPTR<std::map<std::string,float> > m_context_weights;
22
+
23
+
24
+ Session(uint64_t const session_id)
25
+ : id(session_id)
26
+ , scope(new Moses::ContextScope)
27
+ {
28
+ last_access = start_time = time(NULL);
29
+ }
30
+
31
+ bool is_new() const { return last_access == start_time; }
32
+
33
+ void setup(std::map<std::string, xmlrpc_c::value> const& params);
34
+ };
35
+
36
+ class SessionCache
37
+ {
38
+ mutable boost::shared_mutex m_lock;
39
+ uint64_t m_session_counter;
40
+ boost::unordered_map<uint64_t,Session> m_cache;
41
+ public:
42
+
43
+ SessionCache() : m_session_counter(1) {}
44
+
45
+ Session const&
46
+ operator[](uint32_t id)
47
+ {
48
+ boost::upgrade_lock<boost::shared_mutex> lock(m_lock);
49
+ if (id > 1)
50
+ {
51
+ boost::unordered_map<uint64_t, Session>::iterator m = m_cache.find(id);
52
+ if (m != m_cache.end())
53
+ {
54
+ m->second.last_access = time(NULL);
55
+ return m->second;
56
+ }
57
+ }
58
+ boost::upgrade_to_unique_lock<boost::shared_mutex> xlock(lock);
59
+ id = ++m_session_counter;
60
+ std::pair<uint64_t, Session> foo(id, Session(id));
61
+ return m_cache.insert(foo).first->second;
62
+ }
63
+
64
+ void
65
+ erase(uint32_t const id)
66
+ {
67
+ boost::unique_lock<boost::shared_mutex> lock(m_lock);
68
+ m_cache.erase(id);
69
+ }
70
+
71
+
72
+ };
73
+
74
+
75
+ }
mosesdecoder/moses/server/TranslationRequest.cpp ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "TranslationRequest.h"
2
+ #include "PackScores.h"
3
+ #include "moses/ContextScope.h"
4
+ #include <boost/foreach.hpp>
5
+ #include "moses/Util.h"
6
+ #include "moses/Hypothesis.h"
7
+
8
+ namespace MosesServer
9
+ {
10
+ using namespace std;
11
+ using Moses::Hypothesis;
12
+ using Moses::StaticData;
13
+ using Moses::Range;
14
+ using Moses::ChartHypothesis;
15
+ using Moses::Phrase;
16
+ using Moses::Manager;
17
+ using Moses::SearchGraphNode;
18
+ using Moses::TrellisPathList;
19
+ using Moses::TranslationOptionCollection;
20
+ using Moses::TranslationOptionList;
21
+ using Moses::TranslationOption;
22
+ using Moses::TargetPhrase;
23
+ using Moses::FValue;
24
+ using Moses::PhraseDictionaryMultiModel;
25
+ using Moses::FindPhraseDictionary;
26
+ using Moses::Sentence;
27
+ using Moses::TokenizeMultiCharSeparator;
28
+ using Moses::FeatureFunction;
29
+ using Moses::Scan;
30
+
31
+ boost::shared_ptr<TranslationRequest>
32
+ TranslationRequest::
33
+ create(Translator* translator, xmlrpc_c::paramList const& paramList,
34
+ boost::condition_variable& cond, boost::mutex& mut)
35
+ {
36
+ boost::shared_ptr<TranslationRequest> ret;
37
+ ret.reset(new TranslationRequest(paramList, cond, mut));
38
+ ret->m_self = ret;
39
+ ret->m_translator = translator;
40
+ return ret;
41
+ }
42
+
43
+ void
44
+ SetContextWeights(Moses::ContextScope& s, xmlrpc_c::value const& w)
45
+ {
46
+ SPTR<std::map<std::string,float> > M(new std::map<std::string, float>);
47
+ typedef std::map<std::string,xmlrpc_c::value> tmap;
48
+ tmap const tmp = static_cast<tmap>(xmlrpc_c::value_struct(w));
49
+ for(tmap::const_iterator m = tmp.begin(); m != tmp.end(); ++m)
50
+ (*M)[m->first] = xmlrpc_c::value_double(m->second);
51
+ s.SetContextWeights(M);
52
+ }
53
+
54
+ void
55
+ TranslationRequest::
56
+ Run()
57
+ {
58
+ typedef std::map<std::string,xmlrpc_c::value> param_t;
59
+ param_t const& params = m_paramList.getStruct(0);
60
+ parse_request(params);
61
+ // cerr << "SESSION ID" << ret->m_session_id << endl;
62
+
63
+
64
+ // settings within the session scope
65
+ param_t::const_iterator si = params.find("context-weights");
66
+ if (si != params.end()) SetContextWeights(*m_scope, si->second);
67
+
68
+ Moses::StaticData const& SD = Moses::StaticData::Instance();
69
+
70
+ if (is_syntax(m_options->search.algo))
71
+ run_chart_decoder();
72
+ else
73
+ run_phrase_decoder();
74
+
75
+ {
76
+ boost::lock_guard<boost::mutex> lock(m_mutex);
77
+ m_done = true;
78
+ }
79
+ m_cond.notify_one();
80
+
81
+ }
82
+
83
+ /// add phrase alignment information from a Hypothesis
84
+ void
85
+ TranslationRequest::
86
+ add_phrase_aln_info(Hypothesis const& h, vector<xmlrpc_c::value>& aInfo) const
87
+ {
88
+ if (!m_withAlignInfo) return;
89
+ // if (!options()->output.ReportSegmentation) return;
90
+ Range const& trg = h.GetCurrTargetWordsRange();
91
+ Range const& src = h.GetCurrSourceWordsRange();
92
+
93
+ std::map<std::string, xmlrpc_c::value> pAlnInfo;
94
+ pAlnInfo["tgt-start"] = xmlrpc_c::value_int(trg.GetStartPos());
95
+ pAlnInfo["tgt-end"] = xmlrpc_c::value_int(trg.GetEndPos());
96
+ pAlnInfo["src-start"] = xmlrpc_c::value_int(src.GetStartPos());
97
+ pAlnInfo["src-end"] = xmlrpc_c::value_int(src.GetEndPos());
98
+ aInfo.push_back(xmlrpc_c::value_struct(pAlnInfo));
99
+ }
100
+
101
+ void
102
+ TranslationRequest::
103
+ outputChartHypo(ostream& out, const ChartHypothesis* hypo)
104
+ {
105
+ Phrase outPhrase(20);
106
+ hypo->GetOutputPhrase(outPhrase);
107
+
108
+ // delete 1st & last
109
+ assert(outPhrase.GetSize() >= 2);
110
+ outPhrase.RemoveWord(0);
111
+ outPhrase.RemoveWord(outPhrase.GetSize() - 1);
112
+ for (size_t pos = 0 ; pos < outPhrase.GetSize() ; pos++)
113
+ out << *outPhrase.GetFactor(pos, 0) << " ";
114
+ }
115
+
116
+ bool
117
+ TranslationRequest::
118
+ compareSearchGraphNode(const Moses::SearchGraphNode& a,
119
+ const Moses::SearchGraphNode& b)
120
+ {
121
+ return a.hypo->GetId() < b.hypo->GetId();
122
+ }
123
+
124
+ void
125
+ TranslationRequest::
126
+ insertGraphInfo(Manager& manager, map<string, xmlrpc_c::value>& retData)
127
+ {
128
+ using xmlrpc_c::value_int;
129
+ using xmlrpc_c::value_double;
130
+ using xmlrpc_c::value_struct;
131
+ using xmlrpc_c::value_string;
132
+ vector<xmlrpc_c::value> searchGraphXml;
133
+ vector<SearchGraphNode> searchGraph;
134
+ manager.GetSearchGraph(searchGraph);
135
+ std::sort(searchGraph.begin(), searchGraph.end());
136
+ BOOST_FOREACH(Moses::SearchGraphNode const& n, searchGraph) {
137
+ map<string, xmlrpc_c::value> x; // search graph xml node
138
+ x["forward"] = value_double(n.forward);
139
+ x["fscore"] = value_double(n.fscore);
140
+ const Hypothesis* hypo = n.hypo;
141
+ x["hyp"] = value_int(hypo->GetId());
142
+ x["stack"] = value_int(hypo->GetWordsBitmap().GetNumWordsCovered());
143
+ if (hypo->GetId() != 0) {
144
+ const Hypothesis *prevHypo = hypo->GetPrevHypo();
145
+ x["back"] = value_int(prevHypo->GetId());
146
+ x["score"] = value_double(hypo->GetScore());
147
+ x["transition"] = value_double(hypo->GetScore() - prevHypo->GetScore());
148
+ if (n.recombinationHypo)
149
+ x["recombined"] = value_int(n.recombinationHypo->GetId());
150
+ x["cover-start"] = value_int(hypo->GetCurrSourceWordsRange().GetStartPos());
151
+ x["cover-end"] = value_int(hypo->GetCurrSourceWordsRange().GetEndPos());
152
+ x["out"] = value_string(hypo->GetCurrTargetPhrase().GetStringRep(options()->output.factor_order));
153
+ }
154
+ searchGraphXml.push_back(value_struct(x));
155
+ }
156
+ retData["sg"] = xmlrpc_c::value_array(searchGraphXml);
157
+ }
158
+
159
+ void
160
+ TranslationRequest::
161
+ outputNBest(const Manager& manager, map<string, xmlrpc_c::value>& retData)
162
+ {
163
+ TrellisPathList nBestList;
164
+ vector<xmlrpc_c::value> nBestXml;
165
+
166
+ Moses::NBestOptions const& nbo = m_options->nbest;
167
+ manager.CalcNBest(nbo.nbest_size, nBestList, nbo.only_distinct);
168
+ manager.OutputNBest(cout, nBestList);
169
+
170
+ BOOST_FOREACH(Moses::TrellisPath const* path, nBestList) {
171
+ vector<const Hypothesis *> const& E = path->GetEdges();
172
+ if (!E.size()) continue;
173
+ std::map<std::string, xmlrpc_c::value> nBestXmlItem;
174
+ pack_hypothesis(manager, E, "hyp", nBestXmlItem);
175
+ if (m_withScoreBreakdown) {
176
+ // should the score breakdown be reported in a more structured manner?
177
+ ostringstream buf;
178
+ bool with_labels = nbo.include_feature_labels;
179
+ path->GetScoreBreakdown()->OutputAllFeatureScores(buf, with_labels);
180
+ nBestXmlItem["fvals"] = xmlrpc_c::value_string(buf.str());
181
+ nBestXmlItem["scores"] = PackScores(*path->GetScoreBreakdown());
182
+ }
183
+
184
+ // weighted score
185
+ nBestXmlItem["totalScore"] = xmlrpc_c::value_double(path->GetFutureScore());
186
+ nBestXml.push_back(xmlrpc_c::value_struct(nBestXmlItem));
187
+ }
188
+ retData["nbest"] = xmlrpc_c::value_array(nBestXml);
189
+ }
190
+
191
+ void
192
+ TranslationRequest::
193
+ insertTranslationOptions(Moses::Manager& manager,
194
+ std::map<std::string, xmlrpc_c::value>& retData)
195
+ {
196
+ std::vector<Moses::FactorType> const& ofactor_order = options()->output.factor_order;
197
+
198
+ const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions();
199
+ vector<xmlrpc_c::value> toptsXml;
200
+ size_t const stop = toptsColl->GetSource().GetSize();
201
+ TranslationOptionList const* tol;
202
+ for (size_t s = 0 ; s < stop ; ++s) {
203
+ for (size_t e=s;(tol=toptsColl->GetTranslationOptionList(s,e))!=NULL;++e) {
204
+ BOOST_FOREACH(TranslationOption const* topt, *tol) {
205
+ std::map<std::string, xmlrpc_c::value> toptXml;
206
+ TargetPhrase const& tp = topt->GetTargetPhrase();
207
+ std::string tphrase = tp.GetStringRep(ofactor_order);
208
+ toptXml["phrase"] = xmlrpc_c::value_string(tphrase);
209
+ toptXml["fscore"] = xmlrpc_c::value_double(topt->GetFutureScore());
210
+ toptXml["start"] = xmlrpc_c::value_int(s);
211
+ toptXml["end"] = xmlrpc_c::value_int(e);
212
+ vector<xmlrpc_c::value> scoresXml;
213
+ const std::valarray<FValue> &scores
214
+ = topt->GetScoreBreakdown().getCoreFeatures();
215
+ for (size_t j = 0; j < scores.size(); ++j)
216
+ scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
217
+ toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
218
+ ostringstream buf;
219
+ topt->GetScoreBreakdown().OutputAllFeatureScores(buf, true);
220
+ toptXml["labelledScores"] = PackScores(topt->GetScoreBreakdown());
221
+ toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
222
+ }
223
+ }
224
+ }
225
+ retData["topt"] = xmlrpc_c::value_array(toptsXml);
226
+ }
227
+
228
+ TranslationRequest::
229
+ TranslationRequest(xmlrpc_c::paramList const& paramList,
230
+ boost::condition_variable& cond, boost::mutex& mut)
231
+ : m_cond(cond), m_mutex(mut), m_done(false), m_paramList(paramList)
232
+ , m_session_id(0)
233
+ {
234
+
235
+ }
236
+
237
+ bool
238
+ check(std::map<std::string, xmlrpc_c::value> const& param,
239
+ std::string const key)
240
+ {
241
+ std::map<std::string, xmlrpc_c::value>::const_iterator m = param.find(key);
242
+ if(m == param.end()) return false;
243
+
244
+ if (m->second.type() == xmlrpc_c::value::TYPE_BOOLEAN)
245
+ return xmlrpc_c::value_boolean(m->second);
246
+
247
+ std::string val = string(xmlrpc_c::value_string(m->second));
248
+ if(val == "true" || val == "True" || val == "TRUE" || val == "1") return true;
249
+ return false;
250
+ }
251
+
252
+ void
253
+ TranslationRequest::
254
+ parse_request(std::map<std::string, xmlrpc_c::value> const& params)
255
+ {
256
+ // parse XMLRPC request
257
+ m_paramList.verifyEnd(1); // ??? UG
258
+
259
+ typedef std::map<std::string, xmlrpc_c::value> params_t;
260
+ params_t::const_iterator si;
261
+
262
+ si = params.find("session-id");
263
+ if (si != params.end())
264
+ {
265
+ m_session_id = xmlrpc_c::value_int(si->second);
266
+ Session const& S = m_translator->get_session(m_session_id);
267
+ m_scope = S.scope;
268
+ m_session_id = S.id;
269
+ }
270
+ else
271
+ {
272
+ m_session_id = 0;
273
+ m_scope.reset(new Moses::ContextScope);
274
+ }
275
+
276
+ boost::shared_ptr<Moses::AllOptions> opts(new Moses::AllOptions(*StaticData::Instance().options()));
277
+ opts->update(params);
278
+
279
+ m_withGraphInfo = check(params, "sg");
280
+ if (m_withGraphInfo || opts->nbest.nbest_size > 0) {
281
+ opts->output.SearchGraph = "true";
282
+ opts->nbest.enabled = true;
283
+ }
284
+
285
+ m_options = opts;
286
+
287
+ // source text must be given, or we don't know what to translate
288
+ si = params.find("text");
289
+ if (si == params.end())
290
+ throw xmlrpc_c::fault("Missing source text", xmlrpc_c::fault::CODE_PARSE);
291
+ m_source_string = xmlrpc_c::value_string(si->second);
292
+ XVERBOSE(1,"Input: " << m_source_string << endl);
293
+
294
+ m_withTopts = check(params, "topt");
295
+ m_withScoreBreakdown = check(params, "add-score-breakdown");
296
+ si = params.find("lambda");
297
+ if (si != params.end())
298
+ {
299
+ // muMo = multiModel
300
+ xmlrpc_c::value_array muMoArray = xmlrpc_c::value_array(si->second);
301
+ vector<xmlrpc_c::value> muMoValVec(muMoArray.vectorValueValue());
302
+ vector<float> w(muMoValVec.size());
303
+ for (size_t i = 0; i < muMoValVec.size(); ++i)
304
+ w[i] = xmlrpc_c::value_double(muMoValVec[i]);
305
+ if (w.size() && (si = params.find("model_name")) != params.end())
306
+ {
307
+ string const model_name = xmlrpc_c::value_string(si->second);
308
+ PhraseDictionaryMultiModel* pdmm
309
+ = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
310
+ pdmm->SetTemporaryMultiModelWeightsVector(w);
311
+ }
312
+ }
313
+
314
+ si = params.find("context");
315
+ if (si != params.end())
316
+ {
317
+ string context = xmlrpc_c::value_string(si->second);
318
+ VERBOSE(1,"CONTEXT " << context);
319
+ m_context.reset(new std::vector<std::string>(1,context));
320
+ }
321
+
322
+ si = params.find("context-scope");
323
+ if (si != params.end())
324
+ {
325
+
326
+ string context = xmlrpc_c::value_string(si->second);
327
+
328
+ string groupSeparator("Moses::ContextScope::GroupSeparator");
329
+ string recordSeparator("Moses::ContextScope::RecordSeparator");
330
+
331
+ // Here, we assume that any XML-RPC value
332
+ // associated with the key "context-scope"
333
+ // has the following format:
334
+ //
335
+ // FeatureFunctionName followed by recordSeparator
336
+ // followed by the value of interest
337
+ // followed by groupSeparator
338
+ //
339
+ // In the following code, the value of interest will be stored
340
+ // in contextScope under the key FeatureFunctionName,
341
+ // where FeatureFunctionName is the actual name of the feature function
342
+
343
+ boost::shared_ptr<Moses::ContextScope> contextScope = GetScope();
344
+
345
+ BOOST_FOREACH(string group, TokenizeMultiCharSeparator(context, groupSeparator)) {
346
+
347
+ vector<string> record = TokenizeMultiCharSeparator(group, recordSeparator);
348
+
349
+ // Use the feature function whose name is record[0] as a key
350
+ FeatureFunction& ff = Moses::FeatureFunction::FindFeatureFunction(record[0]);
351
+ void const* key = static_cast<void const*>(&ff);
352
+
353
+ // Store (in the context scope) record[1] as the value associated with that key
354
+ boost::shared_ptr<string> value = contextScope->get<string>(key,true);
355
+ value->replace(value->begin(), value->end(), record[1]);
356
+
357
+ }
358
+ }
359
+
360
+ // Report alignment info if Moses config says to or if XML request says to
361
+ m_withAlignInfo = options()->output.ReportSegmentation || check(params, "align");
362
+
363
+ // Report word alignment info if Moses config says to or if XML request says to
364
+ m_withWordAlignInfo = options()->output.PrintAlignmentInfo || check(params, "word-align");
365
+
366
+ si = params.find("weights");
367
+ if (si != params.end())
368
+ {
369
+
370
+ boost::unordered_map<string, FeatureFunction*> map;
371
+ {
372
+ const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
373
+ BOOST_FOREACH(FeatureFunction* const& ff, ffs) {
374
+ map[ff->GetScoreProducerDescription()] = ff;
375
+ }
376
+ }
377
+
378
+ string allValues = xmlrpc_c::value_string(si->second);
379
+
380
+ BOOST_FOREACH(string values, TokenizeMultiCharSeparator(allValues, "\t")) {
381
+
382
+ vector<string> record = TokenizeMultiCharSeparator(values, "=");
383
+
384
+ if (record.size() == 2) {
385
+ string featureName = record[0];
386
+ string featureWeights = record[1];
387
+
388
+ boost::unordered_map<string, FeatureFunction*>::iterator ffi = map.find(featureName);
389
+
390
+ if (ffi != map.end()) {
391
+ FeatureFunction* ff = ffi->second;
392
+
393
+ size_t prevNumWeights = ff->GetNumScoreComponents();
394
+
395
+ vector<float> ffWeights;
396
+ BOOST_FOREACH(string weight, TokenizeMultiCharSeparator(featureWeights, " ")) {
397
+ ffWeights.push_back(Scan<float>(weight));
398
+ }
399
+
400
+ if (ffWeights.size() == ff->GetNumScoreComponents()) {
401
+
402
+ // XXX: This is NOT thread-safe
403
+ Moses::StaticData::InstanceNonConst().SetWeights(ff, ffWeights);
404
+ VERBOSE(1, "WARNING: THIS IS NOT THREAD-SAFE!\tUpdating weights for " << featureName << " to " << featureWeights << "\n");
405
+
406
+ } else {
407
+ TRACE_ERR("ERROR: Unable to update weights for " << featureName << " because " << ff->GetNumScoreComponents() << " weights are required but only " << ffWeights.size() << " were provided\n");
408
+ }
409
+
410
+ } else {
411
+ TRACE_ERR("ERROR: No FeatureFunction with name " << featureName << ", no weight update\n");
412
+ }
413
+
414
+ } else {
415
+ TRACE_ERR("WARNING: XML-RPC weights update was improperly formatted:\t" << values << "\n");
416
+ }
417
+
418
+ }
419
+
420
+ }
421
+
422
+
423
+ // // biased sampling for suffix-array-based sampling phrase table?
424
+ // if ((si = params.find("bias")) != params.end())
425
+ // {
426
+ // std::vector<xmlrpc_c::value> tmp
427
+ // = xmlrpc_c::value_array(si->second).cvalue();
428
+ // for (size_t i = 1; i < tmp.size(); i += 2)
429
+ // m_bias[xmlrpc_c::value_int(tmp[i-1])] = xmlrpc_c::value_double(tmp[i]);
430
+ // }
431
+ if (is_syntax(m_options->search.algo)) {
432
+ m_source.reset(new Sentence(m_options,0,m_source_string));
433
+ } else {
434
+ m_source.reset(new Sentence(m_options,0,m_source_string));
435
+ }
436
+ interpret_dlt();
437
+ } // end of Translationtask::parse_request()
438
+
439
+
440
+ void
441
+ TranslationRequest::
442
+ run_chart_decoder()
443
+ {
444
+ Moses::ChartManager manager(this->self());
445
+ manager.Decode();
446
+
447
+ const Moses::ChartHypothesis *hypo = manager.GetBestHypothesis();
448
+ ostringstream out;
449
+ if (hypo) outputChartHypo(out,hypo);
450
+
451
+ m_target_string = out.str();
452
+ m_retData["text"] = xmlrpc_c::value_string(m_target_string);
453
+
454
+ if (m_withGraphInfo) {
455
+ std::ostringstream sgstream;
456
+ manager.OutputSearchGraphMoses(sgstream);
457
+ m_retData["sg"] = xmlrpc_c::value_string(sgstream.str());
458
+ }
459
+ } // end of TranslationRequest::run_chart_decoder()
460
+
461
+ void
462
+ TranslationRequest::
463
+ pack_hypothesis(const Moses::Manager& manager,
464
+ vector<Hypothesis const* > const& edges, string const& key,
465
+ map<string, xmlrpc_c::value> & dest) const
466
+ {
467
+ // target string
468
+ ostringstream target;
469
+ BOOST_REVERSE_FOREACH(Hypothesis const* e, edges) {
470
+ manager.OutputSurface(target, *e);
471
+ }
472
+ XVERBOSE(1, "BEST TRANSLATION: " << *(manager.GetBestHypothesis())
473
+ << std::endl);
474
+ dest[key] = xmlrpc_c::value_string(target.str());
475
+
476
+ if (m_withAlignInfo) {
477
+ // if (options()->output.ReportSegmentation) {
478
+ // phrase alignment, if requested
479
+
480
+ vector<xmlrpc_c::value> p_aln;
481
+ BOOST_REVERSE_FOREACH(Hypothesis const* e, edges)
482
+ add_phrase_aln_info(*e, p_aln);
483
+ dest["align"] = xmlrpc_c::value_array(p_aln);
484
+ }
485
+
486
+ if (m_withWordAlignInfo) {
487
+ //if (options()->output.PrintAlignmentInfo) {
488
+ // word alignment, if requested
489
+ vector<xmlrpc_c::value> w_aln;
490
+ BOOST_REVERSE_FOREACH(Hypothesis const* e, edges)
491
+ e->OutputLocalWordAlignment(w_aln);
492
+ dest["word-align"] = xmlrpc_c::value_array(w_aln);
493
+ }
494
+ }
495
+
496
+ void
497
+ TranslationRequest::
498
+ pack_hypothesis(const Moses::Manager& manager, Hypothesis const* h, string const& key,
499
+ map<string, xmlrpc_c::value>& dest) const
500
+ {
501
+ using namespace std;
502
+ vector<Hypothesis const*> edges;
503
+ for (; h; h = h->GetPrevHypo())
504
+ edges.push_back(h);
505
+ pack_hypothesis(manager, edges, key, dest);
506
+ }
507
+
508
+
509
+ void
510
+ TranslationRequest::
511
+ run_phrase_decoder()
512
+ {
513
+ Manager manager(this->self());
514
+ manager.Decode();
515
+ pack_hypothesis(manager, manager.GetBestHypothesis(), "text", m_retData);
516
+ if (m_session_id)
517
+ m_retData["session-id"] = xmlrpc_c::value_int(m_session_id);
518
+
519
+ if (m_withGraphInfo) insertGraphInfo(manager,m_retData);
520
+ if (m_withTopts) insertTranslationOptions(manager,m_retData);
521
+ if (m_options->nbest.nbest_size) outputNBest(manager, m_retData);
522
+
523
+ }
524
+ }
mosesdecoder/moses/server/Updater.cpp ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ #include "Updater.h"
3
+
4
+ namespace MosesServer
5
+ {
6
+ using namespace Moses;
7
+ using namespace std;
8
+
9
+ Updater::
10
+ Updater()
11
+ {
12
+ // signature and help strings are documentation -- the client
13
+ // can query this information with a system.methodSignature and
14
+ // system.methodHelp RPC.
15
+ this->_signature = "S:S";
16
+ this->_help = "Updates stuff";
17
+ }
18
+
19
+ void
20
+ Updater::
21
+ execute(xmlrpc_c::paramList const& paramList,
22
+ xmlrpc_c::value * const retvalP)
23
+ {
24
+ #if PT_UG
25
+ const params_t params = paramList.getStruct(0);
26
+ breakOutParams(params);
27
+ Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(PhraseDictionary::GetColl()[0]);
28
+ pdsa->add(m_src, m_trg, m_aln);
29
+ XVERBOSE(1,"Done inserting\n");
30
+ *retvalP = xmlrpc_c::value_string("Phrase table updated");
31
+ #endif
32
+ };
33
+
34
+ void
35
+ Updater::
36
+ breakOutParams(const params_t& params)
37
+ {
38
+ params_t::const_iterator si = params.find("source");
39
+ if(si == params.end())
40
+ throw xmlrpc_c::fault("Missing source sentence",
41
+ xmlrpc_c::fault::CODE_PARSE);
42
+ m_src = xmlrpc_c::value_string(si->second);
43
+ XVERBOSE(1,"source = " << m_src << endl);
44
+ si = params.find("target");
45
+ if(si == params.end())
46
+ throw xmlrpc_c::fault("Missing target sentence",
47
+ xmlrpc_c::fault::CODE_PARSE);
48
+ m_trg = xmlrpc_c::value_string(si->second);
49
+ XVERBOSE(1,"target = " << m_trg << endl);
50
+ if((si = params.find("alignment")) == params.end())
51
+ throw xmlrpc_c::fault("Missing alignment", xmlrpc_c::fault::CODE_PARSE);
52
+ m_aln = xmlrpc_c::value_string(si->second);
53
+ XVERBOSE(1,"alignment = " << m_aln << endl);
54
+ m_bounded = ((si = params.find("bounded")) != params.end());
55
+ m_add2ORLM = ((si = params.find("updateORLM")) != params.end());
56
+ };
57
+
58
+ }
mosesdecoder/moses/server/Updater.h ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- c++ -*-
2
+ #pragma once
3
+
4
+ #include "moses/Util.h"
5
+ #include "moses/ChartManager.h"
6
+ #include "moses/Hypothesis.h"
7
+ #include "moses/Manager.h"
8
+ #include "moses/StaticData.h"
9
+ #include "moses/ThreadPool.h"
10
+
11
+ #if PT_UG
12
+ #include "moses/TranslationModel/UG/mmsapt.h"
13
+ #endif
14
+
15
+ #include <xmlrpc-c/base.hpp>
16
+ #include <xmlrpc-c/registry.hpp>
17
+ #include <xmlrpc-c/server_abyss.hpp>
18
+
19
+
20
+ namespace MosesServer
21
+ {
22
+ class
23
+ Updater: public xmlrpc_c::method
24
+ {
25
+
26
+ typedef std::map<std::string, xmlrpc_c::value> params_t;
27
+
28
+
29
+ std::string m_src, m_trg, m_aln;
30
+ bool m_bounded, m_add2ORLM;
31
+
32
+ public:
33
+ Updater();
34
+
35
+ void
36
+ execute(xmlrpc_c::paramList const& paramList,
37
+ xmlrpc_c::value * const retvalP);
38
+
39
+ void
40
+ breakOutParams(const params_t& params);
41
+
42
+ };
43
+
44
+ }
mosesdecoder/util/bit_packing_test.cc ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "util/bit_packing.hh"
2
+
3
+ #define BOOST_TEST_MODULE BitPackingTest
4
+ #include <boost/test/unit_test.hpp>
5
+
6
+ #include <cstring>
7
+
8
+ namespace util {
9
+ namespace {
10
+
11
+ const uint64_t test57 = 0x123456789abcdefULL;
12
+ const uint32_t test25 = 0x1234567;
13
+
14
+ BOOST_AUTO_TEST_CASE(ZeroBit57) {
15
+ char mem[16];
16
+ memset(mem, 0, sizeof(mem));
17
+ WriteInt57(mem, 0, 57, test57);
18
+ BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1));
19
+ }
20
+
21
+ BOOST_AUTO_TEST_CASE(EachBit57) {
22
+ char mem[16];
23
+ for (uint8_t b = 0; b < 8; ++b) {
24
+ memset(mem, 0, sizeof(mem));
25
+ WriteInt57(mem, b, 57, test57);
26
+ BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
27
+ }
28
+ }
29
+
30
+ BOOST_AUTO_TEST_CASE(Consecutive57) {
31
+ char mem[57+8];
32
+ memset(mem, 0, sizeof(mem));
33
+ for (uint64_t b = 0; b < 57 * 8; b += 57) {
34
+ WriteInt57(mem, b, 57, test57);
35
+ BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
36
+ }
37
+ for (uint64_t b = 0; b < 57 * 8; b += 57) {
38
+ BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
39
+ }
40
+ }
41
+
42
+ BOOST_AUTO_TEST_CASE(Consecutive25) {
43
+ char mem[25+8];
44
+ memset(mem, 0, sizeof(mem));
45
+ for (uint64_t b = 0; b < 25 * 8; b += 25) {
46
+ WriteInt25(mem, b, 25, test25);
47
+ BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1));
48
+ }
49
+ for (uint64_t b = 0; b < 25 * 8; b += 25) {
50
+ BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1));
51
+ }
52
+ }
53
+
54
+ BOOST_AUTO_TEST_CASE(Sanity) {
55
+ BitPackingSanity();
56
+ }
57
+
58
+ } // namespace
59
+ } // namespace util
mosesdecoder/util/ersatz_progress.hh ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef UTIL_ERSATZ_PROGRESS_H
2
+ #define UTIL_ERSATZ_PROGRESS_H
3
+
4
+ #include <iostream>
5
+ #include <string>
6
+ #include <stdint.h>
7
+
8
+ // Ersatz version of boost::progress so core language model doesn't depend on
9
+ // boost. Also adds option to print nothing.
10
+
11
+ namespace util {
12
+
13
+ extern const char kProgressBanner[];
14
+
15
// Progress counter. Each update compares the current count with the next
// threshold and calls Milestone() once the threshold has been reached.
class ErsatzProgress {
  public:
    // No output.
    ErsatzProgress();

    // Null means no output.  The null value is useful for passing along the ostream pointer from another caller.
    explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");

    ~ErsatzProgress();

    // Advance by one.
    ErsatzProgress &operator++() {
      ++current_;
      if (current_ >= next_) {
        Milestone();
      }
      return *this;
    }

    // Advance by `amount`.
    ErsatzProgress &operator+=(uint64_t amount) {
      current_ += amount;
      if (current_ >= next_) {
        Milestone();
      }
      return *this;
    }

    // Jump to an absolute position.
    void Set(uint64_t to) {
      current_ = to;
      if (current_ >= next_) {
        Milestone();
      }
    }

    // Jump to the completion count.
    void Finished() {
      Set(complete_);
    }

  private:
    void Milestone();

    uint64_t current_, next_, complete_;
    unsigned char stones_written_;
    std::ostream *out_;

    // noncopyable: declared but intentionally left private
    ErsatzProgress(const ErsatzProgress &other);
    ErsatzProgress &operator=(const ErsatzProgress &other);
};
54
+
55
+ } // namespace util
56
+
57
+ #endif // UTIL_ERSATZ_PROGRESS_H
mosesdecoder/util/exception.hh ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef UTIL_EXCEPTION_H
2
+ #define UTIL_EXCEPTION_H
3
+
4
+ #include "util/string_stream.hh"
5
+
6
+ #include <exception>
7
+ #include <limits>
8
+ #include <string>
9
+ #include <stdint.h>
10
+
11
+ // TODO(hieu): delete this
12
+ #include <sstream>
13
+
14
+ namespace util {
15
+
16
+ template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
17
+
18
+ class Exception : public std::exception {
19
+ public:
20
+ Exception() throw();
21
+ virtual ~Exception() throw();
22
+
23
+ const char *what() const throw() { return what_.str().c_str(); }
24
+
25
+ // For use by the UTIL_THROW macros.
26
+ void SetLocation(
27
+ const char *file,
28
+ unsigned int line,
29
+ const char *func,
30
+ const char *child_name,
31
+ const char *condition);
32
+
33
+ private:
34
+ template <class Except, class Data> friend typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
35
+
36
+ // This helps restrict operator<< defined below.
37
+ template <class T> struct ExceptionTag {
38
+ typedef T Identity;
39
+ };
40
+
41
+ StringStream what_;
42
+ };
43
+
44
+ /* This implements the normal operator<< for Exception and all its children.
45
+ * SFINAE means it only applies to Exception. Think of this as an ersatz
46
+ * boost::enable_if.
47
+ */
48
+ template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) {
49
+ // TODO(hieu): delete this.
50
+ std::stringstream moses_hack;
51
+ moses_hack << data;
52
+ e.what_ << moses_hack.str();
53
+ return e;
54
+ }
55
+
56
+ #ifdef __GNUC__
57
+ #define UTIL_FUNC_NAME __PRETTY_FUNCTION__
58
+ #else
59
+ #ifdef _WIN32
60
+ #define UTIL_FUNC_NAME __FUNCTION__
61
+ #else
62
+ #define UTIL_FUNC_NAME NULL
63
+ #endif
64
+ #endif
65
+
66
+ /* Create an instance of Exception, add the message Modify, and throw it.
67
+ * Modify is appended to the what() message and can contain << for ostream
68
+ * operations.
69
+ *
70
+ * do .. while kludge to swallow trailing ; character
71
+ * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html .
72
+ * Arg can be a constructor argument to the exception.
73
+ */
74
+ #define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \
75
+ Exception UTIL_e Arg; \
76
+ UTIL_e.SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, #Exception, Condition); \
77
+ UTIL_e << Modify; \
78
+ throw UTIL_e; \
79
+ } while (0)
80
+
81
+ #define UTIL_THROW_ARG(Exception, Arg, Modify) \
82
+ UTIL_THROW_BACKEND(NULL, Exception, Arg, Modify)
83
+
84
+ #define UTIL_THROW(Exception, Modify) \
85
+ UTIL_THROW_BACKEND(NULL, Exception, , Modify);
86
+
87
+ #define UTIL_THROW2(Modify) \
88
+ UTIL_THROW_BACKEND(NULL, util::Exception, , Modify);
89
+
90
+ #if __GNUC__ >= 3
91
+ #define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0)
92
+ #else
93
+ #define UTIL_UNLIKELY(x) (x)
94
+ #endif
95
+
96
+ #if __GNUC__ >= 3
97
+ #define UTIL_LIKELY(x) __builtin_expect (!!(x), 1)
98
+ #else
99
+ #define UTIL_LIKELY(x) (x)
100
+ #endif
101
+
102
+ #define UTIL_THROW_IF_ARG(Condition, Exception, Arg, Modify) do { \
103
+ if (UTIL_UNLIKELY(Condition)) { \
104
+ UTIL_THROW_BACKEND(#Condition, Exception, Arg, Modify); \
105
+ } \
106
+ } while (0)
107
+
108
+ #define UTIL_THROW_IF(Condition, Exception, Modify) \
109
+ UTIL_THROW_IF_ARG(Condition, Exception, , Modify)
110
+
111
+ #define UTIL_THROW_IF2(Condition, Modify) \
112
+ UTIL_THROW_IF_ARG(Condition, util::Exception, , Modify)
113
+
114
+ // Exception that records errno and adds it to the message.
115
+ class ErrnoException : public Exception {
116
+ public:
117
+ ErrnoException() throw();
118
+
119
+ virtual ~ErrnoException() throw();
120
+
121
+ int Error() const throw() { return errno_; }
122
+
123
+ private:
124
+ int errno_;
125
+ };
126
+
127
+ // file wasn't there, or couldn't be open for some reason
128
+ class FileOpenException : public Exception {
129
+ public:
130
+ FileOpenException() throw() {}
131
+ ~FileOpenException() throw() {}
132
+ };
133
+
134
+ // Utilities for overflow checking.
135
+ class OverflowException : public Exception {
136
+ public:
137
+ OverflowException() throw();
138
+ ~OverflowException() throw();
139
+ };
140
+
141
+ template <unsigned len> inline std::size_t CheckOverflowInternal(uint64_t value) {
142
+ UTIL_THROW_IF(value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), OverflowException, "Integer overflow detected. This model is too big for 32-bit code.");
143
+ return value;
144
+ }
145
+
146
+ template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) {
147
+ return value;
148
+ }
149
+
150
+ inline std::size_t CheckOverflow(uint64_t value) {
151
+ return CheckOverflowInternal<sizeof(std::size_t)>(value);
152
+ }
153
+
154
+ #if defined(_WIN32) || defined(_WIN64)
155
+ /* Thrown for Windows specific operations. */
156
+ class WindowsException : public Exception {
157
+ public:
158
+ WindowsException() throw();
159
+ ~WindowsException() throw();
160
+ };
161
+ #endif
162
+
163
+ } // namespace util
164
+
165
+ #endif // UTIL_EXCEPTION_H
mosesdecoder/util/fake_ostream.hh ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef UTIL_FAKE_OSTREAM_H
2
+ #define UTIL_FAKE_OSTREAM_H
3
+
4
+ #include "util/float_to_string.hh"
5
+ #include "util/integer_to_string.hh"
6
+ #include "util/string_piece.hh"
7
+
8
+ #include <cassert>
9
+ #include <limits>
10
+
11
+ #include <stdint.h>
12
+
13
+ namespace util {
14
+
15
+ /* Like std::ostream but without being incredibly slow.
16
+ * Supports most of the built-in types except for long double.
17
+ *
18
+ * The FakeOStream class is intended to be inherited from. The inherting class
19
+ * should provide:
20
+ * public:
21
+ * Derived &flush();
22
+ * Derived &write(const void *data, std::size_t length);
23
+ *
24
+ * private: or protected:
25
+ * friend class FakeOStream;
26
+ * char *Ensure(std::size_t amount);
27
+ * void AdvanceTo(char *to);
28
+ *
29
+ * The Ensure function makes enough space for an in-place write and returns
30
+ * where to write. The AdvanceTo function happens after the write, saying how
31
+ * much was actually written.
32
+ *
33
+ * Precondition:
34
+ * amount <= kToStringMaxBytes for in-place writes.
35
+ */
36
+ template <class Derived> class FakeOStream {
37
+ public:
38
+ FakeOStream() {}
39
+
40
+ // This also covers std::string and char*
41
+ Derived &operator<<(StringPiece str) {
42
+ return C().write(str.data(), str.size());
43
+ }
44
+
45
+ // Handle integers by size and signedness.
46
+ private:
47
+ template <class Arg> struct EnableIfKludge {
48
+ typedef Derived type;
49
+ };
50
+ template <class From, unsigned Length = sizeof(From), bool Signed = std::numeric_limits<From>::is_signed, bool IsInteger = std::numeric_limits<From>::is_integer> struct Coerce {};
51
+
52
+ template <class From> struct Coerce<From, 2, false, true> { typedef uint16_t To; };
53
+ template <class From> struct Coerce<From, 4, false, true> { typedef uint32_t To; };
54
+ template <class From> struct Coerce<From, 8, false, true> { typedef uint64_t To; };
55
+
56
+ template <class From> struct Coerce<From, 2, true, true> { typedef int16_t To; };
57
+ template <class From> struct Coerce<From, 4, true, true> { typedef int32_t To; };
58
+ template <class From> struct Coerce<From, 8, true, true> { typedef int64_t To; };
59
+ public:
60
+ template <class From> typename EnableIfKludge<typename Coerce<From>::To>::type &operator<<(const From value) {
61
+ return CallToString(static_cast<typename Coerce<From>::To>(value));
62
+ }
63
+
64
+ // Character types that get copied as bytes instead of displayed as integers.
65
+ Derived &operator<<(char val) { return put(val); }
66
+ Derived &operator<<(signed char val) { return put(static_cast<char>(val)); }
67
+ Derived &operator<<(unsigned char val) { return put(static_cast<char>(val)); }
68
+
69
+ Derived &operator<<(bool val) { return put(val + '0'); }
70
+ // enums will fall back to int but are not caught by the template.
71
+ Derived &operator<<(int val) { return CallToString(static_cast<typename Coerce<int>::To>(val)); }
72
+
73
+ Derived &operator<<(float val) { return CallToString(val); }
74
+ Derived &operator<<(double val) { return CallToString(val); }
75
+
76
+ // This is here to catch all the other pointer types.
77
+ Derived &operator<<(const void *value) { return CallToString(value); }
78
+ // This is here because the above line also catches const char*.
79
+ Derived &operator<<(const char *value) { return *this << StringPiece(value); }
80
+ Derived &operator<<(char *value) { return *this << StringPiece(value); }
81
+
82
+ Derived &put(char val) {
83
+ char *c = C().Ensure(1);
84
+ *c = val;
85
+ C().AdvanceTo(++c);
86
+ return C();
87
+ }
88
+
89
+ char widen(char val) const { return val; }
90
+
91
+ private:
92
+ // References to derived class for convenience.
93
+ Derived &C() {
94
+ return *static_cast<Derived*>(this);
95
+ }
96
+
97
+ const Derived &C() const {
98
+ return *static_cast<const Derived*>(this);
99
+ }
100
+
101
+ // This is separate to prevent an infinite loop if the compiler considers
102
+ // types the same (i.e. gcc std::size_t and uint64_t or uint32_t).
103
+ template <class T> Derived &CallToString(const T value) {
104
+ C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf<T>::kBytes)));
105
+ return C();
106
+ }
107
+ };
108
+
109
+ } // namespace
110
+
111
+ #endif // UTIL_FAKE_OSTREAM_H
mosesdecoder/util/file_piece.hh ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef UTIL_FILE_PIECE_H
2
+ #define UTIL_FILE_PIECE_H
3
+
4
+ #include "util/ersatz_progress.hh"
5
+ #include "util/exception.hh"
6
+ #include "util/file.hh"
7
+ #include "util/mmap.hh"
8
+ #include "util/read_compressed.hh"
9
+ #include "util/string_piece.hh"
10
+
11
+ #include <cstddef>
12
+ #include <iosfwd>
13
+ #include <string>
14
+ #include <cassert>
15
+ #include <stdint.h>
16
+
17
+ namespace util {
18
+
19
+ class ParseNumberException : public Exception {
20
+ public:
21
+ explicit ParseNumberException(StringPiece value) throw();
22
+ ~ParseNumberException() throw() {}
23
+ };
24
+
25
+ extern const bool kSpaces[256];
26
+
27
+ // Memory backing the returned StringPiece may vanish on the next call.
28
+ class FilePiece {
29
+ public:
30
+ // 1 MB default.
31
+ explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
32
+ // Takes ownership of fd. name is used for messages.
33
+ explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
34
+
35
+ /* Read from an istream. Don't use this if you can avoid it. Raw fd IO is
36
+ * much faster. But sometimes you just have an istream like Boost's HTTP
37
+ * server and want to parse it the same way.
38
+ * name is just used for messages and FileName().
39
+ */
40
+ explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576);
41
+
42
+ ~FilePiece();
43
+
44
+ char get() {
45
+ if (position_ == position_end_) {
46
+ Shift();
47
+ if (at_end_) throw EndOfFileException();
48
+ }
49
+ return *(position_++);
50
+ }
51
+
52
+ // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace().
53
+ StringPiece ReadDelimited(const bool *delim = kSpaces) {
54
+ SkipSpaces(delim);
55
+ return Consume(FindDelimiterOrEOF(delim));
56
+ }
57
+
58
+ /// Read word until the line or file ends.
59
+ bool ReadWordSameLine(StringPiece &to, const bool *delim = kSpaces) {
60
+ assert(delim[static_cast<unsigned char>('\n')]);
61
+ // Skip non-enter spaces.
62
+ for (; ; ++position_) {
63
+ if (position_ == position_end_) {
64
+ try {
65
+ Shift();
66
+ } catch (const util::EndOfFileException &e) { return false; }
67
+ // And break out at end of file.
68
+ if (position_ == position_end_) return false;
69
+ }
70
+ if (!delim[static_cast<unsigned char>(*position_)]) break;
71
+ if (*position_ == '\n') return false;
72
+ }
73
+ // We can't be at the end of file because there's at least one character open.
74
+ to = Consume(FindDelimiterOrEOF(delim));
75
+ return true;
76
+ }
77
+
78
+ /** Read a line of text from the file.
79
+ *
80
+ * Unlike ReadDelimited, this includes leading spaces and consumes the
81
+ * delimiter. It is similar to getline in that way.
82
+ *
83
+ * If strip_cr is true, any trailing carriate return (as would be found on
84
+ * a file written on Windows) will be left out of the returned line.
85
+ *
86
+ * Throws EndOfFileException if the end of the file is encountered. If the
87
+ * file does not end in a newline, this could mean that the last line is
88
+ * never read.
89
+ */
90
+ StringPiece ReadLine(char delim = '\n', bool strip_cr = true);
91
+
92
+ /** Read a line of text from the file, or return false on EOF.
93
+ *
94
+ * This is like ReadLine, except it returns false where ReadLine throws
95
+ * EndOfFileException. Like ReadLine it may not read the last line in the
96
+ * file if the file does not end in a newline.
97
+ *
98
+ * If strip_cr is true, any trailing carriate return (as would be found on
99
+ * a file written on Windows) will be left out of the returned line.
100
+ */
101
+ bool ReadLineOrEOF(StringPiece &to, char delim = '\n', bool strip_cr = true);
102
+
103
+ float ReadFloat();
104
+ double ReadDouble();
105
+ long int ReadLong();
106
+ unsigned long int ReadULong();
107
+
108
+ // Skip spaces defined by isspace.
109
+ void SkipSpaces(const bool *delim = kSpaces) {
110
+ assert(position_ <= position_end_);
111
+ for (; ; ++position_) {
112
+ if (position_ == position_end_) {
113
+ Shift();
114
+ // And break out at end of file.
115
+ if (position_ == position_end_) return;
116
+ }
117
+ assert(position_ < position_end_);
118
+ if (!delim[static_cast<unsigned char>(*position_)]) return;
119
+ }
120
+ }
121
+
122
+ uint64_t Offset() const {
123
+ return position_ - data_.begin() + mapped_offset_;
124
+ }
125
+
126
+ const std::string &FileName() const { return file_name_; }
127
+
128
+ private:
129
+ void InitializeNoRead(const char *name, std::size_t min_buffer);
130
+ // Calls InitializeNoRead, so don't call both.
131
+ void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
132
+
133
+ template <class T> T ReadNumber();
134
+
135
+ StringPiece Consume(const char *to) {
136
+ assert(to >= position_);
137
+ StringPiece ret(position_, to - position_);
138
+ position_ = to;
139
+ return ret;
140
+ }
141
+
142
+ const char *FindDelimiterOrEOF(const bool *delim = kSpaces);
143
+
144
+ void Shift();
145
+ // Backends to Shift().
146
+ void MMapShift(uint64_t desired_begin);
147
+
148
+ void TransitionToRead();
149
+ void ReadShift();
150
+
151
+ const char *position_, *last_space_, *position_end_;
152
+
153
+ scoped_fd file_;
154
+ const uint64_t total_size_;
155
+ const uint64_t page_;
156
+
157
+ std::size_t default_map_size_;
158
+ uint64_t mapped_offset_;
159
+
160
+ // Order matters: file_ should always be destroyed after this.
161
+ scoped_memory data_;
162
+
163
+ bool at_end_;
164
+ bool fallback_to_read_;
165
+
166
+ ErsatzProgress progress_;
167
+
168
+ std::string file_name_;
169
+
170
+ ReadCompressed fell_back_;
171
+ };
172
+
173
+ } // namespace util
174
+
175
+ #endif // UTIL_FILE_PIECE_H
mosesdecoder/util/file_piece_test.cc ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Tests might fail if you have creative characters in your path. Sue me.
2
+ #include "util/file_piece.hh"
3
+
4
+ #include "util/file_stream.hh"
5
+ #include "util/file.hh"
6
+ #include "util/scoped.hh"
7
+
8
+ #define BOOST_TEST_MODULE FilePieceTest
9
+ #include <boost/test/unit_test.hpp>
10
+ #include <fstream>
11
+ #include <iostream>
12
+ #include <cstdio>
13
+ #include <sys/types.h>
14
+ #include <sys/stat.h>
15
+
16
+ namespace util {
17
+ namespace {
18
+
19
+ std::string FileLocation() {
20
+ if (boost::unit_test::framework::master_test_suite().argc < 2) {
21
+ return "file_piece.cc";
22
+ }
23
+ std::string ret(boost::unit_test::framework::master_test_suite().argv[1]);
24
+ return ret;
25
+ }
26
+
27
+ /* istream */
28
+ BOOST_AUTO_TEST_CASE(IStream) {
29
+ std::fstream ref(FileLocation().c_str(), std::ios::in);
30
+ std::fstream backing(FileLocation().c_str(), std::ios::in);
31
+ FilePiece test(backing);
32
+ std::string ref_line;
33
+ while (getline(ref, ref_line)) {
34
+ StringPiece test_line(test.ReadLine());
35
+ BOOST_CHECK_EQUAL(ref_line, test_line);
36
+ }
37
+ BOOST_CHECK_THROW(test.get(), EndOfFileException);
38
+ BOOST_CHECK_THROW(test.get(), EndOfFileException);
39
+ }
40
+
41
+ /* mmap implementation */
42
+ BOOST_AUTO_TEST_CASE(MMapReadLine) {
43
+ std::fstream ref(FileLocation().c_str(), std::ios::in);
44
+ FilePiece test(FileLocation().c_str(), NULL, 1);
45
+ std::string ref_line;
46
+ while (getline(ref, ref_line)) {
47
+ StringPiece test_line(test.ReadLine());
48
+ // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
49
+ if (!test_line.empty() || !ref_line.empty()) {
50
+ BOOST_CHECK_EQUAL(ref_line, test_line);
51
+ }
52
+ }
53
+ BOOST_CHECK_THROW(test.get(), EndOfFileException);
54
+ }
55
+
56
+ #if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__)
57
+ /* Apple isn't happy with the popen, fileno, dup. And I don't want to
58
+ * reimplement popen. This is an issue with the test.
59
+ */
60
+ /* read() implementation */
61
+ BOOST_AUTO_TEST_CASE(StreamReadLine) {
62
+ std::fstream ref(FileLocation().c_str(), std::ios::in);
63
+
64
+ std::string popen_args = "cat \"";
65
+ popen_args += FileLocation();
66
+ popen_args += '"';
67
+
68
+ FILE *catter = popen(popen_args.c_str(), "r");
69
+ BOOST_REQUIRE(catter);
70
+
71
+ FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1);
72
+ std::string ref_line;
73
+ while (getline(ref, ref_line)) {
74
+ StringPiece test_line(test.ReadLine());
75
+ // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
76
+ if (!test_line.empty() || !ref_line.empty()) {
77
+ BOOST_CHECK_EQUAL(ref_line, test_line);
78
+ }
79
+ }
80
+ BOOST_CHECK_THROW(test.get(), EndOfFileException);
81
+ BOOST_REQUIRE(!pclose(catter));
82
+ }
83
+ #endif
84
+
85
+ #ifdef HAVE_ZLIB
86
+
87
+ // gzip file
88
+ BOOST_AUTO_TEST_CASE(PlainZipReadLine) {
89
+ std::string location(FileLocation());
90
+ std::fstream ref(location.c_str(), std::ios::in);
91
+
92
+ std::string command("gzip <\"");
93
+ command += location + "\" >\"" + location + "\".gz";
94
+
95
+ BOOST_REQUIRE_EQUAL(0, system(command.c_str()));
96
+ FilePiece test((location + ".gz").c_str(), NULL, 1);
97
+ unlink((location + ".gz").c_str());
98
+ std::string ref_line;
99
+ while (getline(ref, ref_line)) {
100
+ StringPiece test_line(test.ReadLine());
101
+ // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
102
+ if (!test_line.empty() || !ref_line.empty()) {
103
+ BOOST_CHECK_EQUAL(ref_line, test_line);
104
+ }
105
+ }
106
+ BOOST_CHECK_THROW(test.get(), EndOfFileException);
107
+ }
108
+
109
+ // gzip stream. Apple doesn't like popen, fileno, dup. This is an issue with
110
+ // the test.
111
+ #if !defined __APPLE__ && !defined __MINGW32__
112
+ BOOST_AUTO_TEST_CASE(StreamZipReadLine) {
113
+ std::fstream ref(FileLocation().c_str(), std::ios::in);
114
+
115
+ std::string command("gzip <\"");
116
+ command += FileLocation() + "\"";
117
+
118
+ FILE * catter = popen(command.c_str(), "r");
119
+ BOOST_REQUIRE(catter);
120
+
121
+ FilePiece test(dup(fileno(catter)), "file_piece.cc.gz", NULL, 1);
122
+ std::string ref_line;
123
+ while (getline(ref, ref_line)) {
124
+ StringPiece test_line(test.ReadLine());
125
+ // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
126
+ if (!test_line.empty() || !ref_line.empty()) {
127
+ BOOST_CHECK_EQUAL(ref_line, test_line);
128
+ }
129
+ }
130
+ BOOST_CHECK_THROW(test.get(), EndOfFileException);
131
+ BOOST_REQUIRE(!pclose(catter));
132
+ }
133
+ #endif // __APPLE__
134
+
135
+ #endif // HAVE_ZLIB
136
+
137
+ BOOST_AUTO_TEST_CASE(Numbers) {
138
+ scoped_fd file(MakeTemp(FileLocation()));
139
+ const float floating = 3.2;
140
+ {
141
+ util::FileStream writing(file.get());
142
+ writing << "94389483984398493890287 " << floating << " 5";
143
+ }
144
+ SeekOrThrow(file.get(), 0);
145
+ util::FilePiece f(file.release());
146
+ BOOST_CHECK_THROW(f.ReadULong(), ParseNumberException);
147
+ BOOST_CHECK_EQUAL("94389483984398493890287", f.ReadDelimited());
148
+ // Yes, exactly equal. Isn't double-conversion wonderful?
149
+ BOOST_CHECK_EQUAL(floating, f.ReadFloat());
150
+ BOOST_CHECK_EQUAL(5, f.ReadULong());
151
+ }
152
+
153
+ } // namespace
154
+ } // namespace util
mosesdecoder/util/generator.hh ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // generator/continuation for C++
4
+ // author: Andrew Fedoniouk @ terrainformatica.com
5
+ // idea borrowed from: "coroutines in C" Simon Tatham,
6
+ // http://www.chiark.greenend.org.uk/~sgtatham/coroutines.html
7
+ // BSD license
8
+
9
// CRTP base for generators declared with the $generator / $emit / $yield
// macros. T is the derived generator type; it must be default-constructible
// and copy-assignable because _push() snapshots the whole derived object.
//
// NOTE: copying a generator whose _stack is non-null makes two objects refer
// to the same saved-state chain; only _push()/_pop() are expected to do that.
template<typename T>
struct _generator
{
  // Head of a singly linked list of saved generator states. Each saved state
  // is a heap-allocated T whose own _stack points at the next older state.
  T* _stack;
  // Value dispatched on by the switch that $emit opens; $yield stores
  // __LINE__ here so the next call resumes at the matching case label.
  int _line;

  _generator():_stack(0), _line(-1) {}

  // Save a copy of the current (derived) state on top of the stack.
  void _push() { T* n = new T; *n = *static_cast<T*>(this); _stack = n; }

  // Restore the most recently saved state into *this and free it.
  // Returns false, changing nothing, when no state is saved.
  bool _pop() {
    if(!_stack) return false;
    T* t = _stack;
    *static_cast<T*>(this) = *_stack;  // also restores the older _stack link
    t->_stack = 0;                     // detach so deleting t frees only t
    delete t;
    return true;
  }

  // Free any states still saved. This must not go through _pop(): by the time
  // a base-class destructor runs, the derived members are already destroyed,
  // so assigning a saved state back into *this would be undefined behaviour.
  ~_generator() {
    while(_stack) {
      T* t = _stack;
      _stack = t->_stack;
      t->_stack = 0;  // keep t's own destructor from walking the rest of the chain
      delete t;
    }
  }
};
+
20
+ #define $generator(NAME) struct NAME : public _generator<NAME>
21
+
22
+ #define $emit(T) bool operator()(T& _rv) { \
23
+ if(_line < 0) _line=0; \
24
+ $START: switch(_line) { case 0:;
25
+
26
+ #define $stop } _line = 0; if(_pop()) goto $START; return false; }
27
+
28
+ #define $restart(WITH) { _push(); _stack->_line = __LINE__; _line=0; WITH; goto $START; case __LINE__:; }
29
+
30
+ #define $yield(V) \
31
+ do {\
32
+ _line=__LINE__;\
33
+ _rv = (V); return true; case __LINE__:;\
34
+ } while (0)
mosesdecoder/util/getopt.c ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ POSIX getopt for Windows
3
+
4
+ AT&T Public License
5
+
6
+ Code given out at the 1985 UNIFORUM conference in Dallas.
7
+ */
8
+
9
+ #ifndef __GNUC__
10
+
11
+ #include "getopt.hh"
12
+ #include <stdio.h>
13
+ #include <string.h>
14
+
15
+ #define NULL 0
16
+ #define EOF (-1)
17
/* Print a diagnostic of the form "<argv[0]><s><c>\n" to stderr unless the
 * caller has cleared opterr. The original implementation used write(2) and
 * emitted the offending option character followed by a newline; the stdio
 * version keeps that trailing newline. Wrapped in do/while so the macro
 * behaves as a single statement.
 */
#define ERR(s, c) do { \
    if (opterr) { \
      fputs(argv[0], stderr); \
      fputs((s), stderr); \
      fputc((c), stderr); \
      fputc('\n', stderr); \
    } \
  } while (0)

int opterr = 1;   /* non-zero: print diagnostics to stderr */
int optind = 1;   /* index of the next argv element to examine */
int optopt;       /* option character examined by the last call */
char *optarg;     /* argument of the last option, or NULL */

/* Minimal POSIX-style getopt.
 *
 * argc, argv: the argument vector as passed to main().
 * opts: accepted option letters; a letter followed by ':' takes an argument.
 *
 * Returns the option character, '?' for an unknown option or a missing
 * argument, or EOF when argv[optind] is not an option (or is "--", which is
 * skipped). Not reentrant: the position inside a group of clustered options
 * such as "-abc" is kept in a function-local static.
 */
int getopt(int argc, char **argv, char *opts)
{
  static int sp = 1;  /* index of the next option letter within argv[optind] */
  int c;
  char *cp;

  if (sp == 1) {
    /* Starting a new argv element: stop at the first non-option. */
    if (optind >= argc ||
        argv[optind][0] != '-' || argv[optind][1] == '\0') {
      return EOF;
    }
    if (strcmp(argv[optind], "--") == 0) {
      optind++;
      return EOF;
    }
  }

  optopt = c = argv[optind][sp];
  if (c == ':' || (cp = strchr(opts, c)) == NULL) {
    ERR(": illegal option -- ", c);
    /* Step past the bad letter, moving to the next argv element if done. */
    if (argv[optind][++sp] == '\0') {
      optind++;
      sp = 1;
    }
    return '?';
  }

  if (*++cp == ':') {
    /* Option takes an argument: either the rest of this element ("-ofile")
     * or the whole next element ("-o file"). */
    if (argv[optind][sp + 1] != '\0') {
      optarg = &argv[optind++][sp + 1];
    } else if (++optind >= argc) {
      ERR(": option requires an argument -- ", c);
      sp = 1;
      return '?';
    } else {
      optarg = argv[optind++];
    }
    sp = 1;
  } else {
    if (argv[optind][++sp] == '\0') {
      sp = 1;
      optind++;
    }
    optarg = NULL;
  }
  return c;
}
77
+
78
+ #endif /* __GNUC__ */
mosesdecoder/util/integer_to_string_test.cc ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
2
+ #include "util/integer_to_string.hh"
3
+ #include "util/string_piece.hh"
4
+
5
+ #define BOOST_TEST_MODULE IntegerToStringTest
6
+ #include <boost/test/unit_test.hpp>
7
+ #include <boost/lexical_cast.hpp>
8
+
9
+ #include <limits>
10
+
11
+ namespace util {
12
+ namespace {
13
+
14
+ template <class T> void TestValue(const T value) {
15
+ char buf[ToStringBuf<T>::kBytes];
16
+ StringPiece result(buf, ToString(value, buf) - buf);
17
+ BOOST_REQUIRE_GE(static_cast<std::size_t>(ToStringBuf<T>::kBytes), result.size());
18
+ if (value) {
19
+ BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), result);
20
+ } else {
21
+ // Platforms can do void * as 0x0 or 0.
22
+ BOOST_CHECK(result == "0x0" || result == "0");
23
+ }
24
+ }
25
+
26
+ template <class T> void TestCorners() {
27
+ TestValue(std::numeric_limits<T>::min());
28
+ TestValue(std::numeric_limits<T>::max());
29
+ TestValue((T)0);
30
+ TestValue((T)-1);
31
+ TestValue((T)1);
32
+ }
33
+
34
+ BOOST_AUTO_TEST_CASE(Corners) {
35
+ TestCorners<uint16_t>();
36
+ TestCorners<uint32_t>();
37
+ TestCorners<uint64_t>();
38
+ TestCorners<int16_t>();
39
+ TestCorners<int32_t>();
40
+ TestCorners<int64_t>();
41
+ TestCorners<const void*>();
42
+ }
43
+
44
+ template <class T> void TestAll() {
45
+ for (T i = std::numeric_limits<T>::min(); i < std::numeric_limits<T>::max(); ++i) {
46
+ TestValue(i);
47
+ }
48
+ TestValue(std::numeric_limits<T>::max());
49
+ }
50
+
51
+ BOOST_AUTO_TEST_CASE(Short) {
52
+ TestAll<uint16_t>();
53
+ TestAll<int16_t>();
54
+ }
55
+
56
+ template <class T> void Test10s() {
57
+ for (T i = 1; i < std::numeric_limits<T>::max() / 10; i *= 10) {
58
+ TestValue(i);
59
+ TestValue(i - 1);
60
+ TestValue(i + 1);
61
+ }
62
+ }
63
+
64
+ BOOST_AUTO_TEST_CASE(Tens) {
65
+ Test10s<uint64_t>();
66
+ Test10s<int64_t>();
67
+ Test10s<uint32_t>();
68
+ Test10s<int32_t>();
69
+ }
70
+
71
+ BOOST_AUTO_TEST_CASE(Pointers) {
72
+ for (uintptr_t i = 1; i < std::numeric_limits<uintptr_t>::max() / 10; i *= 10) {
73
+ TestValue((const void*)i);
74
+ }
75
+ for (uintptr_t i = 0; i < 256; ++i) {
76
+ TestValue((const void*)i);
77
+ TestValue((const void*)(i + 0xf00));
78
+ }
79
+ }
80
+
81
+ }} // namespaces