sleepyhead111 committed on
Commit
1747e32
·
verified ·
1 Parent(s): dc27c50

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. mosesdecoder/moses/BitmapContainer.cpp +498 -0
  2. mosesdecoder/moses/Bitmaps.h +32 -0
  3. mosesdecoder/moses/ChartManager.h +162 -0
  4. mosesdecoder/moses/ChartTranslationOptions.cpp +168 -0
  5. mosesdecoder/moses/DecodeStepGeneration.cpp +169 -0
  6. mosesdecoder/moses/FloydWarshall.cpp +36 -0
  7. mosesdecoder/moses/HypothesisStack.h +64 -0
  8. mosesdecoder/moses/Sentence.cpp +372 -0
  9. mosesdecoder/moses/Syntax/Cube.h +62 -0
  10. mosesdecoder/moses/Syntax/CubeQueue.cpp +37 -0
  11. mosesdecoder/moses/Syntax/CubeQueue.h +52 -0
  12. mosesdecoder/moses/Syntax/InputWeightFF.cpp +48 -0
  13. mosesdecoder/moses/Syntax/Manager.cpp +229 -0
  14. mosesdecoder/moses/Syntax/NonTerminalMap.h +85 -0
  15. mosesdecoder/moses/Syntax/PHyperedge.h +21 -0
  16. mosesdecoder/moses/Syntax/RuleTableFF.h +60 -0
  17. mosesdecoder/moses/Syntax/SHyperedgeBundle.h +31 -0
  18. mosesdecoder/moses/Syntax/SVertexRecombinationHasher.h +26 -0
  19. mosesdecoder/moses/TranslationModel/CompactPT/BlockHashIndex.cpp +424 -0
  20. mosesdecoder/moses/TranslationModel/CompactPT/ConsistentPhrases.h +112 -0
  21. mosesdecoder/moses/TranslationModel/CompactPT/Jamfile +17 -0
  22. mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp +450 -0
  23. mosesdecoder/moses/TranslationModel/CompactPT/MmapAllocator.h +202 -0
  24. mosesdecoder/moses/TranslationModel/CompactPT/MonotonicVector.h +230 -0
  25. mosesdecoder/moses/TranslationModel/CompactPT/PhraseDecoder.h +144 -0
  26. mosesdecoder/moses/TranslationModel/CompactPT/PhraseTableCreator.h +412 -0
  27. mosesdecoder/moses/TranslationModel/Scope3Parser/Parser.cpp +198 -0
  28. mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc +434 -0
  29. mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h +83 -0
  30. mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.cc +31 -0
  31. mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.h +30 -0
  32. mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc +51 -0
  33. mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h +21 -0
  34. mosesdecoder/moses/TranslationModel/UG/mm/Makefile.x +105 -0
  35. mosesdecoder/moses/TranslationModel/UG/mm/calc-coverage.cc +57 -0
  36. mosesdecoder/moses/TranslationModel/UG/mm/mmlex-build.cc +348 -0
  37. mosesdecoder/moses/TranslationModel/UG/mm/mtt-build.cc +498 -0
  38. mosesdecoder/moses/TranslationModel/UG/mm/mtt-dump.cc +166 -0
  39. mosesdecoder/moses/TranslationModel/UG/mm/mtt.count.cc +77 -0
  40. mosesdecoder/moses/TranslationModel/UG/mm/num_read_write.cc +74 -0
  41. mosesdecoder/moses/TranslationModel/UG/mm/test-http-client.cc +27 -0
  42. mosesdecoder/moses/TranslationModel/UG/mm/test-xml-escaping.cc +13 -0
  43. mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.cc +594 -0
  44. mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.h +176 -0
  45. mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.cc +420 -0
  46. mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.cc +171 -0
  47. mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.h +782 -0
  48. mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_agenda.h +188 -0
  49. mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +58 -0
  50. mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_moses.h +87 -0
mosesdecoder/moses/BitmapContainer.cpp ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include <algorithm>
23
+ #include <limits>
24
+ #include <utility>
25
+
26
+ #include "BitmapContainer.h"
27
+ #include "HypothesisStackCubePruning.h"
28
+ #include "moses/FF/DistortionScoreProducer.h"
29
+ #include "TranslationOptionList.h"
30
+ #include "Manager.h"
31
+
32
+ namespace Moses
33
+ {
34
+
35
+ class HypothesisScoreOrdererNoDistortion
36
+ {
37
+ public:
38
+ bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const {
39
+ const float scoreA = hypoA->GetScore();
40
+ const float scoreB = hypoB->GetScore();
41
+
42
+ if (scoreA > scoreB) {
43
+ return true;
44
+ } else if (scoreA < scoreB) {
45
+ return false;
46
+ } else {
47
+ return hypoA < hypoB;
48
+ }
49
+ }
50
+ };
51
+
52
+ class HypothesisScoreOrdererWithDistortion
53
+ {
54
+ private:
55
+ bool m_deterministic;
56
+
57
+ public:
58
+ HypothesisScoreOrdererWithDistortion(const Range* transOptRange,
59
+ const bool deterministic = false)
60
+ : m_deterministic(deterministic)
61
+ , m_transOptRange(transOptRange) {
62
+ m_totalWeightDistortion = 0;
63
+ const StaticData &staticData = StaticData::Instance();
64
+
65
+ const std::vector<const DistortionScoreProducer*> &ffs = DistortionScoreProducer::GetDistortionFeatureFunctions();
66
+ std::vector<const DistortionScoreProducer*>::const_iterator iter;
67
+ for (iter = ffs.begin(); iter != ffs.end(); ++iter) {
68
+ const DistortionScoreProducer *ff = *iter;
69
+
70
+ float weight =staticData.GetAllWeights().GetScoreForProducer(ff);
71
+ m_totalWeightDistortion += weight;
72
+ }
73
+ }
74
+
75
+ const Range* m_transOptRange;
76
+ float m_totalWeightDistortion;
77
+
78
+ bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const {
79
+ UTIL_THROW_IF2(m_transOptRange == NULL, "Words range not set");
80
+
81
+
82
+ const float distortionScoreA = DistortionScoreProducer::CalculateDistortionScore(
83
+ *hypoA,
84
+ hypoA->GetCurrSourceWordsRange(),
85
+ *m_transOptRange,
86
+ hypoA->GetWordsBitmap().GetFirstGapPos()
87
+ );
88
+ const float distortionScoreB = DistortionScoreProducer::CalculateDistortionScore(
89
+ *hypoB,
90
+ hypoB->GetCurrSourceWordsRange(),
91
+ *m_transOptRange,
92
+ hypoB->GetWordsBitmap().GetFirstGapPos()
93
+ );
94
+
95
+
96
+ const float scoreA = hypoA->GetScore() + distortionScoreA * m_totalWeightDistortion;
97
+ const float scoreB = hypoB->GetScore() + distortionScoreB * m_totalWeightDistortion;
98
+
99
+
100
+ if (scoreA > scoreB) {
101
+ return true;
102
+ } else if (scoreA < scoreB) {
103
+ return false;
104
+ } else {
105
+ if (m_deterministic) {
106
+ // Equal scores: break ties by comparing target phrases
107
+ return (hypoA->GetCurrTargetPhrase().Compare(hypoB->GetCurrTargetPhrase()) < 0);
108
+ }
109
+ // Fallback: non-deterministic sort
110
+ return hypoA < hypoB;
111
+ }
112
+ }
113
+
114
+ };
115
+
116
+ ////////////////////////////////////////////////////////////////////////////////
117
+ // BackwardsEdge Code
118
+ ////////////////////////////////////////////////////////////////////////////////
119
+
120
+ BackwardsEdge::BackwardsEdge(const BitmapContainer &prevBitmapContainer
121
+ , BitmapContainer &parent
122
+ , const TranslationOptionList &translations
123
+ , const SquareMatrix &estimatedScores,
124
+ const InputType& itype,
125
+ const bool deterministic)
126
+ : m_initialized(false)
127
+ , m_prevBitmapContainer(prevBitmapContainer)
128
+ , m_parent(parent)
129
+ , m_translations(translations)
130
+ , m_estimatedScores(estimatedScores)
131
+ , m_deterministic(deterministic)
132
+ , m_seenPosition()
133
+ {
134
+
135
+ // If either dimension is empty, we haven't got anything to do.
136
+ if(m_prevBitmapContainer.GetHypotheses().size() == 0 || m_translations.size() == 0) {
137
+ VERBOSE(3, "Empty cube on BackwardsEdge" << std::endl);
138
+ return;
139
+ }
140
+
141
+ // Fetch the things we need for distortion cost computation.
142
+ // int maxDistortion = StaticData::Instance().GetMaxDistortion();
143
+ int maxDistortion = itype.options()->reordering.max_distortion;
144
+
145
+ if (maxDistortion == -1) {
146
+ for (HypothesisSet::const_iterator iter = m_prevBitmapContainer.GetHypotheses().begin(); iter != m_prevBitmapContainer.GetHypotheses().end(); ++iter) {
147
+ m_hypotheses.push_back(*iter);
148
+ }
149
+ return;
150
+ }
151
+
152
+ const Range &transOptRange = translations.Get(0)->GetSourceWordsRange();
153
+
154
+ HypothesisSet::const_iterator iterHypo = m_prevBitmapContainer.GetHypotheses().begin();
155
+ HypothesisSet::const_iterator iterEnd = m_prevBitmapContainer.GetHypotheses().end();
156
+
157
+ while (iterHypo != iterEnd) {
158
+ const Hypothesis &hypo = **iterHypo;
159
+ // Special case: If this is the first hypothesis used to seed the search,
160
+ // it doesn't have a valid range, and we create the hypothesis, if the
161
+ // initial position is not further into the sentence than the distortion limit.
162
+ if (hypo.GetWordsBitmap().GetNumWordsCovered() == 0) {
163
+ if ((int)transOptRange.GetStartPos() <= maxDistortion)
164
+ m_hypotheses.push_back(&hypo);
165
+ } else {
166
+ int distortionDistance = itype.ComputeDistortionDistance(hypo.GetCurrSourceWordsRange()
167
+ , transOptRange);
168
+
169
+ if (distortionDistance <= maxDistortion)
170
+ m_hypotheses.push_back(&hypo);
171
+ }
172
+
173
+ ++iterHypo;
174
+ }
175
+
176
+ if (m_translations.size() > 1) {
177
+ UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(),
178
+ "Non-monotonic future score: "
179
+ << m_translations.Get(0)->GetFutureScore() << " vs. "
180
+ << m_translations.Get(1)->GetFutureScore());
181
+ }
182
+
183
+ if (m_hypotheses.size() > 1) {
184
+ UTIL_THROW_IF2(m_hypotheses[0]->GetFutureScore() < m_hypotheses[1]->GetFutureScore(),
185
+ "Non-monotonic total score"
186
+ << m_hypotheses[0]->GetFutureScore() << " vs. "
187
+ << m_hypotheses[1]->GetFutureScore());
188
+ }
189
+
190
+ HypothesisScoreOrdererWithDistortion orderer (&transOptRange, m_deterministic);
191
+ std::sort(m_hypotheses.begin(), m_hypotheses.end(), orderer);
192
+
193
+ // std::sort(m_hypotheses.begin(), m_hypotheses.end(), HypothesisScoreOrdererNoDistortion());
194
+ }
195
+
196
// Destructor: empties the seen-position set and the list of hypothesis
// pointers. Only the containers are cleared; nothing is deleted here.
+ BackwardsEdge::~BackwardsEdge()
197
+ {
198
+ m_seenPosition.clear();
199
+ m_hypotheses.clear();
200
+ }
201
+
202
+
203
+ void
204
+ BackwardsEdge::Initialize()
205
+ {
206
+ if(m_hypotheses.size() == 0 || m_translations.size() == 0) {
207
+ m_initialized = true;
208
+ return;
209
+ }
210
+
211
+ const Bitmap &bm = m_hypotheses[0]->GetWordsBitmap();
212
+ const Range &newRange = m_translations.Get(0)->GetSourceWordsRange();
213
+ m_estimatedScore = m_estimatedScores.CalcEstimatedScore(bm, newRange.GetStartPos(), newRange.GetEndPos());
214
+
215
+ Hypothesis *expanded = CreateHypothesis(*m_hypotheses[0], *m_translations.Get(0));
216
+ m_parent.Enqueue(0, 0, expanded, this);
217
+ SetSeenPosition(0, 0);
218
+ m_initialized = true;
219
+ }
220
+
221
+ Hypothesis *BackwardsEdge::CreateHypothesis(const Hypothesis &hypothesis, const TranslationOption &transOpt)
222
+ {
223
+ // create hypothesis and calculate all its scores
224
+ IFVERBOSE(2) {
225
+ hypothesis.GetManager().GetSentenceStats().StartTimeBuildHyp();
226
+ }
227
+ const Bitmap &bitmap = m_parent.GetWordsBitmap();
228
+ Hypothesis *newHypo = new Hypothesis(hypothesis, transOpt, bitmap, hypothesis.GetManager().GetNextHypoId());
229
+ IFVERBOSE(2) {
230
+ hypothesis.GetManager().GetSentenceStats().StopTimeBuildHyp();
231
+ }
232
+ newHypo->EvaluateWhenApplied(m_estimatedScore);
233
+
234
+ return newHypo;
235
+ }
236
+
237
+ bool
238
+ BackwardsEdge::SeenPosition(const size_t x, const size_t y)
239
+ {
240
+ boost::unordered_set< int >::iterator iter = m_seenPosition.find((x<<16) + y);
241
+ return (iter != m_seenPosition.end());
242
+ }
243
+
244
+ void
245
+ BackwardsEdge::SetSeenPosition(const size_t x, const size_t y)
246
+ {
247
+ UTIL_THROW_IF2(x >= (1<<17), "Error");
248
+ UTIL_THROW_IF2(y >= (1<<17), "Error");
249
+
250
+ m_seenPosition.insert((x<<16) + y);
251
+ }
252
+
253
+
254
// Returns the m_initialized flag, which Initialize() sets to true.
+ bool
255
+ BackwardsEdge::GetInitialized()
256
+ {
257
+ return m_initialized;
258
+ }
259
+
260
// Returns the previous bitmap container this edge was constructed with,
// i.e. the container whose hypotheses the edge extends.
+ const BitmapContainer&
261
+ BackwardsEdge::GetBitmapContainer() const
262
+ {
263
+ return m_prevBitmapContainer;
264
+ }
265
+
266
+ void
267
+ BackwardsEdge::PushSuccessors(const size_t x, const size_t y)
268
+ {
269
+ Hypothesis *newHypo;
270
+
271
+ if(y + 1 < m_translations.size() && !SeenPosition(x, y + 1)) {
272
+ SetSeenPosition(x, y + 1);
273
+ newHypo = CreateHypothesis(*m_hypotheses[x], *m_translations.Get(y + 1));
274
+ if(newHypo != NULL) {
275
+ m_parent.Enqueue(x, y + 1, newHypo, (BackwardsEdge*)this);
276
+ }
277
+ }
278
+
279
+ if(x + 1 < m_hypotheses.size() && !SeenPosition(x + 1, y)) {
280
+ SetSeenPosition(x + 1, y);
281
+ newHypo = CreateHypothesis(*m_hypotheses[x + 1], *m_translations.Get(y));
282
+ if(newHypo != NULL) {
283
+ m_parent.Enqueue(x + 1, y, newHypo, (BackwardsEdge*)this);
284
+ }
285
+ }
286
+ }
287
+
288
+
289
+ ////////////////////////////////////////////////////////////////////////////////
290
+ // BitmapContainer Code
291
+ ////////////////////////////////////////////////////////////////////////////////
292
+
293
+ BitmapContainer::BitmapContainer(const Bitmap &bitmap
294
+ , HypothesisStackCubePruning &stack
295
+ , bool deterministic)
296
+ : m_bitmap(bitmap)
297
+ , m_stack(stack)
298
+ , m_numStackInsertions(0)
299
+ , m_deterministic(deterministic)
300
+ {
301
+ m_hypotheses = HypothesisSet();
302
+ m_edges = BackwardsEdgeSet();
303
+ m_queue = HypothesisQueue();
304
+ }
305
+
306
+ BitmapContainer::~BitmapContainer()
307
+ {
308
+ // As we have created the square position objects we clean up now.
309
+
310
+ while (!m_queue.empty()) {
311
+ HypothesisQueueItem *item = m_queue.top();
312
+ m_queue.pop();
313
+
314
+ delete item->GetHypothesis();
315
+ delete item;
316
+ }
317
+
318
+ // Delete all edges.
319
+ RemoveAllInColl(m_edges);
320
+
321
+ m_hypotheses.clear();
322
+ m_edges.clear();
323
+ }
324
+
325
+
326
+ void
327
+ BitmapContainer::Enqueue(int hypothesis_pos
328
+ , int translation_pos
329
+ , Hypothesis *hypothesis
330
+ , BackwardsEdge *edge)
331
+ {
332
+ // Only supply target phrase if running deterministic search mode
333
+ const TargetPhrase *target_phrase = m_deterministic ? &(hypothesis->GetCurrTargetPhrase()) : NULL;
334
+ HypothesisQueueItem *item = new HypothesisQueueItem(hypothesis_pos
335
+ , translation_pos
336
+ , hypothesis
337
+ , edge
338
+ , target_phrase);
339
+ IFVERBOSE(2) {
340
+ item->GetHypothesis()->GetManager().GetSentenceStats().StartTimeManageCubes();
341
+ }
342
+ m_queue.push(item);
343
+ IFVERBOSE(2) {
344
+ item->GetHypothesis()->GetManager().GetSentenceStats().StopTimeManageCubes();
345
+ }
346
+ }
347
+
348
+ HypothesisQueueItem*
349
+ BitmapContainer::Dequeue(bool keepValue)
350
+ {
351
+ if (!m_queue.empty()) {
352
+ HypothesisQueueItem *item = m_queue.top();
353
+
354
+ if (!keepValue) {
355
+ m_queue.pop();
356
+ }
357
+
358
+ return item;
359
+ }
360
+
361
+ return NULL;
362
+ }
363
+
364
+ HypothesisQueueItem*
365
+ BitmapContainer::Top() const
366
+ {
367
+ return m_queue.top();
368
+ }
369
+
370
// Number of items currently in the priority queue (not the number of
// hypotheses; see GetHypothesesSize() for that).
+ size_t
371
+ BitmapContainer::Size()
372
+ {
373
+ return m_queue.size();
374
+ }
375
+
376
// True if the priority queue holds no items.
+ bool
377
+ BitmapContainer::Empty() const
378
+ {
379
+ return m_queue.empty();
380
+ }
381
+
382
// Read-only access to the hypotheses added via AddHypothesis().
+ const HypothesisSet&
383
+ BitmapContainer::GetHypotheses() const
384
+ {
385
+ return m_hypotheses;
386
+ }
387
+
388
// Number of hypotheses stored in this container.
+ size_t
389
+ BitmapContainer::GetHypothesesSize() const
390
+ {
391
+ return m_hypotheses.size();
392
+ }
393
+
394
// Read-only access to the edges registered via AddBackwardsEdge().
+ const BackwardsEdgeSet&
395
+ BitmapContainer::GetBackwardsEdges()
396
+ {
397
+ return m_edges;
398
+ }
399
+
400
+ void
401
+ BitmapContainer::AddHypothesis(Hypothesis *hypothesis)
402
+ {
403
+ bool itemExists = false;
404
+ HypothesisSet::const_iterator iter = m_hypotheses.begin();
405
+ HypothesisSet::const_iterator iterEnd = m_hypotheses.end();
406
+
407
+ // cfedermann: do we actually need this check?
408
+ while (iter != iterEnd) {
409
+ if (*iter == hypothesis) {
410
+ itemExists = true;
411
+ break;
412
+ }
413
+
414
+ ++iter;
415
+ }
416
+ UTIL_THROW_IF2(itemExists, "Duplicate hypotheses");
417
+ m_hypotheses.push_back(hypothesis);
418
+ }
419
+
420
// Registers an edge with this container. The destructor later passes
// m_edges to RemoveAllInColl(), so the container takes charge of the edge.
+ void
421
+ BitmapContainer::AddBackwardsEdge(BackwardsEdge *edge)
422
+ {
423
+ m_edges.insert(edge);
424
+ }
425
+
426
+ void
427
+ BitmapContainer::InitializeEdges()
428
+ {
429
+ BackwardsEdgeSet::iterator iter = m_edges.begin();
430
+ BackwardsEdgeSet::iterator iterEnd = m_edges.end();
431
+
432
+ while (iter != iterEnd) {
433
+ BackwardsEdge *edge = *iter;
434
+ edge->Initialize();
435
+
436
+ ++iter;
437
+ }
438
+ }
439
+
440
+ void
441
+ BitmapContainer::EnsureMinStackHyps(const size_t minNumHyps)
442
+ {
443
+ while ((!Empty()) && m_numStackInsertions < minNumHyps) {
444
+ ProcessBestHypothesis();
445
+ }
446
+ }
447
+
448
+ void
449
+ BitmapContainer::ProcessBestHypothesis()
450
+ {
451
+ if (m_queue.empty()) {
452
+ return;
453
+ }
454
+
455
+ // Get the currently best hypothesis from the queue.
456
+ HypothesisQueueItem *item = Dequeue();
457
+
458
+ // If the priority queue is exhausted, we are done and should have exited
459
+ UTIL_THROW_IF2(item == NULL, "Null object");
460
+
461
+ // check we are pulling things off of priority queue in right order
462
+ if (!Empty()) {
463
+ HypothesisQueueItem *check = Dequeue(true);
464
+ UTIL_THROW_IF2(item->GetHypothesis()->GetFutureScore() < check->GetHypothesis()->GetFutureScore(),
465
+ "Non-monotonic total score: "
466
+ << item->GetHypothesis()->GetFutureScore() << " vs. "
467
+ << check->GetHypothesis()->GetFutureScore());
468
+ }
469
+
470
+ // Logging for the criminally insane
471
+ IFVERBOSE(3) {
472
+ item->GetHypothesis()->PrintHypothesis();
473
+ }
474
+
475
+ // Add best hypothesis to hypothesis stack.
476
+ const bool newstackentry = m_stack.AddPrune(item->GetHypothesis());
477
+ if (newstackentry)
478
+ m_numStackInsertions++;
479
+
480
+ IFVERBOSE(3) {
481
+ TRACE_ERR("new stack entry flag is " << newstackentry << std::endl);
482
+ }
483
+
484
+ // Create new hypotheses for the two successors of the hypothesis just added.
485
+ item->GetBackwardsEdge()->PushSuccessors(item->GetHypothesisPos(), item->GetTranslationPos());
486
+
487
+ // We are done with the queue item, we delete it.
488
+ delete item;
489
+ }
490
+
491
// Sorts the stored hypotheses in place with HypothesisScoreOrderer, passing
// on the container's deterministic flag.
// NOTE(review): HypothesisScoreOrderer is not defined in this file; its
// ordering is presumably best-first - confirm against its definition.
+ void
492
+ BitmapContainer::SortHypotheses()
493
+ {
494
+ std::sort(m_hypotheses.begin(), m_hypotheses.end(), HypothesisScoreOrderer(m_deterministic));
495
+ }
496
+
497
+ }
498
+
mosesdecoder/moses/Bitmaps.h ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <boost/unordered_set.hpp>
4
+ #include <boost/unordered_map.hpp>
5
+ #include <set>
6
+ #include "Bitmap.h"
7
+ #include "Util.h"
8
+
9
+ namespace Moses
10
+ {
11
+
12
// Collection of Bitmap objects keyed by (bitmap, range).
// NOTE(review): presumably a cache so that extending a given bitmap by a
// given range always yields the same Bitmap object - confirm against the
// implementation file.
// NOTE(review): std::vector is used below but <vector> is not among the
// includes visible in this header; it is presumably pulled in indirectly.
+ class Bitmaps
13
+ {
// Maps a range to a bitmap pointer.
14
+ typedef boost::unordered_map<Range, const Bitmap*> NextBitmaps;
// Maps a bitmap pointer to its NextBitmaps table; UnorderedComparer<Bitmap>
// is supplied as both the hash and the equality functor.
15
+ typedef boost::unordered_map<const Bitmap*, NextBitmaps, UnorderedComparer<Bitmap>, UnorderedComparer<Bitmap> > Coll;
16
+ //typedef std::set<const Bitmap*, OrderedComparer<Bitmap> > Coll;
17
+ Coll m_coll;
// Bitmap returned by GetInitialBitmap().
18
+ const Bitmap *m_initBitmap;
19
+
20
+ const Bitmap &GetNextBitmap(const Bitmap &bm, const Range &range);
21
+ public:
22
+ Bitmaps(size_t inputSize, const std::vector<bool> &initSourceCompleted);
23
+ virtual ~Bitmaps();
24
+
// Returns the bitmap m_initBitmap points to.
25
+ const Bitmap &GetInitialBitmap() const {
26
+ return *m_initBitmap;
27
+ }
28
+ const Bitmap &GetBitmap(const Bitmap &bm, const Range &range);
29
+
30
+ };
31
+
32
+ }
mosesdecoder/moses/ChartManager.h ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2010 Hieu Hoang
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #pragma once
23
+
24
+ #include <vector>
25
+ #include <boost/unordered_map.hpp>
26
+ #include "ChartCell.h"
27
+ #include "ChartCellCollection.h"
28
+ #include "Range.h"
29
+ #include "SentenceStats.h"
30
+ #include "ChartTranslationOptionList.h"
31
+ #include "ChartParser.h"
32
+ #include "ChartKBestExtractor.h"
33
+ #include "BaseManager.h"
34
+ #include "moses/Syntax/KBestExtractor.h"
35
+
36
+ namespace Moses
37
+ {
38
+
39
+ class ChartHypothesis;
40
+ class ChartSearchGraphWriter;
41
+
42
+ /** Holds everything you need to decode 1 sentence with the hierarchical/syntax decoder
43
+ */
44
+ class ChartManager : public BaseManager
45
+ {
46
+ private:
47
+ ChartCellCollection m_hypoStackColl;
// NOTE(review): std::auto_ptr is deprecated since C++11 and removed in C++17.
48
+ std::auto_ptr<SentenceStats> m_sentenceStats;
49
+ clock_t m_start; /**< starting time, used for logging */
50
+ unsigned m_hypothesisId; /* For handing out hypothesis ids to ChartHypothesis */
51
+
52
+ ChartParser m_parser;
53
+
54
+ ChartTranslationOptionList m_translationOptionList; /**< pre-computed list of translation options for the phrases in this sentence */
55
+
56
+ /* auxiliary functions for SearchGraphs */
57
+ void FindReachableHypotheses(
58
+ const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable , size_t* winners, size_t* losers) const;
59
+ void WriteSearchGraph(const ChartSearchGraphWriter& writer) const;
60
+
61
+ // output
62
+ void OutputNBestList(OutputCollector *collector,
63
+ const ChartKBestExtractor::KBestVec &nBestList,
64
+ long translationId) const;
65
+ size_t CalcSourceSize(const Moses::ChartHypothesis *hypo) const;
66
+ size_t OutputAlignmentNBest(Alignments &retAlign,
67
+ const Moses::ChartKBestExtractor::Derivation &derivation,
68
+ size_t startTarget) const;
69
+ size_t OutputAlignment(Alignments &retAlign,
70
+ const Moses::ChartHypothesis *hypo,
71
+ size_t startTarget) const;
72
+ void OutputDetailedTranslationReport(
73
+ OutputCollector *collector,
74
+ const ChartHypothesis *hypo,
75
+ const Sentence &sentence,
76
+ long translationId) const;
77
+ void OutputTranslationOptions(std::ostream &out,
78
+ ApplicationContext &applicationContext,
79
+ const ChartHypothesis *hypo,
80
+ const Sentence &sentence,
81
+ long translationId) const;
82
+ void OutputTranslationOption(std::ostream &out,
83
+ ApplicationContext &applicationContext,
84
+ const ChartHypothesis *hypo,
85
+ const Sentence &sentence,
86
+ long translationId) const;
87
+ void ReconstructApplicationContext(const ChartHypothesis &hypo,
88
+ const Sentence &sentence,
89
+ ApplicationContext &context) const;
90
+ void OutputTreeFragmentsTranslationOptions(std::ostream &out,
91
+ ApplicationContext &applicationContext,
92
+ const ChartHypothesis *hypo,
93
+ const Sentence &sentence,
94
+ long translationId) const;
95
+ void OutputDetailedAllTranslationReport(
96
+ OutputCollector *collector,
97
+ const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList,
98
+ const Sentence &sentence,
99
+ long translationId) const;
100
+ void OutputBestHypo(OutputCollector *collector, const ChartHypothesis *hypo, long translationId) const;
101
+ void Backtrack(const ChartHypothesis *hypo) const;
102
+
103
+ public:
104
+ ChartManager(ttasksptr const& ttask);
105
+ ~ChartManager();
106
+ void Decode();
107
+ void AddXmlChartOptions();
108
+ const ChartHypothesis *GetBestHypothesis() const;
109
+ void CalcNBest(size_t n, std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > &nBestList, bool onlyDistinct=false) const;
110
+
111
+ /** "Moses" (osg) type format */
112
+ void OutputSearchGraphMoses(std::ostream &outputSearchGraphStream) const;
113
+
114
+ /** Output in (modified) Kenneth hypergraph format */
115
+ void OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const;
116
+
117
+ //! debug data collected when decoding sentence
// Dereferences m_sentenceStats, so ResetSentenceStats() must have been
// called first.
118
+ SentenceStats& GetSentenceStats() const {
119
+ return *m_sentenceStats;
120
+ }
121
+
122
+ //DIMw
123
+ const ChartCellCollection& GetChartCellCollection() const {
124
+ return m_hypoStackColl;
125
+ }
126
+
// Intentionally empty in this manager.
127
+ void CalcDecoderStatistics() const {
128
+ }
129
+
// Replaces the current statistics object with a fresh one built from source.
130
+ void ResetSentenceStats(const InputType& source) {
131
+ m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source));
132
+ }
133
+
134
+ //! contiguous hypo id for each input sentence. For debugging purposes
// Returns the current id and then increments the counter.
135
+ unsigned GetNextHypoId() {
136
+ return m_hypothesisId++;
137
+ }
138
+
139
+ const ChartParser &GetParser() const {
140
+ return m_parser;
141
+ }
142
+
143
+ // outputs
144
+ void OutputBest(OutputCollector *collector) const;
145
+ void OutputNBest(OutputCollector *collector) const;
// Empty body: this manager produces no lattice samples.
146
+ void OutputLatticeSamples(OutputCollector *collector) const {
147
+ }
148
+ void OutputAlignment(OutputCollector *collector) const;
149
+ void OutputDetailedTranslationReport(OutputCollector *collector) const;
150
+ void OutputUnknowns(OutputCollector *collector) const;
151
+ void OutputDetailedTreeFragmentsTranslationReport(OutputCollector *collector) const;
// Empty body: this manager produces no word graph.
152
+ void OutputWordGraph(OutputCollector *collector) const {
153
+ }
154
+ void OutputSearchGraph(OutputCollector *collector) const;
// Empty body: this manager produces no SLF search graph.
155
+ void OutputSearchGraphSLF() const {
156
+ }
157
+ // void OutputSearchGraphHypergraph() const;
158
+
159
+ };
160
+
161
+ }
162
+
mosesdecoder/moses/ChartTranslationOptions.cpp ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - factored phrase-based language decoder
3
+ Copyright (C) 2010 Hieu Hoang
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "ChartTranslationOptions.h"
21
+ #include "ChartHypothesis.h"
22
+ #include "ChartCellLabel.h"
23
+ #include "ChartTranslationOption.h"
24
+ #include "InputPath.h"
25
+ #include "StaticData.h"
26
+ #include "TranslationTask.h"
27
+
28
+ using namespace std;
29
+
30
+ namespace Moses
31
+ {
32
+
33
+ ChartTranslationOptions::ChartTranslationOptions(const TargetPhraseCollection &targetPhraseColl,
34
+ const StackVec &stackVec,
35
+ const Range &range,
36
+ float score)
37
+ : m_stackVec(stackVec)
38
+ , m_wordsRange(&range)
39
+ , m_estimateOfBestScore(score)
40
+ {
41
+ TargetPhraseCollection::const_iterator iter;
42
+ for (iter = targetPhraseColl.begin(); iter != targetPhraseColl.end(); ++iter) {
43
+ const TargetPhrase *origTP = *iter;
44
+
45
+ boost::shared_ptr<ChartTranslationOption> ptr(new ChartTranslationOption(*origTP));
46
+ m_collection.push_back(ptr);
47
+ }
48
+ }
49
+
50
// Empty destructor: the constructor stores the translation options in
// boost::shared_ptr, so there is nothing to delete by hand here.
+ ChartTranslationOptions::~ChartTranslationOptions()
51
+ {
52
+
53
+ }
54
+
55
+ //! functor to compare (chart) hypotheses by (descending) score
56
+ class ChartTranslationOptionScoreOrderer
57
+ {
58
+ public:
59
+ bool operator()(const boost::shared_ptr<ChartTranslationOption> &transOptA
60
+ , const boost::shared_ptr<ChartTranslationOption> &transOptB) const {
61
+ const ScoreComponentCollection &scoresA = transOptA->GetScores();
62
+ const ScoreComponentCollection &scoresB = transOptB->GetScores();
63
+ return scoresA.GetWeightedScore() > scoresB.GetWeightedScore();
64
+ }
65
+ };
66
+
67
+ void ChartTranslationOptions::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
68
+ {
69
+ SetInputPath(&inputPath);
70
+ // if (StaticData::Instance().GetPlaceholderFactor() != NOT_FOUND) {
71
+ if (inputPath.ttask->options()->input.placeholder_factor != NOT_FOUND) {
72
+ CreateSourceRuleFromInputPath();
73
+ }
74
+
75
+ CollType::iterator iter;
76
+ for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
77
+ ChartTranslationOption &transOpt = **iter;
78
+ transOpt.SetInputPath(&inputPath);
79
+ transOpt.EvaluateWithSourceContext(input, inputPath, m_stackVec);
80
+ }
81
+
82
+ // get rid of -inf trans opts
83
+ size_t numDiscard = 0;
84
+ for (size_t i = 0; i < m_collection.size(); ++i) {
85
+ ChartTranslationOption *transOpt = m_collection[i].get();
86
+
87
+ if (transOpt->GetScores().GetWeightedScore() == - std::numeric_limits<float>::infinity()) {
88
+ ++numDiscard;
89
+ } else if (numDiscard) {
90
+ m_collection[i - numDiscard] = m_collection[i];
91
+ }
92
+ }
93
+
94
+ size_t newSize = m_collection.size() - numDiscard;
95
+ m_collection.resize(newSize);
96
+
97
+ // sort if necessary
98
+ const StaticData &staticData = StaticData::Instance();
99
+ if (staticData.RequireSortingAfterSourceContext()) {
100
+ std::sort(m_collection.begin()
101
+ , m_collection.begin() + newSize
102
+ , ChartTranslationOptionScoreOrderer());
103
+ }
104
+
105
+ }
106
+
107
+ void ChartTranslationOptions::SetInputPath(const InputPath *inputPath)
108
+ {
109
+ CollType::iterator iter;
110
+ for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
111
+ ChartTranslationOption &transOpt = **iter;
112
+ transOpt.SetInputPath(inputPath);
113
+ }
114
+ }
115
+
116
+ void ChartTranslationOptions::CreateSourceRuleFromInputPath()
117
+ {
118
+ if (m_collection.size() == 0) {
119
+ return;
120
+ }
121
+
122
+ const InputPath *inputPath = m_collection.front()->GetInputPath();
123
+ assert(inputPath);
124
+ std::vector<const Word*> &ruleSourceFromInputPath = inputPath->AddRuleSourceFromInputPath();
125
+
126
+ size_t chartCellIndex = 0;
127
+ const ChartCellLabel *chartCellLabel = (chartCellIndex < m_stackVec.size()) ? m_stackVec[chartCellIndex] : NULL;
128
+
129
+ size_t ind = 0;
130
+ for (size_t sourcePos = m_wordsRange->GetStartPos(); sourcePos <= m_wordsRange->GetEndPos(); ++sourcePos, ++ind) {
131
+ if (chartCellLabel) {
132
+ if (sourcePos == chartCellLabel->GetCoverage().GetEndPos()) {
133
+ // end of child range. push an empty word to denote non-term
134
+ ruleSourceFromInputPath.push_back(NULL);
135
+ ++chartCellIndex;
136
+ chartCellLabel = (chartCellIndex < m_stackVec.size()) ? m_stackVec[chartCellIndex] : NULL;
137
+ } else if (sourcePos >= chartCellLabel->GetCoverage().GetStartPos()) {
138
+ // in the range of child hypo. do nothing
139
+ } else {
140
+ // not yet reached child range. add word
141
+ ruleSourceFromInputPath.push_back(&inputPath->GetPhrase().GetWord(ind));
142
+ }
143
+ } else {
144
+ // no child in sight. add word
145
+ ruleSourceFromInputPath.push_back(&inputPath->GetPhrase().GetWord(ind));
146
+ }
147
+ }
148
+
149
+ // save it to each trans opt
150
+ CollType::iterator iter;
151
+ for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
152
+ ChartTranslationOption &transOpt = **iter;
153
+ transOpt.SetSourceRuleFromInputPath(&ruleSourceFromInputPath);
154
+ }
155
+
156
+ }
157
+
158
+ std::ostream& operator<<(std::ostream &out, const ChartTranslationOptions &obj)
159
+ {
160
+ for (size_t i = 0; i < obj.m_collection.size(); ++i) {
161
+ const ChartTranslationOption &transOpt = *obj.m_collection[i];
162
+ out << transOpt << endl;
163
+ }
164
+
165
+ return out;
166
+ }
167
+
168
+ }
mosesdecoder/moses/DecodeStepGeneration.cpp ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "DecodeStepGeneration.h"
23
+ #include "GenerationDictionary.h"
24
+ #include "TranslationOption.h"
25
+ #include "TranslationOptionCollection.h"
26
+ #include "PartialTranslOptColl.h"
27
+ #include "FactorCollection.h"
28
+
29
+ namespace Moses
30
+ {
31
+ using namespace std;
32
+
33
+ DecodeStepGeneration::DecodeStepGeneration(GenerationDictionary* dict,
34
+ const DecodeStep* prev,
35
+ const std::vector<FeatureFunction*> &features)
36
+ : DecodeStep(dict, prev, features)
37
+ {
38
+ }
39
+
40
+ // helpers
41
+ typedef pair<Word, ScoreComponentCollection> WordPair;
42
+ typedef list< WordPair > WordList;
43
+ // 1st = word
44
+ // 2nd = score
45
+ typedef list< WordPair >::const_iterator WordListIterator;
46
+
47
+ /** used in generation: increases iterators when looping through the exponential number of generation expansions */
48
+ inline void IncrementIterators(vector< WordListIterator > &wordListIterVector
49
+ , const vector< WordList > &wordListVector)
50
+ {
51
+ for (size_t currPos = 0 ; currPos < wordListVector.size() ; currPos++) {
52
+ WordListIterator &iter = wordListIterVector[currPos];
53
+ iter++;
54
+ if (iter != wordListVector[currPos].end()) {
55
+ // eg. 4 -> 5
56
+ return;
57
+ } else {
58
+ // eg 9 -> 10
59
+ iter = wordListVector[currPos].begin();
60
+ }
61
+ }
62
+ }
63
+
64
+ void DecodeStepGeneration::Process(const TranslationOption &inputPartialTranslOpt
65
+ , const DecodeStep &decodeStep
66
+ , PartialTranslOptColl &outputPartialTranslOptColl
67
+ , TranslationOptionCollection * /* toc */
68
+ , bool /*adhereTableLimit*/) const
69
+ {
70
+ if (inputPartialTranslOpt.GetTargetPhrase().GetSize() == 0) {
71
+ // word deletion
72
+
73
+ TranslationOption *newTransOpt = new TranslationOption(inputPartialTranslOpt);
74
+ outputPartialTranslOptColl.Add(newTransOpt);
75
+
76
+ return;
77
+ }
78
+
79
+ // normal generation step
80
+ const GenerationDictionary* generationDictionary = decodeStep.GetGenerationDictionaryFeature();
81
+
82
+ const Phrase &targetPhrase = inputPartialTranslOpt.GetTargetPhrase();
83
+ const InputPath &inputPath = inputPartialTranslOpt.GetInputPath();
84
+ size_t targetLength = targetPhrase.GetSize();
85
+
86
+ // generation list for each word in phrase
87
+ vector< WordList > wordListVector(targetLength);
88
+
89
+ // create generation list
90
+ int wordListVectorPos = 0;
91
+ for (size_t currPos = 0 ; currPos < targetLength ; currPos++) { // going thorugh all words
92
+ // generatable factors for this word to be put in wordList
93
+ WordList &wordList = wordListVector[wordListVectorPos];
94
+ const Word &word = targetPhrase.GetWord(currPos);
95
+
96
+ // consult dictionary for possible generations for this word
97
+ const OutputWordCollection *wordColl = generationDictionary->FindWord(word);
98
+
99
+ if (wordColl == NULL) {
100
+ // word not found in generation dictionary
101
+ //toc->ProcessUnknownWord(sourceWordsRange.GetStartPos(), factorCollection);
102
+ return; // can't be part of a phrase, special handling
103
+ } else {
104
+ // sort(*wordColl, CompareWordCollScore);
105
+ OutputWordCollection::const_iterator iterWordColl;
106
+ for (iterWordColl = wordColl->begin() ; iterWordColl != wordColl->end(); ++iterWordColl) {
107
+ const Word &outputWord = (*iterWordColl).first;
108
+ const ScoreComponentCollection& score = (*iterWordColl).second;
109
+ // enter into word list generated factor(s) and its(their) score(s)
110
+ wordList.push_back(WordPair(outputWord, score));
111
+ }
112
+
113
+ wordListVectorPos++; // done, next word
114
+ }
115
+ }
116
+
117
+ // use generation list (wordList)
118
+ // set up iterators (total number of expansions)
119
+ size_t numIteration = 1;
120
+ vector< WordListIterator > wordListIterVector(targetLength);
121
+ vector< const Word* > mergeWords(targetLength);
122
+ for (size_t currPos = 0 ; currPos < targetLength ; currPos++) {
123
+ wordListIterVector[currPos] = wordListVector[currPos].begin();
124
+ numIteration *= wordListVector[currPos].size();
125
+ }
126
+
127
+ // go thru each possible factor for each word & create hypothesis
128
+ for (size_t currIter = 0 ; currIter < numIteration ; currIter++) {
129
+ ScoreComponentCollection generationScore; // total score for this string of words
130
+
131
+ // create vector of words with new factors for last phrase
132
+ for (size_t currPos = 0 ; currPos < targetLength ; currPos++) {
133
+ const WordPair &wordPair = *wordListIterVector[currPos];
134
+ mergeWords[currPos] = &(wordPair.first);
135
+ generationScore.PlusEquals(wordPair.second);
136
+ }
137
+
138
+ // merge with existing trans opt
139
+ Phrase genPhrase( mergeWords);
140
+
141
+ if (IsFilteringStep()) {
142
+ if (!inputPartialTranslOpt.IsCompatible(genPhrase, m_conflictFactors))
143
+ continue;
144
+ }
145
+
146
+ const TargetPhrase &inPhrase = inputPartialTranslOpt.GetTargetPhrase();
147
+ TargetPhrase outPhrase(inPhrase);
148
+ outPhrase.GetScoreBreakdown().PlusEquals(generationScore);
149
+
150
+ outPhrase.MergeFactors(genPhrase, m_newOutputFactors);
151
+ outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply);
152
+
153
+ const Range &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange();
154
+
155
+ TranslationOption *newTransOpt = new TranslationOption(sourceWordsRange, outPhrase);
156
+ assert(newTransOpt);
157
+
158
+ newTransOpt->SetInputPath(inputPath);
159
+
160
+ outputPartialTranslOptColl.Add(newTransOpt);
161
+
162
+ // increment iterators
163
+ IncrementIterators(wordListIterVector, wordListVector);
164
+ }
165
+ }
166
+
167
+ }
168
+
169
+
mosesdecoder/moses/FloydWarshall.cpp ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "util/exception.hh"
2
+ #include <climits>
3
+ #include <vector>
4
+
5
+ #define MAX_DIST (INT_MAX / 2)
6
+
7
+ //#include "FloydWarshall.h"
8
+
9
+ using namespace std;
10
+
11
+ // All-pairs shortest path algorithm
12
+ void floyd_warshall(const std::vector<std::vector<bool> >& edges, std::vector<std::vector<int> >& dist)
13
+ {
14
+ UTIL_THROW_IF2(edges.size() != edges.front().size(), "Error");
15
+ dist.clear();
16
+ dist.resize(edges.size(), std::vector<int>(edges.size(), 0));
17
+
18
+ size_t num_edges = edges.size();
19
+
20
+ for (size_t i=0; i<num_edges; ++i) {
21
+ for (size_t j=0; j<num_edges; ++j) {
22
+ if (edges[i][j])
23
+ dist[i][j] = 1;
24
+ else
25
+ dist[i][j] = MAX_DIST;
26
+ if (i == j) dist[i][j] = MAX_DIST;
27
+ }
28
+ }
29
+
30
+ for (size_t k=0; k<num_edges; ++k)
31
+ for (size_t i=0; i<num_edges; ++i)
32
+ for (size_t j=0; j<num_edges; ++j)
33
+ if (dist[i][j] > (dist[i][k] + dist[k][j]))
34
+ dist[i][j] = dist[i][k] + dist[k][j];
35
+ }
36
+
mosesdecoder/moses/HypothesisStack.h ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef moses_HypothesisStack_h
2
+ #define moses_HypothesisStack_h
3
+
4
#include <limits>
#include <set>
#include <vector>
#include <boost/unordered_set.hpp>
#include "Hypothesis.h"
#include "Bitmap.h"
9
+
10
+ namespace Moses
11
+ {
12
+
13
+ class Manager;
14
+
15
+ /** abstract unique set of hypotheses that cover a certain number of words,
16
+ * ie. a stack in phrase-based decoding
17
+ */
18
+ class HypothesisStack
19
+ {
20
+
21
+ protected:
22
+ typedef boost::unordered_set< Hypothesis*, UnorderedComparer<Hypothesis>, UnorderedComparer<Hypothesis> > _HCType;
23
+ _HCType m_hypos; /**< contains hypotheses */
24
+ Manager& m_manager;
25
+
26
+ public:
27
+ HypothesisStack(Manager& manager): m_manager(manager) {}
28
+ typedef _HCType::iterator iterator;
29
+ typedef _HCType::const_iterator const_iterator;
30
+ //! iterators
31
+ const_iterator begin() const {
32
+ return m_hypos.begin();
33
+ }
34
+ const_iterator end() const {
35
+ return m_hypos.end();
36
+ }
37
+ size_t size() const {
38
+ return m_hypos.size();
39
+ }
40
+ virtual inline float GetWorstScore() const {
41
+ return -std::numeric_limits<float>::infinity();
42
+ };
43
+ virtual float GetWorstScoreForBitmap( WordsBitmapID ) {
44
+ return -std::numeric_limits<float>::infinity();
45
+ };
46
+ virtual float GetWorstScoreForBitmap( const Bitmap& ) {
47
+ return -std::numeric_limits<float>::infinity();
48
+ };
49
+
50
+ virtual ~HypothesisStack();
51
+ virtual bool AddPrune(Hypothesis *hypothesis) = 0;
52
+ virtual const Hypothesis *GetBestHypothesis() const = 0;
53
+ virtual std::vector<const Hypothesis*> GetSortedList() const = 0;
54
+
55
+ //! remove hypothesis pointed to by iterator but don't delete the object
56
+ virtual void Detach(const HypothesisStack::iterator &iter);
57
+ /** destroy Hypothesis pointed to by iterator (object pool version) */
58
+ virtual void Remove(const HypothesisStack::iterator &iter);
59
+
60
+ };
61
+
62
+ }
63
+
64
+ #endif
mosesdecoder/moses/Sentence.cpp ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+
4
+ /***********************************************************************
5
+ Moses - factored phrase-based language decoder
6
+ Copyright (C) 2006 University of Edinburgh
7
+
8
+ This library is free software; you can redistribute it and/or
9
+ modify it under the terms of the GNU Lesser General Public
10
+ License as published by the Free Software Foundation; either
11
+ version 2.1 of the License, or (at your option) any later version.
12
+
13
+ This library is distributed in the hope that it will be useful,
14
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
15
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16
+ Lesser General Public License for more details.
17
+
18
+ You should have received a copy of the GNU Lesser General Public
19
+ License along with this library; if not, write to the Free Software
20
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
+ ***********************************************************************/
22
+
23
+ #include <stdexcept>
24
+ #include <boost/algorithm/string.hpp>
25
+ #include <boost/foreach.hpp>
26
+
27
+ #include "Sentence.h"
28
+ #include "TranslationOptionCollectionText.h"
29
+ #include "StaticData.h"
30
+ #include "moses/FF/DynamicCacheBasedLanguageModel.h"
31
+ #include "moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h"
32
+ #include "ChartTranslationOptions.h"
33
+ #include "Util.h"
34
+ #include "XmlOption.h"
35
+ #include "FactorCollection.h"
36
+ #include "TranslationTask.h"
37
+
38
+ using namespace std;
39
+
40
+ namespace Moses
41
+ {
42
+
43
+ Sentence::
44
+ Sentence(AllOptions::ptr const& opts) : Phrase(0) , InputType(opts)
45
+ {
46
+ if (is_syntax(opts->search.algo))
47
+ m_defaultLabelSet.insert(opts->syntax.input_default_non_terminal);
48
+ }
49
+
50
+ Sentence::
51
+ ~Sentence()
52
+ {
53
+ RemoveAllInColl(m_xmlOptions);
54
+ }
55
+
56
+ void
57
+ Sentence::
58
+ aux_init_partial_translation(string& line)
59
+ {
60
+ string sourceCompletedStr;
61
+ int loc1 = line.find( "|||", 0 );
62
+ int loc2 = line.find( "|||", loc1 + 3 );
63
+ if (loc1 > -1 && loc2 > -1) {
64
+ m_initialTargetPhrase = Trim(line.substr(0, loc1));
65
+ string scov = Trim(line.substr(loc1 + 3, loc2 - loc1 - 3));
66
+ line = line.substr(loc2 + 3);
67
+
68
+ m_sourceCompleted.resize(scov.size());
69
+ int contiguous = 1;
70
+ for (size_t i = 0; i < scov.size(); ++i) {
71
+ if (sourceCompletedStr.at(i) == '1') {
72
+ m_sourceCompleted[i] = true;
73
+ if (contiguous) m_frontSpanCoveredLength++;
74
+ } else {
75
+ m_sourceCompleted[i] = false;
76
+ contiguous = 0;
77
+ }
78
+ }
79
+ }
80
+ }
81
+
82
+ void
83
+ Sentence::
84
+ aux_interpret_sgml_markup(string& line)
85
+ {
86
+ // if sentences is specified as "<seg id=1> ... </seg>", extract id
87
+ typedef std::map<std::string, std::string> metamap;
88
+ metamap meta = ProcessAndStripSGML(line);
89
+ metamap::const_iterator i;
90
+ if ((i = meta.find("id")) != meta.end())
91
+ this->SetTranslationId(atol(i->second.c_str()));
92
+ if ((i = meta.find("docid")) != meta.end()) {
93
+ this->SetDocumentId(atol(i->second.c_str()));
94
+ this->SetUseTopicId(false);
95
+ this->SetUseTopicIdAndProb(false);
96
+ }
97
+ if ((i = meta.find("topic")) != meta.end()) {
98
+ vector<string> topic_params;
99
+ boost::split(topic_params, i->second, boost::is_any_of("\t "));
100
+ if (topic_params.size() == 1) {
101
+ this->SetTopicId(atol(topic_params[0].c_str()));
102
+ this->SetUseTopicId(true);
103
+ this->SetUseTopicIdAndProb(false);
104
+ } else {
105
+ this->SetTopicIdAndProb(topic_params);
106
+ this->SetUseTopicId(false);
107
+ this->SetUseTopicIdAndProb(true);
108
+ }
109
+ }
110
+ if ((i = meta.find("weight-setting")) != meta.end()) {
111
+ this->SetWeightSetting(i->second);
112
+ this->SetSpecifiesWeightSetting(true);
113
+ StaticData::Instance().SetWeightSetting(i->second);
114
+ // oh this is so horrible! Why does this have to be propagated globally?
115
+ // --- UG
116
+ } else this->SetSpecifiesWeightSetting(false);
117
+ }
118
+
119
+ void
120
+ Sentence::
121
+ aux_interpret_dlt(string& line) // whatever DLT means ... --- UG
122
+ {
123
+ using namespace std;
124
+ typedef map<string, string> str2str_map;
125
+ m_dlt_meta = ProcessAndStripDLT(line);
126
+ // what's happening below is most likely not thread-safe! UG
127
+ BOOST_FOREACH(str2str_map const& M, m_dlt_meta) {
128
+ str2str_map::const_iterator i,j;
129
+ if ((i = M.find("type")) != M.end()) {
130
+ j = M.find("id");
131
+ string id = j == M.end() ? "default" : j->second;
132
+ if (i->second == "cbtm") {
133
+ PhraseDictionaryDynamicCacheBased* cbtm;
134
+ cbtm = PhraseDictionaryDynamicCacheBased::InstanceNonConst(id);
135
+ if (cbtm) cbtm->ExecuteDlt(M);
136
+ }
137
+ if (i->second == "cblm") {
138
+ DynamicCacheBasedLanguageModel* cblm;
139
+ cblm = DynamicCacheBasedLanguageModel::InstanceNonConst(id);
140
+ if (cblm) cblm->ExecuteDlt(M);
141
+ }
142
+ }
143
+ }
144
+ }
145
+
146
+ void
147
+ Sentence::
148
+ aux_interpret_xml(std::string& line, std::vector<size_t> & xmlWalls,
149
+ std::vector<std::pair<size_t, std::string> >& placeholders)
150
+ {
151
+ // parse XML markup in translation line
152
+ using namespace std;
153
+ if (m_options->input.xml_policy != XmlPassThrough) {
154
+ bool OK = ProcessAndStripXMLTags(*m_options, line,
155
+ m_xmlOptions,
156
+ m_reorderingConstraint,
157
+ xmlWalls, placeholders,
158
+ *this);
159
+ if (!OK) {
160
+ TRACE_ERR("Unable to parse XML in line: " << line);
161
+ }
162
+ }
163
+ }
164
+
165
+ void
166
+ Sentence::
167
+ init(string line)
168
+ {
169
+ using namespace std;
170
+
171
+ m_frontSpanCoveredLength = 0;
172
+ m_sourceCompleted.resize(0);
173
+
174
+ if (m_options->input.continue_partial_translation)
175
+ aux_init_partial_translation(line);
176
+
177
+ line = Trim(line);
178
+ aux_interpret_sgml_markup(line); // for "<seg id=..." markup
179
+ aux_interpret_dlt(line); // some poorly documented cache-based stuff
180
+
181
+ // if sentences is specified as "<passthrough tag1=""/>"
182
+ if (m_options->output.PrintPassThrough ||m_options->nbest.include_passthrough) {
183
+ string pthru = PassthroughSGML(line,"passthrough");
184
+ this->SetPassthroughInformation(pthru);
185
+ }
186
+
187
+ vector<size_t> xmlWalls;
188
+ vector<pair<size_t, string> >placeholders;
189
+ aux_interpret_xml(line, xmlWalls, placeholders);
190
+
191
+ Phrase::CreateFromString(Input, m_options->input.factor_order, line, NULL);
192
+
193
+ ProcessPlaceholders(placeholders);
194
+
195
+ if (is_syntax(m_options->search.algo))
196
+ InitStartEndWord();
197
+
198
+ // now that we have final word positions in phrase (from
199
+ // CreateFromString), we can make input phrase objects to go with
200
+ // our XmlOptions and create TranslationOptions
201
+
202
+ // only fill the vector if we are parsing XML
203
+ if (m_options->input.xml_policy != XmlPassThrough) {
204
+ m_xmlCoverageMap.assign(GetSize(), false);
205
+ BOOST_FOREACH(XmlOption const* o, m_xmlOptions) {
206
+ Range const& r = o->range;
207
+ for(size_t j = r.GetStartPos(); j <= r.GetEndPos(); ++j)
208
+ m_xmlCoverageMap[j]=true;
209
+ }
210
+ }
211
+
212
+ // reordering walls and zones
213
+ m_reorderingConstraint.InitializeWalls(GetSize());
214
+
215
+ // set reordering walls, if "-monotone-at-punction" is set
216
+ if (m_options->reordering.monotone_at_punct && GetSize()) {
217
+ Range r(0, GetSize()-1);
218
+ m_reorderingConstraint.SetMonotoneAtPunctuation(GetSubString(r));
219
+ }
220
+
221
+ // set walls obtained from xml
222
+ for(size_t i=0; i<xmlWalls.size(); i++)
223
+ if(xmlWalls[i] < GetSize()) // no buggy walls, please
224
+ m_reorderingConstraint.SetWall(xmlWalls[i], true);
225
+ m_reorderingConstraint.FinalizeWalls();
226
+
227
+ }
228
+
229
+ int
230
+ Sentence::
231
+ Read(std::istream& in)
232
+ {
233
+ std::string line;
234
+ if (getline(in, line, '\n').eof())
235
+ return 0;
236
+ init(line);
237
+ return 1;
238
+ }
239
+
240
+ void
241
+ Sentence::
242
+ ProcessPlaceholders(const std::vector< std::pair<size_t, std::string> > &placeholders)
243
+ {
244
+ FactorType placeholderFactor = m_options->input.placeholder_factor;
245
+ if (placeholderFactor == NOT_FOUND) {
246
+ return;
247
+ }
248
+
249
+ for (size_t i = 0; i < placeholders.size(); ++i) {
250
+ size_t pos = placeholders[i].first;
251
+ const string &str = placeholders[i].second;
252
+ const Factor *factor = FactorCollection::Instance().AddFactor(str);
253
+ Word &word = Phrase::GetWord(pos);
254
+ word[placeholderFactor] = factor;
255
+ }
256
+ }
257
+
258
+ TranslationOptionCollection*
259
+ Sentence::
260
+ CreateTranslationOptionCollection(ttasksptr const& ttask) const
261
+ {
262
+ TranslationOptionCollection *rv
263
+ = new TranslationOptionCollectionText(ttask, *this);
264
+ assert(rv);
265
+ return rv;
266
+ }
267
+ void Sentence::Print(std::ostream& out) const
268
+ {
269
+ out<<*static_cast<Phrase const*>(this);
270
+ }
271
+
272
+
273
+ bool Sentence::XmlOverlap(size_t startPos, size_t endPos) const
274
+ {
275
+ for (size_t pos = startPos; pos <= endPos ; pos++) {
276
+ if (pos < m_xmlCoverageMap.size() && m_xmlCoverageMap[pos]) {
277
+ return true;
278
+ }
279
+ }
280
+ return false;
281
+ }
282
+
283
+ void Sentence::GetXmlTranslationOptions(std::vector <TranslationOption*> &list) const
284
+ {
285
+ for (std::vector<XmlOption const*>::const_iterator iterXMLOpts = m_xmlOptions.begin();
286
+ iterXMLOpts != m_xmlOptions.end(); ++iterXMLOpts) {
287
+ const XmlOption &xmlOption = **iterXMLOpts;
288
+ const Range &range = xmlOption.range;
289
+ const TargetPhrase &targetPhrase = xmlOption.targetPhrase;
290
+ TranslationOption *transOpt = new TranslationOption(range, targetPhrase);
291
+ list.push_back(transOpt);
292
+ }
293
+ }
294
+
295
+ void Sentence::GetXmlTranslationOptions(std::vector <TranslationOption*> &list, size_t startPos, size_t endPos) const
296
+ {
297
+ //iterate over XmlOptions list, find exact source/target matches
298
+
299
+ for (std::vector<XmlOption const*>::const_iterator iterXMLOpts = m_xmlOptions.begin();
300
+ iterXMLOpts != m_xmlOptions.end(); ++iterXMLOpts) {
301
+ const XmlOption &xmlOption = **iterXMLOpts;
302
+ const Range &range = xmlOption.range;
303
+
304
+ if (startPos == range.GetStartPos()
305
+ && endPos == range.GetEndPos()) {
306
+ const TargetPhrase &targetPhrase = xmlOption.targetPhrase;
307
+
308
+ TranslationOption *transOpt = new TranslationOption(range, targetPhrase);
309
+ list.push_back(transOpt);
310
+ }
311
+ }
312
+ }
313
+
314
+ std::vector <ChartTranslationOptions*>
315
+ Sentence::
316
+ GetXmlChartTranslationOptions() const
317
+ {
318
+ std::vector <ChartTranslationOptions*> ret;
319
+
320
+ // XML Options
321
+ // this code is a copy of the 1 in Sentence.
322
+
323
+ //only fill the vector if we are parsing XML
324
+ if (m_options->input.xml_policy != XmlPassThrough ) {
325
+ //TODO: needed to handle exclusive
326
+ //for (size_t i=0; i<GetSize(); i++) {
327
+ // m_xmlCoverageMap.push_back(false);
328
+ //}
329
+
330
+ //iterXMLOpts will be empty for XmlIgnore
331
+ //look at each column
332
+ for(std::vector<XmlOption const*>::const_iterator iterXmlOpts = m_xmlOptions.begin();
333
+ iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) {
334
+
335
+ const XmlOption &xmlOption = **iterXmlOpts;
336
+ TargetPhrase *targetPhrase = new TargetPhrase(xmlOption.targetPhrase);
337
+
338
+ Range *range = new Range(xmlOption.range);
339
+ StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted
340
+
341
+ TargetPhraseCollection *tpc = new TargetPhraseCollection;
342
+ tpc->Add(targetPhrase);
343
+
344
+ ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f);
345
+ ret.push_back(transOpt);
346
+
347
+ //TODO: needed to handle exclusive
348
+ //for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) {
349
+ // m_xmlCoverageMap[j]=true;
350
+ //}
351
+ }
352
+ }
353
+
354
+ return ret;
355
+ }
356
+
357
+ void
358
+ Sentence::
359
+ CreateFromString(vector<FactorType> const& FOrder, string const& phraseString)
360
+ {
361
+ Phrase::CreateFromString(Input, FOrder, phraseString, NULL);
362
+ }
363
+
364
+ Sentence::
365
+ Sentence(AllOptions::ptr const& opts, size_t const transId, string stext)
366
+ : InputType(opts, transId)
367
+ {
368
+ init(stext);
369
+ }
370
+
371
+ }
372
+
mosesdecoder/moses/Syntax/Cube.h ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <queue>
4
+ #include <vector>
5
+ #include <utility>
6
+
7
+ #include <boost/unordered_set.hpp>
8
+
9
+ #include "SHyperedge.h"
10
+ #include "SHyperedgeBundle.h"
11
+
12
+ namespace Moses
13
+ {
14
+ namespace Syntax
15
+ {
16
+
17
+ // A cube -- in the cube pruning sense (see Chiang (2007)) -- that lazily
18
+ // produces SHyperedge objects from a SHyperedgeBundle in approximately
19
+ // best-first order.
20
+ class Cube
21
+ {
22
+ public:
23
+ Cube(const SHyperedgeBundle &);
24
+ ~Cube();
25
+
26
+ SHyperedge *Pop();
27
+
28
+ SHyperedge *Top() const {
29
+ return m_queue.top().first;
30
+ }
31
+
32
+ bool IsEmpty() const {
33
+ return m_queue.empty();
34
+ }
35
+
36
+ private:
37
+ typedef boost::unordered_set<std::vector<int> > CoordinateSet;
38
+
39
+ typedef std::pair<SHyperedge *, const std::vector<int> *> QueueItem;
40
+
41
+ class QueueItemOrderer
42
+ {
43
+ public:
44
+ bool operator()(const QueueItem &p, const QueueItem &q) const {
45
+ return p.first->label.futureScore < q.first->label.futureScore;
46
+ }
47
+ };
48
+
49
+ typedef std::priority_queue<QueueItem, std::vector<QueueItem>,
50
+ QueueItemOrderer> Queue;
51
+
52
+ SHyperedge *CreateHyperedge(const std::vector<int> &);
53
+ void CreateNeighbour(const std::vector<int> &);
54
+ void CreateNeighbours(const std::vector<int> &);
55
+
56
+ const SHyperedgeBundle &m_bundle;
57
+ CoordinateSet m_visited;
58
+ Queue m_queue;
59
+ };
60
+
61
+ } // Syntax
62
+ } // Moses
mosesdecoder/moses/Syntax/CubeQueue.cpp ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "CubeQueue.h"
2
+
3
+ namespace Moses
4
+ {
5
+ namespace Syntax
6
+ {
7
+
8
+ CubeQueue::~CubeQueue()
9
+ {
10
+ while (!m_queue.empty()) {
11
+ Cube *cube = m_queue.top();
12
+ m_queue.pop();
13
+ delete cube;
14
+ }
15
+ }
16
+
17
+ SHyperedge *CubeQueue::Pop()
18
+ {
19
+ // pop the most promising cube
20
+ Cube *cube = m_queue.top();
21
+ m_queue.pop();
22
+
23
+ // pop the most promising hyperedge from the cube
24
+ SHyperedge *hyperedge = cube->Pop();
25
+
26
+ // if the cube contains more items then push it back onto the queue
27
+ if (!cube->IsEmpty()) {
28
+ m_queue.push(cube);
29
+ } else {
30
+ delete cube;
31
+ }
32
+
33
+ return hyperedge;
34
+ }
35
+
36
+ } // Syntax
37
+ } // Moses
mosesdecoder/moses/Syntax/CubeQueue.h ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <queue>
4
+ #include <vector>
5
+
6
+ #include "Cube.h"
7
+ #include "SHyperedge.h"
8
+ #include "SHyperedgeBundle.h"
9
+
10
+ namespace Moses
11
+ {
12
+ namespace Syntax
13
+ {
14
+
15
+ class CubeQueue
16
+ {
17
+ public:
18
+ template<typename InputIterator>
19
+ CubeQueue(InputIterator, InputIterator);
20
+
21
+ ~CubeQueue();
22
+
23
+ SHyperedge *Pop();
24
+
25
+ bool IsEmpty() const {
26
+ return m_queue.empty();
27
+ }
28
+
29
+ private:
30
+ class CubeOrderer
31
+ {
32
+ public:
33
+ bool operator()(const Cube *p, const Cube *q) const {
34
+ return p->Top()->label.futureScore < q->Top()->label.futureScore;
35
+ }
36
+ };
37
+
38
+ typedef std::priority_queue<Cube*, std::vector<Cube*>, CubeOrderer> Queue;
39
+
40
+ Queue m_queue;
41
+ };
42
+
43
+ template<typename InputIterator>
44
+ CubeQueue::CubeQueue(InputIterator first, InputIterator last)
45
+ {
46
+ while (first != last) {
47
+ m_queue.push(new Cube(*first++));
48
+ }
49
+ }
50
+
51
+ } // Syntax
52
+ } // Moses
mosesdecoder/moses/Syntax/InputWeightFF.cpp ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "InputWeightFF.h"
2
+
3
+ #include <vector>
4
+
5
+ #include "moses/ScoreComponentCollection.h"
6
+ #include "moses/Syntax/SHyperedge.h"
7
+ #include "moses/TargetPhrase.h"
8
+
9
+ namespace Moses
10
+ {
11
+ namespace Syntax
12
+ {
13
+
14
+ InputWeightFF::InputWeightFF(const std::string &line)
15
+ : StatelessFeatureFunction(1, line)
16
+ {
17
+ ReadParameters();
18
+ }
19
+
20
+ void InputWeightFF::EvaluateWhenApplied(const Hypothesis& hypo,
21
+ ScoreComponentCollection* accumulator) const
22
+ {
23
+ // TODO Throw exception.
24
+ assert(false);
25
+ }
26
+
27
+ void InputWeightFF::EvaluateWhenApplied(const ChartHypothesis &hypo,
28
+ ScoreComponentCollection* accumulator) const
29
+ {
30
+ // TODO Throw exception.
31
+ assert(false);
32
+ }
33
+
34
+ void InputWeightFF::EvaluateWhenApplied(
35
+ const Syntax::SHyperedge &hyperedge,
36
+ ScoreComponentCollection* accumulator) const
37
+ {
38
+ accumulator->PlusEquals(this, hyperedge.label.inputWeight);
39
+ }
40
+
41
+ void InputWeightFF::SetParameter(const std::string& key,
42
+ const std::string& value)
43
+ {
44
+ StatelessFeatureFunction::SetParameter(key, value);
45
+ }
46
+
47
+ } // Syntax
48
+ } // Moses
mosesdecoder/moses/Syntax/Manager.cpp ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <sstream>
2
+ #include "Manager.h"
3
+ #include "PVertex.h"
4
+ #include "moses/OutputCollector.h"
5
+ #include "moses/Util.h"
6
+
7
+ namespace Moses
8
+ {
9
+ namespace Syntax
10
+ {
11
+
12
+ Manager::Manager(ttasksptr const& ttask)
13
+ : Moses::BaseManager(ttask)
14
+ { }
15
+
16
+ void Manager::OutputBest(OutputCollector *collector) const
17
+ {
18
+ if (!collector) {
19
+ return;
20
+ }
21
+ std::ostringstream out;
22
+ FixPrecision(out);
23
+ const SHyperedge *best = GetBestSHyperedge();
24
+ if (best == NULL) {
25
+ VERBOSE(1, "NO BEST TRANSLATION" << std::endl);
26
+ if (options()->output.ReportHypoScore) {
27
+ out << "0 ";
28
+ }
29
+ out << '\n';
30
+ } else {
31
+ if (options()->output.ReportHypoScore) {
32
+ out << best->label.futureScore << " ";
33
+ }
34
+ Phrase yield = GetOneBestTargetYield(*best);
35
+ // delete 1st & last
36
+ UTIL_THROW_IF2(yield.GetSize() < 2,
37
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
38
+ yield.RemoveWord(0);
39
+ yield.RemoveWord(yield.GetSize()-1);
40
+ out << yield.GetStringRep(options()->output.factor_order);
41
+ out << '\n';
42
+ }
43
+ collector->Write(m_source.GetTranslationId(), out.str());
44
+ }
45
+
46
+ void Manager::OutputNBest(OutputCollector *collector) const
47
+ {
48
+ if (collector) {
49
+ long translationId = m_source.GetTranslationId();
50
+ KBestExtractor::KBestVec nBestList;
51
+ ExtractKBest(options()->nbest.nbest_size, nBestList,
52
+ options()->nbest.only_distinct);
53
+ OutputNBestList(collector, nBestList, translationId);
54
+ }
55
+ }
56
+
57
+ void Manager::OutputUnknowns(OutputCollector *collector) const
58
+ {
59
+ if (collector) {
60
+ long translationId = m_source.GetTranslationId();
61
+
62
+ std::ostringstream out;
63
+ for (boost::unordered_set<Moses::Word>::const_iterator p = m_oovs.begin();
64
+ p != m_oovs.end(); ++p) {
65
+ out << *p;
66
+ }
67
+ out << std::endl;
68
+ collector->Write(translationId, out.str());
69
+ }
70
+ }
71
+
72
+ void Manager::OutputNBestList(OutputCollector *collector,
73
+ const KBestExtractor::KBestVec &nBestList,
74
+ long translationId) const
75
+ {
76
+ const std::vector<FactorType> &outputFactorOrder = options()->output.factor_order;
77
+
78
+ std::ostringstream out;
79
+
80
+ if (collector->OutputIsCout()) {
81
+ // Set precision only if we're writing the n-best list to cout. This is to
82
+ // preserve existing behaviour, but should probably be done either way.
83
+ FixPrecision(out);
84
+ }
85
+
86
+ bool includeWordAlignment = options()->nbest.include_alignment_info;
87
+ bool PrintNBestTrees = options()->nbest.print_trees; // PrintNBestTrees();
88
+
89
+ for (KBestExtractor::KBestVec::const_iterator p = nBestList.begin();
90
+ p != nBestList.end(); ++p) {
91
+ const KBestExtractor::Derivation &derivation = **p;
92
+
93
+ // get the derivation's target-side yield
94
+ Phrase outputPhrase = KBestExtractor::GetOutputPhrase(derivation);
95
+
96
+ // delete <s> and </s>
97
+ UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
98
+ "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
99
+ outputPhrase.RemoveWord(0);
100
+ outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
101
+
102
+ // print the translation ID, surface factors, and scores
103
+ out << translationId << " ||| ";
104
+ OutputSurface(out, outputPhrase); // , outputFactorOrder, false);
105
+ out << " ||| ";
106
+ bool with_labels = options()->nbest.include_feature_labels;
107
+ derivation.scoreBreakdown.OutputAllFeatureScores(out, with_labels);
108
+ out << " ||| " << derivation.score;
109
+
110
+ // optionally, print word alignments
111
+ if (includeWordAlignment) {
112
+ out << " ||| ";
113
+ Alignments align;
114
+ OutputAlignmentNBest(align, derivation, 0);
115
+ for (Alignments::const_iterator q = align.begin(); q != align.end();
116
+ ++q) {
117
+ out << q->first << "-" << q->second << " ";
118
+ }
119
+ }
120
+
121
+ // optionally, print tree
122
+ if (PrintNBestTrees) {
123
+ TreePointer tree = KBestExtractor::GetOutputTree(derivation);
124
+ out << " ||| " << tree->GetString();
125
+ }
126
+
127
+ out << std::endl;
128
+ }
129
+
130
+ assert(collector);
131
+ collector->Write(translationId, out.str());
132
+ }
133
+
134
+ std::size_t Manager::OutputAlignmentNBest(
135
+ Alignments &retAlign,
136
+ const KBestExtractor::Derivation &derivation,
137
+ std::size_t startTarget) const
138
+ {
139
+ const SHyperedge &shyperedge = derivation.edge->shyperedge;
140
+
141
+ std::size_t totalTargetSize = 0;
142
+ std::size_t startSource = shyperedge.head->pvertex->span.GetStartPos();
143
+
144
+ const TargetPhrase &tp = *(shyperedge.label.translation);
145
+
146
+ std::size_t thisSourceSize = CalcSourceSize(derivation);
147
+
148
+ // position of each terminal word in translation rule, irrespective of
149
+ // alignment if non-term, number is undefined
150
+ std::vector<std::size_t> sourceOffsets(thisSourceSize, 0);
151
+ std::vector<std::size_t> targetOffsets(tp.GetSize(), 0);
152
+
153
+ const AlignmentInfo &aiNonTerm =
154
+ shyperedge.label.translation->GetAlignNonTerm();
155
+ std::vector<std::size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
156
+ const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd =
157
+ aiNonTerm.GetNonTermIndexMap();
158
+
159
+ UTIL_THROW_IF2(sourceInd2pos.size() != derivation.subderivations.size(),
160
+ "Error");
161
+
162
+ std::size_t targetInd = 0;
163
+ for (std::size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
164
+ if (tp.GetWord(targetPos).IsNonTerminal()) {
165
+ UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
166
+ std::size_t sourceInd = targetPos2SourceInd[targetPos];
167
+ std::size_t sourcePos = sourceInd2pos[sourceInd];
168
+
169
+ const KBestExtractor::Derivation &subderivation =
170
+ *derivation.subderivations[sourceInd];
171
+
172
+ // calc source size
173
+ std::size_t sourceSize =
174
+ subderivation.edge->head->svertex.pvertex->span.GetNumWordsCovered();
175
+ sourceOffsets[sourcePos] = sourceSize;
176
+
177
+ // calc target size.
178
+ // Recursively look thru child hypos
179
+ std::size_t currStartTarget = startTarget + totalTargetSize;
180
+ std::size_t targetSize = OutputAlignmentNBest(retAlign, subderivation,
181
+ currStartTarget);
182
+ targetOffsets[targetPos] = targetSize;
183
+
184
+ totalTargetSize += targetSize;
185
+ ++targetInd;
186
+ } else {
187
+ ++totalTargetSize;
188
+ }
189
+ }
190
+
191
+ // convert position within translation rule to absolute position within
192
+ // source sentence / output sentence
193
+ ShiftOffsets(sourceOffsets, startSource);
194
+ ShiftOffsets(targetOffsets, startTarget);
195
+
196
+ // get alignments from this hypo
197
+ const AlignmentInfo &aiTerm = shyperedge.label.translation->GetAlignTerm();
198
+
199
+ // add to output arg, offsetting by source & target
200
+ AlignmentInfo::const_iterator iter;
201
+ for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
202
+ const std::pair<std::size_t, std::size_t> &align = *iter;
203
+ std::size_t relSource = align.first;
204
+ std::size_t relTarget = align.second;
205
+ std::size_t absSource = sourceOffsets[relSource];
206
+ std::size_t absTarget = targetOffsets[relTarget];
207
+
208
+ std::pair<std::size_t, std::size_t> alignPoint(absSource, absTarget);
209
+ std::pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
210
+ UTIL_THROW_IF2(!ret.second, "Error");
211
+ }
212
+
213
+ return totalTargetSize;
214
+ }
215
+
216
+ std::size_t Manager::CalcSourceSize(const KBestExtractor::Derivation &d) const
217
+ {
218
+ const SHyperedge &shyperedge = d.edge->shyperedge;
219
+ std::size_t ret = shyperedge.head->pvertex->span.GetNumWordsCovered();
220
+ for (std::size_t i = 0; i < shyperedge.tail.size(); ++i) {
221
+ std::size_t childSize =
222
+ shyperedge.tail[i]->pvertex->span.GetNumWordsCovered();
223
+ ret -= (childSize - 1);
224
+ }
225
+ return ret;
226
+ }
227
+
228
+ } // Syntax
229
+ } // Moses
mosesdecoder/moses/Syntax/NonTerminalMap.h ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <vector>
4
+
5
+ #include <boost/unordered_map.hpp>
6
+
7
+ #include "moses/FactorCollection.h"
8
+ #include "moses/Word.h"
9
+
10
+ #include "SymbolEqualityPred.h"
11
+ #include "SymbolHasher.h"
12
+
13
+ namespace Moses
14
+ {
15
+ namespace Syntax
16
+ {
17
+
18
+ // Hybrid map/vector-based container for key-value pairs where the key is a
19
+ // non-terminal Word. The interface is like a (stripped-down) map type, with
20
+ // the main differences being that:
21
+ // 1. Find() is implemented using vector indexing to make it fast.
22
+ // 2. Once a value has been inserted it can be modified but can't be removed.
23
+ template<typename T>
24
+ class NonTerminalMap
25
+ {
26
+ private:
27
+ typedef boost::unordered_map<Word, T, SymbolHasher, SymbolEqualityPred> Map;
28
+ typedef std::vector<T*> Vec;
29
+
30
+ public:
31
+ typedef typename Map::iterator Iterator;
32
+ typedef typename Map::const_iterator ConstIterator;
33
+
34
+ NonTerminalMap()
35
+ : m_vec(FactorCollection::Instance().GetNumNonTerminals(), NULL) {}
36
+
37
+ Iterator Begin() {
38
+ return m_map.begin();
39
+ }
40
+ Iterator End() {
41
+ return m_map.end();
42
+ }
43
+
44
+ ConstIterator Begin() const {
45
+ return m_map.begin();
46
+ }
47
+ ConstIterator End() const {
48
+ return m_map.end();
49
+ }
50
+
51
+ std::size_t Size() const {
52
+ return m_map.size();
53
+ }
54
+
55
+ bool IsEmpty() const {
56
+ return m_map.empty();
57
+ }
58
+
59
+ std::pair<Iterator, bool> Insert(const Word &, const T &);
60
+
61
+ T *Find(const Word &w) const {
62
+ return m_vec[w[0]->GetId()];
63
+ }
64
+
65
+ private:
66
+ Map m_map;
67
+ Vec m_vec;
68
+ };
69
+
70
+ template<typename T>
71
+ std::pair<typename NonTerminalMap<T>::Iterator, bool> NonTerminalMap<T>::Insert(
72
+ const Word &key, const T &value)
73
+ {
74
+ std::pair<typename Map::iterator, bool> result =
75
+ m_map.insert(typename Map::value_type(key, value));
76
+ if (result.second) {
77
+ T *p = &(result.first->second);
78
+ std::size_t i = key[0]->GetId();
79
+ m_vec[i] = p;
80
+ }
81
+ return result;
82
+ }
83
+
84
+ } // namespace Syntax
85
+ } // namespace Moses
mosesdecoder/moses/Syntax/PHyperedge.h ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <vector>
4
+
5
+ #include "PLabel.h"
6
+
7
+ namespace Moses
8
+ {
9
+ namespace Syntax
10
+ {
11
+
12
+ struct PVertex;
13
+
14
// A hyperedge over PVertex objects: one head vertex, an ordered list of tail
// vertices and a PLabel. All vertex pointers are raw; this struct does not
// delete them.
struct PHyperedge {
  PVertex *head;               // vertex this edge leads to
  std::vector<PVertex*> tail;  // vertices this edge comes from
  PLabel label;                // label attached to the edge
};
19
+
20
+ } // Syntax
21
+ } // Moses
mosesdecoder/moses/Syntax/RuleTableFF.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+
5
+ #include "moses/TranslationModel/PhraseDictionary.h"
6
+
7
+ namespace Moses
8
+ {
9
+
10
+ class ChartParser;
11
+ class ChartCellCollectionBase;
12
+ class AllOptions;
13
+ namespace Syntax
14
+ {
15
+
16
+ class RuleTable;
17
+
18
+ // Feature function for dealing with local rule scores (that come from a
19
+ // rule table). The scores themselves are stored on TargetPhrase objects
20
+ // and the decoder accesses them directly, so this object doesn't really do
21
+ // anything except provide somewhere to store the weights and parameter values.
22
class RuleTableFF : public PhraseDictionary
{
public:
  // Constructed from a single string argument (presumably the feature's
  // configuration line -- confirm against the definition).
  RuleTableFF(const std::string &);

  // FIXME Delete m_table?
  // The destructor currently does nothing, so m_table is not freed here.
  ~RuleTableFF() {}

  void Load(AllOptions::ptr const& opts);

  // Returns the rule table pointer held by this feature.
  const RuleTable *GetTable() const {
    return m_table;
  }

  // Returns the static list of RuleTableFF instances.
  static const std::vector<RuleTableFF*> &Instances() {
    return s_instances;
  }

  // Not supported by this feature: asserts in debug builds and returns a
  // null pointer otherwise.
  ChartRuleLookupManager *CreateRuleLookupManager(
    const ChartParser &, const ChartCellCollectionBase &, std::size_t) {
    assert(false);
    return 0;
  }

  // Get the source terminal vocabulary for this table's grammar (as a set of
  // factor IDs)
  const boost::unordered_set<std::size_t> &GetSourceTerminalSet() const {
    return m_sourceTerminalSet;
  }

private:
  static std::vector<RuleTableFF*> s_instances;

  const RuleTable *m_table;
  boost::unordered_set<std::size_t> m_sourceTerminalSet;
};
58
+
59
+ } // Syntax
60
+ } // Moses
mosesdecoder/moses/Syntax/SHyperedgeBundle.h ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <vector>
4
+
5
+ #include "moses/ScoreComponentCollection.h"
6
+ #include "moses/TargetPhraseCollection.h"
7
+
8
+ #include "SVertexStack.h"
9
+
10
+ namespace Moses
11
+ {
12
+ namespace Syntax
13
+ {
14
+
15
+ struct PVertex;
16
+
17
// Bundles an input weight, a list of SVertexStack pointers and a shared
// pointer to a TargetPhraseCollection.
struct SHyperedgeBundle {
  float inputWeight;
  std::vector<const SVertexStack*> stacks;  // non-owning pointers
  TargetPhraseCollection::shared_ptr translations;

  // Member-wise swap. The unqualified calls pick a type-specific swap via
  // argument-dependent lookup and fall back to std::swap.
  friend void swap(SHyperedgeBundle &x, SHyperedgeBundle &y) {
    using std::swap;
    swap(x.inputWeight, y.inputWeight);
    swap(x.stacks, y.stacks);
    swap(x.translations, y.translations);
  }
};
29
+
30
+ } // Syntax
31
+ } // Moses
mosesdecoder/moses/Syntax/SVertexRecombinationHasher.h ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include "moses/FF/FFState.h"
4
+
5
+ #include "SVertex.h"
6
+
7
+ namespace Moses
8
+ {
9
+ namespace Syntax
10
+ {
11
+
12
+ class SVertexRecombinationHasher
13
+ {
14
+ public:
15
+ std::size_t operator()(const SVertex *v) const {
16
+ std::size_t seed = 0;
17
+ for (std::vector<FFState*>::const_iterator p = v->states.begin();
18
+ p != v->states.end(); ++p) {
19
+ boost::hash_combine(seed, (*p)->hash());
20
+ }
21
+ return seed;
22
+ }
23
+ };
24
+
25
+ } // Syntax
26
+ } // Moses
mosesdecoder/moses/TranslationModel/CompactPT/BlockHashIndex.cpp ADDED
@@ -0,0 +1,424 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include "ThrowingFwrite.h"
23
+ #include "BlockHashIndex.h"
24
+ #include "CmphStringVectorAdapter.h"
25
+ #include "util/exception.hh"
26
+ #include "util/string_stream.hh"
27
+
28
+ #ifdef HAVE_CMPH
29
+ #include "cmph.h"
30
+ #endif
31
+
32
+ namespace Moses
33
+ {
34
#ifdef WITH_THREADS
// Threaded build: stores the two bit widths, clears the file state and
// counters, and passes threadsNum to the thread pool.
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
                               size_t threadsNum)
  : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
    m_fileHandle(0), m_fileHandleStart(0), m_landmarks(true), m_size(0),
    m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0),
    m_threadPool(threadsNum)
{
#ifndef HAVE_CMPH
  // The index cannot be used without CMPH: report and terminate the process.
  std::cerr << "minphr: CMPH support not compiled in." << std::endl;
  exit(1);
#endif
}
#else
// Non-threaded build: same initialisation without a thread pool.
// NOTE(review): unlike the threaded constructor, this one default-constructs
// m_landmarks instead of passing `true` -- confirm the difference is intended.
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits)
  : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
    m_fileHandle(0), m_fileHandleStart(0), m_size(0),
    m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0)
{
#ifndef HAVE_CMPH
  // The index cannot be used without CMPH: report and terminate the process.
  std::cerr << "minphr: CMPH support not compiled in." << std::endl;
  exit(1);
#endif
}
#endif
59
+
60
+ BlockHashIndex::~BlockHashIndex()
61
+ {
62
+ #ifdef HAVE_CMPH
63
+ for(std::vector<void*>::iterator it = m_hashes.begin();
64
+ it != m_hashes.end(); it++)
65
+ if(*it != 0)
66
+ cmph_destroy((cmph_t*)*it);
67
+
68
+ for(std::vector<PairedPackedArray<>*>::iterator it = m_arrays.begin();
69
+ it != m_arrays.end(); it++)
70
+ if(*it != 0)
71
+ delete *it;
72
+ #endif
73
+ }
74
+
75
+ size_t BlockHashIndex::GetHash(const char* key)
76
+ {
77
+ std::string keyStr(key);
78
+ size_t i = std::distance(m_landmarks.begin(),
79
+ std::upper_bound(m_landmarks.begin(),
80
+ m_landmarks.end(), keyStr)) - 1;
81
+
82
+ if(i == 0ul-1)
83
+ return GetSize();
84
+
85
+ size_t pos = GetHash(i, key);
86
+ if(pos != GetSize())
87
+ return (1ul << m_orderBits) * i + pos;
88
+ else
89
+ return GetSize();
90
+ }
91
+
92
+ size_t BlockHashIndex::GetFprint(const char* key) const
93
+ {
94
+ size_t hash;
95
+ MurmurHash3_x86_32(key, std::strlen(key), 100000, &hash);
96
+ hash &= (1ul << m_fingerPrintBits) - 1;
97
+ return hash;
98
+ }
99
+
100
+ size_t BlockHashIndex::GetHash(size_t i, const char* key)
101
+ {
102
+ //#ifdef WITH_THREADS
103
+ // boost::mutex::scoped_lock lock(m_mutex);
104
+ //#endif
105
+ //if(m_hashes[i] == 0)
106
+ //LoadRange(i);
107
+ #ifdef HAVE_CMPH
108
+ size_t idx = cmph_search((cmph_t*)m_hashes[i], key, (cmph_uint32) strlen(key));
109
+ #else
110
+ assert(0);
111
+ size_t idx = 0;
112
+ #endif
113
+
114
+ std::pair<size_t, size_t> orderPrint = m_arrays[i]->Get(idx, m_orderBits, m_fingerPrintBits);
115
+ m_clocks[i] = clock();
116
+
117
+ if(GetFprint(key) == orderPrint.second)
118
+ return orderPrint.first;
119
+ else
120
+ return GetSize();
121
+ }
122
+
123
// Convenience overload: forwards the string's C string to GetHash(const char*).
size_t BlockHashIndex::GetHash(std::string key)
{
  return GetHash(key.c_str());
}
127
+
128
// Subscript syntax for GetHash(std::string).
size_t BlockHashIndex::operator[](std::string key)
{
  return GetHash(key);
}
132
+
133
// Subscript syntax for looking up a C string key via GetHash.
size_t BlockHashIndex::operator[](char* key)
{
  return GetHash(key);
}
137
+
138
+ size_t BlockHashIndex::Save(std::string filename)
139
+ {
140
+ std::FILE* mphf = std::fopen(filename.c_str(), "w");
141
+ size_t size = Save(mphf);
142
+ std::fclose(mphf);
143
+ return size;
144
+ }
145
+
146
+ void BlockHashIndex::BeginSave(std::FILE * mphf)
147
+ {
148
+ m_fileHandle = mphf;
149
+ ThrowingFwrite(&m_orderBits, sizeof(size_t), 1, m_fileHandle);
150
+ ThrowingFwrite(&m_fingerPrintBits, sizeof(size_t), 1, m_fileHandle);
151
+
152
+ m_fileHandleStart = std::ftell(m_fileHandle);
153
+
154
+ size_t relIndexPos = 0;
155
+ ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
156
+ }
157
+
158
+ void BlockHashIndex::SaveRange(size_t i)
159
+ {
160
+ #ifdef HAVE_CMPH
161
+ if(m_seekIndex.size() <= i)
162
+ m_seekIndex.resize(i+1);
163
+ m_seekIndex[i] = std::ftell(m_fileHandle) - m_fileHandleStart;
164
+ cmph_dump((cmph_t*)m_hashes[i], m_fileHandle);
165
+ m_arrays[i]->Save(m_fileHandle);
166
+ #endif
167
+ }
168
+
169
+ void BlockHashIndex::SaveLastRange()
170
+ {
171
+ #ifdef WITH_THREADS
172
+ boost::mutex::scoped_lock lock(m_mutex);
173
+ #endif
174
+
175
+ while(!m_queue.empty() && m_lastSaved + 1 == -m_queue.top()) {
176
+ size_t current = -m_queue.top();
177
+ m_queue.pop();
178
+ SaveRange(current);
179
+ m_lastSaved = current;
180
+ }
181
+ }
182
+
183
+ void BlockHashIndex::DropRange(size_t i)
184
+ {
185
+ #ifdef HAVE_CMPH
186
+ if(m_hashes[i] != 0) {
187
+ cmph_destroy((cmph_t*)m_hashes[i]);
188
+ m_hashes[i] = 0;
189
+ }
190
+ if(m_arrays[i] != 0) {
191
+ delete m_arrays[i];
192
+ m_arrays[i] = 0;
193
+ m_clocks[i] = 0;
194
+ }
195
+ m_numLoadedRanges--;
196
+ #endif
197
+ }
198
+
199
+ void BlockHashIndex::DropLastRange()
200
+ {
201
+ #ifdef WITH_THREADS
202
+ boost::mutex::scoped_lock lock(m_mutex);
203
+ #endif
204
+
205
+ while(m_lastDropped != m_lastSaved)
206
+ DropRange(++m_lastDropped);
207
+ }
208
+
209
#ifdef WITH_THREADS
// Threaded builds only: stops the thread pool, passing `true` to Stop()
// (presumably "finish queued work first" -- confirm against ThreadPool).
void BlockHashIndex::WaitAll()
{
  m_threadPool.Stop(true);
}
#endif
215
+
216
+ size_t BlockHashIndex::FinalizeSave()
217
+ {
218
+ #ifdef WITH_THREADS
219
+ m_threadPool.Stop(true);
220
+ #endif
221
+
222
+ SaveLastRange();
223
+
224
+ size_t relIndexPos = std::ftell(m_fileHandle) - m_fileHandleStart;
225
+
226
+ std::fseek(m_fileHandle, m_fileHandleStart, SEEK_SET);
227
+ ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
228
+
229
+ std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
230
+ m_landmarks.save(m_fileHandle);
231
+
232
+ size_t seekIndexSize = m_seekIndex.size();
233
+ ThrowingFwrite(&seekIndexSize, sizeof(size_t), 1, m_fileHandle);
234
+ ThrowingFwrite(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle);
235
+
236
+ ThrowingFwrite(&m_size, sizeof(size_t), 1, m_fileHandle);
237
+
238
+ size_t fileHandleStop = std::ftell(m_fileHandle);
239
+ return fileHandleStop - m_fileHandleStart + sizeof(m_orderBits)
240
+ + sizeof(m_fingerPrintBits);
241
+ }
242
+
243
+ size_t BlockHashIndex::Save(std::FILE * mphf)
244
+ {
245
+ m_queue = std::priority_queue<int>();
246
+ BeginSave(mphf);
247
+ for(size_t i = 0; i < m_hashes.size(); i++)
248
+ SaveRange(i);
249
+ return FinalizeSave();
250
+ }
251
+
252
+ size_t BlockHashIndex::LoadIndex(std::FILE* mphf)
253
+ {
254
+ m_fileHandle = mphf;
255
+
256
+ size_t beginning = std::ftell(mphf);
257
+
258
+ size_t read = 0;
259
+ read += std::fread(&m_orderBits, sizeof(size_t), 1, mphf);
260
+ read += std::fread(&m_fingerPrintBits, sizeof(size_t), 1, mphf);
261
+ m_fileHandleStart = std::ftell(m_fileHandle);
262
+
263
+ size_t relIndexPos;
264
+ read += std::fread(&relIndexPos, sizeof(size_t), 1, mphf);
265
+ std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
266
+
267
+ m_landmarks.load(mphf);
268
+
269
+ size_t seekIndexSize;
270
+ read += std::fread(&seekIndexSize, sizeof(size_t), 1, m_fileHandle);
271
+ m_seekIndex.resize(seekIndexSize);
272
+ read += std::fread(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle);
273
+ m_hashes.resize(seekIndexSize, 0);
274
+ m_clocks.resize(seekIndexSize, 0);
275
+ m_arrays.resize(seekIndexSize, 0);
276
+
277
+ read += std::fread(&m_size, sizeof(size_t), 1, m_fileHandle);
278
+
279
+ size_t end = std::ftell(mphf);
280
+
281
+ return end - beginning;
282
+ }
283
+
284
+ void BlockHashIndex::LoadRange(size_t i)
285
+ {
286
+ #ifdef HAVE_CMPH
287
+ std::fseek(m_fileHandle, m_fileHandleStart + m_seekIndex[i], SEEK_SET);
288
+ cmph_t* hash = cmph_load(m_fileHandle);
289
+ m_arrays[i] = new PairedPackedArray<>(0, m_orderBits,
290
+ m_fingerPrintBits);
291
+ m_arrays[i]->Load(m_fileHandle);
292
+
293
+ m_hashes[i] = (void*)hash;
294
+ m_clocks[i] = clock();
295
+
296
+ m_numLoadedRanges++;
297
+ #endif
298
+ }
299
+
300
+ size_t BlockHashIndex::Load(std::string filename)
301
+ {
302
+ std::FILE* mphf = std::fopen(filename.c_str(), "r");
303
+ size_t size = Load(mphf);
304
+ std::fclose(mphf);
305
+ return size;
306
+ }
307
+
308
+ size_t BlockHashIndex::Load(std::FILE * mphf)
309
+ {
310
+ size_t byteSize = LoadIndex(mphf);
311
+ size_t end = std::ftell(mphf);
312
+
313
+ for(size_t i = 0; i < m_seekIndex.size(); i++)
314
+ LoadRange(i);
315
+ std::fseek(m_fileHandle, end, SEEK_SET);
316
+ return byteSize;
317
+ }
318
+
319
// Returns m_size. The lookup functions also return this value to signal
// "key not found".
size_t BlockHashIndex::GetSize() const
{
  return m_size;
}
323
+
324
// Currently a no-op: the whole body is commented out, so both parameters are
// ignored. The disabled code would drop the least recently used ranges once
// more than ratio * (1 + tolerance) of all ranges are loaded.
void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance)
{
  /*
  #ifdef WITH_THREADS
    boost::mutex::scoped_lock lock(m_mutex);
  #endif
    size_t n = m_hashes.size() * ratio;
    size_t max = n * (1 + tolerance);
    if(m_numLoadedRanges > max) {
      typedef std::vector<std::pair<clock_t, size_t> > LastLoaded;
      LastLoaded lastLoaded;
      for(size_t i = 0; i < m_hashes.size(); i++)
        if(m_hashes[i] != 0)
          lastLoaded.push_back(std::make_pair(m_clocks[i], i));

      std::sort(lastLoaded.begin(), lastLoaded.end());
      for(LastLoaded::reverse_iterator it = lastLoaded.rbegin() + size_t(n * (1 - tolerance));
          it != lastLoaded.rend(); it++)
        DropRange(it->second);
    }*/
}
345
+
346
+ void BlockHashIndex::CalcHash(size_t current, void* source_void)
347
+ {
348
+ #ifdef HAVE_CMPH
349
+ cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void;
350
+ cmph_config_t *config = cmph_config_new(source);
351
+ cmph_config_set_algo(config, CMPH_CHD);
352
+
353
+ cmph_t* hash = cmph_new(config);
354
+ PairedPackedArray<> *pv =
355
+ new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits);
356
+
357
+ size_t i = 0;
358
+
359
+ source->rewind(source->data);
360
+
361
+ std::string lastKey = "";
362
+ while(i < source->nkeys) {
363
+ unsigned keylen;
364
+ char* key;
365
+ source->read(source->data, &key, &keylen);
366
+ std::string temp(key, keylen);
367
+ source->dispose(source->data, key, keylen);
368
+
369
+ if(lastKey > temp) {
370
+ if(source->nkeys != 2 || temp != "###DUMMY_KEY###") {
371
+ util::StringStream strme;
372
+ strme << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort\n";
373
+ strme << "1: " << lastKey << "\n";
374
+ strme << "2: " << temp << "\n";
375
+ UTIL_THROW2(strme.str());
376
+ }
377
+ }
378
+ lastKey = temp;
379
+
380
+ size_t fprint = GetFprint(temp.c_str());
381
+ size_t idx = cmph_search(hash, temp.c_str(),
382
+ (cmph_uint32) temp.size());
383
+
384
+ pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits);
385
+ i++;
386
+ }
387
+
388
+ cmph_config_destroy(config);
389
+
390
+ #ifdef WITH_THREADS
391
+ boost::mutex::scoped_lock lock(m_mutex);
392
+ #endif
393
+
394
+ if(m_hashes.size() <= current) {
395
+ m_hashes.resize(current + 1, 0);
396
+ m_arrays.resize(current + 1, 0);
397
+ m_clocks.resize(current + 1, 0);
398
+ }
399
+
400
+ m_hashes[current] = (void*)hash;
401
+ m_arrays[current] = pv;
402
+ m_clocks[current] = clock();
403
+ m_queue.push(-current);
404
+ #endif
405
+ }
406
+
407
#ifdef HAVE_CMPH
// The overloads below wrap a key container in the matching CMPH adapter and
// return the result as void*; CalcHash() casts such a pointer back to
// cmph_io_adapter_t*.
// NOTE(review): none of them frees the adapter -- confirm who owns it.

// Adapter for a std::vector of strings.
void* BlockHashIndex::vectorAdapter(std::vector<std::string>& v)
{
  return (void*)CmphVectorAdapter(v);
}

// Adapter for a StringVector using std::allocator.
void* BlockHashIndex::vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv)
{
  return (void*)CmphStringVectorAdapter(sv);
}

// Adapter for a StringVector using MmapAllocator.
void* BlockHashIndex::vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv)
{
  return (void*)CmphStringVectorAdapter(sv);
}
#endif
423
+
424
+ }
mosesdecoder/moses/TranslationModel/CompactPT/ConsistentPhrases.h ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_ConsistentPhrases_h
23
+ #define moses_ConsistentPhrases_h
24
+
25
+ #include <set>
26
+
27
+ namespace Moses
28
+ {
29
+
30
// Enumerates all phrase pairs that are consistent with a word alignment and
// hands them out in a fixed priority order.
//
// A phrase pair is a source span [i, i+m) and a target span [j, j+n). It is
// consistent if every alignment point lies either inside both spans or
// outside both spans.
class ConsistentPhrases
{
public:
  // Source start i / length m, target start j / length n.
  // Note the constructor argument order: (i, m, j, n).
  struct Phrase {
    int i, j, m, n;
    Phrase(int i_, int m_, int j_, int n_) : i(i_), j(j_), m(m_), n(n_) { }
  };

  // Strict weak ordering for the queue: longer target span first, then
  // smaller target start, then longer source span, then smaller source start.
  struct PhraseSorter {
    // Must be callable on a const comparator: the standard requires it of
    // associative-container comparators and libstdc++ enforces it in C++17.
    bool operator()(const Phrase &a, const Phrase &b) const {
      if(a.n != b.n)
        return a.n > b.n;
      if(a.j != b.j)
        return a.j < b.j;
      if(a.m != b.m)
        return a.m > b.m;
      return a.i < b.i;
    }
  };

private:
  typedef std::set<Phrase, PhraseSorter> PhraseQueue;
  PhraseQueue m_phraseQueue;

  typedef std::pair<unsigned char, unsigned char> AlignPoint;
  typedef std::set<AlignPoint> Alignment;

public:

  // Collects every consistent phrase pair for a sentence pair with mmax
  // source and nmax target words, except the pair covering both sentences
  // entirely. The alignment `a` holds (source position, target position)
  // points and is only read.
  ConsistentPhrases(int mmax, int nmax, const Alignment& a) {
    for(int i = 0; i < mmax; i++) {
      for(int m = 1; m <= mmax-i; m++) {
        for(int j = 0; j < nmax; j++) {
          for(int n = 1; n <= nmax-j; n++) {
            if(IsConsistent(i, m, j, n, a))
              m_phraseQueue.insert(Phrase(i, m, j, n));
          }
        }
      }
    }
    m_phraseQueue.erase(Phrase(0, mmax, 0, nmax));
  }

  // Non-zero if no phrase pair is left.
  size_t Empty() const {
    return m_phraseQueue.empty();
  }

  // Removes and returns the first phrase pair in priority order, or
  // Phrase(0,0,0,0) if the queue is empty.
  Phrase Pop() {
    if(m_phraseQueue.empty())
      return Phrase(0,0,0,0);

    Phrase p = *m_phraseQueue.begin();
    m_phraseQueue.erase(m_phraseQueue.begin());
    return p;
  }

  // Removes every queued phrase pair whose source span or target span
  // overlaps the corresponding span of p.
  void RemoveOverlap(Phrase p) {
    PhraseQueue ok;
    for(PhraseQueue::const_iterator it = m_phraseQueue.begin();
        it != m_phraseQueue.end(); ++it) {
      const Phrase &pp = *it;
      const bool sourceOverlap =
        (p.i <= pp.i && pp.i < p.i + p.m) || (pp.i <= p.i && p.i < pp.i + pp.m);
      const bool targetOverlap =
        (p.j <= pp.j && pp.j < p.j + p.n) || (pp.j <= p.j && p.j < pp.j + pp.n);
      if(!sourceOverlap && !targetOverlap)
        ok.insert(pp);
    }
    // Swap instead of assigning to avoid copying the surviving elements.
    m_phraseQueue.swap(ok);
  }

private:
  // True if no alignment point has exactly one end inside the spans
  // [i, i+m) and [j, j+n).
  static bool IsConsistent(int i, int m, int j, int n, const Alignment& a) {
    for(Alignment::const_iterator it = a.begin(); it != a.end(); ++it) {
      const int ip = it->first;
      const int jp = it->second;
      if((i <= ip && ip < i+m) != (j <= jp && jp < j+n))
        return false;
    }
    return true;
  }

};
109
+
110
+ }
111
+
112
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/Jamfile ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Build rules for the CompactPT library.
# `current` holds the cmph option string that is recorded in the log file
# below; it stays empty when --with-cmph is not given.
local current = "" ;
local includes = ;
local with-cmph = [ option.get "with-cmph" ] ;
if $(with-cmph) {
  # Link against cmph from the given prefix and build all sources here.
  lib cmph : : <search>$(with-cmph)/lib <search>$(with-cmph)/lib64 ;
  includes += <include>$(with-cmph)/include ;
  current = "--with-cmph=$(with-cmph)" ;
  fakelib CompactPT : [ glob *.cpp ] ../..//headers cmph : $(includes) <dependency>$(PT-LOG) : : $(includes) ;
}
else {
  # No cmph: both targets are declared but empty.
  alias cmph ;
  fakelib CompactPT ;
}

# The log file records the option string via update-if-changed.
path-constant PT-LOG : bin/pt.log ;
update-if-changed $(PT-LOG) $(current) ;
17
+
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #include <sstream>
23
+ #include "LexicalReorderingTableCreator.h"
24
+ #include "ThrowingFwrite.h"
25
+ #include "moses/Util.h"
26
+ #include "util/file.hh"
27
+ #include "util/exception.hh"
28
+
29
+ namespace Moses
30
+ {
31
+
32
+ LexicalReorderingTableCreator::LexicalReorderingTableCreator(
33
+ std::string inPath, std::string outPath, std::string tempfilePath,
34
+ size_t orderBits, size_t fingerPrintBits, bool multipleScoreTrees,
35
+ size_t quantize
36
+ #ifdef WITH_THREADS
37
+ , size_t threads
38
+ #endif
39
+ )
40
+ : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
41
+ m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
42
+ m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees),
43
+ m_quantize(quantize), m_separator(" ||| "),
44
+ m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1)
45
+ #ifdef WITH_THREADS
46
+ , m_threads(threads)
47
+ #endif
48
+ {
49
+ PrintInfo();
50
+
51
+ m_outFile = std::fopen(m_outPath.c_str(), "w");
52
+
53
+ std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl;
54
+ m_hash.BeginSave(m_outFile);
55
+
56
+ if(tempfilePath.size()) {
57
+ MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
58
+ m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
59
+ } else {
60
+ m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(true);
61
+ }
62
+
63
+ EncodeScores();
64
+
65
+ std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
66
+ CalcHuffmanCodes();
67
+
68
+ std::cerr << "Pass 2/2: Compressing scores" << std::endl;
69
+
70
+
71
+ if(tempfilePath.size()) {
72
+ MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
73
+ m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
74
+ } else {
75
+ m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(true);
76
+ }
77
+ CompressScores();
78
+
79
+ std::cerr << "Saving to " << m_outPath << std::endl;
80
+ Save();
81
+ std::cerr << "Done" << std::endl;
82
+ std::fclose(m_outFile);
83
+ }
84
+
85
+ void LexicalReorderingTableCreator::PrintInfo()
86
+ {
87
+ std::cerr << "Used options:" << std::endl;
88
+ std::cerr << "\tText reordering table will be read from: " << m_inPath << std::endl;
89
+ std::cerr << "\tOutput reordering table will be written to: " << m_outPath << std::endl;
90
+ std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
91
+ std::cerr << "\tPhrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
92
+ std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
93
+ std::cerr << "\tUsing score quantization: ";
94
+ if(m_quantize)
95
+ std::cerr << m_quantize << " best" << std::endl;
96
+ else
97
+ std::cerr << "no" << std::endl;
98
+
99
+ #ifdef WITH_THREADS
100
+ std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
101
+ #endif
102
+ std::cerr << std::endl;
103
+ }
104
+
105
+ LexicalReorderingTableCreator::~LexicalReorderingTableCreator()
106
+ {
107
+ for(size_t i = 0; i < m_scoreTrees.size(); i++) {
108
+ delete m_scoreTrees[i];
109
+ delete m_scoreCounters[i];
110
+ }
111
+
112
+ delete m_encodedScores;
113
+ delete m_compressedScores;
114
+ }
115
+
116
+
117
+ void LexicalReorderingTableCreator::EncodeScores()
118
+ {
119
+ InputFileStream inFile(m_inPath);
120
+
121
+ #ifdef WITH_THREADS
122
+ boost::thread_group threads;
123
+ for (size_t i = 0; i < m_threads; ++i) {
124
+ EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
125
+ threads.create_thread(*et);
126
+ }
127
+ threads.join_all();
128
+ #else
129
+ EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
130
+ (*et)();
131
+ delete et;
132
+ #endif
133
+ FlushEncodedQueue(true);
134
+ }
135
+
136
+ void LexicalReorderingTableCreator::CalcHuffmanCodes()
137
+ {
138
+ std::vector<ScoreTree*>::iterator treeIt = m_scoreTrees.begin();
139
+ for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
140
+ it != m_scoreCounters.end(); it++) {
141
+ if(m_quantize)
142
+ (*it)->Quantize(m_quantize);
143
+
144
+ std::cerr << "\tCreating Huffman codes for " << (*it)->Size()
145
+ << " scores" << std::endl;
146
+
147
+ *treeIt = new ScoreTree((*it)->Begin(), (*it)->End());
148
+ treeIt++;
149
+ }
150
+ std::cerr << std::endl;
151
+ }
152
+
153
+ void LexicalReorderingTableCreator::CompressScores()
154
+ {
155
+ #ifdef WITH_THREADS
156
+ boost::thread_group threads;
157
+ for (size_t i = 0; i < m_threads; ++i) {
158
+ CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
159
+ threads.create_thread(*ct);
160
+ }
161
+ threads.join_all();
162
+ #else
163
+ CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
164
+ (*ct)();
165
+ delete ct;
166
+ #endif
167
+ FlushCompressedQueue(true);
168
+ }
169
+
170
+ void LexicalReorderingTableCreator::Save()
171
+ {
172
+ ThrowingFwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile);
173
+ ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
174
+ for(size_t i = 0; i < m_scoreTrees.size(); i++)
175
+ m_scoreTrees[i]->Save(m_outFile);
176
+
177
+ m_compressedScores->save(m_outFile);
178
+ }
179
+
180
+ std::string LexicalReorderingTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
181
+ {
182
+ std::string key = source + m_separator;
183
+ if(!target.empty())
184
+ key += target + m_separator;
185
+ return key;
186
+ }
187
+
188
+ std::string LexicalReorderingTableCreator::EncodeLine(std::vector<std::string>& tokens)
189
+ {
190
+ std::string scoresString = tokens.back();
191
+ std::stringstream scoresStream;
192
+
193
+ std::vector<float> scores;
194
+ Tokenize<float>(scores, scoresString);
195
+
196
+ if(!m_numScoreComponent) {
197
+ m_numScoreComponent = scores.size();
198
+ m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
199
+ for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
200
+ it != m_scoreCounters.end(); it++)
201
+ *it = new ScoreCounter();
202
+ m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
203
+ }
204
+
205
+ if(m_numScoreComponent != scores.size()) {
206
+ std::stringstream strme;
207
+ strme << "Error: Wrong number of scores detected ("
208
+ << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
209
+ strme << "Line: " << tokens[0] << " ||| ... ||| " << scoresString << std::endl;
210
+ UTIL_THROW2(strme.str());
211
+ }
212
+
213
+ size_t c = 0;
214
+ float score;
215
+ while(c < m_numScoreComponent) {
216
+ score = scores[c];
217
+ score = FloorScore(TransformScore(score));
218
+ scoresStream.write((char*)&score, sizeof(score));
219
+
220
+ m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
221
+ c++;
222
+ }
223
+
224
+ return scoresStream.str();
225
+ }
226
+
227
+ void LexicalReorderingTableCreator::AddEncodedLine(PackedItem& pi)
228
+ {
229
+ m_queue.push(pi);
230
+ }
231
+
232
+ void LexicalReorderingTableCreator::FlushEncodedQueue(bool force)
233
+ {
234
+ if(force || m_queue.size() > 10000) {
235
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
236
+ PackedItem pi = m_queue.top();
237
+ m_queue.pop();
238
+ m_lastFlushedLine++;
239
+
240
+ m_lastRange.push_back(pi.GetSrc());
241
+ m_encodedScores->push_back(pi.GetTrg());
242
+
243
+ if((pi.GetLine()+1) % 100000 == 0)
244
+ std::cerr << ".";
245
+ if((pi.GetLine()+1) % 5000000 == 0)
246
+ std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
247
+
248
+ if(m_lastRange.size() == (1ul << m_orderBits)) {
249
+ m_hash.AddRange(m_lastRange);
250
+ m_hash.SaveLastRange();
251
+ m_hash.DropLastRange();
252
+ m_lastRange.clear();
253
+ }
254
+ }
255
+ }
256
+
257
+ if(force) {
258
+ m_lastFlushedLine = -1;
259
+
260
+ if(!m_lastRange.empty()) {
261
+ m_hash.AddRange(m_lastRange);
262
+ m_lastRange.clear();
263
+ }
264
+
265
+ #ifdef WITH_THREADS
266
+ m_hash.WaitAll();
267
+ #endif
268
+
269
+ m_hash.SaveLastRange();
270
+ m_hash.DropLastRange();
271
+ m_hash.FinalizeSave();
272
+
273
+ std::cerr << std::endl << std::endl;
274
+ }
275
+ }
276
+
277
+ std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores)
278
+ {
279
+ std::stringstream encodedScoresStream(encodedScores);
280
+ encodedScoresStream.unsetf(std::ios::skipws);
281
+
282
+ std::string compressedScores;
283
+ BitWrapper<> compressedScoresStream(compressedScores);
284
+
285
+ size_t currScore = 0;
286
+ float score;
287
+ encodedScoresStream.read((char*) &score, sizeof(score));
288
+
289
+ while(encodedScoresStream) {
290
+ size_t index = currScore % m_scoreTrees.size();
291
+
292
+ if(m_quantize)
293
+ score = m_scoreCounters[index]->LowerBound(score);
294
+
295
+ m_scoreTrees[index]->Put(compressedScoresStream, score);
296
+ encodedScoresStream.read((char*) &score, sizeof(score));
297
+ currScore++;
298
+ }
299
+
300
+ return compressedScores;
301
+ }
302
+
303
+ void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi)
304
+ {
305
+ m_queue.push(pi);
306
+ }
307
+
308
+ void LexicalReorderingTableCreator::FlushCompressedQueue(bool force)
309
+ {
310
+ if(force || m_queue.size() > 10000) {
311
+ while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
312
+ PackedItem pi = m_queue.top();
313
+ m_queue.pop();
314
+ m_lastFlushedLine++;
315
+
316
+ m_compressedScores->push_back(pi.GetTrg());
317
+
318
+ if((pi.GetLine()+1) % 100000 == 0)
319
+ std::cerr << ".";
320
+ if((pi.GetLine()+1) % 5000000 == 0)
321
+ std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
322
+ }
323
+ }
324
+
325
+ if(force) {
326
+ m_lastFlushedLine = -1;
327
+ std::cerr << std::endl << std::endl;
328
+ }
329
+ }
330
+
331
+ //****************************************************************************//
332
+
333
// Running count of input lines handed out so far; shared by all encoding
// tasks because it is a static member. It starts at 0 and is only advanced in
// operator().
+ size_t EncodingTaskReordering::m_lineNum = 0;
334
+ #ifdef WITH_THREADS
335
// m_mutex is held while results are added to the creator's queue and flushed.
+ boost::mutex EncodingTaskReordering::m_mutex;
336
// m_fileMutex is held while lines are read from the input and m_lineNum is
// advanced.
+ boost::mutex EncodingTaskReordering::m_fileMutex;
337
+ #endif
338
+
339
+ EncodingTaskReordering::EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator)
340
+ : m_inFile(inFile), m_creator(creator) {}
341
+
342
+ void EncodingTaskReordering::operator()()
343
+ {
344
+ size_t lineNum = 0;
345
+
346
+ std::vector<std::string> lines;
347
+ size_t max_lines = 1000;
348
+ lines.reserve(max_lines);
349
+
350
+ {
351
+ #ifdef WITH_THREADS
352
+ boost::mutex::scoped_lock lock(m_fileMutex);
353
+ #endif
354
+ std::string line;
355
+ while(lines.size() < max_lines && std::getline(m_inFile, line))
356
+ lines.push_back(line);
357
+ lineNum = m_lineNum;
358
+ m_lineNum += lines.size();
359
+ }
360
+
361
+ std::vector<PackedItem> result;
362
+ result.reserve(max_lines);
363
+
364
+ while(lines.size()) {
365
+ for(size_t i = 0; i < lines.size(); i++) {
366
+ std::vector<std::string> tokens;
367
+ Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
368
+
369
+ std::string encodedLine = m_creator.EncodeLine(tokens);
370
+
371
+ std::string f = tokens[0];
372
+
373
+ std::string e;
374
+ if(tokens.size() > 2)
375
+ e = tokens[1];
376
+
377
+ PackedItem packedItem(lineNum + i, m_creator.MakeSourceTargetKey(f, e),
378
+ encodedLine, i);
379
+ result.push_back(packedItem);
380
+ }
381
+
382
+ {
383
+ #ifdef WITH_THREADS
384
+ boost::mutex::scoped_lock lock(m_mutex);
385
+ #endif
386
+ for(size_t i = 0; i < result.size(); i++)
387
+ m_creator.AddEncodedLine(result[i]);
388
+ m_creator.FlushEncodedQueue();
389
+ }
390
+
391
+ lines.clear();
392
+ result.clear();
393
+ lines.reserve(max_lines);
394
+ result.reserve(max_lines);
395
+
396
+ #ifdef WITH_THREADS
397
+ boost::mutex::scoped_lock lock(m_fileMutex);
398
+ #endif
399
+ std::string line;
400
+ while(lines.size() < max_lines && std::getline(m_inFile, line))
401
+ lines.push_back(line);
402
+ lineNum = m_lineNum;
403
+ m_lineNum += lines.size();
404
+ }
405
+ }
406
+
407
+ //****************************************************************************//
408
+
409
// Index of the next encoded score entry to compress; shared by all
// compression tasks because it is a static member. It starts at 0 and is only
// advanced in operator().
+ size_t CompressionTaskReordering::m_scoresNum = 0;
410
+ #ifdef WITH_THREADS
411
// Held while m_scoresNum is advanced and while results are queued and flushed.
+ boost::mutex CompressionTaskReordering::m_mutex;
412
+ #endif
413
+
414
+ CompressionTaskReordering::CompressionTaskReordering(StringVector<unsigned char, unsigned long,
415
+ MmapAllocator>& encodedScores,
416
+ LexicalReorderingTableCreator& creator)
417
+ : m_encodedScores(encodedScores), m_creator(creator)
418
+ { }
419
+
420
+ void CompressionTaskReordering::operator()()
421
+ {
422
+ size_t scoresNum;
423
+ {
424
+ #ifdef WITH_THREADS
425
+ boost::mutex::scoped_lock lock(m_mutex);
426
+ #endif
427
+ scoresNum = m_scoresNum;
428
+ m_scoresNum++;
429
+ }
430
+
431
+ while(scoresNum < m_encodedScores.size()) {
432
+ std::string scores = m_encodedScores[scoresNum];
433
+ std::string compressedScores
434
+ = m_creator.CompressEncodedScores(scores);
435
+
436
+ std::string dummy;
437
+ PackedItem packedItem(scoresNum, dummy, compressedScores, 0);
438
+
439
+ #ifdef WITH_THREADS
440
+ boost::mutex::scoped_lock lock(m_mutex);
441
+ #endif
442
+ m_creator.AddCompressedScores(packedItem);
443
+ m_creator.FlushCompressedQueue();
444
+
445
+ scoresNum = m_scoresNum;
446
+ m_scoresNum++;
447
+ }
448
+ }
449
+
450
+ }
mosesdecoder/moses/TranslationModel/CompactPT/MmapAllocator.h ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_MmapAllocator_h
23
+ #define moses_MmapAllocator_h
24
+
25
+ #include <limits>
26
+ #include <iostream>
27
+ #include <cstdio>
28
+ #include <unistd.h>
29
+
30
+ #if defined(_WIN32) || defined(_WIN64)
31
+ #include <windows.h>
32
+ #include <io.h>
33
+ #else
34
+ #include <sys/mman.h>
35
+ #endif
36
+
37
+ #include "util/mmap.hh"
38
+
39
+ namespace Moses
40
+ {
41
+ template <class T>
42
+ class MmapAllocator
43
+ {
44
+ protected:
45
+ std::FILE* m_file_ptr;
46
+ size_t m_file_desc;
47
+
48
+ size_t m_page_size;
49
+ size_t m_map_size;
50
+
51
+ char* m_data_ptr;
52
+ size_t m_data_offset;
53
+ bool m_fixed;
54
+ size_t* m_count;
55
+
56
+ public:
57
+ typedef T value_type;
58
+ typedef T* pointer;
59
+ typedef const T* const_pointer;
60
+ typedef T& reference;
61
+ typedef const T& const_reference;
62
+ typedef std::size_t size_type;
63
+ typedef std::ptrdiff_t difference_type;
64
+
65
+ MmapAllocator() throw()
66
+ : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)),
67
+ m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
68
+ m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
69
+ }
70
+
71
+ MmapAllocator(std::FILE* f_ptr) throw()
72
+ : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
73
+ m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
74
+ m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
75
+ }
76
+
77
+ MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
78
+ : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
79
+ m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
80
+ m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0)) {
81
+ }
82
+
83
+ MmapAllocator(std::string fileName) throw()
84
+ : m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)),
85
+ m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
86
+ m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
87
+ }
88
+
89
+ MmapAllocator(const MmapAllocator& c) throw()
90
+ : m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc),
91
+ m_page_size(c.m_page_size), m_map_size(c.m_map_size),
92
+ m_data_ptr(c.m_data_ptr), m_data_offset(c.m_data_offset),
93
+ m_fixed(c.m_fixed), m_count(c.m_count) {
94
+ (*m_count)++;
95
+ }
96
+
97
+ ~MmapAllocator() throw() {
98
+ if(m_data_ptr && *m_count == 0) {
99
+ util::UnmapOrThrow(m_data_ptr, m_map_size);
100
+ if(!m_fixed && std::ftell(m_file_ptr) != -1)
101
+ std::fclose(m_file_ptr);
102
+ }
103
+ (*m_count)--;
104
+ }
105
+
106
+ template <class U>
107
+ struct rebind {
108
+ typedef MmapAllocator<U> other;
109
+ };
110
+
111
+ pointer address (reference value) const {
112
+ return &value;
113
+ }
114
+
115
+ const_pointer address (const_reference value) const {
116
+ return &value;
117
+ }
118
+
119
+ size_type max_size () const throw() {
120
+ return std::numeric_limits<size_t>::max() / sizeof(value_type);
121
+ }
122
+
123
+ pointer allocate (size_type num, const void* = 0) {
124
+ m_map_size = num * sizeof(T);
125
+
126
+ #if defined(_WIN32) || defined(_WIN64)
127
+ // On Windows, MAP_SHARED is not defined and MapOrThrow ignores the flags.
128
+ const int map_shared = 0;
129
+ #else
130
+ const int map_shared = MAP_SHARED;
131
+ #endif
132
+ if(!m_fixed) {
133
+ size_t read = 0;
134
+ read += ftruncate(m_file_desc, m_map_size);
135
+ m_data_ptr = (char *)util::MapOrThrow(
136
+ m_map_size, true, map_shared, false, m_file_desc, 0);
137
+ return (pointer)m_data_ptr;
138
+ } else {
139
+ const size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
140
+ const size_t relative_offset = m_data_offset - map_offset;
141
+ const size_t adjusted_map_size = m_map_size + relative_offset;
142
+
143
+ m_data_ptr = (char *)util::MapOrThrow(
144
+ adjusted_map_size, false, map_shared, false, m_file_desc, map_offset);
145
+
146
+ return (pointer)(m_data_ptr + relative_offset);
147
+ }
148
+ }
149
+
150
+ void deallocate (pointer p, size_type num) {
151
+ if(!m_fixed) {
152
+ util::UnmapOrThrow(p, num * sizeof(T));
153
+ } else {
154
+ const size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
155
+ const size_t relative_offset = m_data_offset - map_offset;
156
+ const size_t adjusted_map_size = m_map_size + relative_offset;
157
+
158
+ util::UnmapOrThrow((pointer)((char*)p - relative_offset), adjusted_map_size);
159
+ }
160
+ }
161
+
162
+ void construct (pointer p, const T& value) {
163
+ if(!m_fixed)
164
+ new(p) value_type(value);
165
+ }
166
+ void destroy (pointer p) {
167
+ if(!m_fixed)
168
+ p->~T();
169
+ }
170
+
171
+ template <class T1, class T2>
172
+ friend bool operator== (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
173
+
174
+ template <class T1, class T2>
175
+ friend bool operator!= (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
176
+ };
177
+
178
+ template <class T1, class T2>
179
+ bool operator== (const MmapAllocator<T1>& a1,
180
+ const MmapAllocator<T2>& a2) throw()
181
+ {
182
+ bool equal = true;
183
+ equal &= a1.m_file_ptr == a2.m_file_ptr;
184
+ equal &= a1.m_file_desc == a2.m_file_desc;
185
+ equal &= a1.m_page_size == a2.m_page_size;
186
+ equal &= a1.m_map_size == a2.m_map_size;
187
+ equal &= a1.m_data_ptr == a2.m_data_ptr;
188
+ equal &= a1.m_data_offset == a2.m_data_offset;
189
+ equal &= a1.m_fixed == a2.m_fixed;
190
+ return equal;
191
+ }
192
+
193
+ template <class T1, class T2>
194
+ bool operator!=(const MmapAllocator<T1>& a1,
195
+ const MmapAllocator<T2>& a2) throw()
196
+ {
197
+ return !(a1 == a2);
198
+ }
199
+
200
+ }
201
+
202
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/MonotonicVector.h ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_MonotonicVector_h
23
+ #define moses_MonotonicVector_h
24
+
25
+ // MonotonicVector - Represents a monotonic increasing function that maps
26
+ // positive integers of any size onto a given number type. Each value has to be
27
+ // equal or larger than the previous one. Depending on the stepSize it can save
28
+ // up to 90% of memory compared to a std::vector<long>. Time complexity is roughly
29
+ // constant, in the worst case, however, stepSize times slower than a normal
30
+ // std::vector.
31
+
32
+ #include <vector>
33
+ #include <limits>
34
+ #include <algorithm>
35
+ #include <cstdio>
36
+ #include <cassert>
37
+
38
+ #include "ThrowingFwrite.h"
39
+ #include "ListCoders.h"
40
+ #include "MmapAllocator.h"
41
+
42
+ namespace Moses
43
+ {
44
+
45
+ template<typename PosT = size_t, typename NumT = size_t, PosT stepSize = 32,
46
+ template <typename> class Allocator = std::allocator>
47
+ class MonotonicVector
48
+ {
49
+ private:
50
+ typedef std::vector<NumT, Allocator<NumT> > Anchors;
51
+ typedef std::vector<unsigned int, Allocator<unsigned int> > Diffs;
52
+
53
+ Anchors m_anchors;
54
+ Diffs m_diffs;
55
+ std::vector<unsigned int> m_tempDiffs;
56
+
57
+ size_t m_size;
58
+ PosT m_last;
59
+ bool m_final;
60
+
61
+ public:
62
+ typedef PosT value_type;
63
+
64
+ MonotonicVector() : m_size(0), m_last(0), m_final(false) {}
65
+
66
+ size_t size() const {
67
+ return m_size + m_tempDiffs.size();
68
+ }
69
+
70
+ PosT at(size_t i) const {
71
+ PosT s = stepSize;
72
+ PosT j = m_anchors[i / s];
73
+ PosT r = i % s;
74
+
75
+ typename Diffs::const_iterator it = m_diffs.begin() + j;
76
+
77
+ PosT k = 0;
78
+ k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1);
79
+ if(i < m_size)
80
+ k += Simple9::DecodeAndSum(it, m_diffs.end(), r);
81
+ else if(i < m_size + m_tempDiffs.size())
82
+ for(size_t l = 0; l < r; l++)
83
+ k += m_tempDiffs[l];
84
+
85
+ return k;
86
+ }
87
+
88
+ PosT operator[](PosT i) const {
89
+ return at(i);
90
+ }
91
+
92
+ PosT back() const {
93
+ return at(size()-1);
94
+ }
95
+
96
+ void push_back(PosT i) {
97
+ assert(m_final != true);
98
+
99
+ if(m_anchors.size() == 0 && m_tempDiffs.size() == 0) {
100
+ m_anchors.push_back(0);
101
+ VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
102
+ m_last = i;
103
+ m_size++;
104
+
105
+ return;
106
+ }
107
+
108
+ if(m_tempDiffs.size() == stepSize-1) {
109
+ Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
110
+ std::back_inserter(m_diffs));
111
+ m_anchors.push_back(m_diffs.size());
112
+ VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
113
+
114
+ m_size += m_tempDiffs.size() + 1;
115
+ m_tempDiffs.clear();
116
+ } else {
117
+ PosT last = m_last;
118
+ PosT diff = i - last;
119
+ m_tempDiffs.push_back(diff);
120
+ }
121
+ m_last = i;
122
+ }
123
+
124
+ void commit() {
125
+ assert(m_final != true);
126
+ Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
127
+ std::back_inserter(m_diffs));
128
+ m_size += m_tempDiffs.size();
129
+ m_tempDiffs.clear();
130
+ m_final = true;
131
+ }
132
+
133
+ size_t usage() {
134
+ return m_diffs.size() * sizeof(unsigned int)
135
+ + m_anchors.size() * sizeof(NumT);
136
+ }
137
+
138
+ size_t load(std::FILE* in, bool map = false) {
139
+ size_t byteSize = 0;
140
+
141
+ byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool);
142
+ byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t);
143
+ byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT);
144
+
145
+ byteSize += loadVector(m_diffs, in, map);
146
+ byteSize += loadVector(m_anchors, in, map);
147
+
148
+ return byteSize;
149
+ }
150
+
151
+ template <typename ValueT>
152
+ size_t loadVector(std::vector<ValueT, std::allocator<ValueT> >& v,
153
+ std::FILE* in, bool map = false) {
154
+ // Can only be read into memory. Mapping not possible with std:allocator.
155
+ assert(map == false);
156
+
157
+ size_t byteSize = 0;
158
+
159
+ size_t valSize;
160
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
161
+
162
+ v.resize(valSize, 0);
163
+ byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
164
+
165
+ return byteSize;
166
+ }
167
+
168
+ template <typename ValueT>
169
+ size_t loadVector(std::vector<ValueT, MmapAllocator<ValueT> >& v,
170
+ std::FILE* in, bool map = false) {
171
+ size_t byteSize = 0;
172
+
173
+ size_t valSize;
174
+ byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
175
+
176
+ if(map == false) {
177
+ // Read data into temporary file (default constructor of MmapAllocator)
178
+ // and map memory onto temporary file. Can be resized.
179
+
180
+ v.resize(valSize, 0);
181
+ byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
182
+ } else {
183
+ // Map it directly on specified region of file "in" starting at valPos
184
+ // with length valSize * sizeof(ValueT). Mapped region cannot be resized.
185
+
186
+ size_t valPos = std::ftell(in);
187
+
188
+ Allocator<ValueT> alloc(in, valPos);
189
+ std::vector<ValueT, Allocator<ValueT> > vTemp(alloc);
190
+ vTemp.resize(valSize);
191
+ v.swap(vTemp);
192
+
193
+ std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR);
194
+ byteSize += valSize * sizeof(ValueT);
195
+ }
196
+
197
+ return byteSize;
198
+ }
199
+
200
+ size_t save(std::FILE* out) {
201
+ if(!m_final)
202
+ commit();
203
+
204
+ bool byteSize = 0;
205
+ byteSize += ThrowingFwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool);
206
+ byteSize += ThrowingFwrite(&m_size, sizeof(size_t), 1, out) * sizeof(size_t);
207
+ byteSize += ThrowingFwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT);
208
+
209
+ size_t size = m_diffs.size();
210
+ byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
211
+ byteSize += ThrowingFwrite(&m_diffs[0], sizeof(unsigned int), size, out) * sizeof(unsigned int);
212
+
213
+ size = m_anchors.size();
214
+ byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
215
+ byteSize += ThrowingFwrite(&m_anchors[0], sizeof(NumT), size, out) * sizeof(NumT);
216
+
217
+ return byteSize;
218
+ }
219
+
220
+ void swap(MonotonicVector<PosT, NumT, stepSize, Allocator> &mv) {
221
+ if(!m_final)
222
+ commit();
223
+
224
+ m_diffs.swap(mv.m_diffs);
225
+ m_anchors.swap(mv.m_anchors);
226
+ }
227
+ };
228
+
229
+ }
230
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/PhraseDecoder.h ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_PhraseDecoder_h
23
+ #define moses_PhraseDecoder_h
24
+
25
+ #include <sstream>
26
+ #include <vector>
27
+ #include <boost/unordered_map.hpp>
28
+ #include <boost/unordered_set.hpp>
29
+ #include <string>
30
+ #include <iterator>
31
+ #include <algorithm>
32
+ #include <sys/stat.h>
33
+
34
+ #include "moses/TypeDef.h"
35
+ #include "moses/FactorCollection.h"
36
+ #include "moses/Word.h"
37
+ #include "moses/Util.h"
38
+ #include "moses/InputFileStream.h"
39
+ #include "moses/StaticData.h"
40
+ #include "moses/Range.h"
41
+
42
+ #include "PhraseDictionaryCompact.h"
43
+ #include "StringVector.h"
44
+ #include "CanonicalHuffman.h"
45
+ #include "TargetPhraseCollectionCache.h"
46
+
47
+ namespace Moses
48
+ {
49
+
50
+ class PhraseDictionaryCompact;
51
+
52
+ class PhraseDecoder
53
+ {
54
+ protected:
55
+
56
+ friend class PhraseDictionaryCompact;
57
+
58
+ typedef std::pair<unsigned char, unsigned char> AlignPoint;
59
+ typedef std::pair<unsigned, unsigned> SrcTrg;
60
+
61
+ enum Coding { None, REnc, PREnc } m_coding;
62
+
63
+ size_t m_numScoreComponent;
64
+ bool m_containsAlignmentInfo;
65
+ size_t m_maxRank;
66
+ size_t m_maxPhraseLength;
67
+
68
+ boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
69
+ StringVector<unsigned char, unsigned, std::allocator> m_sourceSymbols;
70
+ StringVector<unsigned char, unsigned, std::allocator> m_targetSymbols;
71
+
72
+ std::vector<size_t> m_lexicalTableIndex;
73
+ std::vector<SrcTrg> m_lexicalTable;
74
+
75
+ CanonicalHuffman<unsigned>* m_symbolTree;
76
+
77
+ bool m_multipleScoreTrees;
78
+ std::vector<CanonicalHuffman<float>*> m_scoreTrees;
79
+
80
+ CanonicalHuffman<AlignPoint>* m_alignTree;
81
+
82
+ TargetPhraseCollectionCache m_decodingCache;
83
+
84
+ PhraseDictionaryCompact& m_phraseDictionary;
85
+
86
+ // ***********************************************
87
+
88
+ const std::vector<FactorType>* m_input;
89
+ const std::vector<FactorType>* m_output;
90
+
91
+ std::string m_separator;
92
+
93
+ // ***********************************************
94
+
95
+ unsigned GetSourceSymbolId(std::string& s);
96
+ std::string GetTargetSymbol(unsigned id) const;
97
+
98
+ size_t GetREncType(unsigned encodedSymbol);
99
+ size_t GetPREncType(unsigned encodedSymbol);
100
+
101
+ unsigned GetTranslation(unsigned srcIdx, size_t rank);
102
+
103
+ size_t GetMaxSourcePhraseLength();
104
+
105
+ unsigned DecodeREncSymbol1(unsigned encodedSymbol);
106
+ unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol);
107
+ unsigned DecodeREncSymbol2Position(unsigned encodedSymbol);
108
+ unsigned DecodeREncSymbol3(unsigned encodedSymbol);
109
+
110
+ unsigned DecodePREncSymbol1(unsigned encodedSymbol);
111
+ int DecodePREncSymbol2Left(unsigned encodedSymbol);
112
+ int DecodePREncSymbol2Right(unsigned encodedSymbol);
113
+ unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol);
114
+
115
+ std::string MakeSourceKey(std::string &);
116
+
117
+ public:
118
+
119
+ PhraseDecoder(
120
+ PhraseDictionaryCompact &phraseDictionary,
121
+ const std::vector<FactorType>* input,
122
+ const std::vector<FactorType>* output,
123
+ size_t numScoreComponent
124
+ );
125
+
126
+ ~PhraseDecoder();
127
+
128
+ size_t Load(std::FILE* in);
129
+
130
+ TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase,
131
+ bool topLevel = false, bool eval = true);
132
+
133
+ TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv,
134
+ BitWrapper<> &encodedBitStream,
135
+ const Phrase &sourcePhrase,
136
+ bool topLevel,
137
+ bool eval);
138
+
139
+ void PruneCache();
140
+ };
141
+
142
+ }
143
+
144
+ #endif
mosesdecoder/moses/TranslationModel/CompactPT/PhraseTableCreator.h ADDED
@@ -0,0 +1,412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // $Id$
2
+ // vim:tabstop=2
3
+ /***********************************************************************
4
+ Moses - factored phrase-based language decoder
5
+ Copyright (C) 2006 University of Edinburgh
6
+
7
+ This library is free software; you can redistribute it and/or
8
+ modify it under the terms of the GNU Lesser General Public
9
+ License as published by the Free Software Foundation; either
10
+ version 2.1 of the License, or (at your option) any later version.
11
+
12
+ This library is distributed in the hope that it will be useful,
13
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ Lesser General Public License for more details.
16
+
17
+ You should have received a copy of the GNU Lesser General Public
18
+ License along with this library; if not, write to the Free Software
19
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
+ ***********************************************************************/
21
+
22
+ #ifndef moses_PhraseTableCreator_h
23
+ #define moses_PhraseTableCreator_h
24
+
25
+ #include <sstream>
26
+ #include <iostream>
27
+ #include <queue>
28
+ #include <vector>
29
+ #include <set>
30
+ #include <boost/unordered_map.hpp>
31
+
32
+ #include "moses/InputFileStream.h"
33
+ #include "moses/ThreadPool.h"
34
+ #include "moses/Util.h"
35
+
36
+ #include "BlockHashIndex.h"
37
+ #include "StringVector.h"
38
+ #include "StringVectorTemp.h"
39
+ #include "CanonicalHuffman.h"
40
+
41
+ namespace Moses
42
+ {
43
+
44
+ typedef std::pair<unsigned char, unsigned char> AlignPoint;
45
+
46
+ template <typename DataType>
47
+ class Counter
48
+ {
49
+ public:
50
+ typedef boost::unordered_map<DataType, size_t> FreqMap;
51
+ typedef typename FreqMap::iterator iterator;
52
+ typedef typename FreqMap::mapped_type mapped_type;
53
+ typedef typename FreqMap::value_type value_type;
54
+
55
+ private:
56
+ #ifdef WITH_THREADS
57
+ boost::mutex m_mutex;
58
+ #endif
59
+ FreqMap m_freqMap;
60
+ size_t m_maxSize;
61
+ std::vector<DataType> m_bestVec;
62
+
63
+ struct FreqSorter {
64
+ bool operator()(const value_type& a, const value_type& b) const {
65
+ if(a.second > b.second)
66
+ return true;
67
+ // Check impact on translation quality!
68
+ if(a.second == b.second && a.first > b.first)
69
+ return true;
70
+ return false;
71
+ }
72
+ };
73
+
74
+ public:
75
+ Counter() : m_maxSize(0) {}
76
+
77
+ iterator Begin() {
78
+ return m_freqMap.begin();
79
+ }
80
+
81
+ iterator End() {
82
+ return m_freqMap.end();
83
+ }
84
+
85
+ void Increase(DataType data) {
86
+ #ifdef WITH_THREADS
87
+ boost::mutex::scoped_lock lock(m_mutex);
88
+ #endif
89
+ m_freqMap[data]++;
90
+ }
91
+
92
+ void IncreaseBy(DataType data, size_t num) {
93
+ #ifdef WITH_THREADS
94
+ boost::mutex::scoped_lock lock(m_mutex);
95
+ #endif
96
+ m_freqMap[data] += num;
97
+ }
98
+
99
+ mapped_type& operator[](DataType data) {
100
+ return m_freqMap[data];
101
+ }
102
+
103
+ size_t Size() {
104
+ #ifdef WITH_THREADS
105
+ boost::mutex::scoped_lock lock(m_mutex);
106
+ #endif
107
+ return m_freqMap.size();
108
+ }
109
+
110
+ void Quantize(size_t maxSize) {
111
+ #ifdef WITH_THREADS
112
+ boost::mutex::scoped_lock lock(m_mutex);
113
+ #endif
114
+ m_maxSize = maxSize;
115
+ std::vector<std::pair<DataType, mapped_type> > freqVec;
116
+ freqVec.insert(freqVec.begin(), m_freqMap.begin(), m_freqMap.end());
117
+ std::sort(freqVec.begin(), freqVec.end(), FreqSorter());
118
+
119
+ for(size_t i = 0; i < freqVec.size() && i < m_maxSize; i++)
120
+ m_bestVec.push_back(freqVec[i].first);
121
+
122
+ std::sort(m_bestVec.begin(), m_bestVec.end());
123
+
124
+ FreqMap t_freqMap;
125
+ for(typename std::vector<std::pair<DataType, mapped_type> >::iterator it
126
+ = freqVec.begin(); it != freqVec.end(); it++) {
127
+ DataType closest = LowerBound(it->first);
128
+ t_freqMap[closest] += it->second;
129
+ }
130
+
131
+ m_freqMap.swap(t_freqMap);
132
+ }
133
+
134
+ void Clear() {
135
+ #ifdef WITH_THREADS
136
+ boost::mutex::scoped_lock lock(m_mutex);
137
+ #endif
138
+ m_freqMap.clear();
139
+ }
140
+
141
+ DataType LowerBound(DataType data) {
142
+ if(m_maxSize == 0 || m_bestVec.size() == 0)
143
+ return data;
144
+ else {
145
+ typename std::vector<DataType>::iterator it
146
+ = std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
147
+ if(it != m_bestVec.end())
148
+ return *it;
149
+ else
150
+ return m_bestVec.back();
151
+ }
152
+ }
153
+ };
154
+
155
/**
 * Value record bundling a line number, a source phrase, a packed target
 * phrase, a rank and a score. Member functions are defined elsewhere.
 */
class PackedItem
{
private:
  long m_line;
  std::string m_sourcePhrase;
  std::string m_packedTargetPhrase;
  size_t m_rank;
  float m_score;

public:
  PackedItem(long line, std::string sourcePhrase,
             std::string packedTargetPhrase, size_t rank,
             float score = 0);

  // Read-only accessors.
  long GetLine() const;
  const std::string& GetSrc() const;
  const std::string& GetTrg() const;
  size_t GetRank() const;
  float GetScore() const;
};
175
+
176
+ bool operator<(const PackedItem &pi1, const PackedItem &pi2);
177
+
178
+ class PhraseTableCreator
179
+ {
180
+ public:
181
+ enum Coding { None, REnc, PREnc };
182
+
183
+ private:
184
+ std::string m_inPath;
185
+ std::string m_outPath;
186
+ std::string m_tempfilePath;
187
+
188
+ std::FILE* m_outFile;
189
+
190
+ size_t m_numScoreComponent;
191
+ size_t m_sortScoreIndex;
192
+ size_t m_warnMe;
193
+
194
+ Coding m_coding;
195
+ size_t m_orderBits;
196
+ size_t m_fingerPrintBits;
197
+ bool m_useAlignmentInfo;
198
+ bool m_multipleScoreTrees;
199
+ size_t m_quantize;
200
+ size_t m_maxRank;
201
+
202
+ static std::string m_phraseStopSymbol;
203
+ static std::string m_separator;
204
+
205
+ #ifdef WITH_THREADS
206
+ size_t m_threads;
207
+ boost::mutex m_mutex;
208
+ #endif
209
+
210
+ BlockHashIndex m_srcHash;
211
+ BlockHashIndex m_rnkHash;
212
+
213
+ size_t m_maxPhraseLength;
214
+
215
+ std::vector<unsigned> m_ranks;
216
+
217
+ typedef std::pair<unsigned, unsigned> SrcTrg;
218
+ typedef std::pair<std::string, std::string> SrcTrgString;
219
+ typedef std::pair<SrcTrgString, float> SrcTrgProb;
220
+
221
+ struct SrcTrgProbSorter {
222
+ bool operator()(const SrcTrgProb& a, const SrcTrgProb& b) const {
223
+ if(a.first.first < b.first.first)
224
+ return true;
225
+
226
+ if(a.first.first == b.first.first && a.second > b.second)
227
+ return true;
228
+
229
+ if(a.first.first == b.first.first
230
+ && a.second == b.second
231
+ && a.first.second < b.first.second)
232
+ return true;
233
+
234
+ return false;
235
+ }
236
+ };
237
+
238
+ std::vector<size_t> m_lexicalTableIndex;
239
+ std::vector<SrcTrg> m_lexicalTable;
240
+
241
+ StringVectorTemp<unsigned char, unsigned long, MmapAllocator>*
242
+ m_encodedTargetPhrases;
243
+
244
+ StringVector<unsigned char, unsigned long, MmapAllocator>*
245
+ m_compressedTargetPhrases;
246
+
247
+ boost::unordered_map<std::string, unsigned> m_targetSymbolsMap;
248
+ boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
249
+
250
+ typedef Counter<unsigned> SymbolCounter;
251
+ typedef Counter<float> ScoreCounter;
252
+ typedef Counter<AlignPoint> AlignCounter;
253
+
254
+ typedef CanonicalHuffman<unsigned> SymbolTree;
255
+ typedef CanonicalHuffman<float> ScoreTree;
256
+ typedef CanonicalHuffman<AlignPoint> AlignTree;
257
+
258
+ SymbolCounter m_symbolCounter;
259
+ SymbolTree* m_symbolTree;
260
+
261
+ AlignCounter m_alignCounter;
262
+ AlignTree* m_alignTree;
263
+
264
+ std::vector<ScoreCounter*> m_scoreCounters;
265
+ std::vector<ScoreTree*> m_scoreTrees;
266
+
267
+ std::priority_queue<PackedItem> m_queue;
268
+ long m_lastFlushedLine;
269
+ long m_lastFlushedSourceNum;
270
+ std::string m_lastFlushedSourcePhrase;
271
+ std::vector<std::string> m_lastSourceRange;
272
+ std::priority_queue<std::pair<float, size_t> > m_rankQueue;
273
+ std::vector<std::string> m_lastCollection;
274
+
275
+ void Save();
276
+ void PrintInfo();
277
+
278
+ void AddSourceSymbolId(std::string& symbol);
279
+ unsigned GetSourceSymbolId(std::string& symbol);
280
+
281
+ void AddTargetSymbolId(std::string& symbol);
282
+ unsigned GetTargetSymbolId(std::string& symbol);
283
+ unsigned GetOrAddTargetSymbolId(std::string& symbol);
284
+
285
+ unsigned GetRank(unsigned srcIdx, unsigned trgIdx);
286
+
287
+ unsigned EncodeREncSymbol1(unsigned symbol);
288
+ unsigned EncodeREncSymbol2(unsigned position, unsigned rank);
289
+ unsigned EncodeREncSymbol3(unsigned rank);
290
+
291
+ unsigned EncodePREncSymbol1(unsigned symbol);
292
+ unsigned EncodePREncSymbol2(int lOff, int rOff, unsigned rank);
293
+
294
+ void EncodeTargetPhraseNone(std::vector<std::string>& t,
295
+ std::ostream& os);
296
+
297
+ void EncodeTargetPhraseREnc(std::vector<std::string>& s,
298
+ std::vector<std::string>& t,
299
+ std::set<AlignPoint>& a,
300
+ std::ostream& os);
301
+
302
+ void EncodeTargetPhrasePREnc(std::vector<std::string>& s,
303
+ std::vector<std::string>& t,
304
+ std::set<AlignPoint>& a, size_t ownRank,
305
+ std::ostream& os);
306
+
307
+ void EncodeScores(std::vector<float>& scores, std::ostream& os);
308
+ void EncodeAlignment(std::set<AlignPoint>& alignment, std::ostream& os);
309
+
310
+ std::string MakeSourceKey(std::string&);
311
+ std::string MakeSourceTargetKey(std::string&, std::string&);
312
+
313
+ void LoadLexicalTable(std::string filePath);
314
+
315
+ void CreateRankHash();
316
+ void EncodeTargetPhrases();
317
+ void CalcHuffmanCodes();
318
+ void CompressTargetPhrases();
319
+
320
+ void AddRankedLine(PackedItem& pi);
321
+ void FlushRankedQueue(bool force = false);
322
+
323
+ std::string EncodeLine(std::vector<std::string>& tokens, size_t ownRank);
324
+ void AddEncodedLine(PackedItem& pi);
325
+ void FlushEncodedQueue(bool force = false);
326
+
327
+ std::string CompressEncodedCollection(std::string encodedCollection);
328
+ void AddCompressedCollection(PackedItem& pi);
329
+ void FlushCompressedQueue(bool force = false);
330
+
331
+ public:
332
+
333
+ PhraseTableCreator(std::string inPath,
334
+ std::string outPath,
335
+ std::string tempfilePath,
336
+ size_t numScoreComponent = 5,
337
+ size_t sortScoreIndex = 2,
338
+ Coding coding = PREnc,
339
+ size_t orderBits = 10,
340
+ size_t fingerPrintBits = 16,
341
+ bool useAlignmentInfo = false,
342
+ bool multipleScoreTrees = true,
343
+ size_t quantize = 0,
344
+ size_t maxRank = 100,
345
+ bool warnMe = true
346
+ #ifdef WITH_THREADS
347
+ , size_t threads = 2
348
+ #endif
349
+ );
350
+
351
+ ~PhraseTableCreator();
352
+
353
+ friend class RankingTask;
354
+ friend class EncodingTask;
355
+ friend class CompressionTask;
356
+ };
357
+
358
+ class RankingTask
359
+ {
360
+ private:
361
+ #ifdef WITH_THREADS
362
+ static boost::mutex m_mutex;
363
+ static boost::mutex m_fileMutex;
364
+ #endif
365
+ static size_t m_lineNum;
366
+ InputFileStream& m_inFile;
367
+ PhraseTableCreator& m_creator;
368
+
369
+ public:
370
+ RankingTask(InputFileStream& inFile, PhraseTableCreator& creator);
371
+ void operator()();
372
+ };
373
+
374
+ class EncodingTask
375
+ {
376
+ private:
377
+ #ifdef WITH_THREADS
378
+ static boost::mutex m_mutex;
379
+ static boost::mutex m_fileMutex;
380
+ #endif
381
+ static size_t m_lineNum;
382
+ static size_t m_sourcePhraseNum;
383
+ static std::string m_lastSourcePhrase;
384
+
385
+ InputFileStream& m_inFile;
386
+ PhraseTableCreator& m_creator;
387
+
388
+ public:
389
+ EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator);
390
+ void operator()();
391
+ };
392
+
393
+ class CompressionTask
394
+ {
395
+ private:
396
+ #ifdef WITH_THREADS
397
+ static boost::mutex m_mutex;
398
+ #endif
399
+ static size_t m_collectionNum;
400
+ StringVectorTemp<unsigned char, unsigned long, MmapAllocator>&
401
+ m_encodedCollections;
402
+ PhraseTableCreator& m_creator;
403
+
404
+ public:
405
+ CompressionTask(StringVectorTemp<unsigned char, unsigned long, MmapAllocator>&
406
+ encodedCollections, PhraseTableCreator& creator);
407
+ void operator()();
408
+ };
409
+
410
+ }
411
+
412
+ #endif
mosesdecoder/moses/TranslationModel/Scope3Parser/Parser.cpp ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /***********************************************************************
2
+ Moses - statistical machine translation system
3
+ Copyright (C) 2006-2012 University of Edinburgh
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ ***********************************************************************/
19
+
20
+ #include "Parser.h"
21
+
22
+ #include "moses/ChartParser.h"
23
+ #include "moses/ChartTranslationOptionList.h"
24
+ #include "moses/InputType.h"
25
+ #include "moses/NonTerminal.h"
26
+ #include "moses/TranslationModel/RuleTable/UTrieNode.h"
27
+ #include "moses/TranslationModel/RuleTable/UTrie.h"
28
+ #include "moses/StaticData.h"
29
+ #include "ApplicableRuleTrie.h"
30
+ #include "StackLattice.h"
31
+ #include "StackLatticeBuilder.h"
32
+ #include "StackLatticeSearcher.h"
33
+ #include "VarSpanTrieBuilder.h"
34
+
35
+ #include <memory>
36
+ #include <vector>
37
+
38
+ namespace Moses
39
+ {
40
+
41
+ void Scope3Parser::GetChartRuleCollection(
42
+ const InputPath &inputPath,
43
+ size_t last,
44
+ ChartParserCallback &outColl)
45
+ {
46
+ const Range &range = inputPath.GetWordsRange();
47
+ const size_t start = range.GetStartPos();
48
+ const size_t end = range.GetEndPos();
49
+
50
+ std::vector<std::pair<const UTrieNode *, const VarSpanNode *> > &pairVec
51
+ = m_ruleApplications[start][end-start+1];
52
+
53
+ MatchCallback matchCB(range, outColl);
54
+ for (std::vector<std::pair<const UTrieNode *, const VarSpanNode *> >::const_iterator p = pairVec.begin(); p != pairVec.end(); ++p) {
55
+ const UTrieNode &ruleNode = *(p->first);
56
+ const VarSpanNode &varSpanNode = *(p->second);
57
+
58
+ const UTrieNode::LabelMap &labelMap = ruleNode.GetLabelMap();
59
+
60
+ if (varSpanNode.m_rank == 0) { // Purely lexical rule.
61
+ assert(labelMap.size() == 1);
62
+ TargetPhraseCollection::shared_ptr tpc = labelMap.begin()->second;
63
+ matchCB.m_tpc = tpc;
64
+ matchCB(m_emptyStackVec);
65
+ } else { // Rule has at least one non-terminal.
66
+ varSpanNode.CalculateRanges(start, end, m_ranges);
67
+ m_latticeBuilder.Build(start, end, ruleNode, varSpanNode, m_ranges,
68
+ *this, m_lattice,
69
+ m_quickCheckTable);
70
+ StackLatticeSearcher<MatchCallback> searcher(m_lattice, m_ranges);
71
+ UTrieNode::LabelMap::const_iterator p = labelMap.begin();
72
+ for (; p != labelMap.end(); ++p) {
73
+ const std::vector<int> &labels = p->first;
74
+ TargetPhraseCollection::shared_ptr tpc = p->second;
75
+ assert(labels.size() == varSpanNode.m_rank);
76
+ bool failCheck = false;
77
+ for (size_t i = 0; i < varSpanNode.m_rank; ++i) {
78
+ if (!m_quickCheckTable[i][labels[i]]) {
79
+ failCheck = true;
80
+ break;
81
+ }
82
+ }
83
+ if (failCheck) {
84
+ continue;
85
+ }
86
+ matchCB.m_tpc = tpc;
87
+ searcher.Search(labels, matchCB);
88
+ }
89
+ }
90
+ }
91
+ }
92
+
93
+ void Scope3Parser::Init()
94
+ {
95
+ InitRuleApplicationVector();
96
+
97
+ // Build a map from Words to index-sets.
98
+ SentenceMap sentMap;
99
+ FillSentenceMap(sentMap);
100
+
101
+ // Build a trie containing 'elastic' application contexts
102
+ const UTrieNode &rootNode = m_ruleTable.GetRootNode();
103
+ std::auto_ptr<ApplicableRuleTrie> art(new ApplicableRuleTrie(-1, -1, rootNode));
104
+ art->Extend(rootNode, -1, sentMap, false);
105
+
106
+ // Build a trie containing just the non-terminal contexts and insert pointers
107
+ // to its nodes back into the ART trie. Contiguous non-terminal contexts are
108
+ // merged and the number of split points is recorded.
109
+ VarSpanTrieBuilder vstBuilder;
110
+ m_varSpanTrie = vstBuilder.Build(*art);
111
+
112
+ // Fill each cell with a list of pointers to relevant ART nodes.
113
+ AddRulesToCells(*art, std::make_pair<int, int>(-1, -1), GetParser().GetSize()-1, 0);
114
+ }
115
+
116
+ void Scope3Parser::InitRuleApplicationVector()
117
+ {
118
+ const size_t sourceSize = GetParser().GetSize();
119
+ m_ruleApplications.resize(sourceSize);
120
+ for (size_t start = 0; start < sourceSize; ++start) {
121
+ size_t maxSpan = sourceSize-start+1;
122
+ m_ruleApplications[start].resize(maxSpan+1);
123
+ }
124
+ }
125
+
126
+ void Scope3Parser::FillSentenceMap(SentenceMap &sentMap)
127
+ {
128
+ for (size_t i = 0; i < GetParser().GetSize(); ++i) {
129
+ const Word &word = GetParser().GetInputPath(i, i).GetLastWord();
130
+ sentMap[word].push_back(i);
131
+ }
132
+ }
133
+
134
+ void Scope3Parser::AddRulesToCells(
135
+ const ApplicableRuleTrie &node,
136
+ std::pair<int, int> start,
137
+ int maxPos,
138
+ int depth)
139
+ {
140
+ if (depth > 0) {
141
+ // Determine the start range for this path if not already known.
142
+ if (start.first == -1 && start.second == -1) {
143
+ assert(depth == 1);
144
+ start.first = std::max(0, node.m_start);
145
+ start.second = node.m_start;
146
+ } else if (start.second < 0) {
147
+ assert(depth > 1);
148
+ if (node.m_start == -1) {
149
+ --start.second; // Record split point
150
+ } else {
151
+ int numSplitPoints = -1 - start.second;
152
+ start.second = node.m_start - (numSplitPoints+1);
153
+ }
154
+ }
155
+ }
156
+
157
+ if (node.m_node->HasRules()) {
158
+ assert(depth > 0);
159
+ assert(node.m_vstNode);
160
+ // Determine the end range for this path.
161
+ std::pair<int, int> end;
162
+ if (node.m_end == -1) {
163
+ end.first = (*(node.m_vstNode->m_label))[2];
164
+ end.second = (*(node.m_vstNode->m_label))[3];
165
+ assert(end.first != -1);
166
+ if (end.second == -1) {
167
+ end.second = maxPos;
168
+ }
169
+ } else {
170
+ assert(node.m_start == node.m_end); // Should be a terminal
171
+ end.first = end.second = node.m_start;
172
+ }
173
+ // Add a (rule trie node, VST node) pair for each cell in the range.
174
+ int s2 = start.second;
175
+ if (s2 < 0) {
176
+ int numSplitPoints = -1 - s2;
177
+ s2 = maxPos - numSplitPoints;
178
+ }
179
+ for (int i = start.first; i <= s2; ++i) {
180
+ int e1 = std::max(i+depth-1, end.first);
181
+ for (int j = e1; j <= end.second; ++j) {
182
+ size_t span = j-i+1;
183
+ assert(span >= 1);
184
+ if (m_maxChartSpan && span > m_maxChartSpan) {
185
+ break;
186
+ }
187
+ m_ruleApplications[i][span].push_back(std::make_pair(node.m_node,
188
+ node.m_vstNode));
189
+ }
190
+ }
191
+ }
192
+
193
+ for (std::vector<ApplicableRuleTrie*>::const_iterator p = node.m_children.begin(); p != node.m_children.end(); ++p) {
194
+ AddRulesToCells(**p, start, maxPos, depth+1);
195
+ }
196
+ }
197
+
198
+ } // namespace Moses
mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <string>
2
+ #include <cassert>
3
+ #include <iomanip>
4
+ #include <algorithm>
5
+ #include "ug_stringdist.h"
6
+ // string distance measures
7
+ // Code by Ulrich Germann
8
+
9
+ namespace stringdist
10
+ {
11
+
12
+ UErrorCode strip_accents(UnicodeString & trg)
13
+ {
14
+ UErrorCode status = U_ZERO_ERROR;
15
+ static Transliterator *stripper
16
+ = Transliterator::createInstance("NFD; [:M:] Remove; NFC",
17
+ UTRANS_FORWARD, status);
18
+ stripper->transliterate(trg);
19
+ return status;
20
+ }
21
+
22
+ char const*
23
+ StringDiff::
24
+ Segment::
25
+ elabel[] = { "same", "cap", "flip", "permutation",
26
+ "accent", "duplication",
27
+ "insertion", "deletion",
28
+ "mismatch", "noinit" };
29
+
30
+ StringDiff::
31
+ StringDiff()
32
+ {}
33
+
34
+ StringDiff::
35
+ StringDiff(string const& a, string const& b)
36
+ {
37
+ set_a(a);
38
+ set_b(b);
39
+ align();
40
+ }
41
+
42
+ StringDiff::
43
+ Segment::
44
+ Segment()
45
+ : start_a(-1), end_a(-1), start_b(-1), end_b(-1), match(noinit), dist(0)
46
+ {}
47
+
48
+ UnicodeString const&
49
+ StringDiff::
50
+ set_a(string const& a)
51
+ {
52
+ this->a = a.c_str();
53
+ return this->a;
54
+ }
55
+
56
+ UnicodeString const&
57
+ StringDiff::
58
+ set_b(string const& b)
59
+ {
60
+ this->b = b.c_str();
61
+ return this->b;
62
+ }
63
+
64
+ UnicodeString const&
65
+ StringDiff::
66
+ get_a() const
67
+ {
68
+ return this->a;
69
+ }
70
+
71
+ UnicodeString const&
72
+ StringDiff::
73
+ get_b() const
74
+ {
75
+ return this->b;
76
+ }
77
+
78
+ size_t
79
+ StringDiff::
80
+ size()
81
+ {
82
+ return this->difflist.size();
83
+ }
84
+
85
+ // float
86
+ // StringDiff::
87
+ // levenshtein(bool force)
88
+ // {
89
+ // align(force);
90
+ // float ret = 0;
91
+ // for (size_t i = 0; i < difflist.size(); ++i)
92
+ // {
93
+ // Segment const& s = difflist[i];
94
+ // if (s.match == same) continue;
95
+ // else if (s.match == insertion) ret += s.end_b - s.start_b;
96
+ // else if (s.match == deletion) ret += s.end_a - s.start_a;
97
+
98
+ // }
99
+ // }
100
+
101
+ void
102
+ StringDiff::
103
+ fillAlignmentMatrix(vector<vector<float> > & M) const
104
+ {
105
+ assert(a.length() && b.length());
106
+ M.assign(a.length(),vector<float>(b.length(),0));
107
+ int i = 0,j;
108
+ while (i < b.length() && b[i] != a[0]) ++i;
109
+ while (i < b.length()) M[0][i++] = 1;
110
+ i = 0;
111
+ while (i < a.length() && a[i] != b[0]) ++i;
112
+ while (i < a.length()) M[i++][0] = 1;
113
+ for (i = 1; i < a.length(); ++i)
114
+ {
115
+ for (j = 1; j < b.length(); ++j)
116
+ {
117
+ float & s = M[i][j];
118
+ s = max(M[i-1][j],M[i][j-1]);
119
+ if (a[i] == b[j])
120
+ s = max(s,M[i-1][j-1] + 1 + (a[i-1] == b[j-1] ? .1f : 0));
121
+ }
122
+ }
123
+ #if 0
124
+ string abuf,bbuf;
125
+ a.toUTF8String(abuf);
126
+ b.toUTF8String(bbuf);
127
+ cout << " " << bbuf[0];
128
+ for (int x = 1; x < b.length(); ++x)
129
+ cout << " " << bbuf[x];
130
+ cout << endl;
131
+ for (int x = 0; x < a.length(); ++x)
132
+ {
133
+ cout << abuf[x] << " ";
134
+ for (int y = 0; y < b.length(); ++y)
135
+ cout << int(M[x][y]) << " ";
136
+ cout << endl;
137
+ }
138
+ #endif
139
+ }
140
+
141
+ float
142
+ fillAlignmentMatrix(UChar const* a, size_t const lenA,
143
+ UChar const* b, size_t const lenB,
144
+ vector<vector<float> > & M)
145
+ {
146
+ M.assign(lenA,vector<float>(lenB,0));
147
+ assert(lenA); assert(lenB);
148
+ size_t i = 0;
149
+ while (i < lenB && b[i] != a[0]) ++i;
150
+ while (i < lenB) M[0][i++] = 1;
151
+ i = 0;
152
+ while (i < lenA && a[i] != b[0]) ++i;
153
+ while (i < lenA) M[i++][0] = 1;
154
+ for (i = 1; i < lenA; ++i)
155
+ {
156
+ for (size_t j = 1; j < lenB; ++j)
157
+ {
158
+ float & s = M[i][j];
159
+ s = max(M[i-1][j], M[i][j-1]);
160
+ if (a[i] == b[j])
161
+ s = max(s, M[i-1][j-1] + 1);
162
+ }
163
+ }
164
+ return M.back().back();
165
+ }
166
+
167
+ float
168
+ levenshtein(UChar const* a, size_t const lenA,
169
+ UChar const* b, size_t const lenB)
170
+ {
171
+ vector<vector<float> > M;
172
+ fillAlignmentMatrix(a,lenA,b,lenB,M);
173
+ size_t ret = 0;
174
+ #define DEBUGME 0
175
+ #if DEBUGME
176
+ for (size_t i = 0; i < M.size(); ++i)
177
+ {
178
+ for (size_t j = 0; j < M[i].size(); ++j)
179
+ cout << M[i][j] << " ";
180
+ cout << endl;
181
+ }
182
+ cout << string(25,'-') << endl;
183
+ #endif
184
+
185
+ int i = M.size() -1;
186
+ int j = M.back().size() -1;
187
+ int I=i, J=j;
188
+ for (;i >= 0 || j >= 0; --i, --j)
189
+ {
190
+ I=i, J=j;
191
+ if (j>=0) while (i > 0 && M[i-1][j] == M[i][j]) --i;
192
+ if (i>=0) while (j > 0 && M[i][j-1] == M[i][j]) --j;
193
+ size_t ilen = I >= 0 ? I - i : 0;
194
+ size_t jlen = J >= 0 ? J - j : 0;
195
+ ret += max(ilen,jlen);
196
+ #if DEBUGME
197
+ cout << I << ":" << i << " " << J << ":" << j << " " << ret << endl;
198
+ #endif
199
+ I=i, J=j;
200
+ }
201
+ size_t ilen = I >= 0 ? I - i : 0;
202
+ size_t jlen = J >= 0 ? J - j : 0;
203
+ ret += max(ilen,jlen);
204
+ #if DEBUGME
205
+ cout << I << ":" << i << " " << J << ":" << j << " " << ret << endl;
206
+ #endif
207
+ return ret;
208
+ }
209
+
210
+
211
+
212
+ StringDiff::
213
+ Segment::
214
+ Segment(size_t const as, size_t const ae,
215
+ size_t const bs, size_t const be,
216
+ UnicodeString const& a,
217
+ UnicodeString const& b)
218
+ {
219
+ dist = 0;
220
+ start_a = as; end_a = ae;
221
+ start_b = bs; end_b = be;
222
+ if (as == ae)
223
+ match = bs == be ? same : insertion;
224
+ else if (bs == be)
225
+ match = deletion;
226
+ else if (be-bs != ae-as)
227
+ {
228
+ match = mismatch;
229
+ dist = stringdist::levenshtein(a.getBuffer() + as, ae - as,
230
+ b.getBuffer() + bs, be - bs);
231
+ }
232
+ else
233
+ {
234
+ match = same;
235
+ size_t stop = ae-as;
236
+ for (size_t i = 0; i < stop && match == same; ++i)
237
+ if (a[as+i] != b[bs+i]) match = mismatch;
238
+ if (match == mismatch)
239
+ {
240
+ if (ae-as == 2 && a[as] == b[bs+1] && a[as+1] == b[bs])
241
+ match = flip;
242
+ else
243
+ {
244
+ vector<UChar> x(a.getBuffer() + as, a.getBuffer() + ae);
245
+ vector<UChar> y(b.getBuffer() + bs, b.getBuffer() + be);
246
+ sort(x.begin(),x.end());
247
+ sort(y.begin(),y.end());
248
+ if (x == y) match = permutation;
249
+ else dist = stringdist::levenshtein(a.getBuffer() + as, ae - as,
250
+ b.getBuffer() + bs, be - bs);
251
+ }
252
+ }
253
+ }
254
+ if (match == insertion)
255
+ {
256
+ dist = be-bs;
257
+ }
258
+ else if (match == deletion)
259
+ {
260
+ dist = ae-as;
261
+ }
262
+ else if (match == flip) dist = 1;
263
+ else if (match == permutation) dist = ae-as-1;
264
+ if (match == mismatch)
265
+ {
266
+ UnicodeString ax(a,as,ae-as);
267
+ UnicodeString bx(b,bs,be-bs);
268
+ if (ax.toLower() == bx.toLower())
269
+ match = cap;
270
+ else
271
+ {
272
+ strip_accents(ax);
273
+ strip_accents(bx);
274
+ if (ax == bx) match = accent;
275
+ }
276
+ }
277
+ }
278
+
279
+ size_t
280
+ StringDiff::
281
+ align(bool force)
282
+ {
283
+ if (force) difflist.clear();
284
+ if (difflist.size()) return 0;
285
+ vector<vector<float> > M;
286
+ fillAlignmentMatrix(M);
287
+ // now backtrack
288
+ int i = a.length() - 1;
289
+ int j = b.length() - 1;
290
+ vector<int> A(a.length(), -1);
291
+ vector<int> B(b.length(), -1);
292
+ while (i + j)
293
+ {
294
+ while (i && M[i-1][j] == M[i][j]) --i;
295
+ while (j && M[i][j-1] == M[i][j]) --j;
296
+ if (a[i] == b[j]) { A[i] = j; B[j] = i; }
297
+ if (i) --i;
298
+ if (j) --j;
299
+ }
300
+ i = a.length() - 1;
301
+ j = b.length() - 1;
302
+ vector<int> A2(a.length(), -1);
303
+ vector<int> B2(b.length(), -1);
304
+ while (i + j)
305
+ {
306
+ while (j && M[i][j-1] == M[i][j]) --j;
307
+ while (i && M[i-1][j] == M[i][j]) --i;
308
+ if (a[i] == b[j]) { A2[i] = j; B2[j] = i; }
309
+ if (i) --i;
310
+ if (j) --j;
311
+ }
312
+ for (size_t k = 0; k < A.size(); ++k)
313
+ A[k] = min(A[k],A2[k]);
314
+ for (size_t k = 0; k < B.size(); ++k)
315
+ B[k] = min(B[k],B2[k]);
316
+
317
+ if (a[i] == b[j]) { A[i] = j; B[j] = i; }
318
+ i = 0;
319
+ j = 0;
320
+ size_t I, J;
321
+ while (i < a.length() and j < b.length())
322
+ {
323
+ if (A[i] < 0)
324
+ {
325
+ I = i + 1;
326
+ while (I < A.size() and A[I] < 0) ++I;
327
+ if (i)
328
+ { for (J = j = A[i-1]+1; J < B.size() && B[J] < 0; ++J); }
329
+ else if (I < A.size())
330
+ { for (j = J = A[I]; j && B[j-1] < 0; --j); }
331
+ else J = B.size();
332
+ difflist.push_back(Segment(i,I,j,J,a,b));
333
+ i = I; j = J;
334
+ }
335
+ else if (B[j] < 0)
336
+ {
337
+ for (J = j + 1; J < B.size() && B[J] < 0; ++J);
338
+ difflist.push_back(Segment(i,i,j,J,a,b));
339
+ j = J;
340
+ }
341
+ else
342
+ {
343
+ I = i;
344
+ J = j;
345
+ while(I < A.size() && A[I] >= 0 && J < B.size() && B[J] >= 0)
346
+ { ++I; ++J; }
347
+ difflist.push_back(Segment(i,I,j,J,a,b));
348
+ i = I; j = J;
349
+ }
350
+ }
351
+ if (i < a.length() || j < b.length())
352
+ difflist.push_back(Segment(i,a.length(),j,b.length(),a,b));
353
+
354
+ diffcnt.assign(noinit,0);
355
+ for (size_t i = 0; i < difflist.size(); ++i)
356
+ {
357
+ Segment & s = difflist[i];
358
+ if (s.match == insertion and
359
+ ((s.start_a and a[s.start_a - 1] == b[s.start_b]) or
360
+ (s.end_a < a.length() and a[s.end_a] == b[s.start_b])))
361
+ {
362
+ bool sameletter = true;
363
+ for (int i = s.start_b + 1; sameletter and i < s.end_b; ++i)
364
+ sameletter = b[i] == b[i-1];
365
+ if (sameletter) s.match = duplication;
366
+ }
367
+ else if (s.match == deletion and
368
+ ((s.start_b and b[s.start_b - 1] == a[s.start_a]) or
369
+ (s.end_b < b.length() and b[s.end_b] == a[s.start_a])))
370
+ {
371
+ bool sameletter = true;
372
+ for (int i = s.start_a + 1; sameletter and i < s.end_a; ++i)
373
+ sameletter = a[i] == a[i-1];
374
+ if (sameletter) s.match= duplication;
375
+ }
376
+ ++diffcnt[s.match];
377
+ }
378
+ return 0;
379
+ }
380
+
381
+ void
382
+ StringDiff::
383
+ showDiff(std::ostream& out)
384
+ {
385
+ if (difflist.size() == 0) align();
386
+ vector<size_t> fromEnd(difflist.size(),0);
387
+ for (int d = difflist.size()-1; d-- > 0;)
388
+ {
389
+ fromEnd[d] = a.length() - difflist[d].end_a;
390
+ // cout << d << " " << fromEnd[d] << " "
391
+ // << difflist[d].start_a << "-"
392
+ // << difflist[d].end_a << endl;
393
+ }
394
+ for (size_t d = 0; d < difflist.size(); ++d)
395
+ {
396
+ Segment const& s = difflist[d];
397
+ UnicodeString aseg,bseg;
398
+ a.extract(s.start_a, s.end_a - s.start_a, aseg);
399
+ b.extract(s.start_b, s.end_b - s.start_b, bseg);
400
+ string abuf,bbuf;
401
+ aseg.toUTF8String(abuf);
402
+ bseg.toUTF8String(bbuf);
403
+ out << abuf << " ";
404
+ out << bbuf << " ";
405
+ out << s.label() << " "
406
+ << s.dist << " "
407
+ << fromEnd[d]
408
+ << endl;
409
+ }
410
+ }
411
+
412
+ char const*
413
+ StringDiff::
414
+ Segment::
415
+ label() const
416
+ {
417
+ return elabel[this->match];
418
+ }
419
+
420
+ StringDiff::Segment const&
421
+ StringDiff::
422
+ operator[](uint32_t const i) const
423
+ {
424
+ return difflist.at(i);
425
+ }
426
+
427
+ vector<int> const&
428
+ StringDiff::
429
+ getFeatures() const
430
+ {
431
+ return diffcnt;
432
+ }
433
+
434
+ }
mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //-*- c++ -*-
2
+ #pragma once
3
+
4
+ // string distance measures
5
+ // Code by Ulrich Germann
6
+ #include<iostream>
7
+
8
+
9
+ #include <unicode/stringpiece.h>
10
+ #include <unicode/translit.h>
11
+ #include <unicode/utypes.h>
12
+ #include <unicode/unistr.h>
13
+ #include <unicode/uchar.h>
14
+ #include <unicode/utf8.h>
15
+ #include <vector>
16
+
17
+ #include "moses/TranslationModel/UG/mm/tpt_typedefs.h"
18
+
19
+
20
+ namespace stringdist
21
+ {
22
+ float
23
+ levenshtein(UChar const* a, size_t const lenA,
24
+ UChar const* b, size_t const lenB);
25
+
26
+ UErrorCode strip_accents(UnicodeString & trg);
27
+
28
+ float
29
+ fillAlignmentMatrix(UChar const* a, size_t const lenA,
30
+ UChar const* b, size_t const lenB,
31
+ std::vector<std::vector<float> > & M);
32
+
33
+ class StringDiff
34
+ {
35
+ public:
36
+ enum MATCHTYPE
37
+ {
38
+ same, // a and b are identical
39
+ cap, // a and b differ only in capitalization
40
+ flip, // two-letter flip
41
+ permutation, // a and b have same letters but in different order
42
+ accent, // a and b are the same basic letters, ignoring accents
43
+ duplication, // a is empty
44
+ insertion, // a is empty
45
+ deletion, // b is empty
46
+ mismatch, // none of the above
47
+ noinit // not initialized
48
+ };
49
+
50
+ struct Segment
51
+ {
52
+ static char const* elabel[];
53
+ int start_a, end_a;
54
+ int start_b, end_b;
55
+ MATCHTYPE match;
56
+ float dist;
57
+ Segment();
58
+ Segment(size_t const as, size_t const ae,
59
+ size_t const bs, size_t const be,
60
+ UnicodeString const& a,
61
+ UnicodeString const& b);
62
+ char const* label() const;
63
+ };
64
+ private:
65
+ UnicodeString a,b;
66
+ std::vector<Segment> difflist;
67
+ std::vector<int> diffcnt;
68
+ public:
69
+ UnicodeString const& set_a(std::string const& a);
70
+ UnicodeString const& set_b(std::string const& b);
71
+ UnicodeString const& get_a() const;
72
+ UnicodeString const& get_b() const;
73
+ StringDiff(std::string const& a, std::string const& b);
74
+ StringDiff();
75
+ size_t size();
76
+ size_t align(bool force=false); // returns the levenshtein distance
77
+ void showDiff(std::ostream& out);
78
+ float levenshtein();
79
+ Segment const& operator[](uint32_t i) const;
80
+ void fillAlignmentMatrix(std::vector<std::vector<float> > & M) const;
81
+ vector<int> const& getFeatures() const;
82
+ };
83
+ }
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.cc ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "ug_thread_pool.h"
2
+ namespace ug {
3
+
4
+ ThreadPool::
5
+ ThreadPool(size_t const num_workers)
6
+ : m_service(), m_busywork(new boost::asio::io_service::work(m_service))
7
+ {
8
+ m_workers.reserve(num_workers);
9
+ for (size_t i = 0; i < num_workers; ++i)
10
+ {
11
+ // boost::shared_ptr<boost::thread> t;
12
+ // t.reset(new boost::thread(boost::bind(&service_t::run, &m_service)));
13
+ boost::thread* t;
14
+ t = new boost::thread(boost::bind(&service_t::run, &m_service));
15
+ m_pool.add_thread(t);
16
+ // m_workers.push_back(t);
17
+ }
18
+ }
19
+
20
+ ThreadPool::
21
+ ~ThreadPool()
22
+ {
23
+ m_busywork.reset();
24
+ m_pool.join_all();
25
+ m_service.stop();
26
+ }
27
+
28
+
29
+
30
+
31
+ }
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.h ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*-
2
+ #pragma once
3
+ #include <boost/asio.hpp>
4
+ #include <boost/bind.hpp>
5
+ #include <boost/thread.hpp>
6
+ #include <boost/scoped_ptr.hpp>
7
+ #include <boost/shared_ptr.hpp>
8
+
9
+ #include <iostream>
10
+ #include <vector>
11
+ #include <string>
12
+
13
+ namespace ug {
14
+ class ThreadPool
15
+ {
16
+ typedef boost::asio::io_service service_t;
17
+ service_t m_service;
18
+ boost::thread_group m_pool;
19
+ boost::scoped_ptr<service_t::work> m_busywork;
20
+ std::vector<boost::shared_ptr<boost::thread> > m_workers;
21
+
22
+ public:
23
+ ThreadPool(size_t const num_workers);
24
+ ~ThreadPool();
25
+
26
+ template<class callable>
27
+ void add(callable& job) { m_service.post(job); }
28
+
29
+ }; // end of class declaration ThreadPool
30
+ } // end of namespace ug
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
2
+ #include "ug_thread_safe_counter.h"
3
+ // obsolete once <atomic> can be assumed to be available everywhere
4
+
5
+ namespace Moses
6
+ {
7
+ ThreadSafeCounter::
8
+ ThreadSafeCounter()
9
+ : ctr(0)
10
+ { }
11
+
12
+ size_t
13
+ ThreadSafeCounter::
14
+ operator++()
15
+ {
16
+ boost::lock_guard<boost::mutex> guard(this->lock);
17
+ return ++ctr;
18
+ }
19
+
20
+ size_t
21
+ ThreadSafeCounter::
22
+ operator++(int foo)
23
+ {
24
+ boost::lock_guard<boost::mutex> guard(this->lock);
25
+ return ctr++;
26
+ }
27
+
28
+ ThreadSafeCounter::
29
+ operator size_t() const
30
+ {
31
+ return ctr;
32
+ }
33
+
34
+ size_t
35
+ ThreadSafeCounter::
36
+ operator--()
37
+ {
38
+ boost::lock_guard<boost::mutex> guard(this->lock);
39
+ return --ctr;
40
+ }
41
+
42
+ size_t
43
+ ThreadSafeCounter::
44
+ operator--(int foo)
45
+ {
46
+ boost::lock_guard<boost::mutex> guard(this->lock);
47
+ return ctr--;
48
+ }
49
+
50
+
51
+ }
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <boost/thread.hpp>
3
+
4
+ namespace Moses
5
+ {
6
+ class ThreadSafeCounter
7
+ {
8
+ size_t ctr;
9
+ boost::mutex lock;
10
+ public:
11
+ ThreadSafeCounter();
12
+ size_t operator++();
13
+ size_t operator++(int);
14
+ size_t operator--();
15
+ size_t operator--(int);
16
+ operator size_t() const;
17
+ };
18
+
19
+ }
20
+
21
+
mosesdecoder/moses/TranslationModel/UG/mm/Makefile.x ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Some systems apparently distinguish between shell
# variables and environment variables. The latter are
# visible to the make utility, the former apparently not,
# so we need to set them if they are not defined yet

# ===============================================================================
# COMPILATION PREFERENCES
# ===============================================================================
# CCACHE: if set to ccache, use ccache to speed up compilation
# OPTI: optimization level
# PROF: profiler switches

CCACHE = ccache
OPTI = 3
EXE_TAG = exe
PROF =
# PROF = -g -pg

# ===============================================================================

SHELL = bash
MAKEFLAGS += --warn-undefined-variables
.DEFAULT_GOAL = all
.SUFFIXES:

# ===============================================================================
# COMPILATION 'LOCALIZATION'
HOST ?= $(shell hostname)
HOSTTYPE ?= $(shell uname -m)
KERNEL = $(shell uname -r)

# MOSES_ROOT is the single place that names the moses checkout; every path
# below is derived from it so that overriding it on the command line or in
# the environment is honoured consistently.
MOSES_ROOT ?= ${HOME}/code/mosesdecoder
WDIR = build/${HOSTTYPE}/${KERNEL}/${OPTI}
VPATH = ${MOSES_ROOT}/
CXXFLAGS = ${PROF} -ggdb -Wall -O${OPTI} ${INCLUDES}
CXXFLAGS += -DMAX_NUM_FACTORS=4
CXXFLAGS += -DKENLM_MAX_ORDER=5
modirs := $(addprefix -I,$(shell find ${MOSES_ROOT}/moses ${MOSES_ROOT}/contrib -type d))
CXXFLAGS += -I${MOSES_ROOT}
INCLUDES =
BZLIB =
BOOSTLIBTAG =

REQLIBS = m z pthread lzma ${BZLIB} \
	boost_thread${BOOSTLIBTAG} \
	boost_iostreams${BOOSTLIBTAG} \
	boost_program_options${BOOSTLIBTAG} \
	boost_system${BOOSTLIBTAG} \
	boost_filesystem${BOOSTLIBTAG}

# icuuc icuio icui18n \

LIBS = $(addprefix -l, ${REQLIBS} moses)
# same directory as the libmoses.a prerequisite further down
LIBDIRS = -L${MOSES_ROOT}/lib
LIBDIRS += -L${HOME}/lib
PREFIX ?= .
BINDIR ?= ${PREFIX}/bin
ifeq "$(OPTI)" "0"
BINPREF = debug.
else
BINPREF =
endif


OBJ2 :=

# $(call compile,<source file>): emits the rule that compiles one source
# file into ${WDIR} and records its dependency file in DEP.
define compile

DEP += ${WDIR}/$(basename $(notdir $1)).d
${WDIR}/$(basename $(notdir $1)).o : $1 $(wildcard $(basename $1).h)
	@echo -e "COMPILING $1"
	@mkdir -p $$(@D)
	${CXX} ${CXXFLAGS} -MD -MP -c $$(abspath $$<) -o $$@

endef

testprogs = test-dynamic-im-tsa
programs = mtt-build mtt-dump symal2mam custom-pt mmlex-build ${testprogs}
programs += mtt-count-words calc-coverage

all: $(addprefix ${BINDIR}/${BINPREF}, $(programs))
	@echo $^
clean:
	rm -f ${WDIR}/*.o ${WDIR}/*.d

custom-pt: ${BINDIR}/${BINPREF}custom-pt
	echo $^

INMOGEN = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/generic/*/*.cpp)
# OBJ: sources that have a matching header (library-like code);
# EXE: sources without one (program entry points)
OBJ = $(patsubst %.cc,%.o,$(wildcard $(patsubst %.h,%.cc,$(wildcard *.h))))
OBJ += $(patsubst %.cpp,%.o,${INMOGEN})
EXE = $(patsubst %.cc,%.o,$(filter-out $(patsubst %.h,%.cc,$(wildcard *.h)),$(wildcard *.cc)))

$(foreach cpp,${INMOGEN},$(eval $(call compile,${cpp})))
$(foreach cpp,$(wildcard *.cc),$(eval $(call compile,${cpp})))
$(addprefix ${BINDIR}/${BINPREF}, $(programs)): $(addprefix ${WDIR}/,$(notdir ${OBJ}))
$(addprefix ${BINDIR}/${BINPREF}, $(programs)): ${MOSES_ROOT}/lib/libmoses.a
${BINDIR}/${BINPREF}%: ${WDIR}/%.o
	echo PREREQS: $<
	$(CXX) $(CXXFLAGS) -o $@ $^ ${LIBDIRS} ${LIBS}

.SECONDARY:

-include $(DEP)
105
+
mosesdecoder/moses/TranslationModel/UG/mm/calc-coverage.cc ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "moses/TranslationModel/UG/mm/ug_mm_ttrack.h"
2
+ #include "moses/TranslationModel/UG/mm/ug_mm_tsa.h"
3
+ #include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
4
+ #include "moses/TranslationModel/UG/mm/ug_corpus_token.h"
5
+ #include "moses/TranslationModel/UG/mm/ug_typedefs.h"
6
+ #include "moses/TranslationModel/UG/mm/tpt_pickler.h"
7
+ #include "moses/TranslationModel/UG/mm/ug_bitext.h"
8
+ #include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
9
+
10
+ #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
11
+
12
+ // using namespace Moses;
13
+ using namespace ugdiss;
14
+ using namespace sapt;
15
+ using namespace std;
16
+
17
+ typedef L2R_Token<SimpleWordId> Token;
18
+ TokenIndex V;
19
+ SPTR<vector<vector<Token> > > C(new vector<vector<Token> >());
20
+ void
21
+ add_file(string fname)
22
+ {
23
+ boost::iostreams::filtering_istream in;
24
+ open_input_stream(fname,in);
25
+ string line;
26
+ while (getline(in,line))
27
+ {
28
+ C->push_back(vector<Token>());
29
+ fill_token_seq(V,line,C->back());
30
+ }
31
+ }
32
+
33
+ int
34
+ main(int argc, char* argv[])
35
+ {
36
+ V.setDynamic(true);
37
+ add_file(argv[1]);
38
+ SPTR<imTtrack<Token> > T(new imTtrack<Token>(C));
39
+ imTSA<Token> I(T,NULL,NULL);
40
+ string line;
41
+ while (getline(cin,line))
42
+ {
43
+ vector<Token> seq; fill_token_seq<Token>(V,line,seq);
44
+ for (size_t i = 0; i < seq.size(); ++i)
45
+ {
46
+ TSA<Token>::tree_iterator m(&I);
47
+ cout << V[seq[i].id()];
48
+ for (size_t k = i; k < seq.size() && m.extend(seq[k]); ++k)
49
+ {
50
+ cout << " ";
51
+ if (k > i) cout << V[seq[k].id()] << " ";
52
+ cout << "[" << m.approxOccurrenceCount() << "]";
53
+ }
54
+ cout << endl;
55
+ }
56
+ }
57
+ }
mosesdecoder/moses/TranslationModel/UG/mm/mmlex-build.cc ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- c++ -*-
2
+ // Program to extract word cooccurrence counts from a memory-mapped
3
+ // word-aligned bitext; stores the count lexicon in the format for
4
+ // mm2dTable<uint32_t> (ug_mm_2d_table.h)
5
+ //
6
+ // (c) 2010-2012 Ulrich Germann
7
+
8
+ // to do: multi-threading
9
+
10
+ #include <queue>
11
+ #include <iomanip>
12
+ #include <vector>
13
+ #include <iterator>
14
+ #include <sstream>
15
+ #include <algorithm>
16
+
17
+ #include <boost/program_options.hpp>
18
+ #include <boost/dynamic_bitset.hpp>
19
+ #include <boost/shared_ptr.hpp>
20
+ #include <boost/foreach.hpp>
21
+ #include <boost/thread.hpp>
22
+ #include <boost/math/distributions/binomial.hpp>
23
+ #include <boost/unordered_map.hpp>
24
+ #include <boost/unordered_set.hpp>
25
+
26
+ #include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
27
+ #include "moses/Util.h"
28
+ #include "ug_mm_2d_table.h"
29
+ #include "ug_mm_ttrack.h"
30
+ #include "ug_corpus_token.h"
31
+
32
+ using namespace std;
33
+ using namespace sapt;
34
+ using namespace ugdiss;
35
+ using namespace boost::math;
36
+
37
+ typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
38
+ typedef SimpleWordId Token;
39
+
40
+ // DECLARATIONS
41
+ void interpret_args(int ac, char* av[]);
42
+
43
+ mmTtrack<Token> T1,T2;
44
+ mmTtrack<char> Tx;
45
+ TokenIndex V1,V2;
46
+
47
+ typedef pair<id_type,id_type> wpair;
48
// Pair of counters kept per word pair.  writeTable() sums 'a' into the
// table written to aln_out and 'c' into the table written to coc_out.
struct Count
{
  uint32_t a;
  uint32_t c;

  Count()
    : a(0), c(0)
  {}

  Count(uint32_t ax, uint32_t cx)
    : a(ax), c(cx)
  {}
};
55
+
56
+ bool
57
+ operator<(pair<id_type,Count> const& a,
58
+ pair<id_type,Count> const& b)
59
+ {
60
+ return a.first < b.first;
61
+ }
62
+
63
+
64
+ typedef boost::unordered_map<wpair,Count> countmap_t;
65
+ typedef vector<vector<pair<id_type,Count> > > countlist_t;
66
+
67
+ vector<countlist_t> XLEX;
68
+
69
+ class Counter
70
+ {
71
+ public:
72
+ countmap_t CNT;
73
+ countlist_t & LEX;
74
+ size_t offset;
75
+ size_t skip;
76
+ Counter(countlist_t& lex, size_t o, size_t s)
77
+ : LEX(lex), offset(o), skip(s) {}
78
+ void processSentence(id_type sid);
79
+ void operator()();
80
+ };
81
+
82
+ string bname,cfgFile,L1,L2,oname,cooc;
83
+ int verbose;
84
+ size_t truncat;
85
+ size_t num_threads;
86
+
87
+ void
88
+ Counter::
89
+ operator()()
90
+ {
91
+ for (size_t sid = offset; sid < min(truncat,T1.size()); sid += skip)
92
+ processSentence(sid);
93
+
94
+ LEX.resize(V1.ksize());
95
+ for (countmap_t::const_iterator c = CNT.begin(); c != CNT.end(); ++c)
96
+ {
97
+ pair<id_type,Count> foo(c->first.second,c->second);
98
+ LEX.at(c->first.first).push_back(foo);
99
+ }
100
+ typedef vector<pair<id_type,Count> > v_t;
101
+ BOOST_FOREACH(v_t& v, LEX)
102
+ sort(v.begin(),v.end());
103
+ }
104
+
105
+ struct lexsorter
106
+ {
107
+ vector<countlist_t> const& v;
108
+ id_type wid;
109
+ lexsorter(vector<countlist_t> const& vx, id_type widx)
110
+ : v(vx),wid(widx) {}
111
+ bool operator()(pair<uint32_t,uint32_t> const& a,
112
+ pair<uint32_t,uint32_t> const& b) const
113
+ {
114
+ return (v.at(a.first).at(wid).at(a.second).first >
115
+ v.at(b.first).at(wid).at(b.second).first);
116
+ }
117
+ };
118
+
119
+ void
120
+ writeTableHeader(ostream& out)
121
+ {
122
+ filepos_type idxOffset=0;
123
+ tpt::numwrite(out,idxOffset); // blank for the time being
124
+ tpt::numwrite(out,id_type(V1.ksize()));
125
+ tpt::numwrite(out,id_type(V2.ksize()));
126
+ }
127
+
128
+ void writeTable(ostream* aln_out, ostream* coc_out)
129
+ {
130
+ vector<uint32_t> m1a(V1.ksize(),0); // marginals L1
131
+ vector<uint32_t> m2a(V2.ksize(),0); // marginals L2
132
+ vector<uint32_t> m1c(V1.ksize(),0); // marginals L1
133
+ vector<uint32_t> m2c(V2.ksize(),0); // marginals L2
134
+ vector<id_type> idxa(V1.ksize()+1,0);
135
+ vector<id_type> idxc(V1.ksize()+1,0);
136
+ if (aln_out) writeTableHeader(*aln_out);
137
+ if (coc_out) writeTableHeader(*coc_out);
138
+ size_t CellCountA=0,CellCountC=0;
139
+ for (size_t id1 = 0; id1 < V1.ksize(); ++id1)
140
+ {
141
+ idxa[id1] = CellCountA;
142
+ idxc[id1] = CellCountC;
143
+ lexsorter sorter(XLEX,id1);
144
+ vector<pair<uint32_t,uint32_t> > H; H.reserve(num_threads);
145
+ for (size_t i = 0; i < num_threads; ++i)
146
+ {
147
+ if (id1 < XLEX.at(i).size() && XLEX[i][id1].size())
148
+ H.push_back(pair<uint32_t,uint32_t>(i,0));
149
+ }
150
+ if (!H.size()) continue;
151
+ make_heap(H.begin(),H.end(),sorter);
152
+ while (H.size())
153
+ {
154
+ id_type id2 = XLEX[H[0].first][id1][H[0].second].first;
155
+ uint32_t aln = XLEX[H[0].first][id1][H[0].second].second.a;
156
+ uint32_t coc = XLEX[H[0].first][id1][H[0].second].second.c;
157
+ pop_heap(H.begin(),H.end(),sorter);
158
+ ++H.back().second;
159
+ if (H.back().second == XLEX[H.back().first][id1].size())
160
+ H.pop_back();
161
+ else
162
+ push_heap(H.begin(),H.end(),sorter);
163
+ while (H.size() &&
164
+ XLEX[H[0].first][id1].at(H[0].second).first == id2)
165
+ {
166
+ aln += XLEX[H[0].first][id1][H[0].second].second.a;
167
+ coc += XLEX[H[0].first][id1][H[0].second].second.c;
168
+ pop_heap(H.begin(),H.end(),sorter);
169
+ ++H.back().second;
170
+ if (H.back().second == XLEX[H.back().first][id1].size())
171
+ H.pop_back();
172
+ else
173
+ push_heap(H.begin(),H.end(),sorter);
174
+ }
175
+ if (aln_out)
176
+ {
177
+ ++CellCountA;
178
+ tpt::numwrite(*aln_out,id2);
179
+ tpt::numwrite(*aln_out,aln);
180
+ m1a[id1] += aln;
181
+ m2a[id2] += aln;
182
+ }
183
+ if (coc_out && coc)
184
+ {
185
+ ++CellCountC;
186
+ tpt::numwrite(*coc_out,id2);
187
+ tpt::numwrite(*coc_out,coc);
188
+ m1c[id1] += coc;
189
+ m2c[id2] += coc;
190
+ }
191
+ }
192
+ }
193
+ idxa.back() = CellCountA;
194
+ idxc.back() = CellCountC;
195
+ if (aln_out)
196
+ {
197
+ filepos_type idxOffsetA = aln_out->tellp();
198
+ BOOST_FOREACH(id_type foo, idxa)
199
+ tpt::numwrite(*aln_out,foo);
200
+ aln_out->write(reinterpret_cast<char const*>(&m1a[0]),m1a.size()*4);
201
+ aln_out->write(reinterpret_cast<char const*>(&m2a[0]),m2a.size()*4);
202
+ aln_out->seekp(0);
203
+ tpt::numwrite(*aln_out,idxOffsetA);
204
+ }
205
+ if (coc_out)
206
+ {
207
+ filepos_type idxOffsetC = coc_out->tellp();
208
+ BOOST_FOREACH(id_type foo, idxc)
209
+ tpt::numwrite(*coc_out,foo);
210
+ coc_out->write(reinterpret_cast<char const*>(&m1c[0]),m1c.size()*4);
211
+ coc_out->write(reinterpret_cast<char const*>(&m2c[0]),m2c.size()*4);
212
+ coc_out->seekp(0);
213
+ tpt::numwrite(*coc_out,idxOffsetC);
214
+ }
215
+ }
216
+
217
+ void
218
+ Counter::
219
+ processSentence(id_type sid)
220
+ {
221
+ Token const* s1 = T1.sntStart(sid);
222
+ Token const* e1 = T1.sntEnd(sid);
223
+ Token const* s2 = T2.sntStart(sid);
224
+ Token const* e2 = T2.sntEnd(sid);
225
+ // vector<ushort> cnt1(V1.ksize(),0);
226
+ // vector<ushort> cnt2(V2.ksize(),0);
227
+ // for (Token const* x = s1; x < e1; ++x)
228
+ // ++cnt1.at(x->id());
229
+ // for (Token const* x = s2; x < e2; ++x)
230
+ // ++cnt2.at(x->id());
231
+
232
+ // boost::unordered_set<wpair> seen;
233
+ bitvector check1(T1.sntLen(sid)); check1.set();
234
+ bitvector check2(T2.sntLen(sid)); check2.set();
235
+
236
+ // count links
237
+ char const* p = Tx.sntStart(sid);
238
+ char const* q = Tx.sntEnd(sid);
239
+ ushort r,c;
240
+ if (verbose && sid % 1000000 == 0)
241
+ cerr << sid/1000000 << " M sentences processed" << endl;
242
+ while (p < q)
243
+ {
244
+ p = tpt::binread(p,r);
245
+ p = tpt::binread(p,c);
246
+ // cout << sid << " " << r << "-" << c << endl;
247
+ UTIL_THROW_IF2(r >= check1.size(), "out of bounds at line " << sid);
248
+ UTIL_THROW_IF2(c >= check2.size(), "out of bounds at line " << sid);
249
+ // assert(r < check1.size());
250
+ // assert(c < check2.size());
251
+ UTIL_THROW_IF2(s1+r >= e1, "out of bounds at line " << sid);
252
+ UTIL_THROW_IF2(s2+c >= e2, "out of bounds at line " << sid);
253
+ // assert(s1+r < e1);
254
+ // assert(s2+c < e2);
255
+ check1.reset(r);
256
+ check2.reset(c);
257
+ id_type id1 = (s1+r)->id();
258
+ id_type id2 = (s2+c)->id();
259
+ wpair k(id1,id2);
260
+ Count& cnt = CNT[k];
261
+ cnt.a++;
262
+ // if (seen.insert(k).second)
263
+ // cnt.c += cnt1[id1] * cnt2[id2];
264
+ }
265
+ // count unaliged words
266
+ for (size_t i = check1.find_first();
267
+ i < check1.size();
268
+ i = check1.find_next(i))
269
+ CNT[wpair((s1+i)->id(),0)].a++;
270
+ for (size_t i = check2.find_first();
271
+ i < check2.size();
272
+ i = check2.find_next(i))
273
+ CNT[wpair(0,(s2+i)->id())].a++;
274
+ }
275
+
276
+ int
277
+ main(int argc, char* argv[])
278
+ {
279
+ interpret_args(argc,argv);
280
+ char c = *bname.rbegin();
281
+ if (c != '/' && c != '.') bname += '.';
282
+ T1.open(bname+L1+".mct");
283
+ T2.open(bname+L2+".mct");
284
+ Tx.open(bname+L1+"-"+L2+".mam");
285
+ V1.open(bname+L1+".tdx");
286
+ V2.open(bname+L2+".tdx");
287
+ if (!truncat) truncat = T1.size();
288
+ XLEX.resize(num_threads);
289
+ vector<boost::shared_ptr<boost::thread> > workers(num_threads);
290
+ for (size_t i = 0; i < num_threads; ++i)
291
+ workers[i].reset(new boost::thread(Counter(XLEX[i],i,num_threads)));
292
+ for (size_t i = 0; i < workers.size(); ++i)
293
+ workers[i]->join();
294
+ // cerr << "done counting" << endl;
295
+ ofstream aln_out,coc_out;
296
+ if (oname.size()) aln_out.open(oname.c_str());
297
+ // if (cooc.size()) coc_out.open(cooc.c_str());
298
+ writeTable(oname.size() ? &aln_out : NULL,
299
+ cooc.size() ? &coc_out : NULL);
300
+ if (oname.size()) aln_out.close();
301
+ // if (cooc.size()) coc_out.close();
302
+ }
303
+
304
+ void
305
+ interpret_args(int ac, char* av[])
306
+ {
307
+ namespace po=boost::program_options;
308
+ po::variables_map vm;
309
+ po::options_description o("Options");
310
+ po::options_description h("Hidden Options");
311
+ po::positional_options_description a;
312
+
313
+ o.add_options()
314
+ ("help,h", "print this message")
315
+ ("cfg,f", po::value<string>(&cfgFile),"config file")
316
+ ("oname,o", po::value<string>(&oname),"output file name")
317
+ // ("cooc,c", po::value<string>(&cooc),
318
+ // "file name for raw co-occurrence counts")
319
+ ("verbose,v", po::value<int>(&verbose)->default_value(0)->implicit_value(1),
320
+ "verbosity level")
321
+ ("threads,t", po::value<size_t>(&num_threads)->default_value(4),
322
+ "count in <N> parallel threads")
323
+ ("truncate,n", po::value<size_t>(&truncat)->default_value(0),
324
+ "truncate corpus to <N> sentences (for debugging)")
325
+ ;
326
+
327
+ h.add_options()
328
+ ("bname", po::value<string>(&bname), "base name")
329
+ ("L1", po::value<string>(&L1),"L1 tag")
330
+ ("L2", po::value<string>(&L2),"L2 tag")
331
+ ;
332
+ a.add("bname",1);
333
+ a.add("L1",1);
334
+ a.add("L2",1);
335
+ get_options(ac,av,h.add(o),a,vm,"cfg");
336
+
337
+ if (vm.count("help") || bname.empty() || (oname.empty() && cooc.empty()))
338
+ {
339
+ cout << "usage:\n\t" << av[0] << " <basename> <L1 tag> <L2 tag> [-o <output file>] [-c <output file>]\n" << endl;
340
+ cout << "at least one of -o / -c must be specified." << endl;
341
+ cout << o << endl;
342
+ exit(0);
343
+ }
344
+ size_t num_cores = boost::thread::hardware_concurrency();
345
+ num_threads = min(num_threads,num_cores);
346
+ }
347
+
348
+
mosesdecoder/moses/TranslationModel/UG/mm/mtt-build.cc ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- c++ -*-
2
+ // Converts a corpus in text format (plain text, one sentence per line) or
3
+ // conll format or treetagger output format (which one is automatically
4
+ // recognized based on the number of fields per line) into memory-mapped
5
+ // format. (c) 2007-2013 Ulrich Germann
6
+
7
+ #include <boost/algorithm/string/predicate.hpp>
8
+ #include <boost/program_options.hpp>
9
+ #include <boost/program_options/options_description.hpp>
10
+ #include <boost/program_options/parsers.hpp>
11
+ #include <boost/program_options/variables_map.hpp>
12
+ #include <boost/iostreams/device/mapped_file.hpp>
13
+
14
+ #include <iostream>
15
+ #include <fstream>
16
+ #include <sstream>
17
+ #include <iomanip>
18
+ #include <vector>
19
+ #include <string>
20
+
21
+ #include <sys/types.h>
22
+ #include <sys/wait.h>
23
+
24
+ #include "ug_conll_record.h"
25
+ #include "tpt_tokenindex.h"
26
+ #include "ug_mm_ttrack.h"
27
+ #include "tpt_pickler.h"
28
+ #include "ug_deptree.h"
29
+ #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
30
+ #include "moses/TranslationModel/UG/mm/ug_im_tsa.h"
31
+
32
+ using namespace std;
33
+ using namespace sapt;
34
+ using namespace Moses;
35
+ using namespace boost;
36
+ using namespace boost::algorithm;
37
+ namespace po=boost::program_options;
38
+
39
+ int with_pfas;
40
+ int with_dcas;
41
+ int with_sfas;
42
+
43
+ bool incremental = false; // build / grow vocabs automatically
44
+ bool is_conll = false; // text or conll format?
45
+ bool quiet = false; // no progress reporting
46
+
47
+ string vocabBase; // base name for existing vocabs that should be used
48
+ string baseName; // base name for all files
49
+ string tmpFile, mttFile; /* name of temporary / actual track file
50
+ * (.mtt for Conll format, .mct for plain text)
51
+ */
52
+ string UNK;
53
+
54
+ TokenIndex SF; // surface form
55
+ TokenIndex LM; // lemma
56
+ TokenIndex PS; // part of speech
57
+ TokenIndex DT; // dependency type
58
+
59
+ void interpret_args(int ac, char* av[]);
60
+
61
+ inline uchar rangeCheck(int p, int limit) { return p < limit ? p : 1; }
62
+
63
+ id_type
64
+ get_id(TokenIndex const& T, string const& w)
65
+ {
66
+ id_type ret = T[w];
67
+ if (ret == 1 && w != UNK)
68
+ {
69
+ cerr << "Warning! Unkown vocabulary item '" << w << "', but "
70
+ << "incremental mode (-i) is not set." << endl;
71
+ assert(0);
72
+ }
73
+ return ret;
74
+ }
75
+
76
+ void
77
+ open_vocab(TokenIndex& T, string fname)
78
+ {
79
+ if (!access(fname.c_str(), F_OK))
80
+ {
81
+ T.open(fname,UNK);
82
+ assert(T[UNK] == 1);
83
+ }
84
+ else T.setUnkLabel(UNK);
85
+ if (incremental) T.setDynamic(true);
86
+ assert(T["NULL"] == 0);
87
+ assert(T[UNK] == 1);
88
+ }
89
+
90
+ void
91
+ ini_cnt_vec(TokenIndex const& T, vector<pair<string,size_t> > & v)
92
+ {
93
+ v.resize(T.totalVocabSize());
94
+ for (size_t i = 0; i < T.totalVocabSize(); ++i)
95
+ {
96
+ v[i].first = T[i];
97
+ v[i].second = 0;
98
+ }
99
+ }
100
+
101
+ void
102
+ write_tokenindex(string fname, TokenIndex& T, vector<id_type> const& n2o)
103
+ {
104
+ if (!quiet) cerr << "Writing " << fname << endl;
105
+ vector<id_type> o2n(n2o.size());
106
+ for (id_type i = 0; i < n2o.size(); ++i) o2n[n2o[i]] = i;
107
+ vector<pair<string,uint32_t> > v(n2o.size());
108
+ for (id_type i = 0; i < n2o.size(); ++i)
109
+ {
110
+ v[i].first = T[n2o[i]];
111
+ v[i].second = i;
112
+ }
113
+ T.close();
114
+ sort(v.begin(),v.end());
115
+ write_tokenindex_to_disk(v, fname, UNK);
116
+ }
117
+
118
+ void init(int argc, char* argv[])
119
+ {
120
+ interpret_args(argc,argv);
121
+ if (is_conll)
122
+ {
123
+ open_vocab(SF, vocabBase+".tdx.sfo"); // surface form
124
+ open_vocab(LM, vocabBase+".tdx.lem"); // lemma
125
+ open_vocab(PS, vocabBase+".tdx.pos"); // part-of-speech
126
+ open_vocab(DT, vocabBase+".tdx.drl"); // dependency type
127
+ }
128
+ else open_vocab(SF, vocabBase+".tdx"); // surface form
129
+ }
130
+
131
+ void fill_rec(Conll_Record& rec, vector<string> const& w)
132
+ {
133
+ if (w.size() == 3) // treetagger output
134
+ {
135
+ rec.sform = get_id(SF, w[0]);
136
+ rec.lemma = get_id(LM, w[2] == "<UNKNOWN>" ? w[0] : w[2]);
137
+ rec.majpos = rangeCheck(get_id(PS, w[1]), 256);
138
+ rec.minpos = rangeCheck(get_id(PS, w[1]), 256);
139
+ rec.dtype = 0;
140
+ rec.parent = -1;
141
+ }
142
+ else if (w.size() >= 8) // CONLL format
143
+ {
144
+ int id = atoi(w[0].c_str());
145
+ int gov = atoi(w[6].c_str());
146
+ rec.sform = get_id(SF, w[1]);
147
+ rec.lemma = get_id(LM, w[2]);
148
+ rec.majpos = rangeCheck(get_id(PS, w[3]), 256);
149
+ rec.minpos = rangeCheck(get_id(PS, w[4]), 256);
150
+ rec.dtype = get_id(DT, w[7]);
151
+ rec.parent = gov ? gov - id : 0;
152
+ }
153
+ }
154
+
155
// Progress meter on stderr: a dot for every 10,000 sentences and a new
// line with the running total (in thousands) for every 100,000.
void log_progress(size_t ctr)
{
  if (ctr % 10000 != 0)
    return;                    // nothing to report

  if (ctr % 100000 != 0)
    {
      cerr << ".";
      return;
    }

  if (ctr != 0) cerr << endl;  // finish the previous line of dots
  cerr << setw(12) << ctr / 1000 << "K sentences processed ";
}
167
+
168
+
169
+ size_t
170
+ process_plain_input(ostream& out, vector<id_type> & s_index)
171
+ {
172
+ id_type totalWords = 0;
173
+ string line,w;
174
+ while (getline(cin,line))
175
+ {
176
+ istringstream buf(line);
177
+ if (!quiet) log_progress(s_index.size());
178
+ s_index.push_back(totalWords);
179
+ while (buf>>w)
180
+ {
181
+ tpt::numwrite(out,get_id(SF,w));
182
+ ++totalWords;
183
+ }
184
+ }
185
+ s_index.push_back(totalWords);
186
+ return totalWords;
187
+ }
188
+
189
+ size_t
190
+ process_tagged_input(ostream& out,
191
+ vector<id_type> & s_index,
192
+ vector<id_type> & p_index)
193
+ {
194
+ string line;
195
+ Conll_Record rec;
196
+ bool new_sent = true;
197
+ bool new_par = true;
198
+ id_type totalWords = 0;
199
+
200
+ while (getline(cin,line))
201
+ {
202
+ vector<string> w; string f; istringstream buf(line);
203
+ while (buf>>f) w.push_back(f);
204
+
205
+ if (w.size() == 0 || starts_with(w[0], "SID="))
206
+ new_sent = true;
207
+
208
+ else if (w.size() == 1 && w[0] == "<P>")
209
+ new_par = new_sent = true;
210
+
211
+ if (w.size() < 3) continue;
212
+ if (!quiet && new_sent) log_progress(s_index.size());
213
+ if (new_sent) { s_index.push_back(totalWords); new_sent = false; }
214
+ if (new_par) { p_index.push_back(totalWords); new_par = false; }
215
+ fill_rec(rec,w);
216
+ out.write(reinterpret_cast<char const*>(&rec),sizeof(rec));
217
+ ++totalWords;
218
+ }
219
+ s_index.push_back(totalWords);
220
+ return totalWords;
221
+ }
222
+
223
+ size_t
224
+ numberize()
225
+ {
226
+ ofstream out(tmpFile.c_str());
227
+ filepos_type startIdx=0;
228
+ id_type idxSize=0,totalWords=0;
229
+ tpt::numwrite(out,startIdx); // place holder, to be filled at the end
230
+ tpt::numwrite(out,idxSize); // place holder, to be filled at the end
231
+ tpt::numwrite(out,totalWords); // place holder, to be filled at the end
232
+
233
+ vector<id_type> s_index, p_index;
234
+
235
+ if(is_conll)
236
+ totalWords = process_tagged_input(out,s_index,p_index);
237
+ else
238
+ totalWords = process_plain_input(out,s_index);
239
+
240
+ vector<id_type> const* index = &s_index;
241
+ if (p_index.size() && p_index.back())
242
+ {
243
+ p_index.push_back(totalWords);
244
+ index = &p_index;
245
+ }
246
+
247
+ if (!quiet)
248
+ cerr << endl << "Writing index ... (" << index->size() << " chunks) ";
249
+
250
+ startIdx = out.tellp();
251
+ for (size_t i = 0; i < index->size(); i++)
252
+ tpt::numwrite(out,(*index)[i]);
253
+ out.seekp(0);
254
+ idxSize = index->size();
255
+ tpt::numwrite(out, startIdx);
256
+ tpt::numwrite(out, idxSize - 1);
257
+ tpt::numwrite(out, totalWords);
258
+ out.close();
259
+ if (!quiet) cerr << "done" << endl;
260
+ return totalWords;
261
+ }
262
+
263
+ vector<id_type> smap,lmap,pmap,dmap;
264
+
265
+ void
266
+ invert(vector<id_type> const& from, vector<id_type> & to)
267
+ {
268
+ to.resize(from.size());
269
+ for (size_t i = 0 ; i < to.size(); ++i)
270
+ to[from[i]] = i;
271
+ }
272
+
273
+ // sorts new items based on occurrence counts but won't reassign
274
+ // existing token ids
275
+ void
276
+ conservative_sort(TokenIndex const & V,
277
+ vector<size_t> const & cnt,
278
+ vector<id_type> & xmap)
279
+ {
280
+ xmap.resize(V.totalVocabSize());
281
+ for (size_t i = 0; i < xmap.size(); ++i) xmap[i] = i;
282
+ VectorIndexSorter<size_t,greater<size_t>, id_type> sorter(cnt);
283
+ sort(xmap.begin()+max(id_type(2),V.knownVocabSize()), xmap.end(), sorter);
284
+ }
285
+
286
+ // reassign token ids in the corpus track based on the id map created by
287
+ // conservative_sort
288
+ void remap()
289
+ {
290
+ if (!quiet) cerr << "Remapping ids ... ";
291
+ filepos_type idxOffset;
292
+ id_type totalWords, idxSize;
293
+ boost::iostreams::mapped_file mtt(tmpFile);
294
+ char const* p = mtt.data();
295
+ p = tpt::numread(p,idxOffset);
296
+ p = tpt::numread(p,idxSize);
297
+ p = tpt::numread(p,totalWords);
298
+ if (is_conll)
299
+ {
300
+ vector<size_t> sf(SF.totalVocabSize(), 0);
301
+ vector<size_t> lm(LM.totalVocabSize(), 0);
302
+ vector<size_t> ps(PS.totalVocabSize(), 0);
303
+ vector<size_t> dt(DT.totalVocabSize(), 0);
304
+ Conll_Record* w = reinterpret_cast<Conll_Record*>(const_cast<char*>(p));
305
+ for (size_t i = 0; i < totalWords; ++i)
306
+ {
307
+ ++sf.at(w[i].sform);
308
+ ++lm.at(w[i].lemma);
309
+ ++ps.at(w[i].majpos);
310
+ ++ps.at(w[i].minpos);
311
+ ++dt.at(w[i].dtype);
312
+ }
313
+ conservative_sort(SF,sf,smap);
314
+ conservative_sort(LM,lm,lmap);
315
+ conservative_sort(PS,ps,pmap);
316
+ conservative_sort(DT,dt,dmap);
317
+ vector<id_type> smap_i(smap.size()); invert(smap,smap_i);
318
+ vector<id_type> lmap_i(lmap.size()); invert(lmap,lmap_i);
319
+ vector<id_type> pmap_i(pmap.size()); invert(pmap,pmap_i);
320
+ vector<id_type> dmap_i(dmap.size()); invert(dmap,dmap_i);
321
+ for (size_t i = 0; i < totalWords; ++i)
322
+ {
323
+ w[i].sform = smap_i[w[i].sform];
324
+ w[i].lemma = lmap_i[w[i].lemma];
325
+ w[i].majpos = pmap_i[w[i].majpos];
326
+ w[i].minpos = pmap_i[w[i].minpos];
327
+ w[i].dtype = dmap_i[w[i].dtype];
328
+ }
329
+ }
330
+ else
331
+ {
332
+ vector<size_t> sf(SF.totalVocabSize(), 0);
333
+ id_type* w = reinterpret_cast<id_type*>(const_cast<char*>(p));
334
+ for (size_t i = 0; i < totalWords; ++i) ++sf.at(w[i]);
335
+ conservative_sort(SF,sf,smap);
336
+ vector<id_type> smap_i(smap.size()); invert(smap,smap_i);
337
+ for (size_t i = 0; i < totalWords; ++i) w[i] = smap_i[w[i]];
338
+ }
339
+ mtt.close();
340
+ if (!quiet) cerr << "done." << endl;
341
+ }
342
+
343
+ void save_vocabs()
344
+ {
345
+ string vbase = baseName;
346
+ if (is_conll)
347
+ {
348
+ if (SF.totalVocabSize() > SF.knownVocabSize())
349
+ write_tokenindex(vbase+".tdx.sfo",SF,smap);
350
+ if (LM.totalVocabSize() > LM.knownVocabSize())
351
+ write_tokenindex(vbase+".tdx.lem",LM,lmap);
352
+ if (PS.totalVocabSize() > PS.knownVocabSize())
353
+ write_tokenindex(vbase+".tdx.pos",PS,pmap);
354
+ if (DT.totalVocabSize() > DT.knownVocabSize())
355
+ write_tokenindex(vbase+".tdx.drl",DT,dmap);
356
+ }
357
+ else if (SF.totalVocabSize() > SF.knownVocabSize())
358
+ write_tokenindex(vbase+".tdx",SF,smap);
359
+ }
360
+
361
+ template<typename Token>
362
+ void
363
+ build_mmTSA(string infile, string outfile)
364
+ {
365
+ // size_t mypid = fork();
366
+ // if(mypid) return mypid;
367
+ boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>(infile));
368
+ bdBitset filter;
369
+ filter.resize(T->size(),true);
370
+ imTSA<Token> S(T,&filter,(quiet?NULL:&cerr));
371
+ S.save_as_mm_tsa(outfile);
372
+ // exit(0);
373
+ }
374
+
375
+ bool
376
+ build_plaintext_tsas()
377
+ {
378
+ typedef L2R_Token<SimpleWordId> L2R;
379
+ typedef R2L_Token<SimpleWordId> R2L;
380
+ // size_t c = with_sfas + with_pfas;
381
+ if (with_sfas) build_mmTSA<L2R>(tmpFile, baseName + ".sfa");
382
+ if (with_pfas) build_mmTSA<R2L>(tmpFile, baseName + ".pfa");
383
+ // while (c--) wait(NULL);
384
+ return true;
385
+ }
386
+
387
+ void build_conll_tsas()
388
+ {
389
+ string bn = baseName;
390
+ string mtt = tmpFile;
391
+ size_t c = 3 * (with_sfas + with_pfas + with_dcas);
392
+ if (with_sfas)
393
+ {
394
+ build_mmTSA<L2R_Token<Conll_Sform> >(mtt,bn+".sfa-sform");
395
+ build_mmTSA<L2R_Token<Conll_Lemma> >(mtt,bn+".sfa-lemma");
396
+ build_mmTSA<L2R_Token<Conll_MinPos> >(mtt,bn+".sfa-minpos");
397
+ }
398
+
399
+ if (with_pfas)
400
+ {
401
+ build_mmTSA<R2L_Token<Conll_Sform> >(mtt,bn+".pfa-sform");
402
+ build_mmTSA<R2L_Token<Conll_Lemma> >(mtt,bn+".pfa-lemma");
403
+ build_mmTSA<R2L_Token<Conll_MinPos> >(mtt,bn+".pfa-minpos");
404
+ }
405
+
406
+ if (with_dcas)
407
+ {
408
+ build_mmTSA<ConllBottomUpToken<Conll_Sform> >(mtt,bn+".dca-sform");
409
+ build_mmTSA<ConllBottomUpToken<Conll_Lemma> >(mtt,bn+".dca-lemma");
410
+ build_mmTSA<ConllBottomUpToken<Conll_MinPos> >(mtt,bn+".dca-minpos");
411
+ }
412
+ // while (c--) wait(NULL);
413
+ }
414
+
415
+
416
+ int main(int argc, char* argv[])
417
+ {
418
+ init(argc,argv);
419
+ numberize();
420
+ if (SF.totalVocabSize() > SF.knownVocabSize() ||
421
+ LM.totalVocabSize() > LM.knownVocabSize() ||
422
+ PS.totalVocabSize() > PS.knownVocabSize() ||
423
+ DT.totalVocabSize() > DT.knownVocabSize())
424
+ {
425
+ remap();
426
+ save_vocabs();
427
+ }
428
+ if (is_conll) build_conll_tsas();
429
+ else build_plaintext_tsas();
430
+ if (!quiet) cerr << endl;
431
+ rename(tmpFile.c_str(),mttFile.c_str());
432
+ }
433
+
434
+ void
435
+ interpret_args(int ac, char* av[])
436
+ {
437
+ po::variables_map vm;
438
+ po::options_description o("Options");
439
+ o.add_options()
440
+
441
+ ("help,h", "print this message")
442
+
443
+ ("quiet,q", po::bool_switch(&quiet),
444
+ "don't print progress information")
445
+
446
+ ("incremental,i", po::bool_switch(&incremental),
447
+ "incremental mode; rewrites vocab files!")
448
+
449
+ ("vocab-base,v", po::value<string>(&vocabBase),
450
+ "base name of various vocabularies")
451
+
452
+ ("output,o", po::value<string>(&baseName),
453
+ "base file name of the resulting file(s)")
454
+
455
+ ("sfa,s", po::value<int>(&with_sfas)->default_value(1),
456
+ "also build suffix arrays")
457
+
458
+ ("pfa,p", po::value<int>(&with_pfas)
459
+ ->default_value(0)->implicit_value(1),
460
+ "also build prefix arrays")
461
+
462
+ ("dca,d", po::value<int>(&with_dcas)
463
+ ->default_value(0)->implicit_value(1),
464
+ "also build dependency chain arrays")
465
+
466
+ ("conll,c", po::bool_switch(&is_conll),
467
+ "corpus is in CoNLL format (default: plain text)")
468
+
469
+ ("unk,u", po::value<string>(&UNK)->default_value("UNK"),
470
+ "label for unknown tokens")
471
+
472
+ // ("map,m", po::value<string>(&vmap),
473
+ // "map words to word classes for indexing")
474
+
475
+ ;
476
+
477
+ po::options_description h("Hidden Options");
478
+ h.add_options()
479
+ ;
480
+ h.add(o);
481
+ po::positional_options_description a;
482
+ a.add("output",1);
483
+
484
+ po::store(po::command_line_parser(ac,av)
485
+ .options(h)
486
+ .positional(a)
487
+ .run(),vm);
488
+ po::notify(vm);
489
+ if (vm.count("help") || !vm.count("output"))
490
+ {
491
+ cout << "\nusage:\n\t cat <corpus> | " << av[0]
492
+ << " [options] <output .mtt file>" << endl;
493
+ cout << o << endl;
494
+ exit(0);
495
+ }
496
+ mttFile = baseName + (is_conll ? ".mtt" : ".mct");
497
+ tmpFile = mttFile + "_";
498
+ }
mosesdecoder/moses/TranslationModel/UG/mm/mtt-dump.cc ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ // (c) 2008-2010 Ulrich Germann
3
+ #include <boost/program_options.hpp>
4
+ #include <iomanip>
5
+
6
+ #include "tpt_typedefs.h"
7
+ #include "ug_mm_ttrack.h"
8
+ #include "tpt_tokenindex.h"
9
+ #include "ug_deptree.h"
10
+ #include "ug_corpus_token.h"
11
+
12
+ using namespace std;
13
+ using namespace sapt;
14
+ namespace po = boost::program_options;
15
+
16
+ string bname,mtt,mct;
17
+ vector<string> range;
18
+
19
+ typedef L2R_Token<Conll_Sform> Token;
20
+
21
+ TokenIndex SF,LM,PS,DT;
22
+ mmTtrack<Token> MTT;
23
+ mmTtrack<SimpleWordId> MCT;
24
+ bool sform;
25
+ bool have_mtt, have_mct;
26
+ bool with_sids;
27
+ bool with_positions;
28
+ void
29
+ interpret_args(int ac, char* av[])
30
+ {
31
+ po::variables_map vm;
32
+ po::options_description o("Options");
33
+ o.add_options()
34
+ ("help,h", "print this message")
35
+ ("numbers,n", po::bool_switch(&with_sids), "print sentence ids as first token")
36
+ ("sform,s", po::bool_switch(&sform), "sform only")
37
+ ("with-positions,p", po::bool_switch(&with_positions), "show word positions")
38
+ ;
39
+
40
+ po::options_description h("Hidden Options");
41
+ h.add_options()
42
+ ("bname", po::value<string>(&bname), "base name")
43
+ ("range", po::value<vector<string> >(&range), "range")
44
+ ;
45
+ po::positional_options_description a;
46
+ a.add("bname",1);
47
+ a.add("range",-1);
48
+
49
+ po::store(po::command_line_parser(ac,av)
50
+ .options(h.add(o))
51
+ .positional(a)
52
+ .run(),vm);
53
+ po::notify(vm); // IMPORTANT
54
+ if (vm.count("help") || bname.empty())
55
+ {
56
+ cout << "usage:\n\t"
57
+ << av[0] << " track name [<range>]\n"
58
+ << endl;
59
+ cout << o << endl;
60
+ exit(0);
61
+ }
62
+ mtt = bname+".mtt";
63
+ mct = bname+".mct";
64
+ }
65
+
66
+ void
67
+ printRangeMTT(size_t start, size_t stop)
68
+ {
69
+ for (;start < stop; start++)
70
+ {
71
+ size_t i = 0;
72
+ Token const* s = MTT.sntStart(start);
73
+ Token const* e = MTT.sntEnd(start);
74
+ if (with_sids) cout << start << " ";
75
+ for (Token const* t = s; t < e; ++t)
76
+ {
77
+ #if 0
78
+ uchar const* x = reinterpret_cast<uchar const*>(t);
79
+ cout << *reinterpret_cast<id_type const*>(x) << " ";
80
+ cout << *reinterpret_cast<id_type const*>(x+4) << " ";
81
+ cout << int(*(x+8)) << " ";
82
+ cout << int(*(x+9)) << " ";
83
+ cout << *reinterpret_cast<short const*>(x+10) << endl;
84
+ #endif
85
+ if (!sform)
86
+ {
87
+ cout << setw(2) << right << ++i << " ";
88
+ cout << setw(30) << right << SF[t->sform] << " ";
89
+ cout << setw(4) << right << PS[t->majpos] << " ";
90
+ cout << setw(4) << right << PS[t->minpos] << " ";
91
+ cout << setw(30) << left << LM[t->lemma] << " ";
92
+ cout << i+t->parent << " ";
93
+ cout << DT[t->dtype] << endl;
94
+ }
95
+ else
96
+ {
97
+ if (with_positions) cout << t-s << ":";
98
+ cout << SF[t->id()] << " ";
99
+ }
100
+ }
101
+ cout << endl;
102
+ }
103
+ }
104
+
105
+ void
106
+ printRangeMCT(size_t start, size_t stop)
107
+ {
108
+ for (;start < stop; start++)
109
+ {
110
+ SimpleWordId const* s = MCT.sntStart(start);
111
+ SimpleWordId const* t = s;
112
+ SimpleWordId const* e = MCT.sntEnd(start);
113
+ if (with_sids) cout << start << " ";
114
+ while (t < e)
115
+ {
116
+ if (with_positions) cout << t-s << ":";
117
+ cout << SF[(t++)->id()] << " ";
118
+ }
119
+ cout << endl;
120
+ }
121
+ }
122
+
123
+ int
124
+ main(int argc, char*argv[])
125
+ {
126
+ interpret_args(argc,argv);
127
+ have_mtt = !access(mtt.c_str(),F_OK);
128
+ have_mct = !have_mtt && !access(mct.c_str(),F_OK);
129
+ if (!have_mtt && !have_mct)
130
+ {
131
+ cerr << "FATAL ERROR: neither " << mtt << " nor " << mct << " exit." << endl;
132
+ exit(1);
133
+ }
134
+ if (have_mtt)
135
+ {
136
+ SF.open(bname+".tdx.sfo"); SF.iniReverseIndex();
137
+ LM.open(bname+".tdx.lem"); LM.iniReverseIndex();
138
+ PS.open(bname+".tdx.pos"); PS.iniReverseIndex();
139
+ DT.open(bname+".tdx.drl"); DT.iniReverseIndex();
140
+ MTT.open(mtt);
141
+ }
142
+ else
143
+ {
144
+ sform = true;
145
+ SF.open(bname+".tdx"); SF.iniReverseIndex();
146
+ MCT.open(mct);
147
+ }
148
+
149
+ if (!range.size())
150
+ have_mtt ? printRangeMTT(0, MTT.size()) : printRangeMCT(0, MCT.size());
151
+ else
152
+ {
153
+ for (size_t i = 0; i < range.size(); i++)
154
+ {
155
+ istringstream buf(range[i]);
156
+ size_t first,last; uchar c;
157
+ buf>>first;
158
+ if (buf.peek() == '-') buf>>c>>last;
159
+ else last = first;
160
+ if (have_mtt && last < MTT.size())
161
+ printRangeMTT(first,last+1);
162
+ else if (last < MCT.size())
163
+ printRangeMCT(first,last+1);
164
+ }
165
+ }
166
+ }
mosesdecoder/moses/TranslationModel/UG/mm/mtt.count.cc ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // build a phrase table for the given input
2
+ #include "ug_mm_ttrack.h"
3
+ #include "ug_mm_tsa.h"
4
+ #include "tpt_tokenindex.h"
5
+ #include "ug_corpus_token.h"
6
+ #include <string>
7
+ #include <vector>
8
+ #include <cassert>
9
+ #include <boost/unordered_map.hpp>
10
+ #include <boost/foreach.hpp>
11
+ #include <iomanip>
12
+ #include "ug_typedefs.h"
13
+ #include "tpt_pickler.h"
14
+ #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
15
+ #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
16
+ #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
17
+ #include <algorithm>
18
+ #include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
19
+
20
+ using namespace std;
21
+ using namespace ugdiss;
22
+ using namespace Moses;
23
+ typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
24
+ typedef sapt::mmTSA<Token>::tree_iterator iter;
25
+ typedef boost::unordered_map<pair<size_t,size_t>,size_t> phrase_counter_t;
26
+
27
+ #define CACHING_THRESHOLD 1000
28
+
29
+ sapt::mmTtrack<Token> T; // token tracks
30
+ sapt::TokenIndex V; // vocabs
31
+ sapt::mmTSA<Token> I; // suffix arrays
32
+
33
+ void interpret_args(int ac, char* av[]);
34
+ string bname;
35
+ bool echo;
36
+ int main(int argc, char* argv[])
37
+ {
38
+ interpret_args(argc,argv);
39
+
40
+ T.open(bname+".mct");
41
+ V.open(bname+".tdx"); V.iniReverseIndex();
42
+ I.open(bname+".sfa",&T);
43
+ string line;
44
+ while (getline(cin,line))
45
+ {
46
+ vector<id_type> phr;
47
+ V.fillIdSeq(line,phr);
48
+ TSA<Token>::tree_iterator m(&I);
49
+ size_t i = 0;
50
+ while (i < phr.size() && m.extend(phr[i])) ++i;
51
+ if (echo) cout << line << ": ";
52
+ if (i < phr.size()) cout << 0 << endl;
53
+ else cout << m.rawCnt() << endl;
54
+ }
55
+ exit(0);
56
+ }
57
+
58
+ void
59
+ interpret_args(int ac, char* av[])
60
+ {
61
+ namespace po=boost::program_options;
62
+ po::variables_map vm;
63
+ po::options_description o("Options");
64
+ po::options_description h("Hidden Options");
65
+ po::positional_options_description a;
66
+
67
+ o.add_options()
68
+ ("help,h", "print this message")
69
+ ("echo,e", po::bool_switch(&echo), "repeat lookup phrases")
70
+ ;
71
+
72
+ h.add_options()
73
+ ("bname", po::value<string>(&bname), "base name")
74
+ ;
75
+ a.add("bname",1);
76
+ get_options(ac,av,h.add(o),a,vm);
77
+ }
mosesdecoder/moses/TranslationModel/UG/mm/num_read_write.cc ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "num_read_write.h"
2
namespace tpt {
  typedef unsigned char uchar;

  // Binary (de)serialisation of fixed-width unsigned integers.
  // Values are always stored least significant byte first.

  // Write /x/ to /out/ as 2 bytes.
  void
  numwrite(std::ostream& out, uint16_t const& x)
  {
    char bytes[sizeof(uint16_t)];
    for (unsigned i = 0; i < sizeof(bytes); ++i)
      bytes[i] = (x >> (8*i)) % 256;
    out.write(bytes, sizeof(bytes));
  }

  // Write /x/ to /out/ as 4 bytes.
  void
  numwrite(std::ostream& out, uint32_t const& x)
  {
    char bytes[sizeof(uint32_t)];
    for (unsigned i = 0; i < sizeof(bytes); ++i)
      bytes[i] = (x >> (8*i)) % 256;
    out.write(bytes, sizeof(bytes));
  }

  // Write /x/ to /out/ as 8 bytes.
  void
  numwrite(std::ostream& out, uint64_t const& x)
  {
    char bytes[sizeof(uint64_t)];
    for (unsigned i = 0; i < sizeof(bytes); ++i)
      bytes[i] = (x >> (8*i)) % 256;
    out.write(bytes, sizeof(bytes));
  }

  // Read a 2-byte value from /src/ into /x/.
  // Returns the position right behind the bytes consumed.
  char const*
  numread(char const* src, uint16_t & x)
  {
    uchar const* d = reinterpret_cast<uchar const*>(src);
    uint16_t value = 0;
    for (unsigned i = 0; i < sizeof(uint16_t); ++i)
      value |= uint16_t(d[i]) << (8*i);
    x = value;
    return src + sizeof(uint16_t);
  }

  // Read a 4-byte value from /src/ into /x/.
  // Returns the position right behind the bytes consumed.
  char const*
  numread(char const* src, uint32_t & x)
  {
    uchar const* d = reinterpret_cast<uchar const*>(src);
    uint32_t value = 0;
    for (unsigned i = 0; i < sizeof(uint32_t); ++i)
      value |= uint32_t(d[i]) << (8*i);
    x = value;
    return src + sizeof(uint32_t);
  }

  // Read an 8-byte value from /src/ into /x/.
  // Returns the position right behind the bytes consumed.
  char const*
  numread(char const* src, uint64_t & x)
  {
    uchar const* d = reinterpret_cast<uchar const*>(src);
    uint64_t value = 0;
    for (unsigned i = 0; i < sizeof(uint64_t); ++i)
      value |= uint64_t(d[i]) << (8*i);
    x = value;
    return src + sizeof(uint64_t);
  }

}
mosesdecoder/moses/TranslationModel/UG/mm/test-http-client.cc ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- c++ -*-
2
+ #include "ug_http_client.h"
3
+
4
+ int main(int argc, char* argv[])
5
+ {
6
+ try
7
+ {
8
+ if (argc != 2)
9
+ {
10
+ std::cout << "Usage: async_client <url>\n";
11
+ std::cout << "Example:\n";
12
+ std::cout << " async_client www.boost.org/LICENSE_1_0.txt\n";
13
+ return 1;
14
+ }
15
+
16
+ boost::asio::io_service io_service;
17
+ Moses::http_client c(io_service, argv[1]);
18
+ io_service.run();
19
+ std::cout << c.content() << std::endl;
20
+ }
21
+ catch (std::exception& e)
22
+ {
23
+ std::cout << "Exception: " << e.what() << "\n";
24
+ }
25
+
26
+ return 0;
27
+ }
mosesdecoder/moses/TranslationModel/UG/mm/test-xml-escaping.cc ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <iostream>
2
+ #include <string>
3
+ #include <iomanip>
4
+ #include "ug_http_client.h"
5
+
6
+ using namespace std;
7
+ int main()
8
+ {
9
+ string line;
10
+ while (getline(cin,line))
11
+ cout << Moses::uri_encode(line) << endl;
12
+ }
13
+
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.cc ADDED
@@ -0,0 +1,594 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ // (c) 2007,2008 Ulrich Germann
3
+
4
+ /* Functions for writing indices tightly (use only the bytes you need).
5
+ * The first bit indicates whether a byte belongs to a key or a value.
6
+ * The remaining 7 bits are part of the respective integer value.
7
+ * (c) 2007 Ulrich Germann
8
+ */
9
+ //
10
+ // ugTightIndex.cc
11
+ //
12
+ // Made by Ulrich Germann
13
+ // Login <germann@germann-laptop>
14
+ //
15
+ // Started on Tue Jul 17 15:09:33 2007 Ulrich Germann
16
+ // Started on Tue Jul 17 15:09:33 2007 Ulrich Germann
17
+ //
18
+
19
+ #include <iostream>
20
+ #include <cassert>
21
+ #include "tpt_tightindex.h"
22
+
23
+ namespace tpt
24
+ {
25
+
26
+ // #define LOG_WRITE_ACTIVITY
27
+
28
+ // write a key or value into a tight index
29
+ // flag indicates whether it's a key or a value
30
// Write /data/ to /out/ in chunks of 7 bits, lowest chunk first, using
// only as many bytes as needed (at least one). The highest bit of every
// byte written is set if /flag/ is true and cleared otherwise.
void tightwrite(std::ostream& out, uint64_t data, bool flag)
{
  // assert(sizeof(size_t)==4);
  unsigned char const marker = flag ? 0x80 : 0x00;
#ifdef LOG_WRITE_ACTIVITY
  size_t bytes_written=1;
  std::cerr << "starting at file position " << out.tellp()
            << ": tightwrite " << data;
  std::cerr << (flag ? " with flag 1 " : " with flag 0 ");
#endif
  // all chunks but the last one
  for (; data >= 128; data >>= 7)
    {
      out.put(char((data & 127) | marker));
#ifdef LOG_WRITE_ACTIVITY
      bytes_written++;
#endif
    }
  // last chunk; data < 128 at this point
  out.put(char((data & 127) | marker));
#ifdef LOG_WRITE_ACTIVITY
  std::cerr << " in " << bytes_written << " bytes" << std::endl;
#endif
}
76
+
77
+ // For the code below: does it make a difference if I hard-code the
78
+ // unraveled loop or does code optimization by the compiler take care
79
+ // of that?
80
+
81
+ #define DEBUG_TIGHTREAD 0
82
+
83
+ // read a key value from a tight index; filepos_type must be at least as
84
+ // large as count_type
85
+ filepos_type
86
+ tightread(std::istream& in, std::ios::pos_type stop)
87
+ {
88
+ // debug=true;
89
+ // assert(sizeof(size_t) == 4);
90
+ assert(in.rdbuf()->in_avail() > 0);
91
+ filepos_type data = 0;
92
+ short int bitshift = 7;
93
+ int pos = in.tellg();
94
+ #if DEBUG_TIGHTREAD
95
+ if (debug)
96
+ cerr << bitpattern(uint(in.peek())) << " " << in.peek()
97
+ << " pos=" << in.tellg() << "\n";
98
+ #endif
99
+ int buf = in.get();
100
+ if (stop == std::ios::pos_type(0))
101
+ stop = size_t(in.tellg())+in.rdbuf()->in_avail();
102
+ else
103
+ stop = std::min(size_t(stop),size_t(in.tellg())+in.rdbuf()->in_avail());
104
+ if (buf < 0)
105
+ std::cerr << "number read: " << buf << " " << pos << " "
106
+ << in.tellg() << std::endl;
107
+ assert (buf>=0);
108
+
109
+ if (buf >= 128) // continuation bit is 1
110
+ {
111
+ data = buf-128; // unset the bit
112
+ while (in.tellg() < stop && in.peek() >= 128)
113
+ {
114
+ #if DEBUG_TIGHTREAD
115
+ if (debug)
116
+ cerr << bitpattern(uint(in.peek())) << " " << in.peek();
117
+ #endif
118
+ // cerr << bitpattern(size_t(in.peek())) << std::endl;
119
+ data += size_t(in.get()-128)<<bitshift;
120
+ bitshift += 7;
121
+ #if DEBUG_TIGHTREAD
122
+ if (debug)
123
+ cerr << " " << data << " pos=" << in.tellg() << std::endl;
124
+ #endif
125
+ }
126
+ }
127
+ else
128
+ {
129
+ data = buf;
130
+ while (in.tellg() < stop && in.peek() < 128)
131
+ {
132
+ // cerr << bitpattern(size_t(in.peek())) << std::endl;
133
+ #if DEBUG_TIGHTREAD
134
+ if (debug)
135
+ cerr << bitpattern(uint(in.peek())) << " " << in.peek();
136
+
137
+ #endif
138
+ data += size_t(in.get())<<bitshift;
139
+ bitshift += 7;
140
+ #if DEBUG_TIGHTREAD
141
+ if (debug)
142
+ cerr << " " << data << " pos=" << in.tellg() << "\n";
143
+ #endif
144
+ }
145
+ }
146
+ return data;
147
+ }
148
+
149
+ #define DEBUG_TIGHTFIND 0
150
+ #if DEBUG_TIGHTFIND
151
+ bool debug=true;
152
+ #endif
153
+ bool
154
+ tightfind_midpoint(std::istream& in, filepos_type start, filepos_type stop)
155
+ {
156
+ in.seekg((start+stop)/2);
157
+ // Jump approximately to the middle. Since we might land in the
158
+ // middle of a number, we need to find the start of the next
159
+ // [index key/file offset] pair first. Bytes belonging to an index
160
+ // key have the leftmost bit set to 0, bytes belonging to a file
161
+ // offset have it set to 1
162
+
163
+ // if we landed in the middle of an index key, skip to the end of it
164
+ while (static_cast<filepos_type>(in.tellg()) < stop && in.get() < 128)
165
+ {
166
+ #if DEBUG_TIGHTFIND
167
+ if (debug)
168
+ {
169
+ in.unget();
170
+ char c = in.get();
171
+ std::cerr << in.tellg() << " skipped key byte " << c << std::endl;
172
+ }
173
+ #endif
174
+ if (in.eof()) return false;
175
+ }
176
+ // Also skip the associated file offset:
177
+ while (static_cast<filepos_type>(in.tellg()) < stop && in.peek() >= 128)
178
+ {
179
+ #if DEBUG_TIGHTFIND
180
+ int r = in.get();
181
+ if (debug)
182
+ std::cerr << in.tellg() << " skipped value byte " << r
183
+ << " next is " << in.peek()
184
+ << std::endl;
185
+ #else
186
+ in.get();
187
+ #endif
188
+ }
189
+ return true;
190
+ }
191
+
192
// In-memory variant: starting from the middle of [start,stop), move
// backwards over bytes with the highest bit set, then backwards over
// bytes with the highest bit clear, never going below /start/. Returns
// the first byte of that run of bytes with the highest bit clear.
// The highest bit is tested explicitly instead of comparing the char
// against 0, so the result does not depend on whether plain char is
// signed on the target platform.
char const*
tightfind_midpoint(char const* const start,
                   char const* const stop)
{
  unsigned char const HIGH_BIT = 0x80;
  char const* mp = start + (stop - start)/2;
  while ((static_cast<unsigned char>(*mp) & HIGH_BIT) && mp > start)
    mp--;
  while (!(static_cast<unsigned char>(*mp) & HIGH_BIT) && mp > start)
    mp--;
  // if we stopped on a byte with the highest bit set, the run we are
  // looking for starts right behind it
  return (static_cast<unsigned char>(*mp) & HIGH_BIT) ? ++mp : mp;
}
201
+
202
+ bool
203
+ linear_search(std::istream& in, filepos_type start, filepos_type stop,
204
+ id_type key, unsigned char& flags)
205
+ { // performs a linear search in the range
206
+ in.seekg(start);
207
+
208
+ #if DEBUG_TIGHTFIND
209
+ if (debug) std::cerr << in.tellg() << " ";
210
+ #endif
211
+
212
+ // ATTENTION! The bitshift operations below are important:
213
+ // We use some of the bits in the key value to store additional
214
+ // information about what and where node iformation is stored.
215
+
216
+ id_type foo;
217
+ for(foo = tightread(in,stop);
218
+ (foo>>FLAGBITS) < key;
219
+ foo = tightread(in,stop))
220
+ {
221
+ // skip the value associated with key /foo/
222
+ while (static_cast<filepos_type>(in.tellg()) < stop
223
+ && in.peek() >= 128) in.get();
224
+
225
+ #if DEBUG_TIGHTFIND
226
+ if (debug)
227
+ std::cerr << (foo>>FLAGBITS) << " [" << key << "] "
228
+ << in.tellg() << std::endl;
229
+ #endif
230
+
231
+ if (in.tellg() == std::ios::pos_type(stop))
232
+ return false; // not found
233
+ }
234
+
235
+ #if DEBUG_TIGHTFIND
236
+ if (debug && (foo>>FLAGBITS)==key)
237
+ std::cerr << "found entry for " << key << std::endl;
238
+ std::cerr << "current file position is " << in.tellg()
239
+ << " (value read: " << key << std::endl;
240
+ #endif
241
+
242
+ assert(static_cast<filepos_type>(in.tellg()) < stop);
243
+ if ((foo>>FLAGBITS)==key)
244
+ {
245
+ flags = (foo%256);
246
+ flags &= FLAGMASK;
247
+ return true;
248
+ }
249
+ else
250
+ return false;
251
+ }
252
+
253
+ bool
254
+ tightfind(std::istream& in, filepos_type start, filepos_type stop,
255
+ id_type key, unsigned char& flags)
256
+ {
257
+ // returns true if the value is found
258
+ #if DEBUG_TIGHTFIND
259
+ if (debug)
260
+ std::cerr << "looking for " << key
261
+ << " in range [" << start << ":" << stop << "]" << std::endl;
262
+ #endif
263
+ if (start==stop) return false;
264
+ assert(stop>start);
265
+ if ((start+1)==stop) return false; // list is empty
266
+
267
+ unsigned int const granularity = sizeof(filepos_type)*5;
268
+ // granularity: point where we should switch to linear search,
269
+ // because otherwise we might skip over the entry we are looking for
270
+ // because we land right in the middle of it.
271
+
272
+ if (stop > start + granularity)
273
+ if (!tightfind_midpoint(in,start,stop))
274
+ return false; // something went wrong (empty index)
275
+
276
+ if (stop <= start + granularity || in.tellg() == std::ios::pos_type(stop))
277
+ { // If the search range is very short, tightfind_midpoint might skip the
278
+ // entry we are loking for. In this case, we can afford a linear
279
+ // search
280
+ return linear_search(in,start,stop,key,flags);
281
+ }
282
+
283
+ // perform binary search
284
+ filepos_type curpos = in.tellg();
285
+ id_type foo = tightread(in,stop);
286
+ id_type tmpid = foo>>FLAGBITS;
287
+ if (tmpid == key)
288
+ {
289
+ flags = foo%256;
290
+ flags &= FLAGMASK;
291
+ #if DEBUG_TIGHTFIND
292
+ if (debug) std::cerr << "found entry for " << key << std::endl;
293
+ #endif
294
+ return true; // done, found
295
+ }
296
+ else if (tmpid > key)
297
+ { // look in the lower half
298
+ #if DEBUG_TIGHTFIND
299
+ if (debug) std::cerr << foo << " > " << key << std::endl;
300
+ #endif
301
+ return tightfind(in,start,curpos,key,flags);
302
+ }
303
+ else
304
+ { // look in the upper half
305
+ while (static_cast<filepos_type>(in.tellg()) < stop
306
+ && in.rdbuf()->in_avail() > 0 // is that still necessary???
307
+ && in.peek() >= 128)
308
+ in.get(); // skip associated value
309
+ if (in.rdbuf()->in_avail() == 0 || in.tellg() == std::ios::pos_type(stop))
310
+ return false;
311
+ #if DEBUG_TIGHTFIND
312
+ if (debug) std::cerr << foo << " < " << key << std::endl;
313
+ #endif
314
+ return tightfind(in,in.tellg(),stop,key,flags);
315
+ }
316
+ }
317
+
318
+
319
+ char const*
320
+ tightfind(char const* const start,
321
+ char const* const stop,
322
+ id_type key,
323
+ unsigned char& flags)
324
+ {
325
+ // returns true if the value is found
326
+
327
+ if (start==stop) return NULL;
328
+ assert(stop>start);
329
+ if ((start+1)==stop) return NULL; // list is empty
330
+ char const* p = tightfind_midpoint(start,stop);
331
+ // if ids can be larger than 67,108,864 on 32-bit machines
332
+ // (i.e., 2**(28-flagbits)), dest must be declared as uint64_t
333
+ size_t foo;
334
+ char const* after = tightread(p,stop,foo);
335
+ id_type tmpId = foo>>FLAGBITS;
336
+ if (tmpId == key)
337
+ {
338
+ flags = foo%256;
339
+ flags &= FLAGMASK;
340
+ return after;
341
+ }
342
+ else if (tmpId > key)
343
+ { // look in the lower half
344
+ return tightfind(start,p,key,flags);
345
+ }
346
+ else
347
+ { // look in the upper half
348
+ while (*after<0 && ++after < stop);
349
+ if (after == stop) return NULL;
350
+ return tightfind(after,stop,key,flags);
351
+ }
352
+ }
353
+
354
+ char const*
355
+ tightfind_noflags(char const* const start,
356
+ char const* const stop,
357
+ id_type key)
358
+ {
359
+ // returns true if the value is found
360
+
361
+ if (start==stop) return NULL;
362
+ assert(stop>start);
363
+ if ((start+1)==stop) return NULL; // list is empty
364
+ char const* p = tightfind_midpoint(start,stop);
365
+ // if ids can be larger than 67,108,864 on 32-bit machines
366
+ // (i.e., 2**(28-flagbits)), dest must be declared as uint64_t
367
+ size_t foo;
368
+ char const* after = tightread(p,stop,foo);
369
+ if (foo == key)
370
+ return after;
371
+ else if (foo > key)
372
+ { // look in the lower half
373
+ return tightfind_noflags(start,p,key);
374
+ }
375
+ else
376
+ { // look in the upper half
377
+ while (*after<0 && ++after < stop);
378
+ if (after == stop) return NULL;
379
+ return tightfind_noflags(after,stop,key);
380
+ }
381
+ }
382
+
383
+ bool
384
+ linear_search_noflags(std::istream& in, filepos_type start,
385
+ filepos_type stop, id_type key)
386
+ { // performs a linear search in the range
387
+ std::ios::pos_type mystop = stop;
388
+
389
+ in.seekg(start);
390
+ id_type foo;
391
+ for(foo = tightread(in,stop); foo < key; foo = tightread(in,stop))
392
+ {
393
+ // skip the value associated with key /foo/
394
+ while (in.tellg() < mystop && in.peek() >= 128)
395
+ in.get();
396
+ if (in.tellg() == mystop)
397
+ return false; // not found
398
+ }
399
+ assert(in.tellg() < mystop);
400
+ return (foo==key);
401
+ }
402
+
403
+
404
+ bool
405
+ tightfind_noflags(std::istream& in, filepos_type start,
406
+ filepos_type stop, id_type key)
407
+ {
408
+ // returns true if the value is found
409
+ if (start==stop) return false;
410
+ assert(stop>start);
411
+ if ((start+1)==stop) return false; // list is empty
412
+
413
+ // granularity: point where we should switch to linear search,
414
+ // because otherwise we might skip over the entry we are looking for
415
+ // because we land right in the middle of it.
416
+ unsigned int const granularity = sizeof(filepos_type)*5;
417
+ // UG: why 5? we should be able to get away with less!
418
+
419
+ if (stop > start + granularity)
420
+ if (!tightfind_midpoint(in,start,stop))
421
+ return false; // something went wrong (empty index)
422
+
423
+ // If the search range is very short, tightfind_midpoint might skip the
424
+ // entry we are loking for. In this case, we can afford a linear
425
+ // search
426
+ if (stop <= start + granularity || in.tellg() == std::ios::pos_type(stop))
427
+ return linear_search_noflags(in,start,stop,key);
428
+
429
+ // Otherwise, perform binary search
430
+ filepos_type curpos = in.tellg();
431
+ id_type foo = tightread(in,stop);
432
+ if (foo == key)
433
+ return true; // done, found
434
+
435
+ else if (foo > key) // search first half
436
+ return tightfind_noflags(in,start,curpos,key);
437
+
438
+ else // search second half
439
+ {
440
+ std::ios::pos_type mystop = stop;
441
+ while (in.tellg() < mystop
442
+ && in.rdbuf()->in_avail() > 0 // is that still necessary???
443
+ && in.peek() >= 128)
444
+ in.get(); // skip associated value
445
+ if (in.rdbuf()->in_avail() == 0 || in.tellg() == mystop)
446
+ return false;
447
+ return tightfind_noflags(in,in.tellg(),stop,key);
448
+ }
449
+ }
450
+
451
// Same as tightwrite, but uses basic storage units of size 2:
// /data/ is written in chunks of 15 bits, least significant chunk
// first, one 16-bit unit per chunk.  If /flag/ is true, the top bit of
// every unit is set.  Units are written in the byte order of the host.
//
// The unit is held in an unsigned 16-bit variable: flagged units have
// values of 32768 and above, which do not fit into a (signed) short.
void tightwrite2(std::ostream& out, size_t data, bool flag)
{
  uint16_t const marker = flag ? 0x8000 : 0;
  for (;;)
    {
      uint16_t const unit = static_cast<uint16_t>((data % 32768) | marker);
      out.write(reinterpret_cast<char const*>(&unit), 2);
      if (data < 32768) break; // = 2^15; that was the last chunk
      data >>= 15;
    }
}
477
+
478
/** Read a number of up to 64 bits from a tight index in memory.
 *
 *  The high bit of the first byte determines the kind of number being
 *  read; the following bytes belong to the same number as long as
 *  their high bit matches.  Each byte contributes its low 7 bits,
 *  least significant group first; at most ten bytes are consumed.
 *
 *  The high bit is tested explicitly, so the result does not depend on
 *  whether plain `char` is signed.
 *
 *  @param start first byte of the number (must be dereferenceable)
 *  @param stop  non-inclusive end of the readable range
 *  @param dest  receives the decoded value
 *  @return first memory position after the number
 */
char const*
tightread8(char const* start,
           char const* stop,
           uint64_t& dest)
{
  unsigned char const first = static_cast<unsigned char>(*start);
  unsigned char const kind  = first & 0x80;
  dest = first & 0x7f;
  for (unsigned int shift = 7; shift <= 63; shift += 7)
    {
      if (++start == stop) return start;
      unsigned char const next = static_cast<unsigned char>(*start);
      if ((next & 0x80) != kind) return start; // next number begins here
      dest += uint64_t(next & 0x7f) << shift;
    }
  // all ten bytes were consumed; /start/ still points at the last one
  assert(start<stop);
  return ++start;
}
532
+
533
/** Read a number of up to 32 bits from a tight index in memory.
 *
 *  The high bit of the first byte determines the kind of number being
 *  read; the following bytes belong to the same number as long as
 *  their high bit matches.  Each byte contributes its low 7 bits,
 *  least significant group first; at most five bytes are consumed.
 *
 *  The high bit is tested explicitly, so the result does not depend on
 *  whether plain `char` is signed.
 *
 *  @param start first byte of the number (must be dereferenceable)
 *  @param stop  non-inclusive end of the readable range
 *  @param dest  receives the decoded value
 *  @return first memory position after the number
 */
char const*
tightread4(char const* start,
           char const* stop,
           uint32_t& dest)
{
  unsigned char const first = static_cast<unsigned char>(*start);
  unsigned char const kind  = first & 0x80;
  dest = first & 0x7f;
  for (unsigned int shift = 7; shift <= 28; shift += 7)
    {
      if (++start == stop) return start;
      unsigned char const next = static_cast<unsigned char>(*start);
      if ((next & 0x80) != kind) return start; // next number begins here
      dest += uint32_t(next & 0x7f) << shift;
    }
  // all five bytes were consumed; /start/ still points at the last one
  assert(start<stop);
  return ++start;
}
567
+
568
/** Read a number of up to 16 bits from a tight index in memory.
 *
 *  The high bit of the first byte determines the kind of number being
 *  read; the following bytes belong to the same number as long as
 *  their high bit matches.  Each byte contributes its low 7 bits,
 *  least significant group first; at most three bytes are consumed.
 *  Bits that do not fit into 16 bits are discarded.
 *
 *  The high bit is tested explicitly, so the result does not depend on
 *  whether plain `char` is signed.
 *
 *  @param start first byte of the number (must be dereferenceable)
 *  @param stop  non-inclusive end of the readable range
 *  @param dest  receives the decoded value
 *  @return first memory position after the number
 */
char const*
tightread2(char const* start,
           char const* stop,
           uint16_t& dest)
{
  unsigned char const first = static_cast<unsigned char>(*start);
  unsigned char const kind  = first & 0x80;
  dest = first & 0x7f;
  for (unsigned int shift = 7; shift <= 14; shift += 7)
    {
      if (++start == stop) return start;
      unsigned char const next = static_cast<unsigned char>(*start);
      if ((next & 0x80) != kind) return start; // next number begins here
      dest = static_cast<uint16_t>(dest + (uint32_t(next & 0x7f) << shift));
    }
  // all three bytes were consumed; /start/ still points at the last one
  assert(start<stop);
  return ++start;
}
594
+ } // end namespace ugdiss
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.h ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ // (c) 2007,2008 Ulrich Germann
3
+ /* Functions for writing indices tightly (use only the bytes you need).
4
+ * The first bit indicates whether a byte belongs to a key or a value.
5
+ * The remaining 7 bits are part of the respective integer value.
6
+ */
7
+ #ifndef __ugTightIndex
8
+ #define __ugTightIndex
9
+ #include <map>
10
+ #include <iostream>
11
+ #include <sstream>
12
+ #include "tpt_typedefs.h"
13
+ #include <cassert>
14
+
15
+ #ifndef uchar
16
+ #endif
17
+
18
+ #define FLAGBITS 2
19
+ #define FLAGMASK (uchar(3))
20
+ #define HAS_VALUE_MASK (uchar(2))
21
+ #define HAS_CHILD_MASK (uchar(1))
22
+
23
+
24
+ extern bool debug;
25
+
26
+ namespace tpt
27
+ {
28
+ // void tightwritex(iostream& out, size_t data, bool flag);
29
+ void
30
+ tightwrite(std::ostream& out, uint64_t data, bool flag);
31
+
32
+ filepos_type
33
+ tightread(std::istream& in, std::ios::pos_type stop);
34
+
35
+ bool
36
+ tightfind(std::istream& in,
37
+ filepos_type start,
38
+ filepos_type stop,
39
+ id_type key,
40
+ unsigned char& flags);
41
+
42
+ bool
43
+ tightfind_noflags(std::istream& in,
44
+ filepos_type start,
45
+ filepos_type stop,
46
+ id_type key);
47
+
48
+ char const*
49
+ tightfind(char const* const start,
50
+ char const* const stop,
51
+ id_type key,
52
+ unsigned char& flags);
53
+
54
+ char const*
55
+ tightfind_noflags(char const* const start,
56
+ char const* const stop,
57
+ id_type key);
58
+
59
+
60
+
61
+ /** move the read position of istream /in/ to the first entry after the midpoint of
62
+ * file position range [start,stop) in a 'tight' index
63
+ * @param in the data input stream
64
+ * @param start start of the search range
65
+ * @param stop end of the search range
66
+ * @return true if no errors occurred
67
+ */
68
+ bool
69
+ tightfind_midpoint(std::istream& in, filepos_type start, filepos_type stop);
70
+
71
+ // the bitpattern functions below are for debugging
72
+ // They return a string showing the bits of the argument value
73
+ // std::string bitpattern(unsigned int s);
74
+ // std::string bitpattern(unsigned char c);
75
+ // std::string bitpattern(char c);
76
+
77
+
78
+ /** read a number from a tight index directly from a memory location
79
+ * @param start start of read range
80
+ * @param stop non-inclusive end of read range
81
+ * @param dest destination
82
+ * @return first memory position after the number
83
+ */
84
+
85
+ char const*
86
+ tightread2(char const* start, char const* stop, uint16_t& dest);
87
+
88
+ char const*
89
+ tightread4(char const* start, char const* stop, uint32_t& dest);
90
+
91
+ char const*
92
+ tightread8(char const* start, char const* stop, uint64_t& dest);
93
+
94
+ template<typename numType>
95
+ char const*
96
+ tightread(char const* start, char const* stop, numType& dest)
97
+ {
98
+ if (sizeof(numType)==2)
99
+ return tightread2(start,stop,reinterpret_cast<uint16_t&>(dest));
100
+ if (sizeof(numType)==4)
101
+ return tightread4(start,stop,reinterpret_cast<uint32_t&>(dest));
102
+ else if (sizeof(numType)==8)
103
+ return tightread8(start,stop,reinterpret_cast<uint64_t&>(dest));
104
+ assert(0);
105
+ return NULL;
106
+ }
107
+
108
+ // char const*
109
+ // tightread(char const* start, char const* stop, uint64_t& dest);
110
+
111
+ // char const*
112
+ // tightread(char const* start, char const* stop, filepos_type& dest);
113
+
114
+ #if 0
115
+ template<typename dtype>
116
+ char const*
117
+ tightread(char const* start,
118
+ char const* stop,
119
+ dtype& dest)
120
+ {
121
+ static char bitmask=127;
122
+ dest = 0;
123
+ if (*start < 0)
124
+ {
125
+ dest = (*start)&bitmask;
126
+ if (++start==stop || *start >= 0) return start;
127
+ dest += dtype((*start)&bitmask)<<7;
128
+ if (++start==stop || *start >= 0) return start;
129
+ dest += dtype((*start)&bitmask)<<14;
130
+ if (++start==stop || *start >= 0) return start;
131
+ dest += dtype((*start)&bitmask)<<21;
132
+ if (++start==stop || *start >= 0) return start;
133
+ dest += dtype((*start)&bitmask)<<28;
134
+ if (++start==stop || *start >= 0) return start;
135
+ assert(sizeof(dtype) > 4);
136
+ dest += dtype((*start)&bitmask)<<35;
137
+ if (++start==stop || *start >= 0) return start;
138
+ dest += dtype((*start)&bitmask)<<42;
139
+ if (++start==stop || *start >= 0) return start;
140
+ dest += dtype((*start)&bitmask)<<49;
141
+ if (++start==stop || *start >= 0) return start;
142
+ dest += dtype((*start)&bitmask)<<56;
143
+ if (++start==stop || *start >= 0) return start;
144
+ dest += dtype((*start)&bitmask)<<63;
145
+ }
146
+ else
147
+ {
148
+ dest = *start;
149
+ if (++start==stop || *start < 0) return start;
150
+ dest += dtype(*start)<<7;
151
+ if (++start==stop || *start < 0) return start;
152
+ dest += dtype(*start)<<14;
153
+ if (++start==stop || *start < 0) return start;
154
+ dest += dtype(*start)<<21;
155
+ if (++start==stop || *start < 0) return start;
156
+ dest += dtype(*start)<<28;
157
+ if (++start==stop || *start < 0) return start;
158
+ assert(sizeof(dtype) > 4);
159
+ dest += dtype(*start)<<35;
160
+ if (++start==stop || *start < 0) return start;
161
+ dest += dtype(*start)<<42;
162
+ if (++start==stop || *start < 0) return start;
163
+ dest += dtype(*start)<<49;
164
+ if (++start==stop || *start < 0) return start;
165
+ dest += dtype(*start)<<56;
166
+ if (++start==stop || *start < 0) return start;
167
+ dest += dtype(*start)<<63;
168
+ }
169
+ assert(start<stop);
170
+ return ++start;
171
+ }
172
+ #endif
173
+
174
+
175
+ }
176
+ #endif
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.cc ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ // (c) 2007-2013 Ulrich Germann
3
+ #include <sstream>
4
+ #include <cstring>
5
+ #include <algorithm>
6
+ #include <iostream>
7
+ #include <stdexcept>
8
+
9
+ #include <boost/pool/pool_alloc.hpp>
10
+
11
+ #include "tpt_tokenindex.h"
12
+ #include "ug_typedefs.h"
13
+
14
+ using namespace std;
15
+ namespace sapt
16
+ {
17
+
18
+ TokenIndex::
19
+ TokenIndex(string unkToken)
20
+ : ridx(0), unkLabel(unkToken), unkId(1), numTokens(0)
21
+ , startIdx(0), endIdx(0)
22
+ {
23
+ lock.reset(new boost::mutex());
24
+ };
25
+
26
+ #if 0
27
+ TokenIndex::
28
+ TokenIndex(string fname, string unkToken,bool dyna)
29
+ : ridx(0),unkLabel(unkToken)
30
+ {
31
+ this->open(fname,unkToken,dyna);
32
+ };
33
+ #endif
34
+
35
+ void
36
+ TokenIndex::
37
+ open(string fname, string unkToken,bool dyna)
38
+ {
39
+ if (access(fname.c_str(),F_OK))
40
+ {
41
+ ostringstream msg;
42
+ msg << "TokenIndex::open: File '" << fname << "' does not exist.";
43
+ throw std::runtime_error(msg.str().c_str());
44
+ }
45
+
46
+ file.open(fname);
47
+ if (!file.is_open())
48
+ {
49
+ ostringstream msg;
50
+ msg << "TokenIndex::open: Error opening file '" << fname << "'.";
51
+ throw std::runtime_error(msg.str().c_str());
52
+ }
53
+
54
+ this->numTokens = *(reinterpret_cast<uint32_t const*>(file.data()));
55
+ unkId = *(reinterpret_cast<id_type const*>(file.data()+4));
56
+
57
+ startIdx = reinterpret_cast<Entry const*>(file.data()+4+sizeof(id_type));
58
+ endIdx = startIdx + numTokens;
59
+ comp.base = reinterpret_cast<char const*>(endIdx);
60
+ if (!unkToken.empty())
61
+ {
62
+ Entry const* bla = lower_bound(startIdx,endIdx,unkToken.c_str(),comp);
63
+ unkId = ((bla < endIdx && unkToken == comp.base+bla->offset)
64
+ ? bla->id
65
+ : numTokens);
66
+ }
67
+ this->dynamic=dyna;
68
+ if (dyna)
69
+ {
70
+ this->str2idExtra.reset(new map<string,id_type>());
71
+ this->newWords.reset(new vector<string>());
72
+ }
73
+ }
74
+
75
+ void
76
+ TokenIndex::
77
+ close()
78
+ {
79
+ file.close();
80
+ }
81
+
82
+ TokenIndex::
83
+ CompFunc::
84
+ CompFunc()
85
+ {};
86
+
87
+ bool
88
+ TokenIndex::
89
+ CompFunc::
90
+ operator()(Entry const& A, char const* w)
91
+ {
92
+ return strcmp(base+A.offset,w) < 0;
93
+ };
94
+
95
+ id_type
96
+ TokenIndex::
97
+ operator[](char const* p) const
98
+ {
99
+ if (startIdx != endIdx)
100
+ {
101
+ Entry const* bla = lower_bound(startIdx,endIdx,p,comp);
102
+ if (bla != endIdx && !strcmp(comp.base+bla->offset,p))
103
+ return bla->id;
104
+ if (!dynamic) return unkId;
105
+ }
106
+ else if (!dynamic) return strcmp(p,"NULL") && unkId;
107
+
108
+ boost::lock_guard<boost::mutex> lk(*this->lock);
109
+ // stuff below is new as of 2011-01-30, for dynamic adding of
110
+ // unknown items IMPORTANT: numTokens is not currently not
111
+ // changed, it is the number of PRE-EXISING TOKENS, not including
112
+ // dynamically added Items
113
+ // if (!str2idExtra)
114
+ // {
115
+ // this->str2idExtra.reset(new map<string,id_type>());
116
+ // this->newWords.reset(new vector<string>());
117
+ // }
118
+ map<string,id_type>::value_type newItem(p,str2idExtra->size()+numTokens);
119
+ pair<map<string,id_type>::iterator,bool> foo = str2idExtra->insert(newItem);
120
+ if (foo.second) // it actually is a new item
121
+ newWords->push_back(foo.first->first);
122
+ return foo.first->second;
123
+ }
124
+
125
+ id_type
126
+ TokenIndex::
127
+ operator[](string const& w) const
128
+ {
129
+ return (*this)[w.c_str()];
130
+ }
131
+
132
+ vector<char const*>
133
+ TokenIndex::
134
+ reverseIndex() const
135
+ {
136
+ size_t numToks = endIdx-startIdx;
137
+
138
+ // cout << "tokenindex has " << numToks << " tokens" << endl;
139
+
140
+ vector<char const*> v(numToks,NULL);
141
+ // v.reserve(endIdx-startIdx);
142
+ for (Entry const* x = startIdx; x != endIdx; x++)
143
+ {
144
+ if (x->id >= v.size())
145
+ v.resize(x->id+1);
146
+ v[x->id] = comp.base+x->offset;
147
+ }
148
+ // cout << "done reversing index " << endl;
149
+ return v;
150
+ }
151
+
152
+ char const* const
153
+ TokenIndex::
154
+ operator[](id_type id) const
155
+ {
156
+ if (!ridx.size())
157
+ {
158
+ boost::lock_guard<boost::mutex> lk(*this->lock);
159
+ // Someone else (multi-threading!) may have created the
160
+ // reverse index in the meantime, so let's check again
161
+ if (!ridx.size()) ridx = reverseIndex();
162
+ }
163
+ if (id < ridx.size())
164
+ return ridx[id];
165
+
166
+ boost::lock_guard<boost::mutex> lk(*this->lock);
167
+ if (dynamic && id < ridx.size()+newWords->size())
168
+ return (*newWords)[id-ridx.size()].c_str();
169
+ return unkLabel.c_str();
170
+ }
171
+
172
+ void
173
+ TokenIndex::
174
+ iniReverseIndex()
175
+ {
176
+ if (!ridx.size())
177
+ {
178
+ boost::lock_guard<boost::mutex> lk(*this->lock);
179
+ if (!ridx.size()) ridx = reverseIndex();
180
+ }
181
+ }
182
+
183
+
184
+ char const* const
185
+ TokenIndex::
186
+ operator[](id_type id)
187
+ {
188
+ if (!ridx.size())
189
+ {
190
+ boost::lock_guard<boost::mutex> lk(*this->lock);
191
+ if (!ridx.size()) ridx = reverseIndex();
192
+ }
193
+ if (id < ridx.size())
194
+ return ridx[id];
195
+ boost::lock_guard<boost::mutex> lk(*this->lock);
196
+ if (dynamic && id < ridx.size()+newWords->size())
197
+ return (*newWords)[id-ridx.size()].c_str();
198
+ return unkLabel.c_str();
199
+ }
200
+
201
+ string
202
+ TokenIndex::
203
+ toString(vector<id_type> const& v)
204
+ {
205
+ if (!ridx.size())
206
+ {
207
+ boost::lock_guard<boost::mutex> lk(*this->lock);
208
+ if (!ridx.size()) ridx = reverseIndex();
209
+ }
210
+ ostringstream buf;
211
+ for (size_t i = 0; i < v.size(); i++)
212
+ buf << (i ? " " : "") << (*this)[v[i]];
213
+ return buf.str();
214
+ }
215
+
216
+ string
217
+ TokenIndex::
218
+ toString(vector<id_type> const& v) const
219
+ {
220
+ if (!ridx.size())
221
+ {
222
+ boost::lock_guard<boost::mutex> lk(*this->lock);
223
+ if (!ridx.size()) ridx = reverseIndex();
224
+ }
225
+ ostringstream buf;
226
+ for (size_t i = 0; i < v.size(); i++)
227
+ buf << (i ? " " : "") << (*this)[v[i]];
228
+ return buf.str();
229
+ }
230
+
231
+ string
232
+ TokenIndex::
233
+ toString(id_type const* start, id_type const* const stop)
234
+ {
235
+ if (!ridx.size())
236
+ {
237
+ boost::lock_guard<boost::mutex> lk(*this->lock);
238
+ if (!ridx.size()) ridx = reverseIndex();
239
+ }
240
+ ostringstream buf;
241
+ if (start < stop)
242
+ buf << (*this)[*start];
243
+ while (++start < stop)
244
+ buf << " " << (*this)[*start];
245
+ return buf.str();
246
+ }
247
+
248
+ string
249
+ TokenIndex::
250
+ toString(id_type const* start, id_type const* const stop) const
251
+ {
252
+ if (!ridx.size())
253
+ {
254
+ boost::lock_guard<boost::mutex> lk(*this->lock);
255
+ if (!ridx.size()) ridx = reverseIndex();
256
+ }
257
+ ostringstream buf;
258
+ if (start < stop)
259
+ buf << (*this)[*start];
260
+ while (++start < stop)
261
+ buf << " " << (*this)[*start];
262
+ return buf.str();
263
+ }
264
+
265
+ vector<id_type>
266
+ TokenIndex::
267
+ toIdSeq(string const& line) const
268
+ {
269
+ istringstream buf(line);
270
+ string w;
271
+ vector<id_type> retval;
272
+ while (buf>>w)
273
+ retval.push_back((*this)[w]);
274
+ return retval;
275
+ }
276
+
277
+ /// Return false if line contains unknown tokens, true otherwise
278
+ bool
279
+ TokenIndex::
280
+ fillIdSeq(string const& line, vector<id_type> & v) const
281
+ {
282
+ bool allgood = true; string w;
283
+ v.clear();
284
+ for (istringstream buf(line); buf>>w;)
285
+ {
286
+ v.push_back((*this)[w]);
287
+ allgood = allgood && v.back() > 1;
288
+ }
289
+ return allgood;
290
+ }
291
+
292
+ id_type
293
+ TokenIndex::
294
+ getNumTokens() const
295
+ {
296
+ return numTokens;
297
+ }
298
+
299
+ id_type
300
+ TokenIndex::
301
+ getUnkId() const
302
+ {
303
+ return unkId;
304
+ }
305
+
306
+ char const* const
307
+ TokenIndex::
308
+ getUnkToken() const
309
+ {
310
+ return unkLabel.c_str();
311
+ // return (*this)[unkId];
312
+ }
313
+
314
+ id_type
315
+ TokenIndex::
316
+ knownVocabSize() const
317
+ {
318
+ return numTokens;
319
+ }
320
+
321
+ id_type
322
+ TokenIndex::
323
+ ksize() const
324
+ {
325
+ return numTokens;
326
+ }
327
+
328
+ id_type
329
+ TokenIndex::
330
+ totalVocabSize() const
331
+ { return tsize(); }
332
+
333
+ id_type
334
+ TokenIndex::
335
+ tsize() const
336
+ {
337
+ return (newWords != NULL
338
+ ? numTokens+newWords->size()
339
+ : numTokens);
340
+ }
341
+
342
+ void
343
+ write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok,
344
+ string const& ofile, string const& unkToken)
345
+ {
346
+ typedef pair<uint32_t,id_type> IndexEntry; // offset and id
347
+
348
+ // Write token strings to a buffer, keep track of offsets
349
+ vector<IndexEntry> index(tok.size());
350
+ ostringstream data;
351
+ id_type unkId = tok.size();
352
+ for (size_t i = 0; i < tok.size(); i++)
353
+ {
354
+ if (tok[i].first == unkToken)
355
+ unkId = tok[i].second;
356
+ index[i].first = data.tellp(); // offset of string
357
+ index[i].second = tok[i].second; // respective ID
358
+ data<<tok[i].first<<char(0); // write string to buffer
359
+ }
360
+
361
+ // Now write the actual file
362
+ ofstream out(ofile.c_str());
363
+ uint32_t vsize = index.size(); // how many vocab items?
364
+ out.write(reinterpret_cast<char*>(&vsize),4);
365
+ out.write(reinterpret_cast<char*>(&unkId),sizeof(id_type));
366
+ for (size_t i = 0; i < index.size(); i++)
367
+ {
368
+ out.write(reinterpret_cast<char*>(&index[i].first),4);
369
+ out.write(reinterpret_cast<char*>(&index[i].second),sizeof(id_type));
370
+ }
371
+ out<<data.str();
372
+ }
373
+
374
+ void
375
+ TokenIndex::
376
+ write(string fname)
377
+ {
378
+ typedef pair<string,uint32_t> Token; // token and id
379
+ vector<Token> tok(totalVocabSize());
380
+ for (id_type i = 0; i < tok.size(); ++i)
381
+ tok[i] = Token((*this)[i],i);
382
+ sort(tok.begin(),tok.end());
383
+ write_tokenindex_to_disk(tok,fname,unkLabel);
384
+ }
385
+
386
+ bool
387
+ TokenIndex::
388
+ isDynamic() const
389
+ {
390
+ return dynamic;
391
+ }
392
+
393
+ bool
394
+ TokenIndex::
395
+ setDynamic(bool on)
396
+ {
397
+ bool ret = dynamic;
398
+ if (on && this->str2idExtra == NULL)
399
+ {
400
+ this->str2idExtra.reset(new map<string,id_type>());
401
+ this->newWords.reset(new vector<string>());
402
+ }
403
+ dynamic = on;
404
+ if (on)
405
+ {
406
+ (*this)["NULL"];
407
+ (*this)[unkLabel];
408
+ }
409
+ return ret;
410
+ }
411
+
412
+ void
413
+ TokenIndex::
414
+ setUnkLabel(string unk)
415
+ {
416
+ unkId = (*this)[unk];
417
+ unkLabel = unk;
418
+ }
419
+
420
+ }
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.cc ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //-*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
2
+
3
+ #include "ug_bitext.h"
4
+ #include <algorithm>
5
+ #include <boost/math/distributions/binomial.hpp>
6
+
7
+ namespace sapt
8
+ {
9
+
10
+ float
11
+ lbop(size_t const tries, size_t const succ, float const confidence)
12
+ {
13
+ return (confidence == 0
14
+ ? float(succ)/tries
15
+ : (boost::math::binomial_distribution<>::
16
+ find_lower_bound_on_p(tries, succ, confidence)));
17
+ }
18
+
19
+ void
20
+ snt_adder<L2R_Token<SimpleWordId> >::
21
+ operator()()
22
+ {
23
+ typedef L2R_Token<SimpleWordId> tkn;
24
+ std::vector<id_type> sids; sids.reserve(snt.size());
25
+ BOOST_FOREACH(std::string const& foo, snt)
26
+ {
27
+ sids.push_back(track ? track->size() : 0);
28
+ std::istringstream buf(foo);
29
+ std::string w;
30
+ std::vector<tkn> s; s.reserve(100);
31
+ while (buf >> w) s.push_back(tkn(V[w]));
32
+ track = append(track,s);
33
+ }
34
+ if (index)
35
+ index.reset(new imTSA<tkn>(*index,track,sids,V.tsize()));
36
+ else
37
+ index.reset(new imTSA<tkn>(track,NULL,NULL));
38
+ }
39
+
40
+ snt_adder<L2R_Token<SimpleWordId> >::
41
+ snt_adder(std::vector<std::string> const& s, TokenIndex& v,
42
+ SPTR<imTtrack<L2R_Token<SimpleWordId> > >& t,
43
+ SPTR<imTSA<L2R_Token<SimpleWordId> > >& i)
44
+ : snt(s), V(v), track(t), index(i)
45
+ { }
46
+
47
// Starting from position /s2/ on side 2, follow alignment links back
// and forth between the two sides (agenda-driven) to determine the
// span [s1,e1) on side 1 and the end /e2/ on side 2 of a phrase pair.
// Returns false if a2[s2] is empty or if a constraint involving the
// range [L1,R1) or /s2/ is violated; otherwise returns true with e1
// and e2 pointing one past the last position reached.
// NOTE(review): a1[p] is read as the side-2 positions linked to side-1
// position p, and a2[p] as the reverse -- inferred from how they are
// indexed here; confirm against the callers.
// NOTE(review): both early failure paths print file and line to
// std::cout.
+ bool
48
+ expand_phrase_pair
49
+ (std::vector<std::vector<ushort> >& a1,
50
+ std::vector<std::vector<ushort> >& a2,
51
+ ushort const s2, // next word on the target side
52
+ ushort const L1, ushort const R1, // limits of previous phrase
53
+ ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg
54
+ {
55
+ if (a2[s2].size() == 0) // start position has no links at all
56
+ {
57
+ std::cout << __FILE__ << ":" << __LINE__ << std::endl;
58
+ return false;
59
+ }
60
+ bitvector done1(a1.size()); // side-1 positions already processed
61
+ bitvector done2(a2.size()); // side-2 positions already processed
62
+ std::vector<std::pair<ushort,ushort> > agenda;
63
+ // x.first: side (1 or 2)
64
+ // x.second: word position
65
+ agenda.reserve(a1.size() + a2.size());
66
+ agenda.push_back(std::pair<ushort,ushort>(2,s2));
67
+ e2 = s2;
68
+ s1 = e1 = a2[s2].front(); // s1 is not assigned again below
69
+ if (s1 >= L1 && s1 < R1) // first link falls inside [L1,R1)
70
+ {
71
+ std::cout << __FILE__ << ":" << __LINE__ << std::endl;
72
+ return false;
73
+ }
74
+ agenda.push_back(std::pair<ushort,ushort>(2,s2)); // NOTE(review): (2,s2) is pushed a second time here -- confirm intent
75
+ while (agenda.size())
76
+ {
77
+ ushort side = agenda.back().first;
78
+ ushort p = agenda.back().second;
79
+ agenda.pop_back();
80
+ if (side == 1)
81
+ {
82
+ done1.set(p);
83
+ BOOST_FOREACH(ushort i, a1[p])
84
+ {
85
+ if (i < s2) // link to a side-2 position before the start
86
+ {
87
+ // cout << __FILE__ << ":" << __LINE__ << endl;
88
+ return false;
89
+ }
90
+ if (done2[i]) continue;
91
+ for (;e2 <= i;++e2) // advance e2 past i, queueing unprocessed positions
92
+ if (!done2[e2])
93
+ agenda.push_back(std::pair<ushort,ushort>(2,e2));
94
+ }
95
+ }
96
+ else
97
+ {
98
+ done2.set(p);
99
+ BOOST_FOREACH(ushort i, a2[p])
100
+ {
101
+ if ((e1 < L1 && i >= L1) ||
102
+ (s1 >= R1 && i < R1) ||
103
+ (i >= L1 && i < R1))
104
+ {
105
+ // cout << __FILE__ << ":" << __LINE__ << " "
106
+ // << L1 << "-" << R1 << " " << i << " "
107
+ // << s1 << "-" << e1<< endl;
108
+ return false;
109
+ }
110
+
111
+ if (e1 < i)
112
+ {
113
+ for (; e1 <= i; ++e1) // advance e1 past i, queueing unprocessed positions
114
+ if (!done1[e1])
115
+ agenda.push_back(std::pair<ushort,ushort>(1,e1));
116
+ }
117
+ else if (s1 > i)
118
+ {
119
+ for (; i <= s1; ++i) // NOTE(review): advances the loop copy i; s1 itself stays unchanged -- confirm intent
120
+ if (!done1[i])
121
+ agenda.push_back(std::pair<ushort,ushort>(1,i));
122
+ }
123
+ }
124
+ }
125
+ }
126
+ ++e1; // both ends are incremented once more before returning
127
+ ++e2;
128
+ return true;
129
+ }
130
+
131
+ void
132
+ print_amatrix(std::vector<std::vector<ushort> > a1, uint32_t len2,
133
+ ushort b1, ushort e1, ushort b2, ushort e2)
134
+ {
135
+ using namespace std;
136
+ std::vector<bitvector> M(a1.size(),bitvector(len2));
137
+ for (ushort j = 0; j < a1.size(); ++j)
138
+ {
139
+ BOOST_FOREACH(ushort k, a1[j])
140
+ M[j].set(k);
141
+ }
142
+ cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << endl;
143
+ cout << " ";
144
+ for (size_t c = 0; c < len2;++c)
145
+ cout << c%10;
146
+ cout << endl;
147
+ for (size_t r = 0; r < M.size(); ++r)
148
+ {
149
+ cout << setw(3) << r << " ";
150
+ for (size_t c = 0; c < M[r].size(); ++c)
151
+ {
152
+ if ((b1 <= r) && (r < e1) && b2 <= c && c < e2)
153
+ cout << (M[r][c] ? 'x' : '-');
154
+ else cout << (M[r][c] ? 'o' : '.');
155
+ }
156
+ cout << endl;
157
+ }
158
+ cout << std::string(90,'-') << endl;
159
+ }
160
+
161
+ void
162
+ write_bitvector(bitvector const& v, std::ostream& out)
163
+ {
164
+ for (size_t i = v.find_first(); i < v.size();)
165
+ {
166
+ out << i;
167
+ if ((i = v.find_next(i)) < v.size()) out << ",";
168
+ }
169
+ }
170
+
171
+ }
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.h ADDED
@@ -0,0 +1,782 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ #pragma once
3
+ // Implementations of word-aligned bitext.
4
+ // Written by Ulrich Germann
5
+ //
6
+ // mmBitext: static, memory-mapped bitext
7
+ // imBitext: dynamic, in-memory bitext
8
+ //
9
+
10
+ // things we can do to speed up things:
11
+ // - set up threads at startup time that force the
12
+ // data in to memory sequentially
13
+ //
14
+ // - use multiple agendas for better load balancing and to avoid
15
+ // competition for locks
16
+ //
17
+
18
+
19
+ #define UG_BITEXT_TRACK_ACTIVE_THREADS 0
20
+
21
+ #include <string>
22
+ #include <vector>
23
+ #include <cassert>
24
+ #include <iomanip>
25
+ #include <algorithm>
26
+
27
+ #include <boost/foreach.hpp>
28
+ #include <boost/random.hpp>
29
+ #include <boost/format.hpp>
30
+ #include <boost/thread.hpp>
31
+ #include <boost/unordered_map.hpp>
32
+ #include <boost/math/distributions/binomial.hpp>
33
+
34
+ #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
35
+ #include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
36
+ #include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
37
+ #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
38
+ #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
39
+ // #include "moses/FF/LexicalReordering/LexicalReorderingState.h"
40
+ #include "moses/Util.h"
41
+
42
+ #ifndef NO_MOSES
43
+ // #pragma message "COMPILING WITH MOSES SUPPORT!"
44
+ #include "moses/StaticData.h"
45
+ #include "moses/thread_safe_container.h"
46
+ #include "moses/ContextScope.h"
47
+ #include "moses/TranslationTask.h"
48
+ #else
49
+ // #pragma message "COMPILING WITHOUT MOSES SUPPORT!"
50
+ #endif
51
+
52
+ #include "util/exception.hh"
53
+ // #include "util/check.hh"
54
+
55
+ #include "ug_typedefs.h"
56
+ #include "ug_mm_ttrack.h"
57
+ #include "ug_im_ttrack.h"
58
+ #include "ug_mm_tsa.h"
59
+ #include "ug_im_tsa.h"
60
+ #include "tpt_tokenindex.h"
61
+ #include "ug_corpus_token.h"
62
+ #include "tpt_pickler.h"
63
+ #include "ug_lexical_phrase_scorer2.h"
64
+ #include "ug_lru_cache.h"
65
+ #include "ug_lexical_reordering.h"
66
+ #include "ug_sampling_bias.h"
67
+ #include "ug_phrasepair.h"
68
+ #include "ug_bitext_phrase_extraction_record.h"
69
+ #include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
70
+
71
+ // Minimum source count for caching phrase lookup statistics.
72
+ // If source phrase occurs less frequently, never cache;
73
+ // always re-compute.
74
+ #define PSTATS_CACHE_THRESHOLD 50
75
+
76
+ namespace Moses { class Mmsapt; }
77
+ namespace sapt
78
+ {
79
+ using Moses::ttasksptr;
80
+ using Moses::ttaskwptr;
81
+ using tpt::binread;
82
+ using tpt::binwrite;
83
+
84
+ float lbop(size_t const tries, size_t const succ, float const confidence);
85
+ void write_bitvector(bitvector const& v, std::ostream& out);
86
+
87
+ #ifndef NO_MOSES
88
// Bundle of per-query state: a sampling bias, two pstats caches, and an
// optional log stream for bias-related output. The default constructor
// leaves the smart pointers empty and sets bias_log to NULL.
struct
ContextForQuery
{
  // needs to be made thread-safe
  // ttasksptr const m_ttask;
  // size_t max_samples;
  boost::shared_mutex lock;             // lock for this context
  SPTR<SamplingBias> bias;              // sampling bias (may be empty)
  SPTR<pstats::cache_t> cache1, cache2; // pstats caches
  // NOTE(review): presumably one cache per lookup direction -- confirm
  std::ostream* bias_log;               // NULL unless set by the owner
  ContextForQuery() : bias_log(NULL) { }
};
100
+ #endif
101
+
102
+ template<typename Token> class BitextSampler;
103
+
104
+ template<typename TKN>
105
+ class Bitext // : public Moses::reference_counter
106
+ {
107
+ public:
108
+ template<typename Token> friend class BitextSampler;
109
+ typedef TKN Token;
110
+ typedef typename TSA<Token>::tree_iterator iter;
111
+ typedef typename std::vector<PhrasePair<Token> > vec_ppair;
112
+ typedef typename lru_cache::LRU_Cache<uint64_t, vec_ppair> pplist_cache_t;
113
+ typedef TSA<Token> tsa;
114
+ friend class Moses::Mmsapt;
115
+ protected:
116
+ mutable boost::shared_mutex m_lock; // for thread-safe operation
117
+
118
+ class agenda; // for parallel sampling see ug_bitext_agenda.h
119
+ mutable SPTR<agenda> ag;
120
+ size_t m_num_workers; // number of workers available to the agenda
121
+
122
+ size_t m_default_sample_size;
123
+ size_t m_pstats_cache_threshold; // threshold for caching sampling results
124
+ SPTR<pstats::cache_t> m_cache1, m_cache2; // caches for sampling results
125
+
126
+ std::vector<std::string> m_docname;
127
+ std::map<std::string,id_type> m_docname2docid; // maps from doc names to ids
128
+ SPTR<std::vector<id_type> > m_sid2docid; // maps from sentences to docs (ids)
129
+
130
+ mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
131
+ // caches for unbiased sampling; biased sampling uses the caches that
132
+ // are stored locally on the translation task
133
+ public:
134
+ SPTR<Ttrack<char> > Tx; // word alignments
135
+ SPTR<Ttrack<Token> > T1; // token track
136
+ SPTR<Ttrack<Token> > T2; // token track
137
+ SPTR<TokenIndex> V1; // vocab
138
+ SPTR<TokenIndex> V2; // vocab
139
+ SPTR<TSA<Token> > I1; // indices
140
+ SPTR<TSA<Token> > I2; // indices
141
+
142
+ /// given the source phrase sid[start:stop]
143
+ // find the possible start (s1 .. s2) and end (e1 .. e2)
144
+ // points of the target phrase; if non-NULL, store word
145
+ // alignments in *core_alignment. If /flip/, source phrase is
146
+ // L2.
147
+ bool find_trg_phr_bounds(PhraseExtractionRecord& rec) const;
148
+ bool find_trg_phr_bounds
149
+ ( size_t const sid, // sentence to investigate
150
+ size_t const start, // start of source phrase
151
+ size_t const stop, // last position of source phrase
152
+ size_t & s1, size_t & s2, // beginning and end of target start
153
+ size_t & e1, size_t & e2, // beginning and end of target end
154
+ int& po_fwd, int& po_bwd, // phrase orientations
155
+ std::vector<unsigned char> * core_alignment, // stores the core alignment
156
+ bitvector* full_alignment, // stores full word alignment for this sent.
157
+ bool const flip) const; // flip source and target (reverse lookup)
158
+
159
+ // prep2 launches sampling and returns immediately.
160
+ // lookup (below) waits for the job to finish before it returns
161
+ SPTR<pstats>
162
+ prep2(iter const& phrase, int max_sample = -1) const;
163
+
164
+ #ifndef NO_MOSES
165
+ SPTR<pstats>
166
+ prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
167
+ int max_sample = -1) const;
168
+ #endif
169
+
170
+ protected:
171
+ Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16);
172
+
173
+ Bitext(Ttrack<Token>* const t1, Ttrack<Token>* const t2,
174
+ Ttrack<char>* const tx,
175
+ TokenIndex* const v1, TokenIndex* const v2,
176
+ TSA<Token>* const i1, TSA<Token>* const i2,
177
+ size_t const max_sample=1000,
178
+ size_t const xnum_workers=16);
179
+ public:
180
+ virtual void
181
+ open(std::string const base, std::string const L1, std::string const L2) = 0;
182
+
183
+ SPTR<pstats>
184
+ lookup(iter const& phrase, int max_sample = -1) const;
185
+
186
+ void prep(iter const& phrase) const;
187
+
188
+ #ifndef NO_MOSES
189
+ SPTR<pstats>
190
+ lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
191
+
192
+ void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const;
193
+ #endif
194
+
195
+ void setDefaultSampleSize(size_t const max_samples);
196
+ size_t getDefaultSampleSize() const;
197
+
198
+ std::string toString(uint64_t pid, int isL2) const;
199
+
200
+ virtual size_t revision() const { return 0; }
201
+
202
+ SPTR<SentenceBias>
203
+ loadSentenceBias(std::string const& fname) const;
204
+
205
+ SPTR<DocumentBias>
206
+ SetupDocumentBias(std::string const& bserver, std::string const& text,
207
+ std::ostream* log) const;
208
+
209
+ SPTR<DocumentBias>
210
+ SetupDocumentBias(std::map<std::string,float> context_weights,
211
+ std::ostream* log) const;
212
+
213
+ void
214
+ mark_match(Token const* start, Token const* end, iter const& m,
215
+ bitvector& check) const;
216
+ void
217
+ write_yawat_alignment
218
+ ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const;
219
+
220
+ std::string sid2docname(id_type const sid) const;
221
+ std::string docid2name(id_type const sid) const;
222
+ int docname2docid(std::string const& name) const;
223
+
224
+ std::vector<id_type> const* sid2did() const;
225
+ int sid2did(uint32_t sid) const;
226
+ };
227
+
228
+ #include "ug_bitext_agenda.h"
229
+
230
+ template<typename Token>
231
+ int
232
+ Bitext<Token>::
233
+ docname2docid(std::string const& name) const
234
+ {
235
+ std::map<std::string,id_type>::const_iterator m;
236
+ m = m_docname2docid.find(name);
237
+ if (m != m_docname2docid.end()) return m->second;
238
+ return -1;
239
+ }
240
+
241
+ template<typename Token>
242
+ std::string
243
+ Bitext<Token>::
244
+ docid2name(id_type const did) const
245
+ {
246
+ if (did < m_docname.size())
247
+ return m_docname[did];
248
+ else
249
+ return (boost::format("%d") % did).str();
250
+ }
251
+
252
+ template<typename Token>
253
+ std::string
254
+ Bitext<Token>::
255
+ sid2docname(id_type const sid) const
256
+ {
257
+ if (sid < m_sid2docid->size() && (*m_sid2docid)[sid] < m_docname.size())
258
+ return m_docname[(*m_sid2docid)[sid]];
259
+ else
260
+ return "";
261
+ }
262
+
263
// Return a raw pointer to the sentence-to-document map; the result is
// whatever m_sid2docid.get() yields (NULL if the pointer is empty).
template<typename Token>
std::vector<id_type> const*
Bitext<Token>::
sid2did() const
{
  return m_sid2docid.get();
}
270
+
271
+ template<typename Token>
272
+ int
273
+ Bitext<Token>::
274
+ sid2did(uint32_t sid) const
275
+ {
276
+ if (m_sid2docid)
277
+ return m_sid2docid->at(sid);
278
+ return -1;
279
+ }
280
+
281
+
282
+ template<typename Token>
283
+ SPTR<SentenceBias>
284
+ Bitext<Token>::
285
+ loadSentenceBias(std::string const& fname) const
286
+ {
287
+ SPTR<SentenceBias> ret(new SentenceBias(T1->size()));
288
+ std::ifstream in(fname.c_str());
289
+ size_t i = 0;
290
+ float v; while (in>>v) (*ret)[i++] = v;
291
+ UTIL_THROW_IF2(i != T1->size(),
292
+ "Mismatch between bias vector size and corpus size at "
293
+ << HERE);
294
+ return ret;
295
+ }
296
+
297
+ template<typename Token>
298
+ std::string
299
+ Bitext<Token>::
300
+ toString(uint64_t pid, int isL2) const
301
+ {
302
+ std::ostringstream buf;
303
+ uint32_t sid,off,len; parse_pid(pid,sid,off,len);
304
+ Token const* t = (isL2 ? T2 : T1)->sntStart(sid) + off;
305
+ Token const* x = t + len;
306
+ TokenIndex const& V = isL2 ? *V2 : *V1;
307
+ while (t < x)
308
+ {
309
+ buf << V[t->id()];
310
+ if (++t < x) buf << " ";
311
+ }
312
+ return buf.str();
313
+ }
314
+
315
// Return the current default sample size (m_default_sample_size).
// Note that the value is read without acquiring m_lock.
template<typename Token>
size_t
Bitext<Token>::
getDefaultSampleSize() const
{
  return m_default_sample_size;
}
322
+ template<typename Token>
323
+ void
324
+ Bitext<Token>::
325
+ setDefaultSampleSize(size_t const max_samples)
326
+ {
327
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
328
+ if (max_samples != m_default_sample_size)
329
+ {
330
+ m_cache1.reset(new pstats::cache_t);
331
+ m_cache2.reset(new pstats::cache_t);
332
+ m_default_sample_size = max_samples;
333
+ }
334
+ }
335
+
336
// Construct a bitext without any data attached: records the number of
// agenda workers and the default sample size, sets the caching threshold
// to PSTATS_CACHE_THRESHOLD, and creates two empty pstats caches.
// Tracks, vocabularies and indices remain empty pointers.
template<typename Token>
Bitext<Token>::
Bitext(size_t const max_sample, size_t const xnum_workers)
  : m_num_workers(xnum_workers)
  , m_default_sample_size(max_sample)
  , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
  , m_cache1(new pstats::cache_t)
  , m_cache2(new pstats::cache_t)
{ }
345
+
346
// Construct a bitext from existing components. Same bookkeeping as the
// two-argument constructor; in addition, the raw pointers t1, t2, tx, v1,
// v2, i1, i2 are used to initialize the smart pointer members T1, T2, Tx,
// V1, V2, I1, I2.
template<typename Token>
Bitext<Token>::
Bitext(Ttrack<Token>* const t1,
       Ttrack<Token>* const t2,
       Ttrack<char>* const tx,
       TokenIndex* const v1,
       TokenIndex* const v2,
       TSA<Token>* const i1,
       TSA<Token>* const i2,
       size_t const max_sample,
       size_t const xnum_workers)
  : m_num_workers(xnum_workers)
  , m_default_sample_size(max_sample)
  , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
  , m_cache1(new pstats::cache_t)
  , m_cache2(new pstats::cache_t)
  , Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
{ }
364
+
365
// Function object that holds references to a list of sentences, a
// vocabulary, a token track and an index; the work is done in
// operator()(), which is defined elsewhere. Only the specialization for
// L2R_Token<SimpleWordId> is declared here.
// NOTE(review): presumably adds the sentences to track and index (e.g. in
// a separate thread) -- confirm against the definition.
template<typename TKN> class snt_adder;
template<> class snt_adder<L2R_Token<SimpleWordId> >;

template<>
class snt_adder<L2R_Token<SimpleWordId> >
{
  typedef L2R_Token<SimpleWordId> TKN;
  std::vector<std::string> const & snt; // the sentences (held by reference)
  TokenIndex & V;                       // vocabulary
  SPTR<imTtrack<TKN> > & track;         // in-memory token track
  SPTR<imTSA<TKN > > & index;           // in-memory index
public:
  snt_adder(std::vector<std::string> const& s, TokenIndex& v,
            SPTR<imTtrack<TKN> >& t, SPTR<imTSA<TKN> >& i);

  void operator()();
};
382
+
383
// Convenience overload: unpacks the fields of /rec/ and forwards them to
// the full version of find_trg_phr_bounds. Results are written back into
// rec (s1, s2, e1, e2, po_fwd, po_bwd, and whatever rec.aln and
// rec.full_aln point to).
template<typename Token>
bool
Bitext<Token>::
find_trg_phr_bounds(PhraseExtractionRecord& rec) const
{
  return find_trg_phr_bounds(rec.sid, rec.start, rec.stop,
                             rec.s1, rec.s2, rec.e1, rec.e2,
                             rec.po_fwd, rec.po_bwd,
                             rec.aln, rec.full_aln, rec.flip);
}
393
+
394
+ template<typename Token>
395
+ bool
396
+ Bitext<Token>::
397
+ find_trg_phr_bounds
398
+ ( size_t const sid, // sentence to investigate
399
+ size_t const start, // start of source phrase
400
+ size_t const stop, // last position of source phrase
401
+ size_t & s1, size_t & s2, // beginning and end of target start
402
+ size_t & e1, size_t & e2, // beginning and end of target end
403
+ int& po_fwd, int& po_bwd, // phrase orientations
404
+ std::vector<unsigned char> * core_alignment, // stores the core alignment
405
+ bitvector* full_alignment, // stores full word alignment for this sent.
406
+ bool const flip) const // flip source and target (reverse lookup)
407
+ {
408
+ // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
409
+ // a word on the core_alignment (core_alignment):
410
+ //
411
+ // Since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1
412
+ // < e2, respectively) are be definition unaligned, we store
413
+ // only the core alignment in *aln. It is up to the calling
414
+ // function to shift alignment points over for start positions
415
+ // of extracted phrases that start with a fringe word
416
+ assert(T1);
417
+ assert(T2);
418
+ assert(Tx);
419
+
420
+ size_t slen1,slen2;
421
+ if (flip)
422
+ {
423
+ slen1 = T2->sntLen(sid);
424
+ slen2 = T1->sntLen(sid);
425
+ }
426
+ else
427
+ {
428
+ slen1 = T1->sntLen(sid);
429
+ slen2 = T2->sntLen(sid);
430
+ }
431
+ bitvector forbidden(slen2);
432
+ if (full_alignment)
433
+ {
434
+ if (slen1*slen2 > full_alignment->size())
435
+ full_alignment->resize(slen1*slen2*2);
436
+ full_alignment->reset();
437
+ }
438
+ size_t src,trg;
439
+ size_t lft = forbidden.size();
440
+ size_t rgt = 0;
441
+ std::vector<std::vector<ushort> > aln1(slen1),aln2(slen2);
442
+
443
+ // process word alignment for this sentence
444
+ char const* p = Tx->sntStart(sid);
445
+ char const* x = Tx->sntEnd(sid);
446
+ while (p < x)
447
+ {
448
+ if (flip)
449
+ {
450
+ p = binread(p,trg);
451
+ assert(p<x);
452
+ p = binread(p,src);
453
+ }
454
+ else
455
+ {
456
+ p = binread(p,src);
457
+ assert(p<x);
458
+ p = binread(p,trg);
459
+ }
460
+
461
+ UTIL_THROW_IF2((src >= slen1 || trg >= slen2),
462
+ "Alignment range error at sentence " << sid << "!\n"
463
+ << src << "/" << slen1 << " " << trg << "/" << slen2);
464
+
465
+ if (src < start || src >= stop)
466
+ forbidden.set(trg);
467
+ else
468
+ {
469
+ lft = std::min(lft,trg);
470
+ rgt = std::max(rgt,trg);
471
+ }
472
+ if (core_alignment)
473
+ {
474
+ aln1[src].push_back(trg);
475
+ aln2[trg].push_back(src);
476
+ }
477
+ if (full_alignment)
478
+ full_alignment->set(src*slen2 + trg);
479
+ }
480
+
481
+ for (size_t i = lft; i <= rgt; ++i)
482
+ if (forbidden[i])
483
+ return false;
484
+
485
+ s2 = lft; for (s1 = s2; s1 && !forbidden[s1-1]; --s1);
486
+ e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2);
487
+
488
+ if (lft > rgt) return false;
489
+ if (core_alignment)
490
+ {
491
+ core_alignment->clear();
492
+ for (size_t i = start; i < stop; ++i)
493
+ {
494
+ BOOST_FOREACH(ushort x, aln1[i])
495
+ {
496
+ core_alignment->push_back(i - start);
497
+ core_alignment->push_back(x - lft);
498
+ }
499
+ }
500
+ // now determine fwd and bwd phrase orientation
501
+ po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2);
502
+ po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2);
503
+ }
504
+ return lft <= rgt;
505
+ }
506
+
507
+ template<typename Token>
508
+ SPTR<DocumentBias>
509
+ Bitext<Token>::
510
+ SetupDocumentBias
511
+ ( std::string const& bserver, std::string const& text, std::ostream* log ) const
512
+ {
513
+ SPTR<DocumentBias> ret;
514
+ UTIL_THROW_IF2(m_sid2docid == NULL,
515
+ "Document bias requested but no document map loaded.");
516
+ ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid,
517
+ bserver, text, log));
518
+ return ret;
519
+ }
520
+
521
+ template<typename Token>
522
+ SPTR<DocumentBias>
523
+ Bitext<Token>::
524
+ SetupDocumentBias
525
+ ( std::map<std::string,float> context_weights, std::ostream* log ) const
526
+ {
527
+ SPTR<DocumentBias> ret;
528
+ UTIL_THROW_IF2(m_sid2docid == NULL,
529
+ "Document bias requested but no document map loaded.");
530
+ ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid,
531
+ context_weights, log));
532
+ return ret;
533
+ }
534
+
535
// Schedule /phrase/ for sampling with the default sample size; the pstats
// handle returned by prep2 is discarded.
template<typename Token>
void
Bitext<Token>::
prep(iter const& phrase) const
{
  prep2(phrase, m_default_sample_size);
}
542
+
543
+
544
+
545
+ // prep2 schedules a phrase for sampling, and returns immediately
546
+ // the member function lookup retrieves the respective pstats instance
547
+ // and waits until the sampling is finished before it returns.
548
+ // This allows sampling in the background
549
+ template<typename Token>
550
+ SPTR<pstats>
551
+ Bitext<Token>
552
+ ::prep2
553
+ (iter const& phrase, int max_sample) const
554
+ {
555
+ if (max_sample < 0) max_sample = m_default_sample_size;
556
+ SPTR<SamplingBias> bias;
557
+ SPTR<pstats::cache_t> cache;
558
+ // - no caching for rare phrases and special requests (max_sample)
559
+ // (still need to test what a good caching threshold is ...)
560
+ // - use the task-specific cache when there is a sampling bias
561
+ if (max_sample == int(m_default_sample_size)
562
+ && phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
563
+ {
564
+ cache = (phrase.root == I1.get() ? m_cache1 : m_cache2);
565
+ }
566
+
567
+ SPTR<pstats> ret;
568
+ SPTR<pstats> const* cached;
569
+
570
+ if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached)
571
+ return *cached;
572
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
573
+ if (!ag)
574
+ {
575
+ ag.reset(new agenda(*this));
576
+ if (m_num_workers > 1)
577
+ ag->add_workers(m_num_workers);
578
+ }
579
+ ret = ag->add_job(this, phrase, max_sample, bias);
580
+ if (cache) cache->set(phrase.getPid(),ret);
581
+ UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
582
+ return ret;
583
+ }
584
+
585
+ // worker for scoring and sorting phrase table entries in parallel
586
+ template<typename Token>
587
+ class pstats2pplist
588
+ {
589
+ Ttrack<Token> const& m_other;
590
+ SPTR<pstats> m_pstats;
591
+ std::vector<PhrasePair<Token> >& m_pplist;
592
+ typename PhrasePair<Token>::Scorer const* m_scorer;
593
+ PhrasePair<Token> m_pp;
594
+ Token const* m_token;
595
+ size_t m_len;
596
+ uint64_t m_pid1;
597
+ bool m_is_inverse;
598
+ public:
599
+
600
+ // CONSTRUCTOR
601
+ pstats2pplist(typename TSA<Token>::tree_iterator const& m,
602
+ Ttrack<Token> const& other,
603
+ SPTR<pstats> const& ps,
604
+ std::vector<PhrasePair<Token> >& dest,
605
+ typename PhrasePair<Token>::Scorer const* scorer)
606
+ : m_other(other)
607
+ , m_pstats(ps)
608
+ , m_pplist(dest)
609
+ , m_scorer(scorer)
610
+ , m_token(m.getToken(0))
611
+ , m_len(m.size())
612
+ , m_pid1(m.getPid())
613
+ , m_is_inverse(false)
614
+ { }
615
+
616
+ // WORKER
617
+ void
618
+ operator()()
619
+ {
620
+ // wait till all statistics have been collected
621
+ boost::unique_lock<boost::mutex> lock(m_pstats->lock);
622
+ while (m_pstats->in_progress)
623
+ m_pstats->ready.wait(lock);
624
+
625
+ m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0);
626
+
627
+ // convert pstats entries to phrase pairs
628
+ pstats::trg_map_t::iterator a;
629
+ for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a)
630
+ {
631
+ uint32_t sid,off,len;
632
+ parse_pid(a->first, sid, off, len);
633
+ m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second);
634
+ m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),
635
+ m_pp.joint);
636
+ // Poor man's early pruning: if p(f|e) or p(e|f) < 1/128, don't
637
+ // even consider the phrase pair, as it is unlikely to ever be
638
+ // considered as a valid translation.
639
+ size_t J = m_pp.joint<<7; // hard coded threshold of 1/128
640
+ if (m_pp.good1 > J || m_pp.good2 > J) continue;
641
+ if (m_scorer)
642
+ {
643
+ (*m_scorer)(m_pp);
644
+ }
645
+ m_pplist.push_back(m_pp);
646
+ }
647
+ std::greater<PhrasePair<Token> > sorter;
648
+ if (m_scorer) sort(m_pplist.begin(), m_pplist.end(),sorter);
649
+ }
650
+ };
651
+
652
+ template<typename Token>
653
+ void
654
+ Bitext<Token>
655
+ ::mark_match(Token const* start, Token const* end,
656
+ iter const& m, bitvector& check) const
657
+ {
658
+ check.resize(end-start);
659
+ check.reset();
660
+ Token const* x = m.getToken(0);
661
+ for (Token const* s = start; s < end; ++s)
662
+ {
663
+ if (s->id() != x->id()) continue;
664
+ Token const* a = x;
665
+ Token const* b = s;
666
+ size_t i = 0;
667
+ while (a && b && a->id() == b->id() && i < m.size())
668
+ {
669
+ ++i;
670
+ a = a->next();
671
+ b = b->next();
672
+ }
673
+ if (i == m.size())
674
+ {
675
+ b = s;
676
+ while (i-- > 0) { check.set(b-start); b = b->next(); }
677
+ }
678
+ }
679
+ }
680
+
681
// Write the word alignment of sentence /sid/ to /out/ as a sequence of
// space-terminated groups of the form <L1 positions>:<L2 positions>:<label>,
// where positions are comma-separated lists.
//
// Alignment links that share a word are collected into one group. A group
// is labeled "infocusbi" if one of its links touches a word marked as
// part of a match of *m1 (in T1) or *m2 (in T2), and "unspec" otherwise.
// Unaligned words that are part of a match are reported with the label
// "infocusmono". m1 and m2 may be NULL (no match marking on that side).
template<typename Token>
void
Bitext<Token>::
write_yawat_alignment
( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const
{
  // a1[i] / a2[j]: index of the alignment group of L1 word i / L2 word j,
  // -1 for words that are not (yet) in any group
  std::vector<int> a1(T1->sntLen(sid),-1), a2(T2->sntLen(sid),-1);
  // f1 / f2: words covered by a match of *m1 / *m2
  bitvector f1(a1.size()), f2(a2.size());
  if (m1) mark_match(T1->sntStart(sid), T1->sntEnd(sid), *m1, f1);
  if (m2) mark_match(T2->sntStart(sid), T2->sntEnd(sid), *m2, f2);

  std::vector<std::pair<bitvector, bitvector> > agroups; // L1/L2 members
  std::vector<std::string> grouplabel;                   // one per group
  std::pair<bitvector, bitvector> ag; // template for new groups
  ag.first.resize(a1.size());
  ag.second.resize(a2.size());
  char const* x = Tx->sntStart(sid);
  size_t a, b;
  while (x < Tx->sntEnd(sid))
    {
      // read the next alignment link (a: L1 position, b: L2 position)
      x = binread(x,a);
      x = binread(x,b);
      if (a1.at(a) < 0 && a2.at(b) < 0)
        {
          // neither word is in a group yet: start a new group
          a1[a] = a2[b] = agroups.size();
          ag.first.reset();
          ag.second.reset();
          ag.first.set(a);
          ag.second.set(b);
          agroups.push_back(ag);
          grouplabel.push_back(f1[a] || f2[b] ? "infocusbi" : "unspec");
        }
      else if (a1.at(a) < 0)
        {
          // b already has a group: add a to it
          a1[a] = a2[b];
          agroups[a2[b]].first.set(a);
          if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
        }
      else if (a2.at(b) < 0)
        {
          // a already has a group: add b to it
          a2[b] = a1[a];
          agroups[a1[a]].second.set(b);
          if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
        }
      else
        {
          // both words have a group: merge b's group into a's group.
          // NOTE(review): only a2[b] is redirected; other members of b's
          // old group keep pointing to the old group -- confirm that this
          // is intended.
          agroups[a1[a]].first  |= agroups[a2[b]].first;
          agroups[a1[a]].second |= agroups[a2[b]].second;
          a2[b] = a1[a];
          if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
        }
    }

  for (a = 0; a < a1.size(); ++a)
    {
      if (a1[a] < 0)
        {
          // unaligned L1 word: report it only if it is part of a match
          if (f1[a]) out << a << "::" << "infocusmono ";
          continue;
        }
      bitvector const& A = agroups[a1[a]].first;
      bitvector const& B = agroups[a1[a]].second;
      // print each group only once, at its first L1 member
      if (A.find_first() < a) continue;
      write_bitvector(A,out); out << ":";
      write_bitvector(B,out); out << ":";
      out << grouplabel[a1[a]] << " ";
    }
  for (b = 0; b < a2.size(); ++b)
    {
      // NOTE(review): unlike the L1 case above, the position b is not
      // written here (output is "::infocusmono "); this looks like an
      // omission (":" << b << ":" expected?) -- confirm before changing.
      if (a2[b] < 0 && f2[b])
        out << "::" << "infocusmono ";
    }
}
754
+
755
+ template<typename Token>
756
+ void
757
+ expand(typename Bitext<Token>::iter const& m,
758
+ Bitext<Token> const& bt, pstats const& ps,
759
+ std::vector<PhrasePair<Token> >& dest, std::ostream* log)
760
+ {
761
+ bool fwd = m.root == bt.I1.get();
762
+ dest.reserve(ps.trg.size());
763
+ PhrasePair<Token> pp;
764
+ pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0);
765
+ // cout << HERE << " "
766
+ // << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << std::endl;
767
+ pstats::trg_map_t::const_iterator a;
768
+ for (a = ps.trg.begin(); a != ps.trg.end(); ++a)
769
+ {
770
+ uint32_t sid,off,len;
771
+ parse_pid(a->first, sid, off, len);
772
+ pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off,
773
+ len, a->second);
774
+ dest.push_back(pp);
775
+ }
776
+ }
777
+
778
+ } // end of namespace sapt
779
+
780
+ #include "ug_im_bitext.h"
781
+ #include "ug_mm_bitext.h"
782
+ #include "ug_bitext_moses.h"
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_agenda.h ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ // to be included from ug_bitext.h
3
+
4
+ // The agenda handles parallel sampling.
5
+ // It maintains a queue of unfinished sampling jobs and
6
+ // assigns them to a pool of workers.
7
+ //
8
// Declaration of the agenda: it owns the list of pending sampling jobs
// and the worker threads that process them. All bookkeeping members are
// protected by /lock/.
template<typename Token>
class Bitext<Token>
::agenda
{
public:
  class job;
  class worker;
private:
  boost::mutex lock;                  // guards the members below
  std::list<SPTR<job> > joblist;      // unfinished jobs
  std::vector<SPTR<boost::thread> > workers; // worker threads
  bool shutdown;   // set by the destructor; get_job() then hands out no jobs
  size_t doomed;   // number of workers that are still to be told to quit

public:


  Bitext<Token> const& bt; // the bitext this agenda belongs to

  agenda(Bitext<Token> const& bitext);
  ~agenda(); // sets the shutdown flag and joins all workers

  // change the number of workers by /n/
  void
  add_workers(int n);

  // queue a sampling job for /phrase/; returns the job's statistics object
  SPTR<pstats>
  add_job(Bitext<Token> const* const theBitext,
          typename TSA<Token>::tree_iterator const& phrase,
          size_t const max_samples, SPTR<SamplingBias const> const& bias,
          bool const track_sids);
  // add_job(Bitext<Token> const* const theBitext,
  // typename TSA<Token>::tree_iterator const& phrase,
  // size_t const max_samples, SamplingBias const* const bias);

  // returns the job to work on next; an empty pointer tells the calling
  // worker that there is nothing to do for it
  SPTR<job>
  get_job();
};
45
+
46
// Function object that is run by each worker thread of the agenda (see
// add_workers / add_job, which pass a worker to boost::thread). It keeps
// a reference to the agenda that created it; operator()() is defined in
// ug_bitext_agenda_worker.h.
template<typename Token>
class
Bitext<Token>::agenda::
worker
{
  agenda& ag; // the agenda this worker belongs to
public:
  worker(agenda& a) : ag(a) {}
  void operator()();
};
56
+
57
+ #include "ug_bitext_agenda_worker.h"
58
+ #include "ug_bitext_agenda_job.h"
59
+
60
+ template<typename Token>
61
+ void Bitext<Token>
62
+ ::agenda
63
+ ::add_workers(int n)
64
+ {
65
+ static boost::posix_time::time_duration nodelay(0,0,0,0);
66
+ boost::lock_guard<boost::mutex> guard(this->lock);
67
+
68
+ int target = std::max(1, int(n + workers.size() - this->doomed));
69
+ // house keeping: remove all workers that have finished
70
+ for (size_t i = 0; i < workers.size(); )
71
+ {
72
+ if (workers[i]->timed_join(nodelay))
73
+ {
74
+ if (i + 1 < workers.size())
75
+ workers[i].swap(workers.back());
76
+ workers.pop_back();
77
+ }
78
+ else ++i;
79
+ }
80
+ // cerr << workers.size() << "/" << target << " active" << std::endl;
81
+ if (int(workers.size()) > target)
82
+ this->doomed = workers.size() - target;
83
+ else
84
+ while (int(workers.size()) < target)
85
+ {
86
+ SPTR<boost::thread> w(new boost::thread(worker(*this)));
87
+ workers.push_back(w);
88
+ }
89
+ }
90
+
91
+
92
+ template<typename Token>
93
+ SPTR<pstats> Bitext<Token>
94
+ ::agenda
95
+ ::add_job(Bitext<Token> const* const theBitext,
96
+ typename TSA<Token>::tree_iterator const& phrase,
97
+ size_t const max_samples, SPTR<SamplingBias const> const& bias,
98
+ bool const track_sids)
99
+ {
100
+ boost::unique_lock<boost::mutex> lk(this->lock);
101
+ static boost::posix_time::time_duration nodelay(0,0,0,0);
102
+ bool fwd = phrase.root == bt.I1.get();
103
+ SPTR<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
104
+ max_samples, fwd, bias, track_sids));
105
+ j->stats->register_worker();
106
+
107
+ joblist.push_back(j);
108
+ if (joblist.size() == 1)
109
+ {
110
+ size_t i = 0;
111
+ while (i < workers.size())
112
+ {
113
+ if (workers[i]->timed_join(nodelay))
114
+ {
115
+ if (doomed)
116
+ {
117
+ if (i+1 < workers.size())
118
+ workers[i].swap(workers.back());
119
+ workers.pop_back();
120
+ --doomed;
121
+ }
122
+ else
123
+ workers[i++] = SPTR<boost::thread>(new boost::thread(worker(*this)));
124
+ }
125
+ else ++i;
126
+ }
127
+ }
128
+ return j->stats;
129
+ }
130
+
131
+ template<typename Token>
132
+ SPTR<typename Bitext<Token>::agenda::job>
133
+ Bitext<Token>
134
+ ::agenda
135
+ ::get_job()
136
+ {
137
+ // cerr << workers.size() << " workers on record" << std::endl;
138
+ SPTR<job> ret;
139
+ if (this->shutdown) return ret;
140
+ boost::unique_lock<boost::mutex> lock(this->lock);
141
+ if (this->doomed)
142
+ { // the number of workers has been reduced, tell the redundant once to quit
143
+ --this->doomed;
144
+ return ret;
145
+ }
146
+
147
+ typename std::list<SPTR<job> >::iterator j = joblist.begin();
148
+ while (j != joblist.end())
149
+ {
150
+ if ((*j)->done())
151
+ {
152
+ (*j)->stats->release();
153
+ joblist.erase(j++);
154
+ }
155
+ else if ((*j)->workers >= 4) ++j; // no more than 4 workers per job
156
+ else break; // found one
157
+ }
158
+ if (joblist.size())
159
+ {
160
+ ret = j == joblist.end() ? joblist.front() : *j;
161
+ // if we've reached the end of the queue (all jobs have 4 workers on them),
162
+ // take the first in the queue
163
+ boost::lock_guard<boost::mutex> jguard(ret->lock);
164
+ ++ret->workers;
165
+ }
166
+ return ret;
167
+ }
168
+
169
+ template<typename Token>
170
+ Bitext<Token>::
171
+ agenda::
172
+ ~agenda()
173
+ {
174
+ this->lock.lock();
175
+ this->shutdown = true;
176
+ this->lock.unlock();
177
+ for (size_t i = 0; i < workers.size(); ++i)
178
+ workers[i]->join();
179
+ }
180
+
181
// Construct an agenda for /thebitext/: not shutting down, no doomed
// workers. No worker threads are started here; see add_workers().
template<typename Token>
Bitext<Token>::
agenda::
agenda(Bitext<Token> const& thebitext)
  : shutdown(false), doomed(0), bt(thebitext)
{ }
187
+
188
+
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_jstats.h ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
2
+ #pragma once
3
+ #include <string>
4
+ #include <stdint.h>
5
+ #include "ug_typedefs.h"
6
+ #include "ug_lexical_reordering.h"
7
+ #include <boost/thread.hpp>
8
+
9
+ namespace sapt
10
+ {
11
+
12
+ // "joint" (i.e., phrase pair) statistics
13
+ class
14
+ jstats
15
+ {
16
+ boost::mutex lock;
17
+ uint32_t my_rcnt; // unweighted joint count
18
+ uint32_t my_cnt2; // raw counts L2
19
+ float my_wcnt; // weighted joint count
20
+ float my_bcnt; // cumulative bias
21
+
22
+ // to do: use a static alignment pattern store that stores each pattern only
23
+ // once, so that we don't have to store so many alignment vectors
24
+ std::vector<std::pair<size_t, std::vector<unsigned char> > > my_aln;
25
+ // internal word alignment
26
+
27
+ uint32_t ofwd[LRModel::NONE+1]; // forward distortion type counts
28
+ uint32_t obwd[LRModel::NONE+1]; // backward distortion type counts
29
+
30
+ public:
31
+ SPTR<std::vector<uint32_t> > sids; // list of sentence ids in this sample
32
+ std::map<uint32_t,uint32_t> indoc;
33
+ // std::vector<uint32_t> indoc; // counts origin of samples (for biased sampling)
34
+ jstats();
35
+ jstats(jstats const& other);
36
+ uint32_t rcnt() const; // raw joint counts
37
+ uint32_t cnt2() const; // raw target phrase occurrence count
38
+ float wcnt() const; // weighted joint counts
39
+ float bcnt() const; // cumulative bias scores
40
+
41
+ std::vector<std::pair<size_t, std::vector<unsigned char> > > const & aln() const;
42
+
43
+ size_t
44
+ add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
45
+ uint32_t fwd_orient, uint32_t bwd_orient, int const docid, uint32_t const sid,
46
+ bool const track_sid);
47
+
48
+ void invalidate();
49
+ void validate();
50
+ bool valid();
51
+ uint32_t dcnt_fwd(PhraseOrientation const idx) const;
52
+ uint32_t dcnt_bwd(PhraseOrientation const idx) const;
53
+ void fill_lr_vec(LRModel::Direction const& dir,
54
+ LRModel::ModelType const& mdl,
55
+ std::vector<float>& v);
56
+ };
57
+ }
58
+
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_moses.h ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; cc-style: moses-cc-style -*-
2
+ #pragma once
3
+ #ifndef NO_MOSES
4
+ namespace sapt {
5
+
6
+ template<typename Token>
7
+ SPTR<pstats>
8
+ Bitext<Token>::
9
+ lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const
10
+ {
11
+ SPTR<pstats> ret = prep2(ttask, phrase, max_sample);
12
+ UTIL_THROW_IF2(!ret, "Got NULL pointer where I expected a valid pointer.");
13
+
14
+ // Why were we locking here?
15
+ if (m_num_workers <= 1)
16
+ {
17
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
18
+ typename agenda::worker(*this->ag)();
19
+ }
20
+ else
21
+ {
22
+ boost::unique_lock<boost::mutex> lock(ret->lock);
23
+ while (ret->in_progress)
24
+ ret->ready.wait(lock);
25
+ }
26
+ return ret;
27
+ }
28
+
29
+
30
+ template<typename Token>
31
+ void
32
+ Bitext<Token>::
33
+ prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const
34
+ {
35
+ prep2(ttask, phrase, track_sids, m_default_sample_size);
36
+ }
37
+
38
+
39
+ // prep2 schedules a phrase for sampling and returns immediately.
40
+ // The member function lookup retrieves the respective pstats instance
41
+ // and waits until the sampling is finished before it returns.
42
+ // This allows sampling in the background.
43
+ template<typename Token>
44
+ SPTR<pstats>
45
+ Bitext<Token>
46
+ ::prep2
47
+ ( ttasksptr const& ttask, iter const& phrase, bool const track_sids,
48
+ int max_sample) const
49
+ {
50
+ if (max_sample < 0) max_sample = m_default_sample_size;
51
+ SPTR<SamplingBias> bias;
52
+ SPTR<Moses::ContextScope> scope = ttask->GetScope();
53
+ SPTR<ContextForQuery> context = scope->get<ContextForQuery>(this);
54
+ if (context) bias = context->bias;
55
+ SPTR<pstats::cache_t> cache;
56
+ // - no caching for rare phrases and special requests (max_sample)
57
+ // (still need to test what a good caching threshold is ...)
58
+ // - use the task-specific cache when there is a sampling bias
59
+ if (max_sample == int(m_default_sample_size)
60
+ && phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
61
+ {
62
+ cache = (phrase.root == I1.get()
63
+ ? (bias ? context->cache1 : m_cache1)
64
+ : (bias ? context->cache2 : m_cache2));
65
+ }
66
+ SPTR<pstats> ret;
67
+ SPTR<pstats> const* cached;
68
+
69
+ if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached)
70
+ return *cached;
71
+ boost::unique_lock<boost::shared_mutex> guard(m_lock);
72
+ if (!ag)
73
+ {
74
+ ag.reset(new agenda(*this));
75
+ if (m_num_workers > 1)
76
+ ag->add_workers(m_num_workers);
77
+ }
78
+ ret = ag->add_job(this, phrase, max_sample, bias, track_sids);
79
+ if (cache) cache->set(phrase.getPid(),ret);
80
+ UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
81
+ return ret;
82
+ }
83
+
84
+
85
+
86
+ }
87
+ #endif