Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- mosesdecoder/moses/BitmapContainer.cpp +498 -0
- mosesdecoder/moses/Bitmaps.h +32 -0
- mosesdecoder/moses/ChartManager.h +162 -0
- mosesdecoder/moses/ChartTranslationOptions.cpp +168 -0
- mosesdecoder/moses/DecodeStepGeneration.cpp +169 -0
- mosesdecoder/moses/FloydWarshall.cpp +36 -0
- mosesdecoder/moses/HypothesisStack.h +64 -0
- mosesdecoder/moses/Sentence.cpp +372 -0
- mosesdecoder/moses/Syntax/Cube.h +62 -0
- mosesdecoder/moses/Syntax/CubeQueue.cpp +37 -0
- mosesdecoder/moses/Syntax/CubeQueue.h +52 -0
- mosesdecoder/moses/Syntax/InputWeightFF.cpp +48 -0
- mosesdecoder/moses/Syntax/Manager.cpp +229 -0
- mosesdecoder/moses/Syntax/NonTerminalMap.h +85 -0
- mosesdecoder/moses/Syntax/PHyperedge.h +21 -0
- mosesdecoder/moses/Syntax/RuleTableFF.h +60 -0
- mosesdecoder/moses/Syntax/SHyperedgeBundle.h +31 -0
- mosesdecoder/moses/Syntax/SVertexRecombinationHasher.h +26 -0
- mosesdecoder/moses/TranslationModel/CompactPT/BlockHashIndex.cpp +424 -0
- mosesdecoder/moses/TranslationModel/CompactPT/ConsistentPhrases.h +112 -0
- mosesdecoder/moses/TranslationModel/CompactPT/Jamfile +17 -0
- mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp +450 -0
- mosesdecoder/moses/TranslationModel/CompactPT/MmapAllocator.h +202 -0
- mosesdecoder/moses/TranslationModel/CompactPT/MonotonicVector.h +230 -0
- mosesdecoder/moses/TranslationModel/CompactPT/PhraseDecoder.h +144 -0
- mosesdecoder/moses/TranslationModel/CompactPT/PhraseTableCreator.h +412 -0
- mosesdecoder/moses/TranslationModel/Scope3Parser/Parser.cpp +198 -0
- mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc +434 -0
- mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h +83 -0
- mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.cc +31 -0
- mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.h +30 -0
- mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc +51 -0
- mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h +21 -0
- mosesdecoder/moses/TranslationModel/UG/mm/Makefile.x +105 -0
- mosesdecoder/moses/TranslationModel/UG/mm/calc-coverage.cc +57 -0
- mosesdecoder/moses/TranslationModel/UG/mm/mmlex-build.cc +348 -0
- mosesdecoder/moses/TranslationModel/UG/mm/mtt-build.cc +498 -0
- mosesdecoder/moses/TranslationModel/UG/mm/mtt-dump.cc +166 -0
- mosesdecoder/moses/TranslationModel/UG/mm/mtt.count.cc +77 -0
- mosesdecoder/moses/TranslationModel/UG/mm/num_read_write.cc +74 -0
- mosesdecoder/moses/TranslationModel/UG/mm/test-http-client.cc +27 -0
- mosesdecoder/moses/TranslationModel/UG/mm/test-xml-escaping.cc +13 -0
- mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.cc +594 -0
- mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.h +176 -0
- mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.cc +420 -0
- mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.cc +171 -0
- mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.h +782 -0
- mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_agenda.h +188 -0
- mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +58 -0
- mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_moses.h +87 -0
mosesdecoder/moses/BitmapContainer.cpp
ADDED
|
@@ -0,0 +1,498 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include <algorithm>
|
| 23 |
+
#include <limits>
|
| 24 |
+
#include <utility>
|
| 25 |
+
|
| 26 |
+
#include "BitmapContainer.h"
|
| 27 |
+
#include "HypothesisStackCubePruning.h"
|
| 28 |
+
#include "moses/FF/DistortionScoreProducer.h"
|
| 29 |
+
#include "TranslationOptionList.h"
|
| 30 |
+
#include "Manager.h"
|
| 31 |
+
|
| 32 |
+
namespace Moses
|
| 33 |
+
{
|
| 34 |
+
|
| 35 |
+
class HypothesisScoreOrdererNoDistortion
|
| 36 |
+
{
|
| 37 |
+
public:
|
| 38 |
+
bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const {
|
| 39 |
+
const float scoreA = hypoA->GetScore();
|
| 40 |
+
const float scoreB = hypoB->GetScore();
|
| 41 |
+
|
| 42 |
+
if (scoreA > scoreB) {
|
| 43 |
+
return true;
|
| 44 |
+
} else if (scoreA < scoreB) {
|
| 45 |
+
return false;
|
| 46 |
+
} else {
|
| 47 |
+
return hypoA < hypoB;
|
| 48 |
+
}
|
| 49 |
+
}
|
| 50 |
+
};
|
| 51 |
+
|
| 52 |
+
class HypothesisScoreOrdererWithDistortion
|
| 53 |
+
{
|
| 54 |
+
private:
|
| 55 |
+
bool m_deterministic;
|
| 56 |
+
|
| 57 |
+
public:
|
| 58 |
+
HypothesisScoreOrdererWithDistortion(const Range* transOptRange,
|
| 59 |
+
const bool deterministic = false)
|
| 60 |
+
: m_deterministic(deterministic)
|
| 61 |
+
, m_transOptRange(transOptRange) {
|
| 62 |
+
m_totalWeightDistortion = 0;
|
| 63 |
+
const StaticData &staticData = StaticData::Instance();
|
| 64 |
+
|
| 65 |
+
const std::vector<const DistortionScoreProducer*> &ffs = DistortionScoreProducer::GetDistortionFeatureFunctions();
|
| 66 |
+
std::vector<const DistortionScoreProducer*>::const_iterator iter;
|
| 67 |
+
for (iter = ffs.begin(); iter != ffs.end(); ++iter) {
|
| 68 |
+
const DistortionScoreProducer *ff = *iter;
|
| 69 |
+
|
| 70 |
+
float weight =staticData.GetAllWeights().GetScoreForProducer(ff);
|
| 71 |
+
m_totalWeightDistortion += weight;
|
| 72 |
+
}
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
const Range* m_transOptRange;
|
| 76 |
+
float m_totalWeightDistortion;
|
| 77 |
+
|
| 78 |
+
bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const {
|
| 79 |
+
UTIL_THROW_IF2(m_transOptRange == NULL, "Words range not set");
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
const float distortionScoreA = DistortionScoreProducer::CalculateDistortionScore(
|
| 83 |
+
*hypoA,
|
| 84 |
+
hypoA->GetCurrSourceWordsRange(),
|
| 85 |
+
*m_transOptRange,
|
| 86 |
+
hypoA->GetWordsBitmap().GetFirstGapPos()
|
| 87 |
+
);
|
| 88 |
+
const float distortionScoreB = DistortionScoreProducer::CalculateDistortionScore(
|
| 89 |
+
*hypoB,
|
| 90 |
+
hypoB->GetCurrSourceWordsRange(),
|
| 91 |
+
*m_transOptRange,
|
| 92 |
+
hypoB->GetWordsBitmap().GetFirstGapPos()
|
| 93 |
+
);
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
const float scoreA = hypoA->GetScore() + distortionScoreA * m_totalWeightDistortion;
|
| 97 |
+
const float scoreB = hypoB->GetScore() + distortionScoreB * m_totalWeightDistortion;
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
if (scoreA > scoreB) {
|
| 101 |
+
return true;
|
| 102 |
+
} else if (scoreA < scoreB) {
|
| 103 |
+
return false;
|
| 104 |
+
} else {
|
| 105 |
+
if (m_deterministic) {
|
| 106 |
+
// Equal scores: break ties by comparing target phrases
|
| 107 |
+
return (hypoA->GetCurrTargetPhrase().Compare(hypoB->GetCurrTargetPhrase()) < 0);
|
| 108 |
+
}
|
| 109 |
+
// Fallback: non-deterministic sort
|
| 110 |
+
return hypoA < hypoB;
|
| 111 |
+
}
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
};
|
| 115 |
+
|
| 116 |
+
////////////////////////////////////////////////////////////////////////////////
|
| 117 |
+
// BackwardsEdge Code
|
| 118 |
+
////////////////////////////////////////////////////////////////////////////////
|
| 119 |
+
|
| 120 |
+
/** Set up an edge between a previous bitmap container and its parent.
 *
 *  Collects into m_hypotheses the hypotheses of prevBitmapContainer that may
 *  be extended with the given translation options:
 *   - if either the hypothesis set or the translation list is empty, nothing
 *     is collected;
 *   - if the max_distortion option is -1, every hypothesis is copied in the
 *     container's order and no sorting or checking is done here;
 *   - otherwise only hypotheses whose distortion distance to the source range
 *     of the first translation option is within the limit are kept, and the
 *     result is sorted with HypothesisScoreOrdererWithDistortion.
 *
 *  Throws (via UTIL_THROW_IF2) if the first two translation options or the
 *  first two kept hypotheses are not in non-increasing future-score order.
 */
BackwardsEdge::BackwardsEdge(const BitmapContainer &prevBitmapContainer
                             , BitmapContainer &parent
                             , const TranslationOptionList &translations
                             , const SquareMatrix &estimatedScores,
                             const InputType& itype,
                             const bool deterministic)
  : m_initialized(false)
  , m_prevBitmapContainer(prevBitmapContainer)
  , m_parent(parent)
  , m_translations(translations)
  , m_estimatedScores(estimatedScores)
  , m_deterministic(deterministic)
  , m_seenPosition()
{

  // If either dimension is empty, we haven't got anything to do.
  if(m_prevBitmapContainer.GetHypotheses().size() == 0 || m_translations.size() == 0) {
    VERBOSE(3, "Empty cube on BackwardsEdge" << std::endl);
    return;
  }

  // Fetch the things we need for distortion cost computation.
  int maxDistortion = itype.options()->reordering.max_distortion;

  if (maxDistortion == -1) {
    // -1 is handled as "no limit": take every previous hypothesis as is.
    for (HypothesisSet::const_iterator iter = m_prevBitmapContainer.GetHypotheses().begin(); iter != m_prevBitmapContainer.GetHypotheses().end(); ++iter) {
      m_hypotheses.push_back(*iter);
    }
    return;
  }

  // The range of the first option is used for every distortion computation below.
  const Range &transOptRange = translations.Get(0)->GetSourceWordsRange();

  HypothesisSet::const_iterator iterHypo = m_prevBitmapContainer.GetHypotheses().begin();
  HypothesisSet::const_iterator iterEnd = m_prevBitmapContainer.GetHypotheses().end();

  while (iterHypo != iterEnd) {
    const Hypothesis &hypo = **iterHypo;
    // Special case: If this is the first hypothesis used to seed the search,
    // it doesn't have a valid range, and we create the hypothesis, if the
    // initial position is not further into the sentence than the distortion limit.
    if (hypo.GetWordsBitmap().GetNumWordsCovered() == 0) {
      if ((int)transOptRange.GetStartPos() <= maxDistortion)
        m_hypotheses.push_back(&hypo);
    } else {
      int distortionDistance = itype.ComputeDistortionDistance(hypo.GetCurrSourceWordsRange()
                               , transOptRange);

      if (distortionDistance <= maxDistortion)
        m_hypotheses.push_back(&hypo);
    }

    ++iterHypo;
  }

  // Spot check: the first two translation options must be ordered best-first.
  if (m_translations.size() > 1) {
    UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(),
                   "Non-monotonic future score: "
                   << m_translations.Get(0)->GetFutureScore() << " vs. "
                   << m_translations.Get(1)->GetFutureScore());
  }

  // Spot check: the first two kept hypotheses must be ordered best-first.
  // The message now carries the ": " separator so the first number does not
  // run into the text.
  if (m_hypotheses.size() > 1) {
    UTIL_THROW_IF2(m_hypotheses[0]->GetFutureScore() < m_hypotheses[1]->GetFutureScore(),
                   "Non-monotonic total score: "
                   << m_hypotheses[0]->GetFutureScore() << " vs. "
                   << m_hypotheses[1]->GetFutureScore());
  }

  // Re-rank the kept hypotheses taking the distortion to transOptRange into account.
  HypothesisScoreOrdererWithDistortion orderer (&transOptRange, m_deterministic);
  std::sort(m_hypotheses.begin(), m_hypotheses.end(), orderer);
}
|
| 195 |
+
|
| 196 |
+
// Empties the edge's own containers. The hypotheses pointed to by
// m_hypotheses are not deleted here.
BackwardsEdge::~BackwardsEdge()
{
  m_hypotheses.clear();
  m_seenPosition.clear();
}
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
void
|
| 204 |
+
BackwardsEdge::Initialize()
|
| 205 |
+
{
|
| 206 |
+
if(m_hypotheses.size() == 0 || m_translations.size() == 0) {
|
| 207 |
+
m_initialized = true;
|
| 208 |
+
return;
|
| 209 |
+
}
|
| 210 |
+
|
| 211 |
+
const Bitmap &bm = m_hypotheses[0]->GetWordsBitmap();
|
| 212 |
+
const Range &newRange = m_translations.Get(0)->GetSourceWordsRange();
|
| 213 |
+
m_estimatedScore = m_estimatedScores.CalcEstimatedScore(bm, newRange.GetStartPos(), newRange.GetEndPos());
|
| 214 |
+
|
| 215 |
+
Hypothesis *expanded = CreateHypothesis(*m_hypotheses[0], *m_translations.Get(0));
|
| 216 |
+
m_parent.Enqueue(0, 0, expanded, this);
|
| 217 |
+
SetSeenPosition(0, 0);
|
| 218 |
+
m_initialized = true;
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
// Allocates a new Hypothesis that extends `hypothesis` with `transOpt`, using
// the parent container's bitmap, and evaluates it with m_estimatedScore.
// The returned object is heap-allocated; this function does not delete it.
Hypothesis *BackwardsEdge::CreateHypothesis(const Hypothesis &hypothesis, const TranslationOption &transOpt)
{
  // Time the construction when verbosity is at least 2.
  IFVERBOSE(2) {
    hypothesis.GetManager().GetSentenceStats().StartTimeBuildHyp();
  }

  const Bitmap &parentBitmap = m_parent.GetWordsBitmap();
  Hypothesis *extended = new Hypothesis(hypothesis, transOpt, parentBitmap, hypothesis.GetManager().GetNextHypoId());

  IFVERBOSE(2) {
    hypothesis.GetManager().GetSentenceStats().StopTimeBuildHyp();
  }

  // Score the new hypothesis.
  extended->EvaluateWhenApplied(m_estimatedScore);
  return extended;
}
|
| 236 |
+
|
| 237 |
+
bool
|
| 238 |
+
BackwardsEdge::SeenPosition(const size_t x, const size_t y)
|
| 239 |
+
{
|
| 240 |
+
boost::unordered_set< int >::iterator iter = m_seenPosition.find((x<<16) + y);
|
| 241 |
+
return (iter != m_seenPosition.end());
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
void
|
| 245 |
+
BackwardsEdge::SetSeenPosition(const size_t x, const size_t y)
|
| 246 |
+
{
|
| 247 |
+
UTIL_THROW_IF2(x >= (1<<17), "Error");
|
| 248 |
+
UTIL_THROW_IF2(y >= (1<<17), "Error");
|
| 249 |
+
|
| 250 |
+
m_seenPosition.insert((x<<16) + y);
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
bool
|
| 255 |
+
BackwardsEdge::GetInitialized()
|
| 256 |
+
{
|
| 257 |
+
return m_initialized;
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
// Gives read-only access to the previous bitmap container this edge was
// constructed with.
const BitmapContainer&
BackwardsEdge::GetBitmapContainer() const
{
  return m_prevBitmapContainer;
}
|
| 265 |
+
|
| 266 |
+
void
|
| 267 |
+
BackwardsEdge::PushSuccessors(const size_t x, const size_t y)
|
| 268 |
+
{
|
| 269 |
+
Hypothesis *newHypo;
|
| 270 |
+
|
| 271 |
+
if(y + 1 < m_translations.size() && !SeenPosition(x, y + 1)) {
|
| 272 |
+
SetSeenPosition(x, y + 1);
|
| 273 |
+
newHypo = CreateHypothesis(*m_hypotheses[x], *m_translations.Get(y + 1));
|
| 274 |
+
if(newHypo != NULL) {
|
| 275 |
+
m_parent.Enqueue(x, y + 1, newHypo, (BackwardsEdge*)this);
|
| 276 |
+
}
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
if(x + 1 < m_hypotheses.size() && !SeenPosition(x + 1, y)) {
|
| 280 |
+
SetSeenPosition(x + 1, y);
|
| 281 |
+
newHypo = CreateHypothesis(*m_hypotheses[x + 1], *m_translations.Get(y));
|
| 282 |
+
if(newHypo != NULL) {
|
| 283 |
+
m_parent.Enqueue(x + 1, y, newHypo, (BackwardsEdge*)this);
|
| 284 |
+
}
|
| 285 |
+
}
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
////////////////////////////////////////////////////////////////////////////////
|
| 290 |
+
// BitmapContainer Code
|
| 291 |
+
////////////////////////////////////////////////////////////////////////////////
|
| 292 |
+
|
| 293 |
+
// Creates a container for the given coverage bitmap, attached to `stack`.
// The hypothesis set, edge set and queue start out empty and the
// stack-insertion counter starts at zero.
BitmapContainer::BitmapContainer(const Bitmap &bitmap
                                 , HypothesisStackCubePruning &stack
                                 , bool deterministic)
  : m_bitmap(bitmap)
  , m_stack(stack)
  , m_numStackInsertions(0)
  , m_deterministic(deterministic)
{
  // m_hypotheses, m_edges and m_queue are default-constructed, i.e. empty;
  // no further setup is needed.
}
|
| 305 |
+
|
| 306 |
+
// Releases everything still held by the container: queue items that were
// never processed (together with their hypotheses) and all backwards edges.
BitmapContainer::~BitmapContainer()
{
  // Drain the queue; each leftover item and its hypothesis are deleted.
  while (!m_queue.empty()) {
    HypothesisQueueItem *leftover = m_queue.top();
    m_queue.pop();

    delete leftover->GetHypothesis();
    delete leftover;
  }

  // Delete all edges.
  RemoveAllInColl(m_edges);

  m_hypotheses.clear();
  m_edges.clear();
}
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
void
|
| 327 |
+
BitmapContainer::Enqueue(int hypothesis_pos
|
| 328 |
+
, int translation_pos
|
| 329 |
+
, Hypothesis *hypothesis
|
| 330 |
+
, BackwardsEdge *edge)
|
| 331 |
+
{
|
| 332 |
+
// Only supply target phrase if running deterministic search mode
|
| 333 |
+
const TargetPhrase *target_phrase = m_deterministic ? &(hypothesis->GetCurrTargetPhrase()) : NULL;
|
| 334 |
+
HypothesisQueueItem *item = new HypothesisQueueItem(hypothesis_pos
|
| 335 |
+
, translation_pos
|
| 336 |
+
, hypothesis
|
| 337 |
+
, edge
|
| 338 |
+
, target_phrase);
|
| 339 |
+
IFVERBOSE(2) {
|
| 340 |
+
item->GetHypothesis()->GetManager().GetSentenceStats().StartTimeManageCubes();
|
| 341 |
+
}
|
| 342 |
+
m_queue.push(item);
|
| 343 |
+
IFVERBOSE(2) {
|
| 344 |
+
item->GetHypothesis()->GetManager().GetSentenceStats().StopTimeManageCubes();
|
| 345 |
+
}
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
// Returns the item at the head of m_queue, or NULL if the queue is empty.
// With keepValue == true the item stays in the queue (a peek); otherwise it
// is removed.
HypothesisQueueItem*
BitmapContainer::Dequeue(bool keepValue)
{
  if (m_queue.empty()) {
    return NULL;
  }

  HypothesisQueueItem *head = m_queue.top();
  if (!keepValue) {
    m_queue.pop();
  }
  return head;
}
|
| 363 |
+
|
| 364 |
+
// Returns the item at the head of m_queue without removing it, or NULL if
// the queue is empty. The emptiness check mirrors Dequeue(): asking an empty
// queue for its top element is not defined.
HypothesisQueueItem*
BitmapContainer::Top() const
{
  if (m_queue.empty()) {
    return NULL;
  }
  return m_queue.top();
}
|
| 369 |
+
|
| 370 |
+
size_t
|
| 371 |
+
BitmapContainer::Size()
|
| 372 |
+
{
|
| 373 |
+
return m_queue.size();
|
| 374 |
+
}
|
| 375 |
+
|
| 376 |
+
bool
|
| 377 |
+
BitmapContainer::Empty() const
|
| 378 |
+
{
|
| 379 |
+
return m_queue.empty();
|
| 380 |
+
}
|
| 381 |
+
|
| 382 |
+
// Read-only access to the hypotheses added through AddHypothesis().
const HypothesisSet&
BitmapContainer::GetHypotheses() const
{
  return m_hypotheses;
}
|
| 387 |
+
|
| 388 |
+
size_t
|
| 389 |
+
BitmapContainer::GetHypothesesSize() const
|
| 390 |
+
{
|
| 391 |
+
return m_hypotheses.size();
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
// Read-only access to the edges registered through AddBackwardsEdge().
const BackwardsEdgeSet&
BitmapContainer::GetBackwardsEdges()
{
  return m_edges;
}
|
| 399 |
+
|
| 400 |
+
void
|
| 401 |
+
BitmapContainer::AddHypothesis(Hypothesis *hypothesis)
|
| 402 |
+
{
|
| 403 |
+
bool itemExists = false;
|
| 404 |
+
HypothesisSet::const_iterator iter = m_hypotheses.begin();
|
| 405 |
+
HypothesisSet::const_iterator iterEnd = m_hypotheses.end();
|
| 406 |
+
|
| 407 |
+
// cfedermann: do we actually need this check?
|
| 408 |
+
while (iter != iterEnd) {
|
| 409 |
+
if (*iter == hypothesis) {
|
| 410 |
+
itemExists = true;
|
| 411 |
+
break;
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
++iter;
|
| 415 |
+
}
|
| 416 |
+
UTIL_THROW_IF2(itemExists, "Duplicate hypotheses");
|
| 417 |
+
m_hypotheses.push_back(hypothesis);
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
void
|
| 421 |
+
BitmapContainer::AddBackwardsEdge(BackwardsEdge *edge)
|
| 422 |
+
{
|
| 423 |
+
m_edges.insert(edge);
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
void
|
| 427 |
+
BitmapContainer::InitializeEdges()
|
| 428 |
+
{
|
| 429 |
+
BackwardsEdgeSet::iterator iter = m_edges.begin();
|
| 430 |
+
BackwardsEdgeSet::iterator iterEnd = m_edges.end();
|
| 431 |
+
|
| 432 |
+
while (iter != iterEnd) {
|
| 433 |
+
BackwardsEdge *edge = *iter;
|
| 434 |
+
edge->Initialize();
|
| 435 |
+
|
| 436 |
+
++iter;
|
| 437 |
+
}
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
void
|
| 441 |
+
BitmapContainer::EnsureMinStackHyps(const size_t minNumHyps)
|
| 442 |
+
{
|
| 443 |
+
while ((!Empty()) && m_numStackInsertions < minNumHyps) {
|
| 444 |
+
ProcessBestHypothesis();
|
| 445 |
+
}
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
void
|
| 449 |
+
BitmapContainer::ProcessBestHypothesis()
|
| 450 |
+
{
|
| 451 |
+
if (m_queue.empty()) {
|
| 452 |
+
return;
|
| 453 |
+
}
|
| 454 |
+
|
| 455 |
+
// Get the currently best hypothesis from the queue.
|
| 456 |
+
HypothesisQueueItem *item = Dequeue();
|
| 457 |
+
|
| 458 |
+
// If the priority queue is exhausted, we are done and should have exited
|
| 459 |
+
UTIL_THROW_IF2(item == NULL, "Null object");
|
| 460 |
+
|
| 461 |
+
// check we are pulling things off of priority queue in right order
|
| 462 |
+
if (!Empty()) {
|
| 463 |
+
HypothesisQueueItem *check = Dequeue(true);
|
| 464 |
+
UTIL_THROW_IF2(item->GetHypothesis()->GetFutureScore() < check->GetHypothesis()->GetFutureScore(),
|
| 465 |
+
"Non-monotonic total score: "
|
| 466 |
+
<< item->GetHypothesis()->GetFutureScore() << " vs. "
|
| 467 |
+
<< check->GetHypothesis()->GetFutureScore());
|
| 468 |
+
}
|
| 469 |
+
|
| 470 |
+
// Logging for the criminally insane
|
| 471 |
+
IFVERBOSE(3) {
|
| 472 |
+
item->GetHypothesis()->PrintHypothesis();
|
| 473 |
+
}
|
| 474 |
+
|
| 475 |
+
// Add best hypothesis to hypothesis stack.
|
| 476 |
+
const bool newstackentry = m_stack.AddPrune(item->GetHypothesis());
|
| 477 |
+
if (newstackentry)
|
| 478 |
+
m_numStackInsertions++;
|
| 479 |
+
|
| 480 |
+
IFVERBOSE(3) {
|
| 481 |
+
TRACE_ERR("new stack entry flag is " << newstackentry << std::endl);
|
| 482 |
+
}
|
| 483 |
+
|
| 484 |
+
// Create new hypotheses for the two successors of the hypothesis just added.
|
| 485 |
+
item->GetBackwardsEdge()->PushSuccessors(item->GetHypothesisPos(), item->GetTranslationPos());
|
| 486 |
+
|
| 487 |
+
// We are done with the queue item, we delete it.
|
| 488 |
+
delete item;
|
| 489 |
+
}
|
| 490 |
+
|
| 491 |
+
void
|
| 492 |
+
BitmapContainer::SortHypotheses()
|
| 493 |
+
{
|
| 494 |
+
std::sort(m_hypotheses.begin(), m_hypotheses.end(), HypothesisScoreOrderer(m_deterministic));
|
| 495 |
+
}
|
| 496 |
+
|
| 497 |
+
}
|
| 498 |
+
|
mosesdecoder/moses/Bitmaps.h
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <boost/unordered_set.hpp>
|
| 4 |
+
#include <boost/unordered_map.hpp>
|
| 5 |
+
#include <set>
|
| 6 |
+
#include "Bitmap.h"
|
| 7 |
+
#include "Util.h"
|
| 8 |
+
|
| 9 |
+
namespace Moses
|
| 10 |
+
{
|
| 11 |
+
|
| 12 |
+
class Bitmaps
|
| 13 |
+
{
|
| 14 |
+
typedef boost::unordered_map<Range, const Bitmap*> NextBitmaps;
|
| 15 |
+
typedef boost::unordered_map<const Bitmap*, NextBitmaps, UnorderedComparer<Bitmap>, UnorderedComparer<Bitmap> > Coll;
|
| 16 |
+
//typedef std::set<const Bitmap*, OrderedComparer<Bitmap> > Coll;
|
| 17 |
+
Coll m_coll;
|
| 18 |
+
const Bitmap *m_initBitmap;
|
| 19 |
+
|
| 20 |
+
const Bitmap &GetNextBitmap(const Bitmap &bm, const Range &range);
|
| 21 |
+
public:
|
| 22 |
+
Bitmaps(size_t inputSize, const std::vector<bool> &initSourceCompleted);
|
| 23 |
+
virtual ~Bitmaps();
|
| 24 |
+
|
| 25 |
+
const Bitmap &GetInitialBitmap() const {
|
| 26 |
+
return *m_initBitmap;
|
| 27 |
+
}
|
| 28 |
+
const Bitmap &GetBitmap(const Bitmap &bm, const Range &range);
|
| 29 |
+
|
| 30 |
+
};
|
| 31 |
+
|
| 32 |
+
}
|
mosesdecoder/moses/ChartManager.h
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2010 Hieu Hoang
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#pragma once
|
| 23 |
+
|
| 24 |
+
#include <vector>
|
| 25 |
+
#include <boost/unordered_map.hpp>
|
| 26 |
+
#include "ChartCell.h"
|
| 27 |
+
#include "ChartCellCollection.h"
|
| 28 |
+
#include "Range.h"
|
| 29 |
+
#include "SentenceStats.h"
|
| 30 |
+
#include "ChartTranslationOptionList.h"
|
| 31 |
+
#include "ChartParser.h"
|
| 32 |
+
#include "ChartKBestExtractor.h"
|
| 33 |
+
#include "BaseManager.h"
|
| 34 |
+
#include "moses/Syntax/KBestExtractor.h"
|
| 35 |
+
|
| 36 |
+
namespace Moses
|
| 37 |
+
{
|
| 38 |
+
|
| 39 |
+
class ChartHypothesis;
|
| 40 |
+
class ChartSearchGraphWriter;
|
| 41 |
+
|
| 42 |
+
/** Holds everything you need to decode 1 sentence with the hierachical/syntax decoder
|
| 43 |
+
*/
|
| 44 |
+
class ChartManager : public BaseManager
|
| 45 |
+
{
|
| 46 |
+
private:
|
| 47 |
+
ChartCellCollection m_hypoStackColl;
|
| 48 |
+
std::auto_ptr<SentenceStats> m_sentenceStats;
|
| 49 |
+
clock_t m_start; /**< starting time, used for logging */
|
| 50 |
+
unsigned m_hypothesisId; /* For handing out hypothesis ids to ChartHypothesis */
|
| 51 |
+
|
| 52 |
+
ChartParser m_parser;
|
| 53 |
+
|
| 54 |
+
ChartTranslationOptionList m_translationOptionList; /**< pre-computed list of translation options for the phrases in this sentence */
|
| 55 |
+
|
| 56 |
+
/* auxilliary functions for SearchGraphs */
|
| 57 |
+
void FindReachableHypotheses(
|
| 58 |
+
const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable , size_t* winners, size_t* losers) const;
|
| 59 |
+
void WriteSearchGraph(const ChartSearchGraphWriter& writer) const;
|
| 60 |
+
|
| 61 |
+
// output
|
| 62 |
+
void OutputNBestList(OutputCollector *collector,
|
| 63 |
+
const ChartKBestExtractor::KBestVec &nBestList,
|
| 64 |
+
long translationId) const;
|
| 65 |
+
size_t CalcSourceSize(const Moses::ChartHypothesis *hypo) const;
|
| 66 |
+
size_t OutputAlignmentNBest(Alignments &retAlign,
|
| 67 |
+
const Moses::ChartKBestExtractor::Derivation &derivation,
|
| 68 |
+
size_t startTarget) const;
|
| 69 |
+
size_t OutputAlignment(Alignments &retAlign,
|
| 70 |
+
const Moses::ChartHypothesis *hypo,
|
| 71 |
+
size_t startTarget) const;
|
| 72 |
+
void OutputDetailedTranslationReport(
|
| 73 |
+
OutputCollector *collector,
|
| 74 |
+
const ChartHypothesis *hypo,
|
| 75 |
+
const Sentence &sentence,
|
| 76 |
+
long translationId) const;
|
| 77 |
+
void OutputTranslationOptions(std::ostream &out,
|
| 78 |
+
ApplicationContext &applicationContext,
|
| 79 |
+
const ChartHypothesis *hypo,
|
| 80 |
+
const Sentence &sentence,
|
| 81 |
+
long translationId) const;
|
| 82 |
+
void OutputTranslationOption(std::ostream &out,
|
| 83 |
+
ApplicationContext &applicationContext,
|
| 84 |
+
const ChartHypothesis *hypo,
|
| 85 |
+
const Sentence &sentence,
|
| 86 |
+
long translationId) const;
|
| 87 |
+
void ReconstructApplicationContext(const ChartHypothesis &hypo,
|
| 88 |
+
const Sentence &sentence,
|
| 89 |
+
ApplicationContext &context) const;
|
| 90 |
+
void OutputTreeFragmentsTranslationOptions(std::ostream &out,
|
| 91 |
+
ApplicationContext &applicationContext,
|
| 92 |
+
const ChartHypothesis *hypo,
|
| 93 |
+
const Sentence &sentence,
|
| 94 |
+
long translationId) const;
|
| 95 |
+
void OutputDetailedAllTranslationReport(
|
| 96 |
+
OutputCollector *collector,
|
| 97 |
+
const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList,
|
| 98 |
+
const Sentence &sentence,
|
| 99 |
+
long translationId) const;
|
| 100 |
+
void OutputBestHypo(OutputCollector *collector, const ChartHypothesis *hypo, long translationId) const;
|
| 101 |
+
void Backtrack(const ChartHypothesis *hypo) const;
|
| 102 |
+
|
| 103 |
+
public:
|
| 104 |
+
ChartManager(ttasksptr const& ttask);
|
| 105 |
+
~ChartManager();
|
| 106 |
+
void Decode();
|
| 107 |
+
void AddXmlChartOptions();
|
| 108 |
+
const ChartHypothesis *GetBestHypothesis() const;
|
| 109 |
+
void CalcNBest(size_t n, std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > &nBestList, bool onlyDistinct=false) const;
|
| 110 |
+
|
| 111 |
+
/** "Moses" (osg) type format */
|
| 112 |
+
void OutputSearchGraphMoses(std::ostream &outputSearchGraphStream) const;
|
| 113 |
+
|
| 114 |
+
/** Output in (modified) Kenneth hypergraph format */
|
| 115 |
+
void OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const;
|
| 116 |
+
|
| 117 |
+
//! debug data collected when decoding sentence
|
| 118 |
+
// Returns a reference to the object m_sentenceStats points to (dereferenced without a null check).
SentenceStats& GetSentenceStats() const {
|
| 119 |
+
return *m_sentenceStats;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
//DIMw
|
| 123 |
+
// Read-only access to the m_hypoStackColl member.
const ChartCellCollection& GetChartCellCollection() const {
|
| 124 |
+
return m_hypoStackColl;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
// Empty body: this implementation does nothing.
void CalcDecoderStatistics() const {
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
//! Replace the current statistics object with a new SentenceStats built from \p source.
void ResetSentenceStats(const InputType& source) {
  // The new object is constructed first; the previously held one is released afterwards.
  m_sentenceStats.reset(new SentenceStats(source));
}
|
| 133 |
+
|
| 134 |
+
//! contiguous hypo id for each input sentence. For debugging purposes
|
| 135 |
+
// Returns the current value of m_hypothesisId, then increments the member (post-increment).
unsigned GetNextHypoId() {
|
| 136 |
+
return m_hypothesisId++;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
// Read-only access to the m_parser member.
const ChartParser &GetParser() const {
|
| 140 |
+
return m_parser;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
// outputs
|
| 144 |
+
void OutputBest(OutputCollector *collector) const;
|
| 145 |
+
void OutputNBest(OutputCollector *collector) const;
|
| 146 |
+
// Empty body: nothing is written to the collector by this implementation.
void OutputLatticeSamples(OutputCollector *collector) const {
|
| 147 |
+
}
|
| 148 |
+
void OutputAlignment(OutputCollector *collector) const;
|
| 149 |
+
void OutputDetailedTranslationReport(OutputCollector *collector) const;
|
| 150 |
+
void OutputUnknowns(OutputCollector *collector) const;
|
| 151 |
+
void OutputDetailedTreeFragmentsTranslationReport(OutputCollector *collector) const;
|
| 152 |
+
// Empty body: nothing is written to the collector by this implementation.
void OutputWordGraph(OutputCollector *collector) const {
|
| 153 |
+
}
|
| 154 |
+
void OutputSearchGraph(OutputCollector *collector) const;
|
| 155 |
+
// Empty body: this implementation does nothing.
void OutputSearchGraphSLF() const {
|
| 156 |
+
}
|
| 157 |
+
// void OutputSearchGraphHypergraph() const;
|
| 158 |
+
|
| 159 |
+
};
|
| 160 |
+
|
| 161 |
+
}
|
| 162 |
+
|
mosesdecoder/moses/ChartTranslationOptions.cpp
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - factored phrase-based language decoder
|
| 3 |
+
Copyright (C) 2010 Hieu Hoang
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "ChartTranslationOptions.h"
|
| 21 |
+
#include "ChartHypothesis.h"
|
| 22 |
+
#include "ChartCellLabel.h"
|
| 23 |
+
#include "ChartTranslationOption.h"
|
| 24 |
+
#include "InputPath.h"
|
| 25 |
+
#include "StaticData.h"
|
| 26 |
+
#include "TranslationTask.h"
|
| 27 |
+
|
| 28 |
+
using namespace std;
|
| 29 |
+
|
| 30 |
+
namespace Moses
|
| 31 |
+
{
|
| 32 |
+
|
| 33 |
+
/** Build the set of translation options for one rule application.
 *
 * Copies \p stackVec, stores the address of \p range (not a copy) and the
 * score estimate, then wraps every target phrase of \p targetPhraseColl in a
 * newly allocated ChartTranslationOption, appended to m_collection in
 * iteration order.
 */
ChartTranslationOptions::ChartTranslationOptions(const TargetPhraseCollection &targetPhraseColl,
    const StackVec &stackVec,
    const Range &range,
    float score)
  : m_stackVec(stackVec)
  , m_wordsRange(&range)
  , m_estimateOfBestScore(score)
{
  for (TargetPhraseCollection::const_iterator tp = targetPhraseColl.begin();
       tp != targetPhraseColl.end(); ++tp) {
    // *tp is a pointer to the phrase; the option is constructed from the phrase itself
    m_collection.push_back(
      boost::shared_ptr<ChartTranslationOption>(new ChartTranslationOption(**tp)));
  }
}
|
| 49 |
+
|
| 50 |
+
// Destructor with an empty body; no explicit cleanup is performed here.
ChartTranslationOptions::~ChartTranslationOptions()
|
| 51 |
+
{
|
| 52 |
+
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
//! functor to compare (chart) hypotheses by (descending) score
|
| 56 |
+
class ChartTranslationOptionScoreOrderer
|
| 57 |
+
{
|
| 58 |
+
public:
|
| 59 |
+
bool operator()(const boost::shared_ptr<ChartTranslationOption> &transOptA
|
| 60 |
+
, const boost::shared_ptr<ChartTranslationOption> &transOptB) const {
|
| 61 |
+
const ScoreComponentCollection &scoresA = transOptA->GetScores();
|
| 62 |
+
const ScoreComponentCollection &scoresB = transOptB->GetScores();
|
| 63 |
+
return scoresA.GetWeightedScore() > scoresB.GetWeightedScore();
|
| 64 |
+
}
|
| 65 |
+
};
|
| 66 |
+
|
| 67 |
+
void ChartTranslationOptions::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
|
| 68 |
+
{
|
| 69 |
+
SetInputPath(&inputPath);
|
| 70 |
+
// if (StaticData::Instance().GetPlaceholderFactor() != NOT_FOUND) {
|
| 71 |
+
if (inputPath.ttask->options()->input.placeholder_factor != NOT_FOUND) {
|
| 72 |
+
CreateSourceRuleFromInputPath();
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
CollType::iterator iter;
|
| 76 |
+
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
|
| 77 |
+
ChartTranslationOption &transOpt = **iter;
|
| 78 |
+
transOpt.SetInputPath(&inputPath);
|
| 79 |
+
transOpt.EvaluateWithSourceContext(input, inputPath, m_stackVec);
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
// get rid of -inf trans opts
|
| 83 |
+
size_t numDiscard = 0;
|
| 84 |
+
for (size_t i = 0; i < m_collection.size(); ++i) {
|
| 85 |
+
ChartTranslationOption *transOpt = m_collection[i].get();
|
| 86 |
+
|
| 87 |
+
if (transOpt->GetScores().GetWeightedScore() == - std::numeric_limits<float>::infinity()) {
|
| 88 |
+
++numDiscard;
|
| 89 |
+
} else if (numDiscard) {
|
| 90 |
+
m_collection[i - numDiscard] = m_collection[i];
|
| 91 |
+
}
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
size_t newSize = m_collection.size() - numDiscard;
|
| 95 |
+
m_collection.resize(newSize);
|
| 96 |
+
|
| 97 |
+
// sort if necessary
|
| 98 |
+
const StaticData &staticData = StaticData::Instance();
|
| 99 |
+
if (staticData.RequireSortingAfterSourceContext()) {
|
| 100 |
+
std::sort(m_collection.begin()
|
| 101 |
+
, m_collection.begin() + newSize
|
| 102 |
+
, ChartTranslationOptionScoreOrderer());
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
void ChartTranslationOptions::SetInputPath(const InputPath *inputPath)
|
| 108 |
+
{
|
| 109 |
+
CollType::iterator iter;
|
| 110 |
+
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
|
| 111 |
+
ChartTranslationOption &transOpt = **iter;
|
| 112 |
+
transOpt.SetInputPath(inputPath);
|
| 113 |
+
}
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
void ChartTranslationOptions::CreateSourceRuleFromInputPath()
|
| 117 |
+
{
|
| 118 |
+
if (m_collection.size() == 0) {
|
| 119 |
+
return;
|
| 120 |
+
}
|
| 121 |
+
|
| 122 |
+
const InputPath *inputPath = m_collection.front()->GetInputPath();
|
| 123 |
+
assert(inputPath);
|
| 124 |
+
std::vector<const Word*> &ruleSourceFromInputPath = inputPath->AddRuleSourceFromInputPath();
|
| 125 |
+
|
| 126 |
+
size_t chartCellIndex = 0;
|
| 127 |
+
const ChartCellLabel *chartCellLabel = (chartCellIndex < m_stackVec.size()) ? m_stackVec[chartCellIndex] : NULL;
|
| 128 |
+
|
| 129 |
+
size_t ind = 0;
|
| 130 |
+
for (size_t sourcePos = m_wordsRange->GetStartPos(); sourcePos <= m_wordsRange->GetEndPos(); ++sourcePos, ++ind) {
|
| 131 |
+
if (chartCellLabel) {
|
| 132 |
+
if (sourcePos == chartCellLabel->GetCoverage().GetEndPos()) {
|
| 133 |
+
// end of child range. push an empty word to denote non-term
|
| 134 |
+
ruleSourceFromInputPath.push_back(NULL);
|
| 135 |
+
++chartCellIndex;
|
| 136 |
+
chartCellLabel = (chartCellIndex < m_stackVec.size()) ? m_stackVec[chartCellIndex] : NULL;
|
| 137 |
+
} else if (sourcePos >= chartCellLabel->GetCoverage().GetStartPos()) {
|
| 138 |
+
// in the range of child hypo. do nothing
|
| 139 |
+
} else {
|
| 140 |
+
// not yet reached child range. add word
|
| 141 |
+
ruleSourceFromInputPath.push_back(&inputPath->GetPhrase().GetWord(ind));
|
| 142 |
+
}
|
| 143 |
+
} else {
|
| 144 |
+
// no child in sight. add word
|
| 145 |
+
ruleSourceFromInputPath.push_back(&inputPath->GetPhrase().GetWord(ind));
|
| 146 |
+
}
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
// save it to each trans opt
|
| 150 |
+
CollType::iterator iter;
|
| 151 |
+
for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
|
| 152 |
+
ChartTranslationOption &transOpt = **iter;
|
| 153 |
+
transOpt.SetSourceRuleFromInputPath(&ruleSourceFromInputPath);
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
std::ostream& operator<<(std::ostream &out, const ChartTranslationOptions &obj)
|
| 159 |
+
{
|
| 160 |
+
for (size_t i = 0; i < obj.m_collection.size(); ++i) {
|
| 161 |
+
const ChartTranslationOption &transOpt = *obj.m_collection[i];
|
| 162 |
+
out << transOpt << endl;
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
return out;
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
}
|
mosesdecoder/moses/DecodeStepGeneration.cpp
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include "DecodeStepGeneration.h"
|
| 23 |
+
#include "GenerationDictionary.h"
|
| 24 |
+
#include "TranslationOption.h"
|
| 25 |
+
#include "TranslationOptionCollection.h"
|
| 26 |
+
#include "PartialTranslOptColl.h"
|
| 27 |
+
#include "FactorCollection.h"
|
| 28 |
+
|
| 29 |
+
namespace Moses
|
| 30 |
+
{
|
| 31 |
+
using namespace std;
|
| 32 |
+
|
| 33 |
+
// Constructor: forwards dict, prev and features unchanged to the DecodeStep base class; the body is empty.
DecodeStepGeneration::DecodeStepGeneration(GenerationDictionary* dict,
|
| 34 |
+
const DecodeStep* prev,
|
| 35 |
+
const std::vector<FeatureFunction*> &features)
|
| 36 |
+
: DecodeStep(dict, prev, features)
|
| 37 |
+
{
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
// helper types used to enumerate generation expansions
|
| 41 |
+
// one candidate output word together with its score
typedef pair<Word, ScoreComponentCollection> WordPair;
|
| 42 |
+
// list of candidates; Process() keeps one such list per target-phrase position
typedef list< WordPair > WordList;
|
| 43 |
+
// 1st = word
|
| 44 |
+
// 2nd = score
|
| 45 |
+
// read-only iterator over a WordList
typedef list< WordPair >::const_iterator WordListIterator;
|
| 46 |
+
|
| 47 |
+
/** used in generation: increases iterators when looping through the exponential number of generation expansions */
|
| 48 |
+
inline void IncrementIterators(vector< WordListIterator > &wordListIterVector
|
| 49 |
+
, const vector< WordList > &wordListVector)
|
| 50 |
+
{
|
| 51 |
+
for (size_t currPos = 0 ; currPos < wordListVector.size() ; currPos++) {
|
| 52 |
+
WordListIterator &iter = wordListIterVector[currPos];
|
| 53 |
+
iter++;
|
| 54 |
+
if (iter != wordListVector[currPos].end()) {
|
| 55 |
+
// eg. 4 -> 5
|
| 56 |
+
return;
|
| 57 |
+
} else {
|
| 58 |
+
// eg 9 -> 10
|
| 59 |
+
iter = wordListVector[currPos].begin();
|
| 60 |
+
}
|
| 61 |
+
}
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
void DecodeStepGeneration::Process(const TranslationOption &inputPartialTranslOpt
|
| 65 |
+
, const DecodeStep &decodeStep
|
| 66 |
+
, PartialTranslOptColl &outputPartialTranslOptColl
|
| 67 |
+
, TranslationOptionCollection * /* toc */
|
| 68 |
+
, bool /*adhereTableLimit*/) const
|
| 69 |
+
{
|
| 70 |
+
if (inputPartialTranslOpt.GetTargetPhrase().GetSize() == 0) {
|
| 71 |
+
// word deletion
|
| 72 |
+
|
| 73 |
+
TranslationOption *newTransOpt = new TranslationOption(inputPartialTranslOpt);
|
| 74 |
+
outputPartialTranslOptColl.Add(newTransOpt);
|
| 75 |
+
|
| 76 |
+
return;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
// normal generation step
|
| 80 |
+
const GenerationDictionary* generationDictionary = decodeStep.GetGenerationDictionaryFeature();
|
| 81 |
+
|
| 82 |
+
const Phrase &targetPhrase = inputPartialTranslOpt.GetTargetPhrase();
|
| 83 |
+
const InputPath &inputPath = inputPartialTranslOpt.GetInputPath();
|
| 84 |
+
size_t targetLength = targetPhrase.GetSize();
|
| 85 |
+
|
| 86 |
+
// generation list for each word in phrase
|
| 87 |
+
vector< WordList > wordListVector(targetLength);
|
| 88 |
+
|
| 89 |
+
// create generation list
|
| 90 |
+
int wordListVectorPos = 0;
|
| 91 |
+
for (size_t currPos = 0 ; currPos < targetLength ; currPos++) { // going thorugh all words
|
| 92 |
+
// generatable factors for this word to be put in wordList
|
| 93 |
+
WordList &wordList = wordListVector[wordListVectorPos];
|
| 94 |
+
const Word &word = targetPhrase.GetWord(currPos);
|
| 95 |
+
|
| 96 |
+
// consult dictionary for possible generations for this word
|
| 97 |
+
const OutputWordCollection *wordColl = generationDictionary->FindWord(word);
|
| 98 |
+
|
| 99 |
+
if (wordColl == NULL) {
|
| 100 |
+
// word not found in generation dictionary
|
| 101 |
+
//toc->ProcessUnknownWord(sourceWordsRange.GetStartPos(), factorCollection);
|
| 102 |
+
return; // can't be part of a phrase, special handling
|
| 103 |
+
} else {
|
| 104 |
+
// sort(*wordColl, CompareWordCollScore);
|
| 105 |
+
OutputWordCollection::const_iterator iterWordColl;
|
| 106 |
+
for (iterWordColl = wordColl->begin() ; iterWordColl != wordColl->end(); ++iterWordColl) {
|
| 107 |
+
const Word &outputWord = (*iterWordColl).first;
|
| 108 |
+
const ScoreComponentCollection& score = (*iterWordColl).second;
|
| 109 |
+
// enter into word list generated factor(s) and its(their) score(s)
|
| 110 |
+
wordList.push_back(WordPair(outputWord, score));
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
wordListVectorPos++; // done, next word
|
| 114 |
+
}
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
// use generation list (wordList)
|
| 118 |
+
// set up iterators (total number of expansions)
|
| 119 |
+
size_t numIteration = 1;
|
| 120 |
+
vector< WordListIterator > wordListIterVector(targetLength);
|
| 121 |
+
vector< const Word* > mergeWords(targetLength);
|
| 122 |
+
for (size_t currPos = 0 ; currPos < targetLength ; currPos++) {
|
| 123 |
+
wordListIterVector[currPos] = wordListVector[currPos].begin();
|
| 124 |
+
numIteration *= wordListVector[currPos].size();
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
// go thru each possible factor for each word & create hypothesis
|
| 128 |
+
for (size_t currIter = 0 ; currIter < numIteration ; currIter++) {
|
| 129 |
+
ScoreComponentCollection generationScore; // total score for this string of words
|
| 130 |
+
|
| 131 |
+
// create vector of words with new factors for last phrase
|
| 132 |
+
for (size_t currPos = 0 ; currPos < targetLength ; currPos++) {
|
| 133 |
+
const WordPair &wordPair = *wordListIterVector[currPos];
|
| 134 |
+
mergeWords[currPos] = &(wordPair.first);
|
| 135 |
+
generationScore.PlusEquals(wordPair.second);
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
// merge with existing trans opt
|
| 139 |
+
Phrase genPhrase( mergeWords);
|
| 140 |
+
|
| 141 |
+
if (IsFilteringStep()) {
|
| 142 |
+
if (!inputPartialTranslOpt.IsCompatible(genPhrase, m_conflictFactors))
|
| 143 |
+
continue;
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
const TargetPhrase &inPhrase = inputPartialTranslOpt.GetTargetPhrase();
|
| 147 |
+
TargetPhrase outPhrase(inPhrase);
|
| 148 |
+
outPhrase.GetScoreBreakdown().PlusEquals(generationScore);
|
| 149 |
+
|
| 150 |
+
outPhrase.MergeFactors(genPhrase, m_newOutputFactors);
|
| 151 |
+
outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply);
|
| 152 |
+
|
| 153 |
+
const Range &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange();
|
| 154 |
+
|
| 155 |
+
TranslationOption *newTransOpt = new TranslationOption(sourceWordsRange, outPhrase);
|
| 156 |
+
assert(newTransOpt);
|
| 157 |
+
|
| 158 |
+
newTransOpt->SetInputPath(inputPath);
|
| 159 |
+
|
| 160 |
+
outputPartialTranslOptColl.Add(newTransOpt);
|
| 161 |
+
|
| 162 |
+
// increment iterators
|
| 163 |
+
IncrementIterators(wordListIterVector, wordListVector);
|
| 164 |
+
}
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
|
mosesdecoder/moses/FloydWarshall.cpp
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "util/exception.hh"
|
| 2 |
+
#include <climits>
|
| 3 |
+
#include <vector>
|
| 4 |
+
|
| 5 |
+
#define MAX_DIST (INT_MAX / 2)
|
| 6 |
+
|
| 7 |
+
//#include "FloydWarshall.h"
|
| 8 |
+
|
| 9 |
+
using namespace std;
|
| 10 |
+
|
| 11 |
+
// All-pairs shortest path algorithm
|
| 12 |
+
void floyd_warshall(const std::vector<std::vector<bool> >& edges, std::vector<std::vector<int> >& dist)
|
| 13 |
+
{
|
| 14 |
+
UTIL_THROW_IF2(edges.size() != edges.front().size(), "Error");
|
| 15 |
+
dist.clear();
|
| 16 |
+
dist.resize(edges.size(), std::vector<int>(edges.size(), 0));
|
| 17 |
+
|
| 18 |
+
size_t num_edges = edges.size();
|
| 19 |
+
|
| 20 |
+
for (size_t i=0; i<num_edges; ++i) {
|
| 21 |
+
for (size_t j=0; j<num_edges; ++j) {
|
| 22 |
+
if (edges[i][j])
|
| 23 |
+
dist[i][j] = 1;
|
| 24 |
+
else
|
| 25 |
+
dist[i][j] = MAX_DIST;
|
| 26 |
+
if (i == j) dist[i][j] = MAX_DIST;
|
| 27 |
+
}
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
for (size_t k=0; k<num_edges; ++k)
|
| 31 |
+
for (size_t i=0; i<num_edges; ++i)
|
| 32 |
+
for (size_t j=0; j<num_edges; ++j)
|
| 33 |
+
if (dist[i][j] > (dist[i][k] + dist[k][j]))
|
| 34 |
+
dist[i][j] = dist[i][k] + dist[k][j];
|
| 35 |
+
}
|
| 36 |
+
|
mosesdecoder/moses/HypothesisStack.h
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef moses_HypothesisStack_h
|
| 2 |
+
#define moses_HypothesisStack_h
|
| 3 |
+
|
| 4 |
+
#include <vector>
|
| 5 |
+
#include <set>
|
| 6 |
+
#include <boost/unordered_set.hpp>
|
| 7 |
+
#include "Hypothesis.h"
|
| 8 |
+
#include "Bitmap.h"
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
|
| 13 |
+
class Manager;
|
| 14 |
+
|
| 15 |
+
/** abstract unique set of hypotheses that cover a certain number of words,
|
| 16 |
+
* ie. a stack in phrase-based decoding
|
| 17 |
+
*/
|
| 18 |
+
class HypothesisStack
|
| 19 |
+
{
|
| 20 |
+
|
| 21 |
+
protected:
|
| 22 |
+
typedef boost::unordered_set< Hypothesis*, UnorderedComparer<Hypothesis>, UnorderedComparer<Hypothesis> > _HCType;
|
| 23 |
+
_HCType m_hypos; /**< contains hypotheses */
|
| 24 |
+
Manager& m_manager;
|
| 25 |
+
|
| 26 |
+
public:
|
| 27 |
+
HypothesisStack(Manager& manager): m_manager(manager) {}
|
| 28 |
+
typedef _HCType::iterator iterator;
|
| 29 |
+
typedef _HCType::const_iterator const_iterator;
|
| 30 |
+
//! iterators
|
| 31 |
+
const_iterator begin() const {
|
| 32 |
+
return m_hypos.begin();
|
| 33 |
+
}
|
| 34 |
+
const_iterator end() const {
|
| 35 |
+
return m_hypos.end();
|
| 36 |
+
}
|
| 37 |
+
size_t size() const {
|
| 38 |
+
return m_hypos.size();
|
| 39 |
+
}
|
| 40 |
+
virtual inline float GetWorstScore() const {
|
| 41 |
+
return -std::numeric_limits<float>::infinity();
|
| 42 |
+
};
|
| 43 |
+
virtual float GetWorstScoreForBitmap( WordsBitmapID ) {
|
| 44 |
+
return -std::numeric_limits<float>::infinity();
|
| 45 |
+
};
|
| 46 |
+
virtual float GetWorstScoreForBitmap( const Bitmap& ) {
|
| 47 |
+
return -std::numeric_limits<float>::infinity();
|
| 48 |
+
};
|
| 49 |
+
|
| 50 |
+
virtual ~HypothesisStack();
|
| 51 |
+
virtual bool AddPrune(Hypothesis *hypothesis) = 0;
|
| 52 |
+
virtual const Hypothesis *GetBestHypothesis() const = 0;
|
| 53 |
+
virtual std::vector<const Hypothesis*> GetSortedList() const = 0;
|
| 54 |
+
|
| 55 |
+
//! remove hypothesis pointed to by iterator but don't delete the object
|
| 56 |
+
virtual void Detach(const HypothesisStack::iterator &iter);
|
| 57 |
+
/** destroy Hypothesis pointed to by iterator (object pool version) */
|
| 58 |
+
virtual void Remove(const HypothesisStack::iterator &iter);
|
| 59 |
+
|
| 60 |
+
};
|
| 61 |
+
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
#endif
|
mosesdecoder/moses/Sentence.cpp
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
|
| 4 |
+
/***********************************************************************
|
| 5 |
+
Moses - factored phrase-based language decoder
|
| 6 |
+
Copyright (C) 2006 University of Edinburgh
|
| 7 |
+
|
| 8 |
+
This library is free software; you can redistribute it and/or
|
| 9 |
+
modify it under the terms of the GNU Lesser General Public
|
| 10 |
+
License as published by the Free Software Foundation; either
|
| 11 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 12 |
+
|
| 13 |
+
This library is distributed in the hope that it will be useful,
|
| 14 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 15 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 16 |
+
Lesser General Public License for more details.
|
| 17 |
+
|
| 18 |
+
You should have received a copy of the GNU Lesser General Public
|
| 19 |
+
License along with this library; if not, write to the Free Software
|
| 20 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 21 |
+
***********************************************************************/
|
| 22 |
+
|
| 23 |
+
#include <stdexcept>
|
| 24 |
+
#include <boost/algorithm/string.hpp>
|
| 25 |
+
#include <boost/foreach.hpp>
|
| 26 |
+
|
| 27 |
+
#include "Sentence.h"
|
| 28 |
+
#include "TranslationOptionCollectionText.h"
|
| 29 |
+
#include "StaticData.h"
|
| 30 |
+
#include "moses/FF/DynamicCacheBasedLanguageModel.h"
|
| 31 |
+
#include "moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h"
|
| 32 |
+
#include "ChartTranslationOptions.h"
|
| 33 |
+
#include "Util.h"
|
| 34 |
+
#include "XmlOption.h"
|
| 35 |
+
#include "FactorCollection.h"
|
| 36 |
+
#include "TranslationTask.h"
|
| 37 |
+
|
| 38 |
+
using namespace std;
|
| 39 |
+
|
| 40 |
+
namespace Moses
|
| 41 |
+
{
|
| 42 |
+
|
| 43 |
+
/** Construct an empty sentence bound to the option set \p opts.
 *
 * When the configured search algorithm is a syntax-based one, the default
 * input non-terminal is added to m_defaultLabelSet.
 */
Sentence::
Sentence(AllOptions::ptr const& opts) : Phrase(0) , InputType(opts)
{
  if (!is_syntax(opts->search.algo)) {
    return;
  }
  m_defaultLabelSet.insert(opts->syntax.input_default_non_terminal);
}
|
| 49 |
+
|
| 50 |
+
// Destructor: hands m_xmlOptions to RemoveAllInColl.
// NOTE(review): presumably this deletes the XML options owned by the sentence; confirm against the helper's definition.
Sentence::
|
| 51 |
+
~Sentence()
|
| 52 |
+
{
|
| 53 |
+
RemoveAllInColl(m_xmlOptions);
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
void
|
| 57 |
+
Sentence::
|
| 58 |
+
aux_init_partial_translation(string& line)
|
| 59 |
+
{
|
| 60 |
+
string sourceCompletedStr;
|
| 61 |
+
int loc1 = line.find( "|||", 0 );
|
| 62 |
+
int loc2 = line.find( "|||", loc1 + 3 );
|
| 63 |
+
if (loc1 > -1 && loc2 > -1) {
|
| 64 |
+
m_initialTargetPhrase = Trim(line.substr(0, loc1));
|
| 65 |
+
string scov = Trim(line.substr(loc1 + 3, loc2 - loc1 - 3));
|
| 66 |
+
line = line.substr(loc2 + 3);
|
| 67 |
+
|
| 68 |
+
m_sourceCompleted.resize(scov.size());
|
| 69 |
+
int contiguous = 1;
|
| 70 |
+
for (size_t i = 0; i < scov.size(); ++i) {
|
| 71 |
+
if (sourceCompletedStr.at(i) == '1') {
|
| 72 |
+
m_sourceCompleted[i] = true;
|
| 73 |
+
if (contiguous) m_frontSpanCoveredLength++;
|
| 74 |
+
} else {
|
| 75 |
+
m_sourceCompleted[i] = false;
|
| 76 |
+
contiguous = 0;
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
}
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
void
|
| 83 |
+
Sentence::
|
| 84 |
+
aux_interpret_sgml_markup(string& line)
|
| 85 |
+
{
|
| 86 |
+
// if sentences is specified as "<seg id=1> ... </seg>", extract id
|
| 87 |
+
typedef std::map<std::string, std::string> metamap;
|
| 88 |
+
metamap meta = ProcessAndStripSGML(line);
|
| 89 |
+
metamap::const_iterator i;
|
| 90 |
+
if ((i = meta.find("id")) != meta.end())
|
| 91 |
+
this->SetTranslationId(atol(i->second.c_str()));
|
| 92 |
+
if ((i = meta.find("docid")) != meta.end()) {
|
| 93 |
+
this->SetDocumentId(atol(i->second.c_str()));
|
| 94 |
+
this->SetUseTopicId(false);
|
| 95 |
+
this->SetUseTopicIdAndProb(false);
|
| 96 |
+
}
|
| 97 |
+
if ((i = meta.find("topic")) != meta.end()) {
|
| 98 |
+
vector<string> topic_params;
|
| 99 |
+
boost::split(topic_params, i->second, boost::is_any_of("\t "));
|
| 100 |
+
if (topic_params.size() == 1) {
|
| 101 |
+
this->SetTopicId(atol(topic_params[0].c_str()));
|
| 102 |
+
this->SetUseTopicId(true);
|
| 103 |
+
this->SetUseTopicIdAndProb(false);
|
| 104 |
+
} else {
|
| 105 |
+
this->SetTopicIdAndProb(topic_params);
|
| 106 |
+
this->SetUseTopicId(false);
|
| 107 |
+
this->SetUseTopicIdAndProb(true);
|
| 108 |
+
}
|
| 109 |
+
}
|
| 110 |
+
if ((i = meta.find("weight-setting")) != meta.end()) {
|
| 111 |
+
this->SetWeightSetting(i->second);
|
| 112 |
+
this->SetSpecifiesWeightSetting(true);
|
| 113 |
+
StaticData::Instance().SetWeightSetting(i->second);
|
| 114 |
+
// oh this is so horrible! Why does this have to be propagated globally?
|
| 115 |
+
// --- UG
|
| 116 |
+
} else this->SetSpecifiesWeightSetting(false);
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
void
|
| 120 |
+
Sentence::
|
| 121 |
+
aux_interpret_dlt(string& line) // whatever DLT means ... --- UG
|
| 122 |
+
{
|
| 123 |
+
using namespace std;
|
| 124 |
+
typedef map<string, string> str2str_map;
|
| 125 |
+
m_dlt_meta = ProcessAndStripDLT(line);
|
| 126 |
+
// what's happening below is most likely not thread-safe! UG
|
| 127 |
+
BOOST_FOREACH(str2str_map const& M, m_dlt_meta) {
|
| 128 |
+
str2str_map::const_iterator i,j;
|
| 129 |
+
if ((i = M.find("type")) != M.end()) {
|
| 130 |
+
j = M.find("id");
|
| 131 |
+
string id = j == M.end() ? "default" : j->second;
|
| 132 |
+
if (i->second == "cbtm") {
|
| 133 |
+
PhraseDictionaryDynamicCacheBased* cbtm;
|
| 134 |
+
cbtm = PhraseDictionaryDynamicCacheBased::InstanceNonConst(id);
|
| 135 |
+
if (cbtm) cbtm->ExecuteDlt(M);
|
| 136 |
+
}
|
| 137 |
+
if (i->second == "cblm") {
|
| 138 |
+
DynamicCacheBasedLanguageModel* cblm;
|
| 139 |
+
cblm = DynamicCacheBasedLanguageModel::InstanceNonConst(id);
|
| 140 |
+
if (cblm) cblm->ExecuteDlt(M);
|
| 141 |
+
}
|
| 142 |
+
}
|
| 143 |
+
}
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
void
|
| 147 |
+
Sentence::
|
| 148 |
+
aux_interpret_xml(std::string& line, std::vector<size_t> & xmlWalls,
|
| 149 |
+
std::vector<std::pair<size_t, std::string> >& placeholders)
|
| 150 |
+
{
|
| 151 |
+
// parse XML markup in translation line
|
| 152 |
+
using namespace std;
|
| 153 |
+
if (m_options->input.xml_policy != XmlPassThrough) {
|
| 154 |
+
bool OK = ProcessAndStripXMLTags(*m_options, line,
|
| 155 |
+
m_xmlOptions,
|
| 156 |
+
m_reorderingConstraint,
|
| 157 |
+
xmlWalls, placeholders,
|
| 158 |
+
*this);
|
| 159 |
+
if (!OK) {
|
| 160 |
+
TRACE_ERR("Unable to parse XML in line: " << line);
|
| 161 |
+
}
|
| 162 |
+
}
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
void
|
| 166 |
+
Sentence::
|
| 167 |
+
init(string line)
|
| 168 |
+
{
|
| 169 |
+
using namespace std;
|
| 170 |
+
|
| 171 |
+
m_frontSpanCoveredLength = 0;
|
| 172 |
+
m_sourceCompleted.resize(0);
|
| 173 |
+
|
| 174 |
+
if (m_options->input.continue_partial_translation)
|
| 175 |
+
aux_init_partial_translation(line);
|
| 176 |
+
|
| 177 |
+
line = Trim(line);
|
| 178 |
+
aux_interpret_sgml_markup(line); // for "<seg id=..." markup
|
| 179 |
+
aux_interpret_dlt(line); // some poorly documented cache-based stuff
|
| 180 |
+
|
| 181 |
+
// if sentences is specified as "<passthrough tag1=""/>"
|
| 182 |
+
if (m_options->output.PrintPassThrough ||m_options->nbest.include_passthrough) {
|
| 183 |
+
string pthru = PassthroughSGML(line,"passthrough");
|
| 184 |
+
this->SetPassthroughInformation(pthru);
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
vector<size_t> xmlWalls;
|
| 188 |
+
vector<pair<size_t, string> >placeholders;
|
| 189 |
+
aux_interpret_xml(line, xmlWalls, placeholders);
|
| 190 |
+
|
| 191 |
+
Phrase::CreateFromString(Input, m_options->input.factor_order, line, NULL);
|
| 192 |
+
|
| 193 |
+
ProcessPlaceholders(placeholders);
|
| 194 |
+
|
| 195 |
+
if (is_syntax(m_options->search.algo))
|
| 196 |
+
InitStartEndWord();
|
| 197 |
+
|
| 198 |
+
// now that we have final word positions in phrase (from
|
| 199 |
+
// CreateFromString), we can make input phrase objects to go with
|
| 200 |
+
// our XmlOptions and create TranslationOptions
|
| 201 |
+
|
| 202 |
+
// only fill the vector if we are parsing XML
|
| 203 |
+
if (m_options->input.xml_policy != XmlPassThrough) {
|
| 204 |
+
m_xmlCoverageMap.assign(GetSize(), false);
|
| 205 |
+
BOOST_FOREACH(XmlOption const* o, m_xmlOptions) {
|
| 206 |
+
Range const& r = o->range;
|
| 207 |
+
for(size_t j = r.GetStartPos(); j <= r.GetEndPos(); ++j)
|
| 208 |
+
m_xmlCoverageMap[j]=true;
|
| 209 |
+
}
|
| 210 |
+
}
|
| 211 |
+
|
| 212 |
+
// reordering walls and zones
|
| 213 |
+
m_reorderingConstraint.InitializeWalls(GetSize());
|
| 214 |
+
|
| 215 |
+
// set reordering walls, if "-monotone-at-punction" is set
|
| 216 |
+
if (m_options->reordering.monotone_at_punct && GetSize()) {
|
| 217 |
+
Range r(0, GetSize()-1);
|
| 218 |
+
m_reorderingConstraint.SetMonotoneAtPunctuation(GetSubString(r));
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
// set walls obtained from xml
|
| 222 |
+
for(size_t i=0; i<xmlWalls.size(); i++)
|
| 223 |
+
if(xmlWalls[i] < GetSize()) // no buggy walls, please
|
| 224 |
+
m_reorderingConstraint.SetWall(xmlWalls[i], true);
|
| 225 |
+
m_reorderingConstraint.FinalizeWalls();
|
| 226 |
+
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
int
|
| 230 |
+
Sentence::
|
| 231 |
+
Read(std::istream& in)
|
| 232 |
+
{
|
| 233 |
+
std::string line;
|
| 234 |
+
if (getline(in, line, '\n').eof())
|
| 235 |
+
return 0;
|
| 236 |
+
init(line);
|
| 237 |
+
return 1;
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
void
|
| 241 |
+
Sentence::
|
| 242 |
+
ProcessPlaceholders(const std::vector< std::pair<size_t, std::string> > &placeholders)
|
| 243 |
+
{
|
| 244 |
+
FactorType placeholderFactor = m_options->input.placeholder_factor;
|
| 245 |
+
if (placeholderFactor == NOT_FOUND) {
|
| 246 |
+
return;
|
| 247 |
+
}
|
| 248 |
+
|
| 249 |
+
for (size_t i = 0; i < placeholders.size(); ++i) {
|
| 250 |
+
size_t pos = placeholders[i].first;
|
| 251 |
+
const string &str = placeholders[i].second;
|
| 252 |
+
const Factor *factor = FactorCollection::Instance().AddFactor(str);
|
| 253 |
+
Word &word = Phrase::GetWord(pos);
|
| 254 |
+
word[placeholderFactor] = factor;
|
| 255 |
+
}
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
TranslationOptionCollection*
Sentence::
CreateTranslationOptionCollection(ttasksptr const& ttask) const
{
  // Returns a freshly heap-allocated text translation option collection
  // for this sentence.
  TranslationOptionCollection *collection =
    new TranslationOptionCollectionText(ttask, *this);
  assert(collection);
  return collection;
}
|
| 267 |
+
void Sentence::Print(std::ostream& out) const
|
| 268 |
+
{
|
| 269 |
+
out<<*static_cast<Phrase const*>(this);
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
bool Sentence::XmlOverlap(size_t startPos, size_t endPos) const
|
| 274 |
+
{
|
| 275 |
+
for (size_t pos = startPos; pos <= endPos ; pos++) {
|
| 276 |
+
if (pos < m_xmlCoverageMap.size() && m_xmlCoverageMap[pos]) {
|
| 277 |
+
return true;
|
| 278 |
+
}
|
| 279 |
+
}
|
| 280 |
+
return false;
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
void Sentence::GetXmlTranslationOptions(std::vector <TranslationOption*> &list) const
{
  // Append one newly allocated TranslationOption per XML option; the raw
  // pointers are handed to the caller through 'list'.
  typedef std::vector<XmlOption const*>::const_iterator OptIter;
  for (OptIter it = m_xmlOptions.begin(); it != m_xmlOptions.end(); ++it) {
    const XmlOption &opt = **it;
    list.push_back(new TranslationOption(opt.range, opt.targetPhrase));
  }
}
|
| 294 |
+
|
| 295 |
+
void Sentence::GetXmlTranslationOptions(std::vector <TranslationOption*> &list, size_t startPos, size_t endPos) const
{
  // Append a newly allocated TranslationOption for every XML option whose
  // range is exactly [startPos, endPos].
  typedef std::vector<XmlOption const*>::const_iterator OptIter;
  for (OptIter it = m_xmlOptions.begin(); it != m_xmlOptions.end(); ++it) {
    const XmlOption &opt = **it;
    const Range &span = opt.range;

    // only exact span matches qualify
    if (startPos != span.GetStartPos() || endPos != span.GetEndPos()) {
      continue;
    }

    list.push_back(new TranslationOption(span, opt.targetPhrase));
  }
}
|
| 313 |
+
|
| 314 |
+
std::vector <ChartTranslationOptions*>
|
| 315 |
+
Sentence::
|
| 316 |
+
GetXmlChartTranslationOptions() const
|
| 317 |
+
{
|
| 318 |
+
std::vector <ChartTranslationOptions*> ret;
|
| 319 |
+
|
| 320 |
+
// XML Options
|
| 321 |
+
// this code is a copy of the 1 in Sentence.
|
| 322 |
+
|
| 323 |
+
//only fill the vector if we are parsing XML
|
| 324 |
+
if (m_options->input.xml_policy != XmlPassThrough ) {
|
| 325 |
+
//TODO: needed to handle exclusive
|
| 326 |
+
//for (size_t i=0; i<GetSize(); i++) {
|
| 327 |
+
// m_xmlCoverageMap.push_back(false);
|
| 328 |
+
//}
|
| 329 |
+
|
| 330 |
+
//iterXMLOpts will be empty for XmlIgnore
|
| 331 |
+
//look at each column
|
| 332 |
+
for(std::vector<XmlOption const*>::const_iterator iterXmlOpts = m_xmlOptions.begin();
|
| 333 |
+
iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) {
|
| 334 |
+
|
| 335 |
+
const XmlOption &xmlOption = **iterXmlOpts;
|
| 336 |
+
TargetPhrase *targetPhrase = new TargetPhrase(xmlOption.targetPhrase);
|
| 337 |
+
|
| 338 |
+
Range *range = new Range(xmlOption.range);
|
| 339 |
+
StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted
|
| 340 |
+
|
| 341 |
+
TargetPhraseCollection *tpc = new TargetPhraseCollection;
|
| 342 |
+
tpc->Add(targetPhrase);
|
| 343 |
+
|
| 344 |
+
ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f);
|
| 345 |
+
ret.push_back(transOpt);
|
| 346 |
+
|
| 347 |
+
//TODO: needed to handle exclusive
|
| 348 |
+
//for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) {
|
| 349 |
+
// m_xmlCoverageMap[j]=true;
|
| 350 |
+
//}
|
| 351 |
+
}
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
return ret;
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
void
|
| 358 |
+
Sentence::
|
| 359 |
+
CreateFromString(vector<FactorType> const& FOrder, string const& phraseString)
|
| 360 |
+
{
|
| 361 |
+
Phrase::CreateFromString(Input, FOrder, phraseString, NULL);
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
Sentence::
Sentence(AllOptions::ptr const& opts, size_t const transId, string stext)
  : InputType(opts, transId)
{
  // Construct directly from raw input text: all parsing happens in init().
  init(stext);
}
|
| 370 |
+
|
| 371 |
+
}
|
| 372 |
+
|
mosesdecoder/moses/Syntax/Cube.h
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <queue>
|
| 4 |
+
#include <vector>
|
| 5 |
+
#include <utility>
|
| 6 |
+
|
| 7 |
+
#include <boost/unordered_set.hpp>
|
| 8 |
+
|
| 9 |
+
#include "SHyperedge.h"
|
| 10 |
+
#include "SHyperedgeBundle.h"
|
| 11 |
+
|
| 12 |
+
namespace Moses
|
| 13 |
+
{
|
| 14 |
+
namespace Syntax
|
| 15 |
+
{
|
| 16 |
+
|
| 17 |
+
// A cube -- in the cube pruning sense (see Chiang (2007)) -- that lazily
|
| 18 |
+
// produces SHyperedge objects from a SHyperedgeBundle in approximately
|
| 19 |
+
// best-first order.
|
| 20 |
+
class Cube
|
| 21 |
+
{
|
| 22 |
+
public:
|
| 23 |
+
Cube(const SHyperedgeBundle &);
|
| 24 |
+
~Cube();
|
| 25 |
+
|
| 26 |
+
SHyperedge *Pop();
|
| 27 |
+
|
| 28 |
+
SHyperedge *Top() const {
|
| 29 |
+
return m_queue.top().first;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
bool IsEmpty() const {
|
| 33 |
+
return m_queue.empty();
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
private:
|
| 37 |
+
typedef boost::unordered_set<std::vector<int> > CoordinateSet;
|
| 38 |
+
|
| 39 |
+
typedef std::pair<SHyperedge *, const std::vector<int> *> QueueItem;
|
| 40 |
+
|
| 41 |
+
class QueueItemOrderer
|
| 42 |
+
{
|
| 43 |
+
public:
|
| 44 |
+
bool operator()(const QueueItem &p, const QueueItem &q) const {
|
| 45 |
+
return p.first->label.futureScore < q.first->label.futureScore;
|
| 46 |
+
}
|
| 47 |
+
};
|
| 48 |
+
|
| 49 |
+
typedef std::priority_queue<QueueItem, std::vector<QueueItem>,
|
| 50 |
+
QueueItemOrderer> Queue;
|
| 51 |
+
|
| 52 |
+
SHyperedge *CreateHyperedge(const std::vector<int> &);
|
| 53 |
+
void CreateNeighbour(const std::vector<int> &);
|
| 54 |
+
void CreateNeighbours(const std::vector<int> &);
|
| 55 |
+
|
| 56 |
+
const SHyperedgeBundle &m_bundle;
|
| 57 |
+
CoordinateSet m_visited;
|
| 58 |
+
Queue m_queue;
|
| 59 |
+
};
|
| 60 |
+
|
| 61 |
+
} // Syntax
|
| 62 |
+
} // Moses
|
mosesdecoder/moses/Syntax/CubeQueue.cpp
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "CubeQueue.h"
|
| 2 |
+
|
| 3 |
+
namespace Moses
|
| 4 |
+
{
|
| 5 |
+
namespace Syntax
|
| 6 |
+
{
|
| 7 |
+
|
| 8 |
+
CubeQueue::~CubeQueue()
{
  // Release every cube still held by the queue. Each cube is removed from
  // the queue BEFORE it is deleted: pop() runs the comparator, which
  // dereferences the queued cubes.
  while (!m_queue.empty()) {
    Cube *remaining = m_queue.top();
    m_queue.pop();
    delete remaining;
  }
}
|
| 16 |
+
|
| 17 |
+
SHyperedge *CubeQueue::Pop()
{
  // take the most promising cube off the queue
  Cube *best = m_queue.top();
  m_queue.pop();

  // take the most promising hyperedge out of that cube
  SHyperedge *result = best->Pop();

  // an exhausted cube is destroyed; otherwise it goes back onto the queue
  if (best->IsEmpty()) {
    delete best;
  } else {
    m_queue.push(best);
  }

  return result;
}
|
| 35 |
+
|
| 36 |
+
} // Syntax
|
| 37 |
+
} // Moses
|
mosesdecoder/moses/Syntax/CubeQueue.h
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <queue>
|
| 4 |
+
#include <vector>
|
| 5 |
+
|
| 6 |
+
#include "Cube.h"
|
| 7 |
+
#include "SHyperedge.h"
|
| 8 |
+
#include "SHyperedgeBundle.h"
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
namespace Syntax
|
| 13 |
+
{
|
| 14 |
+
|
| 15 |
+
class CubeQueue
|
| 16 |
+
{
|
| 17 |
+
public:
|
| 18 |
+
template<typename InputIterator>
|
| 19 |
+
CubeQueue(InputIterator, InputIterator);
|
| 20 |
+
|
| 21 |
+
~CubeQueue();
|
| 22 |
+
|
| 23 |
+
SHyperedge *Pop();
|
| 24 |
+
|
| 25 |
+
bool IsEmpty() const {
|
| 26 |
+
return m_queue.empty();
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
private:
|
| 30 |
+
class CubeOrderer
|
| 31 |
+
{
|
| 32 |
+
public:
|
| 33 |
+
bool operator()(const Cube *p, const Cube *q) const {
|
| 34 |
+
return p->Top()->label.futureScore < q->Top()->label.futureScore;
|
| 35 |
+
}
|
| 36 |
+
};
|
| 37 |
+
|
| 38 |
+
typedef std::priority_queue<Cube*, std::vector<Cube*>, CubeOrderer> Queue;
|
| 39 |
+
|
| 40 |
+
Queue m_queue;
|
| 41 |
+
};
|
| 42 |
+
|
| 43 |
+
template<typename InputIterator>
|
| 44 |
+
CubeQueue::CubeQueue(InputIterator first, InputIterator last)
|
| 45 |
+
{
|
| 46 |
+
while (first != last) {
|
| 47 |
+
m_queue.push(new Cube(*first++));
|
| 48 |
+
}
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
} // Syntax
|
| 52 |
+
} // Moses
|
mosesdecoder/moses/Syntax/InputWeightFF.cpp
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "InputWeightFF.h"
|
| 2 |
+
|
| 3 |
+
#include <vector>
|
| 4 |
+
|
| 5 |
+
#include "moses/ScoreComponentCollection.h"
|
| 6 |
+
#include "moses/Syntax/SHyperedge.h"
|
| 7 |
+
#include "moses/TargetPhrase.h"
|
| 8 |
+
|
| 9 |
+
namespace Moses
|
| 10 |
+
{
|
| 11 |
+
namespace Syntax
|
| 12 |
+
{
|
| 13 |
+
|
| 14 |
+
// Constructs the feature with a single score component (first base-class
// argument) from its configuration line, then reads its parameters.
InputWeightFF::InputWeightFF(const std::string &line)
  : StatelessFeatureFunction(1, line)
{
  ReadParameters();
}
|
| 19 |
+
|
| 20 |
+
void InputWeightFF::EvaluateWhenApplied(const Hypothesis& hypo,
|
| 21 |
+
ScoreComponentCollection* accumulator) const
|
| 22 |
+
{
|
| 23 |
+
// TODO Throw exception.
|
| 24 |
+
assert(false);
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
void InputWeightFF::EvaluateWhenApplied(const ChartHypothesis &hypo,
|
| 28 |
+
ScoreComponentCollection* accumulator) const
|
| 29 |
+
{
|
| 30 |
+
// TODO Throw exception.
|
| 31 |
+
assert(false);
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
void InputWeightFF::EvaluateWhenApplied(
|
| 35 |
+
const Syntax::SHyperedge &hyperedge,
|
| 36 |
+
ScoreComponentCollection* accumulator) const
|
| 37 |
+
{
|
| 38 |
+
accumulator->PlusEquals(this, hyperedge.label.inputWeight);
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
void InputWeightFF::SetParameter(const std::string& key,
|
| 42 |
+
const std::string& value)
|
| 43 |
+
{
|
| 44 |
+
StatelessFeatureFunction::SetParameter(key, value);
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
} // Syntax
|
| 48 |
+
} // Moses
|
mosesdecoder/moses/Syntax/Manager.cpp
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <sstream>
|
| 2 |
+
#include "Manager.h"
|
| 3 |
+
#include "PVertex.h"
|
| 4 |
+
#include "moses/OutputCollector.h"
|
| 5 |
+
#include "moses/Util.h"
|
| 6 |
+
|
| 7 |
+
namespace Moses
|
| 8 |
+
{
|
| 9 |
+
namespace Syntax
|
| 10 |
+
{
|
| 11 |
+
|
| 12 |
+
// Passes the translation task on to the base manager; no further setup.
Manager::Manager(ttasksptr const& ttask)
  : Moses::BaseManager(ttask)
{
}
|
| 15 |
+
|
| 16 |
+
// Writes the single best translation (or an empty line if there is none) to
// the collector, optionally prefixed with its score. Does nothing when no
// collector is supplied.
void Manager::OutputBest(OutputCollector *collector) const
{
  if (!collector) {
    return;
  }

  std::ostringstream out;
  FixPrecision(out);

  const SHyperedge *best = GetBestSHyperedge();
  if (best != NULL) {
    if (options()->output.ReportHypoScore) {
      out << best->label.futureScore << " ";
    }
    Phrase yield = GetOneBestTargetYield(*best);
    // delete 1st & last
    UTIL_THROW_IF2(yield.GetSize() < 2,
                   "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
    yield.RemoveWord(0);
    yield.RemoveWord(yield.GetSize()-1);
    out << yield.GetStringRep(options()->output.factor_order);
    out << '\n';
  } else {
    // no derivation found: emit a zero score (if requested) and a blank line
    VERBOSE(1, "NO BEST TRANSLATION" << std::endl);
    if (options()->output.ReportHypoScore) {
      out << "0 ";
    }
    out << '\n';
  }

  collector->Write(m_source.GetTranslationId(), out.str());
}
|
| 45 |
+
|
| 46 |
+
// Extracts the k-best list for the current sentence and writes it to the
// collector. Does nothing when no collector is supplied.
void Manager::OutputNBest(OutputCollector *collector) const
{
  if (!collector) {
    return;
  }

  long translationId = m_source.GetTranslationId();
  KBestExtractor::KBestVec kBest;
  ExtractKBest(options()->nbest.nbest_size, kBest,
               options()->nbest.only_distinct);
  OutputNBestList(collector, kBest, translationId);
}
|
| 56 |
+
|
| 57 |
+
// Writes all out-of-vocabulary words of the current sentence to the
// collector as a single line. Does nothing when no collector is supplied.
void Manager::OutputUnknowns(OutputCollector *collector) const
{
  if (!collector) {
    return;
  }

  long translationId = m_source.GetTranslationId();

  std::ostringstream out;
  typedef boost::unordered_set<Moses::Word>::const_iterator OovIter;
  for (OovIter oov = m_oovs.begin(); oov != m_oovs.end(); ++oov) {
    out << *oov;
  }
  out << std::endl;

  collector->Write(translationId, out.str());
}
|
| 71 |
+
|
| 72 |
+
// Formats an n-best list and writes it to the collector as one block.
//
// Each derivation produces one line of the form
//   <translationId> ||| <surface> ||| <feature scores> ||| <total score>
// optionally followed by " ||| <word alignments>" and " ||| <tree>".
//
// collector     must not be NULL
// nBestList     derivations to print, in the order given
// translationId id printed on every line and used as the collector key
void Manager::OutputNBestList(OutputCollector *collector,
                              const KBestExtractor::KBestVec &nBestList,
                              long translationId) const
{
  // Check the precondition before the first dereference of collector.
  assert(collector);

  std::ostringstream out;

  if (collector->OutputIsCout()) {
    // Set precision only if we're writing the n-best list to cout. This is to
    // preserve existing behaviour, but should probably be done either way.
    FixPrecision(out);
  }

  bool includeWordAlignment = options()->nbest.include_alignment_info;
  bool PrintNBestTrees = options()->nbest.print_trees; // PrintNBestTrees();

  for (KBestExtractor::KBestVec::const_iterator p = nBestList.begin();
       p != nBestList.end(); ++p) {
    const KBestExtractor::Derivation &derivation = **p;

    // get the derivation's target-side yield
    Phrase outputPhrase = KBestExtractor::GetOutputPhrase(derivation);

    // delete <s> and </s>
    UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
                   "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
    outputPhrase.RemoveWord(0);
    outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);

    // print the translation ID, surface factors, and scores
    out << translationId << " ||| ";
    OutputSurface(out, outputPhrase);
    out << " ||| ";
    bool with_labels = options()->nbest.include_feature_labels;
    derivation.scoreBreakdown.OutputAllFeatureScores(out, with_labels);
    out << " ||| " << derivation.score;

    // optionally, print word alignments
    if (includeWordAlignment) {
      out << " ||| ";
      Alignments align;
      OutputAlignmentNBest(align, derivation, 0);
      for (Alignments::const_iterator q = align.begin(); q != align.end();
           ++q) {
        out << q->first << "-" << q->second << " ";
      }
    }

    // optionally, print tree
    if (PrintNBestTrees) {
      TreePointer tree = KBestExtractor::GetOutputTree(derivation);
      out << " ||| " << tree->GetString();
    }

    out << std::endl;
  }

  collector->Write(translationId, out.str());
}
|
| 133 |
+
|
| 134 |
+
// Recursively collects the word alignment points of a derivation.
//
// retAlign     output set; receives (source position, target position) pairs
// derivation   derivation whose rule (and sub-derivations) are processed
// startTarget  target-side offset at which this derivation's yield starts
//
// Returns the number of target words covered by this derivation, i.e. its
// own terminals plus the sizes returned for all sub-derivations.
// Throws (via UTIL_THROW_IF2) on inconsistent alignment information or when
// an alignment point is inserted twice.
std::size_t Manager::OutputAlignmentNBest(
  Alignments &retAlign,
  const KBestExtractor::Derivation &derivation,
  std::size_t startTarget) const
{
  const SHyperedge &shyperedge = derivation.edge->shyperedge;

  std::size_t totalTargetSize = 0;
  std::size_t startSource = shyperedge.head->pvertex->span.GetStartPos();

  const TargetPhrase &tp = *(shyperedge.label.translation);

  std::size_t thisSourceSize = CalcSourceSize(derivation);

  // position of each terminal word in translation rule, irrespective of
  // alignment if non-term, number is undefined
  std::vector<std::size_t> sourceOffsets(thisSourceSize, 0);
  std::vector<std::size_t> targetOffsets(tp.GetSize(), 0);

  const AlignmentInfo &aiNonTerm =
    shyperedge.label.translation->GetAlignNonTerm();
  std::vector<std::size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
  const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd =
    aiNonTerm.GetNonTermIndexMap();

  UTIL_THROW_IF2(sourceInd2pos.size() != derivation.subderivations.size(),
                 "Error");

  for (std::size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
    if (tp.GetWord(targetPos).IsNonTerminal()) {
      UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
      std::size_t sourceInd = targetPos2SourceInd[targetPos];
      std::size_t sourcePos = sourceInd2pos[sourceInd];

      const KBestExtractor::Derivation &subderivation =
        *derivation.subderivations[sourceInd];

      // calc source size
      std::size_t sourceSize =
        subderivation.edge->head->svertex.pvertex->span.GetNumWordsCovered();
      sourceOffsets[sourcePos] = sourceSize;

      // calc target size.
      // Recursively look thru child hypos
      std::size_t currStartTarget = startTarget + totalTargetSize;
      std::size_t targetSize = OutputAlignmentNBest(retAlign, subderivation,
                               currStartTarget);
      targetOffsets[targetPos] = targetSize;

      totalTargetSize += targetSize;
    } else {
      // a terminal contributes exactly one target word
      ++totalTargetSize;
    }
  }

  // convert position within translation rule to absolute position within
  // source sentence / output sentence
  ShiftOffsets(sourceOffsets, startSource);
  ShiftOffsets(targetOffsets, startTarget);

  // get alignments from this hypo
  const AlignmentInfo &aiTerm = shyperedge.label.translation->GetAlignTerm();

  // add to output arg, offsetting by source & target
  AlignmentInfo::const_iterator iter;
  for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
    const std::pair<std::size_t, std::size_t> &align = *iter;
    std::size_t relSource = align.first;
    std::size_t relTarget = align.second;
    std::size_t absSource = sourceOffsets[relSource];
    std::size_t absTarget = targetOffsets[relTarget];

    std::pair<std::size_t, std::size_t> alignPoint(absSource, absTarget);
    std::pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
    // every alignment point must be new
    UTIL_THROW_IF2(!ret.second, "Error");
  }

  return totalTargetSize;
}
|
| 215 |
+
|
| 216 |
+
// Returns the source-side length of the rule applied by derivation d: the
// number of words covered by the head span, with every tail (child) span
// collapsed to a single symbol.
std::size_t Manager::CalcSourceSize(const KBestExtractor::Derivation &d) const
{
  const SHyperedge &shyperedge = d.edge->shyperedge;
  std::size_t ret = shyperedge.head->pvertex->span.GetNumWordsCovered();
  for (std::size_t i = 0; i < shyperedge.tail.size(); ++i) {
    std::size_t childSize =
      shyperedge.tail[i]->pvertex->span.GetNumWordsCovered();
    // a child covering childSize words occupies one rule position
    ret -= (childSize - 1);
  }
  return ret;
}
|
| 227 |
+
|
| 228 |
+
} // Syntax
|
| 229 |
+
} // Moses
|
mosesdecoder/moses/Syntax/NonTerminalMap.h
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <vector>
|
| 4 |
+
|
| 5 |
+
#include <boost/unordered_map.hpp>
|
| 6 |
+
|
| 7 |
+
#include "moses/FactorCollection.h"
|
| 8 |
+
#include "moses/Word.h"
|
| 9 |
+
|
| 10 |
+
#include "SymbolEqualityPred.h"
|
| 11 |
+
#include "SymbolHasher.h"
|
| 12 |
+
|
| 13 |
+
namespace Moses
|
| 14 |
+
{
|
| 15 |
+
namespace Syntax
|
| 16 |
+
{
|
| 17 |
+
|
| 18 |
+
// Hybrid map/vector-based container for key-value pairs where the key is a
|
| 19 |
+
// non-terminal Word. The interface is like a (stripped-down) map type, with
|
| 20 |
+
// the main differences being that:
|
| 21 |
+
// 1. Find() is implemented using vector indexing to make it fast.
|
| 22 |
+
// 2. Once a value has been inserted it can be modified but can't be removed.
|
| 23 |
+
template<typename T>
|
| 24 |
+
class NonTerminalMap
|
| 25 |
+
{
|
| 26 |
+
private:
|
| 27 |
+
typedef boost::unordered_map<Word, T, SymbolHasher, SymbolEqualityPred> Map;
|
| 28 |
+
typedef std::vector<T*> Vec;
|
| 29 |
+
|
| 30 |
+
public:
|
| 31 |
+
typedef typename Map::iterator Iterator;
|
| 32 |
+
typedef typename Map::const_iterator ConstIterator;
|
| 33 |
+
|
| 34 |
+
NonTerminalMap()
|
| 35 |
+
: m_vec(FactorCollection::Instance().GetNumNonTerminals(), NULL) {}
|
| 36 |
+
|
| 37 |
+
Iterator Begin() {
|
| 38 |
+
return m_map.begin();
|
| 39 |
+
}
|
| 40 |
+
Iterator End() {
|
| 41 |
+
return m_map.end();
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
ConstIterator Begin() const {
|
| 45 |
+
return m_map.begin();
|
| 46 |
+
}
|
| 47 |
+
ConstIterator End() const {
|
| 48 |
+
return m_map.end();
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
std::size_t Size() const {
|
| 52 |
+
return m_map.size();
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
bool IsEmpty() const {
|
| 56 |
+
return m_map.empty();
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
std::pair<Iterator, bool> Insert(const Word &, const T &);
|
| 60 |
+
|
| 61 |
+
T *Find(const Word &w) const {
|
| 62 |
+
return m_vec[w[0]->GetId()];
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
private:
|
| 66 |
+
Map m_map;
|
| 67 |
+
Vec m_vec;
|
| 68 |
+
};
|
| 69 |
+
|
| 70 |
+
template<typename T>
|
| 71 |
+
std::pair<typename NonTerminalMap<T>::Iterator, bool> NonTerminalMap<T>::Insert(
|
| 72 |
+
const Word &key, const T &value)
|
| 73 |
+
{
|
| 74 |
+
std::pair<typename Map::iterator, bool> result =
|
| 75 |
+
m_map.insert(typename Map::value_type(key, value));
|
| 76 |
+
if (result.second) {
|
| 77 |
+
T *p = &(result.first->second);
|
| 78 |
+
std::size_t i = key[0]->GetId();
|
| 79 |
+
m_vec[i] = p;
|
| 80 |
+
}
|
| 81 |
+
return result;
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
} // namespace Syntax
|
| 85 |
+
} // namespace Moses
|
mosesdecoder/moses/Syntax/PHyperedge.h
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <vector>
|
| 4 |
+
|
| 5 |
+
#include "PLabel.h"
|
| 6 |
+
|
| 7 |
+
namespace Moses
|
| 8 |
+
{
|
| 9 |
+
namespace Syntax
|
| 10 |
+
{
|
| 11 |
+
|
| 12 |
+
struct PVertex;
|
| 13 |
+
|
| 14 |
+
struct PHyperedge {
|
| 15 |
+
PVertex *head;
|
| 16 |
+
std::vector<PVertex*> tail;
|
| 17 |
+
PLabel label;
|
| 18 |
+
};
|
| 19 |
+
|
| 20 |
+
} // Syntax
|
| 21 |
+
} // Moses
|
mosesdecoder/moses/Syntax/RuleTableFF.h
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <string>
|
| 4 |
+
|
| 5 |
+
#include "moses/TranslationModel/PhraseDictionary.h"
|
| 6 |
+
|
| 7 |
+
namespace Moses
|
| 8 |
+
{
|
| 9 |
+
|
| 10 |
+
class ChartParser;
|
| 11 |
+
class ChartCellCollectionBase;
|
| 12 |
+
class AllOptions;
|
| 13 |
+
namespace Syntax
|
| 14 |
+
{
|
| 15 |
+
|
| 16 |
+
class RuleTable;
|
| 17 |
+
|
| 18 |
+
// Feature function for dealing with local rule scores (that come from a
|
| 19 |
+
// rule table). The scores themselves are stored on TargetPhrase objects
|
| 20 |
+
// and the decoder accesses them directly, so this object doesn't really do
|
| 21 |
+
// anything except provide somewhere to store the weights and parameter values.
|
| 22 |
+
class RuleTableFF : public PhraseDictionary
|
| 23 |
+
{
|
| 24 |
+
public:
|
| 25 |
+
RuleTableFF(const std::string &);
|
| 26 |
+
|
| 27 |
+
// FIXME Delete m_table?
|
| 28 |
+
~RuleTableFF() {}
|
| 29 |
+
|
| 30 |
+
void Load(AllOptions::ptr const& opts);
|
| 31 |
+
|
| 32 |
+
const RuleTable *GetTable() const {
|
| 33 |
+
return m_table;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
static const std::vector<RuleTableFF*> &Instances() {
|
| 37 |
+
return s_instances;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
ChartRuleLookupManager *CreateRuleLookupManager(
|
| 41 |
+
const ChartParser &, const ChartCellCollectionBase &, std::size_t) {
|
| 42 |
+
assert(false);
|
| 43 |
+
return 0;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
// Get the source terminal vocabulary for this table's grammar (as a set of
|
| 47 |
+
// factor IDs)
|
| 48 |
+
const boost::unordered_set<std::size_t> &GetSourceTerminalSet() const {
|
| 49 |
+
return m_sourceTerminalSet;
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
private:
|
| 53 |
+
static std::vector<RuleTableFF*> s_instances;
|
| 54 |
+
|
| 55 |
+
const RuleTable *m_table;
|
| 56 |
+
boost::unordered_set<std::size_t> m_sourceTerminalSet;
|
| 57 |
+
};
|
| 58 |
+
|
| 59 |
+
} // Syntax
|
| 60 |
+
} // Moses
|
mosesdecoder/moses/Syntax/SHyperedgeBundle.h
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <vector>
|
| 4 |
+
|
| 5 |
+
#include "moses/ScoreComponentCollection.h"
|
| 6 |
+
#include "moses/TargetPhraseCollection.h"
|
| 7 |
+
|
| 8 |
+
#include "SVertexStack.h"
|
| 9 |
+
|
| 10 |
+
namespace Moses
|
| 11 |
+
{
|
| 12 |
+
namespace Syntax
|
| 13 |
+
{
|
| 14 |
+
|
| 15 |
+
struct PVertex;
|
| 16 |
+
|
| 17 |
+
struct SHyperedgeBundle {
|
| 18 |
+
float inputWeight;
|
| 19 |
+
std::vector<const SVertexStack*> stacks;
|
| 20 |
+
TargetPhraseCollection::shared_ptr translations;
|
| 21 |
+
|
| 22 |
+
friend void swap(SHyperedgeBundle &x, SHyperedgeBundle &y) {
|
| 23 |
+
using std::swap;
|
| 24 |
+
swap(x.inputWeight, y.inputWeight);
|
| 25 |
+
swap(x.stacks, y.stacks);
|
| 26 |
+
swap(x.translations, y.translations);
|
| 27 |
+
}
|
| 28 |
+
};
|
| 29 |
+
|
| 30 |
+
} // Syntax
|
| 31 |
+
} // Moses
|
mosesdecoder/moses/Syntax/SVertexRecombinationHasher.h
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include "moses/FF/FFState.h"
|
| 4 |
+
|
| 5 |
+
#include "SVertex.h"
|
| 6 |
+
|
| 7 |
+
namespace Moses
|
| 8 |
+
{
|
| 9 |
+
namespace Syntax
|
| 10 |
+
{
|
| 11 |
+
|
| 12 |
+
class SVertexRecombinationHasher
|
| 13 |
+
{
|
| 14 |
+
public:
|
| 15 |
+
std::size_t operator()(const SVertex *v) const {
|
| 16 |
+
std::size_t seed = 0;
|
| 17 |
+
for (std::vector<FFState*>::const_iterator p = v->states.begin();
|
| 18 |
+
p != v->states.end(); ++p) {
|
| 19 |
+
boost::hash_combine(seed, (*p)->hash());
|
| 20 |
+
}
|
| 21 |
+
return seed;
|
| 22 |
+
}
|
| 23 |
+
};
|
| 24 |
+
|
| 25 |
+
} // Syntax
|
| 26 |
+
} // Moses
|
mosesdecoder/moses/TranslationModel/CompactPT/BlockHashIndex.cpp
ADDED
|
@@ -0,0 +1,424 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include "ThrowingFwrite.h"
|
| 23 |
+
#include "BlockHashIndex.h"
|
| 24 |
+
#include "CmphStringVectorAdapter.h"
|
| 25 |
+
#include "util/exception.hh"
|
| 26 |
+
#include "util/string_stream.hh"
|
| 27 |
+
|
| 28 |
+
#ifdef HAVE_CMPH
|
| 29 |
+
#include "cmph.h"
|
| 30 |
+
#endif
|
| 31 |
+
|
| 32 |
+
namespace Moses
|
| 33 |
+
{
|
| 34 |
+
#ifdef WITH_THREADS
|
| 35 |
+
// Constructor used when WITH_THREADS is defined.  Stores the two bit widths,
// clears the file-handle bookkeeping and counters, and hands threadsNum to
// the thread pool member.  When the build lacks HAVE_CMPH the index cannot
// work, so the process is terminated with exit status 1.
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
                               size_t threadsNum)
  : m_orderBits(orderBits),
    m_fingerPrintBits(fingerPrintBits),
    m_fileHandle(0),
    m_fileHandleStart(0),
    m_landmarks(true),
    m_size(0),
    m_lastSaved(-1),
    m_lastDropped(-1),
    m_numLoadedRanges(0),
    m_threadPool(threadsNum)
{
#ifndef HAVE_CMPH
  std::cerr << "minphr: CMPH support not compiled in." << std::endl;
  exit(1);
#endif
}
|
| 47 |
+
#else
|
| 48 |
+
// Constructor used when WITH_THREADS is not defined.  Stores the two bit
// widths and clears the file-handle bookkeeping and counters.  When the
// build lacks HAVE_CMPH the index cannot work, so the process is terminated
// with exit status 1.
BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits)
  : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
    m_fileHandle(0), m_fileHandleStart(0),
    // The WITH_THREADS overload constructs m_landmarks with `true`; do the
    // same here so both build configurations start from identical state.
    // NOTE(review): the meaning of the argument is defined by m_landmarks'
    // type, which is declared outside this file -- confirm.
    m_landmarks(true), m_size(0),
    m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0)
{
#ifndef HAVE_CMPH
  std::cerr << "minphr: CMPH support not compiled in." << std::endl;
  exit(1);
#endif
}
|
| 58 |
+
#endif
|
| 59 |
+
|
| 60 |
+
// Releases every hash function and packed array still held.  Compiles to an
// empty destructor when HAVE_CMPH is not defined.
BlockHashIndex::~BlockHashIndex()
{
#ifdef HAVE_CMPH
  for(size_t r = 0; r < m_hashes.size(); ++r) {
    // Empty slots are stored as null and must not reach cmph_destroy.
    if(m_hashes[r] != 0)
      cmph_destroy((cmph_t*)m_hashes[r]);
  }

  for(size_t r = 0; r < m_arrays.size(); ++r) {
    // Deleting a null pointer is a no-op, so no explicit test is needed.
    delete m_arrays[r];
  }
#endif
}
|
| 74 |
+
|
| 75 |
+
size_t BlockHashIndex::GetHash(const char* key)
|
| 76 |
+
{
|
| 77 |
+
std::string keyStr(key);
|
| 78 |
+
size_t i = std::distance(m_landmarks.begin(),
|
| 79 |
+
std::upper_bound(m_landmarks.begin(),
|
| 80 |
+
m_landmarks.end(), keyStr)) - 1;
|
| 81 |
+
|
| 82 |
+
if(i == 0ul-1)
|
| 83 |
+
return GetSize();
|
| 84 |
+
|
| 85 |
+
size_t pos = GetHash(i, key);
|
| 86 |
+
if(pos != GetSize())
|
| 87 |
+
return (1ul << m_orderBits) * i + pos;
|
| 88 |
+
else
|
| 89 |
+
return GetSize();
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
size_t BlockHashIndex::GetFprint(const char* key) const
|
| 93 |
+
{
|
| 94 |
+
size_t hash;
|
| 95 |
+
MurmurHash3_x86_32(key, std::strlen(key), 100000, &hash);
|
| 96 |
+
hash &= (1ul << m_fingerPrintBits) - 1;
|
| 97 |
+
return hash;
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
size_t BlockHashIndex::GetHash(size_t i, const char* key)
|
| 101 |
+
{
|
| 102 |
+
//#ifdef WITH_THREADS
|
| 103 |
+
// boost::mutex::scoped_lock lock(m_mutex);
|
| 104 |
+
//#endif
|
| 105 |
+
//if(m_hashes[i] == 0)
|
| 106 |
+
//LoadRange(i);
|
| 107 |
+
#ifdef HAVE_CMPH
|
| 108 |
+
size_t idx = cmph_search((cmph_t*)m_hashes[i], key, (cmph_uint32) strlen(key));
|
| 109 |
+
#else
|
| 110 |
+
assert(0);
|
| 111 |
+
size_t idx = 0;
|
| 112 |
+
#endif
|
| 113 |
+
|
| 114 |
+
std::pair<size_t, size_t> orderPrint = m_arrays[i]->Get(idx, m_orderBits, m_fingerPrintBits);
|
| 115 |
+
m_clocks[i] = clock();
|
| 116 |
+
|
| 117 |
+
if(GetFprint(key) == orderPrint.second)
|
| 118 |
+
return orderPrint.first;
|
| 119 |
+
else
|
| 120 |
+
return GetSize();
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
// std::string convenience overload; forwards to the const char* lookup.
size_t BlockHashIndex::GetHash(std::string key)
{
  const char* raw = key.c_str();
  return GetHash(raw);
}
|
| 127 |
+
|
| 128 |
+
size_t BlockHashIndex::operator[](std::string key)
|
| 129 |
+
{
|
| 130 |
+
return GetHash(key);
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
size_t BlockHashIndex::operator[](char* key)
|
| 134 |
+
{
|
| 135 |
+
return GetHash(key);
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
size_t BlockHashIndex::Save(std::string filename)
|
| 139 |
+
{
|
| 140 |
+
std::FILE* mphf = std::fopen(filename.c_str(), "w");
|
| 141 |
+
size_t size = Save(mphf);
|
| 142 |
+
std::fclose(mphf);
|
| 143 |
+
return size;
|
| 144 |
+
}
|
| 145 |
+
|
| 146 |
+
void BlockHashIndex::BeginSave(std::FILE * mphf)
|
| 147 |
+
{
|
| 148 |
+
m_fileHandle = mphf;
|
| 149 |
+
ThrowingFwrite(&m_orderBits, sizeof(size_t), 1, m_fileHandle);
|
| 150 |
+
ThrowingFwrite(&m_fingerPrintBits, sizeof(size_t), 1, m_fileHandle);
|
| 151 |
+
|
| 152 |
+
m_fileHandleStart = std::ftell(m_fileHandle);
|
| 153 |
+
|
| 154 |
+
size_t relIndexPos = 0;
|
| 155 |
+
ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
// Appends range `i` (its hash function followed by its packed array) at the
// current file position and records that position, relative to
// m_fileHandleStart, in m_seekIndex[i].  Does nothing without HAVE_CMPH.
void BlockHashIndex::SaveRange(size_t i)
{
#ifdef HAVE_CMPH
  // Grow the seek index on demand so slot i exists.
  if(i >= m_seekIndex.size())
    m_seekIndex.resize(i+1);

  m_seekIndex[i] = std::ftell(m_fileHandle) - m_fileHandleStart;

  cmph_dump((cmph_t*)m_hashes[i], m_fileHandle);
  m_arrays[i]->Save(m_fileHandle);
#endif
}
|
| 168 |
+
|
| 169 |
+
void BlockHashIndex::SaveLastRange()
|
| 170 |
+
{
|
| 171 |
+
#ifdef WITH_THREADS
|
| 172 |
+
boost::mutex::scoped_lock lock(m_mutex);
|
| 173 |
+
#endif
|
| 174 |
+
|
| 175 |
+
while(!m_queue.empty() && m_lastSaved + 1 == -m_queue.top()) {
|
| 176 |
+
size_t current = -m_queue.top();
|
| 177 |
+
m_queue.pop();
|
| 178 |
+
SaveRange(current);
|
| 179 |
+
m_lastSaved = current;
|
| 180 |
+
}
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
// Frees the in-memory data of range `i`: destroys its hash function, deletes
// its packed array, and resets the corresponding slots (and the clock stamp)
// to zero.  Does nothing without HAVE_CMPH.
//
// NOTE(review): m_numLoadedRanges is decremented on every call, even when
// both slots were already empty -- confirm this is intended.
void BlockHashIndex::DropRange(size_t i)
{
#ifdef HAVE_CMPH
  void*& hashSlot = m_hashes[i];
  if(hashSlot != 0) {
    cmph_destroy((cmph_t*)hashSlot);
    hashSlot = 0;
  }

  PairedPackedArray<>*& arraySlot = m_arrays[i];
  if(arraySlot != 0) {
    delete arraySlot;
    arraySlot = 0;
    m_clocks[i] = 0;
  }

  --m_numLoadedRanges;
#endif
}
|
| 198 |
+
|
| 199 |
+
void BlockHashIndex::DropLastRange()
|
| 200 |
+
{
|
| 201 |
+
#ifdef WITH_THREADS
|
| 202 |
+
boost::mutex::scoped_lock lock(m_mutex);
|
| 203 |
+
#endif
|
| 204 |
+
|
| 205 |
+
while(m_lastDropped != m_lastSaved)
|
| 206 |
+
DropRange(++m_lastDropped);
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
+
#ifdef WITH_THREADS
|
| 210 |
+
void BlockHashIndex::WaitAll()
|
| 211 |
+
{
|
| 212 |
+
m_threadPool.Stop(true);
|
| 213 |
+
}
|
| 214 |
+
#endif
|
| 215 |
+
|
| 216 |
+
size_t BlockHashIndex::FinalizeSave()
|
| 217 |
+
{
|
| 218 |
+
#ifdef WITH_THREADS
|
| 219 |
+
m_threadPool.Stop(true);
|
| 220 |
+
#endif
|
| 221 |
+
|
| 222 |
+
SaveLastRange();
|
| 223 |
+
|
| 224 |
+
size_t relIndexPos = std::ftell(m_fileHandle) - m_fileHandleStart;
|
| 225 |
+
|
| 226 |
+
std::fseek(m_fileHandle, m_fileHandleStart, SEEK_SET);
|
| 227 |
+
ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
|
| 228 |
+
|
| 229 |
+
std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
|
| 230 |
+
m_landmarks.save(m_fileHandle);
|
| 231 |
+
|
| 232 |
+
size_t seekIndexSize = m_seekIndex.size();
|
| 233 |
+
ThrowingFwrite(&seekIndexSize, sizeof(size_t), 1, m_fileHandle);
|
| 234 |
+
ThrowingFwrite(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle);
|
| 235 |
+
|
| 236 |
+
ThrowingFwrite(&m_size, sizeof(size_t), 1, m_fileHandle);
|
| 237 |
+
|
| 238 |
+
size_t fileHandleStop = std::ftell(m_fileHandle);
|
| 239 |
+
return fileHandleStop - m_fileHandleStart + sizeof(m_orderBits)
|
| 240 |
+
+ sizeof(m_fingerPrintBits);
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
size_t BlockHashIndex::Save(std::FILE * mphf)
|
| 244 |
+
{
|
| 245 |
+
m_queue = std::priority_queue<int>();
|
| 246 |
+
BeginSave(mphf);
|
| 247 |
+
for(size_t i = 0; i < m_hashes.size(); i++)
|
| 248 |
+
SaveRange(i);
|
| 249 |
+
return FinalizeSave();
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
size_t BlockHashIndex::LoadIndex(std::FILE* mphf)
|
| 253 |
+
{
|
| 254 |
+
m_fileHandle = mphf;
|
| 255 |
+
|
| 256 |
+
size_t beginning = std::ftell(mphf);
|
| 257 |
+
|
| 258 |
+
size_t read = 0;
|
| 259 |
+
read += std::fread(&m_orderBits, sizeof(size_t), 1, mphf);
|
| 260 |
+
read += std::fread(&m_fingerPrintBits, sizeof(size_t), 1, mphf);
|
| 261 |
+
m_fileHandleStart = std::ftell(m_fileHandle);
|
| 262 |
+
|
| 263 |
+
size_t relIndexPos;
|
| 264 |
+
read += std::fread(&relIndexPos, sizeof(size_t), 1, mphf);
|
| 265 |
+
std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos, SEEK_SET);
|
| 266 |
+
|
| 267 |
+
m_landmarks.load(mphf);
|
| 268 |
+
|
| 269 |
+
size_t seekIndexSize;
|
| 270 |
+
read += std::fread(&seekIndexSize, sizeof(size_t), 1, m_fileHandle);
|
| 271 |
+
m_seekIndex.resize(seekIndexSize);
|
| 272 |
+
read += std::fread(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle);
|
| 273 |
+
m_hashes.resize(seekIndexSize, 0);
|
| 274 |
+
m_clocks.resize(seekIndexSize, 0);
|
| 275 |
+
m_arrays.resize(seekIndexSize, 0);
|
| 276 |
+
|
| 277 |
+
read += std::fread(&m_size, sizeof(size_t), 1, m_fileHandle);
|
| 278 |
+
|
| 279 |
+
size_t end = std::ftell(mphf);
|
| 280 |
+
|
| 281 |
+
return end - beginning;
|
| 282 |
+
}
|
| 283 |
+
|
| 284 |
+
// Loads range `i` from the file: seeks to the offset stored in
// m_seekIndex[i] (relative to m_fileHandleStart), reads the hash function,
// then reads the packed array that follows it.  The range's clock stamp is
// set and the loaded-range counter is incremented.  Does nothing without
// HAVE_CMPH.
void BlockHashIndex::LoadRange(size_t i)
{
#ifdef HAVE_CMPH
  std::fseek(m_fileHandle, m_fileHandleStart + m_seekIndex[i], SEEK_SET);

  // The hash function is stored first, so it must be read before the array.
  cmph_t* loadedHash = cmph_load(m_fileHandle);

  m_arrays[i] = new PairedPackedArray<>(0, m_orderBits,
                                        m_fingerPrintBits);
  m_arrays[i]->Load(m_fileHandle);

  m_hashes[i] = (void*)loadedHash;
  m_clocks[i] = clock();

  ++m_numLoadedRanges;
#endif
}
|
| 299 |
+
|
| 300 |
+
size_t BlockHashIndex::Load(std::string filename)
|
| 301 |
+
{
|
| 302 |
+
std::FILE* mphf = std::fopen(filename.c_str(), "r");
|
| 303 |
+
size_t size = Load(mphf);
|
| 304 |
+
std::fclose(mphf);
|
| 305 |
+
return size;
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
size_t BlockHashIndex::Load(std::FILE * mphf)
|
| 309 |
+
{
|
| 310 |
+
size_t byteSize = LoadIndex(mphf);
|
| 311 |
+
size_t end = std::ftell(mphf);
|
| 312 |
+
|
| 313 |
+
for(size_t i = 0; i < m_seekIndex.size(); i++)
|
| 314 |
+
LoadRange(i);
|
| 315 |
+
std::fseek(m_fileHandle, end, SEEK_SET);
|
| 316 |
+
return byteSize;
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
size_t BlockHashIndex::GetSize() const
|
| 320 |
+
{
|
| 321 |
+
return m_size;
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
// Currently a no-op: the whole implementation is commented out, so both
// parameters are ignored.  The disabled code is kept below for reference;
// it dropped loaded ranges, ordered by their m_clocks stamps, once the
// number of loaded ranges exceeded a limit derived from `ratio` and
// `tolerance`.
void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance)
{
  /*
  #ifdef WITH_THREADS
  boost::mutex::scoped_lock lock(m_mutex);
  #endif
  size_t n = m_hashes.size() * ratio;
  size_t max = n * (1 + tolerance);
  if(m_numLoadedRanges > max) {
  typedef std::vector<std::pair<clock_t, size_t> > LastLoaded;
  LastLoaded lastLoaded;
  for(size_t i = 0; i < m_hashes.size(); i++)
  if(m_hashes[i] != 0)
  lastLoaded.push_back(std::make_pair(m_clocks[i], i));

  std::sort(lastLoaded.begin(), lastLoaded.end());
  for(LastLoaded::reverse_iterator it = lastLoaded.rbegin() + size_t(n * (1 - tolerance));
  it != lastLoaded.rend(); it++)
  DropRange(it->second);
  }*/
}
|
| 345 |
+
|
| 346 |
+
// Builds the hash function and the packed (order, fingerprint) array for
// range `current` from the keys supplied by `source_void`, which is treated
// as a cmph_io_adapter_t*.
//
// Each key is read in adapter order; its order i and its fingerprint are
// stored at the slot the hash function assigns to the key.  Keys must arrive
// in ascending byte order; the only tolerated exception is the two-key case
// where the out-of-order key is "###DUMMY_KEY###".  Once everything is
// built, the results are published into m_hashes / m_arrays / m_clocks and
// the range is queued for saving, under m_mutex in threaded builds.
//
// Throws (via UTIL_THROW2) when the hash function cannot be built or the
// keys are not sorted; everything allocated here is released first.
// Does nothing without HAVE_CMPH.
void BlockHashIndex::CalcHash(size_t current, void* source_void)
{
#ifdef HAVE_CMPH
  cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void;
  cmph_config_t *config = cmph_config_new(source);
  cmph_config_set_algo(config, CMPH_CHD);

  cmph_t* hash = cmph_new(config);
  if(hash == 0) {
    // A failed construction would otherwise be passed straight to
    // cmph_search below.
    cmph_config_destroy(config);
    UTIL_THROW2("ERROR: cmph_new() could not build a hash function for this range of keys");
  }

  PairedPackedArray<> *pv =
    new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits);

  source->rewind(source->data);

  std::string lastKey = "";
  for(size_t i = 0; i < source->nkeys; i++) {
    unsigned keylen;
    char* key;
    source->read(source->data, &key, &keylen);
    // Copy the key before handing the adapter's buffer back.
    std::string temp(key, keylen);
    source->dispose(source->data, key, keylen);

    if(lastKey > temp) {
      if(source->nkeys != 2 || temp != "###DUMMY_KEY###") {
        util::StringStream strme;
        strme << "ERROR: Input file does not appear to be sorted with LC_ALL=C sort\n";
        strme << "1: " << lastKey << "\n";
        strme << "2: " << temp << "\n";

        // Nothing has been published yet, so release it all before throwing.
        delete pv;
        cmph_destroy(hash);
        cmph_config_destroy(config);
        UTIL_THROW2(strme.str());
      }
    }
    lastKey = temp;

    size_t fprint = GetFprint(temp.c_str());
    size_t idx = cmph_search(hash, temp.c_str(),
                             (cmph_uint32) temp.size());

    pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits);
  }

  cmph_config_destroy(config);

#ifdef WITH_THREADS
  // Only the publication of the results touches shared state.
  boost::mutex::scoped_lock lock(m_mutex);
#endif

  if(m_hashes.size() <= current) {
    m_hashes.resize(current + 1, 0);
    m_arrays.resize(current + 1, 0);
    m_clocks.resize(current + 1, 0);
  }

  m_hashes[current] = (void*)hash;
  m_arrays[current] = pv;
  m_clocks[current] = clock();
  // Negated so that the priority queue yields the smallest range first.
  m_queue.push(-current);
#endif
}
|
| 406 |
+
|
| 407 |
+
#ifdef HAVE_CMPH
|
| 408 |
+
void* BlockHashIndex::vectorAdapter(std::vector<std::string>& v)
|
| 409 |
+
{
|
| 410 |
+
return (void*)CmphVectorAdapter(v);
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
void* BlockHashIndex::vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv)
|
| 414 |
+
{
|
| 415 |
+
return (void*)CmphStringVectorAdapter(sv);
|
| 416 |
+
}
|
| 417 |
+
|
| 418 |
+
void* BlockHashIndex::vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv)
|
| 419 |
+
{
|
| 420 |
+
return (void*)CmphStringVectorAdapter(sv);
|
| 421 |
+
}
|
| 422 |
+
#endif
|
| 423 |
+
|
| 424 |
+
}
|
mosesdecoder/moses/TranslationModel/CompactPT/ConsistentPhrases.h
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_ConsistentPhrases_h
|
| 23 |
+
#define moses_ConsistentPhrases_h
|
| 24 |
+
|
| 25 |
+
#include <set>
|
| 26 |
+
|
| 27 |
+
namespace Moses
|
| 28 |
+
{
|
| 29 |
+
|
| 30 |
+
// Enumerates all sub-phrase pairs of an mmax x nmax sentence pair that are
// consistent with a word alignment, and hands them out in a fixed priority
// order.
//
// A phrase pair covers positions [i, i+m) on the first side and [j, j+n) on
// the second side.  It is consistent when every alignment point lies either
// inside both intervals or outside both.  The pair spanning the whole input
// is never kept in the queue.
class ConsistentPhrases
{
public:
  // Phrase pair: start i / length m on the first side, start j / length n on
  // the second side.  Note the constructor argument order (i, m, j, n).
  struct Phrase {
    int i, j, m, n;
    Phrase(int i_, int m_, int j_, int n_) : i(i_), j(j_), m(m_), n(n_) { }
  };

  // Strict weak ordering for the queue: larger n first, then smaller j,
  // then larger m, then smaller i.
  struct PhraseSorter {
    // const-qualified: standard containers are allowed to require that the
    // comparator can be invoked through a const reference.
    bool operator()(const Phrase &a, const Phrase &b) const {
      if(a.n != b.n)
        return a.n > b.n;
      if(a.j != b.j)
        return a.j < b.j;
      if(a.m != b.m)
        return a.m > b.m;
      return a.i < b.i;
    }
  };

private:
  typedef std::set<Phrase, PhraseSorter> PhraseQueue;
  PhraseQueue m_phraseQueue;

  typedef std::pair<unsigned char, unsigned char> AlignPoint;
  typedef std::set<AlignPoint> Alignment;

  // True if no alignment point links a position inside the phrase pair to a
  // position outside it.
  static bool IsConsistent(int i, int m, int j, int n, const Alignment &a) {
    for(Alignment::const_iterator it = a.begin(); it != a.end(); ++it) {
      const int ip = it->first;
      const int jp = it->second;
      const bool insideFirst = (i <= ip && ip < i+m);
      const bool insideSecond = (j <= jp && jp < j+n);
      if(insideFirst != insideSecond)
        return false;
    }
    return true;
  }

  // True if the two phrase pairs share a position on either side.
  static bool Overlaps(const Phrase &p, const Phrase &pp) {
    return (p.i <= pp.i && pp.i < p.i + p.m) || (pp.i <= p.i && p.i < pp.i + pp.m) ||
           (p.j <= pp.j && pp.j < p.j + p.n) || (pp.j <= p.j && p.j < pp.j + pp.n);
  }

public:

  // Fills the queue with every consistent phrase pair for an input of mmax
  // positions on the first side and nmax on the second, then removes the
  // pair that spans the whole input.  The alignment is only read.
  ConsistentPhrases(int mmax, int nmax, const Alignment& a) {
    for(int i = 0; i < mmax; i++) {
      for(int m = 1; m <= mmax-i; m++) {
        for(int j = 0; j < nmax; j++) {
          for(int n = 1; n <= nmax-j; n++) {
            if(IsConsistent(i, m, j, n, a))
              m_phraseQueue.insert(Phrase(i, m, j, n));
          }
        }
      }
    }
    m_phraseQueue.erase(Phrase(0, mmax, 0, nmax));
  }

  // Returns 1 if no phrase pairs are left, 0 otherwise.  The size_t return
  // type is kept for compatibility with existing callers.
  size_t Empty() const {
    return m_phraseQueue.empty();
  }

  // Removes and returns the highest-priority phrase pair.  Returns
  // Phrase(0,0,0,0) when the queue is empty.
  Phrase Pop() {
    if(m_phraseQueue.empty())
      return Phrase(0,0,0,0);

    Phrase p = *m_phraseQueue.begin();
    m_phraseQueue.erase(m_phraseQueue.begin());
    return p;
  }

  // Removes every queued phrase pair that overlaps `p` on either side.
  void RemoveOverlap(Phrase p) {
    PhraseQueue::iterator it = m_phraseQueue.begin();
    while(it != m_phraseQueue.end()) {
      if(Overlaps(p, *it))
        // Post-increment advances the iterator before the element is erased.
        m_phraseQueue.erase(it++);
      else
        ++it;
    }
  }

};
|
| 109 |
+
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/Jamfile
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
local current = "" ;
local includes = ;
local with-cmph = [ option.get "with-cmph" ] ;

# PT-LOG must be defined before the fakelib rule below expands $(PT-LOG);
# an undefined variable expands to nothing, which would silently drop the
# <dependency> on the log file.
path-constant PT-LOG : bin/pt.log ;

if $(with-cmph) {
  lib cmph : : <search>$(with-cmph)/lib <search>$(with-cmph)/lib64 ;
  includes += <include>$(with-cmph)/include ;
  current = "--with-cmph=$(with-cmph)" ;
  fakelib CompactPT : [ glob *.cpp ] ../..//headers cmph : $(includes) <dependency>$(PT-LOG) : : $(includes) ;
}
else {
  alias cmph ;
  fakelib CompactPT ;
}

# Rewrite the log when the cmph option string differs from the last run.
update-if-changed $(PT-LOG) $(current) ;
|
| 17 |
+
|
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp
ADDED
|
@@ -0,0 +1,450 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#include <sstream>
|
| 23 |
+
#include "LexicalReorderingTableCreator.h"
|
| 24 |
+
#include "ThrowingFwrite.h"
|
| 25 |
+
#include "moses/Util.h"
|
| 26 |
+
#include "util/file.hh"
|
| 27 |
+
#include "util/exception.hh"
|
| 28 |
+
|
| 29 |
+
namespace Moses
|
| 30 |
+
{
|
| 31 |
+
|
| 32 |
+
// Runs the complete table creation inside the constructor:
//   1. print the configuration and open the output file;
//   2. pass 1 -- build the phrase index and encode the scores;
//   3. compute the Huffman code sets;
//   4. pass 2 -- compress the scores;
//   5. save everything and close the output file.
//
// When `tempfilePath` is non-empty, the intermediate score vectors are
// constructed with an allocator created from a temporary file under that
// path; otherwise they are constructed with the argument `true`.
//
// Throws (via UTIL_THROW2) if the output file cannot be opened.
LexicalReorderingTableCreator::LexicalReorderingTableCreator(
  std::string inPath, std::string outPath, std::string tempfilePath,
  size_t orderBits, size_t fingerPrintBits, bool multipleScoreTrees,
  size_t quantize
#ifdef WITH_THREADS
  , size_t threads
#endif
)
  : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
    m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
    m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees),
    m_quantize(quantize), m_separator(" ||| "),
    m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1)
#ifdef WITH_THREADS
    , m_threads(threads)
#endif
{
  PrintInfo();

  // Binary mode: the table is written as raw binary data.
  m_outFile = std::fopen(m_outPath.c_str(), "wb");
  if(m_outFile == 0) {
    // Fail before doing any work instead of writing through a null handle.
    UTIL_THROW2("Could not open " + m_outPath + " for writing");
  }

  std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl;
  m_hash.BeginSave(m_outFile);

  if(tempfilePath.size()) {
    MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
    m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
  } else {
    m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(true);
  }

  EncodeScores();

  std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
  CalcHuffmanCodes();

  std::cerr << "Pass 2/2: Compressing scores" << std::endl;


  if(tempfilePath.size()) {
    MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
    m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
  } else {
    m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(true);
  }
  CompressScores();

  std::cerr << "Saving to " << m_outPath << std::endl;
  Save();
  std::cerr << "Done" << std::endl;
  std::fclose(m_outFile);
}
|
| 84 |
+
|
| 85 |
+
void LexicalReorderingTableCreator::PrintInfo()
|
| 86 |
+
{
|
| 87 |
+
std::cerr << "Used options:" << std::endl;
|
| 88 |
+
std::cerr << "\tText reordering table will be read from: " << m_inPath << std::endl;
|
| 89 |
+
std::cerr << "\tOutput reordering table will be written to: " << m_outPath << std::endl;
|
| 90 |
+
std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
|
| 91 |
+
std::cerr << "\tPhrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
|
| 92 |
+
std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
|
| 93 |
+
std::cerr << "\tUsing score quantization: ";
|
| 94 |
+
if(m_quantize)
|
| 95 |
+
std::cerr << m_quantize << " best" << std::endl;
|
| 96 |
+
else
|
| 97 |
+
std::cerr << "no" << std::endl;
|
| 98 |
+
|
| 99 |
+
#ifdef WITH_THREADS
|
| 100 |
+
std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
|
| 101 |
+
#endif
|
| 102 |
+
std::cerr << std::endl;
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
// Releases the Huffman trees, the score counters and both score vectors.
LexicalReorderingTableCreator::~LexicalReorderingTableCreator()
{
  // Each vector is walked with its own size, so a size mismatch between
  // them can neither index out of range nor leave elements undeleted.
  // Tree slots that were never filled are null, and deleting null is a no-op.
  for(size_t i = 0; i < m_scoreTrees.size(); i++)
    delete m_scoreTrees[i];

  for(size_t i = 0; i < m_scoreCounters.size(); i++)
    delete m_scoreCounters[i];

  delete m_encodedScores;
  delete m_compressedScores;
}
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
void LexicalReorderingTableCreator::EncodeScores()
|
| 118 |
+
{
|
| 119 |
+
InputFileStream inFile(m_inPath);
|
| 120 |
+
|
| 121 |
+
#ifdef WITH_THREADS
|
| 122 |
+
boost::thread_group threads;
|
| 123 |
+
for (size_t i = 0; i < m_threads; ++i) {
|
| 124 |
+
EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
|
| 125 |
+
threads.create_thread(*et);
|
| 126 |
+
}
|
| 127 |
+
threads.join_all();
|
| 128 |
+
#else
|
| 129 |
+
EncodingTaskReordering* et = new EncodingTaskReordering(inFile, *this);
|
| 130 |
+
(*et)();
|
| 131 |
+
delete et;
|
| 132 |
+
#endif
|
| 133 |
+
FlushEncodedQueue(true);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
void LexicalReorderingTableCreator::CalcHuffmanCodes()
|
| 137 |
+
{
|
| 138 |
+
std::vector<ScoreTree*>::iterator treeIt = m_scoreTrees.begin();
|
| 139 |
+
for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
|
| 140 |
+
it != m_scoreCounters.end(); it++) {
|
| 141 |
+
if(m_quantize)
|
| 142 |
+
(*it)->Quantize(m_quantize);
|
| 143 |
+
|
| 144 |
+
std::cerr << "\tCreating Huffman codes for " << (*it)->Size()
|
| 145 |
+
<< " scores" << std::endl;
|
| 146 |
+
|
| 147 |
+
*treeIt = new ScoreTree((*it)->Begin(), (*it)->End());
|
| 148 |
+
treeIt++;
|
| 149 |
+
}
|
| 150 |
+
std::cerr << std::endl;
|
| 151 |
+
}
|
| 152 |
+
|
| 153 |
+
void LexicalReorderingTableCreator::CompressScores()
|
| 154 |
+
{
|
| 155 |
+
#ifdef WITH_THREADS
|
| 156 |
+
boost::thread_group threads;
|
| 157 |
+
for (size_t i = 0; i < m_threads; ++i) {
|
| 158 |
+
CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
|
| 159 |
+
threads.create_thread(*ct);
|
| 160 |
+
}
|
| 161 |
+
threads.join_all();
|
| 162 |
+
#else
|
| 163 |
+
CompressionTaskReordering* ct = new CompressionTaskReordering(*m_encodedScores, *this);
|
| 164 |
+
(*ct)();
|
| 165 |
+
delete ct;
|
| 166 |
+
#endif
|
| 167 |
+
FlushCompressedQueue(true);
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
void LexicalReorderingTableCreator::Save()
|
| 171 |
+
{
|
| 172 |
+
ThrowingFwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile);
|
| 173 |
+
ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
|
| 174 |
+
for(size_t i = 0; i < m_scoreTrees.size(); i++)
|
| 175 |
+
m_scoreTrees[i]->Save(m_outFile);
|
| 176 |
+
|
| 177 |
+
m_compressedScores->save(m_outFile);
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
std::string LexicalReorderingTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
|
| 181 |
+
{
|
| 182 |
+
std::string key = source + m_separator;
|
| 183 |
+
if(!target.empty())
|
| 184 |
+
key += target + m_separator;
|
| 185 |
+
return key;
|
| 186 |
+
}
|
| 187 |
+
|
| 188 |
+
std::string LexicalReorderingTableCreator::EncodeLine(std::vector<std::string>& tokens)
|
| 189 |
+
{
|
| 190 |
+
std::string scoresString = tokens.back();
|
| 191 |
+
std::stringstream scoresStream;
|
| 192 |
+
|
| 193 |
+
std::vector<float> scores;
|
| 194 |
+
Tokenize<float>(scores, scoresString);
|
| 195 |
+
|
| 196 |
+
if(!m_numScoreComponent) {
|
| 197 |
+
m_numScoreComponent = scores.size();
|
| 198 |
+
m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
|
| 199 |
+
for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
|
| 200 |
+
it != m_scoreCounters.end(); it++)
|
| 201 |
+
*it = new ScoreCounter();
|
| 202 |
+
m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
if(m_numScoreComponent != scores.size()) {
|
| 206 |
+
std::stringstream strme;
|
| 207 |
+
strme << "Error: Wrong number of scores detected ("
|
| 208 |
+
<< scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
|
| 209 |
+
strme << "Line: " << tokens[0] << " ||| ... ||| " << scoresString << std::endl;
|
| 210 |
+
UTIL_THROW2(strme.str());
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
size_t c = 0;
|
| 214 |
+
float score;
|
| 215 |
+
while(c < m_numScoreComponent) {
|
| 216 |
+
score = scores[c];
|
| 217 |
+
score = FloorScore(TransformScore(score));
|
| 218 |
+
scoresStream.write((char*)&score, sizeof(score));
|
| 219 |
+
|
| 220 |
+
m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
|
| 221 |
+
c++;
|
| 222 |
+
}
|
| 223 |
+
|
| 224 |
+
return scoresStream.str();
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
// Queues an encoded line; FlushEncodedQueue() later removes it in line order.
void LexicalReorderingTableCreator::AddEncodedLine(PackedItem& pi)
{
  m_queue.push(pi);
}
|
| 231 |
+
|
| 232 |
+
void LexicalReorderingTableCreator::FlushEncodedQueue(bool force)
|
| 233 |
+
{
|
| 234 |
+
if(force || m_queue.size() > 10000) {
|
| 235 |
+
while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
|
| 236 |
+
PackedItem pi = m_queue.top();
|
| 237 |
+
m_queue.pop();
|
| 238 |
+
m_lastFlushedLine++;
|
| 239 |
+
|
| 240 |
+
m_lastRange.push_back(pi.GetSrc());
|
| 241 |
+
m_encodedScores->push_back(pi.GetTrg());
|
| 242 |
+
|
| 243 |
+
if((pi.GetLine()+1) % 100000 == 0)
|
| 244 |
+
std::cerr << ".";
|
| 245 |
+
if((pi.GetLine()+1) % 5000000 == 0)
|
| 246 |
+
std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
|
| 247 |
+
|
| 248 |
+
if(m_lastRange.size() == (1ul << m_orderBits)) {
|
| 249 |
+
m_hash.AddRange(m_lastRange);
|
| 250 |
+
m_hash.SaveLastRange();
|
| 251 |
+
m_hash.DropLastRange();
|
| 252 |
+
m_lastRange.clear();
|
| 253 |
+
}
|
| 254 |
+
}
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
if(force) {
|
| 258 |
+
m_lastFlushedLine = -1;
|
| 259 |
+
|
| 260 |
+
if(!m_lastRange.empty()) {
|
| 261 |
+
m_hash.AddRange(m_lastRange);
|
| 262 |
+
m_lastRange.clear();
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
#ifdef WITH_THREADS
|
| 266 |
+
m_hash.WaitAll();
|
| 267 |
+
#endif
|
| 268 |
+
|
| 269 |
+
m_hash.SaveLastRange();
|
| 270 |
+
m_hash.DropLastRange();
|
| 271 |
+
m_hash.FinalizeSave();
|
| 272 |
+
|
| 273 |
+
std::cerr << std::endl << std::endl;
|
| 274 |
+
}
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores)
|
| 278 |
+
{
|
| 279 |
+
std::stringstream encodedScoresStream(encodedScores);
|
| 280 |
+
encodedScoresStream.unsetf(std::ios::skipws);
|
| 281 |
+
|
| 282 |
+
std::string compressedScores;
|
| 283 |
+
BitWrapper<> compressedScoresStream(compressedScores);
|
| 284 |
+
|
| 285 |
+
size_t currScore = 0;
|
| 286 |
+
float score;
|
| 287 |
+
encodedScoresStream.read((char*) &score, sizeof(score));
|
| 288 |
+
|
| 289 |
+
while(encodedScoresStream) {
|
| 290 |
+
size_t index = currScore % m_scoreTrees.size();
|
| 291 |
+
|
| 292 |
+
if(m_quantize)
|
| 293 |
+
score = m_scoreCounters[index]->LowerBound(score);
|
| 294 |
+
|
| 295 |
+
m_scoreTrees[index]->Put(compressedScoresStream, score);
|
| 296 |
+
encodedScoresStream.read((char*) &score, sizeof(score));
|
| 297 |
+
currScore++;
|
| 298 |
+
}
|
| 299 |
+
|
| 300 |
+
return compressedScores;
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
// Queues a compressed entry; FlushCompressedQueue() later removes it in order.
void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi)
{
  m_queue.push(pi);
}
|
| 307 |
+
|
| 308 |
+
void LexicalReorderingTableCreator::FlushCompressedQueue(bool force)
|
| 309 |
+
{
|
| 310 |
+
if(force || m_queue.size() > 10000) {
|
| 311 |
+
while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
|
| 312 |
+
PackedItem pi = m_queue.top();
|
| 313 |
+
m_queue.pop();
|
| 314 |
+
m_lastFlushedLine++;
|
| 315 |
+
|
| 316 |
+
m_compressedScores->push_back(pi.GetTrg());
|
| 317 |
+
|
| 318 |
+
if((pi.GetLine()+1) % 100000 == 0)
|
| 319 |
+
std::cerr << ".";
|
| 320 |
+
if((pi.GetLine()+1) % 5000000 == 0)
|
| 321 |
+
std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
|
| 322 |
+
}
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
if(force) {
|
| 326 |
+
m_lastFlushedLine = -1;
|
| 327 |
+
std::cerr << std::endl << std::endl;
|
| 328 |
+
}
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
//****************************************************************************//
|
| 332 |
+
|
| 333 |
+
// Number of input lines handed out so far, shared by all encoding tasks.
// NOTE(review): no reset is visible in this file, so a second table built in
// the same process would presumably continue from the old count -- confirm.
size_t EncodingTaskReordering::m_lineNum = 0;
#ifdef WITH_THREADS
// m_mutex guards the creator's queue; m_fileMutex guards the input stream
// and m_lineNum (see operator()).
boost::mutex EncodingTaskReordering::m_mutex;
boost::mutex EncodingTaskReordering::m_fileMutex;
#endif
|
| 338 |
+
|
| 339 |
+
// Stores references to the shared input stream and to the creator that
// receives the encoded lines.
EncodingTaskReordering::EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator)
  : m_inFile(inFile),
    m_creator(creator)
{
}
|
| 341 |
+
|
| 342 |
+
void EncodingTaskReordering::operator()()
|
| 343 |
+
{
|
| 344 |
+
size_t lineNum = 0;
|
| 345 |
+
|
| 346 |
+
std::vector<std::string> lines;
|
| 347 |
+
size_t max_lines = 1000;
|
| 348 |
+
lines.reserve(max_lines);
|
| 349 |
+
|
| 350 |
+
{
|
| 351 |
+
#ifdef WITH_THREADS
|
| 352 |
+
boost::mutex::scoped_lock lock(m_fileMutex);
|
| 353 |
+
#endif
|
| 354 |
+
std::string line;
|
| 355 |
+
while(lines.size() < max_lines && std::getline(m_inFile, line))
|
| 356 |
+
lines.push_back(line);
|
| 357 |
+
lineNum = m_lineNum;
|
| 358 |
+
m_lineNum += lines.size();
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
std::vector<PackedItem> result;
|
| 362 |
+
result.reserve(max_lines);
|
| 363 |
+
|
| 364 |
+
while(lines.size()) {
|
| 365 |
+
for(size_t i = 0; i < lines.size(); i++) {
|
| 366 |
+
std::vector<std::string> tokens;
|
| 367 |
+
Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
|
| 368 |
+
|
| 369 |
+
std::string encodedLine = m_creator.EncodeLine(tokens);
|
| 370 |
+
|
| 371 |
+
std::string f = tokens[0];
|
| 372 |
+
|
| 373 |
+
std::string e;
|
| 374 |
+
if(tokens.size() > 2)
|
| 375 |
+
e = tokens[1];
|
| 376 |
+
|
| 377 |
+
PackedItem packedItem(lineNum + i, m_creator.MakeSourceTargetKey(f, e),
|
| 378 |
+
encodedLine, i);
|
| 379 |
+
result.push_back(packedItem);
|
| 380 |
+
}
|
| 381 |
+
|
| 382 |
+
{
|
| 383 |
+
#ifdef WITH_THREADS
|
| 384 |
+
boost::mutex::scoped_lock lock(m_mutex);
|
| 385 |
+
#endif
|
| 386 |
+
for(size_t i = 0; i < result.size(); i++)
|
| 387 |
+
m_creator.AddEncodedLine(result[i]);
|
| 388 |
+
m_creator.FlushEncodedQueue();
|
| 389 |
+
}
|
| 390 |
+
|
| 391 |
+
lines.clear();
|
| 392 |
+
result.clear();
|
| 393 |
+
lines.reserve(max_lines);
|
| 394 |
+
result.reserve(max_lines);
|
| 395 |
+
|
| 396 |
+
#ifdef WITH_THREADS
|
| 397 |
+
boost::mutex::scoped_lock lock(m_fileMutex);
|
| 398 |
+
#endif
|
| 399 |
+
std::string line;
|
| 400 |
+
while(lines.size() < max_lines && std::getline(m_inFile, line))
|
| 401 |
+
lines.push_back(line);
|
| 402 |
+
lineNum = m_lineNum;
|
| 403 |
+
m_lineNum += lines.size();
|
| 404 |
+
}
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
//****************************************************************************//
|
| 408 |
+
|
| 409 |
+
// Index of the next encoded entry to compress, shared by all compression
// tasks.
// NOTE(review): no reset is visible in this file, so a second table built in
// the same process would presumably continue from the old index -- confirm.
size_t CompressionTaskReordering::m_scoresNum = 0;
#ifdef WITH_THREADS
// Guards m_scoresNum and the creator's queue (see operator()).
boost::mutex CompressionTaskReordering::m_mutex;
#endif
|
| 413 |
+
|
| 414 |
+
// Stores references to the encoded score vector to read from and to the
// creator that receives the compressed entries.
CompressionTaskReordering::CompressionTaskReordering(
  StringVector<unsigned char, unsigned long, MmapAllocator>& encodedScores,
  LexicalReorderingTableCreator& creator)
  : m_encodedScores(encodedScores),
    m_creator(creator)
{
}
|
| 419 |
+
|
| 420 |
+
void CompressionTaskReordering::operator()()
|
| 421 |
+
{
|
| 422 |
+
size_t scoresNum;
|
| 423 |
+
{
|
| 424 |
+
#ifdef WITH_THREADS
|
| 425 |
+
boost::mutex::scoped_lock lock(m_mutex);
|
| 426 |
+
#endif
|
| 427 |
+
scoresNum = m_scoresNum;
|
| 428 |
+
m_scoresNum++;
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
while(scoresNum < m_encodedScores.size()) {
|
| 432 |
+
std::string scores = m_encodedScores[scoresNum];
|
| 433 |
+
std::string compressedScores
|
| 434 |
+
= m_creator.CompressEncodedScores(scores);
|
| 435 |
+
|
| 436 |
+
std::string dummy;
|
| 437 |
+
PackedItem packedItem(scoresNum, dummy, compressedScores, 0);
|
| 438 |
+
|
| 439 |
+
#ifdef WITH_THREADS
|
| 440 |
+
boost::mutex::scoped_lock lock(m_mutex);
|
| 441 |
+
#endif
|
| 442 |
+
m_creator.AddCompressedScores(packedItem);
|
| 443 |
+
m_creator.FlushCompressedQueue();
|
| 444 |
+
|
| 445 |
+
scoresNum = m_scoresNum;
|
| 446 |
+
m_scoresNum++;
|
| 447 |
+
}
|
| 448 |
+
}
|
| 449 |
+
|
| 450 |
+
}
|
mosesdecoder/moses/TranslationModel/CompactPT/MmapAllocator.h
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_MmapAllocator_h
|
| 23 |
+
#define moses_MmapAllocator_h
|
| 24 |
+
|
| 25 |
+
#include <limits>
|
| 26 |
+
#include <iostream>
|
| 27 |
+
#include <cstdio>
|
| 28 |
+
#include <unistd.h>
|
| 29 |
+
|
| 30 |
+
#if defined(_WIN32) || defined(_WIN64)
|
| 31 |
+
#include <windows.h>
|
| 32 |
+
#include <io.h>
|
| 33 |
+
#else
|
| 34 |
+
#include <sys/mman.h>
|
| 35 |
+
#endif
|
| 36 |
+
|
| 37 |
+
#include "util/mmap.hh"
|
| 38 |
+
|
| 39 |
+
namespace Moses
|
| 40 |
+
{
|
| 41 |
+
template <class T>
|
| 42 |
+
class MmapAllocator
|
| 43 |
+
{
|
| 44 |
+
protected:
|
| 45 |
+
std::FILE* m_file_ptr;
|
| 46 |
+
size_t m_file_desc;
|
| 47 |
+
|
| 48 |
+
size_t m_page_size;
|
| 49 |
+
size_t m_map_size;
|
| 50 |
+
|
| 51 |
+
char* m_data_ptr;
|
| 52 |
+
size_t m_data_offset;
|
| 53 |
+
bool m_fixed;
|
| 54 |
+
size_t* m_count;
|
| 55 |
+
|
| 56 |
+
public:
|
| 57 |
+
typedef T value_type;
|
| 58 |
+
typedef T* pointer;
|
| 59 |
+
typedef const T* const_pointer;
|
| 60 |
+
typedef T& reference;
|
| 61 |
+
typedef const T& const_reference;
|
| 62 |
+
typedef std::size_t size_type;
|
| 63 |
+
typedef std::ptrdiff_t difference_type;
|
| 64 |
+
|
| 65 |
+
MmapAllocator() throw()
|
| 66 |
+
: m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)),
|
| 67 |
+
m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
|
| 68 |
+
m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
MmapAllocator(std::FILE* f_ptr) throw()
|
| 72 |
+
: m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
|
| 73 |
+
m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
|
| 74 |
+
m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
|
| 78 |
+
: m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
|
| 79 |
+
m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
|
| 80 |
+
m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0)) {
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
MmapAllocator(std::string fileName) throw()
|
| 84 |
+
: m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)),
|
| 85 |
+
m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
|
| 86 |
+
m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
|
| 87 |
+
}
|
| 88 |
+
|
| 89 |
+
MmapAllocator(const MmapAllocator& c) throw()
|
| 90 |
+
: m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc),
|
| 91 |
+
m_page_size(c.m_page_size), m_map_size(c.m_map_size),
|
| 92 |
+
m_data_ptr(c.m_data_ptr), m_data_offset(c.m_data_offset),
|
| 93 |
+
m_fixed(c.m_fixed), m_count(c.m_count) {
|
| 94 |
+
(*m_count)++;
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
~MmapAllocator() throw() {
|
| 98 |
+
if(m_data_ptr && *m_count == 0) {
|
| 99 |
+
util::UnmapOrThrow(m_data_ptr, m_map_size);
|
| 100 |
+
if(!m_fixed && std::ftell(m_file_ptr) != -1)
|
| 101 |
+
std::fclose(m_file_ptr);
|
| 102 |
+
}
|
| 103 |
+
(*m_count)--;
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
template <class U>
|
| 107 |
+
struct rebind {
|
| 108 |
+
typedef MmapAllocator<U> other;
|
| 109 |
+
};
|
| 110 |
+
|
| 111 |
+
pointer address (reference value) const {
|
| 112 |
+
return &value;
|
| 113 |
+
}
|
| 114 |
+
|
| 115 |
+
const_pointer address (const_reference value) const {
|
| 116 |
+
return &value;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
size_type max_size () const throw() {
|
| 120 |
+
return std::numeric_limits<size_t>::max() / sizeof(value_type);
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
pointer allocate (size_type num, const void* = 0) {
|
| 124 |
+
m_map_size = num * sizeof(T);
|
| 125 |
+
|
| 126 |
+
#if defined(_WIN32) || defined(_WIN64)
|
| 127 |
+
// On Windows, MAP_SHARED is not defined and MapOrThrow ignores the flags.
|
| 128 |
+
const int map_shared = 0;
|
| 129 |
+
#else
|
| 130 |
+
const int map_shared = MAP_SHARED;
|
| 131 |
+
#endif
|
| 132 |
+
if(!m_fixed) {
|
| 133 |
+
size_t read = 0;
|
| 134 |
+
read += ftruncate(m_file_desc, m_map_size);
|
| 135 |
+
m_data_ptr = (char *)util::MapOrThrow(
|
| 136 |
+
m_map_size, true, map_shared, false, m_file_desc, 0);
|
| 137 |
+
return (pointer)m_data_ptr;
|
| 138 |
+
} else {
|
| 139 |
+
const size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
|
| 140 |
+
const size_t relative_offset = m_data_offset - map_offset;
|
| 141 |
+
const size_t adjusted_map_size = m_map_size + relative_offset;
|
| 142 |
+
|
| 143 |
+
m_data_ptr = (char *)util::MapOrThrow(
|
| 144 |
+
adjusted_map_size, false, map_shared, false, m_file_desc, map_offset);
|
| 145 |
+
|
| 146 |
+
return (pointer)(m_data_ptr + relative_offset);
|
| 147 |
+
}
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
void deallocate (pointer p, size_type num) {
|
| 151 |
+
if(!m_fixed) {
|
| 152 |
+
util::UnmapOrThrow(p, num * sizeof(T));
|
| 153 |
+
} else {
|
| 154 |
+
const size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
|
| 155 |
+
const size_t relative_offset = m_data_offset - map_offset;
|
| 156 |
+
const size_t adjusted_map_size = m_map_size + relative_offset;
|
| 157 |
+
|
| 158 |
+
util::UnmapOrThrow((pointer)((char*)p - relative_offset), adjusted_map_size);
|
| 159 |
+
}
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
void construct (pointer p, const T& value) {
|
| 163 |
+
if(!m_fixed)
|
| 164 |
+
new(p) value_type(value);
|
| 165 |
+
}
|
| 166 |
+
void destroy (pointer p) {
|
| 167 |
+
if(!m_fixed)
|
| 168 |
+
p->~T();
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
template <class T1, class T2>
|
| 172 |
+
friend bool operator== (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
|
| 173 |
+
|
| 174 |
+
template <class T1, class T2>
|
| 175 |
+
friend bool operator!= (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
|
| 176 |
+
};
|
| 177 |
+
|
| 178 |
+
template <class T1, class T2>
|
| 179 |
+
bool operator== (const MmapAllocator<T1>& a1,
|
| 180 |
+
const MmapAllocator<T2>& a2) throw()
|
| 181 |
+
{
|
| 182 |
+
bool equal = true;
|
| 183 |
+
equal &= a1.m_file_ptr == a2.m_file_ptr;
|
| 184 |
+
equal &= a1.m_file_desc == a2.m_file_desc;
|
| 185 |
+
equal &= a1.m_page_size == a2.m_page_size;
|
| 186 |
+
equal &= a1.m_map_size == a2.m_map_size;
|
| 187 |
+
equal &= a1.m_data_ptr == a2.m_data_ptr;
|
| 188 |
+
equal &= a1.m_data_offset == a2.m_data_offset;
|
| 189 |
+
equal &= a1.m_fixed == a2.m_fixed;
|
| 190 |
+
return equal;
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
template <class T1, class T2>
|
| 194 |
+
bool operator!=(const MmapAllocator<T1>& a1,
|
| 195 |
+
const MmapAllocator<T2>& a2) throw()
|
| 196 |
+
{
|
| 197 |
+
return !(a1 == a2);
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
}
|
| 201 |
+
|
| 202 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/MonotonicVector.h
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_MonotonicVector_h
|
| 23 |
+
#define moses_MonotonicVector_h
|
| 24 |
+
|
| 25 |
+
// MonotonicVector - Represents a monotonic increasing function that maps
|
| 26 |
+
// positive integers of any size onto a given number type. Each value has to be
|
| 27 |
+
// equal or larger than the previous one. Depending on the stepSize it can save
|
| 28 |
+
// up to 90% of memory compared to a std::vector<long>. Time complexity is roughly
|
| 29 |
+
// constant, in the worst case, however, stepSize times slower than a normal
|
| 30 |
+
// std::vector.
|
| 31 |
+
|
| 32 |
+
#include <vector>
|
| 33 |
+
#include <limits>
|
| 34 |
+
#include <algorithm>
|
| 35 |
+
#include <cstdio>
|
| 36 |
+
#include <cassert>
|
| 37 |
+
|
| 38 |
+
#include "ThrowingFwrite.h"
|
| 39 |
+
#include "ListCoders.h"
|
| 40 |
+
#include "MmapAllocator.h"
|
| 41 |
+
|
| 42 |
+
namespace Moses
|
| 43 |
+
{
|
| 44 |
+
|
| 45 |
+
template<typename PosT = size_t, typename NumT = size_t, PosT stepSize = 32,
|
| 46 |
+
template <typename> class Allocator = std::allocator>
|
| 47 |
+
class MonotonicVector
|
| 48 |
+
{
|
| 49 |
+
private:
|
| 50 |
+
typedef std::vector<NumT, Allocator<NumT> > Anchors;
|
| 51 |
+
typedef std::vector<unsigned int, Allocator<unsigned int> > Diffs;
|
| 52 |
+
|
| 53 |
+
Anchors m_anchors;
|
| 54 |
+
Diffs m_diffs;
|
| 55 |
+
std::vector<unsigned int> m_tempDiffs;
|
| 56 |
+
|
| 57 |
+
size_t m_size;
|
| 58 |
+
PosT m_last;
|
| 59 |
+
bool m_final;
|
| 60 |
+
|
| 61 |
+
public:
|
| 62 |
+
typedef PosT value_type;
|
| 63 |
+
|
| 64 |
+
MonotonicVector() : m_size(0), m_last(0), m_final(false) {}
|
| 65 |
+
|
| 66 |
+
size_t size() const {
|
| 67 |
+
return m_size + m_tempDiffs.size();
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
PosT at(size_t i) const {
|
| 71 |
+
PosT s = stepSize;
|
| 72 |
+
PosT j = m_anchors[i / s];
|
| 73 |
+
PosT r = i % s;
|
| 74 |
+
|
| 75 |
+
typename Diffs::const_iterator it = m_diffs.begin() + j;
|
| 76 |
+
|
| 77 |
+
PosT k = 0;
|
| 78 |
+
k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1);
|
| 79 |
+
if(i < m_size)
|
| 80 |
+
k += Simple9::DecodeAndSum(it, m_diffs.end(), r);
|
| 81 |
+
else if(i < m_size + m_tempDiffs.size())
|
| 82 |
+
for(size_t l = 0; l < r; l++)
|
| 83 |
+
k += m_tempDiffs[l];
|
| 84 |
+
|
| 85 |
+
return k;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
PosT operator[](PosT i) const {
|
| 89 |
+
return at(i);
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
PosT back() const {
|
| 93 |
+
return at(size()-1);
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
void push_back(PosT i) {
|
| 97 |
+
assert(m_final != true);
|
| 98 |
+
|
| 99 |
+
if(m_anchors.size() == 0 && m_tempDiffs.size() == 0) {
|
| 100 |
+
m_anchors.push_back(0);
|
| 101 |
+
VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
|
| 102 |
+
m_last = i;
|
| 103 |
+
m_size++;
|
| 104 |
+
|
| 105 |
+
return;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
if(m_tempDiffs.size() == stepSize-1) {
|
| 109 |
+
Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
|
| 110 |
+
std::back_inserter(m_diffs));
|
| 111 |
+
m_anchors.push_back(m_diffs.size());
|
| 112 |
+
VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
|
| 113 |
+
|
| 114 |
+
m_size += m_tempDiffs.size() + 1;
|
| 115 |
+
m_tempDiffs.clear();
|
| 116 |
+
} else {
|
| 117 |
+
PosT last = m_last;
|
| 118 |
+
PosT diff = i - last;
|
| 119 |
+
m_tempDiffs.push_back(diff);
|
| 120 |
+
}
|
| 121 |
+
m_last = i;
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
void commit() {
|
| 125 |
+
assert(m_final != true);
|
| 126 |
+
Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
|
| 127 |
+
std::back_inserter(m_diffs));
|
| 128 |
+
m_size += m_tempDiffs.size();
|
| 129 |
+
m_tempDiffs.clear();
|
| 130 |
+
m_final = true;
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
size_t usage() {
|
| 134 |
+
return m_diffs.size() * sizeof(unsigned int)
|
| 135 |
+
+ m_anchors.size() * sizeof(NumT);
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
size_t load(std::FILE* in, bool map = false) {
|
| 139 |
+
size_t byteSize = 0;
|
| 140 |
+
|
| 141 |
+
byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool);
|
| 142 |
+
byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t);
|
| 143 |
+
byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT);
|
| 144 |
+
|
| 145 |
+
byteSize += loadVector(m_diffs, in, map);
|
| 146 |
+
byteSize += loadVector(m_anchors, in, map);
|
| 147 |
+
|
| 148 |
+
return byteSize;
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
/**
 * Reads a length-prefixed array from `in` into a heap-backed vector.
 *
 * The stream must contain a size_t element count followed by that many raw
 * ValueT values.
 *
 * @param v    destination; resized to the stored element count. It is
 *             cleared if the element count itself cannot be read.
 * @param in   stream positioned at the element count
 * @param map  must be false: mapping is not possible with std::allocator
 * @return     number of bytes actually read (count header plus payload)
 */
template <typename ValueT>
size_t loadVector(std::vector<ValueT, std::allocator<ValueT> >& v,
                  std::FILE* in, bool map = false) {
  // Can only be read into memory. Mapping not possible with std::allocator.
  assert(map == false);
  (void)map;  // silence the unused-parameter warning when asserts are off

  size_t byteSize = 0;

  // Initialise the count so a failed read can never leave it indeterminate.
  size_t valSize = 0;
  const size_t headerItems = std::fread(&valSize, sizeof(size_t), 1, in);
  byteSize += headerItems * sizeof(size_t);
  if(headerItems != 1) {
    // No element count available: nothing can be loaded.
    v.clear();
    return byteSize;
  }

  v.resize(valSize, 0);
  // Taking &v[0] of an empty vector is undefined, so skip the payload read.
  if(valSize > 0)
    byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);

  return byteSize;
}
|
| 167 |
+
|
| 168 |
+
template <typename ValueT>
|
| 169 |
+
size_t loadVector(std::vector<ValueT, MmapAllocator<ValueT> >& v,
|
| 170 |
+
std::FILE* in, bool map = false) {
|
| 171 |
+
size_t byteSize = 0;
|
| 172 |
+
|
| 173 |
+
size_t valSize;
|
| 174 |
+
byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
|
| 175 |
+
|
| 176 |
+
if(map == false) {
|
| 177 |
+
// Read data into temporary file (default constructor of MmapAllocator)
|
| 178 |
+
// and map memory onto temporary file. Can be resized.
|
| 179 |
+
|
| 180 |
+
v.resize(valSize, 0);
|
| 181 |
+
byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
|
| 182 |
+
} else {
|
| 183 |
+
// Map it directly on specified region of file "in" starting at valPos
|
| 184 |
+
// with length valSize * sizeof(ValueT). Mapped region cannot be resized.
|
| 185 |
+
|
| 186 |
+
size_t valPos = std::ftell(in);
|
| 187 |
+
|
| 188 |
+
Allocator<ValueT> alloc(in, valPos);
|
| 189 |
+
std::vector<ValueT, Allocator<ValueT> > vTemp(alloc);
|
| 190 |
+
vTemp.resize(valSize);
|
| 191 |
+
v.swap(vTemp);
|
| 192 |
+
|
| 193 |
+
std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR);
|
| 194 |
+
byteSize += valSize * sizeof(ValueT);
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
return byteSize;
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
size_t save(std::FILE* out) {
|
| 201 |
+
if(!m_final)
|
| 202 |
+
commit();
|
| 203 |
+
|
| 204 |
+
bool byteSize = 0;
|
| 205 |
+
byteSize += ThrowingFwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool);
|
| 206 |
+
byteSize += ThrowingFwrite(&m_size, sizeof(size_t), 1, out) * sizeof(size_t);
|
| 207 |
+
byteSize += ThrowingFwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT);
|
| 208 |
+
|
| 209 |
+
size_t size = m_diffs.size();
|
| 210 |
+
byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
|
| 211 |
+
byteSize += ThrowingFwrite(&m_diffs[0], sizeof(unsigned int), size, out) * sizeof(unsigned int);
|
| 212 |
+
|
| 213 |
+
size = m_anchors.size();
|
| 214 |
+
byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
|
| 215 |
+
byteSize += ThrowingFwrite(&m_anchors[0], sizeof(NumT), size, out) * sizeof(NumT);
|
| 216 |
+
|
| 217 |
+
return byteSize;
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
/**
 * Exchanges the encoded contents (m_diffs and m_anchors) with `mv`.
 *
 * This vector is committed first if it is not final yet.
 *
 * NOTE(review): only the two containers are exchanged here; m_size, m_last
 * and m_final are left as they were, and `mv` is not committed. Confirm
 * that callers do not rely on those fields after swapping.
 */
void swap(MonotonicVector<PosT, NumT, stepSize, Allocator> &mv) {
  if(m_final == false) {
    commit();
  }

  m_anchors.swap(mv.m_anchors);
  m_diffs.swap(mv.m_diffs);
}
|
| 227 |
+
};
|
| 228 |
+
|
| 229 |
+
}
|
| 230 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/PhraseDecoder.h
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_PhraseDecoder_h
|
| 23 |
+
#define moses_PhraseDecoder_h
|
| 24 |
+
|
| 25 |
+
#include <sstream>
|
| 26 |
+
#include <vector>
|
| 27 |
+
#include <boost/unordered_map.hpp>
|
| 28 |
+
#include <boost/unordered_set.hpp>
|
| 29 |
+
#include <string>
|
| 30 |
+
#include <iterator>
|
| 31 |
+
#include <algorithm>
|
| 32 |
+
#include <sys/stat.h>
|
| 33 |
+
|
| 34 |
+
#include "moses/TypeDef.h"
|
| 35 |
+
#include "moses/FactorCollection.h"
|
| 36 |
+
#include "moses/Word.h"
|
| 37 |
+
#include "moses/Util.h"
|
| 38 |
+
#include "moses/InputFileStream.h"
|
| 39 |
+
#include "moses/StaticData.h"
|
| 40 |
+
#include "moses/Range.h"
|
| 41 |
+
|
| 42 |
+
#include "PhraseDictionaryCompact.h"
|
| 43 |
+
#include "StringVector.h"
|
| 44 |
+
#include "CanonicalHuffman.h"
|
| 45 |
+
#include "TargetPhraseCollectionCache.h"
|
| 46 |
+
|
| 47 |
+
namespace Moses
|
| 48 |
+
{
|
| 49 |
+
|
| 50 |
+
class PhraseDictionaryCompact;
|
| 51 |
+
|
| 52 |
+
class PhraseDecoder
|
| 53 |
+
{
|
| 54 |
+
protected:
|
| 55 |
+
|
| 56 |
+
friend class PhraseDictionaryCompact;
|
| 57 |
+
|
| 58 |
+
typedef std::pair<unsigned char, unsigned char> AlignPoint;
|
| 59 |
+
typedef std::pair<unsigned, unsigned> SrcTrg;
|
| 60 |
+
|
| 61 |
+
enum Coding { None, REnc, PREnc } m_coding;
|
| 62 |
+
|
| 63 |
+
size_t m_numScoreComponent;
|
| 64 |
+
bool m_containsAlignmentInfo;
|
| 65 |
+
size_t m_maxRank;
|
| 66 |
+
size_t m_maxPhraseLength;
|
| 67 |
+
|
| 68 |
+
boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
|
| 69 |
+
StringVector<unsigned char, unsigned, std::allocator> m_sourceSymbols;
|
| 70 |
+
StringVector<unsigned char, unsigned, std::allocator> m_targetSymbols;
|
| 71 |
+
|
| 72 |
+
std::vector<size_t> m_lexicalTableIndex;
|
| 73 |
+
std::vector<SrcTrg> m_lexicalTable;
|
| 74 |
+
|
| 75 |
+
CanonicalHuffman<unsigned>* m_symbolTree;
|
| 76 |
+
|
| 77 |
+
bool m_multipleScoreTrees;
|
| 78 |
+
std::vector<CanonicalHuffman<float>*> m_scoreTrees;
|
| 79 |
+
|
| 80 |
+
CanonicalHuffman<AlignPoint>* m_alignTree;
|
| 81 |
+
|
| 82 |
+
TargetPhraseCollectionCache m_decodingCache;
|
| 83 |
+
|
| 84 |
+
PhraseDictionaryCompact& m_phraseDictionary;
|
| 85 |
+
|
| 86 |
+
// ***********************************************
|
| 87 |
+
|
| 88 |
+
const std::vector<FactorType>* m_input;
|
| 89 |
+
const std::vector<FactorType>* m_output;
|
| 90 |
+
|
| 91 |
+
std::string m_separator;
|
| 92 |
+
|
| 93 |
+
// ***********************************************
|
| 94 |
+
|
| 95 |
+
unsigned GetSourceSymbolId(std::string& s);
|
| 96 |
+
std::string GetTargetSymbol(unsigned id) const;
|
| 97 |
+
|
| 98 |
+
size_t GetREncType(unsigned encodedSymbol);
|
| 99 |
+
size_t GetPREncType(unsigned encodedSymbol);
|
| 100 |
+
|
| 101 |
+
unsigned GetTranslation(unsigned srcIdx, size_t rank);
|
| 102 |
+
|
| 103 |
+
size_t GetMaxSourcePhraseLength();
|
| 104 |
+
|
| 105 |
+
unsigned DecodeREncSymbol1(unsigned encodedSymbol);
|
| 106 |
+
unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol);
|
| 107 |
+
unsigned DecodeREncSymbol2Position(unsigned encodedSymbol);
|
| 108 |
+
unsigned DecodeREncSymbol3(unsigned encodedSymbol);
|
| 109 |
+
|
| 110 |
+
unsigned DecodePREncSymbol1(unsigned encodedSymbol);
|
| 111 |
+
int DecodePREncSymbol2Left(unsigned encodedSymbol);
|
| 112 |
+
int DecodePREncSymbol2Right(unsigned encodedSymbol);
|
| 113 |
+
unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol);
|
| 114 |
+
|
| 115 |
+
std::string MakeSourceKey(std::string &);
|
| 116 |
+
|
| 117 |
+
public:
|
| 118 |
+
|
| 119 |
+
PhraseDecoder(
|
| 120 |
+
PhraseDictionaryCompact &phraseDictionary,
|
| 121 |
+
const std::vector<FactorType>* input,
|
| 122 |
+
const std::vector<FactorType>* output,
|
| 123 |
+
size_t numScoreComponent
|
| 124 |
+
);
|
| 125 |
+
|
| 126 |
+
~PhraseDecoder();
|
| 127 |
+
|
| 128 |
+
size_t Load(std::FILE* in);
|
| 129 |
+
|
| 130 |
+
TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase,
|
| 131 |
+
bool topLevel = false, bool eval = true);
|
| 132 |
+
|
| 133 |
+
TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv,
|
| 134 |
+
BitWrapper<> &encodedBitStream,
|
| 135 |
+
const Phrase &sourcePhrase,
|
| 136 |
+
bool topLevel,
|
| 137 |
+
bool eval);
|
| 138 |
+
|
| 139 |
+
void PruneCache();
|
| 140 |
+
};
|
| 141 |
+
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
#endif
|
mosesdecoder/moses/TranslationModel/CompactPT/PhraseTableCreator.h
ADDED
|
@@ -0,0 +1,412 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// $Id$
|
| 2 |
+
// vim:tabstop=2
|
| 3 |
+
/***********************************************************************
|
| 4 |
+
Moses - factored phrase-based language decoder
|
| 5 |
+
Copyright (C) 2006 University of Edinburgh
|
| 6 |
+
|
| 7 |
+
This library is free software; you can redistribute it and/or
|
| 8 |
+
modify it under the terms of the GNU Lesser General Public
|
| 9 |
+
License as published by the Free Software Foundation; either
|
| 10 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 11 |
+
|
| 12 |
+
This library is distributed in the hope that it will be useful,
|
| 13 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 14 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 15 |
+
Lesser General Public License for more details.
|
| 16 |
+
|
| 17 |
+
You should have received a copy of the GNU Lesser General Public
|
| 18 |
+
License along with this library; if not, write to the Free Software
|
| 19 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 20 |
+
***********************************************************************/
|
| 21 |
+
|
| 22 |
+
#ifndef moses_PhraseTableCreator_h
|
| 23 |
+
#define moses_PhraseTableCreator_h
|
| 24 |
+
|
| 25 |
+
#include <sstream>
|
| 26 |
+
#include <iostream>
|
| 27 |
+
#include <queue>
|
| 28 |
+
#include <vector>
|
| 29 |
+
#include <set>
|
| 30 |
+
#include <boost/unordered_map.hpp>
|
| 31 |
+
|
| 32 |
+
#include "moses/InputFileStream.h"
|
| 33 |
+
#include "moses/ThreadPool.h"
|
| 34 |
+
#include "moses/Util.h"
|
| 35 |
+
|
| 36 |
+
#include "BlockHashIndex.h"
|
| 37 |
+
#include "StringVector.h"
|
| 38 |
+
#include "StringVectorTemp.h"
|
| 39 |
+
#include "CanonicalHuffman.h"
|
| 40 |
+
|
| 41 |
+
namespace Moses
|
| 42 |
+
{
|
| 43 |
+
|
| 44 |
+
typedef std::pair<unsigned char, unsigned char> AlignPoint;
|
| 45 |
+
|
| 46 |
+
template <typename DataType>
|
| 47 |
+
class Counter
|
| 48 |
+
{
|
| 49 |
+
public:
|
| 50 |
+
typedef boost::unordered_map<DataType, size_t> FreqMap;
|
| 51 |
+
typedef typename FreqMap::iterator iterator;
|
| 52 |
+
typedef typename FreqMap::mapped_type mapped_type;
|
| 53 |
+
typedef typename FreqMap::value_type value_type;
|
| 54 |
+
|
| 55 |
+
private:
|
| 56 |
+
#ifdef WITH_THREADS
|
| 57 |
+
boost::mutex m_mutex;
|
| 58 |
+
#endif
|
| 59 |
+
FreqMap m_freqMap;
|
| 60 |
+
size_t m_maxSize;
|
| 61 |
+
std::vector<DataType> m_bestVec;
|
| 62 |
+
|
| 63 |
+
struct FreqSorter {
|
| 64 |
+
bool operator()(const value_type& a, const value_type& b) const {
|
| 65 |
+
if(a.second > b.second)
|
| 66 |
+
return true;
|
| 67 |
+
// Check impact on translation quality!
|
| 68 |
+
if(a.second == b.second && a.first > b.first)
|
| 69 |
+
return true;
|
| 70 |
+
return false;
|
| 71 |
+
}
|
| 72 |
+
};
|
| 73 |
+
|
| 74 |
+
public:
|
| 75 |
+
Counter() : m_maxSize(0) {}
|
| 76 |
+
|
| 77 |
+
iterator Begin() {
|
| 78 |
+
return m_freqMap.begin();
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
iterator End() {
|
| 82 |
+
return m_freqMap.end();
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
void Increase(DataType data) {
|
| 86 |
+
#ifdef WITH_THREADS
|
| 87 |
+
boost::mutex::scoped_lock lock(m_mutex);
|
| 88 |
+
#endif
|
| 89 |
+
m_freqMap[data]++;
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
void IncreaseBy(DataType data, size_t num) {
|
| 93 |
+
#ifdef WITH_THREADS
|
| 94 |
+
boost::mutex::scoped_lock lock(m_mutex);
|
| 95 |
+
#endif
|
| 96 |
+
m_freqMap[data] += num;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
mapped_type& operator[](DataType data) {
|
| 100 |
+
return m_freqMap[data];
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
size_t Size() {
|
| 104 |
+
#ifdef WITH_THREADS
|
| 105 |
+
boost::mutex::scoped_lock lock(m_mutex);
|
| 106 |
+
#endif
|
| 107 |
+
return m_freqMap.size();
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
void Quantize(size_t maxSize) {
|
| 111 |
+
#ifdef WITH_THREADS
|
| 112 |
+
boost::mutex::scoped_lock lock(m_mutex);
|
| 113 |
+
#endif
|
| 114 |
+
m_maxSize = maxSize;
|
| 115 |
+
std::vector<std::pair<DataType, mapped_type> > freqVec;
|
| 116 |
+
freqVec.insert(freqVec.begin(), m_freqMap.begin(), m_freqMap.end());
|
| 117 |
+
std::sort(freqVec.begin(), freqVec.end(), FreqSorter());
|
| 118 |
+
|
| 119 |
+
for(size_t i = 0; i < freqVec.size() && i < m_maxSize; i++)
|
| 120 |
+
m_bestVec.push_back(freqVec[i].first);
|
| 121 |
+
|
| 122 |
+
std::sort(m_bestVec.begin(), m_bestVec.end());
|
| 123 |
+
|
| 124 |
+
FreqMap t_freqMap;
|
| 125 |
+
for(typename std::vector<std::pair<DataType, mapped_type> >::iterator it
|
| 126 |
+
= freqVec.begin(); it != freqVec.end(); it++) {
|
| 127 |
+
DataType closest = LowerBound(it->first);
|
| 128 |
+
t_freqMap[closest] += it->second;
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
m_freqMap.swap(t_freqMap);
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
void Clear() {
|
| 135 |
+
#ifdef WITH_THREADS
|
| 136 |
+
boost::mutex::scoped_lock lock(m_mutex);
|
| 137 |
+
#endif
|
| 138 |
+
m_freqMap.clear();
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
DataType LowerBound(DataType data) {
|
| 142 |
+
if(m_maxSize == 0 || m_bestVec.size() == 0)
|
| 143 |
+
return data;
|
| 144 |
+
else {
|
| 145 |
+
typename std::vector<DataType>::iterator it
|
| 146 |
+
= std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
|
| 147 |
+
if(it != m_bestVec.end())
|
| 148 |
+
return *it;
|
| 149 |
+
else
|
| 150 |
+
return m_bestVec.back();
|
| 151 |
+
}
|
| 152 |
+
}
|
| 153 |
+
};
|
| 154 |
+
|
| 155 |
+
/**
 * Value object bundling a line number, a source phrase, a packed target
 * phrase, a rank and a score. Instances are ordered by the free operator<
 * declared after the class (its definition is not in this header).
 */
class PackedItem
{
public:
  PackedItem(long line, std::string sourcePhrase,
             std::string packedTargetPhrase, size_t rank,
             float m_score = 0);

  // Read-only accessors for the stored fields.
  long GetLine() const;
  const std::string& GetSrc() const;
  const std::string& GetTrg() const;
  size_t GetRank() const;
  float GetScore() const;

private:
  long m_line;
  std::string m_sourcePhrase;
  std::string m_packedTargetPhrase;
  size_t m_rank;
  float m_score;
};

// Ordering for PackedItem, as required by std::priority_queue<PackedItem>.
bool operator<(const PackedItem &pi1, const PackedItem &pi2);
|
| 177 |
+
|
| 178 |
+
class PhraseTableCreator
|
| 179 |
+
{
|
| 180 |
+
public:
|
| 181 |
+
enum Coding { None, REnc, PREnc };
|
| 182 |
+
|
| 183 |
+
private:
|
| 184 |
+
std::string m_inPath;
|
| 185 |
+
std::string m_outPath;
|
| 186 |
+
std::string m_tempfilePath;
|
| 187 |
+
|
| 188 |
+
std::FILE* m_outFile;
|
| 189 |
+
|
| 190 |
+
size_t m_numScoreComponent;
|
| 191 |
+
size_t m_sortScoreIndex;
|
| 192 |
+
size_t m_warnMe;
|
| 193 |
+
|
| 194 |
+
Coding m_coding;
|
| 195 |
+
size_t m_orderBits;
|
| 196 |
+
size_t m_fingerPrintBits;
|
| 197 |
+
bool m_useAlignmentInfo;
|
| 198 |
+
bool m_multipleScoreTrees;
|
| 199 |
+
size_t m_quantize;
|
| 200 |
+
size_t m_maxRank;
|
| 201 |
+
|
| 202 |
+
static std::string m_phraseStopSymbol;
|
| 203 |
+
static std::string m_separator;
|
| 204 |
+
|
| 205 |
+
#ifdef WITH_THREADS
|
| 206 |
+
size_t m_threads;
|
| 207 |
+
boost::mutex m_mutex;
|
| 208 |
+
#endif
|
| 209 |
+
|
| 210 |
+
BlockHashIndex m_srcHash;
|
| 211 |
+
BlockHashIndex m_rnkHash;
|
| 212 |
+
|
| 213 |
+
size_t m_maxPhraseLength;
|
| 214 |
+
|
| 215 |
+
std::vector<unsigned> m_ranks;
|
| 216 |
+
|
| 217 |
+
typedef std::pair<unsigned, unsigned> SrcTrg;
|
| 218 |
+
typedef std::pair<std::string, std::string> SrcTrgString;
|
| 219 |
+
typedef std::pair<SrcTrgString, float> SrcTrgProb;
|
| 220 |
+
|
| 221 |
+
struct SrcTrgProbSorter {
|
| 222 |
+
bool operator()(const SrcTrgProb& a, const SrcTrgProb& b) const {
|
| 223 |
+
if(a.first.first < b.first.first)
|
| 224 |
+
return true;
|
| 225 |
+
|
| 226 |
+
if(a.first.first == b.first.first && a.second > b.second)
|
| 227 |
+
return true;
|
| 228 |
+
|
| 229 |
+
if(a.first.first == b.first.first
|
| 230 |
+
&& a.second == b.second
|
| 231 |
+
&& a.first.second < b.first.second)
|
| 232 |
+
return true;
|
| 233 |
+
|
| 234 |
+
return false;
|
| 235 |
+
}
|
| 236 |
+
};
|
| 237 |
+
|
| 238 |
+
std::vector<size_t> m_lexicalTableIndex;
|
| 239 |
+
std::vector<SrcTrg> m_lexicalTable;
|
| 240 |
+
|
| 241 |
+
StringVectorTemp<unsigned char, unsigned long, MmapAllocator>*
|
| 242 |
+
m_encodedTargetPhrases;
|
| 243 |
+
|
| 244 |
+
StringVector<unsigned char, unsigned long, MmapAllocator>*
|
| 245 |
+
m_compressedTargetPhrases;
|
| 246 |
+
|
| 247 |
+
boost::unordered_map<std::string, unsigned> m_targetSymbolsMap;
|
| 248 |
+
boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
|
| 249 |
+
|
| 250 |
+
typedef Counter<unsigned> SymbolCounter;
|
| 251 |
+
typedef Counter<float> ScoreCounter;
|
| 252 |
+
typedef Counter<AlignPoint> AlignCounter;
|
| 253 |
+
|
| 254 |
+
typedef CanonicalHuffman<unsigned> SymbolTree;
|
| 255 |
+
typedef CanonicalHuffman<float> ScoreTree;
|
| 256 |
+
typedef CanonicalHuffman<AlignPoint> AlignTree;
|
| 257 |
+
|
| 258 |
+
SymbolCounter m_symbolCounter;
|
| 259 |
+
SymbolTree* m_symbolTree;
|
| 260 |
+
|
| 261 |
+
AlignCounter m_alignCounter;
|
| 262 |
+
AlignTree* m_alignTree;
|
| 263 |
+
|
| 264 |
+
std::vector<ScoreCounter*> m_scoreCounters;
|
| 265 |
+
std::vector<ScoreTree*> m_scoreTrees;
|
| 266 |
+
|
| 267 |
+
std::priority_queue<PackedItem> m_queue;
|
| 268 |
+
long m_lastFlushedLine;
|
| 269 |
+
long m_lastFlushedSourceNum;
|
| 270 |
+
std::string m_lastFlushedSourcePhrase;
|
| 271 |
+
std::vector<std::string> m_lastSourceRange;
|
| 272 |
+
std::priority_queue<std::pair<float, size_t> > m_rankQueue;
|
| 273 |
+
std::vector<std::string> m_lastCollection;
|
| 274 |
+
|
| 275 |
+
void Save();
|
| 276 |
+
void PrintInfo();
|
| 277 |
+
|
| 278 |
+
void AddSourceSymbolId(std::string& symbol);
|
| 279 |
+
unsigned GetSourceSymbolId(std::string& symbol);
|
| 280 |
+
|
| 281 |
+
void AddTargetSymbolId(std::string& symbol);
|
| 282 |
+
unsigned GetTargetSymbolId(std::string& symbol);
|
| 283 |
+
unsigned GetOrAddTargetSymbolId(std::string& symbol);
|
| 284 |
+
|
| 285 |
+
unsigned GetRank(unsigned srcIdx, unsigned trgIdx);
|
| 286 |
+
|
| 287 |
+
unsigned EncodeREncSymbol1(unsigned symbol);
|
| 288 |
+
unsigned EncodeREncSymbol2(unsigned position, unsigned rank);
|
| 289 |
+
unsigned EncodeREncSymbol3(unsigned rank);
|
| 290 |
+
|
| 291 |
+
unsigned EncodePREncSymbol1(unsigned symbol);
|
| 292 |
+
unsigned EncodePREncSymbol2(int lOff, int rOff, unsigned rank);
|
| 293 |
+
|
| 294 |
+
void EncodeTargetPhraseNone(std::vector<std::string>& t,
|
| 295 |
+
std::ostream& os);
|
| 296 |
+
|
| 297 |
+
void EncodeTargetPhraseREnc(std::vector<std::string>& s,
|
| 298 |
+
std::vector<std::string>& t,
|
| 299 |
+
std::set<AlignPoint>& a,
|
| 300 |
+
std::ostream& os);
|
| 301 |
+
|
| 302 |
+
void EncodeTargetPhrasePREnc(std::vector<std::string>& s,
|
| 303 |
+
std::vector<std::string>& t,
|
| 304 |
+
std::set<AlignPoint>& a, size_t ownRank,
|
| 305 |
+
std::ostream& os);
|
| 306 |
+
|
| 307 |
+
void EncodeScores(std::vector<float>& scores, std::ostream& os);
|
| 308 |
+
void EncodeAlignment(std::set<AlignPoint>& alignment, std::ostream& os);
|
| 309 |
+
|
| 310 |
+
std::string MakeSourceKey(std::string&);
|
| 311 |
+
std::string MakeSourceTargetKey(std::string&, std::string&);
|
| 312 |
+
|
| 313 |
+
void LoadLexicalTable(std::string filePath);
|
| 314 |
+
|
| 315 |
+
void CreateRankHash();
|
| 316 |
+
void EncodeTargetPhrases();
|
| 317 |
+
void CalcHuffmanCodes();
|
| 318 |
+
void CompressTargetPhrases();
|
| 319 |
+
|
| 320 |
+
void AddRankedLine(PackedItem& pi);
|
| 321 |
+
void FlushRankedQueue(bool force = false);
|
| 322 |
+
|
| 323 |
+
std::string EncodeLine(std::vector<std::string>& tokens, size_t ownRank);
|
| 324 |
+
void AddEncodedLine(PackedItem& pi);
|
| 325 |
+
void FlushEncodedQueue(bool force = false);
|
| 326 |
+
|
| 327 |
+
std::string CompressEncodedCollection(std::string encodedCollection);
|
| 328 |
+
void AddCompressedCollection(PackedItem& pi);
|
| 329 |
+
void FlushCompressedQueue(bool force = false);
|
| 330 |
+
|
| 331 |
+
public:
|
| 332 |
+
|
| 333 |
+
PhraseTableCreator(std::string inPath,
|
| 334 |
+
std::string outPath,
|
| 335 |
+
std::string tempfilePath,
|
| 336 |
+
size_t numScoreComponent = 5,
|
| 337 |
+
size_t sortScoreIndex = 2,
|
| 338 |
+
Coding coding = PREnc,
|
| 339 |
+
size_t orderBits = 10,
|
| 340 |
+
size_t fingerPrintBits = 16,
|
| 341 |
+
bool useAlignmentInfo = false,
|
| 342 |
+
bool multipleScoreTrees = true,
|
| 343 |
+
size_t quantize = 0,
|
| 344 |
+
size_t maxRank = 100,
|
| 345 |
+
bool warnMe = true
|
| 346 |
+
#ifdef WITH_THREADS
|
| 347 |
+
, size_t threads = 2
|
| 348 |
+
#endif
|
| 349 |
+
);
|
| 350 |
+
|
| 351 |
+
~PhraseTableCreator();
|
| 352 |
+
|
| 353 |
+
friend class RankingTask;
|
| 354 |
+
friend class EncodingTask;
|
| 355 |
+
friend class CompressionTask;
|
| 356 |
+
};
|
| 357 |
+
|
| 358 |
+
class RankingTask
|
| 359 |
+
{
|
| 360 |
+
private:
|
| 361 |
+
#ifdef WITH_THREADS
|
| 362 |
+
static boost::mutex m_mutex;
|
| 363 |
+
static boost::mutex m_fileMutex;
|
| 364 |
+
#endif
|
| 365 |
+
static size_t m_lineNum;
|
| 366 |
+
InputFileStream& m_inFile;
|
| 367 |
+
PhraseTableCreator& m_creator;
|
| 368 |
+
|
| 369 |
+
public:
|
| 370 |
+
RankingTask(InputFileStream& inFile, PhraseTableCreator& creator);
|
| 371 |
+
void operator()();
|
| 372 |
+
};
|
| 373 |
+
|
| 374 |
+
class EncodingTask
|
| 375 |
+
{
|
| 376 |
+
private:
|
| 377 |
+
#ifdef WITH_THREADS
|
| 378 |
+
static boost::mutex m_mutex;
|
| 379 |
+
static boost::mutex m_fileMutex;
|
| 380 |
+
#endif
|
| 381 |
+
static size_t m_lineNum;
|
| 382 |
+
static size_t m_sourcePhraseNum;
|
| 383 |
+
static std::string m_lastSourcePhrase;
|
| 384 |
+
|
| 385 |
+
InputFileStream& m_inFile;
|
| 386 |
+
PhraseTableCreator& m_creator;
|
| 387 |
+
|
| 388 |
+
public:
|
| 389 |
+
EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator);
|
| 390 |
+
void operator()();
|
| 391 |
+
};
|
| 392 |
+
|
| 393 |
+
class CompressionTask
|
| 394 |
+
{
|
| 395 |
+
private:
|
| 396 |
+
#ifdef WITH_THREADS
|
| 397 |
+
static boost::mutex m_mutex;
|
| 398 |
+
#endif
|
| 399 |
+
static size_t m_collectionNum;
|
| 400 |
+
StringVectorTemp<unsigned char, unsigned long, MmapAllocator>&
|
| 401 |
+
m_encodedCollections;
|
| 402 |
+
PhraseTableCreator& m_creator;
|
| 403 |
+
|
| 404 |
+
public:
|
| 405 |
+
CompressionTask(StringVectorTemp<unsigned char, unsigned long, MmapAllocator>&
|
| 406 |
+
encodedCollections, PhraseTableCreator& creator);
|
| 407 |
+
void operator()();
|
| 408 |
+
};
|
| 409 |
+
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
#endif
|
mosesdecoder/moses/TranslationModel/Scope3Parser/Parser.cpp
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/***********************************************************************
|
| 2 |
+
Moses - statistical machine translation system
|
| 3 |
+
Copyright (C) 2006-2012 University of Edinburgh
|
| 4 |
+
|
| 5 |
+
This library is free software; you can redistribute it and/or
|
| 6 |
+
modify it under the terms of the GNU Lesser General Public
|
| 7 |
+
License as published by the Free Software Foundation; either
|
| 8 |
+
version 2.1 of the License, or (at your option) any later version.
|
| 9 |
+
|
| 10 |
+
This library is distributed in the hope that it will be useful,
|
| 11 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 12 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
| 13 |
+
Lesser General Public License for more details.
|
| 14 |
+
|
| 15 |
+
You should have received a copy of the GNU Lesser General Public
|
| 16 |
+
License along with this library; if not, write to the Free Software
|
| 17 |
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 18 |
+
***********************************************************************/
|
| 19 |
+
|
| 20 |
+
#include "Parser.h"
|
| 21 |
+
|
| 22 |
+
#include "moses/ChartParser.h"
|
| 23 |
+
#include "moses/ChartTranslationOptionList.h"
|
| 24 |
+
#include "moses/InputType.h"
|
| 25 |
+
#include "moses/NonTerminal.h"
|
| 26 |
+
#include "moses/TranslationModel/RuleTable/UTrieNode.h"
|
| 27 |
+
#include "moses/TranslationModel/RuleTable/UTrie.h"
|
| 28 |
+
#include "moses/StaticData.h"
|
| 29 |
+
#include "ApplicableRuleTrie.h"
|
| 30 |
+
#include "StackLattice.h"
|
| 31 |
+
#include "StackLatticeBuilder.h"
|
| 32 |
+
#include "StackLatticeSearcher.h"
|
| 33 |
+
#include "VarSpanTrieBuilder.h"
|
| 34 |
+
|
| 35 |
+
#include <memory>
|
| 36 |
+
#include <vector>
|
| 37 |
+
|
| 38 |
+
namespace Moses
|
| 39 |
+
{
|
| 40 |
+
|
| 41 |
+
void Scope3Parser::GetChartRuleCollection(
|
| 42 |
+
const InputPath &inputPath,
|
| 43 |
+
size_t last,
|
| 44 |
+
ChartParserCallback &outColl)
|
| 45 |
+
{
|
| 46 |
+
const Range &range = inputPath.GetWordsRange();
|
| 47 |
+
const size_t start = range.GetStartPos();
|
| 48 |
+
const size_t end = range.GetEndPos();
|
| 49 |
+
|
| 50 |
+
std::vector<std::pair<const UTrieNode *, const VarSpanNode *> > &pairVec
|
| 51 |
+
= m_ruleApplications[start][end-start+1];
|
| 52 |
+
|
| 53 |
+
MatchCallback matchCB(range, outColl);
|
| 54 |
+
for (std::vector<std::pair<const UTrieNode *, const VarSpanNode *> >::const_iterator p = pairVec.begin(); p != pairVec.end(); ++p) {
|
| 55 |
+
const UTrieNode &ruleNode = *(p->first);
|
| 56 |
+
const VarSpanNode &varSpanNode = *(p->second);
|
| 57 |
+
|
| 58 |
+
const UTrieNode::LabelMap &labelMap = ruleNode.GetLabelMap();
|
| 59 |
+
|
| 60 |
+
if (varSpanNode.m_rank == 0) { // Purely lexical rule.
|
| 61 |
+
assert(labelMap.size() == 1);
|
| 62 |
+
TargetPhraseCollection::shared_ptr tpc = labelMap.begin()->second;
|
| 63 |
+
matchCB.m_tpc = tpc;
|
| 64 |
+
matchCB(m_emptyStackVec);
|
| 65 |
+
} else { // Rule has at least one non-terminal.
|
| 66 |
+
varSpanNode.CalculateRanges(start, end, m_ranges);
|
| 67 |
+
m_latticeBuilder.Build(start, end, ruleNode, varSpanNode, m_ranges,
|
| 68 |
+
*this, m_lattice,
|
| 69 |
+
m_quickCheckTable);
|
| 70 |
+
StackLatticeSearcher<MatchCallback> searcher(m_lattice, m_ranges);
|
| 71 |
+
UTrieNode::LabelMap::const_iterator p = labelMap.begin();
|
| 72 |
+
for (; p != labelMap.end(); ++p) {
|
| 73 |
+
const std::vector<int> &labels = p->first;
|
| 74 |
+
TargetPhraseCollection::shared_ptr tpc = p->second;
|
| 75 |
+
assert(labels.size() == varSpanNode.m_rank);
|
| 76 |
+
bool failCheck = false;
|
| 77 |
+
for (size_t i = 0; i < varSpanNode.m_rank; ++i) {
|
| 78 |
+
if (!m_quickCheckTable[i][labels[i]]) {
|
| 79 |
+
failCheck = true;
|
| 80 |
+
break;
|
| 81 |
+
}
|
| 82 |
+
}
|
| 83 |
+
if (failCheck) {
|
| 84 |
+
continue;
|
| 85 |
+
}
|
| 86 |
+
matchCB.m_tpc = tpc;
|
| 87 |
+
searcher.Search(labels, matchCB);
|
| 88 |
+
}
|
| 89 |
+
}
|
| 90 |
+
}
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
void Scope3Parser::Init()
|
| 94 |
+
{
|
| 95 |
+
InitRuleApplicationVector();
|
| 96 |
+
|
| 97 |
+
// Build a map from Words to index-sets.
|
| 98 |
+
SentenceMap sentMap;
|
| 99 |
+
FillSentenceMap(sentMap);
|
| 100 |
+
|
| 101 |
+
// Build a trie containing 'elastic' application contexts
|
| 102 |
+
const UTrieNode &rootNode = m_ruleTable.GetRootNode();
|
| 103 |
+
std::auto_ptr<ApplicableRuleTrie> art(new ApplicableRuleTrie(-1, -1, rootNode));
|
| 104 |
+
art->Extend(rootNode, -1, sentMap, false);
|
| 105 |
+
|
| 106 |
+
// Build a trie containing just the non-terminal contexts and insert pointers
|
| 107 |
+
// to its nodes back into the ART trie. Contiguous non-terminal contexts are
|
| 108 |
+
// merged and the number of split points is recorded.
|
| 109 |
+
VarSpanTrieBuilder vstBuilder;
|
| 110 |
+
m_varSpanTrie = vstBuilder.Build(*art);
|
| 111 |
+
|
| 112 |
+
// Fill each cell with a list of pointers to relevant ART nodes.
|
| 113 |
+
AddRulesToCells(*art, std::make_pair<int, int>(-1, -1), GetParser().GetSize()-1, 0);
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
void Scope3Parser::InitRuleApplicationVector()
|
| 117 |
+
{
|
| 118 |
+
const size_t sourceSize = GetParser().GetSize();
|
| 119 |
+
m_ruleApplications.resize(sourceSize);
|
| 120 |
+
for (size_t start = 0; start < sourceSize; ++start) {
|
| 121 |
+
size_t maxSpan = sourceSize-start+1;
|
| 122 |
+
m_ruleApplications[start].resize(maxSpan+1);
|
| 123 |
+
}
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
// Record, for every word of the input, the list of positions at which it
// occurs (positions are appended in increasing order).
void Scope3Parser::FillSentenceMap(SentenceMap &sentMap)
{
  const size_t numWords = GetParser().GetSize();
  for (size_t pos = 0; pos != numWords; ++pos) {
    // The input path covering just [pos,pos] ends in the word at pos.
    sentMap[GetParser().GetInputPath(pos, pos).GetLastWord()].push_back(pos);
  }
}
|
| 133 |
+
|
| 134 |
+
void Scope3Parser::AddRulesToCells(
|
| 135 |
+
const ApplicableRuleTrie &node,
|
| 136 |
+
std::pair<int, int> start,
|
| 137 |
+
int maxPos,
|
| 138 |
+
int depth)
|
| 139 |
+
{
|
| 140 |
+
if (depth > 0) {
|
| 141 |
+
// Determine the start range for this path if not already known.
|
| 142 |
+
if (start.first == -1 && start.second == -1) {
|
| 143 |
+
assert(depth == 1);
|
| 144 |
+
start.first = std::max(0, node.m_start);
|
| 145 |
+
start.second = node.m_start;
|
| 146 |
+
} else if (start.second < 0) {
|
| 147 |
+
assert(depth > 1);
|
| 148 |
+
if (node.m_start == -1) {
|
| 149 |
+
--start.second; // Record split point
|
| 150 |
+
} else {
|
| 151 |
+
int numSplitPoints = -1 - start.second;
|
| 152 |
+
start.second = node.m_start - (numSplitPoints+1);
|
| 153 |
+
}
|
| 154 |
+
}
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
if (node.m_node->HasRules()) {
|
| 158 |
+
assert(depth > 0);
|
| 159 |
+
assert(node.m_vstNode);
|
| 160 |
+
// Determine the end range for this path.
|
| 161 |
+
std::pair<int, int> end;
|
| 162 |
+
if (node.m_end == -1) {
|
| 163 |
+
end.first = (*(node.m_vstNode->m_label))[2];
|
| 164 |
+
end.second = (*(node.m_vstNode->m_label))[3];
|
| 165 |
+
assert(end.first != -1);
|
| 166 |
+
if (end.second == -1) {
|
| 167 |
+
end.second = maxPos;
|
| 168 |
+
}
|
| 169 |
+
} else {
|
| 170 |
+
assert(node.m_start == node.m_end); // Should be a terminal
|
| 171 |
+
end.first = end.second = node.m_start;
|
| 172 |
+
}
|
| 173 |
+
// Add a (rule trie node, VST node) pair for each cell in the range.
|
| 174 |
+
int s2 = start.second;
|
| 175 |
+
if (s2 < 0) {
|
| 176 |
+
int numSplitPoints = -1 - s2;
|
| 177 |
+
s2 = maxPos - numSplitPoints;
|
| 178 |
+
}
|
| 179 |
+
for (int i = start.first; i <= s2; ++i) {
|
| 180 |
+
int e1 = std::max(i+depth-1, end.first);
|
| 181 |
+
for (int j = e1; j <= end.second; ++j) {
|
| 182 |
+
size_t span = j-i+1;
|
| 183 |
+
assert(span >= 1);
|
| 184 |
+
if (m_maxChartSpan && span > m_maxChartSpan) {
|
| 185 |
+
break;
|
| 186 |
+
}
|
| 187 |
+
m_ruleApplications[i][span].push_back(std::make_pair(node.m_node,
|
| 188 |
+
node.m_vstNode));
|
| 189 |
+
}
|
| 190 |
+
}
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
for (std::vector<ApplicableRuleTrie*>::const_iterator p = node.m_children.begin(); p != node.m_children.end(); ++p) {
|
| 194 |
+
AddRulesToCells(**p, start, maxPos, depth+1);
|
| 195 |
+
}
|
| 196 |
+
}
|
| 197 |
+
|
| 198 |
+
} // namespace Moses
|
mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <string>
|
| 2 |
+
#include <cassert>
|
| 3 |
+
#include <iomanip>
|
| 4 |
+
#include <algorithm>
|
| 5 |
+
#include "ug_stringdist.h"
|
| 6 |
+
// string distance measures
|
| 7 |
+
// Code by Ulrich Germann
|
| 8 |
+
|
| 9 |
+
namespace stringdist
|
| 10 |
+
{
|
| 11 |
+
|
| 12 |
+
// Remove combining marks (accents) from trg in place.
//
// The transliterator decomposes (NFD), removes all characters in the Unicode
// mark category ([:M:]) and recomposes (NFC).  It is created once, on the
// first call, and reused afterwards.
//
// Returns the status of creating the transliterator.  If creation failed,
// trg is left untouched and a failure code is returned on every call
// (previously the null transliterator was dereferenced and later calls
// reported success).
UErrorCode strip_accents(UnicodeString & trg)
{
  static UErrorCode status = U_ZERO_ERROR;
  static Transliterator *stripper
    = Transliterator::createInstance("NFD; [:M:] Remove; NFC",
                                     UTRANS_FORWARD, status);
  if (U_FAILURE(status)) return status;
  if (stripper == NULL) return U_MEMORY_ALLOCATION_ERROR;
  stripper->transliterate(trg);
  return status;
}
|
| 21 |
+
|
| 22 |
+
char const*
|
| 23 |
+
StringDiff::
|
| 24 |
+
Segment::
|
| 25 |
+
elabel[] = { "same", "cap", "flip", "permutation",
|
| 26 |
+
"accent", "duplication",
|
| 27 |
+
"insertion", "deletion",
|
| 28 |
+
"mismatch", "noinit" };
|
| 29 |
+
|
| 30 |
+
// Create an empty diff; both strings and the segment list start out empty.
StringDiff::StringDiff()
{
}
|
| 33 |
+
|
| 34 |
+
// Store both strings and align them immediately.
StringDiff::StringDiff(string const& a, string const& b)
{
  this->set_a(a);
  this->set_b(b);
  this->align();
}
|
| 41 |
+
|
| 42 |
+
// Default segment: all offsets -1, match type 'noinit', distance 0.
StringDiff::Segment::Segment()
  : start_a(-1),
    end_a(-1),
    start_b(-1),
    end_b(-1),
    match(noinit),
    dist(0)
{
}
|
| 47 |
+
|
| 48 |
+
// Set the first string of the comparison from UTF-8 encoded input and
// return the stored UnicodeString.
//
// The bytes are decoded explicitly as UTF-8 (the same encoding showDiff()
// writes) instead of going through the platform's default codepage, and the
// whole std::string is used rather than stopping at the first NUL byte.
// Any previously computed segments are discarded, so the next align() works
// on the new input even without force=true.
UnicodeString const&
StringDiff::
set_a(std::string const& a)
{
  this->a = UnicodeString::fromUTF8(
    StringPiece(a.data(), static_cast<int32_t>(a.size())));
  this->difflist.clear();
  return this->a;
}
|
| 55 |
+
|
| 56 |
+
// Set the second string of the comparison from UTF-8 encoded input and
// return the stored UnicodeString.
//
// The bytes are decoded explicitly as UTF-8 (the same encoding showDiff()
// writes) instead of going through the platform's default codepage, and the
// whole std::string is used rather than stopping at the first NUL byte.
// Any previously computed segments are discarded, so the next align() works
// on the new input even without force=true.
UnicodeString const&
StringDiff::
set_b(std::string const& b)
{
  this->b = UnicodeString::fromUTF8(
    StringPiece(b.data(), static_cast<int32_t>(b.size())));
  this->difflist.clear();
  return this->b;
}
|
| 63 |
+
|
| 64 |
+
// Read-only access to the first string.
UnicodeString const& StringDiff::get_a() const
{
  return a;
}
|
| 70 |
+
|
| 71 |
+
// Read-only access to the second string.
UnicodeString const& StringDiff::get_b() const
{
  return b;
}
|
| 77 |
+
|
| 78 |
+
size_t
|
| 79 |
+
StringDiff::
|
| 80 |
+
size()
|
| 81 |
+
{
|
| 82 |
+
return this->difflist.size();
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
// float
|
| 86 |
+
// StringDiff::
|
| 87 |
+
// levelshtein(bool force)
|
| 88 |
+
// {
|
| 89 |
+
// align(force);
|
| 90 |
+
// float ret = 0;
|
| 91 |
+
// for (size_t i = 0; i < difflist.size(); +++i)
|
| 92 |
+
// {
|
| 93 |
+
// Segment const& s = difflist[i];
|
| 94 |
+
// if (s.match == same) continue;
|
| 95 |
+
// else if (s.match == insertion) ret += s.end_b - s.start_b;
|
| 96 |
+
// else if (s.match == deletion) ret += s.end_a - s.start_a;
|
| 97 |
+
|
| 98 |
+
// }
|
| 99 |
+
// }
|
| 100 |
+
|
| 101 |
+
void
|
| 102 |
+
StringDiff::
|
| 103 |
+
fillAlignmentMatrix(vector<vector<float> > & M) const
|
| 104 |
+
{
|
| 105 |
+
assert(a.length() && b.length());
|
| 106 |
+
M.assign(a.length(),vector<float>(b.length(),0));
|
| 107 |
+
int i = 0,j;
|
| 108 |
+
while (i < b.length() && b[i] != a[0]) ++i;
|
| 109 |
+
while (i < b.length()) M[0][i++] = 1;
|
| 110 |
+
i = 0;
|
| 111 |
+
while (i < a.length() && a[i] != b[0]) ++i;
|
| 112 |
+
while (i < a.length()) M[i++][0] = 1;
|
| 113 |
+
for (i = 1; i < a.length(); ++i)
|
| 114 |
+
{
|
| 115 |
+
for (j = 1; j < b.length(); ++j)
|
| 116 |
+
{
|
| 117 |
+
float & s = M[i][j];
|
| 118 |
+
s = max(M[i-1][j],M[i][j-1]);
|
| 119 |
+
if (a[i] == b[j])
|
| 120 |
+
s = max(s,M[i-1][j-1] + 1 + (a[i-1] == b[j-1] ? .1f : 0));
|
| 121 |
+
}
|
| 122 |
+
}
|
| 123 |
+
#if 0
|
| 124 |
+
string abuf,bbuf;
|
| 125 |
+
a.toUTF8String(abuf);
|
| 126 |
+
b.toUTF8String(bbuf);
|
| 127 |
+
cout << " " << bbuf[0];
|
| 128 |
+
for (int x = 1; x < b.length(); ++x)
|
| 129 |
+
cout << " " << bbuf[x];
|
| 130 |
+
cout << endl;
|
| 131 |
+
for (int x = 0; x < a.length(); ++x)
|
| 132 |
+
{
|
| 133 |
+
cout << abuf[x] << " ";
|
| 134 |
+
for (int y = 0; y < b.length(); ++y)
|
| 135 |
+
cout << int(M[x][y]) << " ";
|
| 136 |
+
cout << endl;
|
| 137 |
+
}
|
| 138 |
+
#endif
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
// Fill M (lenA rows x lenB columns) so that M[i][j] is the length of the
// longest common subsequence of a[0..i] and b[0..j], and return the value in
// the bottom-right cell (the LCS length of the two whole buffers).
//
// If either buffer is empty there is nothing in common: M is still sized
// lenA x lenB and 0 is returned.  (Previously this case asserted in debug
// builds and read past the end of an empty vector in release builds.)
float
fillAlignmentMatrix(UChar const* a, size_t const lenA,
                    UChar const* b, size_t const lenB,
                    std::vector<std::vector<float> > & M)
{
  M.assign(lenA, std::vector<float>(lenB, 0));
  if (lenA == 0 || lenB == 0) return 0;

  // First row: 1 from the first occurrence of a[0] in b onwards.
  size_t i = 0;
  while (i < lenB && b[i] != a[0]) ++i;
  while (i < lenB) M[0][i++] = 1;

  // First column: 1 from the first occurrence of b[0] in a onwards.
  i = 0;
  while (i < lenA && a[i] != b[0]) ++i;
  while (i < lenA) M[i++][0] = 1;

  for (i = 1; i < lenA; ++i)
    {
      for (size_t j = 1; j < lenB; ++j)
        {
          float & s = M[i][j];
          s = std::max(M[i-1][j], M[i][j-1]);
          if (a[i] == b[j])
            s = std::max(s, M[i-1][j-1] + 1);
        }
    }
  return M.back().back();
}
|
| 166 |
+
|
| 167 |
+
// Levenshtein (edit) distance between a[0..lenA) and b[0..lenB): the minimum
// number of single-unit insertions, deletions and substitutions needed to
// turn one buffer into the other.  Either buffer may be empty.
//
// This replaces a backtrack over the common-subsequence matrix that did not
// compute the edit distance (it returned 1 for identical inputs, and 1 for
// "a" vs "xxa").  The standard two-row dynamic program is used instead.
float
levenshtein(UChar const* a, size_t const lenA,
            UChar const* b, size_t const lenB)
{
  if (lenA == 0) return static_cast<float>(lenB);
  if (lenB == 0) return static_cast<float>(lenA);

  // prev[j] / cur[j]: distance between the processed prefix of a and b[0..j).
  std::vector<size_t> prev(lenB + 1);
  std::vector<size_t> cur(lenB + 1);
  for (size_t j = 0; j <= lenB; ++j) prev[j] = j;

  for (size_t i = 1; i <= lenA; ++i)
    {
      cur[0] = i;
      for (size_t j = 1; j <= lenB; ++j)
        {
          size_t const subst = prev[j-1] + (a[i-1] == b[j-1] ? 0 : 1);
          size_t const indel = std::min(prev[j], cur[j-1]) + 1;
          cur[j] = std::min(subst, indel);
        }
      prev.swap(cur);
    }
  return static_cast<float>(prev[lenB]);
}
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
// Describe how a[as..ae) relates to b[bs..be).
//
// The segment is classified as same / insertion / deletion / flip /
// permutation / mismatch; a mismatch is then refined to 'cap' (equal after
// lower-casing) or 'accent' (equal after lower-casing and accent stripping).
// 'dist' is the inserted/deleted length, 1 for a flip, length-1 for a
// permutation, and the levenshtein() result for other mismatches.
StringDiff::Segment::
Segment(size_t const as, size_t const ae,
        size_t const bs, size_t const be,
        UnicodeString const& a,
        UnicodeString const& b)
{
  size_t const lenA = ae - as;
  size_t const lenB = be - bs;

  start_a = as;
  end_a = ae;
  start_b = bs;
  end_b = be;
  dist = 0;

  if (as == ae)
    {
      match = (bs == be) ? same : insertion;
    }
  else if (bs == be)
    {
      match = deletion;
    }
  else if (lenB != lenA)
    {
      match = mismatch;
      dist = stringdist::levenshtein(a.getBuffer() + as, lenA,
                                     b.getBuffer() + bs, lenB);
    }
  else
    {
      // Equal lengths: compare unit by unit, stopping at the first difference.
      bool identical = true;
      for (size_t k = 0; identical && k < lenA; ++k)
        identical = (a[as+k] == b[bs+k]);

      if (identical)
        {
          match = same;
        }
      else if (lenA == 2 && a[as] == b[bs+1] && a[as+1] == b[bs])
        {
          match = flip;
        }
      else
        {
          // Same multiset of code units in a different order?
          std::vector<UChar> x(a.getBuffer() + as, a.getBuffer() + ae);
          std::vector<UChar> y(b.getBuffer() + bs, b.getBuffer() + be);
          std::sort(x.begin(), x.end());
          std::sort(y.begin(), y.end());
          if (x == y)
            {
              match = permutation;
            }
          else
            {
              match = mismatch;
              dist = stringdist::levenshtein(a.getBuffer() + as, lenA,
                                             b.getBuffer() + bs, lenB);
            }
        }
    }

  // Fixed distances for the structural match types.
  switch (match)
    {
    case insertion:   dist = lenB;     break;
    case deletion:    dist = lenA;     break;
    case flip:        dist = 1;        break;
    case permutation: dist = lenA - 1; break;
    default:                           break;
    }

  // A mismatch may turn out to be a pure case or accent difference.
  if (match == mismatch)
    {
      UnicodeString ax(a, as, lenA);
      UnicodeString bx(b, bs, lenB);
      if (ax.toLower() == bx.toLower())
        {
          match = cap;
        }
      else
        {
          // ax and bx are already lower-cased at this point.
          strip_accents(ax);
          strip_accents(bx);
          if (ax == bx) match = accent;
        }
    }
}
|
| 278 |
+
|
| 279 |
+
size_t
|
| 280 |
+
StringDiff::
|
| 281 |
+
align(bool force)
|
| 282 |
+
{
|
| 283 |
+
if (force) difflist.clear();
|
| 284 |
+
if (difflist.size()) return 0;
|
| 285 |
+
vector<vector<float> > M;
|
| 286 |
+
fillAlignmentMatrix(M);
|
| 287 |
+
// now backtrack
|
| 288 |
+
int i = a.length() - 1;
|
| 289 |
+
int j = b.length() - 1;
|
| 290 |
+
vector<int> A(a.length(), -1);
|
| 291 |
+
vector<int> B(b.length(), -1);
|
| 292 |
+
while (i + j)
|
| 293 |
+
{
|
| 294 |
+
while (i && M[i-1][j] == M[i][j]) --i;
|
| 295 |
+
while (j && M[i][j-1] == M[i][j]) --j;
|
| 296 |
+
if (a[i] == b[j]) { A[i] = j; B[j] = i; }
|
| 297 |
+
if (i) --i;
|
| 298 |
+
if (j) --j;
|
| 299 |
+
}
|
| 300 |
+
i = a.length() - 1;
|
| 301 |
+
j = b.length() - 1;
|
| 302 |
+
vector<int> A2(a.length(), -1);
|
| 303 |
+
vector<int> B2(b.length(), -1);
|
| 304 |
+
while (i + j)
|
| 305 |
+
{
|
| 306 |
+
while (j && M[i][j-1] == M[i][j]) --j;
|
| 307 |
+
while (i && M[i-1][j] == M[i][j]) --i;
|
| 308 |
+
if (a[i] == b[j]) { A2[i] = j; B2[j] = i; }
|
| 309 |
+
if (i) --i;
|
| 310 |
+
if (j) --j;
|
| 311 |
+
}
|
| 312 |
+
for (size_t k = 0; k < A.size(); ++k)
|
| 313 |
+
A[k] = min(A[k],A2[k]);
|
| 314 |
+
for (size_t k = 0; k < B.size(); ++k)
|
| 315 |
+
B[k] = min(B[k],B2[k]);
|
| 316 |
+
|
| 317 |
+
if (a[i] == b[j]) { A[i] = j; B[j] = i; }
|
| 318 |
+
i = 0;
|
| 319 |
+
j = 0;
|
| 320 |
+
size_t I, J;
|
| 321 |
+
while (i < a.length() and j < b.length())
|
| 322 |
+
{
|
| 323 |
+
if (A[i] < 0)
|
| 324 |
+
{
|
| 325 |
+
I = i + 1;
|
| 326 |
+
while (I < A.size() and A[I] < 0) ++I;
|
| 327 |
+
if (i)
|
| 328 |
+
{ for (J = j = A[i-1]+1; J < B.size() && B[J] < 0; ++J); }
|
| 329 |
+
else if (I < A.size())
|
| 330 |
+
{ for (j = J = A[I]; j && B[j-1] < 0; --j); }
|
| 331 |
+
else J = B.size();
|
| 332 |
+
difflist.push_back(Segment(i,I,j,J,a,b));
|
| 333 |
+
i = I; j = J;
|
| 334 |
+
}
|
| 335 |
+
else if (B[j] < 0)
|
| 336 |
+
{
|
| 337 |
+
for (J = j + 1; J < B.size() && B[J] < 0; ++J);
|
| 338 |
+
difflist.push_back(Segment(i,i,j,J,a,b));
|
| 339 |
+
j = J;
|
| 340 |
+
}
|
| 341 |
+
else
|
| 342 |
+
{
|
| 343 |
+
I = i;
|
| 344 |
+
J = j;
|
| 345 |
+
while(I < A.size() && A[I] >= 0 && J < B.size() && B[J] >= 0)
|
| 346 |
+
{ ++I; ++J; }
|
| 347 |
+
difflist.push_back(Segment(i,I,j,J,a,b));
|
| 348 |
+
i = I; j = J;
|
| 349 |
+
}
|
| 350 |
+
}
|
| 351 |
+
if (i < a.length() || j < b.length())
|
| 352 |
+
difflist.push_back(Segment(i,a.length(),j,b.length(),a,b));
|
| 353 |
+
|
| 354 |
+
diffcnt.assign(noinit,0);
|
| 355 |
+
for (size_t i = 0; i < difflist.size(); ++i)
|
| 356 |
+
{
|
| 357 |
+
Segment & s = difflist[i];
|
| 358 |
+
if (s.match == insertion and
|
| 359 |
+
((s.start_a and a[s.start_a - 1] == b[s.start_b]) or
|
| 360 |
+
(s.end_a < a.length() and a[s.end_a] == b[s.start_b])))
|
| 361 |
+
{
|
| 362 |
+
bool sameletter = true;
|
| 363 |
+
for (int i = s.start_b + 1; sameletter and i < s.end_b; ++i)
|
| 364 |
+
sameletter = b[i] == b[i-1];
|
| 365 |
+
if (sameletter) s.match = duplication;
|
| 366 |
+
}
|
| 367 |
+
else if (s.match == deletion and
|
| 368 |
+
((s.start_b and b[s.start_b - 1] == a[s.start_a]) or
|
| 369 |
+
(s.end_b < b.length() and b[s.end_b] == a[s.start_a])))
|
| 370 |
+
{
|
| 371 |
+
bool sameletter = true;
|
| 372 |
+
for (int i = s.start_a + 1; sameletter and i < s.end_a; ++i)
|
| 373 |
+
sameletter = a[i] == a[i-1];
|
| 374 |
+
if (sameletter) s.match= duplication;
|
| 375 |
+
}
|
| 376 |
+
++diffcnt[s.match];
|
| 377 |
+
}
|
| 378 |
+
return 0;
|
| 379 |
+
}
|
| 380 |
+
|
| 381 |
+
void
|
| 382 |
+
StringDiff::
|
| 383 |
+
showDiff(std::ostream& out)
|
| 384 |
+
{
|
| 385 |
+
if (difflist.size() == 0) align();
|
| 386 |
+
vector<size_t> fromEnd(difflist.size(),0);
|
| 387 |
+
for (int d = difflist.size()-1; d-- > 0;)
|
| 388 |
+
{
|
| 389 |
+
fromEnd[d] = a.length() - difflist[d].end_a;
|
| 390 |
+
// cout << d << " " << fromEnd[d] << " "
|
| 391 |
+
// << difflist[d].start_a << "-"
|
| 392 |
+
// << difflist[d].end_a << endl;
|
| 393 |
+
}
|
| 394 |
+
for (size_t d = 0; d < difflist.size(); ++d)
|
| 395 |
+
{
|
| 396 |
+
Segment const& s = difflist[d];
|
| 397 |
+
UnicodeString aseg,bseg;
|
| 398 |
+
a.extract(s.start_a, s.end_a - s.start_a, aseg);
|
| 399 |
+
b.extract(s.start_b, s.end_b - s.start_b, bseg);
|
| 400 |
+
string abuf,bbuf;
|
| 401 |
+
aseg.toUTF8String(abuf);
|
| 402 |
+
bseg.toUTF8String(bbuf);
|
| 403 |
+
out << abuf << " ";
|
| 404 |
+
out << bbuf << " ";
|
| 405 |
+
out << s.label() << " "
|
| 406 |
+
<< s.dist << " "
|
| 407 |
+
<< fromEnd[d]
|
| 408 |
+
<< endl;
|
| 409 |
+
}
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
char const*
|
| 413 |
+
StringDiff::
|
| 414 |
+
Segment::
|
| 415 |
+
label() const
|
| 416 |
+
{
|
| 417 |
+
return elabel[this->match];
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
// Bounds-checked access to the i-th segment; vector::at() throws
// std::out_of_range for an invalid index.
StringDiff::Segment const& StringDiff::operator[](uint32_t const i) const
{
  return this->difflist.at(i);
}
|
| 426 |
+
|
| 427 |
+
vector<int> const&
|
| 428 |
+
StringDiff::
|
| 429 |
+
getFeatures() const
|
| 430 |
+
{
|
| 431 |
+
return diffcnt;
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
}
|
mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//-*- c++ -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
|
| 4 |
+
// string distance measures
|
| 5 |
+
// Code by Ulrich Germann
|
| 6 |
+
#include<iostream>
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
#include <unicode/stringpiece.h>
|
| 10 |
+
#include <unicode/translit.h>
|
| 11 |
+
#include <unicode/utypes.h>
|
| 12 |
+
#include <unicode/unistr.h>
|
| 13 |
+
#include <unicode/uchar.h>
|
| 14 |
+
#include <unicode/utf8.h>
|
| 15 |
+
#include <vector>
|
| 16 |
+
|
| 17 |
+
#include "moses/TranslationModel/UG/mm/tpt_typedefs.h"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
namespace stringdist
|
| 21 |
+
{
|
| 22 |
+
float
|
| 23 |
+
levenshtein(UChar const* a, size_t const lenA,
|
| 24 |
+
UChar const* b, size_t const lenB);
|
| 25 |
+
|
| 26 |
+
UErrorCode strip_accents(UnicodeString & trg);
|
| 27 |
+
|
| 28 |
+
float
|
| 29 |
+
fillAlignmentMatrix(UChar const* a, size_t const lenA,
|
| 30 |
+
UChar const* b, size_t const lenB,
|
| 31 |
+
std::vector<std::vector<float> > & M);
|
| 32 |
+
|
| 33 |
+
class StringDiff
|
| 34 |
+
{
|
| 35 |
+
public:
|
| 36 |
+
enum MATCHTYPE
|
| 37 |
+
{
|
| 38 |
+
same, // a and b are identical
|
| 39 |
+
cap, // a and b differ only in capitalization
|
| 40 |
+
flip, // two-letter flip
|
| 41 |
+
permutation, // a and b have same letters but in different order
|
| 42 |
+
accent, // a and b are the same basic letters, ignoring accents
|
| 43 |
+
duplication, // a is empty
|
| 44 |
+
insertion, // a is empty
|
| 45 |
+
deletion, // b is empty
|
| 46 |
+
mismatch, // none of the above
|
| 47 |
+
noinit // not initialized
|
| 48 |
+
};
|
| 49 |
+
|
| 50 |
+
struct Segment
|
| 51 |
+
{
|
| 52 |
+
static char const* elabel[];
|
| 53 |
+
int start_a, end_a;
|
| 54 |
+
int start_b, end_b;
|
| 55 |
+
MATCHTYPE match;
|
| 56 |
+
float dist;
|
| 57 |
+
Segment();
|
| 58 |
+
Segment(size_t const as, size_t const ae,
|
| 59 |
+
size_t const bs, size_t const be,
|
| 60 |
+
UnicodeString const& a,
|
| 61 |
+
UnicodeString const& b);
|
| 62 |
+
char const* label() const;
|
| 63 |
+
};
|
| 64 |
+
private:
|
| 65 |
+
UnicodeString a,b;
|
| 66 |
+
std::vector<Segment> difflist;
|
| 67 |
+
std::vector<int> diffcnt;
|
| 68 |
+
public:
|
| 69 |
+
UnicodeString const& set_a(std::string const& a);
|
| 70 |
+
UnicodeString const& set_b(std::string const& b);
|
| 71 |
+
UnicodeString const& get_a() const;
|
| 72 |
+
UnicodeString const& get_b() const;
|
| 73 |
+
StringDiff(std::string const& a, std::string const& b);
|
| 74 |
+
StringDiff();
|
| 75 |
+
size_t size();
|
| 76 |
+
size_t align(bool force=false); // returns the levenshtein distance
|
| 77 |
+
void showDiff(std::ostream& out);
|
| 78 |
+
float levenshtein();
|
| 79 |
+
Segment const& operator[](uint32_t i) const;
|
| 80 |
+
void fillAlignmentMatrix(std::vector<std::vector<float> > & M) const;
|
| 81 |
+
vector<int> const& getFeatures() const;
|
| 82 |
+
};
|
| 83 |
+
}
|
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.cc
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "ug_thread_pool.h"
|
| 2 |
+
namespace ug {
|
| 3 |
+
|
| 4 |
+
ThreadPool::
|
| 5 |
+
ThreadPool(size_t const num_workers)
|
| 6 |
+
: m_service(), m_busywork(new boost::asio::io_service::work(m_service))
|
| 7 |
+
{
|
| 8 |
+
m_workers.reserve(num_workers);
|
| 9 |
+
for (size_t i = 0; i < num_workers; ++i)
|
| 10 |
+
{
|
| 11 |
+
// boost::shared_ptr<boost::thread> t;
|
| 12 |
+
// t.reset(new boost::thread(boost::bind(&service_t::run, &m_service)));
|
| 13 |
+
boost::thread* t;
|
| 14 |
+
t = new boost::thread(boost::bind(&service_t::run, &m_service));
|
| 15 |
+
m_pool.add_thread(t);
|
| 16 |
+
// m_workers.push_back(t);
|
| 17 |
+
}
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
ThreadPool::
|
| 21 |
+
~ThreadPool()
|
| 22 |
+
{
|
| 23 |
+
m_busywork.reset();
|
| 24 |
+
m_pool.join_all();
|
| 25 |
+
m_service.stop();
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
}
|
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.h
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
#include <boost/asio.hpp>
|
| 4 |
+
#include <boost/bind.hpp>
|
| 5 |
+
#include <boost/thread.hpp>
|
| 6 |
+
#include <boost/scoped_ptr.hpp>
|
| 7 |
+
#include <boost/shared_ptr.hpp>
|
| 8 |
+
|
| 9 |
+
#include <iostream>
|
| 10 |
+
#include <vector>
|
| 11 |
+
#include <string>
|
| 12 |
+
|
| 13 |
+
namespace ug {
|
| 14 |
+
class ThreadPool
|
| 15 |
+
{
|
| 16 |
+
typedef boost::asio::io_service service_t;
|
| 17 |
+
service_t m_service;
|
| 18 |
+
boost::thread_group m_pool;
|
| 19 |
+
boost::scoped_ptr<service_t::work> m_busywork;
|
| 20 |
+
std::vector<boost::shared_ptr<boost::thread> > m_workers;
|
| 21 |
+
|
| 22 |
+
public:
|
| 23 |
+
ThreadPool(size_t const num_workers);
|
| 24 |
+
~ThreadPool();
|
| 25 |
+
|
| 26 |
+
template<class callable>
|
| 27 |
+
void add(callable& job) { m_service.post(job); }
|
| 28 |
+
|
| 29 |
+
}; // end of class declaration ThreadPool
|
| 30 |
+
} // end of namespace ug
|
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
|
| 2 |
+
#include "ug_thread_safe_counter.h"
|
| 3 |
+
// obsolete once <atomic> can be assumed to be available everywhere
|
| 4 |
+
|
| 5 |
+
namespace Moses
|
| 6 |
+
{
|
| 7 |
+
ThreadSafeCounter::
|
| 8 |
+
ThreadSafeCounter()
|
| 9 |
+
: ctr(0)
|
| 10 |
+
{ }
|
| 11 |
+
|
| 12 |
+
size_t
|
| 13 |
+
ThreadSafeCounter::
|
| 14 |
+
operator++()
|
| 15 |
+
{
|
| 16 |
+
boost::lock_guard<boost::mutex> guard(this->lock);
|
| 17 |
+
return ++ctr;
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
size_t
|
| 21 |
+
ThreadSafeCounter::
|
| 22 |
+
operator++(int foo)
|
| 23 |
+
{
|
| 24 |
+
boost::lock_guard<boost::mutex> guard(this->lock);
|
| 25 |
+
return ctr++;
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
ThreadSafeCounter::
|
| 29 |
+
operator size_t() const
|
| 30 |
+
{
|
| 31 |
+
return ctr;
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
size_t
|
| 35 |
+
ThreadSafeCounter::
|
| 36 |
+
operator--()
|
| 37 |
+
{
|
| 38 |
+
boost::lock_guard<boost::mutex> guard(this->lock);
|
| 39 |
+
return --ctr;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
size_t
|
| 43 |
+
ThreadSafeCounter::
|
| 44 |
+
operator--(int foo)
|
| 45 |
+
{
|
| 46 |
+
boost::lock_guard<boost::mutex> guard(this->lock);
|
| 47 |
+
return ctr--;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
}
|
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
#include <boost/thread.hpp>
|
| 3 |
+
|
| 4 |
+
namespace Moses
|
| 5 |
+
{
|
| 6 |
+
class ThreadSafeCounter
|
| 7 |
+
{
|
| 8 |
+
size_t ctr;
|
| 9 |
+
boost::mutex lock;
|
| 10 |
+
public:
|
| 11 |
+
ThreadSafeCounter();
|
| 12 |
+
size_t operator++();
|
| 13 |
+
size_t operator++(int);
|
| 14 |
+
size_t operator--();
|
| 15 |
+
size_t operator--(int);
|
| 16 |
+
operator size_t() const;
|
| 17 |
+
};
|
| 18 |
+
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
|
mosesdecoder/moses/TranslationModel/UG/mm/Makefile.x
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Some systems apparently distinguish between shell
|
| 2 |
+
# variables and environment variables. The latter are
|
| 3 |
+
# visible to the make utility, the former apparently not,
|
| 4 |
+
# so we need to set them if they are not defined yet
|
| 5 |
+
|
| 6 |
+
# ===============================================================================
|
| 7 |
+
# COMPILATION PREFERENCES
|
| 8 |
+
# ===============================================================================
|
| 9 |
+
# CCACHE: if set to ccache, use ccache to speed up compilation
|
| 10 |
+
# OPTI: optimization level
|
| 11 |
+
# PROF: profiler switches
|
| 12 |
+
|
| 13 |
+
CCACHE = ccache
|
| 14 |
+
OPTI = 3
|
| 15 |
+
EXE_TAG = exe
|
| 16 |
+
PROF =
|
| 17 |
+
# PROF = -g -pg
|
| 18 |
+
|
| 19 |
+
# ===============================================================================
|
| 20 |
+
|
| 21 |
+
SHELL = bash
|
| 22 |
+
MAKEFLAGS += --warn-undefined-variables
|
| 23 |
+
.DEFAULT_GOAL = all
|
| 24 |
+
.SUFFIXES:
|
| 25 |
+
|
| 26 |
+
# ===============================================================================
|
| 27 |
+
# COMPILATION 'LOCALIZATION'
|
| 28 |
+
HOST ?= $(shell hostname)
|
| 29 |
+
HOSTTYPE ?= $(shell uname -m)
|
| 30 |
+
KERNEL = $(shell uname -r)
|
| 31 |
+
|
| 32 |
+
MOSES_ROOT ?= ${HOME}/code/mosesdecoder
|
| 33 |
+
WDIR = build/${HOSTTYPE}/${KERNEL}/${OPTI}
|
| 34 |
+
# Search for sources under MOSES_ROOT so that overriding MOSES_ROOT is honoured.
VPATH = ${MOSES_ROOT}/
|
| 35 |
+
CXXFLAGS = ${PROF} -ggdb -Wall -O${OPTI} ${INCLUDES}
|
| 36 |
+
CXXFLAGS += -DMAX_NUM_FACTORS=4
|
| 37 |
+
CXXFLAGS += -DKENLM_MAX_ORDER=5
|
| 38 |
+
modirs := $(addprefix -I,$(shell find ${MOSES_ROOT}/moses ${MOSES_ROOT}/contrib -type d))
|
| 39 |
+
CXXFLAGS += -I${MOSES_ROOT}
|
| 40 |
+
INCLUDES =
|
| 41 |
+
BZLIB =
|
| 42 |
+
BOOSTLIBTAG =
|
| 43 |
+
|
| 44 |
+
REQLIBS = m z pthread lzma ${BZLIB} \
|
| 45 |
+
boost_thread${BOOSTLIBTAG} \
|
| 46 |
+
boost_iostreams${BOOSTLIBTAG} \
|
| 47 |
+
boost_program_options${BOOSTLIBTAG} \
|
| 48 |
+
boost_system${BOOSTLIBTAG} \
|
| 49 |
+
boost_filesystem${BOOSTLIBTAG}
|
| 50 |
+
|
| 51 |
+
# icuuc icuio icui18n \
|
| 52 |
+
|
| 53 |
+
LIBS = $(addprefix -l, ${REQLIBS} moses)
|
| 54 |
+
# Link against the same tree that provides the ${MOSES_ROOT}/lib/libmoses.a prerequisite.
LIBDIRS = -L${MOSES_ROOT}/lib
|
| 55 |
+
LIBDIRS += -L${HOME}/lib
|
| 56 |
+
PREFIX ?= .
|
| 57 |
+
BINDIR ?= ${PREFIX}/bin
|
| 58 |
+
ifeq "$(OPTI)" "0"
|
| 59 |
+
BINPREF = debug.
|
| 60 |
+
else
|
| 61 |
+
BINPREF =
|
| 62 |
+
endif
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
OBJ2 :=
|
| 66 |
+
|
| 67 |
+
define compile
|
| 68 |
+
|
| 69 |
+
DEP += ${WDIR}/$(basename $(notdir $1)).d
|
| 70 |
+
${WDIR}/$(basename $(notdir $1)).o : $1 $(wildcard $(basename $1).h)
|
| 71 |
+
@echo -e "COMPILING $1"
|
| 72 |
+
@mkdir -p $$(@D)
|
| 73 |
+
${CXX} ${CXXFLAGS} -MD -MP -c $$(abspath $$<) -o $$@
|
| 74 |
+
|
| 75 |
+
endef
|
| 76 |
+
|
| 77 |
+
testprogs = test-dynamic-im-tsa
|
| 78 |
+
programs = mtt-build mtt-dump symal2mam custom-pt mmlex-build ${testprogs}
|
| 79 |
+
programs += mtt-count-words calc-coverage
|
| 80 |
+
|
| 81 |
+
all: $(addprefix ${BINDIR}/${BINPREF}, $(programs))
|
| 82 |
+
@echo $^
|
| 83 |
+
clean:
|
| 84 |
+
rm -f ${WDIR}/*.o ${WDIR}/*.d
|
| 85 |
+
|
| 86 |
+
custom-pt: ${BINDIR}/${BINPREF}custom-pt
|
| 87 |
+
echo $^
|
| 88 |
+
|
| 89 |
+
INMOGEN = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/generic/*/*.cpp)
|
| 90 |
+
OBJ = $(patsubst %.cc,%.o,$(wildcard $(patsubst %.h,%.cc,$(wildcard *.h))))
|
| 91 |
+
OBJ += $(patsubst %.cpp,%.o,${INMOGEN})
|
| 92 |
+
EXE = $(patsubst %.cc,%.o,$(filter-out $(patsubst %.h,%.cc,$(wildcard *.h)),$(wildcard *.cc)))
|
| 93 |
+
|
| 94 |
+
$(foreach cpp,${INMOGEN},$(eval $(call compile,${cpp})))
|
| 95 |
+
$(foreach cpp,$(wildcard *.cc),$(eval $(call compile,${cpp})))
|
| 96 |
+
$(addprefix ${BINDIR}/${BINPREF}, $(programs)): $(addprefix ${WDIR}/,$(notdir ${OBJ}))
|
| 97 |
+
$(addprefix ${BINDIR}/${BINPREF}, $(programs)): ${MOSES_ROOT}/lib/libmoses.a
|
| 98 |
+
${BINDIR}/${BINPREF}%: ${WDIR}/%.o
|
| 99 |
+
echo PREREQS: $<
|
| 100 |
+
$(CXX) $(CXXFLAGS) -o $@ $^ ${LIBDIRS} ${LIBS}
|
| 101 |
+
|
| 102 |
+
.SECONDARY:
|
| 103 |
+
|
| 104 |
+
-include $(DEP)
|
| 105 |
+
|
mosesdecoder/moses/TranslationModel/UG/mm/calc-coverage.cc
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "moses/TranslationModel/UG/mm/ug_mm_ttrack.h"
|
| 2 |
+
#include "moses/TranslationModel/UG/mm/ug_mm_tsa.h"
|
| 3 |
+
#include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
|
| 4 |
+
#include "moses/TranslationModel/UG/mm/ug_corpus_token.h"
|
| 5 |
+
#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
|
| 6 |
+
#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
|
| 7 |
+
#include "moses/TranslationModel/UG/mm/ug_bitext.h"
|
| 8 |
+
#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
|
| 9 |
+
|
| 10 |
+
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
|
| 11 |
+
|
| 12 |
+
// using namespace Moses;
|
| 13 |
+
using namespace ugdiss;
|
| 14 |
+
using namespace sapt;
|
| 15 |
+
using namespace std;
|
| 16 |
+
|
| 17 |
+
typedef L2R_Token<SimpleWordId> Token;
|
| 18 |
+
TokenIndex V;
|
| 19 |
+
SPTR<vector<vector<Token> > > C(new vector<vector<Token> >());
|
| 20 |
+
void
|
| 21 |
+
add_file(string fname)
|
| 22 |
+
{
|
| 23 |
+
boost::iostreams::filtering_istream in;
|
| 24 |
+
open_input_stream(fname,in);
|
| 25 |
+
string line;
|
| 26 |
+
while (getline(in,line))
|
| 27 |
+
{
|
| 28 |
+
C->push_back(vector<Token>());
|
| 29 |
+
fill_token_seq(V,line,C->back());
|
| 30 |
+
}
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
int
|
| 34 |
+
main(int argc, char* argv[])
|
| 35 |
+
{
|
| 36 |
+
V.setDynamic(true);
|
| 37 |
+
add_file(argv[1]);
|
| 38 |
+
SPTR<imTtrack<Token> > T(new imTtrack<Token>(C));
|
| 39 |
+
imTSA<Token> I(T,NULL,NULL);
|
| 40 |
+
string line;
|
| 41 |
+
while (getline(cin,line))
|
| 42 |
+
{
|
| 43 |
+
vector<Token> seq; fill_token_seq<Token>(V,line,seq);
|
| 44 |
+
for (size_t i = 0; i < seq.size(); ++i)
|
| 45 |
+
{
|
| 46 |
+
TSA<Token>::tree_iterator m(&I);
|
| 47 |
+
cout << V[seq[i].id()];
|
| 48 |
+
for (size_t k = i; k < seq.size() && m.extend(seq[k]); ++k)
|
| 49 |
+
{
|
| 50 |
+
cout << " ";
|
| 51 |
+
if (k > i) cout << V[seq[k].id()] << " ";
|
| 52 |
+
cout << "[" << m.approxOccurrenceCount() << "]";
|
| 53 |
+
}
|
| 54 |
+
cout << endl;
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
}
|
mosesdecoder/moses/TranslationModel/UG/mm/mmlex-build.cc
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- c++ -*-
|
| 2 |
+
// Program to extract word cooccurrence counts from a memory-mapped
|
| 3 |
+
// word-aligned bitext; it stores the lexicon of counts in the format for
|
| 4 |
+
// mm2dTable<uint32_t> (ug_mm_2d_table.h)
|
| 5 |
+
//
|
| 6 |
+
// (c) 2010-2012 Ulrich Germann
|
| 7 |
+
|
| 8 |
+
// to do: multi-threading
|
| 9 |
+
|
| 10 |
+
#include <queue>
|
| 11 |
+
#include <iomanip>
|
| 12 |
+
#include <vector>
|
| 13 |
+
#include <iterator>
|
| 14 |
+
#include <sstream>
|
| 15 |
+
#include <algorithm>
|
| 16 |
+
|
| 17 |
+
#include <boost/program_options.hpp>
|
| 18 |
+
#include <boost/dynamic_bitset.hpp>
|
| 19 |
+
#include <boost/shared_ptr.hpp>
|
| 20 |
+
#include <boost/foreach.hpp>
|
| 21 |
+
#include <boost/thread.hpp>
|
| 22 |
+
#include <boost/math/distributions/binomial.hpp>
|
| 23 |
+
#include <boost/unordered_map.hpp>
|
| 24 |
+
#include <boost/unordered_set.hpp>
|
| 25 |
+
|
| 26 |
+
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
|
| 27 |
+
#include "moses/Util.h"
|
| 28 |
+
#include "ug_mm_2d_table.h"
|
| 29 |
+
#include "ug_mm_ttrack.h"
|
| 30 |
+
#include "ug_corpus_token.h"
|
| 31 |
+
|
| 32 |
+
using namespace std;
|
| 33 |
+
using namespace sapt;
|
| 34 |
+
using namespace ugdiss;
|
| 35 |
+
using namespace boost::math;
|
| 36 |
+
|
| 37 |
+
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
|
| 38 |
+
typedef SimpleWordId Token;
|
| 39 |
+
|
| 40 |
+
// DECLARATIONS
|
| 41 |
+
void interpret_args(int ac, char* av[]);
|
| 42 |
+
|
| 43 |
+
mmTtrack<Token> T1,T2;
|
| 44 |
+
mmTtrack<char> Tx;
|
| 45 |
+
TokenIndex V1,V2;
|
| 46 |
+
|
| 47 |
+
typedef pair<id_type,id_type> wpair;
|
| 48 |
+
// Two 32-bit counters kept per word pair. writeTable() sums the `a` fields
// into the table written to aln_out and the `c` fields into the one written
// to coc_out. Both counters start at zero unless given explicitly.
struct Count
{
  uint32_t a;
  uint32_t c;

  Count()
    : a(0)
    , c(0)
  {}

  Count(uint32_t ax, uint32_t cx)
    : a(ax)
    , c(cx)
  {}
};
|
| 55 |
+
|
| 56 |
+
bool
|
| 57 |
+
operator<(pair<id_type,Count> const& a,
|
| 58 |
+
pair<id_type,Count> const& b)
|
| 59 |
+
{
|
| 60 |
+
return a.first < b.first;
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
typedef boost::unordered_map<wpair,Count> countmap_t;
|
| 65 |
+
typedef vector<vector<pair<id_type,Count> > > countlist_t;
|
| 66 |
+
|
| 67 |
+
vector<countlist_t> XLEX;
|
| 68 |
+
|
| 69 |
+
class Counter
|
| 70 |
+
{
|
| 71 |
+
public:
|
| 72 |
+
countmap_t CNT;
|
| 73 |
+
countlist_t & LEX;
|
| 74 |
+
size_t offset;
|
| 75 |
+
size_t skip;
|
| 76 |
+
Counter(countlist_t& lex, size_t o, size_t s)
|
| 77 |
+
: LEX(lex), offset(o), skip(s) {}
|
| 78 |
+
void processSentence(id_type sid);
|
| 79 |
+
void operator()();
|
| 80 |
+
};
|
| 81 |
+
|
| 82 |
+
string bname,cfgFile,L1,L2,oname,cooc;
|
| 83 |
+
int verbose;
|
| 84 |
+
size_t truncat;
|
| 85 |
+
size_t num_threads;
|
| 86 |
+
|
| 87 |
+
void
|
| 88 |
+
Counter::
|
| 89 |
+
operator()()
|
| 90 |
+
{
|
| 91 |
+
for (size_t sid = offset; sid < min(truncat,T1.size()); sid += skip)
|
| 92 |
+
processSentence(sid);
|
| 93 |
+
|
| 94 |
+
LEX.resize(V1.ksize());
|
| 95 |
+
for (countmap_t::const_iterator c = CNT.begin(); c != CNT.end(); ++c)
|
| 96 |
+
{
|
| 97 |
+
pair<id_type,Count> foo(c->first.second,c->second);
|
| 98 |
+
LEX.at(c->first.first).push_back(foo);
|
| 99 |
+
}
|
| 100 |
+
typedef vector<pair<id_type,Count> > v_t;
|
| 101 |
+
BOOST_FOREACH(v_t& v, LEX)
|
| 102 |
+
sort(v.begin(),v.end());
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
struct lexsorter
|
| 106 |
+
{
|
| 107 |
+
vector<countlist_t> const& v;
|
| 108 |
+
id_type wid;
|
| 109 |
+
lexsorter(vector<countlist_t> const& vx, id_type widx)
|
| 110 |
+
: v(vx),wid(widx) {}
|
| 111 |
+
bool operator()(pair<uint32_t,uint32_t> const& a,
|
| 112 |
+
pair<uint32_t,uint32_t> const& b) const
|
| 113 |
+
{
|
| 114 |
+
return (v.at(a.first).at(wid).at(a.second).first >
|
| 115 |
+
v.at(b.first).at(wid).at(b.second).first);
|
| 116 |
+
}
|
| 117 |
+
};
|
| 118 |
+
|
| 119 |
+
void
|
| 120 |
+
writeTableHeader(ostream& out)
|
| 121 |
+
{
|
| 122 |
+
filepos_type idxOffset=0;
|
| 123 |
+
tpt::numwrite(out,idxOffset); // blank for the time being
|
| 124 |
+
tpt::numwrite(out,id_type(V1.ksize()));
|
| 125 |
+
tpt::numwrite(out,id_type(V2.ksize()));
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
void writeTable(ostream* aln_out, ostream* coc_out)
|
| 129 |
+
{
|
| 130 |
+
vector<uint32_t> m1a(V1.ksize(),0); // marginals L1
|
| 131 |
+
vector<uint32_t> m2a(V2.ksize(),0); // marginals L2
|
| 132 |
+
vector<uint32_t> m1c(V1.ksize(),0); // marginals L1
|
| 133 |
+
vector<uint32_t> m2c(V2.ksize(),0); // marginals L2
|
| 134 |
+
vector<id_type> idxa(V1.ksize()+1,0);
|
| 135 |
+
vector<id_type> idxc(V1.ksize()+1,0);
|
| 136 |
+
if (aln_out) writeTableHeader(*aln_out);
|
| 137 |
+
if (coc_out) writeTableHeader(*coc_out);
|
| 138 |
+
size_t CellCountA=0,CellCountC=0;
|
| 139 |
+
for (size_t id1 = 0; id1 < V1.ksize(); ++id1)
|
| 140 |
+
{
|
| 141 |
+
idxa[id1] = CellCountA;
|
| 142 |
+
idxc[id1] = CellCountC;
|
| 143 |
+
lexsorter sorter(XLEX,id1);
|
| 144 |
+
vector<pair<uint32_t,uint32_t> > H; H.reserve(num_threads);
|
| 145 |
+
for (size_t i = 0; i < num_threads; ++i)
|
| 146 |
+
{
|
| 147 |
+
if (id1 < XLEX.at(i).size() && XLEX[i][id1].size())
|
| 148 |
+
H.push_back(pair<uint32_t,uint32_t>(i,0));
|
| 149 |
+
}
|
| 150 |
+
if (!H.size()) continue;
|
| 151 |
+
make_heap(H.begin(),H.end(),sorter);
|
| 152 |
+
while (H.size())
|
| 153 |
+
{
|
| 154 |
+
id_type id2 = XLEX[H[0].first][id1][H[0].second].first;
|
| 155 |
+
uint32_t aln = XLEX[H[0].first][id1][H[0].second].second.a;
|
| 156 |
+
uint32_t coc = XLEX[H[0].first][id1][H[0].second].second.c;
|
| 157 |
+
pop_heap(H.begin(),H.end(),sorter);
|
| 158 |
+
++H.back().second;
|
| 159 |
+
if (H.back().second == XLEX[H.back().first][id1].size())
|
| 160 |
+
H.pop_back();
|
| 161 |
+
else
|
| 162 |
+
push_heap(H.begin(),H.end(),sorter);
|
| 163 |
+
while (H.size() &&
|
| 164 |
+
XLEX[H[0].first][id1].at(H[0].second).first == id2)
|
| 165 |
+
{
|
| 166 |
+
aln += XLEX[H[0].first][id1][H[0].second].second.a;
|
| 167 |
+
coc += XLEX[H[0].first][id1][H[0].second].second.c;
|
| 168 |
+
pop_heap(H.begin(),H.end(),sorter);
|
| 169 |
+
++H.back().second;
|
| 170 |
+
if (H.back().second == XLEX[H.back().first][id1].size())
|
| 171 |
+
H.pop_back();
|
| 172 |
+
else
|
| 173 |
+
push_heap(H.begin(),H.end(),sorter);
|
| 174 |
+
}
|
| 175 |
+
if (aln_out)
|
| 176 |
+
{
|
| 177 |
+
++CellCountA;
|
| 178 |
+
tpt::numwrite(*aln_out,id2);
|
| 179 |
+
tpt::numwrite(*aln_out,aln);
|
| 180 |
+
m1a[id1] += aln;
|
| 181 |
+
m2a[id2] += aln;
|
| 182 |
+
}
|
| 183 |
+
if (coc_out && coc)
|
| 184 |
+
{
|
| 185 |
+
++CellCountC;
|
| 186 |
+
tpt::numwrite(*coc_out,id2);
|
| 187 |
+
tpt::numwrite(*coc_out,coc);
|
| 188 |
+
m1c[id1] += coc;
|
| 189 |
+
m2c[id2] += coc;
|
| 190 |
+
}
|
| 191 |
+
}
|
| 192 |
+
}
|
| 193 |
+
idxa.back() = CellCountA;
|
| 194 |
+
idxc.back() = CellCountC;
|
| 195 |
+
if (aln_out)
|
| 196 |
+
{
|
| 197 |
+
filepos_type idxOffsetA = aln_out->tellp();
|
| 198 |
+
BOOST_FOREACH(id_type foo, idxa)
|
| 199 |
+
tpt::numwrite(*aln_out,foo);
|
| 200 |
+
aln_out->write(reinterpret_cast<char const*>(&m1a[0]),m1a.size()*4);
|
| 201 |
+
aln_out->write(reinterpret_cast<char const*>(&m2a[0]),m2a.size()*4);
|
| 202 |
+
aln_out->seekp(0);
|
| 203 |
+
tpt::numwrite(*aln_out,idxOffsetA);
|
| 204 |
+
}
|
| 205 |
+
if (coc_out)
|
| 206 |
+
{
|
| 207 |
+
filepos_type idxOffsetC = coc_out->tellp();
|
| 208 |
+
BOOST_FOREACH(id_type foo, idxc)
|
| 209 |
+
tpt::numwrite(*coc_out,foo);
|
| 210 |
+
coc_out->write(reinterpret_cast<char const*>(&m1c[0]),m1c.size()*4);
|
| 211 |
+
coc_out->write(reinterpret_cast<char const*>(&m2c[0]),m2c.size()*4);
|
| 212 |
+
coc_out->seekp(0);
|
| 213 |
+
tpt::numwrite(*coc_out,idxOffsetC);
|
| 214 |
+
}
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
void
|
| 218 |
+
Counter::
|
| 219 |
+
processSentence(id_type sid)
|
| 220 |
+
{
|
| 221 |
+
Token const* s1 = T1.sntStart(sid);
|
| 222 |
+
Token const* e1 = T1.sntEnd(sid);
|
| 223 |
+
Token const* s2 = T2.sntStart(sid);
|
| 224 |
+
Token const* e2 = T2.sntEnd(sid);
|
| 225 |
+
// vector<ushort> cnt1(V1.ksize(),0);
|
| 226 |
+
// vector<ushort> cnt2(V2.ksize(),0);
|
| 227 |
+
// for (Token const* x = s1; x < e1; ++x)
|
| 228 |
+
// ++cnt1.at(x->id());
|
| 229 |
+
// for (Token const* x = s2; x < e2; ++x)
|
| 230 |
+
// ++cnt2.at(x->id());
|
| 231 |
+
|
| 232 |
+
// boost::unordered_set<wpair> seen;
|
| 233 |
+
bitvector check1(T1.sntLen(sid)); check1.set();
|
| 234 |
+
bitvector check2(T2.sntLen(sid)); check2.set();
|
| 235 |
+
|
| 236 |
+
// count links
|
| 237 |
+
char const* p = Tx.sntStart(sid);
|
| 238 |
+
char const* q = Tx.sntEnd(sid);
|
| 239 |
+
ushort r,c;
|
| 240 |
+
if (verbose && sid % 1000000 == 0)
|
| 241 |
+
cerr << sid/1000000 << " M sentences processed" << endl;
|
| 242 |
+
while (p < q)
|
| 243 |
+
{
|
| 244 |
+
p = tpt::binread(p,r);
|
| 245 |
+
p = tpt::binread(p,c);
|
| 246 |
+
// cout << sid << " " << r << "-" << c << endl;
|
| 247 |
+
UTIL_THROW_IF2(r >= check1.size(), "out of bounds at line " << sid);
|
| 248 |
+
UTIL_THROW_IF2(c >= check2.size(), "out of bounds at line " << sid);
|
| 249 |
+
// assert(r < check1.size());
|
| 250 |
+
// assert(c < check2.size());
|
| 251 |
+
UTIL_THROW_IF2(s1+r >= e1, "out of bounds at line " << sid);
|
| 252 |
+
UTIL_THROW_IF2(s2+c >= e2, "out of bounds at line " << sid);
|
| 253 |
+
// assert(s1+r < e1);
|
| 254 |
+
// assert(s2+c < e2);
|
| 255 |
+
check1.reset(r);
|
| 256 |
+
check2.reset(c);
|
| 257 |
+
id_type id1 = (s1+r)->id();
|
| 258 |
+
id_type id2 = (s2+c)->id();
|
| 259 |
+
wpair k(id1,id2);
|
| 260 |
+
Count& cnt = CNT[k];
|
| 261 |
+
cnt.a++;
|
| 262 |
+
// if (seen.insert(k).second)
|
| 263 |
+
// cnt.c += cnt1[id1] * cnt2[id2];
|
| 264 |
+
}
|
| 265 |
+
// count unaliged words
|
| 266 |
+
for (size_t i = check1.find_first();
|
| 267 |
+
i < check1.size();
|
| 268 |
+
i = check1.find_next(i))
|
| 269 |
+
CNT[wpair((s1+i)->id(),0)].a++;
|
| 270 |
+
for (size_t i = check2.find_first();
|
| 271 |
+
i < check2.size();
|
| 272 |
+
i = check2.find_next(i))
|
| 273 |
+
CNT[wpair(0,(s2+i)->id())].a++;
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
int
|
| 277 |
+
main(int argc, char* argv[])
|
| 278 |
+
{
|
| 279 |
+
interpret_args(argc,argv);
|
| 280 |
+
char c = *bname.rbegin();
|
| 281 |
+
if (c != '/' && c != '.') bname += '.';
|
| 282 |
+
T1.open(bname+L1+".mct");
|
| 283 |
+
T2.open(bname+L2+".mct");
|
| 284 |
+
Tx.open(bname+L1+"-"+L2+".mam");
|
| 285 |
+
V1.open(bname+L1+".tdx");
|
| 286 |
+
V2.open(bname+L2+".tdx");
|
| 287 |
+
if (!truncat) truncat = T1.size();
|
| 288 |
+
XLEX.resize(num_threads);
|
| 289 |
+
vector<boost::shared_ptr<boost::thread> > workers(num_threads);
|
| 290 |
+
for (size_t i = 0; i < num_threads; ++i)
|
| 291 |
+
workers[i].reset(new boost::thread(Counter(XLEX[i],i,num_threads)));
|
| 292 |
+
for (size_t i = 0; i < workers.size(); ++i)
|
| 293 |
+
workers[i]->join();
|
| 294 |
+
// cerr << "done counting" << endl;
|
| 295 |
+
ofstream aln_out,coc_out;
|
| 296 |
+
if (oname.size()) aln_out.open(oname.c_str());
|
| 297 |
+
// if (cooc.size()) coc_out.open(cooc.c_str());
|
| 298 |
+
writeTable(oname.size() ? &aln_out : NULL,
|
| 299 |
+
cooc.size() ? &coc_out : NULL);
|
| 300 |
+
if (oname.size()) aln_out.close();
|
| 301 |
+
// if (cooc.size()) coc_out.close();
|
| 302 |
+
}
|
| 303 |
+
|
| 304 |
+
void
|
| 305 |
+
interpret_args(int ac, char* av[])
|
| 306 |
+
{
|
| 307 |
+
namespace po=boost::program_options;
|
| 308 |
+
po::variables_map vm;
|
| 309 |
+
po::options_description o("Options");
|
| 310 |
+
po::options_description h("Hidden Options");
|
| 311 |
+
po::positional_options_description a;
|
| 312 |
+
|
| 313 |
+
o.add_options()
|
| 314 |
+
("help,h", "print this message")
|
| 315 |
+
("cfg,f", po::value<string>(&cfgFile),"config file")
|
| 316 |
+
("oname,o", po::value<string>(&oname),"output file name")
|
| 317 |
+
// ("cooc,c", po::value<string>(&cooc),
|
| 318 |
+
// "file name for raw co-occurrence counts")
|
| 319 |
+
("verbose,v", po::value<int>(&verbose)->default_value(0)->implicit_value(1),
|
| 320 |
+
"verbosity level")
|
| 321 |
+
("threads,t", po::value<size_t>(&num_threads)->default_value(4),
|
| 322 |
+
"count in <N> parallel threads")
|
| 323 |
+
("truncate,n", po::value<size_t>(&truncat)->default_value(0),
|
| 324 |
+
"truncate corpus to <N> sentences (for debugging)")
|
| 325 |
+
;
|
| 326 |
+
|
| 327 |
+
h.add_options()
|
| 328 |
+
("bname", po::value<string>(&bname), "base name")
|
| 329 |
+
("L1", po::value<string>(&L1),"L1 tag")
|
| 330 |
+
("L2", po::value<string>(&L2),"L2 tag")
|
| 331 |
+
;
|
| 332 |
+
a.add("bname",1);
|
| 333 |
+
a.add("L1",1);
|
| 334 |
+
a.add("L2",1);
|
| 335 |
+
get_options(ac,av,h.add(o),a,vm,"cfg");
|
| 336 |
+
|
| 337 |
+
if (vm.count("help") || bname.empty() || (oname.empty() && cooc.empty()))
|
| 338 |
+
{
|
| 339 |
+
cout << "usage:\n\t" << av[0] << " <basename> <L1 tag> <L2 tag> [-o <output file>] [-c <output file>]\n" << endl;
|
| 340 |
+
cout << "at least one of -o / -c must be specified." << endl;
|
| 341 |
+
cout << o << endl;
|
| 342 |
+
exit(0);
|
| 343 |
+
}
|
| 344 |
+
size_t num_cores = boost::thread::hardware_concurrency();
|
| 345 |
+
num_threads = min(num_threads,num_cores);
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
|
mosesdecoder/moses/TranslationModel/UG/mm/mtt-build.cc
ADDED
|
@@ -0,0 +1,498 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- c++ -*-
|
| 2 |
+
// Converts a corpus in text format (plain text, one sentence per line) or
|
| 3 |
+
// conll format or treetagger output format (which one is automatically
|
| 4 |
+
// recognized based on the number of fields per line) into memory-mapped
|
| 5 |
+
// format. (c) 2007-2013 Ulrich Germann
|
| 6 |
+
|
| 7 |
+
#include <boost/algorithm/string/predicate.hpp>
|
| 8 |
+
#include <boost/program_options.hpp>
|
| 9 |
+
#include <boost/program_options/options_description.hpp>
|
| 10 |
+
#include <boost/program_options/parsers.hpp>
|
| 11 |
+
#include <boost/program_options/variables_map.hpp>
|
| 12 |
+
#include <boost/iostreams/device/mapped_file.hpp>
|
| 13 |
+
|
| 14 |
+
#include <iostream>
|
| 15 |
+
#include <fstream>
|
| 16 |
+
#include <sstream>
|
| 17 |
+
#include <iomanip>
|
| 18 |
+
#include <vector>
|
| 19 |
+
#include <string>
|
| 20 |
+
|
| 21 |
+
#include <sys/types.h>
|
| 22 |
+
#include <sys/wait.h>
|
| 23 |
+
|
| 24 |
+
#include "ug_conll_record.h"
|
| 25 |
+
#include "tpt_tokenindex.h"
|
| 26 |
+
#include "ug_mm_ttrack.h"
|
| 27 |
+
#include "tpt_pickler.h"
|
| 28 |
+
#include "ug_deptree.h"
|
| 29 |
+
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
|
| 30 |
+
#include "moses/TranslationModel/UG/mm/ug_im_tsa.h"
|
| 31 |
+
|
| 32 |
+
using namespace std;
|
| 33 |
+
using namespace sapt;
|
| 34 |
+
using namespace Moses;
|
| 35 |
+
using namespace boost;
|
| 36 |
+
using namespace boost::algorithm;
|
| 37 |
+
namespace po=boost::program_options;
|
| 38 |
+
|
| 39 |
+
int with_pfas;
|
| 40 |
+
int with_dcas;
|
| 41 |
+
int with_sfas;
|
| 42 |
+
|
| 43 |
+
bool incremental = false; // build / grow vocabs automatically
|
| 44 |
+
bool is_conll = false; // text or conll format?
|
| 45 |
+
bool quiet = false; // no progress reporting
|
| 46 |
+
|
| 47 |
+
string vocabBase; // base name for existing vocabs that should be used
|
| 48 |
+
string baseName; // base name for all files
|
| 49 |
+
string tmpFile, mttFile; /* name of temporary / actual track file
|
| 50 |
+
* (.mtt for Conll format, .mct for plain text)
|
| 51 |
+
*/
|
| 52 |
+
string UNK;
|
| 53 |
+
|
| 54 |
+
TokenIndex SF; // surface form
|
| 55 |
+
TokenIndex LM; // lemma
|
| 56 |
+
TokenIndex PS; // part of speech
|
| 57 |
+
TokenIndex DT; // dependency type
|
| 58 |
+
|
| 59 |
+
void interpret_args(int ac, char* av[]);
|
| 60 |
+
|
| 61 |
+
// Returns p when it is below limit, otherwise 1 (the id that open_vocab()
// asserts for the UNK label); the result is narrowed to uchar.
inline uchar rangeCheck(int p, int limit)
{
  if (p < limit) return p;
  return 1;
}
|
| 62 |
+
|
| 63 |
+
// Looks up w in T and returns its id. If the lookup yields id 1 for a word
// other than UNK, a warning is printed and assert(0) fires; in builds
// without assertions the id is still returned.
id_type
get_id(TokenIndex const& T, string const& w)
{
  id_type const wid = T[w];
  bool const is_unknown = (wid == 1 && w != UNK);
  if (!is_unknown) return wid;

  cerr << "Warning! Unkown vocabulary item '" << w << "', but "
       << "incremental mode (-i) is not set." << endl;
  assert(0);
  return wid;
}
|
| 75 |
+
|
| 76 |
+
void
|
| 77 |
+
open_vocab(TokenIndex& T, string fname)
|
| 78 |
+
{
|
| 79 |
+
if (!access(fname.c_str(), F_OK))
|
| 80 |
+
{
|
| 81 |
+
T.open(fname,UNK);
|
| 82 |
+
assert(T[UNK] == 1);
|
| 83 |
+
}
|
| 84 |
+
else T.setUnkLabel(UNK);
|
| 85 |
+
if (incremental) T.setDynamic(true);
|
| 86 |
+
assert(T["NULL"] == 0);
|
| 87 |
+
assert(T[UNK] == 1);
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
void
|
| 91 |
+
ini_cnt_vec(TokenIndex const& T, vector<pair<string,size_t> > & v)
|
| 92 |
+
{
|
| 93 |
+
v.resize(T.totalVocabSize());
|
| 94 |
+
for (size_t i = 0; i < T.totalVocabSize(); ++i)
|
| 95 |
+
{
|
| 96 |
+
v[i].first = T[i];
|
| 97 |
+
v[i].second = 0;
|
| 98 |
+
}
|
| 99 |
+
}
|
| 100 |
+
|
| 101 |
+
void
|
| 102 |
+
write_tokenindex(string fname, TokenIndex& T, vector<id_type> const& n2o)
|
| 103 |
+
{
|
| 104 |
+
if (!quiet) cerr << "Writing " << fname << endl;
|
| 105 |
+
vector<id_type> o2n(n2o.size());
|
| 106 |
+
for (id_type i = 0; i < n2o.size(); ++i) o2n[n2o[i]] = i;
|
| 107 |
+
vector<pair<string,uint32_t> > v(n2o.size());
|
| 108 |
+
for (id_type i = 0; i < n2o.size(); ++i)
|
| 109 |
+
{
|
| 110 |
+
v[i].first = T[n2o[i]];
|
| 111 |
+
v[i].second = i;
|
| 112 |
+
}
|
| 113 |
+
T.close();
|
| 114 |
+
sort(v.begin(),v.end());
|
| 115 |
+
write_tokenindex_to_disk(v, fname, UNK);
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
void init(int argc, char* argv[])
|
| 119 |
+
{
|
| 120 |
+
interpret_args(argc,argv);
|
| 121 |
+
if (is_conll)
|
| 122 |
+
{
|
| 123 |
+
open_vocab(SF, vocabBase+".tdx.sfo"); // surface form
|
| 124 |
+
open_vocab(LM, vocabBase+".tdx.lem"); // lemma
|
| 125 |
+
open_vocab(PS, vocabBase+".tdx.pos"); // part-of-speech
|
| 126 |
+
open_vocab(DT, vocabBase+".tdx.drl"); // dependency type
|
| 127 |
+
}
|
| 128 |
+
else open_vocab(SF, vocabBase+".tdx"); // surface form
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
// Fills rec from the whitespace-separated fields w of one input line.
//   3 fields  : treetagger output (form, tag, lemma)
//   8+ fields : CoNLL format
// Any other field count leaves rec untouched.
void fill_rec(Conll_Record& rec, vector<string> const& w)
{
  size_t const nfields = w.size();

  if (nfields == 3) // treetagger output
    {
      rec.sform  = get_id(SF, w[0]);
      // the surface form stands in for the lemma when the tagger has none
      string const& lemma = (w[2] == "<UNKNOWN>") ? w[0] : w[2];
      rec.lemma  = get_id(LM, lemma);
      rec.majpos = rangeCheck(get_id(PS, w[1]), 256);
      rec.minpos = rangeCheck(get_id(PS, w[1]), 256);
      rec.dtype  = 0;
      rec.parent = -1;
      return;
    }

  if (nfields < 8) return;

  // CONLL format
  int const id  = atoi(w[0].c_str());
  int const gov = atoi(w[6].c_str());
  rec.sform  = get_id(SF, w[1]);
  rec.lemma  = get_id(LM, w[2]);
  rec.majpos = rangeCheck(get_id(PS, w[3]), 256);
  rec.minpos = rangeCheck(get_id(PS, w[4]), 256);
  rec.dtype  = get_id(DT, w[7]);
  // parent is stored as an offset from this token; a governor of 0 gives 0
  if (gov)
    rec.parent = gov - id;
  else
    rec.parent = 0;
}
|
| 154 |
+
|
| 155 |
+
// Progress meter on stderr: prints a dot for every multiple of 10,000 and
// starts a new "<n>K sentences processed" line for every multiple of
// 100,000 (including 0). All other values print nothing.
void log_progress(size_t ctr)
{
  if (ctr % 10000 != 0) return;
  if (ctr % 100000 != 0)
    {
      cerr << ".";
      return;
    }
  if (ctr) cerr << endl; // finish the previous line of dots
  cerr << setw(12) << ctr / 1000 << "K sentences processed ";
}
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
size_t
|
| 170 |
+
process_plain_input(ostream& out, vector<id_type> & s_index)
|
| 171 |
+
{
|
| 172 |
+
id_type totalWords = 0;
|
| 173 |
+
string line,w;
|
| 174 |
+
while (getline(cin,line))
|
| 175 |
+
{
|
| 176 |
+
istringstream buf(line);
|
| 177 |
+
if (!quiet) log_progress(s_index.size());
|
| 178 |
+
s_index.push_back(totalWords);
|
| 179 |
+
while (buf>>w)
|
| 180 |
+
{
|
| 181 |
+
tpt::numwrite(out,get_id(SF,w));
|
| 182 |
+
++totalWords;
|
| 183 |
+
}
|
| 184 |
+
}
|
| 185 |
+
s_index.push_back(totalWords);
|
| 186 |
+
return totalWords;
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
size_t
|
| 190 |
+
process_tagged_input(ostream& out,
|
| 191 |
+
vector<id_type> & s_index,
|
| 192 |
+
vector<id_type> & p_index)
|
| 193 |
+
{
|
| 194 |
+
string line;
|
| 195 |
+
Conll_Record rec;
|
| 196 |
+
bool new_sent = true;
|
| 197 |
+
bool new_par = true;
|
| 198 |
+
id_type totalWords = 0;
|
| 199 |
+
|
| 200 |
+
while (getline(cin,line))
|
| 201 |
+
{
|
| 202 |
+
vector<string> w; string f; istringstream buf(line);
|
| 203 |
+
while (buf>>f) w.push_back(f);
|
| 204 |
+
|
| 205 |
+
if (w.size() == 0 || starts_with(w[0], "SID="))
|
| 206 |
+
new_sent = true;
|
| 207 |
+
|
| 208 |
+
else if (w.size() == 1 && w[0] == "<P>")
|
| 209 |
+
new_par = new_sent = true;
|
| 210 |
+
|
| 211 |
+
if (w.size() < 3) continue;
|
| 212 |
+
if (!quiet && new_sent) log_progress(s_index.size());
|
| 213 |
+
if (new_sent) { s_index.push_back(totalWords); new_sent = false; }
|
| 214 |
+
if (new_par) { p_index.push_back(totalWords); new_par = false; }
|
| 215 |
+
fill_rec(rec,w);
|
| 216 |
+
out.write(reinterpret_cast<char const*>(&rec),sizeof(rec));
|
| 217 |
+
++totalWords;
|
| 218 |
+
}
|
| 219 |
+
s_index.push_back(totalWords);
|
| 220 |
+
return totalWords;
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
size_t
|
| 224 |
+
numberize()
|
| 225 |
+
{
|
| 226 |
+
ofstream out(tmpFile.c_str());
|
| 227 |
+
filepos_type startIdx=0;
|
| 228 |
+
id_type idxSize=0,totalWords=0;
|
| 229 |
+
tpt::numwrite(out,startIdx); // place holder, to be filled at the end
|
| 230 |
+
tpt::numwrite(out,idxSize); // place holder, to be filled at the end
|
| 231 |
+
tpt::numwrite(out,totalWords); // place holder, to be filled at the end
|
| 232 |
+
|
| 233 |
+
vector<id_type> s_index, p_index;
|
| 234 |
+
|
| 235 |
+
if(is_conll)
|
| 236 |
+
totalWords = process_tagged_input(out,s_index,p_index);
|
| 237 |
+
else
|
| 238 |
+
totalWords = process_plain_input(out,s_index);
|
| 239 |
+
|
| 240 |
+
vector<id_type> const* index = &s_index;
|
| 241 |
+
if (p_index.size() && p_index.back())
|
| 242 |
+
{
|
| 243 |
+
p_index.push_back(totalWords);
|
| 244 |
+
index = &p_index;
|
| 245 |
+
}
|
| 246 |
+
|
| 247 |
+
if (!quiet)
|
| 248 |
+
cerr << endl << "Writing index ... (" << index->size() << " chunks) ";
|
| 249 |
+
|
| 250 |
+
startIdx = out.tellp();
|
| 251 |
+
for (size_t i = 0; i < index->size(); i++)
|
| 252 |
+
tpt::numwrite(out,(*index)[i]);
|
| 253 |
+
out.seekp(0);
|
| 254 |
+
idxSize = index->size();
|
| 255 |
+
tpt::numwrite(out, startIdx);
|
| 256 |
+
tpt::numwrite(out, idxSize - 1);
|
| 257 |
+
tpt::numwrite(out, totalWords);
|
| 258 |
+
out.close();
|
| 259 |
+
if (!quiet) cerr << "done" << endl;
|
| 260 |
+
return totalWords;
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
vector<id_type> smap,lmap,pmap,dmap;
|
| 264 |
+
|
| 265 |
+
void
|
| 266 |
+
invert(vector<id_type> const& from, vector<id_type> & to)
|
| 267 |
+
{
|
| 268 |
+
to.resize(from.size());
|
| 269 |
+
for (size_t i = 0 ; i < to.size(); ++i)
|
| 270 |
+
to[from[i]] = i;
|
| 271 |
+
}
|
| 272 |
+
|
| 273 |
+
// sorts new items based on occurrence counts but won't reassign
|
| 274 |
+
// existing token ids
|
| 275 |
+
void
|
| 276 |
+
conservative_sort(TokenIndex const & V,
|
| 277 |
+
vector<size_t> const & cnt,
|
| 278 |
+
vector<id_type> & xmap)
|
| 279 |
+
{
|
| 280 |
+
xmap.resize(V.totalVocabSize());
|
| 281 |
+
for (size_t i = 0; i < xmap.size(); ++i) xmap[i] = i;
|
| 282 |
+
VectorIndexSorter<size_t,greater<size_t>, id_type> sorter(cnt);
|
| 283 |
+
sort(xmap.begin()+max(id_type(2),V.knownVocabSize()), xmap.end(), sorter);
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
// reassign token ids in the corpus track based on the id map created by
|
| 287 |
+
// conservative_sort
|
| 288 |
+
void remap()
|
| 289 |
+
{
|
| 290 |
+
if (!quiet) cerr << "Remapping ids ... ";
|
| 291 |
+
filepos_type idxOffset;
|
| 292 |
+
id_type totalWords, idxSize;
|
| 293 |
+
boost::iostreams::mapped_file mtt(tmpFile);
|
| 294 |
+
char const* p = mtt.data();
|
| 295 |
+
p = tpt::numread(p,idxOffset);
|
| 296 |
+
p = tpt::numread(p,idxSize);
|
| 297 |
+
p = tpt::numread(p,totalWords);
|
| 298 |
+
if (is_conll)
|
| 299 |
+
{
|
| 300 |
+
vector<size_t> sf(SF.totalVocabSize(), 0);
|
| 301 |
+
vector<size_t> lm(LM.totalVocabSize(), 0);
|
| 302 |
+
vector<size_t> ps(PS.totalVocabSize(), 0);
|
| 303 |
+
vector<size_t> dt(DT.totalVocabSize(), 0);
|
| 304 |
+
Conll_Record* w = reinterpret_cast<Conll_Record*>(const_cast<char*>(p));
|
| 305 |
+
for (size_t i = 0; i < totalWords; ++i)
|
| 306 |
+
{
|
| 307 |
+
++sf.at(w[i].sform);
|
| 308 |
+
++lm.at(w[i].lemma);
|
| 309 |
+
++ps.at(w[i].majpos);
|
| 310 |
+
++ps.at(w[i].minpos);
|
| 311 |
+
++dt.at(w[i].dtype);
|
| 312 |
+
}
|
| 313 |
+
conservative_sort(SF,sf,smap);
|
| 314 |
+
conservative_sort(LM,lm,lmap);
|
| 315 |
+
conservative_sort(PS,ps,pmap);
|
| 316 |
+
conservative_sort(DT,dt,dmap);
|
| 317 |
+
vector<id_type> smap_i(smap.size()); invert(smap,smap_i);
|
| 318 |
+
vector<id_type> lmap_i(lmap.size()); invert(lmap,lmap_i);
|
| 319 |
+
vector<id_type> pmap_i(pmap.size()); invert(pmap,pmap_i);
|
| 320 |
+
vector<id_type> dmap_i(dmap.size()); invert(dmap,dmap_i);
|
| 321 |
+
for (size_t i = 0; i < totalWords; ++i)
|
| 322 |
+
{
|
| 323 |
+
w[i].sform = smap_i[w[i].sform];
|
| 324 |
+
w[i].lemma = lmap_i[w[i].lemma];
|
| 325 |
+
w[i].majpos = pmap_i[w[i].majpos];
|
| 326 |
+
w[i].minpos = pmap_i[w[i].minpos];
|
| 327 |
+
w[i].dtype = dmap_i[w[i].dtype];
|
| 328 |
+
}
|
| 329 |
+
}
|
| 330 |
+
else
|
| 331 |
+
{
|
| 332 |
+
vector<size_t> sf(SF.totalVocabSize(), 0);
|
| 333 |
+
id_type* w = reinterpret_cast<id_type*>(const_cast<char*>(p));
|
| 334 |
+
for (size_t i = 0; i < totalWords; ++i) ++sf.at(w[i]);
|
| 335 |
+
conservative_sort(SF,sf,smap);
|
| 336 |
+
vector<id_type> smap_i(smap.size()); invert(smap,smap_i);
|
| 337 |
+
for (size_t i = 0; i < totalWords; ++i) w[i] = smap_i[w[i]];
|
| 338 |
+
}
|
| 339 |
+
mtt.close();
|
| 340 |
+
if (!quiet) cerr << "done." << endl;
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
void save_vocabs()
|
| 344 |
+
{
|
| 345 |
+
string vbase = baseName;
|
| 346 |
+
if (is_conll)
|
| 347 |
+
{
|
| 348 |
+
if (SF.totalVocabSize() > SF.knownVocabSize())
|
| 349 |
+
write_tokenindex(vbase+".tdx.sfo",SF,smap);
|
| 350 |
+
if (LM.totalVocabSize() > LM.knownVocabSize())
|
| 351 |
+
write_tokenindex(vbase+".tdx.lem",LM,lmap);
|
| 352 |
+
if (PS.totalVocabSize() > PS.knownVocabSize())
|
| 353 |
+
write_tokenindex(vbase+".tdx.pos",PS,pmap);
|
| 354 |
+
if (DT.totalVocabSize() > DT.knownVocabSize())
|
| 355 |
+
write_tokenindex(vbase+".tdx.drl",DT,dmap);
|
| 356 |
+
}
|
| 357 |
+
else if (SF.totalVocabSize() > SF.knownVocabSize())
|
| 358 |
+
write_tokenindex(vbase+".tdx",SF,smap);
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
template<typename Token>
|
| 362 |
+
void
|
| 363 |
+
build_mmTSA(string infile, string outfile)
|
| 364 |
+
{
|
| 365 |
+
// size_t mypid = fork();
|
| 366 |
+
// if(mypid) return mypid;
|
| 367 |
+
boost::shared_ptr<mmTtrack<Token> > T(new mmTtrack<Token>(infile));
|
| 368 |
+
bdBitset filter;
|
| 369 |
+
filter.resize(T->size(),true);
|
| 370 |
+
imTSA<Token> S(T,&filter,(quiet?NULL:&cerr));
|
| 371 |
+
S.save_as_mm_tsa(outfile);
|
| 372 |
+
// exit(0);
|
| 373 |
+
}
|
| 374 |
+
|
| 375 |
+
bool
|
| 376 |
+
build_plaintext_tsas()
|
| 377 |
+
{
|
| 378 |
+
typedef L2R_Token<SimpleWordId> L2R;
|
| 379 |
+
typedef R2L_Token<SimpleWordId> R2L;
|
| 380 |
+
// size_t c = with_sfas + with_pfas;
|
| 381 |
+
if (with_sfas) build_mmTSA<L2R>(tmpFile, baseName + ".sfa");
|
| 382 |
+
if (with_pfas) build_mmTSA<R2L>(tmpFile, baseName + ".pfa");
|
| 383 |
+
// while (c--) wait(NULL);
|
| 384 |
+
return true;
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
void build_conll_tsas()
|
| 388 |
+
{
|
| 389 |
+
string bn = baseName;
|
| 390 |
+
string mtt = tmpFile;
|
| 391 |
+
size_t c = 3 * (with_sfas + with_pfas + with_dcas);
|
| 392 |
+
if (with_sfas)
|
| 393 |
+
{
|
| 394 |
+
build_mmTSA<L2R_Token<Conll_Sform> >(mtt,bn+".sfa-sform");
|
| 395 |
+
build_mmTSA<L2R_Token<Conll_Lemma> >(mtt,bn+".sfa-lemma");
|
| 396 |
+
build_mmTSA<L2R_Token<Conll_MinPos> >(mtt,bn+".sfa-minpos");
|
| 397 |
+
}
|
| 398 |
+
|
| 399 |
+
if (with_pfas)
|
| 400 |
+
{
|
| 401 |
+
build_mmTSA<R2L_Token<Conll_Sform> >(mtt,bn+".pfa-sform");
|
| 402 |
+
build_mmTSA<R2L_Token<Conll_Lemma> >(mtt,bn+".pfa-lemma");
|
| 403 |
+
build_mmTSA<R2L_Token<Conll_MinPos> >(mtt,bn+".pfa-minpos");
|
| 404 |
+
}
|
| 405 |
+
|
| 406 |
+
if (with_dcas)
|
| 407 |
+
{
|
| 408 |
+
build_mmTSA<ConllBottomUpToken<Conll_Sform> >(mtt,bn+".dca-sform");
|
| 409 |
+
build_mmTSA<ConllBottomUpToken<Conll_Lemma> >(mtt,bn+".dca-lemma");
|
| 410 |
+
build_mmTSA<ConllBottomUpToken<Conll_MinPos> >(mtt,bn+".dca-minpos");
|
| 411 |
+
}
|
| 412 |
+
// while (c--) wait(NULL);
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
int main(int argc, char* argv[])
|
| 417 |
+
{
|
| 418 |
+
init(argc,argv);
|
| 419 |
+
numberize();
|
| 420 |
+
if (SF.totalVocabSize() > SF.knownVocabSize() ||
|
| 421 |
+
LM.totalVocabSize() > LM.knownVocabSize() ||
|
| 422 |
+
PS.totalVocabSize() > PS.knownVocabSize() ||
|
| 423 |
+
DT.totalVocabSize() > DT.knownVocabSize())
|
| 424 |
+
{
|
| 425 |
+
remap();
|
| 426 |
+
save_vocabs();
|
| 427 |
+
}
|
| 428 |
+
if (is_conll) build_conll_tsas();
|
| 429 |
+
else build_plaintext_tsas();
|
| 430 |
+
if (!quiet) cerr << endl;
|
| 431 |
+
rename(tmpFile.c_str(),mttFile.c_str());
|
| 432 |
+
}
|
| 433 |
+
|
| 434 |
+
void
|
| 435 |
+
interpret_args(int ac, char* av[])
|
| 436 |
+
{
|
| 437 |
+
po::variables_map vm;
|
| 438 |
+
po::options_description o("Options");
|
| 439 |
+
o.add_options()
|
| 440 |
+
|
| 441 |
+
("help,h", "print this message")
|
| 442 |
+
|
| 443 |
+
("quiet,q", po::bool_switch(&quiet),
|
| 444 |
+
"don't print progress information")
|
| 445 |
+
|
| 446 |
+
("incremental,i", po::bool_switch(&incremental),
|
| 447 |
+
"incremental mode; rewrites vocab files!")
|
| 448 |
+
|
| 449 |
+
("vocab-base,v", po::value<string>(&vocabBase),
|
| 450 |
+
"base name of various vocabularies")
|
| 451 |
+
|
| 452 |
+
("output,o", po::value<string>(&baseName),
|
| 453 |
+
"base file name of the resulting file(s)")
|
| 454 |
+
|
| 455 |
+
("sfa,s", po::value<int>(&with_sfas)->default_value(1),
|
| 456 |
+
"also build suffix arrays")
|
| 457 |
+
|
| 458 |
+
("pfa,p", po::value<int>(&with_pfas)
|
| 459 |
+
->default_value(0)->implicit_value(1),
|
| 460 |
+
"also build prefix arrays")
|
| 461 |
+
|
| 462 |
+
("dca,d", po::value<int>(&with_dcas)
|
| 463 |
+
->default_value(0)->implicit_value(1),
|
| 464 |
+
"also build dependency chain arrays")
|
| 465 |
+
|
| 466 |
+
("conll,c", po::bool_switch(&is_conll),
|
| 467 |
+
"corpus is in CoNLL format (default: plain text)")
|
| 468 |
+
|
| 469 |
+
("unk,u", po::value<string>(&UNK)->default_value("UNK"),
|
| 470 |
+
"label for unknown tokens")
|
| 471 |
+
|
| 472 |
+
// ("map,m", po::value<string>(&vmap),
|
| 473 |
+
// "map words to word classes for indexing")
|
| 474 |
+
|
| 475 |
+
;
|
| 476 |
+
|
| 477 |
+
po::options_description h("Hidden Options");
|
| 478 |
+
h.add_options()
|
| 479 |
+
;
|
| 480 |
+
h.add(o);
|
| 481 |
+
po::positional_options_description a;
|
| 482 |
+
a.add("output",1);
|
| 483 |
+
|
| 484 |
+
po::store(po::command_line_parser(ac,av)
|
| 485 |
+
.options(h)
|
| 486 |
+
.positional(a)
|
| 487 |
+
.run(),vm);
|
| 488 |
+
po::notify(vm);
|
| 489 |
+
if (vm.count("help") || !vm.count("output"))
|
| 490 |
+
{
|
| 491 |
+
cout << "\nusage:\n\t cat <corpus> | " << av[0]
|
| 492 |
+
<< " [options] <output .mtt file>" << endl;
|
| 493 |
+
cout << o << endl;
|
| 494 |
+
exit(0);
|
| 495 |
+
}
|
| 496 |
+
mttFile = baseName + (is_conll ? ".mtt" : ".mct");
|
| 497 |
+
tmpFile = mttFile + "_";
|
| 498 |
+
}
|
mosesdecoder/moses/TranslationModel/UG/mm/mtt-dump.cc
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
// (c) 2008-2010 Ulrich Germann
|
| 3 |
+
#include <boost/program_options.hpp>
|
| 4 |
+
#include <iomanip>
|
| 5 |
+
|
| 6 |
+
#include "tpt_typedefs.h"
|
| 7 |
+
#include "ug_mm_ttrack.h"
|
| 8 |
+
#include "tpt_tokenindex.h"
|
| 9 |
+
#include "ug_deptree.h"
|
| 10 |
+
#include "ug_corpus_token.h"
|
| 11 |
+
|
| 12 |
+
using namespace std;
|
| 13 |
+
using namespace sapt;
|
| 14 |
+
namespace po = boost::program_options;
|
| 15 |
+
|
| 16 |
+
string bname,mtt,mct;
|
| 17 |
+
vector<string> range;
|
| 18 |
+
|
| 19 |
+
typedef L2R_Token<Conll_Sform> Token;
|
| 20 |
+
|
| 21 |
+
TokenIndex SF,LM,PS,DT;
|
| 22 |
+
mmTtrack<Token> MTT;
|
| 23 |
+
mmTtrack<SimpleWordId> MCT;
|
| 24 |
+
bool sform;
|
| 25 |
+
bool have_mtt, have_mct;
|
| 26 |
+
bool with_sids;
|
| 27 |
+
bool with_positions;
|
| 28 |
+
void
|
| 29 |
+
interpret_args(int ac, char* av[])
|
| 30 |
+
{
|
| 31 |
+
po::variables_map vm;
|
| 32 |
+
po::options_description o("Options");
|
| 33 |
+
o.add_options()
|
| 34 |
+
("help,h", "print this message")
|
| 35 |
+
("numbers,n", po::bool_switch(&with_sids), "print sentence ids as first token")
|
| 36 |
+
("sform,s", po::bool_switch(&sform), "sform only")
|
| 37 |
+
("with-positions,p", po::bool_switch(&with_positions), "show word positions")
|
| 38 |
+
;
|
| 39 |
+
|
| 40 |
+
po::options_description h("Hidden Options");
|
| 41 |
+
h.add_options()
|
| 42 |
+
("bname", po::value<string>(&bname), "base name")
|
| 43 |
+
("range", po::value<vector<string> >(&range), "range")
|
| 44 |
+
;
|
| 45 |
+
po::positional_options_description a;
|
| 46 |
+
a.add("bname",1);
|
| 47 |
+
a.add("range",-1);
|
| 48 |
+
|
| 49 |
+
po::store(po::command_line_parser(ac,av)
|
| 50 |
+
.options(h.add(o))
|
| 51 |
+
.positional(a)
|
| 52 |
+
.run(),vm);
|
| 53 |
+
po::notify(vm); // IMPORTANT
|
| 54 |
+
if (vm.count("help") || bname.empty())
|
| 55 |
+
{
|
| 56 |
+
cout << "usage:\n\t"
|
| 57 |
+
<< av[0] << " track name [<range>]\n"
|
| 58 |
+
<< endl;
|
| 59 |
+
cout << o << endl;
|
| 60 |
+
exit(0);
|
| 61 |
+
}
|
| 62 |
+
mtt = bname+".mtt";
|
| 63 |
+
mct = bname+".mct";
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
void
|
| 67 |
+
printRangeMTT(size_t start, size_t stop)
|
| 68 |
+
{
|
| 69 |
+
for (;start < stop; start++)
|
| 70 |
+
{
|
| 71 |
+
size_t i = 0;
|
| 72 |
+
Token const* s = MTT.sntStart(start);
|
| 73 |
+
Token const* e = MTT.sntEnd(start);
|
| 74 |
+
if (with_sids) cout << start << " ";
|
| 75 |
+
for (Token const* t = s; t < e; ++t)
|
| 76 |
+
{
|
| 77 |
+
#if 0
|
| 78 |
+
uchar const* x = reinterpret_cast<uchar const*>(t);
|
| 79 |
+
cout << *reinterpret_cast<id_type const*>(x) << " ";
|
| 80 |
+
cout << *reinterpret_cast<id_type const*>(x+4) << " ";
|
| 81 |
+
cout << int(*(x+8)) << " ";
|
| 82 |
+
cout << int(*(x+9)) << " ";
|
| 83 |
+
cout << *reinterpret_cast<short const*>(x+10) << endl;
|
| 84 |
+
#endif
|
| 85 |
+
if (!sform)
|
| 86 |
+
{
|
| 87 |
+
cout << setw(2) << right << ++i << " ";
|
| 88 |
+
cout << setw(30) << right << SF[t->sform] << " ";
|
| 89 |
+
cout << setw(4) << right << PS[t->majpos] << " ";
|
| 90 |
+
cout << setw(4) << right << PS[t->minpos] << " ";
|
| 91 |
+
cout << setw(30) << left << LM[t->lemma] << " ";
|
| 92 |
+
cout << i+t->parent << " ";
|
| 93 |
+
cout << DT[t->dtype] << endl;
|
| 94 |
+
}
|
| 95 |
+
else
|
| 96 |
+
{
|
| 97 |
+
if (with_positions) cout << t-s << ":";
|
| 98 |
+
cout << SF[t->id()] << " ";
|
| 99 |
+
}
|
| 100 |
+
}
|
| 101 |
+
cout << endl;
|
| 102 |
+
}
|
| 103 |
+
}
|
| 104 |
+
|
| 105 |
+
void
|
| 106 |
+
printRangeMCT(size_t start, size_t stop)
|
| 107 |
+
{
|
| 108 |
+
for (;start < stop; start++)
|
| 109 |
+
{
|
| 110 |
+
SimpleWordId const* s = MCT.sntStart(start);
|
| 111 |
+
SimpleWordId const* t = s;
|
| 112 |
+
SimpleWordId const* e = MCT.sntEnd(start);
|
| 113 |
+
if (with_sids) cout << start << " ";
|
| 114 |
+
while (t < e)
|
| 115 |
+
{
|
| 116 |
+
if (with_positions) cout << t-s << ":";
|
| 117 |
+
cout << SF[(t++)->id()] << " ";
|
| 118 |
+
}
|
| 119 |
+
cout << endl;
|
| 120 |
+
}
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
int
|
| 124 |
+
main(int argc, char*argv[])
|
| 125 |
+
{
|
| 126 |
+
interpret_args(argc,argv);
|
| 127 |
+
have_mtt = !access(mtt.c_str(),F_OK);
|
| 128 |
+
have_mct = !have_mtt && !access(mct.c_str(),F_OK);
|
| 129 |
+
if (!have_mtt && !have_mct)
|
| 130 |
+
{
|
| 131 |
+
cerr << "FATAL ERROR: neither " << mtt << " nor " << mct << " exit." << endl;
|
| 132 |
+
exit(1);
|
| 133 |
+
}
|
| 134 |
+
if (have_mtt)
|
| 135 |
+
{
|
| 136 |
+
SF.open(bname+".tdx.sfo"); SF.iniReverseIndex();
|
| 137 |
+
LM.open(bname+".tdx.lem"); LM.iniReverseIndex();
|
| 138 |
+
PS.open(bname+".tdx.pos"); PS.iniReverseIndex();
|
| 139 |
+
DT.open(bname+".tdx.drl"); DT.iniReverseIndex();
|
| 140 |
+
MTT.open(mtt);
|
| 141 |
+
}
|
| 142 |
+
else
|
| 143 |
+
{
|
| 144 |
+
sform = true;
|
| 145 |
+
SF.open(bname+".tdx"); SF.iniReverseIndex();
|
| 146 |
+
MCT.open(mct);
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
if (!range.size())
|
| 150 |
+
have_mtt ? printRangeMTT(0, MTT.size()) : printRangeMCT(0, MCT.size());
|
| 151 |
+
else
|
| 152 |
+
{
|
| 153 |
+
for (size_t i = 0; i < range.size(); i++)
|
| 154 |
+
{
|
| 155 |
+
istringstream buf(range[i]);
|
| 156 |
+
size_t first,last; uchar c;
|
| 157 |
+
buf>>first;
|
| 158 |
+
if (buf.peek() == '-') buf>>c>>last;
|
| 159 |
+
else last = first;
|
| 160 |
+
if (have_mtt && last < MTT.size())
|
| 161 |
+
printRangeMTT(first,last+1);
|
| 162 |
+
else if (last < MCT.size())
|
| 163 |
+
printRangeMCT(first,last+1);
|
| 164 |
+
}
|
| 165 |
+
}
|
| 166 |
+
}
|
mosesdecoder/moses/TranslationModel/UG/mm/mtt.count.cc
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Look up phrases read from stdin (one per line) in an indexed corpus and print their occurrence counts
|
| 2 |
+
#include "ug_mm_ttrack.h"
|
| 3 |
+
#include "ug_mm_tsa.h"
|
| 4 |
+
#include "tpt_tokenindex.h"
|
| 5 |
+
#include "ug_corpus_token.h"
|
| 6 |
+
#include <string>
|
| 7 |
+
#include <vector>
|
| 8 |
+
#include <cassert>
|
| 9 |
+
#include <boost/unordered_map.hpp>
|
| 10 |
+
#include <boost/foreach.hpp>
|
| 11 |
+
#include <iomanip>
|
| 12 |
+
#include "ug_typedefs.h"
|
| 13 |
+
#include "tpt_pickler.h"
|
| 14 |
+
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
|
| 15 |
+
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
|
| 16 |
+
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
|
| 17 |
+
#include <algorithm>
|
| 18 |
+
#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
|
| 19 |
+
|
| 20 |
+
using namespace std;
|
| 21 |
+
using namespace ugdiss;
|
| 22 |
+
using namespace Moses;
|
| 23 |
+
typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
|
| 24 |
+
typedef sapt::mmTSA<Token>::tree_iterator iter;
|
| 25 |
+
typedef boost::unordered_map<pair<size_t,size_t>,size_t> phrase_counter_t;
|
| 26 |
+
|
| 27 |
+
#define CACHING_THRESHOLD 1000
|
| 28 |
+
|
| 29 |
+
sapt::mmTtrack<Token> T; // token tracks
|
| 30 |
+
sapt::TokenIndex V; // vocabs
|
| 31 |
+
sapt::mmTSA<Token> I; // suffix arrays
|
| 32 |
+
|
| 33 |
+
void interpret_args(int ac, char* av[]);
|
| 34 |
+
string bname;
|
| 35 |
+
bool echo;
|
| 36 |
+
int main(int argc, char* argv[])
|
| 37 |
+
{
|
| 38 |
+
interpret_args(argc,argv);
|
| 39 |
+
|
| 40 |
+
T.open(bname+".mct");
|
| 41 |
+
V.open(bname+".tdx"); V.iniReverseIndex();
|
| 42 |
+
I.open(bname+".sfa",&T);
|
| 43 |
+
string line;
|
| 44 |
+
while (getline(cin,line))
|
| 45 |
+
{
|
| 46 |
+
vector<id_type> phr;
|
| 47 |
+
V.fillIdSeq(line,phr);
|
| 48 |
+
TSA<Token>::tree_iterator m(&I);
|
| 49 |
+
size_t i = 0;
|
| 50 |
+
while (i < phr.size() && m.extend(phr[i])) ++i;
|
| 51 |
+
if (echo) cout << line << ": ";
|
| 52 |
+
if (i < phr.size()) cout << 0 << endl;
|
| 53 |
+
else cout << m.rawCnt() << endl;
|
| 54 |
+
}
|
| 55 |
+
exit(0);
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
void
|
| 59 |
+
interpret_args(int ac, char* av[])
|
| 60 |
+
{
|
| 61 |
+
namespace po=boost::program_options;
|
| 62 |
+
po::variables_map vm;
|
| 63 |
+
po::options_description o("Options");
|
| 64 |
+
po::options_description h("Hidden Options");
|
| 65 |
+
po::positional_options_description a;
|
| 66 |
+
|
| 67 |
+
o.add_options()
|
| 68 |
+
("help,h", "print this message")
|
| 69 |
+
("echo,e", po::bool_switch(&echo), "repeat lookup phrases")
|
| 70 |
+
;
|
| 71 |
+
|
| 72 |
+
h.add_options()
|
| 73 |
+
("bname", po::value<string>(&bname), "base name")
|
| 74 |
+
;
|
| 75 |
+
a.add("bname",1);
|
| 76 |
+
get_options(ac,av,h.add(o),a,vm);
|
| 77 |
+
}
|
mosesdecoder/moses/TranslationModel/UG/mm/num_read_write.cc
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "num_read_write.h"
|
| 2 |
+
namespace tpt {
  typedef unsigned char uchar;

  namespace {

    // Writes the sizeof(UInt) bytes of |x| to |out|, least significant
    // byte first, independent of the host byte order.
    template<typename UInt>
    void
    write_little_endian(std::ostream& out, UInt x)
    {
      char buf[sizeof(UInt)];
      for (unsigned i = 0; i < sizeof(UInt); ++i)
        buf[i] = static_cast<char>(static_cast<uchar>(x >> (8*i)));
      out.write(buf,sizeof(UInt));
    }

    // Assembles |x| from the sizeof(UInt) bytes at |src| (least
    // significant byte first) and returns the address just past them.
    template<typename UInt>
    char const*
    read_little_endian(char const* src, UInt & x)
    {
      uchar const* d = reinterpret_cast<uchar const*>(src);
      UInt value = 0;
      for (unsigned i = 0; i < sizeof(UInt); ++i)
        value = static_cast<UInt>(value | (static_cast<UInt>(d[i]) << (8*i)));
      x = value;
      return src+sizeof(UInt);
    }

  }

  // Write a 16-bit number as 2 bytes, least significant byte first.
  void
  numwrite(std::ostream& out, uint16_t const& x)
  {
    write_little_endian<uint16_t>(out,x);
  }

  // Write a 32-bit number as 4 bytes, least significant byte first.
  void
  numwrite(std::ostream& out, uint32_t const& x)
  {
    write_little_endian<uint32_t>(out,x);
  }

  // Write a 64-bit number as 8 bytes, least significant byte first.
  void
  numwrite(std::ostream& out, uint64_t const& x)
  {
    write_little_endian<uint64_t>(out,x);
  }

  // Read a 16-bit number from 2 bytes at src; returns src+2.
  char const*
  numread(char const* src, uint16_t & x)
  {
    return read_little_endian<uint16_t>(src,x);
  }

  // Read a 32-bit number from 4 bytes at src; returns src+4.
  char const*
  numread(char const* src, uint32_t & x)
  {
    return read_little_endian<uint32_t>(src,x);
  }

  // Read a 64-bit number from 8 bytes at src; returns src+8.
  char const*
  numread(char const* src, uint64_t & x)
  {
    return read_little_endian<uint64_t>(src,x);
  }

}
|
mosesdecoder/moses/TranslationModel/UG/mm/test-http-client.cc
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- c++ -*-
|
| 2 |
+
#include "ug_http_client.h"
|
| 3 |
+
|
| 4 |
+
int main(int argc, char* argv[])
|
| 5 |
+
{
|
| 6 |
+
try
|
| 7 |
+
{
|
| 8 |
+
if (argc != 2)
|
| 9 |
+
{
|
| 10 |
+
std::cout << "Usage: async_client <url>\n";
|
| 11 |
+
std::cout << "Example:\n";
|
| 12 |
+
std::cout << " async_client www.boost.org/LICENSE_1_0.txt\n";
|
| 13 |
+
return 1;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
boost::asio::io_service io_service;
|
| 17 |
+
Moses::http_client c(io_service, argv[1]);
|
| 18 |
+
io_service.run();
|
| 19 |
+
std::cout << c.content() << std::endl;
|
| 20 |
+
}
|
| 21 |
+
catch (std::exception& e)
|
| 22 |
+
{
|
| 23 |
+
std::cout << "Exception: " << e.what() << "\n";
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
return 0;
|
| 27 |
+
}
|
mosesdecoder/moses/TranslationModel/UG/mm/test-xml-escaping.cc
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <iostream>
|
| 2 |
+
#include <string>
|
| 3 |
+
#include <iomanip>
|
| 4 |
+
#include "ug_http_client.h"
|
| 5 |
+
|
| 6 |
+
using namespace std;
|
| 7 |
+
int main()
|
| 8 |
+
{
|
| 9 |
+
string line;
|
| 10 |
+
while (getline(cin,line))
|
| 11 |
+
cout << Moses::uri_encode(line) << endl;
|
| 12 |
+
}
|
| 13 |
+
|
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.cc
ADDED
|
@@ -0,0 +1,594 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
// (c) 2007,2008 Ulrich Germann
|
| 3 |
+
|
| 4 |
+
/* Functions for writing indices tightly (use only the bytes you need).
|
| 5 |
+
* The first bit indicates whether a byte belongs to a key or a value.
|
| 6 |
+
* The remaining 7 bits are part of the respective integer value.
|
| 7 |
+
* (c) 2007 Ulrich Germann
|
| 8 |
+
*/
|
| 9 |
+
//
|
| 10 |
+
// ugTightIndex.cc
|
| 11 |
+
//
|
| 12 |
+
// Made by Ulrich Germann
|
| 13 |
+
// Login <germann@germann-laptop>
|
| 14 |
+
//
|
| 15 |
+
// Started on Tue Jul 17 15:09:33 2007 Ulrich Germann
|
| 16 |
+
// Started on Tue Jul 17 15:09:33 2007 Ulrich Germann
|
| 17 |
+
//
|
| 18 |
+
|
| 19 |
+
#include <iostream>
|
| 20 |
+
#include <cassert>
|
| 21 |
+
#include "tpt_tightindex.h"
|
| 22 |
+
|
| 23 |
+
namespace tpt
|
| 24 |
+
{
|
| 25 |
+
|
| 26 |
+
// #define LOG_WRITE_ACTIVITY
|
| 27 |
+
|
| 28 |
+
// Write a key or a value into a tight index.
// |data| is emitted in groups of 7 bits, least significant group first,
// one group per byte, using only as many bytes as needed (at least one).
// |flag| selects the value of the high bit of every emitted byte (set if
// flag is true, clear otherwise) and thereby marks the bytes as belonging
// to one or the other kind of entry.
void tightwrite(std::ostream& out, uint64_t data, bool flag)
{
  // assert(sizeof(size_t)==4);
#ifdef LOG_WRITE_ACTIVITY
  size_t bytes_written=1;
  std::cerr << "starting at file position " << out.tellp()
            << ": tightwrite " << data;
  if (flag) std::cerr << " with flag 1 ";
  else      std::cerr << " with flag 0 ";
#endif
  unsigned char const marker = flag ? 0x80 : 0x00;

  // all groups but the last
  for (; data >= 128; data >>= 7)
    {
      out.put(static_cast<char>((data & 127) | marker));
#ifdef LOG_WRITE_ACTIVITY
      bytes_written++;
#endif
    }
  // the last (most significant) group
  out.put(static_cast<char>((data & 127) | marker));
#ifdef LOG_WRITE_ACTIVITY
  std::cerr << " in " << bytes_written << " bytes" << std::endl;
#endif
}
|
| 76 |
+
|
| 77 |
+
// For the code below: does it make a difference if I hard-code the
|
| 78 |
+
// unraveled loop or does code optimization by the compiler take care
|
| 79 |
+
// of that?
|
| 80 |
+
|
| 81 |
+
#define DEBUG_TIGHTREAD 0
|
| 82 |
+
|
| 83 |
+
// read a key value from a tight index; filepos_type must be at least as
|
| 84 |
+
// large as count_type
|
| 85 |
+
filepos_type
|
| 86 |
+
tightread(std::istream& in, std::ios::pos_type stop)
|
| 87 |
+
{
|
| 88 |
+
// debug=true;
|
| 89 |
+
// assert(sizeof(size_t) == 4);
|
| 90 |
+
assert(in.rdbuf()->in_avail() > 0);
|
| 91 |
+
filepos_type data = 0;
|
| 92 |
+
short int bitshift = 7;
|
| 93 |
+
int pos = in.tellg();
|
| 94 |
+
#if DEBUG_TIGHTREAD
|
| 95 |
+
if (debug)
|
| 96 |
+
cerr << bitpattern(uint(in.peek())) << " " << in.peek()
|
| 97 |
+
<< " pos=" << in.tellg() << "\n";
|
| 98 |
+
#endif
|
| 99 |
+
int buf = in.get();
|
| 100 |
+
if (stop == std::ios::pos_type(0))
|
| 101 |
+
stop = size_t(in.tellg())+in.rdbuf()->in_avail();
|
| 102 |
+
else
|
| 103 |
+
stop = std::min(size_t(stop),size_t(in.tellg())+in.rdbuf()->in_avail());
|
| 104 |
+
if (buf < 0)
|
| 105 |
+
std::cerr << "number read: " << buf << " " << pos << " "
|
| 106 |
+
<< in.tellg() << std::endl;
|
| 107 |
+
assert (buf>=0);
|
| 108 |
+
|
| 109 |
+
if (buf >= 128) // continuation bit is 1
|
| 110 |
+
{
|
| 111 |
+
data = buf-128; // unset the bit
|
| 112 |
+
while (in.tellg() < stop && in.peek() >= 128)
|
| 113 |
+
{
|
| 114 |
+
#if DEBUG_TIGHTREAD
|
| 115 |
+
if (debug)
|
| 116 |
+
cerr << bitpattern(uint(in.peek())) << " " << in.peek();
|
| 117 |
+
#endif
|
| 118 |
+
// cerr << bitpattern(size_t(in.peek())) << std::endl;
|
| 119 |
+
data += size_t(in.get()-128)<<bitshift;
|
| 120 |
+
bitshift += 7;
|
| 121 |
+
#if DEBUG_TIGHTREAD
|
| 122 |
+
if (debug)
|
| 123 |
+
cerr << " " << data << " pos=" << in.tellg() << std::endl;
|
| 124 |
+
#endif
|
| 125 |
+
}
|
| 126 |
+
}
|
| 127 |
+
else
|
| 128 |
+
{
|
| 129 |
+
data = buf;
|
| 130 |
+
while (in.tellg() < stop && in.peek() < 128)
|
| 131 |
+
{
|
| 132 |
+
// cerr << bitpattern(size_t(in.peek())) << std::endl;
|
| 133 |
+
#if DEBUG_TIGHTREAD
|
| 134 |
+
if (debug)
|
| 135 |
+
cerr << bitpattern(uint(in.peek())) << " " << in.peek();
|
| 136 |
+
|
| 137 |
+
#endif
|
| 138 |
+
data += size_t(in.get())<<bitshift;
|
| 139 |
+
bitshift += 7;
|
| 140 |
+
#if DEBUG_TIGHTREAD
|
| 141 |
+
if (debug)
|
| 142 |
+
cerr << " " << data << " pos=" << in.tellg() << "\n";
|
| 143 |
+
#endif
|
| 144 |
+
}
|
| 145 |
+
}
|
| 146 |
+
return data;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
#define DEBUG_TIGHTFIND 0
|
| 150 |
+
#if DEBUG_TIGHTFIND
|
| 151 |
+
bool debug=true;
|
| 152 |
+
#endif
|
| 153 |
+
bool
|
| 154 |
+
tightfind_midpoint(std::istream& in, filepos_type start, filepos_type stop)
|
| 155 |
+
{
|
| 156 |
+
in.seekg((start+stop)/2);
|
| 157 |
+
// Jump approximately to the middle. Since we might land in the
|
| 158 |
+
// middle of a number, we need to find the start of the next
|
| 159 |
+
// [index key/file offset] pair first. Bytes belonging to an index
|
| 160 |
+
// key have the leftmost bit set to 0, bytes belonging to a file
|
| 161 |
+
// offset have it set to 1
|
| 162 |
+
|
| 163 |
+
// if we landed in the middle of an index key, skip to the end of it
|
| 164 |
+
while (static_cast<filepos_type>(in.tellg()) < stop && in.get() < 128)
|
| 165 |
+
{
|
| 166 |
+
#if DEBUG_TIGHTFIND
|
| 167 |
+
if (debug)
|
| 168 |
+
{
|
| 169 |
+
in.unget();
|
| 170 |
+
char c = in.get();
|
| 171 |
+
std::cerr << in.tellg() << " skipped key byte " << c << std::endl;
|
| 172 |
+
}
|
| 173 |
+
#endif
|
| 174 |
+
if (in.eof()) return false;
|
| 175 |
+
}
|
| 176 |
+
// Also skip the associated file offset:
|
| 177 |
+
while (static_cast<filepos_type>(in.tellg()) < stop && in.peek() >= 128)
|
| 178 |
+
{
|
| 179 |
+
#if DEBUG_TIGHTFIND
|
| 180 |
+
int r = in.get();
|
| 181 |
+
if (debug)
|
| 182 |
+
std::cerr << in.tellg() << " skipped value byte " << r
|
| 183 |
+
<< " next is " << in.peek()
|
| 184 |
+
<< std::endl;
|
| 185 |
+
#else
|
| 186 |
+
in.get();
|
| 187 |
+
#endif
|
| 188 |
+
}
|
| 189 |
+
return true;
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
char const*
tightfind_midpoint(char const* const start,
                   char const* const stop)
{
  // Returns the start of the index key of the entry that contains
  // (or immediately precedes) the midpoint of the memory range
  // [start,stop). Key bytes have the high bit cleared, value bytes
  // have it set. The caller must ensure start < stop.
  //
  // The high bit is tested explicitly instead of via `*mp < 0`, so
  // that the result does not depend on whether plain char is signed.
  char const* mp = start + (stop - start)/2;

  // If we landed inside a value, walk back to the last byte of its key ...
  while ((static_cast<unsigned char>(*mp) & 0x80) != 0 && mp > start)
    --mp;
  // ... then walk back over the key itself.
  while ((static_cast<unsigned char>(*mp) & 0x80) == 0 && mp > start)
    --mp;

  // We stopped either on the last byte of the preceding value (the
  // key begins one byte later) or at /start/ itself.
  return ((static_cast<unsigned char>(*mp) & 0x80) != 0) ? mp + 1 : mp;
}
|
| 201 |
+
|
| 202 |
+
bool
|
| 203 |
+
linear_search(std::istream& in, filepos_type start, filepos_type stop,
|
| 204 |
+
id_type key, unsigned char& flags)
|
| 205 |
+
{ // performs a linear search in the range
|
| 206 |
+
in.seekg(start);
|
| 207 |
+
|
| 208 |
+
#if DEBUG_TIGHTFIND
|
| 209 |
+
if (debug) std::cerr << in.tellg() << " ";
|
| 210 |
+
#endif
|
| 211 |
+
|
| 212 |
+
// ATTENTION! The bitshift operations below are important:
|
| 213 |
+
// We use some of the bits in the key value to store additional
|
| 214 |
+
// information about what and where node iformation is stored.
|
| 215 |
+
|
| 216 |
+
id_type foo;
|
| 217 |
+
for(foo = tightread(in,stop);
|
| 218 |
+
(foo>>FLAGBITS) < key;
|
| 219 |
+
foo = tightread(in,stop))
|
| 220 |
+
{
|
| 221 |
+
// skip the value associated with key /foo/
|
| 222 |
+
while (static_cast<filepos_type>(in.tellg()) < stop
|
| 223 |
+
&& in.peek() >= 128) in.get();
|
| 224 |
+
|
| 225 |
+
#if DEBUG_TIGHTFIND
|
| 226 |
+
if (debug)
|
| 227 |
+
std::cerr << (foo>>FLAGBITS) << " [" << key << "] "
|
| 228 |
+
<< in.tellg() << std::endl;
|
| 229 |
+
#endif
|
| 230 |
+
|
| 231 |
+
if (in.tellg() == std::ios::pos_type(stop))
|
| 232 |
+
return false; // not found
|
| 233 |
+
}
|
| 234 |
+
|
| 235 |
+
#if DEBUG_TIGHTFIND
|
| 236 |
+
if (debug && (foo>>FLAGBITS)==key)
|
| 237 |
+
std::cerr << "found entry for " << key << std::endl;
|
| 238 |
+
std::cerr << "current file position is " << in.tellg()
|
| 239 |
+
<< " (value read: " << key << std::endl;
|
| 240 |
+
#endif
|
| 241 |
+
|
| 242 |
+
assert(static_cast<filepos_type>(in.tellg()) < stop);
|
| 243 |
+
if ((foo>>FLAGBITS)==key)
|
| 244 |
+
{
|
| 245 |
+
flags = (foo%256);
|
| 246 |
+
flags &= FLAGMASK;
|
| 247 |
+
return true;
|
| 248 |
+
}
|
| 249 |
+
else
|
| 250 |
+
return false;
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
bool
|
| 254 |
+
tightfind(std::istream& in, filepos_type start, filepos_type stop,
|
| 255 |
+
id_type key, unsigned char& flags)
|
| 256 |
+
{
|
| 257 |
+
// returns true if the value is found
|
| 258 |
+
#if DEBUG_TIGHTFIND
|
| 259 |
+
if (debug)
|
| 260 |
+
std::cerr << "looking for " << key
|
| 261 |
+
<< " in range [" << start << ":" << stop << "]" << std::endl;
|
| 262 |
+
#endif
|
| 263 |
+
if (start==stop) return false;
|
| 264 |
+
assert(stop>start);
|
| 265 |
+
if ((start+1)==stop) return false; // list is empty
|
| 266 |
+
|
| 267 |
+
unsigned int const granularity = sizeof(filepos_type)*5;
|
| 268 |
+
// granularity: point where we should switch to linear search,
|
| 269 |
+
// because otherwise we might skip over the entry we are looking for
|
| 270 |
+
// because we land right in the middle of it.
|
| 271 |
+
|
| 272 |
+
if (stop > start + granularity)
|
| 273 |
+
if (!tightfind_midpoint(in,start,stop))
|
| 274 |
+
return false; // something went wrong (empty index)
|
| 275 |
+
|
| 276 |
+
if (stop <= start + granularity || in.tellg() == std::ios::pos_type(stop))
|
| 277 |
+
{ // If the search range is very short, tightfind_midpoint might skip the
|
| 278 |
+
// entry we are loking for. In this case, we can afford a linear
|
| 279 |
+
// search
|
| 280 |
+
return linear_search(in,start,stop,key,flags);
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
// perform binary search
|
| 284 |
+
filepos_type curpos = in.tellg();
|
| 285 |
+
id_type foo = tightread(in,stop);
|
| 286 |
+
id_type tmpid = foo>>FLAGBITS;
|
| 287 |
+
if (tmpid == key)
|
| 288 |
+
{
|
| 289 |
+
flags = foo%256;
|
| 290 |
+
flags &= FLAGMASK;
|
| 291 |
+
#if DEBUG_TIGHTFIND
|
| 292 |
+
if (debug) std::cerr << "found entry for " << key << std::endl;
|
| 293 |
+
#endif
|
| 294 |
+
return true; // done, found
|
| 295 |
+
}
|
| 296 |
+
else if (tmpid > key)
|
| 297 |
+
{ // look in the lower half
|
| 298 |
+
#if DEBUG_TIGHTFIND
|
| 299 |
+
if (debug) std::cerr << foo << " > " << key << std::endl;
|
| 300 |
+
#endif
|
| 301 |
+
return tightfind(in,start,curpos,key,flags);
|
| 302 |
+
}
|
| 303 |
+
else
|
| 304 |
+
{ // look in the upper half
|
| 305 |
+
while (static_cast<filepos_type>(in.tellg()) < stop
|
| 306 |
+
&& in.rdbuf()->in_avail() > 0 // is that still necessary???
|
| 307 |
+
&& in.peek() >= 128)
|
| 308 |
+
in.get(); // skip associated value
|
| 309 |
+
if (in.rdbuf()->in_avail() == 0 || in.tellg() == std::ios::pos_type(stop))
|
| 310 |
+
return false;
|
| 311 |
+
#if DEBUG_TIGHTFIND
|
| 312 |
+
if (debug) std::cerr << foo << " < " << key << std::endl;
|
| 313 |
+
#endif
|
| 314 |
+
return tightfind(in,in.tellg(),stop,key,flags);
|
| 315 |
+
}
|
| 316 |
+
}
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
char const*
|
| 320 |
+
tightfind(char const* const start,
|
| 321 |
+
char const* const stop,
|
| 322 |
+
id_type key,
|
| 323 |
+
unsigned char& flags)
|
| 324 |
+
{
|
| 325 |
+
// returns true if the value is found
|
| 326 |
+
|
| 327 |
+
if (start==stop) return NULL;
|
| 328 |
+
assert(stop>start);
|
| 329 |
+
if ((start+1)==stop) return NULL; // list is empty
|
| 330 |
+
char const* p = tightfind_midpoint(start,stop);
|
| 331 |
+
// if ids can be larger than 67,108,864 on 32-bit machines
|
| 332 |
+
// (i.e., 2**(28-flagbits)), dest must be declared as uint64_t
|
| 333 |
+
size_t foo;
|
| 334 |
+
char const* after = tightread(p,stop,foo);
|
| 335 |
+
id_type tmpId = foo>>FLAGBITS;
|
| 336 |
+
if (tmpId == key)
|
| 337 |
+
{
|
| 338 |
+
flags = foo%256;
|
| 339 |
+
flags &= FLAGMASK;
|
| 340 |
+
return after;
|
| 341 |
+
}
|
| 342 |
+
else if (tmpId > key)
|
| 343 |
+
{ // look in the lower half
|
| 344 |
+
return tightfind(start,p,key,flags);
|
| 345 |
+
}
|
| 346 |
+
else
|
| 347 |
+
{ // look in the upper half
|
| 348 |
+
while (*after<0 && ++after < stop);
|
| 349 |
+
if (after == stop) return NULL;
|
| 350 |
+
return tightfind(after,stop,key,flags);
|
| 351 |
+
}
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
char const*
|
| 355 |
+
tightfind_noflags(char const* const start,
|
| 356 |
+
char const* const stop,
|
| 357 |
+
id_type key)
|
| 358 |
+
{
|
| 359 |
+
// returns true if the value is found
|
| 360 |
+
|
| 361 |
+
if (start==stop) return NULL;
|
| 362 |
+
assert(stop>start);
|
| 363 |
+
if ((start+1)==stop) return NULL; // list is empty
|
| 364 |
+
char const* p = tightfind_midpoint(start,stop);
|
| 365 |
+
// if ids can be larger than 67,108,864 on 32-bit machines
|
| 366 |
+
// (i.e., 2**(28-flagbits)), dest must be declared as uint64_t
|
| 367 |
+
size_t foo;
|
| 368 |
+
char const* after = tightread(p,stop,foo);
|
| 369 |
+
if (foo == key)
|
| 370 |
+
return after;
|
| 371 |
+
else if (foo > key)
|
| 372 |
+
{ // look in the lower half
|
| 373 |
+
return tightfind_noflags(start,p,key);
|
| 374 |
+
}
|
| 375 |
+
else
|
| 376 |
+
{ // look in the upper half
|
| 377 |
+
while (*after<0 && ++after < stop);
|
| 378 |
+
if (after == stop) return NULL;
|
| 379 |
+
return tightfind_noflags(after,stop,key);
|
| 380 |
+
}
|
| 381 |
+
}
|
| 382 |
+
|
| 383 |
+
bool
|
| 384 |
+
linear_search_noflags(std::istream& in, filepos_type start,
|
| 385 |
+
filepos_type stop, id_type key)
|
| 386 |
+
{ // performs a linear search in the range
|
| 387 |
+
std::ios::pos_type mystop = stop;
|
| 388 |
+
|
| 389 |
+
in.seekg(start);
|
| 390 |
+
id_type foo;
|
| 391 |
+
for(foo = tightread(in,stop); foo < key; foo = tightread(in,stop))
|
| 392 |
+
{
|
| 393 |
+
// skip the value associated with key /foo/
|
| 394 |
+
while (in.tellg() < mystop && in.peek() >= 128)
|
| 395 |
+
in.get();
|
| 396 |
+
if (in.tellg() == mystop)
|
| 397 |
+
return false; // not found
|
| 398 |
+
}
|
| 399 |
+
assert(in.tellg() < mystop);
|
| 400 |
+
return (foo==key);
|
| 401 |
+
}
|
| 402 |
+
|
| 403 |
+
|
| 404 |
+
bool
|
| 405 |
+
tightfind_noflags(std::istream& in, filepos_type start,
|
| 406 |
+
filepos_type stop, id_type key)
|
| 407 |
+
{
|
| 408 |
+
// returns true if the value is found
|
| 409 |
+
if (start==stop) return false;
|
| 410 |
+
assert(stop>start);
|
| 411 |
+
if ((start+1)==stop) return false; // list is empty
|
| 412 |
+
|
| 413 |
+
// granularity: point where we should switch to linear search,
|
| 414 |
+
// because otherwise we might skip over the entry we are looking for
|
| 415 |
+
// because we land right in the middle of it.
|
| 416 |
+
unsigned int const granularity = sizeof(filepos_type)*5;
|
| 417 |
+
// UG: why 5? we should be able to get away with less!
|
| 418 |
+
|
| 419 |
+
if (stop > start + granularity)
|
| 420 |
+
if (!tightfind_midpoint(in,start,stop))
|
| 421 |
+
return false; // something went wrong (empty index)
|
| 422 |
+
|
| 423 |
+
// If the search range is very short, tightfind_midpoint might skip the
|
| 424 |
+
// entry we are loking for. In this case, we can afford a linear
|
| 425 |
+
// search
|
| 426 |
+
if (stop <= start + granularity || in.tellg() == std::ios::pos_type(stop))
|
| 427 |
+
return linear_search_noflags(in,start,stop,key);
|
| 428 |
+
|
| 429 |
+
// Otherwise, perform binary search
|
| 430 |
+
filepos_type curpos = in.tellg();
|
| 431 |
+
id_type foo = tightread(in,stop);
|
| 432 |
+
if (foo == key)
|
| 433 |
+
return true; // done, found
|
| 434 |
+
|
| 435 |
+
else if (foo > key) // search first half
|
| 436 |
+
return tightfind_noflags(in,start,curpos,key);
|
| 437 |
+
|
| 438 |
+
else // search second half
|
| 439 |
+
{
|
| 440 |
+
std::ios::pos_type mystop = stop;
|
| 441 |
+
while (in.tellg() < mystop
|
| 442 |
+
&& in.rdbuf()->in_avail() > 0 // is that still necessary???
|
| 443 |
+
&& in.peek() >= 128)
|
| 444 |
+
in.get(); // skip associated value
|
| 445 |
+
if (in.rdbuf()->in_avail() == 0 || in.tellg() == mystop)
|
| 446 |
+
return false;
|
| 447 |
+
return tightfind_noflags(in,in.tellg(),stop,key);
|
| 448 |
+
}
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
void tightwrite2(std::ostream& out, size_t data, bool flag)
{
  // Same as tightwrite, but uses basic storage units of size 2:
  // /data/ is written in chunks of 15 bits, least significant chunk
  // first, one 2-byte unit per chunk in native byte order. The top
  // bit of every unit is set iff /flag/ is true. At least one unit is
  // always written.
  //
  // An unsigned 16-bit unit is used so that setting the top bit is
  // well-defined (adding 32768 to a short relies on an
  // implementation-defined narrowing conversion before C++20).
  uint16_t const marker = flag ? 0x8000 : 0;
  for (;;)
    {
      uint16_t const unit
        = static_cast<uint16_t>(static_cast<uint16_t>(data % 32768) | marker);
      out.write(reinterpret_cast<char const*>(&unit),2);
      if (data < 32768) break; // that was the last (most significant) chunk
      data >>= 15;
    }
}
|
| 477 |
+
|
| 478 |
+
// Shared decoder behind tightread8/tightread4/tightread2.
//
// Reads one tightly packed number starting at /start/. The high bit
// of the first byte determines which kind of number this is (key:
// bit cleared, value: bit set); all following bytes with the same
// high bit belong to the same number. Each byte contributes its low
// 7 bits, least significant group first. Bits that do not fit into
// numType are silently dropped.
//
// At most /maxBytes/ bytes are consumed. Reading stops early at
// /stop/ or at the first byte whose high bit differs. The caller must
// ensure start < stop. Returns the first position after the number.
//
// The high bit is tested explicitly (not via `*start < 0`), so the
// result does not depend on whether plain char is signed.
template<typename numType>
static
char const*
tightread_packed(char const* start, char const* stop,
                 numType& dest, int const maxBytes)
{
  unsigned char const first = static_cast<unsigned char>(*start);
  unsigned char const kind = first & 0x80;
  dest = first & 0x7f;
  for (int i = 1; i < maxBytes; ++i)
    {
      if (++start == stop) return start;
      unsigned char const c = static_cast<unsigned char>(*start);
      if ((c & 0x80) != kind) return start;
      dest += numType(c & 0x7f) << (7*i);
    }
  // all /maxBytes/ bytes consumed; /start/ still points at the last one
  assert(start<stop);
  return ++start;
}

/** read a 64-bit number (up to 10 bytes) from a tight index in memory
 *  @param start start of read range
 *  @param stop  non-inclusive end of read range
 *  @param dest  destination
 *  @return first memory position after the number
 */
char const*
tightread8(char const* start,
           char const* stop,
           uint64_t& dest)
{
  return tightread_packed(start,stop,dest,10);
}

/** read a 32-bit number (up to 5 bytes) from a tight index in memory
 *  @param start start of read range
 *  @param stop  non-inclusive end of read range
 *  @param dest  destination
 *  @return first memory position after the number
 */
char const*
tightread4(char const* start,
           char const* stop,
           uint32_t& dest)
{
  return tightread_packed(start,stop,dest,5);
}

/** read a 16-bit number (up to 3 bytes) from a tight index in memory
 *  @param start start of read range
 *  @param stop  non-inclusive end of read range
 *  @param dest  destination
 *  @return first memory position after the number
 */
char const*
tightread2(char const* start,
           char const* stop,
           uint16_t& dest)
{
  return tightread_packed(start,stop,dest,3);
}
|
| 594 |
+
} // end namespace ugdiss
|
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.h
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
// (c) 2007,2008 Ulrich Germann
|
| 3 |
+
/* Functions for writing indices tightly (use only the bytes you need).
|
| 4 |
+
* The first bit indicates whether a byte belongs to a key or a value.
|
| 5 |
+
* The remaining 7 bits are part of the respective integer value.
|
| 6 |
+
*/
|
| 7 |
+
#ifndef __ugTightIndex
|
| 8 |
+
#define __ugTightIndex
|
| 9 |
+
#include <map>
|
| 10 |
+
#include <iostream>
|
| 11 |
+
#include <sstream>
|
| 12 |
+
#include "tpt_typedefs.h"
|
| 13 |
+
#include <cassert>
|
| 14 |
+
|
| 15 |
+
#ifndef uchar
|
| 16 |
+
#endif
|
| 17 |
+
|
| 18 |
+
#define FLAGBITS 2
|
| 19 |
+
#define FLAGMASK (uchar(3))
|
| 20 |
+
#define HAS_VALUE_MASK (uchar(2))
|
| 21 |
+
#define HAS_CHILD_MASK (uchar(1))
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
extern bool debug;
|
| 25 |
+
|
| 26 |
+
namespace tpt
|
| 27 |
+
{
|
| 28 |
+
// void tightwritex(iostream& out, size_t data, bool flag);
|
| 29 |
+
void
|
| 30 |
+
tightwrite(std::ostream& out, uint64_t data, bool flag);
|
| 31 |
+
|
| 32 |
+
filepos_type
|
| 33 |
+
tightread(std::istream& in, std::ios::pos_type stop);
|
| 34 |
+
|
| 35 |
+
bool
|
| 36 |
+
tightfind(std::istream& in,
|
| 37 |
+
filepos_type start,
|
| 38 |
+
filepos_type stop,
|
| 39 |
+
id_type key,
|
| 40 |
+
unsigned char& flags);
|
| 41 |
+
|
| 42 |
+
bool
|
| 43 |
+
tightfind_noflags(std::istream& in,
|
| 44 |
+
filepos_type start,
|
| 45 |
+
filepos_type stop,
|
| 46 |
+
id_type key);
|
| 47 |
+
|
| 48 |
+
char const*
|
| 49 |
+
tightfind(char const* const start,
|
| 50 |
+
char const* const stop,
|
| 51 |
+
id_type key,
|
| 52 |
+
unsigned char& flags);
|
| 53 |
+
|
| 54 |
+
char const*
|
| 55 |
+
tightfind_noflags(char const* const start,
|
| 56 |
+
char const* const stop,
|
| 57 |
+
id_type key);
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
/** move read header in istream /in/ to the first entry after the midpoint of
|
| 62 |
+
* file position range [start,stop) in a 'tight' index
|
| 63 |
+
* @param in the data input stream
|
| 64 |
+
* @param start start of the search range
|
| 65 |
+
* @param stop end of the search range
|
| 66 |
+
* @return true if no errors occurred
|
| 67 |
+
*/
|
| 68 |
+
bool
|
| 69 |
+
tightfind_midpoint(std::istream& in, filepos_type start, filepos_type stop);
|
| 70 |
+
|
| 71 |
+
// the bitpattern functions below are for debugging
|
| 72 |
+
// They return a string showing the bits of the argument value
|
| 73 |
+
// std::string bitpattern(unsigned int s);
|
| 74 |
+
// std::string bitpattern(unsigned char c);
|
| 75 |
+
// std::string bitpattern(char c);
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
/** read a number from a tight index directly from a memory location
|
| 79 |
+
* @param start start of read range
|
| 80 |
+
* @param stop non-inclusive end of read range
|
| 81 |
+
* @param dest destination
|
| 82 |
+
* @return first memory position after the number
|
| 83 |
+
*/
|
| 84 |
+
|
| 85 |
+
char const*
|
| 86 |
+
tightread2(char const* start, char const* stop, uint16_t& dest);
|
| 87 |
+
|
| 88 |
+
char const*
|
| 89 |
+
tightread4(char const* start, char const* stop, uint32_t& dest);
|
| 90 |
+
|
| 91 |
+
char const*
|
| 92 |
+
tightread8(char const* start, char const* stop, uint64_t& dest);
|
| 93 |
+
|
| 94 |
+
template<typename numType>
|
| 95 |
+
char const*
|
| 96 |
+
tightread(char const* start, char const* stop, numType& dest)
|
| 97 |
+
{
|
| 98 |
+
if (sizeof(numType)==2)
|
| 99 |
+
return tightread2(start,stop,reinterpret_cast<uint16_t&>(dest));
|
| 100 |
+
if (sizeof(numType)==4)
|
| 101 |
+
return tightread4(start,stop,reinterpret_cast<uint32_t&>(dest));
|
| 102 |
+
else if (sizeof(numType)==8)
|
| 103 |
+
return tightread8(start,stop,reinterpret_cast<uint64_t&>(dest));
|
| 104 |
+
assert(0);
|
| 105 |
+
return NULL;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
// char const*
|
| 109 |
+
// tightread(char const* start, char const* stop, uint64_t& dest);
|
| 110 |
+
|
| 111 |
+
// char const*
|
| 112 |
+
// tightread(char const* start, char const* stop, filepos_type& dest);
|
| 113 |
+
|
| 114 |
+
#if 0
|
| 115 |
+
template<typename dtype>
|
| 116 |
+
char const*
|
| 117 |
+
tightread(char const* start,
|
| 118 |
+
char const* stop,
|
| 119 |
+
dtype& dest)
|
| 120 |
+
{
|
| 121 |
+
static char bitmask=127;
|
| 122 |
+
dest = 0;
|
| 123 |
+
if (*start < 0)
|
| 124 |
+
{
|
| 125 |
+
dest = (*start)&bitmask;
|
| 126 |
+
if (++start==stop || *start >= 0) return start;
|
| 127 |
+
dest += dtype((*start)&bitmask)<<7;
|
| 128 |
+
if (++start==stop || *start >= 0) return start;
|
| 129 |
+
dest += dtype((*start)&bitmask)<<14;
|
| 130 |
+
if (++start==stop || *start >= 0) return start;
|
| 131 |
+
dest += dtype((*start)&bitmask)<<21;
|
| 132 |
+
if (++start==stop || *start >= 0) return start;
|
| 133 |
+
dest += dtype((*start)&bitmask)<<28;
|
| 134 |
+
if (++start==stop || *start >= 0) return start;
|
| 135 |
+
assert(sizeof(dtype) > 4);
|
| 136 |
+
dest += dtype((*start)&bitmask)<<35;
|
| 137 |
+
if (++start==stop || *start >= 0) return start;
|
| 138 |
+
dest += dtype((*start)&bitmask)<<42;
|
| 139 |
+
if (++start==stop || *start >= 0) return start;
|
| 140 |
+
dest += dtype((*start)&bitmask)<<49;
|
| 141 |
+
if (++start==stop || *start >= 0) return start;
|
| 142 |
+
dest += dtype((*start)&bitmask)<<56;
|
| 143 |
+
if (++start==stop || *start >= 0) return start;
|
| 144 |
+
dest += dtype((*start)&bitmask)<<63;
|
| 145 |
+
}
|
| 146 |
+
else
|
| 147 |
+
{
|
| 148 |
+
dest = *start;
|
| 149 |
+
if (++start==stop || *start < 0) return start;
|
| 150 |
+
dest += dtype(*start)<<7;
|
| 151 |
+
if (++start==stop || *start < 0) return start;
|
| 152 |
+
dest += dtype(*start)<<14;
|
| 153 |
+
if (++start==stop || *start < 0) return start;
|
| 154 |
+
dest += dtype(*start)<<21;
|
| 155 |
+
if (++start==stop || *start < 0) return start;
|
| 156 |
+
dest += dtype(*start)<<28;
|
| 157 |
+
if (++start==stop || *start < 0) return start;
|
| 158 |
+
assert(sizeof(dtype) > 4);
|
| 159 |
+
dest += dtype(*start)<<35;
|
| 160 |
+
if (++start==stop || *start < 0) return start;
|
| 161 |
+
dest += dtype(*start)<<42;
|
| 162 |
+
if (++start==stop || *start < 0) return start;
|
| 163 |
+
dest += dtype(*start)<<49;
|
| 164 |
+
if (++start==stop || *start < 0) return start;
|
| 165 |
+
dest += dtype(*start)<<56;
|
| 166 |
+
if (++start==stop || *start < 0) return start;
|
| 167 |
+
dest += dtype(*start)<<63;
|
| 168 |
+
}
|
| 169 |
+
assert(start<stop);
|
| 170 |
+
return ++start;
|
| 171 |
+
}
|
| 172 |
+
#endif
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
}
|
| 176 |
+
#endif
|
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.cc
ADDED
|
@@ -0,0 +1,420 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
// (c) 2007-2013 Ulrich Germann
|
| 3 |
+
#include <sstream>
|
| 4 |
+
#include <cstring>
|
| 5 |
+
#include <algorithm>
|
| 6 |
+
#include <iostream>
|
| 7 |
+
#include <stdexcept>
|
| 8 |
+
|
| 9 |
+
#include <boost/pool/pool_alloc.hpp>
|
| 10 |
+
|
| 11 |
+
#include "tpt_tokenindex.h"
|
| 12 |
+
#include "ug_typedefs.h"
|
| 13 |
+
|
| 14 |
+
using namespace std;
|
| 15 |
+
namespace sapt
|
| 16 |
+
{
|
| 17 |
+
|
| 18 |
+
TokenIndex::
TokenIndex(string unkToken)
  : ridx(0), unkLabel(unkToken), unkId(1), numTokens(0)
  , startIdx(0), endIdx(0)
{
  // Creates an empty index that is not yet backed by a file; /unkToken/
  // is the label reported for unknown ids.
  //
  // /dynamic/ is otherwise only assigned in open(), yet it is read by
  // the lookup operators. Give it a defined value so that lookups on
  // an index that was never opened do not read an uninitialized flag.
  dynamic = false;
  lock.reset(new boost::mutex());
}
|
| 25 |
+
|
| 26 |
+
#if 0
|
| 27 |
+
TokenIndex::
|
| 28 |
+
TokenIndex(string fname, string unkToken,bool dyna)
|
| 29 |
+
: ridx(0),unkLabel(unkToken)
|
| 30 |
+
{
|
| 31 |
+
this->open(fname,unkToken,dyna);
|
| 32 |
+
};
|
| 33 |
+
#endif
|
| 34 |
+
|
| 35 |
+
void
|
| 36 |
+
TokenIndex::
|
| 37 |
+
open(string fname, string unkToken,bool dyna)
|
| 38 |
+
{
|
| 39 |
+
if (access(fname.c_str(),F_OK))
|
| 40 |
+
{
|
| 41 |
+
ostringstream msg;
|
| 42 |
+
msg << "TokenIndex::open: File '" << fname << "' does not exist.";
|
| 43 |
+
throw std::runtime_error(msg.str().c_str());
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
file.open(fname);
|
| 47 |
+
if (!file.is_open())
|
| 48 |
+
{
|
| 49 |
+
ostringstream msg;
|
| 50 |
+
msg << "TokenIndex::open: Error opening file '" << fname << "'.";
|
| 51 |
+
throw std::runtime_error(msg.str().c_str());
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
this->numTokens = *(reinterpret_cast<uint32_t const*>(file.data()));
|
| 55 |
+
unkId = *(reinterpret_cast<id_type const*>(file.data()+4));
|
| 56 |
+
|
| 57 |
+
startIdx = reinterpret_cast<Entry const*>(file.data()+4+sizeof(id_type));
|
| 58 |
+
endIdx = startIdx + numTokens;
|
| 59 |
+
comp.base = reinterpret_cast<char const*>(endIdx);
|
| 60 |
+
if (!unkToken.empty())
|
| 61 |
+
{
|
| 62 |
+
Entry const* bla = lower_bound(startIdx,endIdx,unkToken.c_str(),comp);
|
| 63 |
+
unkId = ((bla < endIdx && unkToken == comp.base+bla->offset)
|
| 64 |
+
? bla->id
|
| 65 |
+
: numTokens);
|
| 66 |
+
}
|
| 67 |
+
this->dynamic=dyna;
|
| 68 |
+
if (dyna)
|
| 69 |
+
{
|
| 70 |
+
this->str2idExtra.reset(new map<string,id_type>());
|
| 71 |
+
this->newWords.reset(new vector<string>());
|
| 72 |
+
}
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
void
|
| 76 |
+
TokenIndex::
|
| 77 |
+
close()
|
| 78 |
+
{
|
| 79 |
+
file.close();
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
// Default constructor: performs no initialization of its own.
TokenIndex::
CompFunc::
CompFunc()
{
}
|
| 86 |
+
|
| 87 |
+
bool
|
| 88 |
+
TokenIndex::
|
| 89 |
+
CompFunc::
|
| 90 |
+
operator()(Entry const& A, char const* w)
|
| 91 |
+
{
|
| 92 |
+
return strcmp(base+A.offset,w) < 0;
|
| 93 |
+
};
|
| 94 |
+
|
| 95 |
+
id_type
TokenIndex::
operator[](char const* p) const
{
  // Maps the token string /p/ to its id.
  // - Tokens found in the index get their stored id.
  // - In non-dynamic mode, all other tokens map to the id of the
  //   unknown token; for an empty index, "NULL" maps to 0.
  // - In dynamic mode, unseen tokens are assigned new ids on the fly.
  if (startIdx != endIdx)
    {
      Entry const* bla = lower_bound(startIdx,endIdx,p,comp);
      if (bla != endIdx && !strcmp(comp.base+bla->offset,p))
        return bla->id;
      if (!dynamic) return unkId;
    }
  else if (!dynamic)
    {
      // NOTE: this used to read `strcmp(p,"NULL") && unkId`, which
      // collapses the result to 0 or 1 instead of yielding unkId.
      return strcmp(p,"NULL") ? unkId : id_type(0);
    }

  boost::lock_guard<boost::mutex> lk(*this->lock);
  // stuff below is new as of 2011-01-30, for dynamic adding of
  // unknown items. IMPORTANT: numTokens is currently not changed;
  // it is the number of PRE-EXISTING TOKENS, not including
  // dynamically added items.
  // New ids continue after the pre-existing ones, in order of insertion.
  map<string,id_type>::value_type newItem(p,str2idExtra->size()+numTokens);
  pair<map<string,id_type>::iterator,bool> foo = str2idExtra->insert(newItem);
  if (foo.second) // it actually is a new item
    newWords->push_back(foo.first->first);
  return foo.first->second;
}
|
| 124 |
+
|
| 125 |
+
id_type
TokenIndex::
operator[](string const& w) const
{
  // Convenience overload: forwards to the C-string lookup.
  char const* const token = w.c_str();
  return this->operator[](token);
}
|
| 131 |
+
|
| 132 |
+
vector<char const*>
TokenIndex::
reverseIndex() const
{
  // Builds the id -> string table for all entries in the index.
  // Slots of ids that do not occur in the index stay NULL.
  size_t const count = endIdx-startIdx;
  vector<char const*> id2str(count,NULL);

  Entry const* cur = startIdx;
  while (cur != endIdx)
    {
      // ids may exceed the number of entries; grow the table as needed
      if (cur->id >= id2str.size())
        id2str.resize(cur->id+1);
      id2str[cur->id] = comp.base+cur->offset;
      ++cur;
    }
  return id2str;
}
|
| 151 |
+
|
| 152 |
+
char const* const
|
| 153 |
+
TokenIndex::
|
| 154 |
+
operator[](id_type id) const
|
| 155 |
+
{
|
| 156 |
+
if (!ridx.size())
|
| 157 |
+
{
|
| 158 |
+
boost::lock_guard<boost::mutex> lk(*this->lock);
|
| 159 |
+
// Someone else (multi-threading!) may have created the
|
| 160 |
+
// reverse index in the meantime, so let's check again
|
| 161 |
+
if (!ridx.size()) ridx = reverseIndex();
|
| 162 |
+
}
|
| 163 |
+
if (id < ridx.size())
|
| 164 |
+
return ridx[id];
|
| 165 |
+
|
| 166 |
+
boost::lock_guard<boost::mutex> lk(*this->lock);
|
| 167 |
+
if (dynamic && id < ridx.size()+newWords->size())
|
| 168 |
+
return (*newWords)[id-ridx.size()].c_str();
|
| 169 |
+
return unkLabel.c_str();
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
void
|
| 173 |
+
TokenIndex::
|
| 174 |
+
iniReverseIndex()
|
| 175 |
+
{
|
| 176 |
+
if (!ridx.size())
|
| 177 |
+
{
|
| 178 |
+
boost::lock_guard<boost::mutex> lk(*this->lock);
|
| 179 |
+
if (!ridx.size()) ridx = reverseIndex();
|
| 180 |
+
}
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
char const* const
|
| 185 |
+
TokenIndex::
|
| 186 |
+
operator[](id_type id)
|
| 187 |
+
{
|
| 188 |
+
if (!ridx.size())
|
| 189 |
+
{
|
| 190 |
+
boost::lock_guard<boost::mutex> lk(*this->lock);
|
| 191 |
+
if (!ridx.size()) ridx = reverseIndex();
|
| 192 |
+
}
|
| 193 |
+
if (id < ridx.size())
|
| 194 |
+
return ridx[id];
|
| 195 |
+
boost::lock_guard<boost::mutex> lk(*this->lock);
|
| 196 |
+
if (dynamic && id < ridx.size()+newWords->size())
|
| 197 |
+
return (*newWords)[id-ridx.size()].c_str();
|
| 198 |
+
return unkLabel.c_str();
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
/// Render the ID sequence @p v as token strings separated by single
/// spaces. Builds the reverse index first if it does not exist yet.
string
TokenIndex::
toString(vector<id_type> const& v)
{
  if (ridx.empty())
    {
      boost::lock_guard<boost::mutex> guard(*this->lock);
      if (ridx.empty())
        ridx = reverseIndex();
    }
  ostringstream out;
  for (size_t k = 0; k < v.size(); ++k)
    {
      if (k != 0) out << " ";
      out << (*this)[v[k]];
    }
  return out.str();
}
|
| 215 |
+
|
| 216 |
+
/// Const variant: render the ID sequence @p v as space-separated
/// token strings, building the reverse index on first use.
string
TokenIndex::
toString(vector<id_type> const& v) const
{
  if (ridx.empty())
    {
      boost::lock_guard<boost::mutex> guard(*this->lock);
      if (ridx.empty())
        ridx = reverseIndex();
    }
  ostringstream out;
  size_t const count = v.size();
  for (size_t k = 0; k < count; ++k)
    {
      if (k > 0) out << " ";
      out << (*this)[v[k]];
    }
  return out.str();
}
|
| 230 |
+
|
| 231 |
+
/// Render the IDs in [start, stop) as space-separated token strings.
/// An empty or inverted range yields an empty string.
string
TokenIndex::
toString(id_type const* start, id_type const* const stop)
{
  if (ridx.empty())
    {
      boost::lock_guard<boost::mutex> guard(*this->lock);
      if (ridx.empty())
        ridx = reverseIndex();
    }
  ostringstream out;
  for (id_type const* cur = start; cur < stop; ++cur)
    {
      if (cur != start) out << " ";
      out << (*this)[*cur];
    }
  return out.str();
}
|
| 247 |
+
|
| 248 |
+
/// Const variant: render the IDs in [start, stop) as space-separated
/// token strings; nothing is written when start is not below stop.
string
TokenIndex::
toString(id_type const* start, id_type const* const stop) const
{
  if (ridx.empty())
    {
      boost::lock_guard<boost::mutex> guard(*this->lock);
      if (ridx.empty())
        ridx = reverseIndex();
    }
  ostringstream out;
  bool first = true;
  for (id_type const* cur = start; cur < stop; ++cur)
    {
      if (!first) out << " ";
      out << (*this)[*cur];
      first = false;
    }
  return out.str();
}
|
| 264 |
+
|
| 265 |
+
/// Split @p line on whitespace and return the ID of every token,
/// in order of appearance.
vector<id_type>
TokenIndex::
toIdSeq(string const& line) const
{
  vector<id_type> ids;
  istringstream in(line);
  for (string tok; in >> tok; )
    ids.push_back((*this)[tok]);
  return ids;
}
|
| 276 |
+
|
| 277 |
+
/// Return false if line contains unknown tokens, true otherwise
|
| 278 |
+
bool
|
| 279 |
+
TokenIndex::
|
| 280 |
+
fillIdSeq(string const& line, vector<id_type> & v) const
|
| 281 |
+
{
|
| 282 |
+
bool allgood = true; string w;
|
| 283 |
+
v.clear();
|
| 284 |
+
for (istringstream buf(line); buf>>w;)
|
| 285 |
+
{
|
| 286 |
+
v.push_back((*this)[w]);
|
| 287 |
+
allgood = allgood && v.back() > 1;
|
| 288 |
+
}
|
| 289 |
+
return allgood;
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
/// Value of numTokens (dynamically added items are not counted there).
id_type
TokenIndex::
getNumTokens() const
{ return numTokens; }
|
| 298 |
+
|
| 299 |
+
/// ID currently used for unknown tokens.
id_type
TokenIndex::
getUnkId() const
{ return unkId; }
|
| 305 |
+
|
| 306 |
+
char const* const
|
| 307 |
+
TokenIndex::
|
| 308 |
+
getUnkToken() const
|
| 309 |
+
{
|
| 310 |
+
return unkLabel.c_str();
|
| 311 |
+
// return (*this)[unkId];
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
/// Vocabulary size without dynamically added words (same as ksize()).
id_type
TokenIndex::
knownVocabSize() const
{ return numTokens; }
|
| 320 |
+
|
| 321 |
+
/// Short name for knownVocabSize(): returns numTokens.
id_type
TokenIndex::
ksize() const
{ return numTokens; }
|
| 327 |
+
|
| 328 |
+
/// Vocabulary size including dynamically added words (same as tsize()).
id_type
TokenIndex::
totalVocabSize() const
{
  return tsize();
}
|
| 332 |
+
|
| 333 |
+
/// numTokens plus the number of dynamically added words, if the
/// newWords container has been allocated; otherwise just numTokens.
id_type
TokenIndex::
tsize() const
{
  if (newWords != NULL)
    return numTokens + newWords->size();
  return numTokens;
}
|
| 341 |
+
|
| 342 |
+
void
|
| 343 |
+
write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok,
|
| 344 |
+
string const& ofile, string const& unkToken)
|
| 345 |
+
{
|
| 346 |
+
typedef pair<uint32_t,id_type> IndexEntry; // offset and id
|
| 347 |
+
|
| 348 |
+
// Write token strings to a buffer, keep track of offsets
|
| 349 |
+
vector<IndexEntry> index(tok.size());
|
| 350 |
+
ostringstream data;
|
| 351 |
+
id_type unkId = tok.size();
|
| 352 |
+
for (size_t i = 0; i < tok.size(); i++)
|
| 353 |
+
{
|
| 354 |
+
if (tok[i].first == unkToken)
|
| 355 |
+
unkId = tok[i].second;
|
| 356 |
+
index[i].first = data.tellp(); // offset of string
|
| 357 |
+
index[i].second = tok[i].second; // respective ID
|
| 358 |
+
data<<tok[i].first<<char(0); // write string to buffer
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
// Now write the actual file
|
| 362 |
+
ofstream out(ofile.c_str());
|
| 363 |
+
uint32_t vsize = index.size(); // how many vocab items?
|
| 364 |
+
out.write(reinterpret_cast<char*>(&vsize),4);
|
| 365 |
+
out.write(reinterpret_cast<char*>(&unkId),sizeof(id_type));
|
| 366 |
+
for (size_t i = 0; i < index.size(); i++)
|
| 367 |
+
{
|
| 368 |
+
out.write(reinterpret_cast<char*>(&index[i].first),4);
|
| 369 |
+
out.write(reinterpret_cast<char*>(&index[i].second),sizeof(id_type));
|
| 370 |
+
}
|
| 371 |
+
out<<data.str();
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
void
|
| 375 |
+
TokenIndex::
|
| 376 |
+
write(string fname)
|
| 377 |
+
{
|
| 378 |
+
typedef pair<string,uint32_t> Token; // token and id
|
| 379 |
+
vector<Token> tok(totalVocabSize());
|
| 380 |
+
for (id_type i = 0; i < tok.size(); ++i)
|
| 381 |
+
tok[i] = Token((*this)[i],i);
|
| 382 |
+
sort(tok.begin(),tok.end());
|
| 383 |
+
write_tokenindex_to_disk(tok,fname,unkLabel);
|
| 384 |
+
}
|
| 385 |
+
|
| 386 |
+
bool
|
| 387 |
+
TokenIndex::
|
| 388 |
+
isDynamic() const
|
| 389 |
+
{
|
| 390 |
+
return dynamic;
|
| 391 |
+
}
|
| 392 |
+
|
| 393 |
+
bool
|
| 394 |
+
TokenIndex::
|
| 395 |
+
setDynamic(bool on)
|
| 396 |
+
{
|
| 397 |
+
bool ret = dynamic;
|
| 398 |
+
if (on && this->str2idExtra == NULL)
|
| 399 |
+
{
|
| 400 |
+
this->str2idExtra.reset(new map<string,id_type>());
|
| 401 |
+
this->newWords.reset(new vector<string>());
|
| 402 |
+
}
|
| 403 |
+
dynamic = on;
|
| 404 |
+
if (on)
|
| 405 |
+
{
|
| 406 |
+
(*this)["NULL"];
|
| 407 |
+
(*this)[unkLabel];
|
| 408 |
+
}
|
| 409 |
+
return ret;
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
void
|
| 413 |
+
TokenIndex::
|
| 414 |
+
setUnkLabel(string unk)
|
| 415 |
+
{
|
| 416 |
+
unkId = (*this)[unk];
|
| 417 |
+
unkLabel = unk;
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
}
|
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.cc
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
//-*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
|
| 2 |
+
|
| 3 |
+
#include "ug_bitext.h"
|
| 4 |
+
#include <algorithm>
|
| 5 |
+
#include <boost/math/distributions/binomial.hpp>
|
| 6 |
+
|
| 7 |
+
namespace sapt
|
| 8 |
+
{
|
| 9 |
+
|
| 10 |
+
float
|
| 11 |
+
lbop(size_t const tries, size_t const succ, float const confidence)
|
| 12 |
+
{
|
| 13 |
+
return (confidence == 0
|
| 14 |
+
? float(succ)/tries
|
| 15 |
+
: (boost::math::binomial_distribution<>::
|
| 16 |
+
find_lower_bound_on_p(tries, succ, confidence)));
|
| 17 |
+
}
|
| 18 |
+
|
| 19 |
+
void
|
| 20 |
+
snt_adder<L2R_Token<SimpleWordId> >::
|
| 21 |
+
operator()()
|
| 22 |
+
{
|
| 23 |
+
typedef L2R_Token<SimpleWordId> tkn;
|
| 24 |
+
std::vector<id_type> sids; sids.reserve(snt.size());
|
| 25 |
+
BOOST_FOREACH(std::string const& foo, snt)
|
| 26 |
+
{
|
| 27 |
+
sids.push_back(track ? track->size() : 0);
|
| 28 |
+
std::istringstream buf(foo);
|
| 29 |
+
std::string w;
|
| 30 |
+
std::vector<tkn> s; s.reserve(100);
|
| 31 |
+
while (buf >> w) s.push_back(tkn(V[w]));
|
| 32 |
+
track = append(track,s);
|
| 33 |
+
}
|
| 34 |
+
if (index)
|
| 35 |
+
index.reset(new imTSA<tkn>(*index,track,sids,V.tsize()));
|
| 36 |
+
else
|
| 37 |
+
index.reset(new imTSA<tkn>(track,NULL,NULL));
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
// Bind the sentences to add (/s/), the vocabulary (/v/), and the track
// and index smart pointers (/t/, /i/). All four are stored as references.
snt_adder<L2R_Token<SimpleWordId> >::
snt_adder(std::vector<std::string> const& s, TokenIndex& v,
          SPTR<imTtrack<L2R_Token<SimpleWordId> > >& t,
          SPTR<imTSA<L2R_Token<SimpleWordId> > >& i)
  : snt(s),
    V(v),
    track(t),
    index(i)
{
}
|
| 46 |
+
|
| 47 |
+
// Grow a phrase pair starting from target position /s2/ by following
// alignment links back and forth between the two sides (worklist in
// /agenda/). /a1/ lists, per source position, the aligned target
// positions; /a2/ lists, per target position, the aligned source
// positions.
//
// Returns false if
//  - /s2/ has no alignment links,
//  - a link leads to a target position before /s2/, or
//  - a link leads into, or the span would straddle, the source range
//    [L1,R1) of the previous phrase.
// On success returns true with /s1/, /e1/, /e2/ set; /e1/ and /e2/ are
// incremented once more right before returning.
bool
expand_phrase_pair
(std::vector<std::vector<ushort> >& a1,
 std::vector<std::vector<ushort> >& a2,
 ushort const s2, // next word on the target side
 ushort const L1, ushort const R1, // limits of previous phrase
 ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg
{
  if (a2[s2].size() == 0)
    {
      // unaligned start word: report the location and give up
      std::cout << __FILE__ << ":" << __LINE__ << std::endl;
      return false;
    }
  bitvector done1(a1.size());
  bitvector done2(a2.size());
  std::vector<std::pair<ushort,ushort> > agenda;
  // x.first:  side (1 or 2)
  // x.second: word position
  agenda.reserve(a1.size() + a2.size());
  agenda.push_back(std::pair<ushort,ushort>(2,s2));
  e2 = s2;
  s1 = e1 = a2[s2].front();
  if (s1 >= L1 && s1 < R1)
    {
      // first link of s2 points into the previous phrase
      std::cout << __FILE__ << ":" << __LINE__ << std::endl;
      return false;
    }
  // NOTE(review): (2,s2) was already pushed above, so it is processed
  // twice -- looks redundant; confirm before removing.
  agenda.push_back(std::pair<ushort,ushort>(2,s2));
  while (agenda.size())
    {
      ushort side = agenda.back().first;
      ushort p    = agenda.back().second;
      agenda.pop_back();
      if (side == 1)
        {
          // source position p: follow its links to the target side
          done1.set(p);
          BOOST_FOREACH(ushort i, a1[p])
            {
              if (i < s2)
                {
                  // cout << __FILE__ << ":" << __LINE__ << endl;
                  return false;
                }
              if (done2[i]) continue;
              // extend the target end up to i, queueing new positions
              for (;e2 <= i;++e2)
                if (!done2[e2])
                  agenda.push_back(std::pair<ushort,ushort>(2,e2));
            }
        }
      else
        {
          // target position p: follow its links to the source side
          done2.set(p);
          BOOST_FOREACH(ushort i, a2[p])
            {
              if ((e1 < L1 && i >= L1) ||
                  (s1 >= R1 && i < R1) ||
                  (i >= L1 && i < R1))
                {
                  // cout << __FILE__ << ":" << __LINE__ << " "
                  // << L1 << "-" << R1 << " " << i << " "
                  // << s1 << "-" << e1<< endl;
                  return false;
                }

              if (e1 < i)
                {
                  // extend the source end up to i
                  for (; e1 <= i; ++e1)
                    if (!done1[e1])
                      agenda.push_back(std::pair<ushort,ushort>(1,e1));
                }
              else if (s1 > i)
                {
                  // NOTE(review): this queues positions i..s1 but never
                  // lowers s1 itself (the loop advances the local copy
                  // i) -- confirm that s1 is meant to stay unchanged.
                  for (; i <= s1; ++i)
                    if (!done1[i])
                      agenda.push_back(std::pair<ushort,ushort>(1,i));
                }
            }
        }
    }
  ++e1;
  ++e2;
  return true;
}
|
| 130 |
+
|
| 131 |
+
void
|
| 132 |
+
print_amatrix(std::vector<std::vector<ushort> > a1, uint32_t len2,
|
| 133 |
+
ushort b1, ushort e1, ushort b2, ushort e2)
|
| 134 |
+
{
|
| 135 |
+
using namespace std;
|
| 136 |
+
std::vector<bitvector> M(a1.size(),bitvector(len2));
|
| 137 |
+
for (ushort j = 0; j < a1.size(); ++j)
|
| 138 |
+
{
|
| 139 |
+
BOOST_FOREACH(ushort k, a1[j])
|
| 140 |
+
M[j].set(k);
|
| 141 |
+
}
|
| 142 |
+
cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << endl;
|
| 143 |
+
cout << " ";
|
| 144 |
+
for (size_t c = 0; c < len2;++c)
|
| 145 |
+
cout << c%10;
|
| 146 |
+
cout << endl;
|
| 147 |
+
for (size_t r = 0; r < M.size(); ++r)
|
| 148 |
+
{
|
| 149 |
+
cout << setw(3) << r << " ";
|
| 150 |
+
for (size_t c = 0; c < M[r].size(); ++c)
|
| 151 |
+
{
|
| 152 |
+
if ((b1 <= r) && (r < e1) && b2 <= c && c < e2)
|
| 153 |
+
cout << (M[r][c] ? 'x' : '-');
|
| 154 |
+
else cout << (M[r][c] ? 'o' : '.');
|
| 155 |
+
}
|
| 156 |
+
cout << endl;
|
| 157 |
+
}
|
| 158 |
+
cout << std::string(90,'-') << endl;
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
void
|
| 162 |
+
write_bitvector(bitvector const& v, std::ostream& out)
|
| 163 |
+
{
|
| 164 |
+
for (size_t i = v.find_first(); i < v.size();)
|
| 165 |
+
{
|
| 166 |
+
out << i;
|
| 167 |
+
if ((i = v.find_next(i)) < v.size()) out << ",";
|
| 168 |
+
}
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
}
|
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.h
ADDED
|
@@ -0,0 +1,782 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
// Implementations of word-aligned bitext.
|
| 4 |
+
// Written by Ulrich Germann
|
| 5 |
+
//
|
| 6 |
+
// mmBitext: static, memory-mapped bitext
|
| 7 |
+
// imBitext: dynamic, in-memory bitext
|
| 8 |
+
//
|
| 9 |
+
|
| 10 |
+
// things we can do to speed up things:
|
| 11 |
+
// - set up threads at startup time that force the
|
| 12 |
+
// data in to memory sequentially
|
| 13 |
+
//
|
| 14 |
+
// - use multiple agendas for better load balancing and to avoid
|
| 15 |
+
// competition for locks
|
| 16 |
+
//
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
#define UG_BITEXT_TRACK_ACTIVE_THREADS 0
|
| 20 |
+
|
| 21 |
+
#include <string>
|
| 22 |
+
#include <vector>
|
| 23 |
+
#include <cassert>
|
| 24 |
+
#include <iomanip>
|
| 25 |
+
#include <algorithm>
|
| 26 |
+
|
| 27 |
+
#include <boost/foreach.hpp>
|
| 28 |
+
#include <boost/random.hpp>
|
| 29 |
+
#include <boost/format.hpp>
|
| 30 |
+
#include <boost/thread.hpp>
|
| 31 |
+
#include <boost/unordered_map.hpp>
|
| 32 |
+
#include <boost/math/distributions/binomial.hpp>
|
| 33 |
+
|
| 34 |
+
#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
|
| 35 |
+
#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
|
| 36 |
+
#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
|
| 37 |
+
#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
|
| 38 |
+
#include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
|
| 39 |
+
// #include "moses/FF/LexicalReordering/LexicalReorderingState.h"
|
| 40 |
+
#include "moses/Util.h"
|
| 41 |
+
|
| 42 |
+
#ifndef NO_MOSES
|
| 43 |
+
// #pragma message "COMPILING WITH MOSES SUPPORT!"
|
| 44 |
+
#include "moses/StaticData.h"
|
| 45 |
+
#include "moses/thread_safe_container.h"
|
| 46 |
+
#include "moses/ContextScope.h"
|
| 47 |
+
#include "moses/TranslationTask.h"
|
| 48 |
+
#else
|
| 49 |
+
// #pragma message "COMPILING WITHOUT MOSES SUPPORT!"
|
| 50 |
+
#endif
|
| 51 |
+
|
| 52 |
+
#include "util/exception.hh"
|
| 53 |
+
// #include "util/check.hh"
|
| 54 |
+
|
| 55 |
+
#include "ug_typedefs.h"
|
| 56 |
+
#include "ug_mm_ttrack.h"
|
| 57 |
+
#include "ug_im_ttrack.h"
|
| 58 |
+
#include "ug_mm_tsa.h"
|
| 59 |
+
#include "ug_im_tsa.h"
|
| 60 |
+
#include "tpt_tokenindex.h"
|
| 61 |
+
#include "ug_corpus_token.h"
|
| 62 |
+
#include "tpt_pickler.h"
|
| 63 |
+
#include "ug_lexical_phrase_scorer2.h"
|
| 64 |
+
#include "ug_lru_cache.h"
|
| 65 |
+
#include "ug_lexical_reordering.h"
|
| 66 |
+
#include "ug_sampling_bias.h"
|
| 67 |
+
#include "ug_phrasepair.h"
|
| 68 |
+
#include "ug_bitext_phrase_extraction_record.h"
|
| 69 |
+
#include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
|
| 70 |
+
|
| 71 |
+
// Minimum source count for caching phrase lookup statistics.
|
| 72 |
+
// If source phrase occurs less frequently, never cache;
|
| 73 |
+
// always re-compute.
|
| 74 |
+
#define PSTATS_CACHE_THRESHOLD 50
|
| 75 |
+
|
| 76 |
+
namespace Moses { class Mmsapt; }
|
| 77 |
+
namespace sapt
|
| 78 |
+
{
|
| 79 |
+
using Moses::ttasksptr;
|
| 80 |
+
using Moses::ttaskwptr;
|
| 81 |
+
using tpt::binread;
|
| 82 |
+
using tpt::binwrite;
|
| 83 |
+
|
| 84 |
+
float lbop(size_t const tries, size_t const succ, float const confidence);
|
| 85 |
+
void write_bitvector(bitvector const& v, std::ostream& out);
|
| 86 |
+
|
| 87 |
+
#ifndef NO_MOSES
|
| 88 |
+
// Per-query context: a sampling bias, two pstats caches and an optional
// log stream for bias information, together with a lock member.
struct
ContextForQuery
{
  // needs to be made thread-safe
  // ttasksptr const m_ttask;
  // size_t max_samples;
  boost::shared_mutex lock;             // lock stored with the context
  SPTR<SamplingBias> bias;              // sampling bias (may be unset)
  SPTR<pstats::cache_t> cache1, cache2; // pstats caches
  std::ostream* bias_log;               // NULL unless set by the owner
  ContextForQuery() : bias_log(NULL) { }
};
|
| 100 |
+
#endif
|
| 101 |
+
|
| 102 |
+
template<typename Token> class BitextSampler;
|
| 103 |
+
|
| 104 |
+
// Base class for word-aligned bitexts over tokens of type TKN.
// Bundles the token tracks, vocabularies and indices of both sides
// with the word alignment track, and declares the sampling / lookup
// interface. open() is pure virtual, so concrete bitexts must derive
// from this class.
template<typename TKN>
class Bitext // : public Moses::reference_counter
{
public:
  template<typename Token> friend class BitextSampler;
  typedef TKN Token;
  typedef typename TSA<Token>::tree_iterator iter;
  typedef typename std::vector<PhrasePair<Token> > vec_ppair;
  typedef typename lru_cache::LRU_Cache<uint64_t, vec_ppair> pplist_cache_t;
  typedef TSA<Token> tsa;
  friend class Moses::Mmsapt;
protected:
  mutable boost::shared_mutex m_lock; // for thread-safe operation

  class agenda; // for parallel sampling see ug_bitext_agenda.h
  mutable SPTR<agenda> ag;
  size_t m_num_workers; // number of workers available to the agenda

  size_t m_default_sample_size; // see get/setDefaultSampleSize()
  size_t m_pstats_cache_threshold; // threshold for caching sampling results
  SPTR<pstats::cache_t> m_cache1, m_cache2; // caches for sampling results

  std::vector<std::string> m_docname; // document names, indexed by doc id
  std::map<std::string,id_type> m_docname2docid; // maps from doc names to ids
  SPTR<std::vector<id_type> > m_sid2docid; // maps from sentences to docs (ids)

  mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
  // caches for unbiased sampling; biased sampling uses the caches that
  // are stored locally on the translation task
public:
  SPTR<Ttrack<char> > Tx; // word alignments
  SPTR<Ttrack<Token> > T1; // token track
  SPTR<Ttrack<Token> > T2; // token track
  SPTR<TokenIndex> V1; // vocab
  SPTR<TokenIndex> V2; // vocab
  SPTR<TSA<Token> > I1; // indices
  SPTR<TSA<Token> > I2; // indices

  /// Given the source phrase sid[start:stop],
  // find the possible start (s1 .. s2) and end (e1 .. e2)
  // points of the target phrase; if core_alignment is non-NULL, store
  // the word alignments in *core_alignment. If /flip/ is set, the
  // source phrase is in L2.
  bool find_trg_phr_bounds(PhraseExtractionRecord& rec) const;
  bool find_trg_phr_bounds
  ( size_t const sid, // sentence to investigate
    size_t const start, // start of source phrase
    size_t const stop, // last position of source phrase
    size_t & s1, size_t & s2, // beginning and end of target start
    size_t & e1, size_t & e2, // beginning and end of target end
    int& po_fwd, int& po_bwd, // phrase orientations
    std::vector<unsigned char> * core_alignment, // stores the core alignment
    bitvector* full_alignment, // stores full word alignment for this sent.
    bool const flip) const; // flip source and target (reverse lookup)

  // prep2 launches sampling and returns immediately.
  // lookup (below) waits for the job to finish before it returns
  SPTR<pstats>
  prep2(iter const& phrase, int max_sample = -1) const;

#ifndef NO_MOSES
  SPTR<pstats>
  prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
        int max_sample = -1) const;
#endif

protected:
  // Constructors are protected: instances are created via derived classes.
  Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16);

  Bitext(Ttrack<Token>* const t1, Ttrack<Token>* const t2,
         Ttrack<char>* const tx,
         TokenIndex* const v1, TokenIndex* const v2,
         TSA<Token>* const i1, TSA<Token>* const i2,
         size_t const max_sample=1000,
         size_t const xnum_workers=16);
public:
  virtual void
  open(std::string const base, std::string const L1, std::string const L2) = 0;

  SPTR<pstats>
  lookup(iter const& phrase, int max_sample = -1) const;

  void prep(iter const& phrase) const;

#ifndef NO_MOSES
  SPTR<pstats>
  lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;

  void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const;
#endif

  // Changing the default sample size also replaces both pstats caches.
  void setDefaultSampleSize(size_t const max_samples);
  size_t getDefaultSampleSize() const;

  // Space-separated surface string of the phrase with packed id /pid/;
  // a non-zero /isL2/ selects the second language side.
  std::string toString(uint64_t pid, int isL2) const;

  virtual size_t revision() const { return 0; }

  // Read one bias value per sentence from the file /fname/.
  SPTR<SentenceBias>
  loadSentenceBias(std::string const& fname) const;

  SPTR<DocumentBias>
  SetupDocumentBias(std::string const& bserver, std::string const& text,
                    std::ostream* log) const;

  SPTR<DocumentBias>
  SetupDocumentBias(std::map<std::string,float> context_weights,
                    std::ostream* log) const;

  void
  mark_match(Token const* start, Token const* end, iter const& m,
             bitvector& check) const;
  void
  write_yawat_alignment
  ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const;

  // Document bookkeeping.
  std::string sid2docname(id_type const sid) const; // "" if not known
  // NOTE(review): the argument is a document id (the definition names
  // it /did/), despite the parameter name used here.
  std::string docid2name(id_type const sid) const;
  int docname2docid(std::string const& name) const; // -1 if not known

  std::vector<id_type> const* sid2did() const; // raw map; may be NULL
  int sid2did(uint32_t sid) const; // -1 if there is no map
};
|
| 227 |
+
|
| 228 |
+
#include "ug_bitext_agenda.h"
|
| 229 |
+
|
| 230 |
+
template<typename Token>
|
| 231 |
+
int
|
| 232 |
+
Bitext<Token>::
|
| 233 |
+
docname2docid(std::string const& name) const
|
| 234 |
+
{
|
| 235 |
+
std::map<std::string,id_type>::const_iterator m;
|
| 236 |
+
m = m_docname2docid.find(name);
|
| 237 |
+
if (m != m_docname2docid.end()) return m->second;
|
| 238 |
+
return -1;
|
| 239 |
+
}
|
| 240 |
+
|
| 241 |
+
template<typename Token>
|
| 242 |
+
std::string
|
| 243 |
+
Bitext<Token>::
|
| 244 |
+
docid2name(id_type const did) const
|
| 245 |
+
{
|
| 246 |
+
if (did < m_docname.size())
|
| 247 |
+
return m_docname[did];
|
| 248 |
+
else
|
| 249 |
+
return (boost::format("%d") % did).str();
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
template<typename Token>
|
| 253 |
+
std::string
|
| 254 |
+
Bitext<Token>::
|
| 255 |
+
sid2docname(id_type const sid) const
|
| 256 |
+
{
|
| 257 |
+
if (sid < m_sid2docid->size() && (*m_sid2docid)[sid] < m_docname.size())
|
| 258 |
+
return m_docname[(*m_sid2docid)[sid]];
|
| 259 |
+
else
|
| 260 |
+
return "";
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
template<typename Token>
|
| 264 |
+
std::vector<id_type> const*
|
| 265 |
+
Bitext<Token>::
|
| 266 |
+
sid2did() const
|
| 267 |
+
{
|
| 268 |
+
return m_sid2docid.get();
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
template<typename Token>
|
| 272 |
+
int
|
| 273 |
+
Bitext<Token>::
|
| 274 |
+
sid2did(uint32_t sid) const
|
| 275 |
+
{
|
| 276 |
+
if (m_sid2docid)
|
| 277 |
+
return m_sid2docid->at(sid);
|
| 278 |
+
return -1;
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
|
| 282 |
+
template<typename Token>
|
| 283 |
+
SPTR<SentenceBias>
|
| 284 |
+
Bitext<Token>::
|
| 285 |
+
loadSentenceBias(std::string const& fname) const
|
| 286 |
+
{
|
| 287 |
+
SPTR<SentenceBias> ret(new SentenceBias(T1->size()));
|
| 288 |
+
std::ifstream in(fname.c_str());
|
| 289 |
+
size_t i = 0;
|
| 290 |
+
float v; while (in>>v) (*ret)[i++] = v;
|
| 291 |
+
UTIL_THROW_IF2(i != T1->size(),
|
| 292 |
+
"Mismatch between bias vector size and corpus size at "
|
| 293 |
+
<< HERE);
|
| 294 |
+
return ret;
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
template<typename Token>
|
| 298 |
+
std::string
|
| 299 |
+
Bitext<Token>::
|
| 300 |
+
toString(uint64_t pid, int isL2) const
|
| 301 |
+
{
|
| 302 |
+
std::ostringstream buf;
|
| 303 |
+
uint32_t sid,off,len; parse_pid(pid,sid,off,len);
|
| 304 |
+
Token const* t = (isL2 ? T2 : T1)->sntStart(sid) + off;
|
| 305 |
+
Token const* x = t + len;
|
| 306 |
+
TokenIndex const& V = isL2 ? *V2 : *V1;
|
| 307 |
+
while (t < x)
|
| 308 |
+
{
|
| 309 |
+
buf << V[t->id()];
|
| 310 |
+
if (++t < x) buf << " ";
|
| 311 |
+
}
|
| 312 |
+
return buf.str();
|
| 313 |
+
}
|
| 314 |
+
|
| 315 |
+
template<typename Token>
|
| 316 |
+
size_t
|
| 317 |
+
Bitext<Token>::
|
| 318 |
+
getDefaultSampleSize() const
|
| 319 |
+
{
|
| 320 |
+
return m_default_sample_size;
|
| 321 |
+
}
|
| 322 |
+
template<typename Token>
|
| 323 |
+
void
|
| 324 |
+
Bitext<Token>::
|
| 325 |
+
setDefaultSampleSize(size_t const max_samples)
|
| 326 |
+
{
|
| 327 |
+
boost::unique_lock<boost::shared_mutex> guard(m_lock);
|
| 328 |
+
if (max_samples != m_default_sample_size)
|
| 329 |
+
{
|
| 330 |
+
m_cache1.reset(new pstats::cache_t);
|
| 331 |
+
m_cache2.reset(new pstats::cache_t);
|
| 332 |
+
m_default_sample_size = max_samples;
|
| 333 |
+
}
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
template<typename Token>
|
| 337 |
+
Bitext<Token>::
|
| 338 |
+
Bitext(size_t const max_sample, size_t const xnum_workers)
|
| 339 |
+
: m_num_workers(xnum_workers)
|
| 340 |
+
, m_default_sample_size(max_sample)
|
| 341 |
+
, m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
|
| 342 |
+
, m_cache1(new pstats::cache_t)
|
| 343 |
+
, m_cache2(new pstats::cache_t)
|
| 344 |
+
{ }
|
| 345 |
+
|
| 346 |
+
template<typename Token>
|
| 347 |
+
Bitext<Token>::
|
| 348 |
+
Bitext(Ttrack<Token>* const t1,
|
| 349 |
+
Ttrack<Token>* const t2,
|
| 350 |
+
Ttrack<char>* const tx,
|
| 351 |
+
TokenIndex* const v1,
|
| 352 |
+
TokenIndex* const v2,
|
| 353 |
+
TSA<Token>* const i1,
|
| 354 |
+
TSA<Token>* const i2,
|
| 355 |
+
size_t const max_sample,
|
| 356 |
+
size_t const xnum_workers)
|
| 357 |
+
: m_num_workers(xnum_workers)
|
| 358 |
+
, m_default_sample_size(max_sample)
|
| 359 |
+
, m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
|
| 360 |
+
, m_cache1(new pstats::cache_t)
|
| 361 |
+
, m_cache2(new pstats::cache_t)
|
| 362 |
+
, Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
|
| 363 |
+
{ }
|
| 364 |
+
|
| 365 |
+
template<typename TKN> class snt_adder;
|
| 366 |
+
template<> class snt_adder<L2R_Token<SimpleWordId> >;
|
| 367 |
+
|
| 368 |
+
template<>
|
| 369 |
+
class snt_adder<L2R_Token<SimpleWordId> >
|
| 370 |
+
{
|
| 371 |
+
typedef L2R_Token<SimpleWordId> TKN;
|
| 372 |
+
std::vector<std::string> const & snt;
|
| 373 |
+
TokenIndex & V;
|
| 374 |
+
SPTR<imTtrack<TKN> > & track;
|
| 375 |
+
SPTR<imTSA<TKN > > & index;
|
| 376 |
+
public:
|
| 377 |
+
snt_adder(std::vector<std::string> const& s, TokenIndex& v,
|
| 378 |
+
SPTR<imTtrack<TKN> >& t, SPTR<imTSA<TKN> >& i);
|
| 379 |
+
|
| 380 |
+
void operator()();
|
| 381 |
+
};
|
| 382 |
+
|
| 383 |
+
template<typename Token>
|
| 384 |
+
bool
|
| 385 |
+
Bitext<Token>::
|
| 386 |
+
find_trg_phr_bounds(PhraseExtractionRecord& rec) const
|
| 387 |
+
{
|
| 388 |
+
return find_trg_phr_bounds(rec.sid, rec.start, rec.stop,
|
| 389 |
+
rec.s1, rec.s2, rec.e1, rec.e2,
|
| 390 |
+
rec.po_fwd, rec.po_bwd,
|
| 391 |
+
rec.aln, rec.full_aln, rec.flip);
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
template<typename Token>
|
| 395 |
+
bool
|
| 396 |
+
Bitext<Token>::
|
| 397 |
+
find_trg_phr_bounds
|
| 398 |
+
( size_t const sid, // sentence to investigate
|
| 399 |
+
size_t const start, // start of source phrase
|
| 400 |
+
size_t const stop, // last position of source phrase
|
| 401 |
+
size_t & s1, size_t & s2, // beginning and end of target start
|
| 402 |
+
size_t & e1, size_t & e2, // beginning and end of target end
|
| 403 |
+
int& po_fwd, int& po_bwd, // phrase orientations
|
| 404 |
+
std::vector<unsigned char> * core_alignment, // stores the core alignment
|
| 405 |
+
bitvector* full_alignment, // stores full word alignment for this sent.
|
| 406 |
+
bool const flip) const // flip source and target (reverse lookup)
|
| 407 |
+
{
|
| 408 |
+
// if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
|
| 409 |
+
// a word on the core_alignment (core_alignment):
|
| 410 |
+
//
|
| 411 |
+
// Since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1
|
| 412 |
+
// < e2, respectively) are be definition unaligned, we store
|
| 413 |
+
// only the core alignment in *aln. It is up to the calling
|
| 414 |
+
// function to shift alignment points over for start positions
|
| 415 |
+
// of extracted phrases that start with a fringe word
|
| 416 |
+
assert(T1);
|
| 417 |
+
assert(T2);
|
| 418 |
+
assert(Tx);
|
| 419 |
+
|
| 420 |
+
size_t slen1,slen2;
|
| 421 |
+
if (flip)
|
| 422 |
+
{
|
| 423 |
+
slen1 = T2->sntLen(sid);
|
| 424 |
+
slen2 = T1->sntLen(sid);
|
| 425 |
+
}
|
| 426 |
+
else
|
| 427 |
+
{
|
| 428 |
+
slen1 = T1->sntLen(sid);
|
| 429 |
+
slen2 = T2->sntLen(sid);
|
| 430 |
+
}
|
| 431 |
+
bitvector forbidden(slen2);
|
| 432 |
+
if (full_alignment)
|
| 433 |
+
{
|
| 434 |
+
if (slen1*slen2 > full_alignment->size())
|
| 435 |
+
full_alignment->resize(slen1*slen2*2);
|
| 436 |
+
full_alignment->reset();
|
| 437 |
+
}
|
| 438 |
+
size_t src,trg;
|
| 439 |
+
size_t lft = forbidden.size();
|
| 440 |
+
size_t rgt = 0;
|
| 441 |
+
std::vector<std::vector<ushort> > aln1(slen1),aln2(slen2);
|
| 442 |
+
|
| 443 |
+
// process word alignment for this sentence
|
| 444 |
+
char const* p = Tx->sntStart(sid);
|
| 445 |
+
char const* x = Tx->sntEnd(sid);
|
| 446 |
+
while (p < x)
|
| 447 |
+
{
|
| 448 |
+
if (flip)
|
| 449 |
+
{
|
| 450 |
+
p = binread(p,trg);
|
| 451 |
+
assert(p<x);
|
| 452 |
+
p = binread(p,src);
|
| 453 |
+
}
|
| 454 |
+
else
|
| 455 |
+
{
|
| 456 |
+
p = binread(p,src);
|
| 457 |
+
assert(p<x);
|
| 458 |
+
p = binread(p,trg);
|
| 459 |
+
}
|
| 460 |
+
|
| 461 |
+
UTIL_THROW_IF2((src >= slen1 || trg >= slen2),
|
| 462 |
+
"Alignment range error at sentence " << sid << "!\n"
|
| 463 |
+
<< src << "/" << slen1 << " " << trg << "/" << slen2);
|
| 464 |
+
|
| 465 |
+
if (src < start || src >= stop)
|
| 466 |
+
forbidden.set(trg);
|
| 467 |
+
else
|
| 468 |
+
{
|
| 469 |
+
lft = std::min(lft,trg);
|
| 470 |
+
rgt = std::max(rgt,trg);
|
| 471 |
+
}
|
| 472 |
+
if (core_alignment)
|
| 473 |
+
{
|
| 474 |
+
aln1[src].push_back(trg);
|
| 475 |
+
aln2[trg].push_back(src);
|
| 476 |
+
}
|
| 477 |
+
if (full_alignment)
|
| 478 |
+
full_alignment->set(src*slen2 + trg);
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
for (size_t i = lft; i <= rgt; ++i)
|
| 482 |
+
if (forbidden[i])
|
| 483 |
+
return false;
|
| 484 |
+
|
| 485 |
+
s2 = lft; for (s1 = s2; s1 && !forbidden[s1-1]; --s1);
|
| 486 |
+
e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2);
|
| 487 |
+
|
| 488 |
+
if (lft > rgt) return false;
|
| 489 |
+
if (core_alignment)
|
| 490 |
+
{
|
| 491 |
+
core_alignment->clear();
|
| 492 |
+
for (size_t i = start; i < stop; ++i)
|
| 493 |
+
{
|
| 494 |
+
BOOST_FOREACH(ushort x, aln1[i])
|
| 495 |
+
{
|
| 496 |
+
core_alignment->push_back(i - start);
|
| 497 |
+
core_alignment->push_back(x - lft);
|
| 498 |
+
}
|
| 499 |
+
}
|
| 500 |
+
// now determine fwd and bwd phrase orientation
|
| 501 |
+
po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2);
|
| 502 |
+
po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2);
|
| 503 |
+
}
|
| 504 |
+
return lft <= rgt;
|
| 505 |
+
}
|
| 506 |
+
|
| 507 |
+
template<typename Token>
|
| 508 |
+
SPTR<DocumentBias>
|
| 509 |
+
Bitext<Token>::
|
| 510 |
+
SetupDocumentBias
|
| 511 |
+
( std::string const& bserver, std::string const& text, std::ostream* log ) const
|
| 512 |
+
{
|
| 513 |
+
SPTR<DocumentBias> ret;
|
| 514 |
+
UTIL_THROW_IF2(m_sid2docid == NULL,
|
| 515 |
+
"Document bias requested but no document map loaded.");
|
| 516 |
+
ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid,
|
| 517 |
+
bserver, text, log));
|
| 518 |
+
return ret;
|
| 519 |
+
}
|
| 520 |
+
|
| 521 |
+
template<typename Token>
|
| 522 |
+
SPTR<DocumentBias>
|
| 523 |
+
Bitext<Token>::
|
| 524 |
+
SetupDocumentBias
|
| 525 |
+
( std::map<std::string,float> context_weights, std::ostream* log ) const
|
| 526 |
+
{
|
| 527 |
+
SPTR<DocumentBias> ret;
|
| 528 |
+
UTIL_THROW_IF2(m_sid2docid == NULL,
|
| 529 |
+
"Document bias requested but no document map loaded.");
|
| 530 |
+
ret.reset(new DocumentBias(*m_sid2docid, m_docname2docid,
|
| 531 |
+
context_weights, log));
|
| 532 |
+
return ret;
|
| 533 |
+
}
|
| 534 |
+
|
| 535 |
+
template<typename Token>
|
| 536 |
+
void
|
| 537 |
+
Bitext<Token>::
|
| 538 |
+
prep(iter const& phrase) const
|
| 539 |
+
{
|
| 540 |
+
prep2(phrase, m_default_sample_size);
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
|
| 544 |
+
|
| 545 |
+
// prep2 schedules a phrase for sampling, and returns immediately
|
| 546 |
+
// the member function lookup retrieves the respective pstats instance
|
| 547 |
+
// and waits until the sampling is finished before it returns.
|
| 548 |
+
// This allows sampling in the background
|
| 549 |
+
template<typename Token>
|
| 550 |
+
SPTR<pstats>
|
| 551 |
+
Bitext<Token>
|
| 552 |
+
::prep2
|
| 553 |
+
(iter const& phrase, int max_sample) const
|
| 554 |
+
{
|
| 555 |
+
if (max_sample < 0) max_sample = m_default_sample_size;
|
| 556 |
+
SPTR<SamplingBias> bias;
|
| 557 |
+
SPTR<pstats::cache_t> cache;
|
| 558 |
+
// - no caching for rare phrases and special requests (max_sample)
|
| 559 |
+
// (still need to test what a good caching threshold is ...)
|
| 560 |
+
// - use the task-specific cache when there is a sampling bias
|
| 561 |
+
if (max_sample == int(m_default_sample_size)
|
| 562 |
+
&& phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
|
| 563 |
+
{
|
| 564 |
+
cache = (phrase.root == I1.get() ? m_cache1 : m_cache2);
|
| 565 |
+
}
|
| 566 |
+
|
| 567 |
+
SPTR<pstats> ret;
|
| 568 |
+
SPTR<pstats> const* cached;
|
| 569 |
+
|
| 570 |
+
if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached)
|
| 571 |
+
return *cached;
|
| 572 |
+
boost::unique_lock<boost::shared_mutex> guard(m_lock);
|
| 573 |
+
if (!ag)
|
| 574 |
+
{
|
| 575 |
+
ag.reset(new agenda(*this));
|
| 576 |
+
if (m_num_workers > 1)
|
| 577 |
+
ag->add_workers(m_num_workers);
|
| 578 |
+
}
|
| 579 |
+
ret = ag->add_job(this, phrase, max_sample, bias);
|
| 580 |
+
if (cache) cache->set(phrase.getPid(),ret);
|
| 581 |
+
UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
|
| 582 |
+
return ret;
|
| 583 |
+
}
|
| 584 |
+
|
| 585 |
+
// worker for scoring and sorting phrase table entries in parallel
|
| 586 |
+
template<typename Token>
|
| 587 |
+
class pstats2pplist
|
| 588 |
+
{
|
| 589 |
+
Ttrack<Token> const& m_other;
|
| 590 |
+
SPTR<pstats> m_pstats;
|
| 591 |
+
std::vector<PhrasePair<Token> >& m_pplist;
|
| 592 |
+
typename PhrasePair<Token>::Scorer const* m_scorer;
|
| 593 |
+
PhrasePair<Token> m_pp;
|
| 594 |
+
Token const* m_token;
|
| 595 |
+
size_t m_len;
|
| 596 |
+
uint64_t m_pid1;
|
| 597 |
+
bool m_is_inverse;
|
| 598 |
+
public:
|
| 599 |
+
|
| 600 |
+
// CONSTRUCTOR
|
| 601 |
+
pstats2pplist(typename TSA<Token>::tree_iterator const& m,
|
| 602 |
+
Ttrack<Token> const& other,
|
| 603 |
+
SPTR<pstats> const& ps,
|
| 604 |
+
std::vector<PhrasePair<Token> >& dest,
|
| 605 |
+
typename PhrasePair<Token>::Scorer const* scorer)
|
| 606 |
+
: m_other(other)
|
| 607 |
+
, m_pstats(ps)
|
| 608 |
+
, m_pplist(dest)
|
| 609 |
+
, m_scorer(scorer)
|
| 610 |
+
, m_token(m.getToken(0))
|
| 611 |
+
, m_len(m.size())
|
| 612 |
+
, m_pid1(m.getPid())
|
| 613 |
+
, m_is_inverse(false)
|
| 614 |
+
{ }
|
| 615 |
+
|
| 616 |
+
// WORKER
|
| 617 |
+
void
|
| 618 |
+
operator()()
|
| 619 |
+
{
|
| 620 |
+
// wait till all statistics have been collected
|
| 621 |
+
boost::unique_lock<boost::mutex> lock(m_pstats->lock);
|
| 622 |
+
while (m_pstats->in_progress)
|
| 623 |
+
m_pstats->ready.wait(lock);
|
| 624 |
+
|
| 625 |
+
m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0);
|
| 626 |
+
|
| 627 |
+
// convert pstats entries to phrase pairs
|
| 628 |
+
pstats::trg_map_t::iterator a;
|
| 629 |
+
for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a)
|
| 630 |
+
{
|
| 631 |
+
uint32_t sid,off,len;
|
| 632 |
+
parse_pid(a->first, sid, off, len);
|
| 633 |
+
m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second);
|
| 634 |
+
m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),
|
| 635 |
+
m_pp.joint);
|
| 636 |
+
// Poor man's early pruning: if p(f|e) or p(e|f) < 1/128, don't
|
| 637 |
+
// even consider the phrase pair, as it is unlikely to ever be
|
| 638 |
+
// considered as a valid translation.
|
| 639 |
+
size_t J = m_pp.joint<<7; // hard coded threshold of 1/128
|
| 640 |
+
if (m_pp.good1 > J || m_pp.good2 > J) continue;
|
| 641 |
+
if (m_scorer)
|
| 642 |
+
{
|
| 643 |
+
(*m_scorer)(m_pp);
|
| 644 |
+
}
|
| 645 |
+
m_pplist.push_back(m_pp);
|
| 646 |
+
}
|
| 647 |
+
std::greater<PhrasePair<Token> > sorter;
|
| 648 |
+
if (m_scorer) sort(m_pplist.begin(), m_pplist.end(),sorter);
|
| 649 |
+
}
|
| 650 |
+
};
|
| 651 |
+
|
| 652 |
+
template<typename Token>
|
| 653 |
+
void
|
| 654 |
+
Bitext<Token>
|
| 655 |
+
::mark_match(Token const* start, Token const* end,
|
| 656 |
+
iter const& m, bitvector& check) const
|
| 657 |
+
{
|
| 658 |
+
check.resize(end-start);
|
| 659 |
+
check.reset();
|
| 660 |
+
Token const* x = m.getToken(0);
|
| 661 |
+
for (Token const* s = start; s < end; ++s)
|
| 662 |
+
{
|
| 663 |
+
if (s->id() != x->id()) continue;
|
| 664 |
+
Token const* a = x;
|
| 665 |
+
Token const* b = s;
|
| 666 |
+
size_t i = 0;
|
| 667 |
+
while (a && b && a->id() == b->id() && i < m.size())
|
| 668 |
+
{
|
| 669 |
+
++i;
|
| 670 |
+
a = a->next();
|
| 671 |
+
b = b->next();
|
| 672 |
+
}
|
| 673 |
+
if (i == m.size())
|
| 674 |
+
{
|
| 675 |
+
b = s;
|
| 676 |
+
while (i-- > 0) { check.set(b-start); b = b->next(); }
|
| 677 |
+
}
|
| 678 |
+
}
|
| 679 |
+
}
|
| 680 |
+
|
| 681 |
+
template<typename Token>
|
| 682 |
+
void
|
| 683 |
+
Bitext<Token>::
|
| 684 |
+
write_yawat_alignment
|
| 685 |
+
( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const
|
| 686 |
+
{
|
| 687 |
+
std::vector<int> a1(T1->sntLen(sid),-1), a2(T2->sntLen(sid),-1);
|
| 688 |
+
bitvector f1(a1.size()), f2(a2.size());
|
| 689 |
+
if (m1) mark_match(T1->sntStart(sid), T1->sntEnd(sid), *m1, f1);
|
| 690 |
+
if (m2) mark_match(T2->sntStart(sid), T2->sntEnd(sid), *m2, f2);
|
| 691 |
+
|
| 692 |
+
std::vector<std::pair<bitvector, bitvector> > agroups;
|
| 693 |
+
std::vector<std::string> grouplabel;
|
| 694 |
+
std::pair<bitvector, bitvector> ag;
|
| 695 |
+
ag.first.resize(a1.size());
|
| 696 |
+
ag.second.resize(a2.size());
|
| 697 |
+
char const* x = Tx->sntStart(sid);
|
| 698 |
+
size_t a, b;
|
| 699 |
+
while (x < Tx->sntEnd(sid))
|
| 700 |
+
{
|
| 701 |
+
x = binread(x,a);
|
| 702 |
+
x = binread(x,b);
|
| 703 |
+
if (a1.at(a) < 0 && a2.at(b) < 0)
|
| 704 |
+
{
|
| 705 |
+
a1[a] = a2[b] = agroups.size();
|
| 706 |
+
ag.first.reset();
|
| 707 |
+
ag.second.reset();
|
| 708 |
+
ag.first.set(a);
|
| 709 |
+
ag.second.set(b);
|
| 710 |
+
agroups.push_back(ag);
|
| 711 |
+
grouplabel.push_back(f1[a] || f2[b] ? "infocusbi" : "unspec");
|
| 712 |
+
}
|
| 713 |
+
else if (a1.at(a) < 0)
|
| 714 |
+
{
|
| 715 |
+
a1[a] = a2[b];
|
| 716 |
+
agroups[a2[b]].first.set(a);
|
| 717 |
+
if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
|
| 718 |
+
}
|
| 719 |
+
else if (a2.at(b) < 0)
|
| 720 |
+
{
|
| 721 |
+
a2[b] = a1[a];
|
| 722 |
+
agroups[a1[a]].second.set(b);
|
| 723 |
+
if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
|
| 724 |
+
}
|
| 725 |
+
else
|
| 726 |
+
{
|
| 727 |
+
agroups[a1[a]].first |= agroups[a2[b]].first;
|
| 728 |
+
agroups[a1[a]].second |= agroups[a2[b]].second;
|
| 729 |
+
a2[b] = a1[a];
|
| 730 |
+
if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
|
| 731 |
+
}
|
| 732 |
+
}
|
| 733 |
+
|
| 734 |
+
for (a = 0; a < a1.size(); ++a)
|
| 735 |
+
{
|
| 736 |
+
if (a1[a] < 0)
|
| 737 |
+
{
|
| 738 |
+
if (f1[a]) out << a << "::" << "infocusmono ";
|
| 739 |
+
continue;
|
| 740 |
+
}
|
| 741 |
+
bitvector const& A = agroups[a1[a]].first;
|
| 742 |
+
bitvector const& B = agroups[a1[a]].second;
|
| 743 |
+
if (A.find_first() < a) continue;
|
| 744 |
+
write_bitvector(A,out); out << ":";
|
| 745 |
+
write_bitvector(B,out); out << ":";
|
| 746 |
+
out << grouplabel[a1[a]] << " ";
|
| 747 |
+
}
|
| 748 |
+
for (b = 0; b < a2.size(); ++b)
|
| 749 |
+
{
|
| 750 |
+
if (a2[b] < 0 && f2[b])
|
| 751 |
+
out << "::" << "infocusmono ";
|
| 752 |
+
}
|
| 753 |
+
}
|
| 754 |
+
|
| 755 |
+
template<typename Token>
|
| 756 |
+
void
|
| 757 |
+
expand(typename Bitext<Token>::iter const& m,
|
| 758 |
+
Bitext<Token> const& bt, pstats const& ps,
|
| 759 |
+
std::vector<PhrasePair<Token> >& dest, std::ostream* log)
|
| 760 |
+
{
|
| 761 |
+
bool fwd = m.root == bt.I1.get();
|
| 762 |
+
dest.reserve(ps.trg.size());
|
| 763 |
+
PhrasePair<Token> pp;
|
| 764 |
+
pp.init(m.getPid(), !fwd, m.getToken(0), m.size(), &ps, 0);
|
| 765 |
+
// cout << HERE << " "
|
| 766 |
+
// << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << std::endl;
|
| 767 |
+
pstats::trg_map_t::const_iterator a;
|
| 768 |
+
for (a = ps.trg.begin(); a != ps.trg.end(); ++a)
|
| 769 |
+
{
|
| 770 |
+
uint32_t sid,off,len;
|
| 771 |
+
parse_pid(a->first, sid, off, len);
|
| 772 |
+
pp.update(a->first, (fwd ? bt.T2 : bt.T1)->sntStart(sid)+off,
|
| 773 |
+
len, a->second);
|
| 774 |
+
dest.push_back(pp);
|
| 775 |
+
}
|
| 776 |
+
}
|
| 777 |
+
|
| 778 |
+
} // end of namespace sapt
|
| 779 |
+
|
| 780 |
+
#include "ug_im_bitext.h"
|
| 781 |
+
#include "ug_mm_bitext.h"
|
| 782 |
+
#include "ug_bitext_moses.h"
|
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
// to be included from ug_bitext.h
|
| 3 |
+
|
| 4 |
+
// The agenda handles parallel sampling.
|
| 5 |
+
// It maintains a queue of unfinished sampling jobs and
|
| 6 |
+
// assigns them to a pool of workers.
|
| 7 |
+
//
|
| 8 |
+
template<typename Token>
|
| 9 |
+
class Bitext<Token>
|
| 10 |
+
::agenda
|
| 11 |
+
{
|
| 12 |
+
public:
|
| 13 |
+
class job;
|
| 14 |
+
class worker;
|
| 15 |
+
private:
|
| 16 |
+
boost::mutex lock;
|
| 17 |
+
std::list<SPTR<job> > joblist;
|
| 18 |
+
std::vector<SPTR<boost::thread> > workers;
|
| 19 |
+
bool shutdown;
|
| 20 |
+
size_t doomed;
|
| 21 |
+
|
| 22 |
+
public:
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
Bitext<Token> const& bt;
|
| 26 |
+
|
| 27 |
+
agenda(Bitext<Token> const& bitext);
|
| 28 |
+
~agenda();
|
| 29 |
+
|
| 30 |
+
void
|
| 31 |
+
add_workers(int n);
|
| 32 |
+
|
| 33 |
+
SPTR<pstats>
|
| 34 |
+
add_job(Bitext<Token> const* const theBitext,
|
| 35 |
+
typename TSA<Token>::tree_iterator const& phrase,
|
| 36 |
+
size_t const max_samples, SPTR<SamplingBias const> const& bias,
|
| 37 |
+
bool const track_sids);
|
| 38 |
+
// add_job(Bitext<Token> const* const theBitext,
|
| 39 |
+
// typename TSA<Token>::tree_iterator const& phrase,
|
| 40 |
+
// size_t const max_samples, SamplingBias const* const bias);
|
| 41 |
+
|
| 42 |
+
SPTR<job>
|
| 43 |
+
get_job();
|
| 44 |
+
};
|
| 45 |
+
|
| 46 |
+
template<typename Token>
|
| 47 |
+
class
|
| 48 |
+
Bitext<Token>::agenda::
|
| 49 |
+
worker
|
| 50 |
+
{
|
| 51 |
+
agenda& ag;
|
| 52 |
+
public:
|
| 53 |
+
worker(agenda& a) : ag(a) {}
|
| 54 |
+
void operator()();
|
| 55 |
+
};
|
| 56 |
+
|
| 57 |
+
#include "ug_bitext_agenda_worker.h"
|
| 58 |
+
#include "ug_bitext_agenda_job.h"
|
| 59 |
+
|
| 60 |
+
template<typename Token>
|
| 61 |
+
void Bitext<Token>
|
| 62 |
+
::agenda
|
| 63 |
+
::add_workers(int n)
|
| 64 |
+
{
|
| 65 |
+
static boost::posix_time::time_duration nodelay(0,0,0,0);
|
| 66 |
+
boost::lock_guard<boost::mutex> guard(this->lock);
|
| 67 |
+
|
| 68 |
+
int target = std::max(1, int(n + workers.size() - this->doomed));
|
| 69 |
+
// house keeping: remove all workers that have finished
|
| 70 |
+
for (size_t i = 0; i < workers.size(); )
|
| 71 |
+
{
|
| 72 |
+
if (workers[i]->timed_join(nodelay))
|
| 73 |
+
{
|
| 74 |
+
if (i + 1 < workers.size())
|
| 75 |
+
workers[i].swap(workers.back());
|
| 76 |
+
workers.pop_back();
|
| 77 |
+
}
|
| 78 |
+
else ++i;
|
| 79 |
+
}
|
| 80 |
+
// cerr << workers.size() << "/" << target << " active" << std::endl;
|
| 81 |
+
if (int(workers.size()) > target)
|
| 82 |
+
this->doomed = workers.size() - target;
|
| 83 |
+
else
|
| 84 |
+
while (int(workers.size()) < target)
|
| 85 |
+
{
|
| 86 |
+
SPTR<boost::thread> w(new boost::thread(worker(*this)));
|
| 87 |
+
workers.push_back(w);
|
| 88 |
+
}
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
template<typename Token>
|
| 93 |
+
SPTR<pstats> Bitext<Token>
|
| 94 |
+
::agenda
|
| 95 |
+
::add_job(Bitext<Token> const* const theBitext,
|
| 96 |
+
typename TSA<Token>::tree_iterator const& phrase,
|
| 97 |
+
size_t const max_samples, SPTR<SamplingBias const> const& bias,
|
| 98 |
+
bool const track_sids)
|
| 99 |
+
{
|
| 100 |
+
boost::unique_lock<boost::mutex> lk(this->lock);
|
| 101 |
+
static boost::posix_time::time_duration nodelay(0,0,0,0);
|
| 102 |
+
bool fwd = phrase.root == bt.I1.get();
|
| 103 |
+
SPTR<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
|
| 104 |
+
max_samples, fwd, bias, track_sids));
|
| 105 |
+
j->stats->register_worker();
|
| 106 |
+
|
| 107 |
+
joblist.push_back(j);
|
| 108 |
+
if (joblist.size() == 1)
|
| 109 |
+
{
|
| 110 |
+
size_t i = 0;
|
| 111 |
+
while (i < workers.size())
|
| 112 |
+
{
|
| 113 |
+
if (workers[i]->timed_join(nodelay))
|
| 114 |
+
{
|
| 115 |
+
if (doomed)
|
| 116 |
+
{
|
| 117 |
+
if (i+1 < workers.size())
|
| 118 |
+
workers[i].swap(workers.back());
|
| 119 |
+
workers.pop_back();
|
| 120 |
+
--doomed;
|
| 121 |
+
}
|
| 122 |
+
else
|
| 123 |
+
workers[i++] = SPTR<boost::thread>(new boost::thread(worker(*this)));
|
| 124 |
+
}
|
| 125 |
+
else ++i;
|
| 126 |
+
}
|
| 127 |
+
}
|
| 128 |
+
return j->stats;
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
template<typename Token>
|
| 132 |
+
SPTR<typename Bitext<Token>::agenda::job>
|
| 133 |
+
Bitext<Token>
|
| 134 |
+
::agenda
|
| 135 |
+
::get_job()
|
| 136 |
+
{
|
| 137 |
+
// cerr << workers.size() << " workers on record" << std::endl;
|
| 138 |
+
SPTR<job> ret;
|
| 139 |
+
if (this->shutdown) return ret;
|
| 140 |
+
boost::unique_lock<boost::mutex> lock(this->lock);
|
| 141 |
+
if (this->doomed)
|
| 142 |
+
{ // the number of workers has been reduced, tell the redundant once to quit
|
| 143 |
+
--this->doomed;
|
| 144 |
+
return ret;
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
typename std::list<SPTR<job> >::iterator j = joblist.begin();
|
| 148 |
+
while (j != joblist.end())
|
| 149 |
+
{
|
| 150 |
+
if ((*j)->done())
|
| 151 |
+
{
|
| 152 |
+
(*j)->stats->release();
|
| 153 |
+
joblist.erase(j++);
|
| 154 |
+
}
|
| 155 |
+
else if ((*j)->workers >= 4) ++j; // no more than 4 workers per job
|
| 156 |
+
else break; // found one
|
| 157 |
+
}
|
| 158 |
+
if (joblist.size())
|
| 159 |
+
{
|
| 160 |
+
ret = j == joblist.end() ? joblist.front() : *j;
|
| 161 |
+
// if we've reached the end of the queue (all jobs have 4 workers on them),
|
| 162 |
+
// take the first in the queue
|
| 163 |
+
boost::lock_guard<boost::mutex> jguard(ret->lock);
|
| 164 |
+
++ret->workers;
|
| 165 |
+
}
|
| 166 |
+
return ret;
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
template<typename Token>
|
| 170 |
+
Bitext<Token>::
|
| 171 |
+
agenda::
|
| 172 |
+
~agenda()
|
| 173 |
+
{
|
| 174 |
+
this->lock.lock();
|
| 175 |
+
this->shutdown = true;
|
| 176 |
+
this->lock.unlock();
|
| 177 |
+
for (size_t i = 0; i < workers.size(); ++i)
|
| 178 |
+
workers[i]->join();
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
template<typename Token>
|
| 182 |
+
Bitext<Token>::
|
| 183 |
+
agenda::
|
| 184 |
+
agenda(Bitext<Token> const& thebitext)
|
| 185 |
+
: shutdown(false), doomed(0), bt(thebitext)
|
| 186 |
+
{ }
|
| 187 |
+
|
| 188 |
+
|
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
#include <string>
|
| 4 |
+
#include <stdint.h>
|
| 5 |
+
#include "ug_typedefs.h"
|
| 6 |
+
#include "ug_lexical_reordering.h"
|
| 7 |
+
#include <boost/thread.hpp>
|
| 8 |
+
|
| 9 |
+
namespace sapt
|
| 10 |
+
{
|
| 11 |
+
|
| 12 |
+
// "joint" (i.e., phrase std::pair) statistics
|
| 13 |
+
class
|
| 14 |
+
jstats
|
| 15 |
+
{
|
| 16 |
+
boost::mutex lock;
|
| 17 |
+
uint32_t my_rcnt; // unweighted joint count
|
| 18 |
+
uint32_t my_cnt2; // raw counts L2
|
| 19 |
+
float my_wcnt; // weighted joint count
|
| 20 |
+
float my_bcnt; // cumulative bias
|
| 21 |
+
|
| 22 |
+
// to do: use a static alignment pattern store that stores each pattern only
|
| 23 |
+
// once, so that we don't have to store so many alignment std::vectors
|
| 24 |
+
std::vector<std::pair<size_t, std::vector<unsigned char> > > my_aln;
|
| 25 |
+
// internal word alignment
|
| 26 |
+
|
| 27 |
+
uint32_t ofwd[LRModel::NONE+1]; // forward distortion type counts
|
| 28 |
+
uint32_t obwd[LRModel::NONE+1]; // backward distortion type counts
|
| 29 |
+
|
| 30 |
+
public:
|
| 31 |
+
SPTR<std::vector<uint32_t> > sids; // list of sentence ids in this sample
|
| 32 |
+
std::map<uint32_t,uint32_t> indoc;
|
| 33 |
+
// std::vector<uint32_t> indoc; // counts origin of samples (for biased sampling)
|
| 34 |
+
jstats();
|
| 35 |
+
jstats(jstats const& other);
|
| 36 |
+
uint32_t rcnt() const; // raw joint counts
|
| 37 |
+
uint32_t cnt2() const; // raw target phrase occurrence count
|
| 38 |
+
float wcnt() const; // weighted joint counts
|
| 39 |
+
float bcnt() const; // cumulative bias scores
|
| 40 |
+
|
| 41 |
+
std::vector<std::pair<size_t, std::vector<unsigned char> > > const & aln() const;
|
| 42 |
+
|
| 43 |
+
size_t
|
| 44 |
+
add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
|
| 45 |
+
uint32_t fwd_orient, uint32_t bwd_orient, int const docid, uint32_t const sid,
|
| 46 |
+
bool const track_sid);
|
| 47 |
+
|
| 48 |
+
void invalidate();
|
| 49 |
+
void validate();
|
| 50 |
+
bool valid();
|
| 51 |
+
uint32_t dcnt_fwd(PhraseOrientation const idx) const;
|
| 52 |
+
uint32_t dcnt_bwd(PhraseOrientation const idx) const;
|
| 53 |
+
void fill_lr_vec(LRModel::Direction const& dir,
|
| 54 |
+
LRModel::ModelType const& mdl,
|
| 55 |
+
std::vector<float>& v);
|
| 56 |
+
};
|
| 57 |
+
}
|
| 58 |
+
|
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_moses.h
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; cc-style: moses-cc-style -*-
|
| 2 |
+
#pragma once
|
| 3 |
+
#ifndef NO_MOSES
|
| 4 |
+
namespace sapt {
|
| 5 |
+
|
| 6 |
+
template<typename Token>
|
| 7 |
+
SPTR<pstats>
|
| 8 |
+
Bitext<Token>::
|
| 9 |
+
lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const
|
| 10 |
+
{
|
| 11 |
+
SPTR<pstats> ret = prep2(ttask, phrase, max_sample);
|
| 12 |
+
UTIL_THROW_IF2(!ret, "Got NULL pointer where I expected a valid pointer.");
|
| 13 |
+
|
| 14 |
+
// Why were we locking here?
|
| 15 |
+
if (m_num_workers <= 1)
|
| 16 |
+
{
|
| 17 |
+
boost::unique_lock<boost::shared_mutex> guard(m_lock);
|
| 18 |
+
typename agenda::worker(*this->ag)();
|
| 19 |
+
}
|
| 20 |
+
else
|
| 21 |
+
{
|
| 22 |
+
boost::unique_lock<boost::mutex> lock(ret->lock);
|
| 23 |
+
while (ret->in_progress)
|
| 24 |
+
ret->ready.wait(lock);
|
| 25 |
+
}
|
| 26 |
+
return ret;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
template<typename Token>
|
| 31 |
+
void
|
| 32 |
+
Bitext<Token>::
|
| 33 |
+
prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const
|
| 34 |
+
{
|
| 35 |
+
prep2(ttask, phrase, track_sids, m_default_sample_size);
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
// prep2 schedules a phrase for sampling, and returns immediately
|
| 40 |
+
// the member function lookup retrieves the respective pstats instance
|
| 41 |
+
// and waits until the sampling is finished before it returns.
|
| 42 |
+
// This allows sampling in the background
|
| 43 |
+
template<typename Token>
|
| 44 |
+
SPTR<pstats>
|
| 45 |
+
Bitext<Token>
|
| 46 |
+
::prep2
|
| 47 |
+
( ttasksptr const& ttask, iter const& phrase, bool const track_sids,
|
| 48 |
+
int max_sample) const
|
| 49 |
+
{
|
| 50 |
+
if (max_sample < 0) max_sample = m_default_sample_size;
|
| 51 |
+
SPTR<SamplingBias> bias;
|
| 52 |
+
SPTR<Moses::ContextScope> scope = ttask->GetScope();
|
| 53 |
+
SPTR<ContextForQuery> context = scope->get<ContextForQuery>(this);
|
| 54 |
+
if (context) bias = context->bias;
|
| 55 |
+
SPTR<pstats::cache_t> cache;
|
| 56 |
+
// - no caching for rare phrases and special requests (max_sample)
|
| 57 |
+
// (still need to test what a good caching threshold is ...)
|
| 58 |
+
// - use the task-specific cache when there is a sampling bias
|
| 59 |
+
if (max_sample == int(m_default_sample_size)
|
| 60 |
+
&& phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
|
| 61 |
+
{
|
| 62 |
+
cache = (phrase.root == I1.get()
|
| 63 |
+
? (bias ? context->cache1 : m_cache1)
|
| 64 |
+
: (bias ? context->cache2 : m_cache2));
|
| 65 |
+
}
|
| 66 |
+
SPTR<pstats> ret;
|
| 67 |
+
SPTR<pstats> const* cached;
|
| 68 |
+
|
| 69 |
+
if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached)
|
| 70 |
+
return *cached;
|
| 71 |
+
boost::unique_lock<boost::shared_mutex> guard(m_lock);
|
| 72 |
+
if (!ag)
|
| 73 |
+
{
|
| 74 |
+
ag.reset(new agenda(*this));
|
| 75 |
+
if (m_num_workers > 1)
|
| 76 |
+
ag->add_workers(m_num_workers);
|
| 77 |
+
}
|
| 78 |
+
ret = ag->add_job(this, phrase, max_sample, bias, track_sids);
|
| 79 |
+
if (cache) cache->set(phrase.getPid(),ret);
|
| 80 |
+
UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
|
| 81 |
+
return ret;
|
| 82 |
+
}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
}
|
| 87 |
+
#endif
|