sleepyhead111 committed on Apr 20, 2025

Commit

1747e32

verified ·

1 Parent(s): dc27c50

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

mosesdecoder/moses/BitmapContainer.cpp +498 -0
mosesdecoder/moses/Bitmaps.h +32 -0
mosesdecoder/moses/ChartManager.h +162 -0
mosesdecoder/moses/ChartTranslationOptions.cpp +168 -0
mosesdecoder/moses/DecodeStepGeneration.cpp +169 -0
mosesdecoder/moses/FloydWarshall.cpp +36 -0
mosesdecoder/moses/HypothesisStack.h +64 -0
mosesdecoder/moses/Sentence.cpp +372 -0
mosesdecoder/moses/Syntax/Cube.h +62 -0
mosesdecoder/moses/Syntax/CubeQueue.cpp +37 -0
mosesdecoder/moses/Syntax/CubeQueue.h +52 -0
mosesdecoder/moses/Syntax/InputWeightFF.cpp +48 -0
mosesdecoder/moses/Syntax/Manager.cpp +229 -0
mosesdecoder/moses/Syntax/NonTerminalMap.h +85 -0
mosesdecoder/moses/Syntax/PHyperedge.h +21 -0
mosesdecoder/moses/Syntax/RuleTableFF.h +60 -0
mosesdecoder/moses/Syntax/SHyperedgeBundle.h +31 -0
mosesdecoder/moses/Syntax/SVertexRecombinationHasher.h +26 -0
mosesdecoder/moses/TranslationModel/CompactPT/BlockHashIndex.cpp +424 -0
mosesdecoder/moses/TranslationModel/CompactPT/ConsistentPhrases.h +112 -0
mosesdecoder/moses/TranslationModel/CompactPT/Jamfile +17 -0
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp +450 -0
mosesdecoder/moses/TranslationModel/CompactPT/MmapAllocator.h +202 -0
mosesdecoder/moses/TranslationModel/CompactPT/MonotonicVector.h +230 -0
mosesdecoder/moses/TranslationModel/CompactPT/PhraseDecoder.h +144 -0
mosesdecoder/moses/TranslationModel/CompactPT/PhraseTableCreator.h +412 -0
mosesdecoder/moses/TranslationModel/Scope3Parser/Parser.cpp +198 -0
mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc +434 -0
mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h +83 -0
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.cc +31 -0
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.h +30 -0
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc +51 -0
mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h +21 -0
mosesdecoder/moses/TranslationModel/UG/mm/Makefile.x +105 -0
mosesdecoder/moses/TranslationModel/UG/mm/calc-coverage.cc +57 -0
mosesdecoder/moses/TranslationModel/UG/mm/mmlex-build.cc +348 -0
mosesdecoder/moses/TranslationModel/UG/mm/mtt-build.cc +498 -0
mosesdecoder/moses/TranslationModel/UG/mm/mtt-dump.cc +166 -0
mosesdecoder/moses/TranslationModel/UG/mm/mtt.count.cc +77 -0
mosesdecoder/moses/TranslationModel/UG/mm/num_read_write.cc +74 -0
mosesdecoder/moses/TranslationModel/UG/mm/test-http-client.cc +27 -0
mosesdecoder/moses/TranslationModel/UG/mm/test-xml-escaping.cc +13 -0
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.cc +594 -0
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.h +176 -0
mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.cc +420 -0
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.cc +171 -0
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.h +782 -0
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_agenda.h +188 -0
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_jstats.h +58 -0
mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_moses.h +87 -0

mosesdecoder/moses/BitmapContainer.cpp ADDED Viewed

	@@ -0,0 +1,498 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include <algorithm>
+#include <limits>
+#include <utility>
+#include "BitmapContainer.h"
+#include "HypothesisStackCubePruning.h"
+#include "moses/FF/DistortionScoreProducer.h"
+#include "TranslationOptionList.h"
+#include "Manager.h"
+namespace Moses
+{
+/** Comparison functor that ranks hypotheses by descending GetScore().
+ *  When neither score is greater than the other, the hypothesis pointers
+ *  themselves are compared to break the tie.
+ */
+class HypothesisScoreOrdererNoDistortion
+{
+public:
+  bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const {
+    const float lhsScore = hypoA->GetScore();
+    const float rhsScore = hypoB->GetScore();
+    // Higher score sorts first.
+    if (lhsScore > rhsScore) return true;
+    if (lhsScore < rhsScore) return false;
+    // Neither is strictly better: fall back to pointer order.
+    return hypoA < hypoB;
+  }
+};
+/** Comparison functor that ranks hypotheses by descending score after adding
+ *  the weighted distortion score of moving to a fixed translation-option range.
+ *  The weight is the sum of the weights of all distortion feature functions.
+ *  Ties are broken by target phrase comparison in deterministic mode, and by
+ *  pointer comparison otherwise.
+ */
+class HypothesisScoreOrdererWithDistortion
+{
+private:
+  bool m_deterministic;
+
+  //! score of \p hypo plus its weighted distortion score towards m_transOptRange
+  float AdjustedScore(const Hypothesis &hypo) const {
+    const float distortion = DistortionScoreProducer::CalculateDistortionScore(
+                               hypo,
+                               hypo.GetCurrSourceWordsRange(),
+                               *m_transOptRange,
+                               hypo.GetWordsBitmap().GetFirstGapPos()
+                             );
+    return hypo.GetScore() + distortion * m_totalWeightDistortion;
+  }
+public:
+  /** @param transOptRange source range of the translation options being applied
+   *  @param deterministic if true, equal scores are ordered by target phrase
+   */
+  HypothesisScoreOrdererWithDistortion(const Range* transOptRange,
+                                       const bool deterministic = false)
+    : m_deterministic(deterministic)
+    , m_transOptRange(transOptRange) {
+    m_totalWeightDistortion = 0;
+    const StaticData &staticData = StaticData::Instance();
+    const std::vector<const DistortionScoreProducer*> &producers = DistortionScoreProducer::GetDistortionFeatureFunctions();
+    // Accumulate the weight of every registered distortion feature function.
+    for (size_t i = 0; i < producers.size(); ++i) {
+      const float weight = staticData.GetAllWeights().GetScoreForProducer(producers[i]);
+      m_totalWeightDistortion += weight;
+    }
+  }
+  const Range* m_transOptRange;
+  float m_totalWeightDistortion;
+  bool operator()(const Hypothesis* hypoA, const Hypothesis* hypoB) const {
+    UTIL_THROW_IF2(m_transOptRange == NULL, "Words range not set");
+    const float scoreA = AdjustedScore(*hypoA);
+    const float scoreB = AdjustedScore(*hypoB);
+    if (scoreA > scoreB) return true;
+    if (scoreA < scoreB) return false;
+    if (m_deterministic) {
+      // Equal scores: break ties by comparing target phrases
+      return (hypoA->GetCurrTargetPhrase().Compare(hypoB->GetCurrTargetPhrase()) < 0);
+    }
+    // Fallback: non-deterministic sort
+    return hypoA < hypoB;
+  }
+};
+////////////////////////////////////////////////////////////////////////////////
+// BackwardsEdge Code
+////////////////////////////////////////////////////////////////////////////////
+/** Builds one edge of the cube: the hypotheses of \p prevBitmapContainer
+ *  crossed with the translation options in \p translations.
+ *
+ *  If either side is empty the edge is left empty. If the distortion limit
+ *  (itype.options()->reordering.max_distortion) is -1, all previous hypotheses
+ *  are copied unfiltered and unsorted. Otherwise only hypotheses within the
+ *  distortion limit of the translation options' source range are kept, and
+ *  they are sorted with HypothesisScoreOrdererWithDistortion.
+ *
+ *  Raises (via UTIL_THROW_IF2) if the first two translation options, or the
+ *  first two retained hypotheses, are not in descending future-score order.
+ */
+BackwardsEdge::BackwardsEdge(const BitmapContainer &prevBitmapContainer
+                             , BitmapContainer &parent
+                             , const TranslationOptionList &translations
+                             , const SquareMatrix &estimatedScores,
+                             const InputType& itype,
+                             const bool deterministic)
+  : m_initialized(false)
+  , m_prevBitmapContainer(prevBitmapContainer)
+  , m_parent(parent)
+  , m_translations(translations)
+  , m_estimatedScores(estimatedScores)
+  , m_deterministic(deterministic)
+  , m_seenPosition()
+{
+  const HypothesisSet &prevHypos = m_prevBitmapContainer.GetHypotheses();
+  // If either dimension is empty, we haven't got anything to do.
+  if(prevHypos.size() == 0 || m_translations.size() == 0) {
+    VERBOSE(3, "Empty cube on BackwardsEdge" << std::endl);
+    return;
+  }
+  // Fetch the things we need for distortion cost computation.
+  // int maxDistortion = StaticData::Instance().GetMaxDistortion();
+  int maxDistortion  = itype.options()->reordering.max_distortion;
+  if (maxDistortion == -1) {
+    // No distortion limit: take every previous hypothesis as is.
+    for (HypothesisSet::const_iterator iter = prevHypos.begin(); iter != prevHypos.end(); ++iter) {
+      m_hypotheses.push_back(*iter);
+    }
+    return;
+  }
+  // Safe: m_translations was checked to be non-empty above.
+  const Range &transOptRange = translations.Get(0)->GetSourceWordsRange();
+  HypothesisSet::const_iterator iterHypo = prevHypos.begin();
+  HypothesisSet::const_iterator iterEnd = prevHypos.end();
+  while (iterHypo != iterEnd) {
+    const Hypothesis &hypo = **iterHypo;
+    // Special case: If this is the first hypothesis used to seed the search,
+    // it doesn't have a valid range, and we create the hypothesis, if the
+    // initial position is not further into the sentence than the distortion limit.
+    if (hypo.GetWordsBitmap().GetNumWordsCovered() == 0) {
+      if ((int)transOptRange.GetStartPos() <= maxDistortion)
+        m_hypotheses.push_back(&hypo);
+    } else {
+      int distortionDistance = itype.ComputeDistortionDistance(hypo.GetCurrSourceWordsRange()
+                               , transOptRange);
+      if (distortionDistance <= maxDistortion)
+        m_hypotheses.push_back(&hypo);
+    }
+    ++iterHypo;
+  }
+  // Sanity checks: both inputs are expected to arrive best-first.
+  if (m_translations.size() > 1) {
+    UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(),
+                   "Non-monotonic future score: "
+                   << m_translations.Get(0)->GetFutureScore() << " vs. "
+                   << m_translations.Get(1)->GetFutureScore());
+  }
+  if (m_hypotheses.size() > 1) {
+    UTIL_THROW_IF2(m_hypotheses[0]->GetFutureScore() < m_hypotheses[1]->GetFutureScore(),
+                   "Non-monotonic total score: "
+                   << m_hypotheses[0]->GetFutureScore() << " vs. "
+                   << m_hypotheses[1]->GetFutureScore());
+  }
+  // Re-rank the retained hypotheses with the distortion cost towards
+  // transOptRange taken into account.
+  HypothesisScoreOrdererWithDistortion orderer (&transOptRange, m_deterministic);
+  std::sort(m_hypotheses.begin(), m_hypotheses.end(), orderer);
+  // std::sort(m_hypotheses.begin(), m_hypotheses.end(), HypothesisScoreOrdererNoDistortion());
+}
+// Empties the seen-position set and the hypothesis list. Only the containers
+// are cleared; nothing is deleted through the stored hypothesis pointers here.
+BackwardsEdge::~BackwardsEdge()
+{
+  m_seenPosition.clear();
+  m_hypotheses.clear();
+}
+// Seeds the cube: computes the estimated score for the edge, expands the
+// (0, 0) corner (first hypothesis x first translation option), enqueues it in
+// the parent container and marks the corner as seen. An empty edge is simply
+// flagged as initialized.
+void
+BackwardsEdge::Initialize()
+{
+  const bool haveCube = m_hypotheses.size() != 0 && m_translations.size() != 0;
+  if (haveCube) {
+    const Hypothesis &firstHypo = *m_hypotheses[0];
+    const TranslationOption &firstOpt = *m_translations.Get(0);
+    const Bitmap &coverage = firstHypo.GetWordsBitmap();
+    const Range &optRange = firstOpt.GetSourceWordsRange();
+    m_estimatedScore = m_estimatedScores.CalcEstimatedScore(coverage, optRange.GetStartPos(), optRange.GetEndPos());
+    Hypothesis *seed = CreateHypothesis(firstHypo, firstOpt);
+    m_parent.Enqueue(0, 0, seed, this);
+    SetSeenPosition(0, 0);
+  }
+  m_initialized = true;
+}
+// Allocates a new Hypothesis that extends `hypothesis` with `transOpt`, using
+// the parent container's words bitmap and the next hypothesis id handed out by
+// the hypothesis' manager, then scores it with
+// EvaluateWhenApplied(m_estimatedScore). The result is heap-allocated with
+// `new`; this function does not free it.
+Hypothesis *BackwardsEdge::CreateHypothesis(const Hypothesis &hypothesis, const TranslationOption &transOpt)
+{
+  // create hypothesis and calculate all its scores
+  // (under IFVERBOSE(2), construction is bracketed by the build-hyp timer)
+  IFVERBOSE(2) {
+    hypothesis.GetManager().GetSentenceStats().StartTimeBuildHyp();
+  }
+  const Bitmap &bitmap = m_parent.GetWordsBitmap();
+  Hypothesis *newHypo = new Hypothesis(hypothesis, transOpt, bitmap, hypothesis.GetManager().GetNextHypoId());
+  IFVERBOSE(2) {
+    hypothesis.GetManager().GetSentenceStats().StopTimeBuildHyp();
+  }
+  newHypo->EvaluateWhenApplied(m_estimatedScore);
+  return newHypo;
+}
+// Returns true if cube cell (x, y) has already been recorded by
+// SetSeenPosition(). The lookup key packs x above the low 16 bits and adds y.
+bool
+BackwardsEdge::SeenPosition(const size_t x, const size_t y)
+{
+  return m_seenPosition.find((x<<16) + y) != m_seenPosition.end();
+}
+// Records cube cell (x, y) as visited.
+//
+// The cell is stored as the single key (x<<16) + y, so y must fit into 16
+// bits; otherwise distinct cells collide (e.g. (0, 65536) and (1, 0)). x is
+// limited to 16 bits as well so the packed key fits in 32 bits. Coordinates
+// outside that range raise an error via UTIL_THROW_IF2.
+void
+BackwardsEdge::SetSeenPosition(const size_t x, const size_t y)
+{
+  UTIL_THROW_IF2(x >= (1<<16), "Cube position x out of range: " << x);
+  UTIL_THROW_IF2(y >= (1<<16), "Cube position y out of range: " << y);
+  m_seenPosition.insert((x<<16) + y);
+}
+// Returns the m_initialized flag, which Initialize() sets to true.
+bool
+BackwardsEdge::GetInitialized()
+{
+  return m_initialized;
+}
+// Returns the previous bitmap container this edge was constructed from
+// (the source of its hypotheses), not the parent container it enqueues into.
+const BitmapContainer&
+BackwardsEdge::GetBitmapContainer() const
+{
+  return m_prevBitmapContainer;
+}
+// Expands the two cube neighbours of cell (x, y): (x, y + 1) along the
+// translation-option axis and (x + 1, y) along the hypothesis axis. Each
+// neighbour that exists and has not been seen yet is marked as seen, turned
+// into a new hypothesis and enqueued in the parent container.
+void
+BackwardsEdge::PushSuccessors(const size_t x, const size_t y)
+{
+  const size_t nextY = y + 1;
+  if (nextY < m_translations.size() && !SeenPosition(x, nextY)) {
+    SetSeenPosition(x, nextY);
+    Hypothesis *viaNextOption = CreateHypothesis(*m_hypotheses[x], *m_translations.Get(nextY));
+    if (viaNextOption != NULL) {
+      m_parent.Enqueue(x, nextY, viaNextOption, this);
+    }
+  }
+  const size_t nextX = x + 1;
+  if (nextX < m_hypotheses.size() && !SeenPosition(nextX, y)) {
+    SetSeenPosition(nextX, y);
+    Hypothesis *viaNextHypo = CreateHypothesis(*m_hypotheses[nextX], *m_translations.Get(y));
+    if (viaNextHypo != NULL) {
+      m_parent.Enqueue(nextX, y, viaNextHypo, this);
+    }
+  }
+}
+////////////////////////////////////////////////////////////////////////////////
+// BitmapContainer Code
+////////////////////////////////////////////////////////////////////////////////
+/** Creates an empty container for the given coverage bitmap.
+ *  @param bitmap        coverage bitmap this container stands for
+ *  @param stack         stack that ProcessBestHypothesis() adds hypotheses to
+ *  @param deterministic forwarded to queue items and hypothesis sorting
+ *
+ *  m_hypotheses, m_edges and m_queue are not listed in the initializer list
+ *  and are therefore default-constructed; no further reset is needed.
+ */
+BitmapContainer::BitmapContainer(const Bitmap &bitmap
+                                 , HypothesisStackCubePruning &stack
+                                 , bool deterministic)
+  : m_bitmap(bitmap)
+  , m_stack(stack)
+  , m_numStackInsertions(0)
+  , m_deterministic(deterministic)
+{
+}
+// Drains the priority queue, deleting each remaining queue item together with
+// the hypothesis it holds, then releases the backwards edges and clears the
+// containers. Entries of m_hypotheses are not deleted here.
+BitmapContainer::~BitmapContainer()
+{
+  // As we have created the square position objects we clean up now.
+  while (!m_queue.empty()) {
+    HypothesisQueueItem *item = m_queue.top();
+    // Pop before deleting so the queue never holds a dangling pointer.
+    m_queue.pop();
+    delete item->GetHypothesis();
+    delete item;
+  }
+  // Delete all edges.
+  // NOTE(review): RemoveAllInColl is defined elsewhere; presumably it deletes
+  // every element and empties the collection - confirm.
+  RemoveAllInColl(m_edges);
+  m_hypotheses.clear();
+  m_edges.clear();
+}
+// Wraps a freshly created hypothesis, its cube coordinates and its originating
+// edge in a HypothesisQueueItem and pushes it onto the priority queue.
+// Under IFVERBOSE(2) the push is bracketed by the manage-cubes timer.
+void
+BitmapContainer::Enqueue(int hypothesis_pos
+                         , int translation_pos
+                         , Hypothesis *hypothesis
+                         , BackwardsEdge *edge)
+{
+  // Only supply target phrase if running deterministic search mode
+  const TargetPhrase *target_phrase = NULL;
+  if (m_deterministic) {
+    target_phrase = &(hypothesis->GetCurrTargetPhrase());
+  }
+  HypothesisQueueItem *queued = new HypothesisQueueItem(hypothesis_pos
+      , translation_pos
+      , hypothesis
+      , edge
+      , target_phrase);
+  IFVERBOSE(2) {
+    queued->GetHypothesis()->GetManager().GetSentenceStats().StartTimeManageCubes();
+  }
+  m_queue.push(queued);
+  IFVERBOSE(2) {
+    queued->GetHypothesis()->GetManager().GetSentenceStats().StopTimeManageCubes();
+  }
+}
+// Returns the top item of the priority queue, or NULL if the queue is empty.
+// The item is removed from the queue unless keepValue is true.
+HypothesisQueueItem*
+BitmapContainer::Dequeue(bool keepValue)
+{
+  if (m_queue.empty()) {
+    return NULL;
+  }
+  HypothesisQueueItem *best = m_queue.top();
+  if (!keepValue) {
+    m_queue.pop();
+  }
+  return best;
+}
+// Returns the top item of the priority queue without removing it, or NULL if
+// the queue is empty (matching Dequeue()); asking an empty queue for its top
+// element is not a valid operation.
+HypothesisQueueItem*
+BitmapContainer::Top() const
+{
+  if (m_queue.empty()) {
+    return NULL;
+  }
+  return m_queue.top();
+}
+// Number of items currently waiting in the priority queue
+// (not the number of hypotheses; see GetHypothesesSize()).
+size_t
+BitmapContainer::Size()
+{
+  return m_queue.size();
+}
+// True if the priority queue holds no items.
+bool
+BitmapContainer::Empty() const
+{
+  return m_queue.empty();
+}
+// Read-only access to the hypotheses added via AddHypothesis().
+const HypothesisSet&
+BitmapContainer::GetHypotheses() const
+{
+  return m_hypotheses;
+}
+// Number of hypotheses stored in this container.
+size_t
+BitmapContainer::GetHypothesesSize() const
+{
+  return m_hypotheses.size();
+}
+// Read-only access to the edges registered via AddBackwardsEdge().
+const BackwardsEdgeSet&
+BitmapContainer::GetBackwardsEdges()
+{
+  return m_edges;
+}
+// Appends a hypothesis to this container. Raises (via UTIL_THROW_IF2) if the
+// very same pointer has already been added.
+void
+BitmapContainer::AddHypothesis(Hypothesis *hypothesis)
+{
+  // cfedermann: do we actually need this check?
+  const Hypothesis *wanted = hypothesis;
+  const bool itemExists =
+    std::find(m_hypotheses.begin(), m_hypotheses.end(), wanted) != m_hypotheses.end();
+  UTIL_THROW_IF2(itemExists, "Duplicate hypotheses");
+  m_hypotheses.push_back(hypothesis);
+}
+// Registers an edge with this container. The destructor later passes m_edges
+// to RemoveAllInColl().
+void
+BitmapContainer::AddBackwardsEdge(BackwardsEdge *edge)
+{
+  m_edges.insert(edge);
+}
+// Calls Initialize() on every registered backwards edge.
+void
+BitmapContainer::InitializeEdges()
+{
+  for (BackwardsEdgeSet::iterator edgeIt = m_edges.begin(), edgeEnd = m_edges.end();
+       edgeIt != edgeEnd; ++edgeIt) {
+    (*edgeIt)->Initialize();
+  }
+}
+// Keeps processing the best queued hypothesis until either the queue runs dry
+// or at least minNumHyps hypotheses have been inserted into the stack.
+void
+BitmapContainer::EnsureMinStackHyps(const size_t minNumHyps)
+{
+  for (;;) {
+    if (Empty() || m_numStackInsertions >= minNumHyps) {
+      break;
+    }
+    ProcessBestHypothesis();
+  }
+}
+// Pops the best queued hypothesis, hands it to the stack via AddPrune(),
+// counts it if the stack reports a new entry, and then asks the originating
+// edge to enqueue the two cube successors of that cell. Does nothing when the
+// queue is empty.
+void
+BitmapContainer::ProcessBestHypothesis()
+{
+  if (m_queue.empty()) {
+    return;
+  }
+  // Get the currently best hypothesis from the queue.
+  HypothesisQueueItem *best = Dequeue();
+  // If the priority queue is exhausted, we are done and should have exited
+  UTIL_THROW_IF2(best == NULL, "Null object");
+  // check we are pulling things off of priority queue in right order
+  if (!Empty()) {
+    HypothesisQueueItem *runnerUp = Dequeue(true);
+    UTIL_THROW_IF2(best->GetHypothesis()->GetFutureScore() < runnerUp->GetHypothesis()->GetFutureScore(),
+                   "Non-monotonic total score: "
+                   << best->GetHypothesis()->GetFutureScore() << " vs. "
+                   << runnerUp->GetHypothesis()->GetFutureScore());
+  }
+  // Logging for the criminally insane
+  IFVERBOSE(3) {
+    best->GetHypothesis()->PrintHypothesis();
+  }
+  // Add best hypothesis to hypothesis stack.
+  const bool newstackentry = m_stack.AddPrune(best->GetHypothesis());
+  if (newstackentry) {
+    ++m_numStackInsertions;
+  }
+  IFVERBOSE(3) {
+    TRACE_ERR("new stack entry flag is " << newstackentry << std::endl);
+  }
+  // Create new hypotheses for the two successors of the hypothesis just added.
+  // Only the stored cube coordinates are used from here on.
+  BackwardsEdge *origin = best->GetBackwardsEdge();
+  origin->PushSuccessors(best->GetHypothesisPos(), best->GetTranslationPos());
+  // We are done with the queue item, we delete it.
+  delete best;
+}
+// Sorts the stored hypotheses in place using HypothesisScoreOrderer, which is
+// constructed with this container's deterministic flag.
+void
+BitmapContainer::SortHypotheses()
+{
+  std::sort(m_hypotheses.begin(), m_hypotheses.end(), HypothesisScoreOrderer(m_deterministic));
+}
+}

mosesdecoder/moses/Bitmaps.h ADDED Viewed

	@@ -0,0 +1,32 @@

+#pragma once
+#include <boost/unordered_set.hpp>
+#include <boost/unordered_map.hpp>
+#include <set>
+#include "Bitmap.h"
+#include "Util.h"
+namespace Moses
+{
+// Collection of Bitmap objects, organised as a two-level map: a Bitmap
+// (hashed and compared through UnorderedComparer<Bitmap>) maps to the bitmaps
+// reachable from it, keyed by Range.
+// NOTE(review): ownership of the stored Bitmap pointers is decided in the
+// .cpp, which is not visible here - confirm before copying a Bitmaps object.
+class Bitmaps
+{
+  // Range -> bitmap associated with that range for one particular source bitmap
+  typedef boost::unordered_map<Range, const Bitmap*> NextBitmaps;
+  // source bitmap -> its NextBitmaps table
+  typedef boost::unordered_map<const Bitmap*, NextBitmaps, UnorderedComparer<Bitmap>, UnorderedComparer<Bitmap> > Coll;
+  //typedef std::set<const Bitmap*, OrderedComparer<Bitmap> > Coll;
+  Coll m_coll;
+  // bitmap handed out by GetInitialBitmap()
+  const Bitmap *m_initBitmap;
+  // private helper taking a bitmap and a range; defined in the .cpp
+  const Bitmap &GetNextBitmap(const Bitmap &bm, const Range &range);
+public:
+  Bitmaps(size_t inputSize, const std::vector<bool> &initSourceCompleted);
+  virtual ~Bitmaps();
+  //! the initial bitmap; dereferences m_initBitmap, which must be non-null
+  const Bitmap &GetInitialBitmap() const {
+    return *m_initBitmap;
+  }
+  //! bitmap associated with \p bm and \p range (non-const: may update the collection)
+  const Bitmap &GetBitmap(const Bitmap &bm, const Range &range);
+};
+}

mosesdecoder/moses/ChartManager.h ADDED Viewed

	@@ -0,0 +1,162 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 Hieu Hoang
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#pragma once
+#include <vector>
+#include <boost/unordered_map.hpp>
+#include "ChartCell.h"
+#include "ChartCellCollection.h"
+#include "Range.h"
+#include "SentenceStats.h"
+#include "ChartTranslationOptionList.h"
+#include "ChartParser.h"
+#include "ChartKBestExtractor.h"
+#include "BaseManager.h"
+#include "moses/Syntax/KBestExtractor.h"
+namespace Moses
+{
+class ChartHypothesis;
+class ChartSearchGraphWriter;
+/** Holds everything you need to decode 1 sentence with the hierarchical/syntax decoder
+ */
+class ChartManager : public BaseManager
+{
+private:
+  ChartCellCollection m_hypoStackColl;
+  std::auto_ptr<SentenceStats> m_sentenceStats;
+  clock_t m_start; /**< starting time, used for logging */
+  unsigned m_hypothesisId; /* For handing out hypothesis ids to ChartHypothesis */
+  ChartParser m_parser;
+  ChartTranslationOptionList m_translationOptionList; /**< pre-computed list of translation options for the phrases in this sentence */
+  /* auxiliary functions for SearchGraphs */
+  void FindReachableHypotheses(
+    const ChartHypothesis *hypo, std::map<unsigned,bool> &reachable , size_t* winners, size_t* losers) const;
+  void WriteSearchGraph(const ChartSearchGraphWriter& writer) const;
+  // output
+  void OutputNBestList(OutputCollector *collector,
+                       const ChartKBestExtractor::KBestVec &nBestList,
+                       long translationId) const;
+  size_t CalcSourceSize(const Moses::ChartHypothesis *hypo) const;
+  size_t OutputAlignmentNBest(Alignments &retAlign,
+                              const Moses::ChartKBestExtractor::Derivation &derivation,
+                              size_t startTarget) const;
+  size_t OutputAlignment(Alignments &retAlign,
+                         const Moses::ChartHypothesis *hypo,
+                         size_t startTarget) const;
+  void OutputDetailedTranslationReport(
+    OutputCollector *collector,
+    const ChartHypothesis *hypo,
+    const Sentence &sentence,
+    long translationId) const;
+  void OutputTranslationOptions(std::ostream &out,
+                                ApplicationContext &applicationContext,
+                                const ChartHypothesis *hypo,
+                                const Sentence &sentence,
+                                long translationId) const;
+  void OutputTranslationOption(std::ostream &out,
+                               ApplicationContext &applicationContext,
+                               const ChartHypothesis *hypo,
+                               const Sentence &sentence,
+                               long translationId) const;
+  void ReconstructApplicationContext(const ChartHypothesis &hypo,
+                                     const Sentence &sentence,
+                                     ApplicationContext &context) const;
+  void OutputTreeFragmentsTranslationOptions(std::ostream &out,
+      ApplicationContext &applicationContext,
+      const ChartHypothesis *hypo,
+      const Sentence &sentence,
+      long translationId) const;
+  void OutputDetailedAllTranslationReport(
+    OutputCollector *collector,
+    const std::vector<boost::shared_ptr<Moses::ChartKBestExtractor::Derivation> > &nBestList,
+    const Sentence &sentence,
+    long translationId) const;
+  void OutputBestHypo(OutputCollector *collector, const ChartHypothesis *hypo, long translationId) const;
+  void Backtrack(const ChartHypothesis *hypo) const;
+public:
+  ChartManager(ttasksptr const& ttask);
+  ~ChartManager();
+  void Decode();
+  void AddXmlChartOptions();
+  const ChartHypothesis *GetBestHypothesis() const;
+  void CalcNBest(size_t n, std::vector<boost::shared_ptr<ChartKBestExtractor::Derivation> > &nBestList, bool onlyDistinct=false) const;
+  /** "Moses" (osg)  type format */
+  void OutputSearchGraphMoses(std::ostream &outputSearchGraphStream) const;
+  /** Output in (modified) Kenneth hypergraph format */
+  void OutputSearchGraphAsHypergraph(std::ostream &outputSearchGraphStream) const;
+  //! debug data collected when decoding sentence
+  //! (dereferences m_sentenceStats, so ResetSentenceStats() must have been called first)
+  SentenceStats& GetSentenceStats() const {
+    return *m_sentenceStats;
+  }
+  //DIMw
+  const ChartCellCollection& GetChartCellCollection() const {
+    return m_hypoStackColl;
+  }
+  //! intentionally empty in this manager
+  void CalcDecoderStatistics() const {
+  }
+  //! replaces the current statistics object with a fresh one built from \p source
+  void ResetSentenceStats(const InputType& source) {
+    m_sentenceStats = std::auto_ptr<SentenceStats>(new SentenceStats(source));
+  }
+  //! contiguous hypo id for each input sentence. For debugging purposes
+  unsigned GetNextHypoId() {
+    return m_hypothesisId++;
+  }
+  const ChartParser &GetParser() const {
+    return m_parser;
+  }
+  // outputs
+  void OutputBest(OutputCollector *collector) const;
+  void OutputNBest(OutputCollector *collector) const;
+  //! no-op for the chart decoder
+  void OutputLatticeSamples(OutputCollector *collector) const {
+  }
+  void OutputAlignment(OutputCollector *collector) const;
+  void OutputDetailedTranslationReport(OutputCollector *collector) const;
+  void OutputUnknowns(OutputCollector *collector) const;
+  void OutputDetailedTreeFragmentsTranslationReport(OutputCollector *collector) const;
+  //! no-op for the chart decoder
+  void OutputWordGraph(OutputCollector *collector) const {
+  }
+  void OutputSearchGraph(OutputCollector *collector) const;
+  //! no-op for the chart decoder
+  void OutputSearchGraphSLF() const {
+  }
+  // void OutputSearchGraphHypergraph() const;
+};
+}

mosesdecoder/moses/ChartTranslationOptions.cpp ADDED Viewed

	@@ -0,0 +1,168 @@

+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 Hieu Hoang
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "ChartTranslationOptions.h"
+#include "ChartHypothesis.h"
+#include "ChartCellLabel.h"
+#include "ChartTranslationOption.h"
+#include "InputPath.h"
+#include "StaticData.h"
+#include "TranslationTask.h"
+using namespace std;
+namespace Moses
+{
+/** Wraps every target phrase of \p targetPhraseColl in its own
+ *  ChartTranslationOption (constructed from the phrase) and stores it in
+ *  m_collection, preserving the input order.
+ *  @param stackVec copied into m_stackVec
+ *  @param range    stored by address, so it must outlive this object
+ *  @param score    stored as the estimate of the best score
+ */
+ChartTranslationOptions::ChartTranslationOptions(const TargetPhraseCollection &targetPhraseColl,
+    const StackVec &stackVec,
+    const Range &range,
+    float score)
+  : m_stackVec(stackVec)
+  , m_wordsRange(&range)
+  , m_estimateOfBestScore(score)
+{
+  for (TargetPhraseCollection::const_iterator tpIter = targetPhraseColl.begin();
+       tpIter != targetPhraseColl.end(); ++tpIter) {
+    const TargetPhrase &phrase = **tpIter;
+    m_collection.push_back(boost::shared_ptr<ChartTranslationOption>(new ChartTranslationOption(phrase)));
+  }
+}
+// Nothing to release by hand: m_collection holds its options through
+// boost::shared_ptr.
+ChartTranslationOptions::~ChartTranslationOptions()
+{
+}
+//! functor ordering chart translation options by descending weighted score
+class ChartTranslationOptionScoreOrderer
+{
+public:
+  bool operator()(const boost::shared_ptr<ChartTranslationOption> &transOptA
+                  , const boost::shared_ptr<ChartTranslationOption> &transOptB) const {
+    // true when A scores strictly higher than B, so A sorts first
+    return transOptA->GetScores().GetWeightedScore()
+           > transOptB->GetScores().GetWeightedScore();
+  }
+};
+// Attaches the input path to every option, lets each option score itself in
+// source context, drops options whose weighted score is -infinity (keeping the
+// relative order of the rest) and, if StaticData requests it, re-sorts the
+// survivors by descending weighted score.
+void ChartTranslationOptions::EvaluateWithSourceContext(const InputType &input, const InputPath &inputPath)
+{
+  SetInputPath(&inputPath);
+  // if (StaticData::Instance().GetPlaceholderFactor() != NOT_FOUND) {
+  const bool usesPlaceholders = inputPath.ttask->options()->input.placeholder_factor != NOT_FOUND;
+  if (usesPlaceholders) {
+    CreateSourceRuleFromInputPath();
+  }
+  for (size_t i = 0; i < m_collection.size(); ++i) {
+    ChartTranslationOption &transOpt = *m_collection[i];
+    transOpt.SetInputPath(&inputPath);
+    transOpt.EvaluateWithSourceContext(input, inputPath, m_stackVec);
+  }
+  // get rid of -inf trans opts: shift the survivors towards the front
+  size_t numKept = 0;
+  for (size_t i = 0; i < m_collection.size(); ++i) {
+    if (m_collection[i]->GetScores().GetWeightedScore() == - std::numeric_limits<float>::infinity()) {
+      continue;
+    }
+    if (numKept != i) {
+      m_collection[numKept] = m_collection[i];
+    }
+    ++numKept;
+  }
+  m_collection.resize(numKept);
+  // sort if necessary
+  if (StaticData::Instance().RequireSortingAfterSourceContext()) {
+    std::sort(m_collection.begin(), m_collection.end(), ChartTranslationOptionScoreOrderer());
+  }
+}
+// Forwards the given input path to every translation option in the collection.
+void ChartTranslationOptions::SetInputPath(const InputPath *inputPath)
+{
+  for (size_t i = 0; i < m_collection.size(); ++i) {
+    m_collection[i]->SetInputPath(inputPath);
+  }
+}
+// Builds the source side of the rule from the input path and hands it to every
+// translation option. Walking the covered source range left to right, each
+// child span listed in m_stackVec contributes a single NULL entry (standing
+// for a non-terminal) and every position outside a child span contributes the
+// corresponding word of the input path's phrase. Does nothing if there are no
+// options.
+void ChartTranslationOptions::CreateSourceRuleFromInputPath()
+{
+  if (m_collection.size() == 0) {
+    return;
+  }
+  // All options are assumed to share one input path; use the first one's.
+  const InputPath *inputPath = m_collection.front()->GetInputPath();
+  assert(inputPath);
+  // Storage for the rule source is obtained from (and kept by) the input path.
+  std::vector<const Word*> &ruleSourceFromInputPath = inputPath->AddRuleSourceFromInputPath();
+  // chartCellLabel tracks the next child span not yet passed; NULL once all are done.
+  size_t chartCellIndex = 0;
+  const ChartCellLabel *chartCellLabel = (chartCellIndex < m_stackVec.size()) ? m_stackVec[chartCellIndex] : NULL;
+  // ind is the offset of sourcePos inside the input path's phrase.
+  size_t ind = 0;
+  for (size_t sourcePos = m_wordsRange->GetStartPos(); sourcePos <= m_wordsRange->GetEndPos(); ++sourcePos, ++ind) {
+    if (chartCellLabel) {
+      if (sourcePos == chartCellLabel->GetCoverage().GetEndPos()) {
+        // end of child range. push an empty word to denote non-term
+        ruleSourceFromInputPath.push_back(NULL);
+        ++chartCellIndex;
+        chartCellLabel = (chartCellIndex < m_stackVec.size()) ? m_stackVec[chartCellIndex] : NULL;
+      } else if (sourcePos >= chartCellLabel->GetCoverage().GetStartPos()) {
+        // in the range of child hypo. do nothing
+      } else {
+        // not yet reached child range. add word
+        ruleSourceFromInputPath.push_back(&inputPath->GetPhrase().GetWord(ind));
+      }
+    } else {
+      // no child in sight. add word
+      ruleSourceFromInputPath.push_back(&inputPath->GetPhrase().GetWord(ind));
+    }
+  }
+  // save it to each trans opt
+  CollType::iterator iter;
+  for (iter = m_collection.begin(); iter != m_collection.end(); ++iter) {
+    ChartTranslationOption &transOpt = **iter;
+    transOpt.SetSourceRuleFromInputPath(&ruleSourceFromInputPath);
+  }
+}
+// Streams every translation option of the collection, one per line.
+std::ostream& operator<<(std::ostream &out, const ChartTranslationOptions &obj)
+{
+  const size_t numOptions = obj.m_collection.size();
+  for (size_t idx = 0; idx != numOptions; ++idx) {
+    out << *obj.m_collection[idx] << endl;
+  }
+  return out;
+}
+}

mosesdecoder/moses/DecodeStepGeneration.cpp ADDED Viewed

	@@ -0,0 +1,169 @@

+// $Id$
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include "DecodeStepGeneration.h"
+#include "GenerationDictionary.h"
+#include "TranslationOption.h"
+#include "TranslationOptionCollection.h"
+#include "PartialTranslOptColl.h"
+#include "FactorCollection.h"
+namespace Moses
+{
+using namespace std;
+/** Forwards the generation dictionary, the previous decode step and the
+ *  feature functions to the DecodeStep base; this class adds no state here. */
+DecodeStepGeneration::DecodeStepGeneration(GenerationDictionary* dict,
+    const DecodeStep* prev,
+    const std::vector<FeatureFunction*> &features)
+  : DecodeStep(dict, prev, features) {}
+// helpers
+// WordPair: one entry of a generation list (see the 1st/2nd note below)
+typedef pair<Word, ScoreComponentCollection> WordPair;
+// WordList: all WordPair candidates collected for a single target position
+typedef list< WordPair > WordList;
+// 1st = word
+// 2nd = score
+// WordListIterator: read-only cursor into a WordList
+typedef list< WordPair >::const_iterator WordListIterator;
+/** Used in generation: steps the per-position iterators to the next
+ *  combination, odometer style. Position 0 advances first; an iterator that
+ *  reaches the end of its list is rewound to the beginning and the next
+ *  position is advanced instead. */
+inline void IncrementIterators(vector< WordListIterator > &wordListIterVector
+                               , const vector< WordList > &wordListVector)
+{
+  const size_t numPositions = wordListVector.size();
+  for (size_t pos = 0; pos != numPositions; ++pos) {
+    const WordList &candidates = wordListVector[pos];
+    WordListIterator &cursor = wordListIterVector[pos];
+    cursor++;
+    if (cursor == candidates.end()) {
+      // eg 9 -> 10: rewind this digit and carry into the next position
+      cursor = candidates.begin();
+      continue;
+    }
+    // eg. 4 -> 5: no carry needed
+    return;
+  }
+}
+/** Expands a partial translation option with generated factors.
+ *
+ *  - An input option with an empty target phrase (word deletion) is copied to
+ *    the output collection unchanged.
+ *  - If any target word is not found in the generation dictionary, nothing is
+ *    added to the output collection.
+ *  - Otherwise every combination of the per-word generation candidates is
+ *    visited; in a filtering step, combinations that are incompatible with the
+ *    input option are skipped, all others yield one new TranslationOption.
+ *
+ *  @param inputPartialTranslOpt      option to expand
+ *  @param decodeStep                 supplies the generation dictionary
+ *  @param outputPartialTranslOptColl receives the newly allocated options
+ *
+ *  The two trailing parameters are unused.
+ */
+void DecodeStepGeneration::Process(const TranslationOption &inputPartialTranslOpt
+                                   , const DecodeStep &decodeStep
+                                   , PartialTranslOptColl &outputPartialTranslOptColl
+                                   , TranslationOptionCollection * /* toc */
+                                   , bool /*adhereTableLimit*/) const
+{
+  if (inputPartialTranslOpt.GetTargetPhrase().GetSize() == 0) {
+    // word deletion
+    TranslationOption *newTransOpt = new TranslationOption(inputPartialTranslOpt);
+    outputPartialTranslOptColl.Add(newTransOpt);
+    return;
+  }
+  // normal generation step
+  const GenerationDictionary* generationDictionary  = decodeStep.GetGenerationDictionaryFeature();
+  const Phrase &targetPhrase  = inputPartialTranslOpt.GetTargetPhrase();
+  const InputPath &inputPath = inputPartialTranslOpt.GetInputPath();
+  size_t targetLength         = targetPhrase.GetSize();
+  // generation list for each word in phrase
+  vector< WordList > wordListVector(targetLength);
+  // create generation list
+  for (size_t currPos = 0 ; currPos < targetLength ; currPos++) { // going through all words
+    // generatable factors for this word to be put in wordList
+    WordList &wordList = wordListVector[currPos];
+    const Word &word = targetPhrase.GetWord(currPos);
+    // consult dictionary for possible generations for this word
+    const OutputWordCollection *wordColl = generationDictionary->FindWord(word);
+    if (wordColl == NULL) {
+      // word not found in generation dictionary
+      return; // can't be part of a phrase, special handling
+    }
+    OutputWordCollection::const_iterator iterWordColl;
+    for (iterWordColl = wordColl->begin() ; iterWordColl != wordColl->end(); ++iterWordColl) {
+      const Word &outputWord = (*iterWordColl).first;
+      const ScoreComponentCollection& score = (*iterWordColl).second;
+      // enter into word list generated factor(s) and its(their) score(s)
+      wordList.push_back(WordPair(outputWord, score));
+    }
+  }
+  // use generation list (wordList)
+  // set up iterators (total number of expansions)
+  size_t numIteration = 1;
+  vector< WordListIterator >  wordListIterVector(targetLength);
+  vector< const Word* >       mergeWords(targetLength);
+  for (size_t currPos = 0 ; currPos < targetLength ; currPos++) {
+    wordListIterVector[currPos] = wordListVector[currPos].begin();
+    numIteration *= wordListVector[currPos].size();
+  }
+  // go thru each possible factor for each word & create hypothesis.
+  // The iterators are advanced in the loop header so that the `continue`
+  // below also moves on to the next combination; advancing them only at the
+  // end of the body left every later iteration stuck on the rejected one.
+  for (size_t currIter = 0 ; currIter < numIteration ;
+       ++currIter, IncrementIterators(wordListIterVector, wordListVector)) {
+    ScoreComponentCollection generationScore; // total score for this string of words
+    // create vector of words with new factors for last phrase
+    for (size_t currPos = 0 ; currPos < targetLength ; currPos++) {
+      const WordPair &wordPair = *wordListIterVector[currPos];
+      mergeWords[currPos] = &(wordPair.first);
+      generationScore.PlusEquals(wordPair.second);
+    }
+    // merge with existing trans opt
+    Phrase genPhrase( mergeWords);
+    if (IsFilteringStep()) {
+      if (!inputPartialTranslOpt.IsCompatible(genPhrase, m_conflictFactors))
+        continue;
+    }
+    const TargetPhrase &inPhrase = inputPartialTranslOpt.GetTargetPhrase();
+    TargetPhrase outPhrase(inPhrase);
+    outPhrase.GetScoreBreakdown().PlusEquals(generationScore);
+    outPhrase.MergeFactors(genPhrase, m_newOutputFactors);
+    outPhrase.EvaluateInIsolation(inputPath.GetPhrase(), m_featuresToApply);
+    const Range &sourceWordsRange = inputPartialTranslOpt.GetSourceWordsRange();
+    // plain `new` throws on failure, so no null check is needed
+    TranslationOption *newTransOpt = new TranslationOption(sourceWordsRange, outPhrase);
+    newTransOpt->SetInputPath(inputPath);
+    outputPartialTranslOptColl.Add(newTransOpt);
+  }
+}
+}

mosesdecoder/moses/FloydWarshall.cpp ADDED Viewed

	@@ -0,0 +1,36 @@

+#include "util/exception.hh"
+#include <climits>
+#include <vector>
+#define MAX_DIST (INT_MAX / 2)
+//#include "FloydWarshall.h"
+using namespace std;
+// All-pairs shortest path algorithm (Floyd-Warshall) on an unweighted graph.
+//   edges: square adjacency matrix; edges[i][j] == true means an edge i -> j.
+//   dist:  overwritten with the result. dist[i][j] is the length of the
+//          shortest path of at least one edge from i to j, or MAX_DIST when no
+//          such path exists. The diagonal starts at MAX_DIST (not 0), so
+//          dist[i][i] ends up as the length of the shortest cycle through i.
+// Throws (via UTIL_THROW_IF2) if any row's length differs from the row count.
+// An empty graph yields an empty dist.
+void floyd_warshall(const std::vector<std::vector<bool> >& edges, std::vector<std::vector<int> >& dist)
+{
+  const size_t num_nodes = edges.size();
+  // Validate every row: the previous check looked only at edges.front(),
+  // which is undefined for an empty graph and missed short later rows.
+  for (size_t i=0; i<num_nodes; ++i) {
+    UTIL_THROW_IF2(edges[i].size() != num_nodes, "Error");
+  }
+  dist.clear();
+  dist.resize(num_nodes, std::vector<int>(num_nodes, MAX_DIST));
+  for (size_t i=0; i<num_nodes; ++i) {
+    for (size_t j=0; j<num_nodes; ++j) {
+      // self-loops are ignored on purpose: the diagonal stays at MAX_DIST
+      if (i != j && edges[i][j])
+        dist[i][j] = 1;
+    }
+  }
+  // MAX_DIST is INT_MAX / 2, so the sum of two entries cannot overflow
+  for (size_t k=0; k<num_nodes; ++k)
+    for (size_t i=0; i<num_nodes; ++i)
+      for (size_t j=0; j<num_nodes; ++j)
+        if (dist[i][j] > (dist[i][k] + dist[k][j]))
+          dist[i][j] = dist[i][k] + dist[k][j];
+}

mosesdecoder/moses/HypothesisStack.h ADDED Viewed

	@@ -0,0 +1,64 @@

+#ifndef moses_HypothesisStack_h
+#define moses_HypothesisStack_h
+#include <vector>
+#include <set>
+#include <boost/unordered_set.hpp>
+#include "Hypothesis.h"
+#include "Bitmap.h"
+namespace Moses
+{
+class Manager;
+/** abstract unique set of hypotheses that cover a certain number of words,
+ *  ie. a stack in phrase-based decoding
+ *
+ *  Subclasses must implement AddPrune(), GetBestHypothesis() and
+ *  GetSortedList(); the worst-score queries default to negative infinity.
+ *  NOTE(review): std::numeric_limits is used below but <limits> is not
+ *  included directly in this header -- presumably it arrives through one of
+ *  the other includes; confirm.
+ */
+class HypothesisStack
+{
+protected:
+  // hash set of hypothesis pointers; UnorderedComparer<Hypothesis> is used
+  // both as the hash functor and as the equality predicate
+  typedef boost::unordered_set< Hypothesis*, UnorderedComparer<Hypothesis>, UnorderedComparer<Hypothesis> > _HCType;
+  _HCType m_hypos; /**< contains hypotheses */
+  Manager& m_manager; /**< manager passed to the constructor; held by reference */
+public:
+  HypothesisStack(Manager& manager): m_manager(manager) {}
+  typedef _HCType::iterator iterator;
+  typedef _HCType::const_iterator const_iterator;
+  //! iterators
+  const_iterator begin() const {
+    return m_hypos.begin();
+  }
+  const_iterator end() const {
+    return m_hypos.end();
+  }
+  //! number of hypotheses currently stored
+  size_t size() const {
+    return m_hypos.size();
+  }
+  //! default: negative infinity; subclasses may override
+  virtual inline float GetWorstScore() const {
+    return -std::numeric_limits<float>::infinity();
+  };
+  //! default: negative infinity regardless of the bitmap id; note: not const
+  virtual float GetWorstScoreForBitmap( WordsBitmapID ) {
+    return -std::numeric_limits<float>::infinity();
+  };
+  //! default: negative infinity regardless of the bitmap; note: not const
+  virtual float GetWorstScoreForBitmap( const Bitmap& ) {
+    return -std::numeric_limits<float>::infinity();
+  };
+  virtual ~HypothesisStack();
+  //! add a hypothesis to the stack; exact pruning and return-value semantics
+  //! are defined by the implementing subclass
+  virtual bool AddPrune(Hypothesis *hypothesis) = 0;
+  virtual const Hypothesis *GetBestHypothesis() const = 0;
+  virtual std::vector<const Hypothesis*> GetSortedList() const = 0;
+  //! remove hypothesis pointed to by iterator but don't delete the object
+  virtual void Detach(const HypothesisStack::iterator &iter);
+  /** destroy Hypothesis pointed to by iterator (object pool version) */
+  virtual void Remove(const HypothesisStack::iterator &iter);
+};
+}
+#endif

mosesdecoder/moses/Sentence.cpp ADDED Viewed

	@@ -0,0 +1,372 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include <stdexcept>
+#include <boost/algorithm/string.hpp>
+#include <boost/foreach.hpp>
+#include "Sentence.h"
+#include "TranslationOptionCollectionText.h"
+#include "StaticData.h"
+#include "moses/FF/DynamicCacheBasedLanguageModel.h"
+#include "moses/TranslationModel/PhraseDictionaryDynamicCacheBased.h"
+#include "ChartTranslationOptions.h"
+#include "Util.h"
+#include "XmlOption.h"
+#include "FactorCollection.h"
+#include "TranslationTask.h"
+using namespace std;
+namespace Moses
+{
+/** Builds an empty sentence bound to opts. For syntax-based search
+ *  algorithms the configured default input non-terminal is registered in
+ *  m_defaultLabelSet. */
+Sentence::
+Sentence(AllOptions::ptr const& opts) : Phrase(0) , InputType(opts)
+{
+  if (!is_syntax(opts->search.algo)) {
+    return;
+  }
+  m_defaultLabelSet.insert(opts->syntax.input_default_non_terminal);
+}
+/** Releases the XmlOption objects collected in m_xmlOptions. */
+Sentence::
+~Sentence()
+{
+  // hands the whole collection to the RemoveAllInColl helper
+  RemoveAllInColl(m_xmlOptions);
+}
+/** Parses the prefix of a "continue partial translation" input line of the
+ *  form
+ *      <initial target phrase> ||| <coverage string> ||| <rest of line>
+ *  On success m_initialTargetPhrase and m_sourceCompleted are filled, the
+ *  number of leading '1' characters of the coverage string is added to
+ *  m_frontSpanCoveredLength, and line is reduced to the text after the second
+ *  separator. If fewer than two "|||" separators are present, line and all
+ *  members are left untouched.
+ */
+void
+Sentence::
+aux_init_partial_translation(string& line)
+{
+  const string::size_type loc1 = line.find( "|||", 0 );
+  if (loc1 == string::npos) return;
+  const string::size_type loc2 = line.find( "|||", loc1 + 3 );
+  if (loc2 == string::npos) return;
+
+  m_initialTargetPhrase = Trim(line.substr(0, loc1));
+  const string scov = Trim(line.substr(loc1 + 3, loc2 - loc1 - 3));
+  line = line.substr(loc2 + 3);
+
+  m_sourceCompleted.resize(scov.size());
+  bool contiguous = true; // still inside the leading run of covered words?
+  for (size_t i = 0; i < scov.size(); ++i) {
+    // read the coverage string itself; the previous code indexed an unrelated,
+    // always-empty string with at(), which threw std::out_of_range
+    const bool covered = (scov[i] == '1');
+    m_sourceCompleted[i] = covered;
+    if (!covered) {
+      contiguous = false;
+    } else if (contiguous) {
+      m_frontSpanCoveredLength++;
+    }
+  }
+}
+/** Strips SGML markup such as "<seg id=1> ... </seg>" from line and applies
+ *  the recognised attributes: "id", "docid", "topic" and "weight-setting". */
+void
+Sentence::
+aux_interpret_sgml_markup(string& line)
+{
+  typedef std::map<std::string, std::string> metamap;
+  metamap const meta = ProcessAndStripSGML(line);
+  metamap::const_iterator const none = meta.end();
+
+  // translation id
+  metamap::const_iterator const idAttr = meta.find("id");
+  if (idAttr != none) {
+    this->SetTranslationId(atol(idAttr->second.c_str()));
+  }
+
+  // a document id switches both topic modes off
+  metamap::const_iterator const docAttr = meta.find("docid");
+  if (docAttr != none) {
+    this->SetDocumentId(atol(docAttr->second.c_str()));
+    this->SetUseTopicId(false);
+    this->SetUseTopicIdAndProb(false);
+  }
+
+  // topic: a single token is a plain topic id, several tokens are handed
+  // over as a topic id/probability list
+  metamap::const_iterator const topicAttr = meta.find("topic");
+  if (topicAttr != none) {
+    vector<string> topic_params;
+    boost::split(topic_params, topicAttr->second, boost::is_any_of("\t "));
+    bool const singleTopic = (topic_params.size() == 1);
+    if (singleTopic) {
+      this->SetTopicId(atol(topic_params[0].c_str()));
+    } else {
+      this->SetTopicIdAndProb(topic_params);
+    }
+    this->SetUseTopicId(singleTopic);
+    this->SetUseTopicIdAndProb(!singleTopic);
+  }
+
+  // weight setting
+  metamap::const_iterator const weightAttr = meta.find("weight-setting");
+  if (weightAttr == none) {
+    this->SetSpecifiesWeightSetting(false);
+    return;
+  }
+  this->SetWeightSetting(weightAttr->second);
+  this->SetSpecifiesWeightSetting(true);
+  // oh this is so horrible! Why does this have to be propagated globally?
+  // --- UG
+  StaticData::Instance().SetWeightSetting(weightAttr->second);
+}
+/** Strips DLT markup from line, stores the parsed attribute maps in
+ *  m_dlt_meta and forwards each map that carries a "type" of "cbtm" or "cblm"
+ *  to the cache-based translation model / language model instance named by
+ *  its "id" attribute ("default" when absent). */
+void
+Sentence::
+aux_interpret_dlt(string& line) // whatever DLT means ... --- UG
+{
+  using namespace std;
+  typedef map<string, string> str2str_map;
+  m_dlt_meta = ProcessAndStripDLT(line);
+  // what's happening below is most likely not thread-safe! UG
+  BOOST_FOREACH(str2str_map const& M, m_dlt_meta) {
+    str2str_map::const_iterator const typeAttr = M.find("type");
+    if (typeAttr == M.end()) {
+      continue; // entries without a type are ignored
+    }
+    str2str_map::const_iterator const idAttr = M.find("id");
+    string id;
+    if (idAttr == M.end()) {
+      id = "default";
+    } else {
+      id = idAttr->second;
+    }
+    string const& type = typeAttr->second;
+    if (type == "cbtm") {
+      PhraseDictionaryDynamicCacheBased* cbtm
+        = PhraseDictionaryDynamicCacheBased::InstanceNonConst(id);
+      if (cbtm) cbtm->ExecuteDlt(M);
+    } else if (type == "cblm") {
+      DynamicCacheBasedLanguageModel* cblm
+        = DynamicCacheBasedLanguageModel::InstanceNonConst(id);
+      if (cblm) cblm->ExecuteDlt(M);
+    }
+  }
+}
+/** Parses and strips XML markup from line unless the XML policy is
+ *  pass-through. Walls and placeholders found in the markup are returned via
+ *  xmlWalls / placeholders; a parse failure is only reported via TRACE_ERR. */
+void
+Sentence::
+aux_interpret_xml(std::string& line, std::vector<size_t> & xmlWalls,
+                  std::vector<std::pair<size_t, std::string> >& placeholders)
+{
+  using namespace std;
+  if (m_options->input.xml_policy == XmlPassThrough) {
+    return; // markup is left in the line untouched
+  }
+  bool const parsed = ProcessAndStripXMLTags(*m_options, line,
+                      m_xmlOptions,
+                      m_reorderingConstraint,
+                      xmlWalls, placeholders,
+                      *this);
+  if (parsed) {
+    return;
+  }
+  TRACE_ERR("Unable to parse XML in line: " << line);
+}
+/** (Re)initialises this sentence from one raw input line.
+ *  The line is taken by value because each step below strips the markup it
+ *  handles before the remaining text is tokenised by CreateFromString.
+ *  The order of the steps matters: every aux_* helper consumes its own markup
+ *  from the line, and the XML coverage map and reordering walls are set up
+ *  only once the final word count (GetSize()) is known.
+ */
+void
+Sentence::
+init(string line)
+{
+  using namespace std;
+  // reset partial-translation state
+  m_frontSpanCoveredLength = 0;
+  m_sourceCompleted.resize(0);
+  if (m_options->input.continue_partial_translation)
+    aux_init_partial_translation(line);
+  line = Trim(line);
+  aux_interpret_sgml_markup(line); // for "<seg id=..." markup
+  aux_interpret_dlt(line); // some poorly documented cache-based stuff
+  // if sentences is specified as "<passthrough tag1=""/>"
+  if (m_options->output.PrintPassThrough ||m_options->nbest.include_passthrough) {
+    string pthru = PassthroughSGML(line,"passthrough");
+    this->SetPassthroughInformation(pthru);
+  }
+  vector<size_t> xmlWalls;
+  vector<pair<size_t, string> >placeholders;
+  aux_interpret_xml(line, xmlWalls, placeholders);
+  // tokenise what is left of the line into this phrase
+  Phrase::CreateFromString(Input, m_options->input.factor_order, line, NULL);
+  ProcessPlaceholders(placeholders);
+  if (is_syntax(m_options->search.algo))
+    InitStartEndWord();
+  // now that we have final word positions in phrase (from
+  // CreateFromString), we can make input phrase objects to go with
+  // our XmlOptions and create TranslationOptions
+  // only fill the vector if we are parsing XML
+  if (m_options->input.xml_policy != XmlPassThrough) {
+    m_xmlCoverageMap.assign(GetSize(), false);
+    BOOST_FOREACH(XmlOption const* o, m_xmlOptions) {
+      Range const& r = o->range;
+      // NOTE(review): no bounds check here -- assumes every XML option range
+      // lies within [0, GetSize()); confirm against ProcessAndStripXMLTags
+      for(size_t j = r.GetStartPos(); j <= r.GetEndPos(); ++j)
+        m_xmlCoverageMap[j]=true;
+    }
+  }
+  // reordering walls and zones
+  m_reorderingConstraint.InitializeWalls(GetSize());
+  // set reordering walls, if "-monotone-at-punctuation" is set
+  // (skipped for an empty sentence, where GetSize()-1 would wrap around)
+  if (m_options->reordering.monotone_at_punct && GetSize()) {
+    Range r(0, GetSize()-1);
+    m_reorderingConstraint.SetMonotoneAtPunctuation(GetSubString(r));
+  }
+  // set walls obtained from xml
+  for(size_t i=0; i<xmlWalls.size(); i++)
+    if(xmlWalls[i] < GetSize()) // no buggy walls, please
+      m_reorderingConstraint.SetWall(xmlWalls[i], true);
+  m_reorderingConstraint.FinalizeWalls();
+}
+/** Reads the next line from in and initialises this sentence from it.
+ *  @return 1 if a line was read, 0 if nothing could be extracted (end of
+ *          input or stream error).
+ */
+int
+Sentence::
+Read(std::istream& in)
+{
+  std::string line;
+  // Test the stream state rather than eof(): getline sets eofbit when the
+  // final line has no trailing newline even though it did extract that line,
+  // so the eof() test silently dropped it. The stream's boolean state is false
+  // only when nothing was extracted, which also covers non-eof read errors.
+  if (!getline(in, line, '\n'))
+    return 0;
+  init(line);
+  return 1;
+}
+/** Writes each placeholder string into the configured placeholder factor of
+ *  the word at the given position. Does nothing when no placeholder factor is
+ *  configured (NOT_FOUND). */
+void
+Sentence::
+ProcessPlaceholders(const std::vector< std::pair<size_t, std::string> > &placeholders)
+{
+  FactorType const phFactor = m_options->input.placeholder_factor;
+  if (phFactor == NOT_FOUND) {
+    return;
+  }
+  typedef std::vector< std::pair<size_t, std::string> >::const_iterator PlaceholderIter;
+  for (PlaceholderIter ph = placeholders.begin(); ph != placeholders.end(); ++ph) {
+    // ph->first is the word position, ph->second the placeholder text
+    const Factor *phValue = FactorCollection::Instance().AddFactor(ph->second);
+    Word &target = Phrase::GetWord(ph->first);
+    target[phFactor] = phValue;
+  }
+}
+/** Allocates the text-input translation option collection for this sentence
+ *  and returns the raw pointer from `new`. */
+TranslationOptionCollection*
+Sentence::
+CreateTranslationOptionCollection(ttasksptr const& ttask) const
+{
+  TranslationOptionCollection * const collection =
+    new TranslationOptionCollectionText(ttask, *this);
+  assert(collection);
+  return collection;
+}
+/** Streams this sentence to out using the Phrase output operator. */
+void Sentence::Print(std::ostream& out) const
+{
+  Phrase const& asPhrase = *this;
+  out << asPhrase;
+}
+/** Returns true if any position in [startPos, endPos] (inclusive) is marked
+ *  in m_xmlCoverageMap; positions beyond the map are treated as unmarked. */
+bool Sentence::XmlOverlap(size_t startPos, size_t endPos) const
+{
+  size_t pos = startPos;
+  while (pos <= endPos) {
+    const bool insideMap = pos < m_xmlCoverageMap.size();
+    if (insideMap && m_xmlCoverageMap[pos]) {
+      return true;
+    }
+    pos++;
+  }
+  return false;
+}
+/** Appends one newly allocated TranslationOption to list for every XML
+ *  option of this sentence. The appended objects are raw pointers obtained
+ *  from `new`. */
+void Sentence::GetXmlTranslationOptions(std::vector <TranslationOption*> &list) const
+{
+  for (size_t i = 0; i < m_xmlOptions.size(); ++i) {
+    XmlOption const* const xmlOpt = m_xmlOptions[i];
+    list.push_back(new TranslationOption(xmlOpt->range, xmlOpt->targetPhrase));
+  }
+}
+/** Appends a newly allocated TranslationOption to list for every XML option
+ *  whose source range is exactly [startPos, endPos]. */
+void Sentence::GetXmlTranslationOptions(std::vector <TranslationOption*> &list, size_t startPos, size_t endPos) const
+{
+  for (size_t i = 0; i < m_xmlOptions.size(); ++i) {
+    XmlOption const* const xmlOpt = m_xmlOptions[i];
+    const Range &span = xmlOpt->range;
+    // only exact source span matches qualify
+    if (startPos != span.GetStartPos() || endPos != span.GetEndPos()) {
+      continue;
+    }
+    list.push_back(new TranslationOption(span, xmlOpt->targetPhrase));
+  }
+}
+/** Builds one ChartTranslationOptions object per XML option of this
+ *  sentence. Returns an empty vector when the XML policy is pass-through.
+ *  The target phrase, range and target phrase collection behind each entry
+ *  are heap-allocated here and are not released by this function.
+ */
+std::vector <ChartTranslationOptions*>
+Sentence::
+GetXmlChartTranslationOptions() const
+{
+  std::vector <ChartTranslationOptions*> ret;
+  // only fill the vector if we are parsing XML
+  if (m_options->input.xml_policy == XmlPassThrough) {
+    return ret;
+  }
+  // TODO: needed to handle exclusive -- the XML coverage map is neither
+  // reset nor updated here.
+  // (original note: the option list will be empty for XmlIgnore)
+  for (size_t i = 0; i < m_xmlOptions.size(); ++i) {
+    const XmlOption &xmlOption = *m_xmlOptions[i];
+    TargetPhrase *targetPhrase = new TargetPhrase(xmlOption.targetPhrase);
+    Range *range = new Range(xmlOption.range);
+    StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted
+    TargetPhraseCollection *tpc = new TargetPhraseCollection;
+    tpc->Add(targetPhrase);
+    ret.push_back(new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f));
+  }
+  return ret;
+}
+/** Fills this sentence from phraseString as an Input-direction phrase using
+ *  the factor order FOrder; no left-hand side is requested (NULL). */
+void
+Sentence::
+CreateFromString(vector<FactorType> const& FOrder, string const& phraseString)
+{
+  this->Phrase::CreateFromString(Input, FOrder, phraseString, NULL);
+}
+/** Builds a sentence with translation id transId and initialises it
+ *  immediately from the raw input text stext (see init()). */
+Sentence::
+Sentence(AllOptions::ptr const& opts, size_t const transId, string stext)
+  : InputType(opts, transId)
+{
+  this->init(stext);
+}
+}

mosesdecoder/moses/Syntax/Cube.h ADDED Viewed

	@@ -0,0 +1,62 @@

+#pragma once
+#include <queue>
+#include <vector>
+#include <utility>
+#include <boost/unordered_set.hpp>
+#include "SHyperedge.h"
+#include "SHyperedgeBundle.h"
+namespace Moses
+{
+namespace Syntax
+{
+// A cube -- in the cube pruning sense (see Chiang (2007)) -- that lazily
+// produces SHyperedge objects from a SHyperedgeBundle in approximately
+// best-first order.
+//
+// The bundle is held by reference (m_bundle), so it must outlive the cube.
+// Top() reads m_queue.top() without an emptiness check; callers are expected
+// to consult IsEmpty() first.
+class Cube
+{
+public:
+  Cube(const SHyperedgeBundle &);
+  ~Cube();
+  // Removes and returns the current best hyperedge (defined out of line).
+  SHyperedge *Pop();
+  // Returns the current best hyperedge without removing it.
+  SHyperedge *Top() const {
+    return m_queue.top().first;
+  }
+  bool IsEmpty() const {
+    return m_queue.empty();
+  }
+private:
+  // Set of coordinate vectors; presumably the cube positions that have
+  // already been generated -- confirm in Cube.cpp.
+  typedef boost::unordered_set<std::vector<int> > CoordinateSet;
+  // A queued hyperedge paired with a pointer to its coordinate vector.
+  typedef std::pair<SHyperedge *, const std::vector<int> *> QueueItem;
+  // Orders queue items by futureScore so that the priority queue's top() is
+  // the item with the highest futureScore.
+  class QueueItemOrderer
+  {
+  public:
+    bool operator()(const QueueItem &p, const QueueItem &q) const {
+      return p.first->label.futureScore < q.first->label.futureScore;
+    }
+  };
+  typedef std::priority_queue<QueueItem, std::vector<QueueItem>,
+          QueueItemOrderer> Queue;
+  SHyperedge *CreateHyperedge(const std::vector<int> &);
+  void CreateNeighbour(const std::vector<int> &);
+  void CreateNeighbours(const std::vector<int> &);
+  const SHyperedgeBundle &m_bundle;
+  CoordinateSet m_visited;
+  Queue m_queue;
+};
+}  // Syntax
+}  // Moses

mosesdecoder/moses/Syntax/CubeQueue.cpp ADDED Viewed

	@@ -0,0 +1,37 @@

+#include "CubeQueue.h"
+namespace Moses
+{
+namespace Syntax
+{
+// Drains the queue and deletes every cube that is still in it.
+CubeQueue::~CubeQueue()
+{
+  while (m_queue.size() > 0) {
+    // take the cube out of the queue before destroying it, so the queue
+    // never holds a dangling pointer while it reorders itself
+    Cube * const leftover = m_queue.top();
+    m_queue.pop();
+    delete leftover;
+  }
+}
+// Returns the best hyperedge of the most promising cube. The cube is taken
+// off the queue first; it is re-inserted if it still has items and deleted
+// otherwise. The queue must not be empty when this is called.
+SHyperedge *CubeQueue::Pop()
+{
+  Cube * const bestCube = m_queue.top();
+  m_queue.pop();
+  SHyperedge * const bestEdge = bestCube->Pop();
+  if (bestCube->IsEmpty()) {
+    // exhausted: nothing more to draw from this cube
+    delete bestCube;
+    return bestEdge;
+  }
+  // still productive: put it back so it is re-ranked by its new top item
+  m_queue.push(bestCube);
+  return bestEdge;
+}
+}  // Syntax
+}  // Moses

mosesdecoder/moses/Syntax/CubeQueue.h ADDED Viewed

	@@ -0,0 +1,52 @@

+#pragma once
+#include <queue>
+#include <vector>
+#include "Cube.h"
+#include "SHyperedge.h"
+#include "SHyperedgeBundle.h"
+namespace Moses
+{
+namespace Syntax
+{
+// Priority queue of heap-allocated Cube objects, ordered by the futureScore
+// of each cube's top hyperedge. The queue owns its cubes: they are created in
+// the constructor and deleted in Pop() (when exhausted) or in the destructor.
+// NOTE(review): copy construction/assignment are not disabled although the
+// class owns raw pointers -- copying an instance would presumably lead to a
+// double delete; confirm that it is never copied.
+class CubeQueue
+{
+public:
+  // Builds one cube per element of the range [first, last).
+  template<typename InputIterator>
+  CubeQueue(InputIterator, InputIterator);
+  ~CubeQueue();
+  // Pops the best hyperedge from the best cube (defined out of line).
+  SHyperedge *Pop();
+  bool IsEmpty() const {
+    return m_queue.empty();
+  }
+private:
+  // Compares cubes by the futureScore of their top hyperedge, so the
+  // priority queue's top() is the cube with the highest such score.
+  // Calls Top() on both cubes, so queued cubes must be non-empty.
+  class CubeOrderer
+  {
+  public:
+    bool operator()(const Cube *p, const Cube *q) const {
+      return p->Top()->label.futureScore < q->Top()->label.futureScore;
+    }
+  };
+  typedef std::priority_queue<Cube*, std::vector<Cube*>, CubeOrderer> Queue;
+  Queue m_queue;
+};
+// Creates a new Cube from every element in [first, last) and queues it.
+template<typename InputIterator>
+CubeQueue::CubeQueue(InputIterator first, InputIterator last)
+{
+  for (; first != last; ++first) {
+    m_queue.push(new Cube(*first));
+  }
+}
+}  // Syntax
+}  // Moses

mosesdecoder/moses/Syntax/InputWeightFF.cpp ADDED Viewed

	@@ -0,0 +1,48 @@

+#include "InputWeightFF.h"
+#include <vector>
+#include "moses/ScoreComponentCollection.h"
+#include "moses/Syntax/SHyperedge.h"
+#include "moses/TargetPhrase.h"
+namespace Moses
+{
+namespace Syntax
+{
+// Registers a stateless feature with a single score component and then reads
+// its parameters from the configuration line.
+InputWeightFF::InputWeightFF(const std::string &line)
+  : StatelessFeatureFunction(1, line)
+{
+  this->ReadParameters();
+}
+// Overload for phrase-based hypotheses: must never be called for this
+// feature. Only guarded by an assertion for now.
+void InputWeightFF::EvaluateWhenApplied(const Hypothesis& hypo,
+                                        ScoreComponentCollection* accumulator) const
+{
+  (void)hypo;
+  (void)accumulator;
+  // TODO Throw exception.
+  assert(false);
+}
+// Overload for chart hypotheses: must never be called for this feature.
+// Only guarded by an assertion for now.
+void InputWeightFF::EvaluateWhenApplied(const ChartHypothesis &hypo,
+                                        ScoreComponentCollection* accumulator) const
+{
+  (void)hypo;
+  (void)accumulator;
+  // TODO Throw exception.
+  assert(false);
+}
+// Overload for syntax hyperedges: adds the input weight stored in the
+// hyperedge label to this feature's score in the accumulator.
+void InputWeightFF::EvaluateWhenApplied(
+  const Syntax::SHyperedge &hyperedge,
+  ScoreComponentCollection* accumulator) const
+{
+  (*accumulator).PlusEquals(this, hyperedge.label.inputWeight);
+}
+// This feature has no parameters of its own; everything is delegated to the
+// base class.
+void InputWeightFF::SetParameter(const std::string& key,
+                                 const std::string& value)
+{
+  this->StatelessFeatureFunction::SetParameter(key, value);
+}
+}  // Syntax
+}  // Moses

mosesdecoder/moses/Syntax/Manager.cpp ADDED Viewed

	@@ -0,0 +1,229 @@

+#include <sstream>
+#include "Manager.h"
+#include "PVertex.h"
+#include "moses/OutputCollector.h"
+#include "moses/Util.h"
+namespace Moses
+{
+namespace Syntax
+{
+// Hands the translation task on to the BaseManager base class.
+Manager::Manager(ttasksptr const& ttask)
+  : Moses::BaseManager(ttask)
+{
+}
+// Writes the single best translation, as one line, to collector.
+// Nothing is written when collector is null. When there is no best hyperedge
+// the line is empty, or "0 " if hypothesis scores are reported. Otherwise the
+// line holds the optional score followed by the target yield with its first
+// and last word removed.
+void Manager::OutputBest(OutputCollector *collector) const
+{
+  if (collector == NULL) {
+    return;
+  }
+  std::ostringstream out;
+  FixPrecision(out);
+  const SHyperedge *best = GetBestSHyperedge();
+  if (best != NULL) {
+    if (options()->output.ReportHypoScore) {
+      out << best->label.futureScore << " ";
+    }
+    Phrase yield = GetOneBestTargetYield(*best);
+    // delete 1st & last
+    UTIL_THROW_IF2(yield.GetSize() < 2,
+                   "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+    yield.RemoveWord(0);
+    yield.RemoveWord(yield.GetSize()-1);
+    out << yield.GetStringRep(options()->output.factor_order);
+  } else {
+    VERBOSE(1, "NO BEST TRANSLATION" << std::endl);
+    if (options()->output.ReportHypoScore) {
+      out << "0 ";
+    }
+  }
+  // both cases terminate the line the same way
+  out << '\n';
+  collector->Write(m_source.GetTranslationId(), out.str());
+}
+// Extracts the k-best list (size and distinctness taken from the n-best
+// options) and writes it to collector. Does nothing when collector is null.
+void Manager::OutputNBest(OutputCollector *collector) const
+{
+  if (!collector) {
+    return;
+  }
+  const long translationId = m_source.GetTranslationId();
+  KBestExtractor::KBestVec kBest;
+  ExtractKBest(options()->nbest.nbest_size, kBest,
+               options()->nbest.only_distinct);
+  OutputNBestList(collector, kBest, translationId);
+}
+// Writes all words collected in m_oovs, on a single line, to collector.
+// Does nothing when collector is null.
+void Manager::OutputUnknowns(OutputCollector *collector) const
+{
+  if (!collector) {
+    return;
+  }
+  typedef boost::unordered_set<Moses::Word>::const_iterator OovIter;
+  const long translationId = m_source.GetTranslationId();
+  std::ostringstream out;
+  // words are streamed back to back; any separator comes from Word's own
+  // output operator
+  for (OovIter oov = m_oovs.begin(); oov != m_oovs.end(); ++oov) {
+    out << *oov;
+  }
+  out << std::endl;
+  collector->Write(translationId, out.str());
+}
+// Formats an n-best list and writes it to collector under translationId.
+// One line is produced per derivation:
+//   <id> ||| <surface> ||| <feature scores> ||| <total score>
+// optionally followed by " ||| <alignments>" and " ||| <tree>", depending on
+// the n-best options.
+// collector must not be null. Throws (via UTIL_THROW_IF2) if a derivation's
+// output phrase has fewer than two words.
+void Manager::OutputNBestList(OutputCollector *collector,
+                              const KBestExtractor::KBestVec &nBestList,
+                              long translationId) const
+{
+  // check the precondition before the first dereference, not after the last
+  assert(collector);
+  std::ostringstream out;
+  if (collector->OutputIsCout()) {
+    // Set precision only if we're writing the n-best list to cout.  This is to
+    // preserve existing behaviour, but should probably be done either way.
+    FixPrecision(out);
+  }
+  // these settings do not change per derivation, so read them once
+  const bool includeWordAlignment = options()->nbest.include_alignment_info;
+  const bool PrintNBestTrees = options()->nbest.print_trees;
+  const bool with_labels = options()->nbest.include_feature_labels;
+  for (KBestExtractor::KBestVec::const_iterator p = nBestList.begin();
+       p != nBestList.end(); ++p) {
+    const KBestExtractor::Derivation &derivation = **p;
+    // get the derivation's target-side yield
+    Phrase outputPhrase = KBestExtractor::GetOutputPhrase(derivation);
+    // delete <s> and </s>
+    UTIL_THROW_IF2(outputPhrase.GetSize() < 2,
+                   "Output phrase should have contained at least 2 words (beginning and end-of-sentence)");
+    outputPhrase.RemoveWord(0);
+    outputPhrase.RemoveWord(outputPhrase.GetSize() - 1);
+    // print the translation ID, surface factors, and scores
+    out << translationId << " ||| ";
+    OutputSurface(out, outputPhrase);
+    out << " ||| ";
+    derivation.scoreBreakdown.OutputAllFeatureScores(out, with_labels);
+    out << " ||| " << derivation.score;
+    // optionally, print word alignments
+    if (includeWordAlignment) {
+      out << " ||| ";
+      Alignments align;
+      OutputAlignmentNBest(align, derivation, 0);
+      for (Alignments::const_iterator q = align.begin(); q != align.end();
+           ++q) {
+        out << q->first << "-" << q->second << " ";
+      }
+    }
+    // optionally, print tree
+    if (PrintNBestTrees) {
+      TreePointer tree = KBestExtractor::GetOutputTree(derivation);
+      out << " ||| " << tree->GetString();
+    }
+    out << std::endl;
+  }
+  collector->Write(translationId, out.str());
+}
+// Recursively collects the terminal word alignments of a derivation.
+//   retAlign:    output set; receives (source position, target position)
+//                pairs, offset to absolute sentence positions
+//   derivation:  the (sub)derivation to process
+//   startTarget: absolute target position at which this derivation's yield
+//                starts
+// Returns the number of target words covered by this derivation, including
+// those of its subderivations.
+// Throws (via UTIL_THROW_IF2) if the non-terminal alignment is inconsistent
+// with the subderivations, or if an alignment point is inserted twice.
+std::size_t Manager::OutputAlignmentNBest(
+  Alignments &retAlign,
+  const KBestExtractor::Derivation &derivation,
+  std::size_t startTarget) const
+{
+  const SHyperedge &shyperedge = derivation.edge->shyperedge;
+  std::size_t totalTargetSize = 0;
+  std::size_t startSource = shyperedge.head->pvertex->span.GetStartPos();
+  const TargetPhrase &tp = *(shyperedge.label.translation);
+  std::size_t thisSourceSize = CalcSourceSize(derivation);
+  // position of each terminal word in translation rule, irrespective of
+  // alignment if non-term, number is undefined
+  // (both vectors first hold per-symbol sizes; ShiftOffsets below converts
+  // them to absolute positions)
+  std::vector<std::size_t> sourceOffsets(thisSourceSize, 0);
+  std::vector<std::size_t> targetOffsets(tp.GetSize(), 0);
+  const AlignmentInfo &aiNonTerm =
+    shyperedge.label.translation->GetAlignNonTerm();
+  std::vector<std::size_t> sourceInd2pos = aiNonTerm.GetSourceIndex2PosMap();
+  const AlignmentInfo::NonTermIndexMap &targetPos2SourceInd =
+    aiNonTerm.GetNonTermIndexMap();
+  UTIL_THROW_IF2(sourceInd2pos.size() != derivation.subderivations.size(),
+                 "Error");
+  // NOTE(review): targetInd is incremented below but never read
+  std::size_t targetInd = 0;
+  for (std::size_t targetPos = 0; targetPos < tp.GetSize(); ++targetPos) {
+    if (tp.GetWord(targetPos).IsNonTerminal()) {
+      UTIL_THROW_IF2(targetPos >= targetPos2SourceInd.size(), "Error");
+      // map the target non-terminal to its source-side index and position
+      std::size_t sourceInd = targetPos2SourceInd[targetPos];
+      std::size_t sourcePos = sourceInd2pos[sourceInd];
+      const KBestExtractor::Derivation &subderivation =
+        *derivation.subderivations[sourceInd];
+      // calc source size
+      std::size_t sourceSize =
+        subderivation.edge->head->svertex.pvertex->span.GetNumWordsCovered();
+      sourceOffsets[sourcePos] = sourceSize;
+      // calc target size.
+      // Recursively look thru child hypos
+      std::size_t currStartTarget = startTarget + totalTargetSize;
+      std::size_t targetSize = OutputAlignmentNBest(retAlign, subderivation,
+                               currStartTarget);
+      targetOffsets[targetPos] = targetSize;
+      totalTargetSize += targetSize;
+      ++targetInd;
+    } else {
+      // a terminal contributes exactly one target word
+      ++totalTargetSize;
+    }
+  }
+  // convert position within translation rule to absolute position within
+  // source sentence / output sentence
+  ShiftOffsets(sourceOffsets, startSource);
+  ShiftOffsets(targetOffsets, startTarget);
+  // get alignments from this hypo
+  const AlignmentInfo &aiTerm = shyperedge.label.translation->GetAlignTerm();
+  // add to output arg, offsetting by source & target
+  AlignmentInfo::const_iterator iter;
+  for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
+    const std::pair<std::size_t, std::size_t> &align = *iter;
+    std::size_t relSource = align.first;
+    std::size_t relTarget = align.second;
+    // NOTE(review): unchecked indexing -- assumes the rule's terminal
+    // alignment only refers to positions inside the rule; confirm
+    std::size_t absSource = sourceOffsets[relSource];
+    std::size_t absTarget = targetOffsets[relTarget];
+    std::pair<std::size_t, std::size_t> alignPoint(absSource, absTarget);
+    std::pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
+    // every alignment point is expected to be new
+    UTIL_THROW_IF2(!ret.second, "Error");
+  }
+  return totalTargetSize;
+}
+// Returns the number of words covered by the head span of d's hyperedge,
+// with every tail vertex counted as a single position instead of the number
+// of words it covers.
+std::size_t Manager::CalcSourceSize(const KBestExtractor::Derivation &d) const
+{
+  const SHyperedge &edge = d.edge->shyperedge;
+  std::size_t numPositions = edge.head->pvertex->span.GetNumWordsCovered();
+  const std::size_t numChildren = edge.tail.size();
+  for (std::size_t child = 0; child != numChildren; ++child) {
+    const std::size_t covered =
+      edge.tail[child]->pvertex->span.GetNumWordsCovered();
+    // Collapse the child's span to one slot.
+    numPositions -= (covered - 1);
+  }
+  return numPositions;
+}
+}  // Syntax
+}  // Moses

mosesdecoder/moses/Syntax/NonTerminalMap.h ADDED Viewed

	@@ -0,0 +1,85 @@

+#pragma once
+#include <vector>
+#include <boost/unordered_map.hpp>
+#include "moses/FactorCollection.h"
+#include "moses/Word.h"
+#include "SymbolEqualityPred.h"
+#include "SymbolHasher.h"
+namespace Moses
+{
+namespace Syntax
+{
+// Hybrid map/vector-based container for key-value pairs where the key is a
+// non-terminal Word.  The interface is like a (stripped-down) map type, with
+// the main differences being that:
+//   1. Find() is implemented using vector indexing to make it fast.
+//   2. Once a value has been inserted it can be modified but can't be removed.
+template<typename T>
+class NonTerminalMap
+{
+private:
+  typedef boost::unordered_map<Word, T, SymbolHasher, SymbolEqualityPred> Map;
+  typedef std::vector<T*> Vec;
+public:
+  typedef typename Map::iterator Iterator;
+  typedef typename Map::const_iterator ConstIterator;
+  // The lookup vector starts with one (null) slot per non-terminal known to
+  // the FactorCollection at construction time; Insert() grows it on demand.
+  NonTerminalMap()
+    : m_vec(FactorCollection::Instance().GetNumNonTerminals(), NULL) {}
+  // Copying must not reuse the source's lookup pointers: they point into the
+  // source's map.  The index is rebuilt against this object's own map.
+  NonTerminalMap(const NonTerminalMap &other)
+    : m_map(other.m_map), m_vec(other.m_vec.size(), static_cast<T*>(0)) {
+    RebuildIndex();
+  }
+  NonTerminalMap &operator=(const NonTerminalMap &other) {
+    if (this != &other) {
+      m_map = other.m_map;
+      m_vec.assign(other.m_vec.size(), static_cast<T*>(0));
+      RebuildIndex();
+    }
+    return *this;
+  }
+  Iterator Begin() {
+    return m_map.begin();
+  }
+  Iterator End() {
+    return m_map.end();
+  }
+  ConstIterator Begin() const {
+    return m_map.begin();
+  }
+  ConstIterator End() const {
+    return m_map.end();
+  }
+  std::size_t Size() const {
+    return m_map.size();
+  }
+  bool IsEmpty() const {
+    return m_map.empty();
+  }
+  // Inserts (key, value) unless the key is already present.  Returns the
+  // iterator/bool pair produced by the underlying map's insert().
+  std::pair<Iterator, bool> Insert(const Word &, const T &);
+  // Returns a pointer to the value stored for w, or NULL if nothing has been
+  // inserted for it.  The slot is selected by the id of w's factor 0; an id
+  // beyond the current table is treated as "not present" instead of being
+  // read out of bounds.
+  T *Find(const Word &w) const {
+    const std::size_t i = w[0]->GetId();
+    return i < m_vec.size() ? m_vec[i] : static_cast<T*>(0);
+  }
+private:
+  // Records p as the lookup entry for key, enlarging the table if the id of
+  // key's factor 0 lies past its current end.
+  void SetIndexEntry(const Word &key, T *p) {
+    const std::size_t i = key[0]->GetId();
+    if (i >= m_vec.size()) {
+      m_vec.resize(i + 1, static_cast<T*>(0));
+    }
+    m_vec[i] = p;
+  }
+  // Points every lookup entry at the corresponding value in m_map.
+  void RebuildIndex() {
+    for (Iterator p = m_map.begin(); p != m_map.end(); ++p) {
+      SetIndexEntry(p->first, &(p->second));
+    }
+  }
+  Map m_map;
+  Vec m_vec;
+};
+template<typename T>
+std::pair<typename NonTerminalMap<T>::Iterator, bool> NonTerminalMap<T>::Insert(
+  const Word &key, const T &value)
+{
+  std::pair<typename Map::iterator, bool> result =
+    m_map.insert(typename Map::value_type(key, value));
+  // Only a genuinely new element needs a lookup entry; an existing key keeps
+  // the entry it already has.
+  if (result.second) {
+    SetIndexEntry(key, &(result.first->second));
+  }
+  return result;
+}
+}  // namespace Syntax
+}  // namespace Moses

mosesdecoder/moses/Syntax/PHyperedge.h ADDED Viewed

	@@ -0,0 +1,21 @@

+#pragma once
+#include <vector>
+#include "PLabel.h"
+namespace Moses
+{
+namespace Syntax
+{
+struct PVertex;
+struct PHyperedge {  // hyperedge joining one head PVertex to a list of tail PVertex objects, plus a PLabel
+  PVertex *head;  // raw pointer; this struct neither initialises nor frees it
+  std::vector<PVertex*> tail;  // raw pointers; not freed by this struct
+  PLabel label;  // see PLabel.h
+};
+}  // Syntax
+}  // Moses

mosesdecoder/moses/Syntax/RuleTableFF.h ADDED Viewed

	@@ -0,0 +1,60 @@

+#pragma once
+#include <string>
+#include "moses/TranslationModel/PhraseDictionary.h"
+namespace Moses
+{
+class ChartParser;
+class ChartCellCollectionBase;
+class AllOptions;
+namespace Syntax
+{
+class RuleTable;
+// Feature function for dealing with local rule scores (that come from a
+// rule table).  The scores themselves are stored on TargetPhrase objects
+// and the decoder accesses them directly, so this object doesn't really do
+// anything except provide somewhere to store the weights and parameter values.
+class RuleTableFF : public PhraseDictionary
+{
+public:
+  RuleTableFF(const std::string &);
+  // FIXME Delete m_table?
+  ~RuleTableFF() {}  // empty body: m_table is not released here (see FIXME above)
+  void Load(AllOptions::ptr const& opts);  // defined outside this header
+  const RuleTable *GetTable() const {  // accessor for m_table
+    return m_table;
+  }
+  static const std::vector<RuleTableFF*> &Instances() {  // accessor for the static s_instances list
+    return s_instances;
+  }
+  ChartRuleLookupManager *CreateRuleLookupManager(  // not meant to be called on this class
+    const ChartParser &, const ChartCellCollectionBase &, std::size_t) {
+    assert(false);  // trips in debug builds; with NDEBUG execution continues to the return
+    return 0;  // null manager
+  }
+  // Get the source terminal vocabulary for this table's grammar (as a set of
+  // factor IDs)
+  const boost::unordered_set<std::size_t> &GetSourceTerminalSet() const {
+    return m_sourceTerminalSet;
+  }
+private:
+  static std::vector<RuleTableFF*> s_instances;  // list returned by Instances(); populated outside this header
+  const RuleTable *m_table;  // table returned by GetTable(); not deleted by the destructor
+  boost::unordered_set<std::size_t> m_sourceTerminalSet;  // set returned by GetSourceTerminalSet()
+};
+}  // Syntax
+}  // Moses

mosesdecoder/moses/Syntax/SHyperedgeBundle.h ADDED Viewed

	@@ -0,0 +1,31 @@

+#pragma once
+#include <vector>
+#include "moses/ScoreComponentCollection.h"
+#include "moses/TargetPhraseCollection.h"
+#include "SVertexStack.h"
+namespace Moses
+{
+namespace Syntax
+{
+struct PVertex;
+struct SHyperedgeBundle {
+  float inputWeight;
+  std::vector<const SVertexStack*> stacks;
+  TargetPhraseCollection::shared_ptr translations;
+  // Member-wise exchange of two bundles.  std::swap is made visible so the
+  // unqualified calls can still pick a more specific overload through
+  // argument-dependent lookup.
+  friend void swap(SHyperedgeBundle &lhs, SHyperedgeBundle &rhs) {
+    using std::swap;
+    swap(lhs.translations, rhs.translations);
+    swap(lhs.stacks, rhs.stacks);
+    swap(lhs.inputWeight, rhs.inputWeight);
+  }
+};
+}  // Syntax
+}  // Moses

mosesdecoder/moses/Syntax/SVertexRecombinationHasher.h ADDED Viewed

	@@ -0,0 +1,26 @@

+#pragma once
+#include "moses/FF/FFState.h"
+#include "SVertex.h"
+namespace Moses
+{
+namespace Syntax
+{
+class SVertexRecombinationHasher
+{
+public:
+  // Folds the hash() of every FFState held by the vertex, in order, into a
+  // single value with boost::hash_combine, starting from a seed of zero.
+  std::size_t operator()(const SVertex *v) const {
+    const std::vector<FFState*> &states = v->states;
+    std::size_t combined = 0;
+    for (std::size_t i = 0; i < states.size(); ++i) {
+      boost::hash_combine(combined, states[i]->hash());
+    }
+    return combined;
+  }
+};
+}  // Syntax
+}  // Moses

mosesdecoder/moses/TranslationModel/CompactPT/BlockHashIndex.cpp ADDED Viewed

	@@ -0,0 +1,424 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include "ThrowingFwrite.h"
+#include "BlockHashIndex.h"
+#include "CmphStringVectorAdapter.h"
+#include "util/exception.hh"
+#include "util/string_stream.hh"
+#include <stdint.h>
+#ifdef HAVE_CMPH
+#include "cmph.h"
+#endif
+namespace Moses
+{
+#ifdef WITH_THREADS
+// Creates an empty index.  orderBits and fingerPrintBits are stored as-is;
+// threadsNum is handed to the internal thread pool.  Terminates the process
+// if the binary was built without CMPH support.
+BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits,
+                               size_t threadsNum)
+  : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
+    m_fileHandle(0), m_fileHandleStart(0), m_landmarks(true), m_size(0),
+    m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0),
+    m_threadPool(threadsNum)
+{
+#ifndef HAVE_CMPH
+  std::cerr << "minphr: CMPH support not compiled in." << std::endl;
+  exit(1);
+#endif
+}
+#else
+// Single-threaded variant of the constructor above.  m_landmarks is
+// constructed with the same argument as in the threaded build so that both
+// configurations start from an identically initialised landmark vector.
+BlockHashIndex::BlockHashIndex(size_t orderBits, size_t fingerPrintBits)
+  : m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
+    m_fileHandle(0), m_fileHandleStart(0), m_landmarks(true), m_size(0),
+    m_lastSaved(-1), m_lastDropped(-1), m_numLoadedRanges(0)
+{
+#ifndef HAVE_CMPH
+  std::cerr << "minphr: CMPH support not compiled in." << std::endl;
+  exit(1);
+#endif
+}
+#endif
+// Releases every range that is still resident: the cmph hash function and
+// the packed (order, fingerprint) array of each slot.
+BlockHashIndex::~BlockHashIndex()
+{
+#ifdef HAVE_CMPH
+  for(size_t r = 0; r < m_hashes.size(); ++r) {
+    if(m_hashes[r] != 0)
+      cmph_destroy((cmph_t*)m_hashes[r]);
+  }
+  for(size_t r = 0; r < m_arrays.size(); ++r) {
+    // deleting a null pointer is a no-op, so dropped ranges need no test
+    delete m_arrays[r];
+  }
+#endif
+}
+// Looks up key in the whole index.  The range is chosen as the last landmark
+// that is not greater than key; the result is that range's base offset
+// (range index * 2^m_orderBits) plus the position found inside the range.
+// Returns GetSize() when the key sorts before every landmark or is not found
+// in its range.
+size_t BlockHashIndex::GetHash(const char* key)
+{
+  const std::string keyStr(key);
+  // Number of landmarks that compare <= key.
+  const size_t numNotGreater =
+    std::distance(m_landmarks.begin(),
+                  std::upper_bound(m_landmarks.begin(),
+                                   m_landmarks.end(), keyStr));
+  // Test before subtracting instead of comparing against "0ul-1": that
+  // constant only equals size_t(-1) where unsigned long is as wide as size_t.
+  if(numNotGreater == 0)
+    return GetSize();
+  const size_t i = numNotGreater - 1;
+  const size_t pos = GetHash(i, key);
+  if(pos == GetSize())
+    return GetSize();
+  // size_t(1) keeps the shift at full size_t width on every platform.
+  return (size_t(1) << m_orderBits) * i + pos;
+}
+// Returns the fingerprint of key: its 32-bit MurmurHash3 value (seed 100000)
+// reduced to the lowest m_fingerPrintBits bits.
+size_t BlockHashIndex::GetFprint(const char* key) const
+{
+  // MurmurHash3_x86_32 produces exactly 32 bits, so it must be given a
+  // 32-bit destination.  Writing into a wider, uninitialised size_t leaves
+  // the remaining bytes indeterminate.
+  uint32_t hash32 = 0;
+  MurmurHash3_x86_32(key, std::strlen(key), 100000, &hash32);
+  size_t hash = hash32;
+  // A mask of 32 or more bits keeps the whole value, so skip it; this also
+  // avoids shifting by an amount as large as the type.
+  if(m_fingerPrintBits < 32)
+    hash &= (size_t(1) << m_fingerPrintBits) - 1;
+  return hash;
+}
+// Looks up key inside range i.  The cmph function of the range yields a slot;
+// the (order, fingerprint) pair stored in that slot is accepted only if its
+// fingerprint matches the key's.  Returns the stored order on a match and
+// GetSize() otherwise.  The range's clock stamp is refreshed on every call.
+// This function does not load the range and takes no lock.
+size_t BlockHashIndex::GetHash(size_t i, const char* key)
+{
+#ifdef HAVE_CMPH
+  size_t idx = cmph_search((cmph_t*)m_hashes[i], key, (cmph_uint32) strlen(key));
+#else
+  assert(0);
+  size_t idx = 0;
+#endif
+  const std::pair<size_t, size_t> entry =
+    m_arrays[i]->Get(idx, m_orderBits, m_fingerPrintBits);
+  m_clocks[i] = clock();
+  return (GetFprint(key) == entry.second) ? entry.first : GetSize();
+}
+// Convenience overloads: each one forwards to GetHash(const char*).
+size_t BlockHashIndex::GetHash(std::string key)
+{
+  const char* rawKey = key.c_str();
+  return GetHash(rawKey);
+}
+size_t BlockHashIndex::operator[](std::string key)
+{
+  return GetHash(key.c_str());
+}
+size_t BlockHashIndex::operator[](char* key)
+{
+  const char* rawKey = key;
+  return GetHash(rawKey);
+}
+// Writes the whole index to the file called filename and returns the value
+// reported by Save(std::FILE*).  Throws if the file cannot be opened.
+size_t BlockHashIndex::Save(std::string filename)
+{
+  // The index is raw binary data, so the stream must not be in text mode.
+  std::FILE* mphf = std::fopen(filename.c_str(), "wb");
+  UTIL_THROW_IF2(mphf == NULL,
+                 "Could not open " << filename << " for writing");
+  size_t size = 0;
+  try {
+    size = Save(mphf);
+  } catch(...) {
+    // do not leak the handle when a write fails
+    std::fclose(mphf);
+    throw;
+  }
+  std::fclose(mphf);
+  return size;
+}
+// Starts an incremental save into mphf: writes the two bit-width settings,
+// remembers the stream position that follows them as the base for all
+// relative offsets, and writes a zero in the slot that FinalizeSave() later
+// overwrites with the relative position of the index section.
+void BlockHashIndex::BeginSave(std::FILE * mphf)
+{
+  m_fileHandle = mphf;
+  ThrowingFwrite(&m_orderBits, sizeof(size_t), 1, mphf);
+  ThrowingFwrite(&m_fingerPrintBits, sizeof(size_t), 1, mphf);
+  m_fileHandleStart = std::ftell(mphf);
+  const size_t placeholder = 0;
+  ThrowingFwrite(&placeholder, sizeof(size_t), 1, mphf);
+}
+// Appends range i (cmph function followed by its packed array) to the open
+// file and records the range's offset relative to m_fileHandleStart in
+// m_seekIndex.  Does nothing without CMPH support.
+void BlockHashIndex::SaveRange(size_t i)
+{
+#ifdef HAVE_CMPH
+  if(i >= m_seekIndex.size())
+    m_seekIndex.resize(i+1);
+  const size_t here = std::ftell(m_fileHandle);
+  m_seekIndex[i] = here - m_fileHandleStart;
+  cmph_dump((cmph_t*)m_hashes[i], m_fileHandle);
+  m_arrays[i]->Save(m_fileHandle);
+#endif
+}
+// Saves finished ranges strictly in index order: as long as the entry at the
+// top of the queue is the range that directly follows the last saved one,
+// it is popped and written.  Queue entries are negated range indices.
+void BlockHashIndex::SaveLastRange()
+{
+#ifdef WITH_THREADS
+  boost::mutex::scoped_lock lock(m_mutex);
+#endif
+  while(!m_queue.empty()) {
+    if(!(m_lastSaved + 1 == -m_queue.top()))
+      break;
+    size_t next = -m_queue.top();
+    m_queue.pop();
+    SaveRange(next);
+    m_lastSaved = next;
+  }
+}
+// Frees the in-memory data of range i (hash function, packed array, clock
+// stamp) and decrements the loaded-range counter.  Does nothing without CMPH
+// support.
+void BlockHashIndex::DropRange(size_t i)
+{
+#ifdef HAVE_CMPH
+  void *&hashSlot = m_hashes[i];
+  if(hashSlot != 0) {
+    cmph_destroy((cmph_t*)hashSlot);
+    hashSlot = 0;
+  }
+  if(m_arrays[i] != 0) {
+    delete m_arrays[i];
+    m_arrays[i] = 0;
+    m_clocks[i] = 0;
+  }
+  --m_numLoadedRanges;
+#endif
+}
+// Drops every range up to and including the last saved one that has not
+// been dropped yet.
+void BlockHashIndex::DropLastRange()
+{
+#ifdef WITH_THREADS
+  boost::mutex::scoped_lock lock(m_mutex);
+#endif
+  while(m_lastDropped != m_lastSaved) {
+    ++m_lastDropped;
+    DropRange(m_lastDropped);
+  }
+}
+#ifdef WITH_THREADS
+void BlockHashIndex::WaitAll()  // only compiled WITH_THREADS
+{
+  m_threadPool.Stop(true);  // presumably blocks until queued tasks have finished - confirm against the pool's Stop()
+}
+#endif
+// Completes an incremental save started with BeginSave(): writes any ranges
+// still queued, patches the placeholder written by BeginSave() with the
+// relative position of the index section, then appends the landmarks, the
+// seek index and the total size.  Returns the number of bytes the index
+// occupies in the file, including the two leading bit-width fields.
+size_t BlockHashIndex::FinalizeSave()
+{
+#ifdef WITH_THREADS
+  m_threadPool.Stop(true);
+#endif
+  SaveLastRange();
+  size_t relIndexPos = std::ftell(m_fileHandle) - m_fileHandleStart;
+  // Go back and fill in the placeholder, then return to the end of the data.
+  UTIL_THROW_IF2(std::fseek(m_fileHandle, m_fileHandleStart, SEEK_SET) != 0,
+                 "Could not seek to the start of the hash index");
+  ThrowingFwrite(&relIndexPos, sizeof(size_t), 1, m_fileHandle);
+  UTIL_THROW_IF2(std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos,
+                            SEEK_SET) != 0,
+                 "Could not seek to the end of the hash ranges");
+  m_landmarks.save(m_fileHandle);
+  size_t seekIndexSize = m_seekIndex.size();
+  ThrowingFwrite(&seekIndexSize, sizeof(size_t), 1, m_fileHandle);
+  // &m_seekIndex[0] is only valid when there is at least one element.
+  if(seekIndexSize > 0)
+    ThrowingFwrite(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle);
+  ThrowingFwrite(&m_size, sizeof(size_t), 1, m_fileHandle);
+  size_t fileHandleStop = std::ftell(m_fileHandle);
+  return fileHandleStop - m_fileHandleStart + sizeof(m_orderBits)
+         + sizeof(m_fingerPrintBits);
+}
+// Writes the complete index to mphf in one go: header, every range in index
+// order, then the trailing index section.  The pending-range queue is reset
+// first.  Returns the value reported by FinalizeSave().
+size_t BlockHashIndex::Save(std::FILE * mphf)
+{
+  m_queue = std::priority_queue<int>();
+  BeginSave(mphf);
+  const size_t numRanges = m_hashes.size();
+  for(size_t range = 0; range != numRanges; ++range)
+    SaveRange(range);
+  return FinalizeSave();
+}
+// Reads exactly count items of the given size from in into dst and throws
+// if the stream delivers fewer (truncated or unreadable file).
+static void ReadIndexItems(void* dst, size_t size, size_t count, std::FILE* in)
+{
+  const size_t got = count ? std::fread(dst, size, count, in) : 0;
+  UTIL_THROW_IF2(got != count,
+                 "Short read in hash index: expected " << count
+                 << " item(s), got " << got);
+}
+// Reads the header and the index section (landmarks, seek index, total
+// size) from mphf, and sizes the per-range vectors to match, leaving every
+// range unloaded.  Returns the distance between the stream position on entry
+// and on exit.  Throws if the file ends before all fields have been read.
+size_t BlockHashIndex::LoadIndex(std::FILE* mphf)
+{
+  m_fileHandle = mphf;
+  size_t beginning = std::ftell(mphf);
+  ReadIndexItems(&m_orderBits, sizeof(size_t), 1, mphf);
+  ReadIndexItems(&m_fingerPrintBits, sizeof(size_t), 1, mphf);
+  m_fileHandleStart = std::ftell(m_fileHandle);
+  size_t relIndexPos = 0;
+  ReadIndexItems(&relIndexPos, sizeof(size_t), 1, mphf);
+  // The index section is stored after the ranges; jump straight to it.
+  UTIL_THROW_IF2(std::fseek(m_fileHandle, m_fileHandleStart + relIndexPos,
+                            SEEK_SET) != 0,
+                 "Could not seek to the hash index section");
+  m_landmarks.load(mphf);
+  size_t seekIndexSize = 0;
+  ReadIndexItems(&seekIndexSize, sizeof(size_t), 1, m_fileHandle);
+  m_seekIndex.resize(seekIndexSize);
+  // &m_seekIndex[0] is only valid when there is at least one element.
+  if(seekIndexSize > 0)
+    ReadIndexItems(&m_seekIndex[0], sizeof(size_t), seekIndexSize, m_fileHandle);
+  m_hashes.resize(seekIndexSize, 0);
+  m_clocks.resize(seekIndexSize, 0);
+  m_arrays.resize(seekIndexSize, 0);
+  ReadIndexItems(&m_size, sizeof(size_t), 1, m_fileHandle);
+  size_t end = std::ftell(mphf);
+  return end - beginning;
+}
+// Reads range i from the open file at the offset recorded in m_seekIndex:
+// first the cmph function, then the packed array.  Stamps the range with the
+// current clock value and increments the loaded-range counter.  Does nothing
+// without CMPH support.
+void BlockHashIndex::LoadRange(size_t i)
+{
+#ifdef HAVE_CMPH
+  std::fseek(m_fileHandle, m_fileHandleStart + m_seekIndex[i], SEEK_SET);
+  cmph_t* loadedHash = cmph_load(m_fileHandle);
+  PairedPackedArray<> *loadedArray =
+    new PairedPackedArray<>(0, m_orderBits, m_fingerPrintBits);
+  m_arrays[i] = loadedArray;
+  loadedArray->Load(m_fileHandle);
+  m_hashes[i] = (void*)loadedHash;
+  m_clocks[i] = clock();
+  ++m_numLoadedRanges;
+#endif
+}
+// Loads the whole index from the file called filename and returns the value
+// reported by Load(std::FILE*).  Throws if the file cannot be opened.
+size_t BlockHashIndex::Load(std::string filename)
+{
+  // The index is raw binary data, so the stream must not be in text mode.
+  std::FILE* mphf = std::fopen(filename.c_str(), "rb");
+  UTIL_THROW_IF2(mphf == NULL,
+                 "Could not open " << filename << " for reading");
+  size_t size = 0;
+  try {
+    size = Load(mphf);
+  } catch(...) {
+    // do not leak the handle when reading fails
+    std::fclose(mphf);
+    throw;
+  }
+  std::fclose(mphf);
+  return size;
+}
+// Loads the index section and then every range from mphf.  The stream is
+// left where LoadIndex() stopped, i.e. after the index section.  Returns the
+// byte count reported by LoadIndex().
+size_t BlockHashIndex::Load(std::FILE * mphf)
+{
+  const size_t byteSize = LoadIndex(mphf);
+  const size_t resumeAt = std::ftell(mphf);
+  size_t range = 0;
+  while(range < m_seekIndex.size()) {
+    LoadRange(range);
+    ++range;
+  }
+  std::fseek(m_fileHandle, resumeAt, SEEK_SET);
+  return byteSize;
+}
+size_t BlockHashIndex::GetSize() const  // returns m_size; the GetHash overloads also return this value when a key is not found
+{
+  return m_size;
+}
+void BlockHashIndex::KeepNLastRanges(float ratio, float tolerance)  // currently a no-op: the whole body below is commented out and both parameters are unused
+{
+  /*
+  #ifdef WITH_THREADS
+  boost::mutex::scoped_lock lock(m_mutex);
+  #endif
+  size_t n = m_hashes.size() * ratio;
+  size_t max = n * (1 + tolerance);
+  if(m_numLoadedRanges > max) {
+    typedef std::vector<std::pair<clock_t, size_t> > LastLoaded;
+    LastLoaded lastLoaded;
+    for(size_t i = 0; i < m_hashes.size(); i++)
+      if(m_hashes[i] != 0)
+        lastLoaded.push_back(std::make_pair(m_clocks[i], i));
+    std::sort(lastLoaded.begin(), lastLoaded.end());
+    for(LastLoaded::reverse_iterator it = lastLoaded.rbegin() + size_t(n * (1 - tolerance));
+        it != lastLoaded.rend(); it++)
+      DropRange(it->second);
+  }*/
+}
+void BlockHashIndex::CalcHash(size_t current, void* source_void)  // builds the hash function and packed array for range `current` from a cmph key source, then queues the range for saving
+{
+#ifdef HAVE_CMPH
+  cmph_io_adapter_t* source = (cmph_io_adapter_t*) source_void;  // caller passes the adapter as an untyped pointer
+  cmph_config_t *config = cmph_config_new(source);
+  cmph_config_set_algo(config, CMPH_CHD);  // select the CHD algorithm
+  cmph_t* hash = cmph_new(config);  // NOTE(review): result is used below without a NULL check - confirm cmph_new cannot fail here
+  PairedPackedArray<> *pv =
+    new PairedPackedArray<>(source->nkeys, m_orderBits, m_fingerPrintBits);
+  size_t i = 0;  // position of the current key within this range
+  source->rewind(source->data);  // restart the key source before the second pass over the keys
+  std::string lastKey = "";
+  while(i < source->nkeys) {
+    unsigned keylen;
+    char* key;
+    source->read(source->data, &key, &keylen);
+    std::string temp(key, keylen);  // copy the key before handing the buffer back
+    source->dispose(source->data, key, keylen);
+    if(lastKey > temp) {  // keys must arrive in non-decreasing byte order
+      if(source->nkeys != 2 || temp != "###DUMMY_KEY###") {  // only tolerated exception: a two-key source whose out-of-order key is the dummy key
+        util::StringStream strme;
+        strme << "ERROR: Input file does not appear to be sorted with  LC_ALL=C sort\n";
+        strme << "1: " << lastKey << "\n";
+        strme << "2: " << temp << "\n";
+        UTIL_THROW2(strme.str());
+      }
+    }
+    lastKey = temp;
+    size_t fprint = GetFprint(temp.c_str());
+    size_t idx = cmph_search(hash, temp.c_str(),
+                             (cmph_uint32) temp.size());
+    pv->Set(idx, i, fprint, m_orderBits, m_fingerPrintBits);  // store (position, fingerprint) in the slot the hash assigns to this key
+    i++;
+  }
+  cmph_config_destroy(config);
+#ifdef WITH_THREADS
+  boost::mutex::scoped_lock lock(m_mutex);  // held for the updates of the member containers below
+#endif
+  if(m_hashes.size() <= current) {  // grow all three per-range vectors together
+    m_hashes.resize(current + 1, 0);
+    m_arrays.resize(current + 1, 0);
+    m_clocks.resize(current + 1, 0);
+  }
+  m_hashes[current] = (void*)hash;
+  m_arrays[current] = pv;
+  m_clocks[current] = clock();
+  m_queue.push(-current);  // stored negated; SaveLastRange() negates it again when reading the top
+#endif
+}
+#ifdef HAVE_CMPH
+void* BlockHashIndex::vectorAdapter(std::vector<std::string>& v)  // wraps v with CmphVectorAdapter and returns the result as an untyped pointer
+{
+  return (void*)CmphVectorAdapter(v);  // NOTE(review): who releases the adapter is not visible here - confirm with callers
+}
+void* BlockHashIndex::vectorAdapter(StringVector<unsigned, size_t, std::allocator>& sv)  // same, for a StringVector using std::allocator
+{
+  return (void*)CmphStringVectorAdapter(sv);
+}
+void* BlockHashIndex::vectorAdapter(StringVector<unsigned, size_t, MmapAllocator>& sv)  // same, for a StringVector using MmapAllocator
+{
+  return (void*)CmphStringVectorAdapter(sv);
+}
+#endif
+}

mosesdecoder/moses/TranslationModel/CompactPT/ConsistentPhrases.h ADDED Viewed

	@@ -0,0 +1,112 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_ConsistentPhrases_h
+#define moses_ConsistentPhrases_h
+#include <set>
+namespace Moses
+{
+// Enumerates all sub-phrase pairs of an mmax x nmax phrase pair that are
+// consistent with a word alignment, and hands them out in a fixed priority
+// order.  The full phrase pair itself is never included.
+class ConsistentPhrases
+{
+public:
+  // A sub-phrase pair: source span [i, i+m) and target span [j, j+n).
+  // Note the constructor's argument order: (i, m, j, n).
+  struct Phrase {
+    int i, j, m, n;
+    Phrase(int i_, int m_, int j_, int n_) : i(i_), j(j_), m(m_), n(n_) { }
+  };
+  // Strict weak ordering used by the queue: larger n first, then smaller j,
+  // then larger m, then smaller i.  The call operator is const so that the
+  // comparator can be invoked through a const std::set.
+  struct PhraseSorter {
+    bool operator()(const Phrase &a, const Phrase &b) const {
+      if(a.n != b.n)
+        return a.n > b.n;
+      if(a.j != b.j)
+        return a.j < b.j;
+      if(a.m != b.m)
+        return a.m > b.m;
+      return a.i < b.i;
+    }
+  };
+private:
+  typedef std::set<Phrase, PhraseSorter> PhraseQueue;
+  PhraseQueue m_phraseQueue;
+  typedef std::pair<unsigned char, unsigned char> AlignPoint;
+  typedef std::set<AlignPoint> Alignment;
+  // A span pair is consistent if every alignment point lies either inside
+  // both spans or outside both spans.
+  static bool IsConsistent(const Alignment &a, int i, int m, int j, int n) {
+    for(Alignment::const_iterator it = a.begin(); it != a.end(); ++it) {
+      const int ip = it->first;
+      const int jp = it->second;
+      const bool inSource = (i <= ip && ip < i+m);
+      const bool inTarget = (j <= jp && jp < j+n);
+      if(inSource != inTarget)
+        return false;
+    }
+    return true;
+  }
+  // True if the two phrases share a source position or a target position.
+  static bool Overlaps(const Phrase &p, const Phrase &pp) {
+    return (p.i <= pp.i && pp.i < p.i + p.m) || (pp.i <= p.i && p.i < pp.i + pp.m) ||
+           (p.j <= pp.j && pp.j < p.j + p.n) || (pp.j <= p.j && p.j < pp.j + pp.n);
+  }
+public:
+  // Collects every consistent span pair for a source length of mmax and a
+  // target length of nmax under alignment a (pairs of source/target index).
+  ConsistentPhrases(int mmax, int nmax, const Alignment& a) {
+    for(int i = 0; i < mmax; i++) {
+      for(int m = 1; m <= mmax-i; m++) {
+        for(int j = 0; j < nmax; j++) {
+          for(int n = 1; n <= nmax-j; n++) {
+            if(IsConsistent(a, i, m, j, n))
+              m_phraseQueue.insert(Phrase(i, m, j, n));
+          }
+        }
+      }
+    }
+    // The span pair covering everything is not a proper sub-phrase.
+    m_phraseQueue.erase(Phrase(0, mmax, 0, nmax));
+  }
+  // True if no phrase is left in the queue.
+  bool Empty() const {
+    return m_phraseQueue.empty();
+  }
+  // Removes and returns the first phrase in queue order, or Phrase(0,0,0,0)
+  // if the queue is empty.
+  Phrase Pop() {
+    if(m_phraseQueue.empty())
+      return Phrase(0,0,0,0);
+    Phrase p = *m_phraseQueue.begin();
+    m_phraseQueue.erase(m_phraseQueue.begin());
+    return p;
+  }
+  // Removes every queued phrase that overlaps p on the source or the target
+  // side.  Elements are erased in place instead of copying the survivors.
+  void RemoveOverlap(Phrase p) {
+    for(PhraseQueue::iterator it = m_phraseQueue.begin(); it != m_phraseQueue.end(); ) {
+      if(Overlaps(p, *it))
+        m_phraseQueue.erase(it++);  // advance first: erase invalidates only the erased iterator
+      else
+        ++it;
+    }
+  }
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/Jamfile ADDED Viewed

	@@ -0,0 +1,17 @@

+local current = "" ;  # option string passed to update-if-changed below; stays empty without cmph
+local includes = ;
+local with-cmph = [ option.get "with-cmph" ] ;  # value of the with-cmph build option
+if $(with-cmph) {
+  lib cmph : : <search>$(with-cmph)/lib <search>$(with-cmph)/lib64 ;  # search lib/ and lib64/ under the given prefix
+  includes += <include>$(with-cmph)/include ;
+  current = "--with-cmph=$(with-cmph)" ;
+  fakelib CompactPT : [ glob *.cpp ] ../..//headers cmph : $(includes) <dependency>$(PT-LOG) : : $(includes) ;  # NOTE(review): PT-LOG is declared further down - confirm it is defined when this line is evaluated
+}
+else {
+  alias cmph ;  # empty target so that references to cmph still resolve
+  fakelib CompactPT ;  # no sources: nothing from this directory is built without cmph
+}
+path-constant PT-LOG : bin/pt.log ;
+update-if-changed $(PT-LOG) $(current) ;  # presumably rewrites the log when the option string differs - confirm against the rule's definition

mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.cpp ADDED Viewed

	@@ -0,0 +1,450 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include <sstream>
+#include "LexicalReorderingTableCreator.h"
+#include "ThrowingFwrite.h"
+#include "moses/Util.h"
+#include "util/file.hh"
+#include "util/exception.hh"
+namespace Moses
+{
+// Builds the compact reordering table: the whole conversion runs inside the
+// constructor.  It reads the text table at inPath and writes the binary
+// table to outPath in two passes (index + score counting, then score
+// compression).  If tempfilePath is non-empty, the intermediate score
+// vectors are created from temporary files made with that path.
+// Throws if the output file cannot be opened.
+LexicalReorderingTableCreator::LexicalReorderingTableCreator(
+  std::string inPath, std::string outPath, std::string tempfilePath,
+  size_t orderBits, size_t fingerPrintBits, bool multipleScoreTrees,
+  size_t quantize
+#ifdef WITH_THREADS
+  , size_t threads
+#endif
+)
+  : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath),
+    m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits),
+    m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees),
+    m_quantize(quantize), m_separator(" ||| "),
+    m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1)
+#ifdef WITH_THREADS
+    , m_threads(threads)
+#endif
+{
+  PrintInfo();
+  // The table is raw binary data, so the stream must not be in text mode,
+  // and it must be usable before the first write below.
+  m_outFile = std::fopen(m_outPath.c_str(), "wb");
+  UTIL_THROW_IF2(m_outFile == NULL,
+                 "Could not open " << m_outPath << " for writing");
+  std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl;
+  m_hash.BeginSave(m_outFile);
+  if(tempfilePath.size()) {
+    MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath));
+    m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded);
+  } else {
+    m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(true);
+  }
+  EncodeScores();
+  std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
+  CalcHuffmanCodes();
+  std::cerr << "Pass 2/2: Compressing scores" << std::endl;
+  if(tempfilePath.size()) {
+    MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath));
+    m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed);
+  } else {
+    m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(true);
+  }
+  CompressScores();
+  std::cerr << "Saving to " << m_outPath << std::endl;
+  Save();
+  std::cerr << "Done" << std::endl;
+  std::fclose(m_outFile);
+}
+void LexicalReorderingTableCreator::PrintInfo()
+{
+  std::cerr << "Used options:" << std::endl;
+  std::cerr << "\tText reordering table will be read from: " << m_inPath << std::endl;
+  std::cerr << "\tOutput reordering table will be written to: " << m_outPath << std::endl;
+  std::cerr << "\tStep size for source landmark phrases: 2^" << m_orderBits << "=" << (1ul << m_orderBits) << std::endl;
+  std::cerr << "\tPhrase fingerprint size: " << m_fingerPrintBits << " bits / P(fp)=" << (float(1)/(1ul << m_fingerPrintBits)) << std::endl;
+  std::cerr << "\tSingle Huffman code set for score components: " << (m_multipleScoreTrees ? "no" : "yes") << std::endl;
+  std::cerr << "\tUsing score quantization: ";
+  if(m_quantize)
+    std::cerr << m_quantize << " best" << std::endl;
+  else
+    std::cerr << "no" << std::endl;
+#ifdef WITH_THREADS
+  std::cerr << "\tRunning with " << m_threads << " threads" << std::endl;
+#endif
+  std::cerr << std::endl;
+}
+// Frees the Huffman trees and score counters (one pair per tree slot) and
+// the two score vectors allocated by the constructor.
+LexicalReorderingTableCreator::~LexicalReorderingTableCreator()
+{
+  size_t slot = 0;
+  while(slot < m_scoreTrees.size()) {
+    delete m_scoreTrees[slot];
+    delete m_scoreCounters[slot];
+    ++slot;
+  }
+  delete m_encodedScores;
+  delete m_compressedScores;
+}
+// Pass 1: runs the encoding task over the input table - on m_threads
+// threads when built WITH_THREADS, otherwise in the calling thread - and
+// then force-flushes whatever is left in the queue.
+void LexicalReorderingTableCreator::EncodeScores()
+{
+  InputFileStream inFile(m_inPath);
+#ifdef WITH_THREADS
+  boost::thread_group threads;
+  for (size_t i = 0; i < m_threads; ++i) {
+    // create_thread() takes its callable by value, so a local task object is
+    // enough; allocating it with new would leak one task per thread.
+    EncodingTaskReordering task(inFile, *this);
+    threads.create_thread(task);
+  }
+  threads.join_all();
+#else
+  EncodingTaskReordering task(inFile, *this);
+  task();
+#endif
+  FlushEncodedQueue(true);
+}
+// Builds one Huffman tree per score counter, in the same slot order.  When
+// quantization is enabled each counter is quantized first.
+void LexicalReorderingTableCreator::CalcHuffmanCodes()
+{
+  for(size_t slot = 0; slot < m_scoreCounters.size(); ++slot) {
+    ScoreCounter *counter = m_scoreCounters[slot];
+    if(m_quantize)
+      counter->Quantize(m_quantize);
+    std::cerr << "\tCreating Huffman codes for " << counter->Size()
+              << " scores" << std::endl;
+    m_scoreTrees[slot] = new ScoreTree(counter->Begin(), counter->End());
+  }
+  std::cerr << std::endl;
+}
+// Pass 2: runs the compression task over the encoded scores - on m_threads
+// threads when built WITH_THREADS, otherwise in the calling thread - and
+// then force-flushes whatever is left in the queue.
+void LexicalReorderingTableCreator::CompressScores()
+{
+#ifdef WITH_THREADS
+  boost::thread_group threads;
+  for (size_t i = 0; i < m_threads; ++i) {
+    // create_thread() takes its callable by value, so a local task object is
+    // enough; allocating it with new would leak one task per thread.
+    CompressionTaskReordering task(*m_encodedScores, *this);
+    threads.create_thread(task);
+  }
+  threads.join_all();
+#else
+  CompressionTaskReordering task(*m_encodedScores, *this);
+  task();
+#endif
+  FlushCompressedQueue(true);
+}
+// Appends the score section to the output file: number of score components,
+// the multiple-trees flag, every Huffman tree in slot order, and finally the
+// compressed scores.
+void LexicalReorderingTableCreator::Save()
+{
+  ThrowingFwrite(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, m_outFile);
+  ThrowingFwrite(&m_multipleScoreTrees, sizeof(m_multipleScoreTrees), 1, m_outFile);
+  for(std::vector<ScoreTree*>::iterator tree = m_scoreTrees.begin();
+      tree != m_scoreTrees.end(); ++tree) {
+    (*tree)->Save(m_outFile);
+  }
+  m_compressedScores->save(m_outFile);
+}
+// Builds the lookup key for a table entry: the source followed by the
+// separator and, if the target is non-empty, the target followed by the
+// separator as well.
+std::string LexicalReorderingTableCreator::MakeSourceTargetKey(std::string &source, std::string &target)
+{
+  std::string key(source);
+  key += m_separator;
+  if(target.empty())
+    return key;
+  key += target;
+  key += m_separator;
+  return key;
+}
+// Converts the score field of one table line (the last token) into a string
+// holding the raw bytes of the transformed, floored float scores, and counts
+// every score in the matching score counter.  The first call fixes the
+// number of score components and allocates the counters and tree slots; a
+// later line with a different number of scores raises an exception.
+// NOTE(review): this function updates shared members and takes no lock
+// itself - confirm how callers serialise access in threaded builds.
+std::string LexicalReorderingTableCreator::EncodeLine(std::vector<std::string>& tokens)
+{
+  std::string scoresString = tokens.back();
+  std::vector<float> scores;
+  Tokenize<float>(scores, scoresString);
+  if(!m_numScoreComponent) {
+    m_numScoreComponent = scores.size();
+    const size_t numSlots = m_multipleScoreTrees ? m_numScoreComponent : 1;
+    m_scoreCounters.resize(numSlots);
+    for(size_t slot = 0; slot < m_scoreCounters.size(); ++slot)
+      m_scoreCounters[slot] = new ScoreCounter();
+    m_scoreTrees.resize(numSlots);
+  }
+  if(m_numScoreComponent != scores.size()) {
+    std::stringstream strme;
+    strme << "Error: Wrong number of scores detected ("
+          << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
+    strme << "Line: " << tokens[0] << " ||| ... ||| " << scoresString << std::endl;
+    UTIL_THROW2(strme.str());
+  }
+  std::stringstream scoresStream;
+  for(size_t c = 0; c < m_numScoreComponent; ++c) {
+    float score = FloorScore(TransformScore(scores[c]));
+    scoresStream.write((char*)&score, sizeof(score));
+    // with a single shared tree every component is counted in slot 0
+    m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
+  }
+  return scoresStream.str();
+}
+void LexicalReorderingTableCreator::AddEncodedLine(PackedItem& pi)  // queues one encoded line; FlushEncodedQueue() later consumes the queue in line order
+{
+  m_queue.push(pi);  // takes no lock itself; the visible caller holds its mutex around this call
+}
+void LexicalReorderingTableCreator::FlushEncodedQueue(bool force)  // moves queued lines, in line-number order, into the encoded-score vector and the hash index; force=true also finalises the index
+{
+  if(force || m_queue.size() > 10000) {  // without force, wait until enough items have piled up
+    while(!m_queue.empty() && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {  // only consume the item that directly follows the last flushed line
+      PackedItem pi = m_queue.top();
+      m_queue.pop();
+      m_lastFlushedLine++;
+      m_lastRange.push_back(pi.GetSrc());  // key for the hash range currently being collected
+      m_encodedScores->push_back(pi.GetTrg());  // encoded scores, stored at the same position as the key
+      if((pi.GetLine()+1) % 100000 == 0)  // progress dot every 100000 lines
+        std::cerr << ".";
+      if((pi.GetLine()+1) % 5000000 == 0)  // running line count every 5000000 lines
+        std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
+      if(m_lastRange.size() == (1ul << m_orderBits)) {  // a full range of 2^m_orderBits keys has been collected
+        m_hash.AddRange(m_lastRange);
+        m_hash.SaveLastRange();
+        m_hash.DropLastRange();
+        m_lastRange.clear();
+      }
+    }
+  }
+  if(force) {
+    m_lastFlushedLine = -1;  // reset the line counter for the next pass
+    if(!m_lastRange.empty()) {  // hand over the final, partially filled range
+      m_hash.AddRange(m_lastRange);
+      m_lastRange.clear();
+    }
+#ifdef WITH_THREADS
+    m_hash.WaitAll();
+#endif
+    m_hash.SaveLastRange();
+    m_hash.DropLastRange();
+    m_hash.FinalizeSave();
+    std::cerr << std::endl << std::endl;
+  }
+}
+// Re-encodes a string of raw float scores (as produced by EncodeLine) with
+// the Huffman trees.  Score number k uses tree k modulo the number of trees;
+// with quantization enabled each score is first replaced by the matching
+// counter's LowerBound() value.
+std::string LexicalReorderingTableCreator::CompressEncodedScores(std::string &encodedScores)
+{
+  std::stringstream rawScores(encodedScores);
+  rawScores.unsetf(std::ios::skipws);
+  std::string compressedScores;
+  BitWrapper<> bitOut(compressedScores);
+  float score;
+  for(size_t k = 0; rawScores.read((char*) &score, sizeof(score)); ++k) {
+    const size_t slot = k % m_scoreTrees.size();
+    if(m_quantize)
+      score = m_scoreCounters[slot]->LowerBound(score);
+    m_scoreTrees[slot]->Put(bitOut, score);
+  }
+  return compressedScores;
+}
+void LexicalReorderingTableCreator::AddCompressedScores(PackedItem& pi)  // queues one compressed item; FlushCompressedQueue() later consumes the queue in line order
+{
+  m_queue.push(pi);  // same queue member as used by AddEncodedLine()
+}
+// Moves queued compressed items, in line-number order, into the compressed
+// score vector.  Without force nothing happens until more than 10000 items
+// are queued.  With force the line counter is reset afterwards.
+void LexicalReorderingTableCreator::FlushCompressedQueue(bool force)
+{
+  const bool drain = force || m_queue.size() > 10000;
+  // Only the item that directly follows the last flushed line may leave.
+  while(drain && !m_queue.empty()
+        && m_lastFlushedLine + 1 == m_queue.top().GetLine()) {
+    PackedItem pi = m_queue.top();
+    m_queue.pop();
+    m_lastFlushedLine++;
+    m_compressedScores->push_back(pi.GetTrg());
+    // progress output: a dot every 100000 lines, a count every 5000000
+    if((pi.GetLine()+1) % 100000 == 0)
+      std::cerr << ".";
+    if((pi.GetLine()+1) % 5000000 == 0)
+      std::cerr << "[" << (pi.GetLine()+1) << "]" << std::endl;
+  }
+  if(!force)
+    return;
+  m_lastFlushedLine = -1;
+  std::cerr << std::endl << std::endl;
+}
+//****************************************************************************//
+size_t EncodingTaskReordering::m_lineNum = 0;  // static: number of input lines handed out so far, shared by all task objects
+#ifdef WITH_THREADS
+boost::mutex EncodingTaskReordering::m_mutex;  // locked around AddEncodedLine()/FlushEncodedQueue() in operator()
+boost::mutex EncodingTaskReordering::m_fileMutex;  // locked while reading from the input stream and updating m_lineNum
+#endif
+EncodingTaskReordering::EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator)  // stores both arguments in members; does no work itself
+  : m_inFile(inFile), m_creator(creator) {}
+// Worker loop of the encoding pass.  Repeatedly takes a batch of up to 1000
+// lines from the shared input (under the file mutex), encodes each line
+// outside any lock, then queues the results and triggers a non-forced flush
+// (under the task mutex).  Returns once a batch comes back empty.
+// Each item carries its global line number (first line of the batch + index
+// within the batch).
+void EncodingTaskReordering::operator()()
+{
+  const size_t max_lines = 1000;
+  std::vector<std::string> lines;
+  std::vector<PackedItem> result;
+  for(;;) {
+    size_t lineNum = 0;
+    lines.clear();
+    lines.reserve(max_lines);
+    {
+#ifdef WITH_THREADS
+      boost::mutex::scoped_lock lock(m_fileMutex);
+#endif
+      std::string line;
+      while(lines.size() < max_lines && std::getline(m_inFile, line))
+        lines.push_back(line);
+      lineNum = m_lineNum;
+      m_lineNum += lines.size();
+    }
+    if(lines.empty())
+      break;
+    result.clear();
+    result.reserve(max_lines);
+    for(size_t i = 0; i < lines.size(); i++) {
+      std::vector<std::string> tokens;
+      Moses::TokenizeMultiCharSeparator(tokens, lines[i], m_creator.m_separator);
+      std::string encodedLine = m_creator.EncodeLine(tokens);
+      std::string f = tokens[0];
+      // a target field exists only when the line has more than two fields
+      std::string e;
+      if(tokens.size() > 2)
+        e = tokens[1];
+      result.push_back(PackedItem(lineNum + i,
+                                  m_creator.MakeSourceTargetKey(f, e),
+                                  encodedLine, i));
+    }
+    {
+#ifdef WITH_THREADS
+      boost::mutex::scoped_lock lock(m_mutex);
+#endif
+      for(size_t i = 0; i < result.size(); i++)
+        m_creator.AddEncodedLine(result[i]);
+      m_creator.FlushEncodedQueue();
+    }
+  }
+}
+//****************************************************************************//
+// State shared by all CompressionTaskReordering instances.
+// m_scoresNum: index of the next encoded score record to be claimed.
+size_t CompressionTaskReordering::m_scoresNum = 0;
+#ifdef WITH_THREADS
+// Held while claiming an index and while passing results to the creator.
+boost::mutex CompressionTaskReordering::m_mutex;
+#endif
+// Binds the task to the container of encoded score records it reads from and
+// to the creator that receives the compressed results. Both are stored as
+// references.
+CompressionTaskReordering::CompressionTaskReordering(StringVector<unsigned char, unsigned long,
+    MmapAllocator>& encodedScores,
+    LexicalReorderingTableCreator& creator)
+  : m_encodedScores(encodedScores), m_creator(creator)
+{ }
+// Worker body. Claims record indices from the shared counter one at a time,
+// compresses the corresponding encoded scores without holding the lock, and
+// then queues the result with the creator. Returns once the claimed index is
+// past the end of the encoded score container.
+void CompressionTaskReordering::operator()()
+{
+  size_t current;
+  {
+#ifdef WITH_THREADS
+    boost::mutex::scoped_lock lock(m_mutex);
+#endif
+    current = m_scoresNum++;
+  }
+  while(current < m_encodedScores.size()) {
+    std::string encoded = m_encodedScores[current];
+    std::string compressed = m_creator.CompressEncodedScores(encoded);
+    // No source phrase is recorded for compressed score records.
+    std::string noSource;
+    PackedItem item(current, noSource, compressed, 0);
+    // The lock is held until the end of this iteration, i.e. while queueing
+    // the result and claiming the next index.
+#ifdef WITH_THREADS
+    boost::mutex::scoped_lock lock(m_mutex);
+#endif
+    m_creator.AddCompressedScores(item);
+    m_creator.FlushCompressedQueue();
+    current = m_scoresNum++;
+  }
+}
+}

mosesdecoder/moses/TranslationModel/CompactPT/MmapAllocator.h ADDED Viewed

	@@ -0,0 +1,202 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_MmapAllocator_h
+#define moses_MmapAllocator_h
+#include <limits>
+#include <iostream>
+#include <cstdio>
+#include <unistd.h>
+#if defined(_WIN32) || defined(_WIN64)
+#include <windows.h>
+#include <io.h>
+#else
+#include <sys/mman.h>
+#endif
+#include "util/mmap.hh"
+namespace Moses
+{
+// Allocator that hands out memory obtained by memory-mapping a file instead of
+// using the heap. Two modes exist:
+//  - non-fixed (m_fixed == false): allocate() resizes the backing file to the
+//    requested size and maps it starting at file offset 0;
+//  - fixed (m_fixed == true): allocate() maps an existing region of the file
+//    that starts at m_data_offset; construct() and destroy() do nothing.
+// NOTE(review): copies share m_count, but the counter is incremented only by
+// the copy constructor (not by the implicit copy assignment), is never freed,
+// and is decremented below zero by the last destructor. The ownership scheme
+// should be confirmed before this class is reused or changed.
+template <class T>
+class MmapAllocator
+{
+protected:
+  std::FILE* m_file_ptr;   // backing file
+  size_t m_file_desc;      // result of fileno(m_file_ptr), stored unsigned
+  size_t m_page_size;      // value of util::SizePage() at construction
+  size_t m_map_size;       // byte size of the last allocate() request
+  char* m_data_ptr;        // start of the current mapping, 0 if none
+  size_t m_data_offset;    // file offset of the data (fixed mode)
+  bool m_fixed;            // true: map an existing file region
+  size_t* m_count;         // counter shared between copies
+public:
+  typedef T        value_type;
+  typedef T*       pointer;
+  typedef const T* const_pointer;
+  typedef T&       reference;
+  typedef const T& const_reference;
+  typedef std::size_t    size_type;
+  typedef std::ptrdiff_t difference_type;
+  // Non-fixed allocator backed by a file created with std::tmpfile().
+  // NOTE(review): the result of std::tmpfile() is passed to fileno() without
+  // a null check.
+  MmapAllocator() throw()
+    : m_file_ptr(std::tmpfile()), m_file_desc(fileno(m_file_ptr)),
+      m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
+      m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
+  }
+  // Non-fixed allocator backed by the caller-supplied file.
+  MmapAllocator(std::FILE* f_ptr) throw()
+    : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
+      m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
+      m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
+  }
+  // Fixed allocator: maps the region of f_ptr that starts at data_offset.
+  MmapAllocator(std::FILE* f_ptr, size_t data_offset) throw()
+    : m_file_ptr(f_ptr), m_file_desc(fileno(m_file_ptr)),
+      m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
+      m_data_offset(data_offset), m_fixed(true), m_count(new size_t(0)) {
+  }
+  // Non-fixed allocator backed by the named file, opened with mode "wb+".
+  // NOTE(review): the result of std::fopen() is passed to fileno() without a
+  // null check.
+  MmapAllocator(std::string fileName) throw()
+    : m_file_ptr(std::fopen(fileName.c_str(), "wb+")), m_file_desc(fileno(m_file_ptr)),
+      m_page_size(util::SizePage()), m_map_size(0), m_data_ptr(0),
+      m_data_offset(0), m_fixed(false), m_count(new size_t(0)) {
+  }
+  // Copies share the file, the mapping state and the counter of the original;
+  // the shared counter is incremented.
+  MmapAllocator(const MmapAllocator& c) throw()
+    : m_file_ptr(c.m_file_ptr), m_file_desc(c.m_file_desc),
+      m_page_size(c.m_page_size), m_map_size(c.m_map_size),
+      m_data_ptr(c.m_data_ptr), m_data_offset(c.m_data_offset),
+      m_fixed(c.m_fixed), m_count(c.m_count) {
+    (*m_count)++;
+  }
+  // When this instance holds a mapping and the shared counter is zero, the
+  // mapping is released and, in non-fixed mode, the file is closed (only if
+  // ftell() still succeeds on it). The counter is decremented in every case.
+  // NOTE(review): presumably util::UnmapOrThrow can throw, which would
+  // conflict with the throw() specification -- confirm.
+  ~MmapAllocator() throw() {
+    if(m_data_ptr && *m_count == 0) {
+      util::UnmapOrThrow(m_data_ptr, m_map_size);
+      if(!m_fixed && std::ftell(m_file_ptr) != -1)
+        std::fclose(m_file_ptr);
+    }
+    (*m_count)--;
+  }
+  template <class U>
+  struct rebind {
+    typedef MmapAllocator<U> other;
+  };
+  pointer address (reference value) const {
+    return &value;
+  }
+  const_pointer address (const_reference value) const {
+    return &value;
+  }
+  size_type max_size () const throw() {
+    return std::numeric_limits<size_t>::max() / sizeof(value_type);
+  }
+  // Maps memory for `num` elements and returns a pointer to the first one.
+  // The hint argument is ignored. Each call overwrites m_map_size and
+  // m_data_ptr.
+  pointer allocate (size_type num, const void* = 0) {
+    m_map_size = num * sizeof(T);
+#if defined(_WIN32) || defined(_WIN64)
+    // On Windows, MAP_SHARED is not defined and MapOrThrow ignores the flags.
+    const int map_shared = 0;
+#else
+    const int map_shared = MAP_SHARED;
+#endif
+    if(!m_fixed) {
+      // Resize the backing file to the requested size, then map all of it.
+      // NOTE(review): the ftruncate() result is only added to an otherwise
+      // unused local, so a failed resize goes unnoticed.
+      size_t read = 0;
+      read += ftruncate(m_file_desc, m_map_size);
+      m_data_ptr = (char *)util::MapOrThrow(
+                     m_map_size, true, map_shared, false, m_file_desc, 0);
+      return (pointer)m_data_ptr;
+    } else {
+      // The mapping starts at the page boundary at or below m_data_offset;
+      // the returned pointer is advanced by the distance from that boundary
+      // to the data.
+      const size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
+      const size_t relative_offset = m_data_offset - map_offset;
+      const size_t adjusted_map_size = m_map_size + relative_offset;
+      m_data_ptr = (char *)util::MapOrThrow(
+                     adjusted_map_size, false, map_shared, false, m_file_desc, map_offset);
+      return (pointer)(m_data_ptr + relative_offset);
+    }
+  }
+  // Unmaps memory previously returned by allocate(). In fixed mode the
+  // page-alignment adjustment made by allocate() is undone first; the size is
+  // then derived from m_map_size rather than from `num`.
+  void deallocate (pointer p, size_type num) {
+    if(!m_fixed) {
+      util::UnmapOrThrow(p, num * sizeof(T));
+    } else {
+      const size_t map_offset = (m_data_offset / m_page_size) * m_page_size;
+      const size_t relative_offset = m_data_offset - map_offset;
+      const size_t adjusted_map_size = m_map_size + relative_offset;
+      util::UnmapOrThrow((pointer)((char*)p - relative_offset), adjusted_map_size);
+    }
+  }
+  // Copy-constructs an element in place; does nothing in fixed mode, so the
+  // mapped file contents are left untouched.
+  void construct (pointer p, const T& value) {
+    if(!m_fixed)
+      new(p) value_type(value);
+  }
+  // Destroys an element in place; does nothing in fixed mode.
+  void destroy (pointer p) {
+    if(!m_fixed)
+      p->~T();
+  }
+  template <class T1, class T2>
+  friend bool operator== (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
+  template <class T1, class T2>
+  friend bool operator!= (const MmapAllocator<T1>&, const MmapAllocator<T2>&) throw();
+};
+// Two allocators compare equal when they refer to the same file, have the
+// same page and mapping sizes, the same mapping address and data offset, and
+// the same mode. The shared counter is not part of the comparison.
+template <class T1, class T2>
+bool operator== (const MmapAllocator<T1>& a1,
+                 const MmapAllocator<T2>& a2) throw()
+{
+  return a1.m_file_ptr == a2.m_file_ptr
+         && a1.m_file_desc == a2.m_file_desc
+         && a1.m_page_size == a2.m_page_size
+         && a1.m_map_size == a2.m_map_size
+         && a1.m_data_ptr == a2.m_data_ptr
+         && a1.m_data_offset == a2.m_data_offset
+         && a1.m_fixed == a2.m_fixed;
+}
+// Negation of operator== for MmapAllocator.
+template <class T1, class T2>
+bool operator!=(const MmapAllocator<T1>& a1,
+                const MmapAllocator<T2>& a2) throw()
+{
+  return !(a1 == a2);
+}
+}
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/MonotonicVector.h ADDED Viewed

	@@ -0,0 +1,230 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_MonotonicVector_h
+#define moses_MonotonicVector_h
+// MonotonicVector - Represents a monotonic increasing function that maps
+// positive integers of any size onto a given number type. Each value has to be
+// equal or larger than the previous one. Depending on the stepSize it can save
+// up to 90% of memory compared to a std::vector<long>. Time complexity is roughly
+// constant, in the worst case, however, stepSize times slower than a normal
+// std::vector.
+#include <vector>
+#include <limits>
+#include <algorithm>
+#include <cstdio>
+#include <cassert>
+#include "ThrowingFwrite.h"
+#include "ListCoders.h"
+#include "MmapAllocator.h"
+namespace Moses
+{
+// Compressed, append-only sequence of values that must not decrease.
+// Every stepSize-th value is stored in full (VarInt32-encoded) in m_diffs and
+// its position inside m_diffs is recorded in m_anchors. The values in between
+// are stored as differences to their predecessor: they are buffered in
+// m_tempDiffs and Simple9-encoded into m_diffs once a block is complete or
+// commit() is called.
+template<typename PosT = size_t, typename NumT = size_t, PosT stepSize = 32,
+         template <typename> class Allocator = std::allocator>
+class MonotonicVector
+{
+private:
+  typedef std::vector<NumT, Allocator<NumT> > Anchors;
+  typedef std::vector<unsigned int, Allocator<unsigned int> > Diffs;
+  Anchors m_anchors;                      // start of each block inside m_diffs
+  Diffs m_diffs;                          // encoded anchor values and differences
+  std::vector<unsigned int> m_tempDiffs;  // differences of the unfinished block
+  size_t m_size;                          // number of elements already encoded
+  PosT m_last;                            // value passed to the last push_back()
+  bool m_final;                           // set by commit(); no appends afterwards
+public:
+  typedef PosT value_type;
+  MonotonicVector() : m_size(0), m_last(0), m_final(false) {}
+  // Number of stored elements, including those still buffered.
+  size_t size() const {
+    return m_size + m_tempDiffs.size();
+  }
+  // Returns the i-th value: decodes the anchor value of the block containing
+  // i and adds the differences up to position i within that block. For
+  // elements that are still buffered the differences come from m_tempDiffs.
+  PosT at(size_t i) const {
+    PosT s = stepSize;
+    PosT j = m_anchors[i / s];
+    PosT r = i % s;
+    typename Diffs::const_iterator it = m_diffs.begin() + j;
+    PosT k = 0;
+    k += VarInt32::DecodeAndSum(it, m_diffs.end(), 1);
+    if(i < m_size)
+      k += Simple9::DecodeAndSum(it, m_diffs.end(), r);
+    else if(i < m_size + m_tempDiffs.size())
+      for(size_t l = 0; l < r; l++)
+        k += m_tempDiffs[l];
+    return k;
+  }
+  PosT operator[](PosT i) const {
+    return at(i);
+  }
+  PosT back() const {
+    return at(size()-1);
+  }
+  // Appends a value. Must not be called after commit().
+  void push_back(PosT i) {
+    assert(m_final != true);
+    // Very first element: it becomes the anchor of the first block.
+    if(m_anchors.size() == 0 && m_tempDiffs.size() == 0) {
+      m_anchors.push_back(0);
+      VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
+      m_last = i;
+      m_size++;
+      return;
+    }
+    if(m_tempDiffs.size() == stepSize-1) {
+      // The current block is full: encode its buffered differences and start
+      // a new block whose anchor value is i.
+      Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
+                      std::back_inserter(m_diffs));
+      m_anchors.push_back(m_diffs.size());
+      VarInt32::Encode(&i, &i+1, std::back_inserter(m_diffs));
+      m_size += m_tempDiffs.size() + 1;
+      m_tempDiffs.clear();
+    } else {
+      PosT last = m_last;
+      PosT diff = i - last;
+      m_tempDiffs.push_back(diff);
+    }
+    m_last = i;
+  }
+  // Encodes the buffered differences and marks the vector as final.
+  void commit() {
+    assert(m_final != true);
+    Simple9::Encode(m_tempDiffs.begin(), m_tempDiffs.end(),
+                    std::back_inserter(m_diffs));
+    m_size += m_tempDiffs.size();
+    m_tempDiffs.clear();
+    m_final = true;
+  }
+  // Bytes occupied by the elements of m_diffs and m_anchors.
+  size_t usage() {
+    return m_diffs.size() * sizeof(unsigned int)
+           + m_anchors.size() * sizeof(NumT);
+  }
+  // Reads a vector written by save() from `in`. With map == true the payload
+  // is mapped from the file instead of being copied (MmapAllocator only).
+  // Returns the number of bytes consumed.
+  size_t load(std::FILE* in, bool map = false) {
+    size_t byteSize = 0;
+    byteSize += fread(&m_final, sizeof(bool), 1, in) * sizeof(bool);
+    byteSize += fread(&m_size, sizeof(size_t), 1, in) * sizeof(size_t);
+    byteSize += fread(&m_last, sizeof(PosT), 1, in) * sizeof(PosT);
+    byteSize += loadVector(m_diffs, in, map);
+    byteSize += loadVector(m_anchors, in, map);
+    return byteSize;
+  }
+  // Reads an element count followed by that many elements into v.
+  template <typename ValueT>
+  size_t loadVector(std::vector<ValueT, std::allocator<ValueT> >& v,
+                    std::FILE* in, bool map = false) {
+    // Can only be read into memory. Mapping not possible with std:allocator.
+    assert(map == false);
+    size_t byteSize = 0;
+    size_t valSize;
+    byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+    v.resize(valSize, 0);
+    // Do not form &v[0] on an empty vector.
+    if(valSize > 0)
+      byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
+    return byteSize;
+  }
+  // Same as above for MmapAllocator-backed vectors, with optional mapping.
+  template <typename ValueT>
+  size_t loadVector(std::vector<ValueT, MmapAllocator<ValueT> >& v,
+                    std::FILE* in, bool map = false) {
+    size_t byteSize = 0;
+    size_t valSize;
+    byteSize += std::fread(&valSize, sizeof(size_t), 1, in) * sizeof(size_t);
+    if(map == false) {
+      // Read data into temporary file (default constructor of MmapAllocator)
+      // and map memory onto temporary file. Can be resized.
+      v.resize(valSize, 0);
+      // Do not form &v[0] on an empty vector.
+      if(valSize > 0)
+        byteSize += std::fread(&v[0], sizeof(ValueT), valSize, in) * sizeof(ValueT);
+    } else {
+      // Map it directly on specified region of file "in" starting at valPos
+      // with length valSize * sizeof(ValueT). Mapped region cannot be resized.
+      size_t valPos = std::ftell(in);
+      Allocator<ValueT> alloc(in, valPos);
+      std::vector<ValueT, Allocator<ValueT> > vTemp(alloc);
+      vTemp.resize(valSize);
+      v.swap(vTemp);
+      // Skip over the mapped payload so the next read starts behind it.
+      std::fseek(in, valSize * sizeof(ValueT), SEEK_CUR);
+      byteSize += valSize * sizeof(ValueT);
+    }
+    return byteSize;
+  }
+  // Commits pending elements if necessary and writes the vector to `out`.
+  // Returns the number of bytes written. The accumulator used to be declared
+  // as bool, which reduced the returned size to 0 or 1.
+  size_t save(std::FILE* out) {
+    if(!m_final)
+      commit();
+    size_t byteSize = 0;
+    byteSize += ThrowingFwrite(&m_final, sizeof(bool), 1, out) * sizeof(bool);
+    byteSize += ThrowingFwrite(&m_size, sizeof(size_t), 1, out) * sizeof(size_t);
+    byteSize += ThrowingFwrite(&m_last, sizeof(PosT), 1, out) * sizeof(PosT);
+    size_t size = m_diffs.size();
+    byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
+    // Do not form &m_diffs[0] / &m_anchors[0] on an empty vector.
+    if(size > 0)
+      byteSize += ThrowingFwrite(&m_diffs[0], sizeof(unsigned int), size, out) * sizeof(unsigned int);
+    size = m_anchors.size();
+    byteSize += ThrowingFwrite(&size, sizeof(size_t), 1, out) * sizeof(size_t);
+    if(size > 0)
+      byteSize += ThrowingFwrite(&m_anchors[0], sizeof(NumT), size, out) * sizeof(NumT);
+    return byteSize;
+  }
+  // Commits this vector and exchanges the encoded data with mv.
+  // NOTE(review): only m_diffs and m_anchors are exchanged; m_size, m_last,
+  // m_final and m_tempDiffs keep their values on both sides -- confirm that
+  // callers rely on exactly this.
+  void swap(MonotonicVector<PosT, NumT, stepSize, Allocator> &mv) {
+    if(!m_final)
+      commit();
+    m_diffs.swap(mv.m_diffs);
+    m_anchors.swap(mv.m_anchors);
+  }
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/PhraseDecoder.h ADDED Viewed

	@@ -0,0 +1,144 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_PhraseDecoder_h
+#define moses_PhraseDecoder_h
+#include <sstream>
+#include <vector>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include <string>
+#include <iterator>
+#include <algorithm>
+#include <sys/stat.h>
+#include "moses/TypeDef.h"
+#include "moses/FactorCollection.h"
+#include "moses/Word.h"
+#include "moses/Util.h"
+#include "moses/InputFileStream.h"
+#include "moses/StaticData.h"
+#include "moses/Range.h"
+#include "PhraseDictionaryCompact.h"
+#include "StringVector.h"
+#include "CanonicalHuffman.h"
+#include "TargetPhraseCollectionCache.h"
+namespace Moses
+{
+class PhraseDictionaryCompact;
+// Declaration of the phrase decoder used together with PhraseDictionaryCompact
+// (which is a friend and is referenced through m_phraseDictionary). All
+// member functions are only declared here; their definitions live elsewhere.
+// NOTE(review): the descriptions below are derived from member names and
+// types only -- verify against the implementation file.
+class PhraseDecoder
+{
+protected:
+  friend class PhraseDictionaryCompact;
+  typedef std::pair<unsigned char, unsigned char> AlignPoint;
+  typedef std::pair<unsigned, unsigned> SrcTrg;
+  // Encoding scheme selector; presumably mirrors PhraseTableCreator::Coding.
+  enum Coding { None, REnc, PREnc } m_coding;
+  size_t m_numScoreComponent;
+  bool m_containsAlignmentInfo;
+  size_t m_maxRank;
+  size_t m_maxPhraseLength;
+  // Symbol tables.
+  boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
+  StringVector<unsigned char, unsigned, std::allocator> m_sourceSymbols;
+  StringVector<unsigned char, unsigned, std::allocator> m_targetSymbols;
+  // Lexical table and its index.
+  std::vector<size_t> m_lexicalTableIndex;
+  std::vector<SrcTrg> m_lexicalTable;
+  // Huffman trees, held by raw pointer.
+  // NOTE(review): ownership is not visible here -- presumably released in
+  // the destructor; confirm.
+  CanonicalHuffman<unsigned>* m_symbolTree;
+  bool m_multipleScoreTrees;
+  std::vector<CanonicalHuffman<float>*> m_scoreTrees;
+  CanonicalHuffman<AlignPoint>* m_alignTree;
+  TargetPhraseCollectionCache m_decodingCache;
+  PhraseDictionaryCompact& m_phraseDictionary;
+  // ***********************************************
+  // Input/output factor lists are stored as pointers to caller-provided
+  // vectors (see the constructor parameters).
+  const std::vector<FactorType>* m_input;
+  const std::vector<FactorType>* m_output;
+  std::string m_separator;
+  // ***********************************************
+  unsigned GetSourceSymbolId(std::string& s);
+  std::string GetTargetSymbol(unsigned id) const;
+  size_t GetREncType(unsigned encodedSymbol);
+  size_t GetPREncType(unsigned encodedSymbol);
+  unsigned GetTranslation(unsigned srcIdx, size_t rank);
+  size_t GetMaxSourcePhraseLength();
+  // Decoding helpers for symbols of the REnc scheme.
+  unsigned DecodeREncSymbol1(unsigned encodedSymbol);
+  unsigned DecodeREncSymbol2Rank(unsigned encodedSymbol);
+  unsigned DecodeREncSymbol2Position(unsigned encodedSymbol);
+  unsigned DecodeREncSymbol3(unsigned encodedSymbol);
+  // Decoding helpers for symbols of the PREnc scheme.
+  unsigned DecodePREncSymbol1(unsigned encodedSymbol);
+  int DecodePREncSymbol2Left(unsigned encodedSymbol);
+  int DecodePREncSymbol2Right(unsigned encodedSymbol);
+  unsigned DecodePREncSymbol2Rank(unsigned encodedSymbol);
+  std::string MakeSourceKey(std::string &);
+public:
+  PhraseDecoder(
+    PhraseDictionaryCompact &phraseDictionary,
+    const std::vector<FactorType>* input,
+    const std::vector<FactorType>* output,
+    size_t numScoreComponent
+  );
+  ~PhraseDecoder();
+  // Loads the decoder data from `in`; presumably returns the number of bytes
+  // read -- confirm.
+  size_t Load(std::FILE* in);
+  TargetPhraseVectorPtr CreateTargetPhraseCollection(const Phrase &sourcePhrase,
+      bool topLevel = false, bool eval = true);
+  TargetPhraseVectorPtr DecodeCollection(TargetPhraseVectorPtr tpv,
+                                         BitWrapper<> &encodedBitStream,
+                                         const Phrase &sourcePhrase,
+                                         bool topLevel,
+                                         bool eval);
+  void PruneCache();
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/PhraseTableCreator.h ADDED Viewed

	@@ -0,0 +1,412 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_PhraseTableCreator_h
+#define moses_PhraseTableCreator_h
+#include <sstream>
+#include <iostream>
+#include <queue>
+#include <vector>
+#include <set>
+#include <boost/unordered_map.hpp>
+#include "moses/InputFileStream.h"
+#include "moses/ThreadPool.h"
+#include "moses/Util.h"
+#include "BlockHashIndex.h"
+#include "StringVector.h"
+#include "StringVectorTemp.h"
+#include "CanonicalHuffman.h"
+namespace Moses
+{
+typedef std::pair<unsigned char, unsigned char> AlignPoint;
+// Frequency table over values of DataType. Quantize() restricts the table to
+// a limited set of representative values and folds the counts of all other
+// values into those representatives.
+template <typename DataType>
+class Counter
+{
+public:
+  typedef boost::unordered_map<DataType, size_t> FreqMap;
+  typedef typename FreqMap::iterator iterator;
+  typedef typename FreqMap::mapped_type mapped_type;
+  typedef typename FreqMap::value_type value_type;
+private:
+#ifdef WITH_THREADS
+  boost::mutex m_mutex;
+#endif
+  FreqMap m_freqMap;
+  size_t m_maxSize;
+  std::vector<DataType> m_bestVec;
+  // Orders entries by descending frequency; entries with equal frequency are
+  // ordered by descending value.
+  struct FreqSorter {
+    bool operator()(const value_type& a, const value_type& b) const {
+      // Check impact on translation quality!
+      return a.second > b.second
+             || (a.second == b.second && a.first > b.first);
+    }
+  };
+public:
+  Counter() : m_maxSize(0) {}
+  // Unsynchronized iteration over the underlying map.
+  iterator Begin() {
+    return m_freqMap.begin();
+  }
+  iterator End() {
+    return m_freqMap.end();
+  }
+  // Adds one occurrence of data.
+  void Increase(DataType data) {
+#ifdef WITH_THREADS
+    boost::mutex::scoped_lock lock(m_mutex);
+#endif
+    m_freqMap[data]++;
+  }
+  // Adds num occurrences of data.
+  void IncreaseBy(DataType data, size_t num) {
+#ifdef WITH_THREADS
+    boost::mutex::scoped_lock lock(m_mutex);
+#endif
+    m_freqMap[data] += num;
+  }
+  // Unsynchronized access to the count of data (inserts a zero count if the
+  // value is not present yet).
+  mapped_type& operator[](DataType data) {
+    return m_freqMap[data];
+  }
+  // Number of distinct values.
+  size_t Size() {
+#ifdef WITH_THREADS
+    boost::mutex::scoped_lock lock(m_mutex);
+#endif
+    return m_freqMap.size();
+  }
+  // Appends the (at most) maxSize most frequent values to m_bestVec, sorts
+  // m_bestVec by value and rebuilds the table so that the count of every
+  // value is added to the entry LowerBound() maps it to.
+  void Quantize(size_t maxSize) {
+#ifdef WITH_THREADS
+    boost::mutex::scoped_lock lock(m_mutex);
+#endif
+    typedef std::vector<std::pair<DataType, mapped_type> > FreqVec;
+    m_maxSize = maxSize;
+    FreqVec byFreq(m_freqMap.begin(), m_freqMap.end());
+    std::sort(byFreq.begin(), byFreq.end(), FreqSorter());
+    typename FreqVec::const_iterator entry = byFreq.begin();
+    for(size_t kept = 0; entry != byFreq.end() && kept < m_maxSize; ++entry, ++kept)
+      m_bestVec.push_back(entry->first);
+    std::sort(m_bestVec.begin(), m_bestVec.end());
+    FreqMap merged;
+    for(entry = byFreq.begin(); entry != byFreq.end(); ++entry)
+      merged[LowerBound(entry->first)] += entry->second;
+    m_freqMap.swap(merged);
+  }
+  // Removes all counts (m_bestVec and m_maxSize are left untouched).
+  void Clear() {
+#ifdef WITH_THREADS
+    boost::mutex::scoped_lock lock(m_mutex);
+#endif
+    m_freqMap.clear();
+  }
+  // Maps data to its representative: the first element of m_bestVec that is
+  // not less than data, or the largest element if there is none. Without
+  // quantization (m_maxSize == 0 or empty m_bestVec) data is returned as is.
+  DataType LowerBound(DataType data) {
+    if(m_maxSize == 0 || m_bestVec.empty())
+      return data;
+    typename std::vector<DataType>::iterator pos
+    = std::lower_bound(m_bestVec.begin(), m_bestVec.end(), data);
+    return pos != m_bestVec.end() ? *pos : m_bestVec.back();
+  }
+};
+// Value object that travels through the creators' priority queues: an input
+// line number together with a source phrase string, a packed target phrase
+// string, a rank and a score. All member functions are defined elsewhere.
+class PackedItem
+{
+private:
+  long m_line;
+  std::string m_sourcePhrase;
+  std::string m_packedTargetPhrase;
+  size_t m_rank;
+  float m_score;
+public:
+  // The last parameter was previously named m_score, like the data member;
+  // it is named `score` here so it cannot be mistaken for the member.
+  // Parameter names in a declaration do not affect callers or the
+  // out-of-line definition.
+  PackedItem(long line, std::string sourcePhrase,
+             std::string packedTargetPhrase, size_t rank,
+             float score = 0);
+  long GetLine() const;
+  const std::string& GetSrc() const;
+  const std::string& GetTrg() const;
+  size_t GetRank() const;
+  float GetScore() const;
+};
+// Ordering of PackedItem objects, defined elsewhere. It determines which item
+// std::priority_queue<PackedItem> returns from top().
+bool operator<(const PackedItem &pi1, const PackedItem &pi2);
+// Declaration of the creator for the compact phrase table. Only the nested
+// SrcTrgProbSorter has an inline body; all other member functions are
+// defined elsewhere. RankingTask, EncodingTask and CompressionTask are
+// friends and use the private queue interface of this class.
+// NOTE(review): the group descriptions below are derived from member names
+// and types only -- verify against the implementation file.
+class PhraseTableCreator
+{
+public:
+  // Target phrase encoding scheme.
+  enum Coding { None, REnc, PREnc };
+private:
+  // Paths and output file.
+  std::string m_inPath;
+  std::string m_outPath;
+  std::string m_tempfilePath;
+  std::FILE* m_outFile;
+  // Settings; most correspond to a constructor parameter of the same name.
+  size_t m_numScoreComponent;
+  size_t m_sortScoreIndex;
+  // NOTE(review): stored as size_t although the constructor takes
+  // `bool warnMe` -- confirm whether this is intended.
+  size_t m_warnMe;
+  Coding m_coding;
+  size_t m_orderBits;
+  size_t m_fingerPrintBits;
+  bool m_useAlignmentInfo;
+  bool m_multipleScoreTrees;
+  size_t m_quantize;
+  size_t m_maxRank;
+  static std::string m_phraseStopSymbol;
+  static std::string m_separator;
+#ifdef WITH_THREADS
+  size_t m_threads;
+  boost::mutex m_mutex;
+#endif
+  BlockHashIndex m_srcHash;
+  BlockHashIndex m_rnkHash;
+  size_t m_maxPhraseLength;
+  std::vector<unsigned> m_ranks;
+  typedef std::pair<unsigned, unsigned> SrcTrg;
+  typedef std::pair<std::string, std::string> SrcTrgString;
+  typedef std::pair<SrcTrgString, float> SrcTrgProb;
+  // Strict ordering of (source, target, probability) entries: ascending by
+  // source string, then descending by probability, then ascending by target
+  // string.
+  struct SrcTrgProbSorter {
+    bool operator()(const SrcTrgProb& a, const SrcTrgProb& b) const {
+      if(a.first.first < b.first.first)
+        return true;
+      if(a.first.first == b.first.first && a.second > b.second)
+        return true;
+      if(a.first.first == b.first.first
+          && a.second == b.second
+          && a.first.second < b.first.second)
+        return true;
+      return false;
+    }
+  };
+  std::vector<size_t> m_lexicalTableIndex;
+  std::vector<SrcTrg> m_lexicalTable;
+  // Containers for encoded and compressed target phrases, held by raw
+  // pointer.
+  StringVectorTemp<unsigned char, unsigned long, MmapAllocator>*
+  m_encodedTargetPhrases;
+  StringVector<unsigned char, unsigned long, MmapAllocator>*
+  m_compressedTargetPhrases;
+  boost::unordered_map<std::string, unsigned> m_targetSymbolsMap;
+  boost::unordered_map<std::string, unsigned> m_sourceSymbolsMap;
+  // Frequency counters and the Huffman trees associated with them.
+  typedef Counter<unsigned> SymbolCounter;
+  typedef Counter<float> ScoreCounter;
+  typedef Counter<AlignPoint> AlignCounter;
+  typedef CanonicalHuffman<unsigned> SymbolTree;
+  typedef CanonicalHuffman<float> ScoreTree;
+  typedef CanonicalHuffman<AlignPoint> AlignTree;
+  SymbolCounter m_symbolCounter;
+  SymbolTree* m_symbolTree;
+  AlignCounter m_alignCounter;
+  AlignTree* m_alignTree;
+  std::vector<ScoreCounter*> m_scoreCounters;
+  std::vector<ScoreTree*> m_scoreTrees;
+  // Queue of items produced by the worker tasks and the bookkeeping used
+  // while flushing it.
+  std::priority_queue<PackedItem> m_queue;
+  long m_lastFlushedLine;
+  long m_lastFlushedSourceNum;
+  std::string m_lastFlushedSourcePhrase;
+  std::vector<std::string> m_lastSourceRange;
+  std::priority_queue<std::pair<float, size_t> > m_rankQueue;
+  std::vector<std::string> m_lastCollection;
+  void Save();
+  void PrintInfo();
+  // Symbol id management.
+  void AddSourceSymbolId(std::string& symbol);
+  unsigned GetSourceSymbolId(std::string& symbol);
+  void AddTargetSymbolId(std::string& symbol);
+  unsigned GetTargetSymbolId(std::string& symbol);
+  unsigned GetOrAddTargetSymbolId(std::string& symbol);
+  unsigned GetRank(unsigned srcIdx, unsigned trgIdx);
+  // Symbol encoders for the REnc and PREnc schemes.
+  unsigned EncodeREncSymbol1(unsigned symbol);
+  unsigned EncodeREncSymbol2(unsigned position, unsigned rank);
+  unsigned EncodeREncSymbol3(unsigned rank);
+  unsigned EncodePREncSymbol1(unsigned symbol);
+  unsigned EncodePREncSymbol2(int lOff, int rOff, unsigned rank);
+  // Target phrase encoders, one per Coding value; output goes to `os`.
+  void EncodeTargetPhraseNone(std::vector<std::string>& t,
+                              std::ostream& os);
+  void EncodeTargetPhraseREnc(std::vector<std::string>& s,
+                              std::vector<std::string>& t,
+                              std::set<AlignPoint>& a,
+                              std::ostream& os);
+  void EncodeTargetPhrasePREnc(std::vector<std::string>& s,
+                               std::vector<std::string>& t,
+                               std::set<AlignPoint>& a, size_t ownRank,
+                               std::ostream& os);
+  void EncodeScores(std::vector<float>& scores, std::ostream& os);
+  void EncodeAlignment(std::set<AlignPoint>& alignment, std::ostream& os);
+  std::string MakeSourceKey(std::string&);
+  std::string MakeSourceTargetKey(std::string&, std::string&);
+  // Processing stages.
+  void LoadLexicalTable(std::string filePath);
+  void CreateRankHash();
+  void EncodeTargetPhrases();
+  void CalcHuffmanCodes();
+  void CompressTargetPhrases();
+  // Queue interface used by the friend task classes. The Flush* functions
+  // take a `force` flag that defaults to false.
+  void AddRankedLine(PackedItem& pi);
+  void FlushRankedQueue(bool force = false);
+  std::string EncodeLine(std::vector<std::string>& tokens, size_t ownRank);
+  void AddEncodedLine(PackedItem& pi);
+  void FlushEncodedQueue(bool force = false);
+  std::string CompressEncodedCollection(std::string encodedCollection);
+  void AddCompressedCollection(PackedItem& pi);
+  void FlushCompressedQueue(bool force = false);
+public:
+  // The `threads` parameter exists only in builds with WITH_THREADS defined.
+  PhraseTableCreator(std::string inPath,
+                     std::string outPath,
+                     std::string tempfilePath,
+                     size_t numScoreComponent = 5,
+                     size_t sortScoreIndex = 2,
+                     Coding coding = PREnc,
+                     size_t orderBits = 10,
+                     size_t fingerPrintBits = 16,
+                     bool useAlignmentInfo = false,
+                     bool multipleScoreTrees = true,
+                     size_t quantize = 0,
+                     size_t maxRank = 100,
+                     bool warnMe = true
+#ifdef WITH_THREADS
+                                   , size_t threads = 2
+#endif
+                    );
+  ~PhraseTableCreator();
+  friend class RankingTask;
+  friend class EncodingTask;
+  friend class CompressionTask;
+};
+// Callable task object (operator() is defined elsewhere) that holds
+// references to an input stream and to the PhraseTableCreator. The mutexes
+// and the line counter are static, i.e. shared by all RankingTask instances.
+class RankingTask
+{
+private:
+#ifdef WITH_THREADS
+  static boost::mutex m_mutex;
+  static boost::mutex m_fileMutex;
+#endif
+  static size_t m_lineNum;
+  InputFileStream& m_inFile;
+  PhraseTableCreator& m_creator;
+public:
+  RankingTask(InputFileStream& inFile, PhraseTableCreator& creator);
+  void operator()();
+};
+// Callable task object (operator() is defined elsewhere) that holds
+// references to an input stream and to the PhraseTableCreator. The mutexes,
+// the line and source phrase counters and the last source phrase are static,
+// i.e. shared by all EncodingTask instances.
+class EncodingTask
+{
+private:
+#ifdef WITH_THREADS
+  static boost::mutex m_mutex;
+  static boost::mutex m_fileMutex;
+#endif
+  static size_t m_lineNum;
+  static size_t m_sourcePhraseNum;
+  static std::string m_lastSourcePhrase;
+  InputFileStream& m_inFile;
+  PhraseTableCreator& m_creator;
+public:
+  EncodingTask(InputFileStream& inFile, PhraseTableCreator& creator);
+  void operator()();
+};
+// Callable task object (operator() is defined elsewhere) that holds
+// references to the container of encoded collections and to the
+// PhraseTableCreator. The mutex and the collection counter are static, i.e.
+// shared by all CompressionTask instances.
+class CompressionTask
+{
+private:
+#ifdef WITH_THREADS
+  static boost::mutex m_mutex;
+#endif
+  static size_t m_collectionNum;
+  StringVectorTemp<unsigned char, unsigned long, MmapAllocator>&
+  m_encodedCollections;
+  PhraseTableCreator& m_creator;
+public:
+  CompressionTask(StringVectorTemp<unsigned char, unsigned long, MmapAllocator>&
+                  encodedCollections, PhraseTableCreator& creator);
+  void operator()();
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/Scope3Parser/Parser.cpp ADDED Viewed

	@@ -0,0 +1,198 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include "Parser.h"
+#include "moses/ChartParser.h"
+#include "moses/ChartTranslationOptionList.h"
+#include "moses/InputType.h"
+#include "moses/NonTerminal.h"
+#include "moses/TranslationModel/RuleTable/UTrieNode.h"
+#include "moses/TranslationModel/RuleTable/UTrie.h"
+#include "moses/StaticData.h"
+#include "ApplicableRuleTrie.h"
+#include "StackLattice.h"
+#include "StackLatticeBuilder.h"
+#include "StackLatticeSearcher.h"
+#include "VarSpanTrieBuilder.h"
+#include <memory>
+#include <vector>
+namespace Moses
+{
+// Passes every rule application that was registered for the span of
+// inputPath (see m_ruleApplications) on to outColl via a MatchCallback.
+// Purely lexical rules are reported directly; for rules with non-terminals a
+// stack lattice is built and searched once per label sequence that survives
+// the quick-check table.  The parameter 'last' is not used here.
+void Scope3Parser::GetChartRuleCollection(
+  const InputPath &inputPath,
+  size_t last,
+  ChartParserCallback &outColl)
+{
+  typedef std::pair<const UTrieNode *, const VarSpanNode *> RuleApplication;
+  typedef std::vector<RuleApplication> RuleApplicationVec;
+
+  const Range &range = inputPath.GetWordsRange();
+  const size_t start = range.GetStartPos();
+  const size_t end = range.GetEndPos();
+  const size_t spanLength = end - start + 1;
+  RuleApplicationVec &applications = m_ruleApplications[start][spanLength];
+  MatchCallback matchCB(range, outColl);
+
+  RuleApplicationVec::const_iterator app = applications.begin();
+  for (; app != applications.end(); ++app) {
+    const UTrieNode &ruleNode = *(app->first);
+    const VarSpanNode &varSpanNode = *(app->second);
+    const UTrieNode::LabelMap &labelMap = ruleNode.GetLabelMap();
+
+    // Rank 0: no non-terminals, hence exactly one (empty) label sequence.
+    if (varSpanNode.m_rank == 0) {
+      assert(labelMap.size() == 1);
+      TargetPhraseCollection::shared_ptr lexicalTpc = labelMap.begin()->second;
+      matchCB.m_tpc = lexicalTpc;
+      matchCB(m_emptyStackVec);
+      continue;
+    }
+
+    // At least one non-terminal: build the lattice for this rule once ...
+    varSpanNode.CalculateRanges(start, end, m_ranges);
+    m_latticeBuilder.Build(start, end, ruleNode, varSpanNode, m_ranges,
+                           *this, m_lattice,
+                           m_quickCheckTable);
+    StackLatticeSearcher<MatchCallback> searcher(m_lattice, m_ranges);
+
+    // ... and search it for every label sequence that passes the quick check.
+    UTrieNode::LabelMap::const_iterator entry = labelMap.begin();
+    while (entry != labelMap.end()) {
+      const std::vector<int> &labels = entry->first;
+      TargetPhraseCollection::shared_ptr tpc = entry->second;
+      assert(labels.size() == varSpanNode.m_rank);
+      ++entry;
+
+      // Stop at the first position whose label is ruled out by the table.
+      size_t pos = 0;
+      while (pos < varSpanNode.m_rank && m_quickCheckTable[pos][labels[pos]]) {
+        ++pos;
+      }
+      if (pos < varSpanNode.m_rank) {
+        continue;
+      }
+      matchCB.m_tpc = tpc;
+      searcher.Search(labels, matchCB);
+    }
+  }
+}
+// Prepares the parser for the current input: sizes the rule application
+// table, indexes the input words, builds the applicable-rule trie (ART) and
+// the var-span trie, and finally registers the applicable rules per cell.
+void Scope3Parser::Init()
+{
+  InitRuleApplicationVector();
+  // Build a map from Words to index-sets.
+  SentenceMap sentMap;
+  FillSentenceMap(sentMap);
+  // Build a trie containing 'elastic' application contexts.  The root only
+  // has to live until the end of this function, so it is kept in automatic
+  // storage (std::auto_ptr is deprecated and no longer part of C++17).
+  const UTrieNode &rootNode = m_ruleTable.GetRootNode();
+  ApplicableRuleTrie art(-1, -1, rootNode);
+  art.Extend(rootNode, -1, sentMap, false);
+  // Build a trie containing just the non-terminal contexts and insert pointers
+  // to its nodes back into the ART trie.  Contiguous non-terminal contexts are
+  // merged and the number of split points is recorded.
+  VarSpanTrieBuilder vstBuilder;
+  m_varSpanTrie = vstBuilder.Build(art);
+  // Fill each cell with a list of pointers to relevant ART nodes.
+  // (-1, -1) marks the start range as "not yet known".
+  AddRulesToCells(art, std::make_pair(-1, -1), GetParser().GetSize()-1, 0);
+}
+// Sizes m_ruleApplications: one row per start position of the input, and in
+// each row (inputSize - start + 2) slots that are indexed by span length.
+void Scope3Parser::InitRuleApplicationVector()
+{
+  const size_t numWords = GetParser().GetSize();
+  m_ruleApplications.resize(numWords);
+  size_t start = 0;
+  while (start < numWords) {
+    const size_t numSlots = (numWords - start + 1) + 1;
+    m_ruleApplications[start].resize(numSlots);
+    ++start;
+  }
+}
+// Records, for every word of the input, the positions at which it occurs:
+// position i is appended to sentMap[word at i].
+void Scope3Parser::FillSentenceMap(SentenceMap &sentMap)
+{
+  size_t pos = 0;
+  while (pos < GetParser().GetSize()) {
+    const Word &wordAtPos = GetParser().GetInputPath(pos, pos).GetLastWord();
+    sentMap[wordAtPos].push_back(pos);
+    ++pos;
+  }
+}
+// Recursively walks the applicable-rule trie below 'node' and, for every node
+// whose rule trie node has rules, appends a (rule trie node, VST node) pair
+// to m_ruleApplications[i][span] for each start position i and span length
+// that the path admits.
+//
+//   node    current trie node
+//   start   range of possible start positions for the path (passed by value,
+//           so updates only affect this node and its descendants).
+//           (-1,-1) means "not yet known"; a negative 'second' other than
+//           that encodes a count of split points as -1 - numSplitPoints.
+//   maxPos  largest position used for open-ended ranges; Init() passes the
+//           input size minus one
+//   depth   depth of 'node' in the trie; the root is called with 0
+void Scope3Parser::AddRulesToCells(
+  const ApplicableRuleTrie &node,
+  std::pair<int, int> start,
+  int maxPos,
+  int depth)
+{
+  if (depth > 0) {
+    // Determine the start range for this path if not already known.
+    if (start.first == -1 && start.second == -1) {
+      assert(depth == 1);
+      start.first = std::max(0, node.m_start);
+      start.second = node.m_start;
+    } else if (start.second < 0) {
+      assert(depth > 1);
+      if (node.m_start == -1) {
+        --start.second;  // Record split point
+      } else {
+        // First node with a known position: turn the encoded split point
+        // count into an actual upper bound for the start position.
+        int numSplitPoints = -1 - start.second;
+        start.second = node.m_start - (numSplitPoints+1);
+      }
+    }
+  }
+  if (node.m_node->HasRules()) {
+    assert(depth > 0);
+    assert(node.m_vstNode);
+    // Determine the end range for this path.
+    std::pair<int, int> end;
+    if (node.m_end == -1) {
+      // End range is taken from elements 2 and 3 of the VST node's label;
+      // -1 as upper bound means "open", i.e. up to maxPos.
+      end.first = (*(node.m_vstNode->m_label))[2];
+      end.second = (*(node.m_vstNode->m_label))[3];
+      assert(end.first != -1);
+      if (end.second == -1) {
+        end.second = maxPos;
+      }
+    } else {
+      assert(node.m_start == node.m_end);  // Should be a terminal
+      end.first = end.second = node.m_start;
+    }
+    // Add a (rule trie node, VST node) pair for each cell in the range.
+    int s2 = start.second;
+    if (s2 < 0) {
+      // Start is still unknown: leave room for the recorded split points.
+      int numSplitPoints = -1 - s2;
+      s2 = maxPos - numSplitPoints;
+    }
+    for (int i = start.first; i <= s2; ++i) {
+      // A path of this depth cannot end before position i+depth-1.
+      int e1 = std::max(i+depth-1, end.first);
+      for (int j = e1; j <= end.second; ++j) {
+        size_t span = j-i+1;
+        assert(span >= 1);
+        // m_maxChartSpan == 0 means "no limit"; spans only grow with j, so
+        // the inner loop can stop at the first span that is too long.
+        if (m_maxChartSpan && span > m_maxChartSpan) {
+          break;
+        }
+        m_ruleApplications[i][span].push_back(std::make_pair(node.m_node,
+                                              node.m_vstNode));
+      }
+    }
+  }
+  for (std::vector<ApplicableRuleTrie*>::const_iterator p = node.m_children.begin(); p != node.m_children.end(); ++p) {
+    AddRulesToCells(**p, start, maxPos, depth+1);
+  }
+}
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.cc ADDED Viewed

	@@ -0,0 +1,434 @@

+#include <string>
+#include <cassert>
+#include <iomanip>
+#include <algorithm>
+#include "ug_stringdist.h"
+// string distance measures
+// Code by Ulrich Germann
+namespace stringdist
+{
+  // Removes combining marks (accents) from trg in place by applying the ICU
+  // transform "NFD; [:M:] Remove; NFC".
+  // Returns the status of the (one-time) construction of the transliterator.
+  // If that construction failed, trg is left untouched and the failure code
+  // is returned on every call.
+  UErrorCode strip_accents(UnicodeString & trg)
+  {
+    // The transliterator is created on first use and then reused.  Its
+    // creation status is stored next to it, so that a failure is not only
+    // visible to the very first caller.
+    static UErrorCode creation_status = U_ZERO_ERROR;
+    static Transliterator *stripper
+      = Transliterator::createInstance("NFD; [:M:] Remove; NFC",
+                                       UTRANS_FORWARD, creation_status);
+    if (U_FAILURE(creation_status)) return creation_status;
+    // Defensive: never dereference a null transliterator.
+    if (stripper == NULL) return U_MEMORY_ALLOCATION_ERROR;
+    stripper->transliterate(trg);
+    return creation_status;
+  }
+  // Printable names of the StringDiff::MATCHTYPE values.  Segment::label()
+  // indexes this array with the enum value, so the order of the strings has
+  // to follow the order of the enumerators.
+  char const* StringDiff::Segment::elabel[] = {
+    "same",
+    "cap",
+    "flip",
+    "permutation",
+    "accent",
+    "duplication",
+    "insertion",
+    "deletion",
+    "mismatch",
+    "noinit"
+  };
+  // Creates an empty diff: both strings are empty and nothing is aligned.
+  StringDiff::
+  StringDiff()
+  {}
+  // Stores a and b and aligns them right away, so that the segment list is
+  // available immediately after construction.
+  StringDiff::
+  StringDiff(string const& a, string const& b)
+  {
+    set_a(a);
+    set_b(b);
+    align();
+  }
+  // Creates a segment that is marked as not initialized: all four offsets
+  // are -1, the match type is noinit and the distance is 0.
+  StringDiff::
+  Segment::
+  Segment()
+    : start_a(-1), end_a(-1), start_b(-1), end_b(-1), match(noinit), dist(0)
+  {}
+  // Sets the first string of the comparison and returns the stored value.
+  // The input is interpreted as UTF-8 (the same encoding showDiff() uses for
+  // its output) instead of whatever ICU's default codepage happens to be, and
+  // embedded NUL bytes no longer cut the string short.
+  // An alignment that was computed earlier is not discarded; call
+  // align(true) to recompute it.
+  UnicodeString const&
+  StringDiff::
+  set_a(string const& a)
+  {
+    this->a = UnicodeString::fromUTF8(a);
+    return this->a;
+  }
+  // Sets the second string of the comparison and returns the stored value.
+  // The input is interpreted as UTF-8 (the same encoding showDiff() uses for
+  // its output) instead of whatever ICU's default codepage happens to be, and
+  // embedded NUL bytes no longer cut the string short.
+  // An alignment that was computed earlier is not discarded; call
+  // align(true) to recompute it.
+  UnicodeString const&
+  StringDiff::
+  set_b(string const& b)
+  {
+    this->b = UnicodeString::fromUTF8(b);
+    return this->b;
+  }
+  // Returns the first string of the comparison.
+  UnicodeString const&
+  StringDiff::
+  get_a() const
+  {
+    return this->a;
+  }
+  // Returns the second string of the comparison.
+  UnicodeString const&
+  StringDiff::
+  get_b() const
+  {
+    return this->b;
+  }
+  // Returns the number of segments in the current diff list.
+  size_t
+  StringDiff::
+  size()
+  {
+    return this->difflist.size();
+  }
+  // float
+  // StringDiff::
+  // levelshtein(bool force)
+  // {
+  //   align(force);
+  //   float ret = 0;
+  //   for (size_t i = 0; i < difflist.size(); +++i)
+  //     {
+  // 	Segment const& s = difflist[i];
+  // 	if      (s.match == same) continue;
+  // 	else if (s.match == insertion) ret += s.end_b - s.start_b;
+  // 	else if (s.match == deletion)  ret += s.end_a - s.start_a;
+  //     }
+  // }
+  // Fills M with the alignment scores of the stored strings a and b.
+  // M gets a.length() rows and b.length() columns.  M[i][j] is the score of
+  // the best alignment of a[0..i] with b[0..j]: every matching character
+  // pair counts 1, plus a bonus of 0.1 if the pair directly before it
+  // matches as well (which favours contiguous matches).
+  // If either string is empty there is nothing to align; M is then left
+  // without cells instead of being indexed out of range.
+  void
+  StringDiff::
+  fillAlignmentMatrix(vector<vector<float> > & M) const
+  {
+    M.assign(a.length(),vector<float>(b.length(),0));
+    if (a.length() == 0 || b.length() == 0) return;
+    int i = 0,j;
+    // first row: 1 from the first occurrence of a[0] in b onwards
+    while (i < b.length() && b[i] != a[0]) ++i;
+    while (i < b.length()) M[0][i++] = 1;
+    // first column: 1 from the first occurrence of b[0] in a onwards
+    i = 0;
+    while (i < a.length() && a[i] != b[0]) ++i;
+    while (i < a.length()) M[i++][0] = 1;
+    for (i = 1; i < a.length(); ++i)
+      {
+        for (j = 1; j < b.length(); ++j)
+          {
+            float & s = M[i][j];
+            s = max(M[i-1][j],M[i][j-1]);
+            if (a[i] == b[j])
+              s = max(s,M[i-1][j-1] + 1 + (a[i-1] == b[j-1] ? .1f : 0));
+          }
+      }
+#if 0
+    string abuf,bbuf;
+    a.toUTF8String(abuf);
+    b.toUTF8String(bbuf);
+    cout << "  " << bbuf[0];
+    for (int x = 1; x < b.length(); ++x)
+      cout << " " << bbuf[x];
+    cout << endl;
+    for (int x = 0; x < a.length(); ++x)
+      {
+        cout << abuf[x] << " ";
+        for (int y = 0; y < b.length(); ++y)
+          cout << int(M[x][y]) << " ";
+        cout << endl;
+      }
+#endif
+  }
+  // Fills M (lenA rows, lenB columns) so that M[i][j] is the length of the
+  // longest common subsequence of a[0..i] and b[0..j], and returns the value
+  // of the last cell, i.e. the LCS length of the two complete strings.
+  // If either string is empty, the LCS length is 0 and M has no cells; this
+  // case is handled up front instead of indexing a[0] / b[0] / M.back().
+  float
+  fillAlignmentMatrix(UChar const* a, size_t const lenA,
+                      UChar const* b, size_t const lenB,
+                      vector<vector<float> > & M)
+  {
+    M.assign(lenA,vector<float>(lenB,0));
+    if (lenA == 0 || lenB == 0) return 0;
+    size_t i = 0;
+    // first row: 1 from the first occurrence of a[0] in b onwards
+    while (i < lenB && b[i] != a[0]) ++i;
+    while (i < lenB) M[0][i++] = 1;
+    // first column: 1 from the first occurrence of b[0] in a onwards
+    i = 0;
+    while (i < lenA && a[i] != b[0]) ++i;
+    while (i < lenA) M[i++][0] = 1;
+    for (i = 1; i < lenA; ++i)
+      {
+        for (size_t j = 1; j < lenB; ++j)
+          {
+            float & s = M[i][j];
+            s = max(M[i-1][j], M[i][j-1]);
+            if (a[i] == b[j])
+              s = max(s, M[i-1][j-1] + 1);
+          }
+      }
+    return M.back().back();
+  }
+  // Returns the Levenshtein distance between a[0..lenA) and b[0..lenB): the
+  // minimum number of single-character insertions, deletions and
+  // substitutions that turn one string into the other.
+  //
+  // This is the textbook dynamic program, kept to two rows.  It replaces a
+  // backtrace over the LCS matrix that over- or under-counted whenever the
+  // first characters were involved (e.g. it returned 1 for two identical
+  // strings, 2 for "ab"/"ac" and 1 for "x"/"aax") and that indexed an empty
+  // matrix when one of the inputs was empty.
+  float
+  levenshtein(UChar const* a, size_t const lenA,
+              UChar const* b, size_t const lenB)
+  {
+    // Against an empty string, the distance is the length of the other one.
+    if (lenA == 0) return static_cast<float>(lenB);
+    if (lenB == 0) return static_cast<float>(lenA);
+
+    // prev[j] / cur[j]: distance between the first i-1 / i characters of a
+    // and the first j characters of b.
+    std::vector<size_t> prev(lenB + 1);
+    std::vector<size_t> cur(lenB + 1);
+    for (size_t j = 0; j <= lenB; ++j) prev[j] = j;
+    for (size_t i = 1; i <= lenA; ++i)
+      {
+        cur[0] = i;
+        for (size_t j = 1; j <= lenB; ++j)
+          {
+            size_t const subst = prev[j-1] + (a[i-1] == b[j-1] ? 0 : 1);
+            size_t const indel = std::min(prev[j], cur[j-1]) + 1;
+            cur[j] = std::min(subst, indel);
+          }
+        prev.swap(cur);
+      }
+    return static_cast<float>(prev[lenB]);
+  }
+  // Describes how a[as..ae) relates to b[bs..be): stores the four offsets,
+  // classifies the pair (match) and assigns a distance (dist).
+  //   - both ranges empty               -> same
+  //   - only the a range empty          -> insertion, dist = length of b range
+  //   - only the b range empty          -> deletion,  dist = length of a range
+  //   - different lengths               -> mismatch,  dist = levenshtein()
+  //   - same length, identical          -> same,      dist = 0
+  //   - two letters swapped             -> flip,      dist = 1
+  //   - same letters in another order   -> permutation, dist = length - 1
+  //   - anything else of same length    -> mismatch,  dist = levenshtein()
+  // A mismatch is finally relabelled as cap if the two ranges are equal after
+  // lower-casing, or as accent if they are equal after lower-casing and
+  // strip_accents(); dist keeps its value in both cases.
+  StringDiff::
+  Segment::
+  Segment(size_t const as, size_t const ae,
+	  size_t const bs, size_t const be,
+	  UnicodeString const& a,
+	  UnicodeString const& b)
+  {
+    dist = 0;
+    start_a = as; end_a = ae;
+    start_b = bs; end_b = be;
+    if (as == ae)
+      match = bs == be ? same : insertion;
+    else if (bs == be)
+      match = deletion;
+    else if (be-bs != ae-as)
+      {
+	match = mismatch;
+	dist  = stringdist::levenshtein(a.getBuffer() + as, ae - as,
+					b.getBuffer() + bs, be - bs);
+      }
+    else
+      {
+	// same length: compare character by character
+	match = same;
+	size_t stop = ae-as;
+	for (size_t i = 0; i < stop && match == same; ++i)
+	  if (a[as+i] != b[bs+i]) match = mismatch;
+	if (match == mismatch)
+	  {
+	    if (ae-as == 2 && a[as] == b[bs+1] && a[as+1] == b[bs])
+	      match = flip;
+	    else
+	      {
+		// equal as sorted character sequences <=> permutation
+		vector<UChar> x(a.getBuffer() + as, a.getBuffer() + ae);
+		vector<UChar> y(b.getBuffer() + bs, b.getBuffer() + be);
+		sort(x.begin(),x.end());
+		sort(y.begin(),y.end());
+		if (x == y) match = permutation;
+		else dist = stringdist::levenshtein(a.getBuffer() + as, ae - as,
+						    b.getBuffer() + bs, be - bs);
+	      }
+	  }
+      }
+    if (match == insertion)
+      {
+	dist = be-bs;
+      }
+    else if (match == deletion)
+      {
+	dist = ae-as;
+      }
+    else if (match == flip)        dist = 1;
+    else if (match == permutation) dist = ae-as-1;
+    if (match == mismatch)
+      {
+	// work on copies: toLower() and strip_accents() modify their object
+	UnicodeString ax(a,as,ae-as);
+	UnicodeString bx(b,bs,be-bs);
+	if (ax.toLower() == bx.toLower())
+	  match = cap;
+	else
+	  {
+	    strip_accents(ax);
+	    strip_accents(bx);
+	    if (ax == bx) match = accent;
+	  }
+      }
+  }
+  // Aligns a with b and stores the result as a list of segments (difflist)
+  // plus one counter per match type (diffcnt).
+  //   force  if true, an existing alignment is discarded and recomputed;
+  //          otherwise an existing alignment is kept and nothing happens.
+  // Always returns 0.
+  //
+  // If a or b is empty there is nothing to align: the whole of the other
+  // string becomes one insertion or deletion segment.  Without this special
+  // case the alignment matrix has no cells and the backtrace below would
+  // index out of range.
+  size_t
+  StringDiff::
+  align(bool force)
+  {
+    if (force) difflist.clear();
+    if (difflist.size()) return 0;
+    if (a.length() == 0 || b.length() == 0)
+      {
+        diffcnt.assign(noinit,0);
+        if (a.length() || b.length())
+          {
+            difflist.push_back(Segment(0,a.length(),0,b.length(),a,b));
+            ++diffcnt[difflist.back().match];
+          }
+        return 0;
+      }
+    vector<vector<float> > M;
+    fillAlignmentMatrix(M);
+    // now backtrack
+    // first pass: prefer moving up in a before moving left in b;
+    // A[i] is the position in b that a[i] is matched with (-1: none), and
+    // B[j] is the reverse mapping
+    int i = a.length() - 1;
+    int j = b.length() - 1;
+    vector<int> A(a.length(), -1);
+    vector<int> B(b.length(), -1);
+    while (i + j)
+      {
+        while (i && M[i-1][j] == M[i][j]) --i;
+        while (j && M[i][j-1] == M[i][j]) --j;
+        if (a[i] == b[j]) { A[i] = j; B[j] = i; }
+        if (i) --i;
+        if (j) --j;
+      }
+    // second pass: same backtrace, but prefer moving left in b first
+    i = a.length() - 1;
+    j = b.length() - 1;
+    vector<int> A2(a.length(), -1);
+    vector<int> B2(b.length(), -1);
+    while (i + j)
+      {
+        while (j && M[i][j-1] == M[i][j]) --j;
+        while (i && M[i-1][j] == M[i][j]) --i;
+        if (a[i] == b[j]) { A2[i] = j; B2[j] = i; }
+        if (i) --i;
+        if (j) --j;
+      }
+    // combine both passes: the element-wise minimum is -1 (unmatched)
+    // wherever either of the two passes left the position unmatched
+    for (size_t k = 0; k < A.size(); ++k)
+      A[k] = min(A[k],A2[k]);
+    for (size_t k = 0; k < B.size(); ++k)
+      B[k] = min(B[k],B2[k]);
+    // both backtraces stop at i == j == 0 without looking at that cell
+    if (a[i] == b[j]) { A[i] = j; B[j] = i; }
+    // turn the two mappings into a list of segments, left to right
+    i = 0;
+    j = 0;
+    size_t I, J;
+    while (i < a.length() and j < b.length())
+      {
+        if (A[i] < 0)
+          {
+            // a[i] is unmatched: take the whole unmatched run a[i..I) ...
+            I = i + 1;
+            while (I < A.size() and A[I] < 0) ++I;
+            // ... together with the unmatched stretch of b next to it
+            if (i)
+              { for (J = j = A[i-1]+1; J < B.size() && B[J] < 0; ++J); }
+            else if (I < A.size())
+              { for (j = J = A[I]; j && B[j-1] < 0; --j); }
+            else J = B.size();
+            difflist.push_back(Segment(i,I,j,J,a,b));
+            i = I; j = J;
+          }
+        else if (B[j] < 0)
+          {
+            // only b has unmatched material here
+            for (J = j + 1; J < B.size() && B[J] < 0; ++J);
+            difflist.push_back(Segment(i,i,j,J,a,b));
+            j = J;
+          }
+        else
+          {
+            // run of positions that are matched on both sides
+            I = i;
+            J = j;
+            while(I < A.size() && A[I] >= 0 && J < B.size() && B[J] >= 0)
+              { ++I; ++J; }
+            difflist.push_back(Segment(i,I,j,J,a,b));
+            i = I; j = J;
+          }
+      }
+    // whatever is left over at the end of either string
+    if (i < a.length() || j < b.length())
+      difflist.push_back(Segment(i,a.length(),j,b.length(),a,b));
+    // count the match types; insertions and deletions that only repeat a
+    // letter next to them are relabelled as duplications first
+    diffcnt.assign(noinit,0);
+    for (size_t d = 0; d < difflist.size(); ++d)
+      {
+        Segment & s = difflist[d];
+        if (s.match == insertion and
+            ((s.start_a and a[s.start_a - 1] == b[s.start_b]) or
+             (s.end_a < a.length() and a[s.end_a] == b[s.start_b])))
+          {
+            bool sameletter = true;
+            for (int k = s.start_b + 1; sameletter and k < s.end_b; ++k)
+              sameletter = b[k] == b[k-1];
+            if (sameletter) s.match = duplication;
+          }
+        else if (s.match == deletion and
+                 ((s.start_b and b[s.start_b - 1] == a[s.start_a]) or
+                  (s.end_b < b.length() and b[s.end_b] == a[s.start_a])))
+          {
+            bool sameletter = true;
+            for (int k = s.start_a + 1; sameletter and k < s.end_a; ++k)
+              sameletter = a[k] == a[k-1];
+            if (sameletter) s.match= duplication;
+          }
+        ++diffcnt[s.match];
+      }
+    return 0;
+  }
+  // Writes one line per segment to out: the text of the segment in a, the
+  // text of the segment in b (both as UTF-8), the label of the match type,
+  // the distance, and the number of characters of a that follow the segment.
+  // The last of these is computed for all segments except the final one, for
+  // which 0 is printed.  Runs align() first if there are no segments yet.
+  void
+  StringDiff::
+  showDiff(std::ostream& out)
+  {
+    if (difflist.size() == 0) align();
+    vector<size_t> fromEnd(difflist.size(),0);
+    for (int d = int(difflist.size()) - 2; d >= 0; --d)
+      fromEnd[d] = a.length() - difflist[d].end_a;
+    size_t d = 0;
+    while (d < difflist.size())
+      {
+        Segment const& seg = difflist[d];
+        UnicodeString partA, partB;
+        a.extract(seg.start_a, seg.end_a - seg.start_a, partA);
+        b.extract(seg.start_b, seg.end_b - seg.start_b, partB);
+        string utf8A, utf8B;
+        partA.toUTF8String(utf8A);
+        partB.toUTF8String(utf8B);
+        out << utf8A << " " << utf8B << " ";
+        out << seg.label() << " " << seg.dist << " " << fromEnd[d] << endl;
+        ++d;
+      }
+  }
+  // Returns the printable name of this segment's match type (see elabel).
+  char const*
+  StringDiff::
+  Segment::
+  label() const
+  {
+    return elabel[this->match];
+  }
+  // Returns the i-th segment of the diff list; uses at(), i.e. an index that
+  // is out of range raises std::out_of_range.
+  StringDiff::Segment const&
+  StringDiff::
+  operator[](uint32_t const i) const
+  {
+    return difflist.at(i);
+  }
+  // Returns the per-match-type segment counts that align() filled in; the
+  // vector is indexed by MATCHTYPE value.
+  vector<int> const&
+  StringDiff::
+  getFeatures() const
+  {
+    return diffcnt;
+  }
+}

mosesdecoder/moses/TranslationModel/UG/generic/stringdist/ug_stringdist.h ADDED Viewed

	@@ -0,0 +1,83 @@

+//-*- c++ -*-
+#pragma once
+// string distance measures
+// Code by Ulrich Germann
+#include<iostream>
+#include <unicode/stringpiece.h>
+#include <unicode/translit.h>
+#include <unicode/utypes.h>
+#include <unicode/unistr.h>
+#include <unicode/uchar.h>
+#include <unicode/utf8.h>
+#include <vector>
+#include "moses/TranslationModel/UG/mm/tpt_typedefs.h"
+namespace stringdist
+{
+  // Edit distance between the UTF-16 sequences a[0..lenA) and b[0..lenB).
+  float
+  levenshtein(UChar const* a, size_t const lenA,
+	      UChar const* b, size_t const lenB);
+  // Removes accents (combining marks) from trg in place; returns an ICU
+  // status code.
+  UErrorCode strip_accents(UnicodeString & trg);
+  // Fills M (lenA rows, lenB columns) with longest-common-subsequence scores
+  // for all prefix pairs of a and b and returns the score of the last cell.
+  float
+  fillAlignmentMatrix(UChar const* a, size_t const lenA,
+		      UChar const* b, size_t const lenB,
+		      std::vector<std::vector<float> > & M);
+  // Character-level diff of two strings a and b.  align() splits the pair
+  // into consecutive segments, each of which is classified with a MATCHTYPE
+  // and carries a distance.
+  class StringDiff
+  {
+  public:
+    // The order of the enumerators must match Segment::elabel[].
+    enum MATCHTYPE
+      {
+        same,        // a and b are identical
+        cap,         // a and b differ only in capitalization
+        flip,        // two-letter flip
+        permutation, // a and b have same letters but in different order
+        accent,      // a and b are the same basic letters, ignoring accents
+        duplication, // insertion or deletion that only repeats a neighbouring letter
+        insertion,   // a is empty
+        deletion,    // b is empty
+        mismatch,    // none of the above
+        noinit       // not initialized
+      };
+    // One aligned stretch: a[start_a..end_a) corresponds to b[start_b..end_b).
+    struct Segment
+    {
+      static char const* elabel[];  // printable names, indexed by MATCHTYPE
+      int start_a, end_a;
+      int start_b, end_b;
+      MATCHTYPE match;
+      float      dist;
+      Segment();
+      Segment(size_t const as, size_t const ae,
+              size_t const bs, size_t const be,
+              UnicodeString const& a,
+              UnicodeString const& b);
+      char const* label() const;
+    };
+  private:
+    UnicodeString a,b;
+    std::vector<Segment> difflist;  // segments, in left-to-right order
+    std::vector<int> diffcnt;       // number of segments per MATCHTYPE
+  public:
+    UnicodeString const& set_a(std::string const& a);
+    UnicodeString const& set_b(std::string const& b);
+    UnicodeString const& get_a() const;
+    UnicodeString const& get_b() const;
+    StringDiff(std::string const& a, std::string const& b);
+    StringDiff();
+    size_t size();  // number of segments
+    // (Re)computes the segment list; keeps an existing one unless force is
+    // set.  The current implementation always returns 0.
+    size_t align(bool force=false);
+    void showDiff(std::ostream& out);
+    // NOTE(review): ug_stringdist.cc only contains a commented-out draft of
+    // this member, so calling it presumably fails at link time -- confirm.
+    float levenshtein();
+    Segment const& operator[](uint32_t i) const;
+    void fillAlignmentMatrix(std::vector<std::vector<float> > & M) const;
+    // Per-MATCHTYPE segment counts computed by align().
+    std::vector<int> const& getFeatures() const;
+  };
+}

mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.cc ADDED Viewed

	@@ -0,0 +1,31 @@

+#include "ug_thread_pool.h"
+namespace ug {
+// Starts num_workers threads that all execute m_service.run().  The work
+// object m_busywork keeps run() from returning while no job is queued.
+// The threads are created through the thread group, which owns them from
+// the start; this avoids a raw 'new' whose result could leak if handing the
+// thread over to the group failed.  The vector m_workers is not used by
+// this class any more and is therefore left empty.
+ThreadPool::
+ThreadPool(size_t const num_workers)
+  : m_service(), m_busywork(new boost::asio::io_service::work(m_service))
+{
+  for (size_t i = 0; i < num_workers; ++i)
+    m_pool.create_thread(boost::bind(&service_t::run, &m_service));
+}
+// Shuts the pool down.  The order matters: releasing the work object first
+// lets io_service::run() return once the queued jobs are done, so that
+// join_all() can complete; the service is stopped only after all workers
+// have finished.
+ThreadPool::
+~ThreadPool()
+{
+  m_busywork.reset();
+  m_pool.join_all();
+  m_service.stop();
+}
+}

mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_pool.h ADDED Viewed

	@@ -0,0 +1,30 @@

+// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil -*-
+#pragma once
+#include <boost/asio.hpp>
+#include <boost/bind.hpp>
+#include <boost/thread.hpp>
+#include <boost/scoped_ptr.hpp>
+#include <boost/shared_ptr.hpp>
+#include <iostream>
+#include <vector>
+#include <string>
+namespace ug {
+// Pool of worker threads around a boost::asio::io_service: jobs are posted
+// to the service and executed by whichever worker picks them up.
+class ThreadPool
+{
+  typedef boost::asio::io_service service_t;
+  service_t m_service;
+  boost::thread_group m_pool;
+  // Keeps the service's run() from returning while the job queue is empty.
+  boost::scoped_ptr<service_t::work>  m_busywork;
+  std::vector<boost::shared_ptr<boost::thread> > m_workers;
+public:
+  // Creates the pool and starts num_workers worker threads.
+  ThreadPool(size_t const num_workers);
+  // Waits for the workers to finish before the pool goes away.
+  ~ThreadPool();
+  // Queues job for execution.  The job is taken by non-const reference, so
+  // temporaries cannot be passed directly.
+  template<class callable>
+  void add(callable& job) { m_service.post(job); }
+}; // end of class declaration ThreadPool
+} // end of namespace ug

mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.cc ADDED Viewed

	@@ -0,0 +1,51 @@

+// #include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
+#include "ug_thread_safe_counter.h"
+// obsolete once <atomic> can be assumed to be available everywhere
+namespace Moses
+{
+  // Starts counting at 0.
+  ThreadSafeCounter::
+  ThreadSafeCounter()
+    : ctr(0)
+  { }
+  // Pre-increment: adds 1 under the lock and returns the new value.
+  size_t
+  ThreadSafeCounter::
+  operator++()
+  {
+    boost::lock_guard<boost::mutex> guard(this->lock);
+    return ++ctr;
+  }
+  // Post-increment: adds 1 under the lock and returns the previous value.
+  size_t
+  ThreadSafeCounter::
+  operator++(int foo)
+  {
+    boost::lock_guard<boost::mutex> guard(this->lock);
+    return ctr++;
+  }
+  // Returns the current value.
+  // NOTE(review): unlike the modifying operators, this read does not take
+  // the lock -- confirm that an unsynchronized read is acceptable here.
+  ThreadSafeCounter::
+  operator size_t() const
+  {
+    return ctr;
+  }
+  // Pre-decrement: subtracts 1 under the lock and returns the new value.
+  size_t
+  ThreadSafeCounter::
+  operator--()
+  {
+    boost::lock_guard<boost::mutex> guard(this->lock);
+    return --ctr;
+  }
+  // Post-decrement: subtracts 1 under the lock and returns the previous value.
+  size_t
+  ThreadSafeCounter::
+  operator--(int foo)
+  {
+    boost::lock_guard<boost::mutex> guard(this->lock);
+    return ctr--;
+  }

mosesdecoder/moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h ADDED Viewed

	@@ -0,0 +1,21 @@

+#pragma once
+#include <boost/thread.hpp>
+namespace Moses
+{
+  // Counter whose increment and decrement operators are serialized with a
+  // mutex.
+  class ThreadSafeCounter
+  {
+    size_t ctr;
+    boost::mutex lock;  // guards modifications of ctr
+  public:
+    ThreadSafeCounter();       // starts at 0
+    size_t operator++();       // returns the new value
+    size_t operator++(int);    // returns the previous value
+    size_t operator--();       // returns the new value
+    size_t operator--(int);    // returns the previous value
+    operator size_t() const;   // current value
+  };

mosesdecoder/moses/TranslationModel/UG/mm/Makefile.x ADDED Viewed

	@@ -0,0 +1,105 @@

+# Some systems apparently distinguish between shell
+# variables and environment variables. The latter are
+# visible to the make utility, the former apparently not,
+# so we need to set them if they are not defined yet
+# ===============================================================================
+# COMPILATION PREFERENCES
+# ===============================================================================
+# CCACHE: if set to ccache, use ccache to speed up compilation
+# OPTI:   optimization level
+# PROF:   profiler switches
+CCACHE  = ccache
+OPTI    = 3
+EXE_TAG = exe
+PROF =
+# PROF = -g -pg
+# ===============================================================================
+SHELL         = bash
+MAKEFLAGS    += --warn-undefined-variables
+.DEFAULT_GOAL = all
+.SUFFIXES:
+# ===============================================================================
+# COMPILATION 'LOCALIZATION'
+HOST     ?= $(shell hostname)
+HOSTTYPE ?= $(shell uname -m)
+KERNEL    = $(shell uname -r)
+# Root of the Moses source tree; every Moses path below is derived from it,
+# so overriding MOSES_ROOT is enough to build against another checkout.
+MOSES_ROOT ?= ${HOME}/code/mosesdecoder
+WDIR        = build/${HOSTTYPE}/${KERNEL}/${OPTI}
+VPATH       = ${MOSES_ROOT}/
+CXXFLAGS    = ${PROF} -ggdb -Wall -O${OPTI} ${INCLUDES}
+CXXFLAGS   += -DMAX_NUM_FACTORS=4
+CXXFLAGS   += -DKENLM_MAX_ORDER=5
+modirs     := $(addprefix -I,$(shell find ${MOSES_ROOT}/moses ${MOSES_ROOT}/contrib -type d))
+CXXFLAGS   += -I${MOSES_ROOT}
+INCLUDES    =
+BZLIB       =
+BOOSTLIBTAG =
+REQLIBS = m z pthread lzma ${BZLIB} \
+	boost_thread${BOOSTLIBTAG} \
+	boost_iostreams${BOOSTLIBTAG} \
+	boost_program_options${BOOSTLIBTAG} \
+	boost_system${BOOSTLIBTAG} \
+	boost_filesystem${BOOSTLIBTAG}
+# Currently disabled ICU libraries: icuuc icuio icui18n
+# (This comment must not end in a backslash: make would then treat the
+# following line, i.e. the definition of LIBS, as part of the comment.)
+LIBS     = $(addprefix -l, ${REQLIBS} moses)
+LIBDIRS   = -L${MOSES_ROOT}/lib
+LIBDIRS  += -L${HOME}/lib
+PREFIX ?= .
+BINDIR ?= ${PREFIX}/bin
+# Binaries of unoptimized builds get the prefix 'debug.'
+ifeq "$(OPTI)" "0"
+BINPREF = debug.
+else
+BINPREF =
+endif
+OBJ2 :=
+# $(call compile,<source file>) generates the rule that compiles one source
+# file into ${WDIR} and registers the matching dependency file in DEP.
+define compile
+DEP  += ${WDIR}/$(basename $(notdir $1)).d
+${WDIR}/$(basename $(notdir $1)).o : $1 $(wildcard $(basename $1).h)
+	@echo -e "COMPILING $1"
+	@mkdir -p $$(@D)
+	${CXX} ${CXXFLAGS} -MD -MP -c $$(abspath $$<) -o $$@
+endef
+testprogs = test-dynamic-im-tsa
+programs  = mtt-build mtt-dump symal2mam custom-pt mmlex-build ${testprogs}
+programs += mtt-count-words calc-coverage
+# These targets are commands, not files: keep them working even if a file or
+# directory with the same name exists.
+.PHONY: all clean custom-pt
+all: $(addprefix ${BINDIR}/${BINPREF}, $(programs))
+	@echo $^
+clean:
+	rm -f ${WDIR}/*.o ${WDIR}/*.d
+custom-pt: ${BINDIR}/${BINPREF}custom-pt
+	echo $^
+# OBJ: objects of every .cc file in this directory that has a header of the
+#      same name, plus the generic UG sources; linked into every program.
+# EXE: objects of the remaining .cc files, i.e. the programs themselves.
+INMOGEN = $(wildcard ${MOSES_ROOT}/moses/TranslationModel/UG/generic/*/*.cpp)
+OBJ     = $(patsubst %.cc,%.o,$(wildcard $(patsubst %.h,%.cc,$(wildcard *.h))))
+OBJ    += $(patsubst %.cpp,%.o,${INMOGEN})
+EXE     = $(patsubst %.cc,%.o,$(filter-out $(patsubst %.h,%.cc,$(wildcard *.h)),$(wildcard *.cc)))
+$(foreach cpp,${INMOGEN},$(eval $(call compile,${cpp})))
+$(foreach cpp,$(wildcard *.cc),$(eval $(call compile,${cpp})))
+$(addprefix ${BINDIR}/${BINPREF}, $(programs)): $(addprefix ${WDIR}/,$(notdir ${OBJ}))
+$(addprefix ${BINDIR}/${BINPREF}, $(programs)): ${MOSES_ROOT}/lib/libmoses.a
+${BINDIR}/${BINPREF}%: ${WDIR}/%.o
+	echo PREREQS: $<
+	$(CXX) $(CXXFLAGS) -o $@ $^ ${LIBDIRS} ${LIBS}
+.SECONDARY:
+-include $(DEP)

mosesdecoder/moses/TranslationModel/UG/mm/calc-coverage.cc ADDED Viewed

	@@ -0,0 +1,57 @@

+#include "moses/TranslationModel/UG/mm/ug_mm_ttrack.h"
+#include "moses/TranslationModel/UG/mm/ug_mm_tsa.h"
+#include "moses/TranslationModel/UG/mm/tpt_tokenindex.h"
+#include "moses/TranslationModel/UG/mm/ug_corpus_token.h"
+#include "moses/TranslationModel/UG/mm/ug_typedefs.h"
+#include "moses/TranslationModel/UG/mm/tpt_pickler.h"
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
+#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
+// using namespace Moses;
+using namespace ugdiss;
+using namespace sapt;
+using namespace std;
+typedef L2R_Token<SimpleWordId> Token;
+TokenIndex V;
+SPTR<vector<vector<Token> > > C(new vector<vector<Token> >());
+// Reads fname line by line and appends every line, converted to a token
+// sequence over the vocabulary V, to the global corpus C.
+void
+add_file(string fname)
+{
+  boost::iostreams::filtering_istream in;
+  open_input_stream(fname,in);
+  for (string line; getline(in,line); )
+    {
+      C->push_back(vector<Token>());
+      vector<Token>& sentence = C->back();
+      fill_token_seq(V,line,sentence);
+    }
+}
+// Usage: calc-coverage <text file>
+// Indexes the given file in memory and then reads lines from standard input.
+// For every token position of an input line it prints one output line: the
+// token, followed by the approximate occurrence count of each phrase that
+// starts at this position and is found in the indexed file.
+int
+main(int argc, char* argv[])
+{
+  // The file to index is mandatory; without it argv[1] must not be touched.
+  if (argc < 2)
+    {
+      cerr << "usage: " << (argc > 0 ? argv[0] : "calc-coverage")
+           << " <text file>" << endl;
+      return 1;
+    }
+  V.setDynamic(true);
+  add_file(argv[1]);
+  SPTR<imTtrack<Token> > T(new imTtrack<Token>(C));
+  imTSA<Token> I(T,NULL,NULL);
+  string line;
+  while (getline(cin,line))
+    {
+      vector<Token> seq; fill_token_seq<Token>(V,line,seq);
+      for (size_t i = 0; i < seq.size(); ++i)
+        {
+          TSA<Token>::tree_iterator m(&I);
+          cout << V[seq[i].id()];
+          // extend the match one token at a time for as long as it is found
+          for (size_t k = i; k < seq.size() && m.extend(seq[k]); ++k)
+            {
+              cout << " ";
+              if (k > i) cout << V[seq[k].id()] << " ";
+              cout << "[" << m.approxOccurrenceCount() << "]";
+            }
+          cout << endl;
+        }
+    }
+}

mosesdecoder/moses/TranslationModel/UG/mm/mmlex-build.cc ADDED Viewed

	@@ -0,0 +1,348 @@

+// -*- c++ -*-
+// Program to extract word cooccurrence counts from a memory-mapped
+// word-aligned bitext; stores the count lexicon in the format for
+// mm2dTable<uint32_t> (ug_mm_2d_table.h)
+//
+// (c) 2010-2012 Ulrich Germann
+// to do: multi-threading
+#include <queue>
+#include <iomanip>
+#include <vector>
+#include <iterator>
+#include <sstream>
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <boost/dynamic_bitset.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/foreach.hpp>
+#include <boost/thread.hpp>
+#include <boost/math/distributions/binomial.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
+#include "moses/Util.h"
+#include "ug_mm_2d_table.h"
+#include "ug_mm_ttrack.h"
+#include "ug_corpus_token.h"
+using namespace std;
+using namespace sapt;
+using namespace ugdiss;
+using namespace boost::math;
+typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> LEX_t;
+typedef SimpleWordId Token;
+// DECLARATIONS
+void interpret_args(int ac, char* av[]);
+// Memory-mapped tracks: T1 and T2 hold the sentences of the two sides of the
+// bitext (same sentence ids), Tx holds per-sentence byte data from which the
+// alignment links are read.
+mmTtrack<Token> T1,T2;
+mmTtrack<char>     Tx;
+// Vocabularies of the two sides.
+TokenIndex      V1,V2;
+// Pair of word ids: (id on side 1, id on side 2).
+typedef pair<id_type,id_type> wpair;
+// The two counts kept for a word pair; writeTable() writes 'a' to the
+// alignment table and 'c' to the cooccurrence table.
+struct Count
+{
+  uint32_t a;
+  uint32_t c;
+  Count() : a(0), c(0) {};
+  Count(uint32_t ax, uint32_t cx) : a(ax), c(cx) {}
+};
+// Orders (word id, Count) entries by word id only; the counts are ignored.
+bool
+operator<(pair<id_type,Count> const& a,
+	  pair<id_type,Count> const& b)
+{
+  return a.first < b.first;
+}
+// Counts per word pair, as collected while processing sentences.
+typedef boost::unordered_map<wpair,Count> countmap_t;
+// For every id of side 1: list of (id of side 2, Count) entries.
+typedef vector<vector<pair<id_type,Count> > > countlist_t;
+// One count list per worker; writeTable() merges them.
+vector<countlist_t> XLEX;
+// Worker that counts word pairs in the sentences offset, offset+skip,
+// offset+2*skip, ... and stores the result, sorted, in the count list it was
+// given.
+class Counter
+{
+public:
+  countmap_t  CNT;    // counts collected by this worker
+  countlist_t & LEX;  // output; must outlive the Counter
+  size_t  offset;     // id of the first sentence to process
+  size_t    skip;     // distance between two processed sentence ids
+  Counter(countlist_t& lex, size_t o, size_t s)
+    : LEX(lex), offset(o), skip(s) {}
+  void processSentence(id_type sid);
+  void operator()();
+};
+// Program-wide settings.
+// NOTE(review): presumably filled in by interpret_args() -- confirm.
+string bname,cfgFile,L1,L2,oname,cooc;
+int    verbose;
+// Upper bound on the sentence ids that are processed (see Counter::operator()).
+size_t truncat;
+// Number of per-worker count lists in XLEX that writeTable() merges.
+size_t num_threads;
+// Processes this worker's share of the sentences (every skip-th sentence
+// starting at offset, limited by truncat and by the size of T1), then moves
+// the collected counts from CNT into LEX, grouped by the id of the first
+// word and sorted by the id of the second word.
+void
+Counter::
+operator()()
+{
+  size_t const stop = min(truncat,T1.size());
+  size_t sid = offset;
+  while (sid < stop)
+    {
+      processSentence(sid);
+      sid += skip;
+    }
+  LEX.resize(V1.ksize());
+  countmap_t::const_iterator entry = CNT.begin();
+  while (entry != CNT.end())
+    {
+      wpair const& words = entry->first;
+      LEX.at(words.first).push_back(pair<id_type,Count>(words.second,entry->second));
+      ++entry;
+    }
+  for (size_t id1 = 0; id1 < LEX.size(); ++id1)
+    sort(LEX[id1].begin(),LEX[id1].end());
+}
+// Heap comparator for merging the per-worker count lists of one word (wid).
+// A heap entry (first, second) refers to v[first][wid][second].  Entries are
+// compared by the word id stored there, using '>', so that the standard heap
+// functions keep the entry with the smallest word id at the front.
+struct lexsorter
+{
+  vector<countlist_t> const& v;
+  id_type wid;
+  lexsorter(vector<countlist_t> const& vx, id_type widx)
+    : v(vx),wid(widx) {}
+  bool operator()(pair<uint32_t,uint32_t> const& a,
+		  pair<uint32_t,uint32_t> const& b) const
+  {
+    return (v.at(a.first).at(wid).at(a.second).first >
+	    v.at(b.first).at(wid).at(b.second).first);
+  }
+};
+// Writes the table header to out: a placeholder (0) for the file position of
+// the index, followed by the sizes of the two vocabularies.  writeTable()
+// overwrites the placeholder once the position of the index is known.
+void
+writeTableHeader(ostream& out)
+{
+  filepos_type idxOffset=0;
+  tpt::numwrite(out,idxOffset); // blank for the time being
+  tpt::numwrite(out,id_type(V1.ksize()));
+  tpt::numwrite(out,id_type(V2.ksize()));
+}
+// Merges the per-worker count lists in XLEX and writes the result as up to
+// two tables: alignment counts to aln_out and cooccurrence counts to coc_out.
+// Either stream may be NULL, in which case that table is skipped.
+//
+// Layout of each table, in writing order: header (see writeTableHeader),
+// then for every id1 its cells (id2, count) in ascending order of id2, then
+// the index (for every id1 the number of cells written before its row, plus
+// one final entry with the total), then the marginal counts for side 1 and
+// side 2.  Finally the position of the index is written over the placeholder
+// at the start of the file.
+// Cells with a cooccurrence count of 0 are left out of the cooccurrence
+// table; the alignment table gets a cell for every word pair.
+void writeTable(ostream* aln_out, ostream* coc_out)
+{
+  vector<uint32_t> m1a(V1.ksize(),0); // marginals L1
+  vector<uint32_t> m2a(V2.ksize(),0); // marginals L2
+  vector<uint32_t> m1c(V1.ksize(),0); // marginals L1
+  vector<uint32_t> m2c(V2.ksize(),0); // marginals L2
+  vector<id_type> idxa(V1.ksize()+1,0);
+  vector<id_type> idxc(V1.ksize()+1,0);
+  if (aln_out) writeTableHeader(*aln_out);
+  if (coc_out) writeTableHeader(*coc_out);
+  size_t CellCountA=0,CellCountC=0;
+  for (size_t id1 = 0; id1 < V1.ksize(); ++id1)
+    {
+      idxa[id1] = CellCountA;
+      idxc[id1] = CellCountC;
+      // H is a heap of (worker, position) cursors, one for every worker that
+      // has entries for id1; the cursor at the smallest id2 is at the front.
+      lexsorter sorter(XLEX,id1);
+      vector<pair<uint32_t,uint32_t> > H; H.reserve(num_threads);
+      for (size_t i = 0; i < num_threads; ++i)
+	{
+	  if (id1 < XLEX.at(i).size() && XLEX[i][id1].size())
+	    H.push_back(pair<uint32_t,uint32_t>(i,0));
+	}
+      if (!H.size()) continue;
+      make_heap(H.begin(),H.end(),sorter);
+      while (H.size())
+	{
+	  // take the entry with the smallest id2 ...
+	  id_type  id2 = XLEX[H[0].first][id1][H[0].second].first;
+	  uint32_t aln = XLEX[H[0].first][id1][H[0].second].second.a;
+	  uint32_t coc = XLEX[H[0].first][id1][H[0].second].second.c;
+	  // ... and advance its cursor; a cursor that has reached the end of
+	  // its list is dropped from the heap
+	  pop_heap(H.begin(),H.end(),sorter);
+	  ++H.back().second;
+	  if (H.back().second == XLEX[H.back().first][id1].size())
+	    H.pop_back();
+	  else
+	    push_heap(H.begin(),H.end(),sorter);
+	  // add up the counts that other workers have for the same id2
+	  while (H.size() &&
+		 XLEX[H[0].first][id1].at(H[0].second).first == id2)
+	    {
+	      aln += XLEX[H[0].first][id1][H[0].second].second.a;
+	      coc += XLEX[H[0].first][id1][H[0].second].second.c;
+	      pop_heap(H.begin(),H.end(),sorter);
+	      ++H.back().second;
+	      if (H.back().second == XLEX[H.back().first][id1].size())
+		H.pop_back();
+	      else
+		push_heap(H.begin(),H.end(),sorter);
+	    }
+	  if (aln_out)
+	    {
+	      ++CellCountA;
+	      tpt::numwrite(*aln_out,id2);
+	      tpt::numwrite(*aln_out,aln);
+	      m1a[id1] += aln;
+	      m2a[id2] += aln;
+	    }
+	  if (coc_out && coc)
+	    {
+	      ++CellCountC;
+	      tpt::numwrite(*coc_out,id2);
+	      tpt::numwrite(*coc_out,coc);
+	      m1c[id1] += coc;
+	      m2c[id2] += coc;
+	    }
+	}
+    }
+  idxa.back() = CellCountA;
+  idxc.back() = CellCountC;
+  if (aln_out)
+    {
+      // index and marginals follow the cells; the marginals are written as
+      // raw 4-byte values
+      filepos_type idxOffsetA = aln_out->tellp();
+      BOOST_FOREACH(id_type foo, idxa)
+	tpt::numwrite(*aln_out,foo);
+      aln_out->write(reinterpret_cast<char const*>(&m1a[0]),m1a.size()*4);
+      aln_out->write(reinterpret_cast<char const*>(&m2a[0]),m2a.size()*4);
+      // fill in the placeholder at the start of the file
+      aln_out->seekp(0);
+      tpt::numwrite(*aln_out,idxOffsetA);
+    }
+  if (coc_out)
+    {
+      filepos_type idxOffsetC = coc_out->tellp();
+      BOOST_FOREACH(id_type foo, idxc)
+	tpt::numwrite(*coc_out,foo);
+      coc_out->write(reinterpret_cast<char const*>(&m1c[0]),m1c.size()*4);
+      coc_out->write(reinterpret_cast<char const*>(&m2c[0]),m2c.size()*4);
+      coc_out->seekp(0);
+      tpt::numwrite(*coc_out,idxOffsetC);
+    }
+}
+// Tallies the alignment links of sentence pair <sid> into CNT.
+// Each link (r,c) read from the alignment track Tx increments the .a
+// count of the pair (id of L1 token r, id of L2 token c).  Tokens that
+// occur in no link are afterwards counted as paired with id 0 on the
+// other side.  The UTIL_THROW_IF2 checks reject links that point outside
+// either sentence.
+void
+Counter::
+processSentence(id_type sid)
+{
+  Token const* const beg1 = T1.sntStart(sid);
+  Token const* const end1 = T1.sntEnd(sid);
+  Token const* const beg2 = T2.sntStart(sid);
+  Token const* const end2 = T2.sntEnd(sid);
+  // one bit per token; a bit that is still set after the link loop marks
+  // a token without any link
+  bitvector unlinked1(T1.sntLen(sid)); unlinked1.set();
+  bitvector unlinked2(T2.sntLen(sid)); unlinked2.set();
+  char const* cur  = Tx.sntStart(sid);
+  char const* stop = Tx.sntEnd(sid);
+  ushort r,c;
+  if (verbose && sid % 1000000 == 0)
+    cerr << sid/1000000 << " M sentences processed" << endl;
+  // count links
+  for (; cur < stop; )
+    {
+      cur = tpt::binread(cur,r);
+      cur = tpt::binread(cur,c);
+      UTIL_THROW_IF2(r >= unlinked1.size(), "out of bounds at line " << sid);
+      UTIL_THROW_IF2(c >= unlinked2.size(), "out of bounds at line " << sid);
+      UTIL_THROW_IF2(beg1+r >= end1, "out of bounds at line " << sid);
+      UTIL_THROW_IF2(beg2+c >= end2, "out of bounds at line " << sid);
+      unlinked1.reset(r);
+      unlinked2.reset(c);
+      id_type id1 = (beg1+r)->id();
+      id_type id2 = (beg2+c)->id();
+      CNT[wpair(id1,id2)].a++;
+    }
+  // count unaligned words
+  size_t i = unlinked1.find_first();
+  while (i < unlinked1.size())
+    {
+      CNT[wpair((beg1+i)->id(),0)].a++;
+      i = unlinked1.find_next(i);
+    }
+  size_t j = unlinked2.find_first();
+  while (j < unlinked2.size())
+    {
+      CNT[wpair(0,(beg2+j)->id())].a++;
+      j = unlinked2.find_next(j);
+    }
+}
+// Entry point.  Opens the two corpus tracks, the alignment track and the
+// two vocabularies belonging to <bname>, lets num_threads worker threads
+// fill one XLEX shard each, and merges the shards into the output table.
+// Returns 0 on success and 1 if the output file cannot be opened or if
+// writing to it failed.
+int
+main(int argc, char* argv[])
+{
+  interpret_args(argc,argv);
+  // interpret_args() exits when bname is empty, so rbegin() is valid here
+  char c = *bname.rbegin();
+  if (c != '/' && c != '.') bname += '.';
+  T1.open(bname+L1+".mct");
+  T2.open(bname+L2+".mct");
+  Tx.open(bname+L1+"-"+L2+".mam");
+  V1.open(bname+L1+".tdx");
+  V2.open(bname+L2+".tdx");
+  if (!truncat) truncat = T1.size();
+  XLEX.resize(num_threads);
+  vector<boost::shared_ptr<boost::thread> > workers(num_threads);
+  for (size_t i = 0; i < num_threads; ++i)
+    workers[i].reset(new boost::thread(Counter(XLEX[i],i,num_threads)));
+  for (size_t i = 0; i < workers.size(); ++i)
+    workers[i]->join();
+  // cerr << "done counting" << endl;
+  ofstream aln_out,coc_out;
+  if (oname.size())
+    {
+      aln_out.open(oname.c_str());
+      if (!aln_out)
+        {
+          cerr << "FATAL ERROR: cannot open " << oname
+               << " for writing." << endl;
+          return 1;
+        }
+    }
+  // NOTE(review): coc_out is never opened here (the --cooc option is
+  // commented out in interpret_args), so cooc is expected to stay empty.
+  // if (cooc.size())  coc_out.open(cooc.c_str());
+  writeTable(oname.size() ? &aln_out : NULL,
+             cooc.size()  ? &coc_out : NULL);
+  if (oname.size())
+    {
+      aln_out.close();
+      // any failed write or a failed close leaves the stream in a bad state
+      if (!aln_out)
+        {
+          cerr << "FATAL ERROR: writing " << oname << " failed." << endl;
+          return 1;
+        }
+    }
+  // if (cooc.size())  coc_out.close();
+  return 0;
+}
+// Reads the program options (through get_options, which is also given the
+// name of the config-file option, "cfg") into the global settings.
+// Prints the usage text and exits when --help is given, when no base name
+// was supplied, or when neither output name is set.
+// Afterwards num_threads is limited to the number of hardware threads, if
+// that number is known, and is never left at 0.
+void
+interpret_args(int ac, char* av[])
+{
+  namespace po=boost::program_options;
+  po::variables_map vm;
+  po::options_description o("Options");
+  po::options_description h("Hidden Options");
+  po::positional_options_description a;
+  o.add_options()
+    ("help,h",    "print this message")
+    ("cfg,f", po::value<string>(&cfgFile),"config file")
+    ("oname,o", po::value<string>(&oname),"output file name")
+    // ("cooc,c", po::value<string>(&cooc),
+    // "file name for raw co-occurrence counts")
+    ("verbose,v", po::value<int>(&verbose)->default_value(0)->implicit_value(1),
+     "verbosity level")
+    ("threads,t", po::value<size_t>(&num_threads)->default_value(4),
+     "count in <N> parallel threads")
+    ("truncate,n", po::value<size_t>(&truncat)->default_value(0),
+     "truncate corpus to <N> sentences (for debugging)")
+    ;
+  h.add_options()
+    ("bname", po::value<string>(&bname), "base name")
+    ("L1",    po::value<string>(&L1),"L1 tag")
+    ("L2",    po::value<string>(&L2),"L2 tag")
+    ;
+  a.add("bname",1);
+  a.add("L1",1);
+  a.add("L2",1);
+  get_options(ac,av,h.add(o),a,vm,"cfg");
+  if (vm.count("help") || bname.empty() || (oname.empty() && cooc.empty()))
+    {
+      cout << "usage:\n\t" << av[0] << " <basename> <L1 tag> <L2 tag> [-o <output file>] [-c <output file>]\n" << endl;
+      cout << "at least one of -o / -c must be specified." << endl;
+      cout << o << endl;
+      exit(0);
+    }
+  // hardware_concurrency() reports 0 when the core count is unavailable;
+  // clamping to that would leave no worker threads and an empty table.
+  size_t num_cores = boost::thread::hardware_concurrency();
+  if (num_cores) num_threads = min(num_threads,num_cores);
+  if (!num_threads) num_threads = 1; // also covers an explicit "-t 0"
+}

mosesdecoder/moses/TranslationModel/UG/mm/mtt-build.cc ADDED Viewed

	@@ -0,0 +1,498 @@

+// -*- c++ -*-
+// Converts a corpus in text format (plain text, one sentence per line) or
+// conll format or treetagger output format (which one is automatically
+// recognized based on the number of fields per line) into memory-mapped
+// format. (c) 2007-2013 Ulrich Germann
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/options_description.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <boost/iostreams/device/mapped_file.hpp>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include <vector>
+#include <string>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include "ug_conll_record.h"
+#include "tpt_tokenindex.h"
+#include "ug_mm_ttrack.h"
+#include "tpt_pickler.h"
+#include "ug_deptree.h"
+#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
+#include "moses/TranslationModel/UG/mm/ug_im_tsa.h"
+using namespace std;
+using namespace sapt;
+using namespace Moses;
+using namespace boost;
+using namespace boost::algorithm;
+namespace po=boost::program_options;
+int with_pfas; // --pfa/-p: non-zero => build prefix arrays
+int with_dcas; // --dca/-d: non-zero => build dependency chain arrays
+int with_sfas; // --sfa/-s: non-zero => build suffix arrays (option default is 1)
+bool incremental = false; // build / grow vocabs automatically
+bool is_conll    = false; // text or conll format?
+bool quiet       = false; // no progress reporting
+string vocabBase; // base name for existing vocabs that should be used
+string baseName;  // base name for all files
+string tmpFile, mttFile;   /* name of temporary / actual track file
+			    * (.mtt for Conll format, .mct for plain text)
+			    */
+string UNK; // --unk/-u: label for unknown tokens (option default is "UNK")
+TokenIndex SF; // surface form
+TokenIndex LM; // lemma
+TokenIndex PS; // part of speech
+TokenIndex DT; // dependency type
+void interpret_args(int ac, char* av[]); // defined at the end of this file
+// Returns p (converted to uchar) when p < limit, and 1 otherwise.
+inline uchar rangeCheck(int p, int limit)
+{
+  if (p < limit) return p;
+  return 1;
+}
+// Looks up w in T and returns its id.  If the lookup yields id 1 for a
+// word other than UNK, a warning is printed to stderr and assert(0) is
+// hit; when the assert is compiled out, 1 is returned.
+id_type
+get_id(TokenIndex const& T, string const& w)
+{
+  id_type const id = T[w];
+  bool const unexpectedUnk = (id == 1) && (w != UNK);
+  if (!unexpectedUnk) return id;
+  cerr << "Warning! Unkown vocabulary item '" << w << "', but "
+       << "incremental mode (-i) is not set." << endl;
+  assert(0);
+  return id;
+}
+// Prepares vocabulary T.  If the file <fname> exists it is opened with UNK
+// as the unknown label; otherwise only the unknown label is set on T.
+// In incremental mode T is switched to dynamic.  The asserts check that
+// "NULL" has id 0 and UNK has id 1.
+void
+open_vocab(TokenIndex& T, string fname)
+{
+  bool const fileExists = (access(fname.c_str(), F_OK) == 0);
+  if (fileExists)
+    {
+      T.open(fname,UNK);
+      assert(T[UNK] == 1);
+    }
+  else
+    {
+      T.setUnkLabel(UNK);
+    }
+  if (incremental)
+    T.setDynamic(true);
+  assert(T["NULL"] == 0);
+  assert(T[UNK]  == 1);
+}
+// Resizes v to one entry per id of T; entry i gets the token string T[i]
+// and a count of 0.
+void
+ini_cnt_vec(TokenIndex const& T, vector<pair<string,size_t> > & v)
+{
+  size_t const vocabSize = T.totalVocabSize();
+  v.resize(vocabSize);
+  for (size_t id = 0; id != vocabSize; ++id)
+    {
+      pair<string,size_t>& slot = v[id];
+      slot.first  = T[id];
+      slot.second = 0;
+    }
+}
+// Writes vocabulary T to <fname> with renumbered ids.
+//   n2o[i] is the current id of the token that is written with id i.
+// T is closed before the (token string, new id) pairs are sorted and
+// handed to write_tokenindex_to_disk together with the UNK label.
+void
+write_tokenindex(string fname, TokenIndex& T, vector<id_type> const& n2o)
+{
+  if (!quiet) cerr << "Writing " << fname << endl;
+  // (a reverse map of n2o used to be built here; nothing ever read it)
+  vector<pair<string,uint32_t> > v(n2o.size());
+  for (id_type i = 0; i < n2o.size(); ++i)
+    {
+      v[i].first  = T[n2o[i]];
+      v[i].second = i;
+    }
+  T.close();
+  sort(v.begin(),v.end());
+  write_tokenindex_to_disk(v, fname, UNK);
+}
+// Parses the command line and opens the vocabularies below vocabBase:
+// a single ".tdx" for plain text, or the four ".tdx.*" files (surface
+// form, lemma, part of speech, dependency type) for CoNLL input.
+void init(int argc, char* argv[])
+{
+  interpret_args(argc,argv);
+  if (!is_conll)
+    {
+      open_vocab(SF, vocabBase+".tdx"); // surface form
+      return;
+    }
+  open_vocab(SF, vocabBase+".tdx.sfo"); // surface form
+  open_vocab(LM, vocabBase+".tdx.lem"); // lemma
+  open_vocab(PS, vocabBase+".tdx.pos"); // part-of-speech
+  open_vocab(DT, vocabBase+".tdx.drl"); // dependency type
+}
+// Fills rec from the whitespace-separated fields w of one input line.
+//   exactly 3 fields: treetagger output (form, tag, lemma)
+//   8 or more fields: CoNLL (id, form, lemma, two pos tags, ..., head, deprel)
+// NOTE(review): for any other field count rec is left exactly as it was
+// passed in -- confirm that callers never rely on it in that case.
+void fill_rec(Conll_Record& rec, vector<string> const& w)
+{
+  size_t const numFields = w.size();
+  if (numFields == 3) // treetagger output
+    {
+      string const& form  = w[0];
+      string const& tag   = w[1];
+      string const& lemma = w[2];
+      rec.sform  =  get_id(SF, form);
+      rec.lemma  =  get_id(LM, lemma == "<UNKNOWN>" ? form : lemma);
+      rec.majpos =  rangeCheck(get_id(PS, tag), 256);
+      rec.minpos =  rangeCheck(get_id(PS, tag), 256);
+      rec.dtype  =  0;
+      rec.parent = -1;
+      return;
+    }
+  if (numFields < 8) return;
+  // CONLL format
+  int const id  = atoi(w[0].c_str());
+  int const gov = atoi(w[6].c_str());
+  rec.sform  = get_id(SF, w[1]);
+  rec.lemma  = get_id(LM, w[2]);
+  rec.majpos = rangeCheck(get_id(PS, w[3]), 256);
+  rec.minpos = rangeCheck(get_id(PS, w[4]), 256);
+  rec.dtype  = get_id(DT, w[7]);
+  // parent is stored relative to the token's own position; a head of 0
+  // is stored as 0
+  if (gov) rec.parent = gov - id;
+  else     rec.parent = 0;
+}
+// Progress output on stderr: at every multiple of 100000 a new line
+// starting with the count in thousands, at every other multiple of 10000
+// a single dot, nothing otherwise.
+void log_progress(size_t ctr)
+{
+  if (ctr % 10000 != 0) return;
+  if (ctr % 100000 != 0)
+    {
+      cerr << ".";
+      return;
+    }
+  if (ctr) cerr << endl; // terminate the previous row of dots
+  cerr << setw(12) << ctr / 1000 << "K sentences processed ";
+}
+// Reads plain text from stdin, one sentence per line, and writes the SF id
+// of every whitespace-separated token to <out>.
+// s_index receives the running token count at the start of each line plus
+// one final entry holding the total.  Returns the total number of tokens.
+size_t
+process_plain_input(ostream& out, vector<id_type> & s_index)
+{
+  id_type wordCount = 0;
+  string line,tok;
+  while (getline(cin,line))
+    {
+      istringstream tokens(line);
+      if (!quiet) log_progress(s_index.size());
+      s_index.push_back(wordCount);
+      for (; tokens >> tok; ++wordCount)
+        tpt::numwrite(out,get_id(SF,tok));
+    }
+  s_index.push_back(wordCount);
+  return wordCount;
+}
+// Reads tagged input from stdin, one token per line, and writes one
+// Conll_Record per token line to <out>.
+// An empty line or a line whose first field starts with "SID=" marks a
+// sentence boundary; a line consisting only of "<P>" marks a paragraph
+// (and sentence) boundary.  Lines with fewer than 3 fields carry no token.
+// s_index / p_index receive the running token count at the first token of
+// each sentence / paragraph; s_index also gets a final entry with the
+// total.  Returns the total number of tokens.
+size_t
+process_tagged_input(ostream& out,
+                     vector<id_type> & s_index,
+                     vector<id_type> & p_index)
+{
+  Conll_Record rec;
+  bool atSentStart = true;
+  bool atParStart  = true;
+  id_type totalWords = 0;
+  string line;
+  while (getline(cin,line))
+    {
+      vector<string> fields;
+      string f;
+      istringstream buf(line);
+      while (buf>>f) fields.push_back(f);
+      if (fields.empty() || starts_with(fields[0], "SID="))
+        {
+          atSentStart = true;
+        }
+      else if (fields.size() == 1 && fields[0] == "<P>")
+        {
+          atParStart  = true;
+          atSentStart = true;
+        }
+      if (fields.size() < 3) continue;
+      if (atSentStart)
+        {
+          if (!quiet) log_progress(s_index.size());
+          s_index.push_back(totalWords);
+          atSentStart = false;
+        }
+      if (atParStart)
+        {
+          p_index.push_back(totalWords);
+          atParStart = false;
+        }
+      fill_rec(rec,fields);
+      out.write(reinterpret_cast<char const*>(&rec),sizeof(rec));
+      ++totalWords;
+    }
+  s_index.push_back(totalWords);
+  return totalWords;
+}
+// Converts stdin into the temporary track file tmpFile.
+// Layout written: three header numbers (index offset, number of chunks,
+// number of tokens), the token data, then the chunk index.  The header is
+// written as zeros first and overwritten once the real values are known.
+// The paragraph index is used as chunk index if it is non-empty and its
+// last entry is non-zero; otherwise the sentence index is used.
+// Returns the number of tokens written.
+size_t
+numberize()
+{
+  ofstream out(tmpFile.c_str());
+  filepos_type startIdx=0;
+  id_type idxSize=0,totalWords=0;
+  tpt::numwrite(out,startIdx);   // place holder, to be filled at the end
+  tpt::numwrite(out,idxSize);    // place holder, to be filled at the end
+  tpt::numwrite(out,totalWords); // place holder, to be filled at the end
+  vector<id_type> s_index, p_index;
+  totalWords = (is_conll
+                ? process_tagged_input(out,s_index,p_index)
+                : process_plain_input(out,s_index));
+  bool const useParagraphs = !p_index.empty() && p_index.back() != 0;
+  if (useParagraphs) p_index.push_back(totalWords);
+  vector<id_type> const& index = useParagraphs ? p_index : s_index;
+  if (!quiet)
+    cerr << endl << "Writing index ... (" << index.size() << " chunks) ";
+  startIdx = out.tellp();
+  for (vector<id_type>::const_iterator it = index.begin();
+       it != index.end(); ++it)
+    tpt::numwrite(out,*it);
+  out.seekp(0);
+  idxSize = index.size();
+  tpt::numwrite(out, startIdx);
+  tpt::numwrite(out, idxSize - 1); // the index has one more entry than chunks
+  tpt::numwrite(out, totalWords);
+  out.close();
+  if (!quiet) cerr << "done" << endl;
+  return totalWords;
+}
+vector<id_type> smap,lmap,pmap,dmap; // id orderings for SF, LM, PS, DT: filled by conservative_sort() in remap(), passed to write_tokenindex() by save_vocabs()
+// Computes the inverse of the id mapping <from>: afterwards
+// to[from[i]] == i for every index i of <from>.  <to> is resized to
+// from.size() first.
+void
+invert(vector<id_type> const& from, vector<id_type> & to)
+{
+  to.resize(from.size());
+  size_t const n = to.size();
+  for (size_t pos = 0; pos != n; ++pos)
+    {
+      id_type const target = from[pos];
+      to[target] = pos;
+    }
+}
+// Orders the ids of V by occurrence count without touching existing ids:
+// xmap is set to the identity over all ids of V, then only the part from
+// position max(2, V.knownVocabSize()) onwards is sorted, using a
+// VectorIndexSorter over cnt with greater<size_t>.
+void
+conservative_sort(TokenIndex     const & V,
+                  vector<size_t> const & cnt,
+                  vector<id_type>      & xmap)
+{
+  size_t const total = V.totalVocabSize();
+  xmap.resize(total);
+  for (size_t id = 0; id < total; ++id)
+    xmap[id] = id;
+  // ids below this position keep their place
+  id_type firstMovable = V.knownVocabSize();
+  if (firstMovable < id_type(2)) firstMovable = 2;
+  VectorIndexSorter<size_t,greater<size_t>, id_type> byCount(cnt);
+  sort(xmap.begin()+firstMovable, xmap.end(), byCount);
+}
+// Rewrites the token ids stored in the temporary track file, in place
+// through a memory mapping, according to the id maps that
+// conservative_sort builds from the occurrence counts found in the file.
+// For CoNLL tracks all four fields families (sform, lemma, both pos
+// fields, dtype) are remapped; for plain text only the word ids.
+void remap()
+{
+  if (!quiet) cerr << "Remapping ids ... ";
+  boost::iostreams::mapped_file track(tmpFile);
+  // skip the three header numbers; the token data starts right after them
+  filepos_type idxOffset;
+  id_type totalWords, idxSize;
+  char const* cursor = track.data();
+  cursor = tpt::numread(cursor,idxOffset);
+  cursor = tpt::numread(cursor,idxSize);
+  cursor = tpt::numread(cursor,totalWords);
+  if (!is_conll)
+    {
+      vector<size_t> freq(SF.totalVocabSize(), 0);
+      id_type* ids = reinterpret_cast<id_type*>(const_cast<char*>(cursor));
+      for (size_t i = 0; i < totalWords; ++i)
+        ++freq.at(ids[i]);
+      conservative_sort(SF,freq,smap);
+      vector<id_type> toNew;
+      invert(smap,toNew);
+      for (size_t i = 0; i < totalWords; ++i)
+        ids[i] = toNew[ids[i]];
+    }
+  else
+    {
+      vector<size_t> sfCnt(SF.totalVocabSize(), 0);
+      vector<size_t> lmCnt(LM.totalVocabSize(), 0);
+      vector<size_t> psCnt(PS.totalVocabSize(), 0);
+      vector<size_t> dtCnt(DT.totalVocabSize(), 0);
+      Conll_Record* recs
+        = reinterpret_cast<Conll_Record*>(const_cast<char*>(cursor));
+      for (size_t i = 0; i < totalWords; ++i)
+        {
+          Conll_Record const& r = recs[i];
+          ++sfCnt.at(r.sform);
+          ++lmCnt.at(r.lemma);
+          ++psCnt.at(r.majpos);
+          ++psCnt.at(r.minpos);
+          ++dtCnt.at(r.dtype);
+        }
+      conservative_sort(SF,sfCnt,smap);
+      conservative_sort(LM,lmCnt,lmap);
+      conservative_sort(PS,psCnt,pmap);
+      conservative_sort(DT,dtCnt,dmap);
+      vector<id_type> sNew; invert(smap,sNew);
+      vector<id_type> lNew; invert(lmap,lNew);
+      vector<id_type> pNew; invert(pmap,pNew);
+      vector<id_type> dNew; invert(dmap,dNew);
+      for (size_t i = 0; i < totalWords; ++i)
+        {
+          Conll_Record& r = recs[i];
+          r.sform  = sNew[r.sform];
+          r.lemma  = lNew[r.lemma];
+          r.majpos = pNew[r.majpos];
+          r.minpos = pNew[r.minpos];
+          r.dtype  = dNew[r.dtype];
+        }
+    }
+  track.close();
+  if (!quiet) cerr << "done." << endl;
+}
+// Writes every vocabulary whose total size exceeds its known size to the
+// matching ".tdx*" file below baseName, using the id map built for it.
+void save_vocabs()
+{
+  string const vbase = baseName;
+  if (!is_conll)
+    {
+      if (SF.totalVocabSize() > SF.knownVocabSize())
+        write_tokenindex(vbase+".tdx",SF,smap);
+      return;
+    }
+  if (SF.totalVocabSize() > SF.knownVocabSize())
+    write_tokenindex(vbase+".tdx.sfo",SF,smap);
+  if (LM.totalVocabSize() > LM.knownVocabSize())
+    write_tokenindex(vbase+".tdx.lem",LM,lmap);
+  if (PS.totalVocabSize() > PS.knownVocabSize())
+    write_tokenindex(vbase+".tdx.pos",PS,pmap);
+  if (DT.totalVocabSize() > DT.knownVocabSize())
+    write_tokenindex(vbase+".tdx.drl",DT,dmap);
+}
+// Loads the track <infile>, builds an in-memory token sequence array over
+// all of its sentences (the filter bitset has every bit set) and saves it
+// in memory-mapped format as <outfile>.  Progress goes to stderr unless
+// quiet is set.  Runs in the calling process; an earlier fork-based
+// variant is no longer active.
+template<typename Token>
+void
+build_mmTSA(string infile, string outfile)
+{
+  typedef mmTtrack<Token> track_t;
+  boost::shared_ptr<track_t> track(new track_t(infile));
+  bdBitset everySentence;
+  everySentence.resize(track->size(),true);
+  ostream* const progressLog = quiet ? NULL : &cerr;
+  imTSA<Token> index(track,&everySentence,progressLog);
+  index.save_as_mm_tsa(outfile);
+}
+// Builds the arrays requested for a plain-text track: ".sfa" with
+// left-to-right tokens if with_sfas is set, ".pfa" with right-to-left
+// tokens if with_pfas is set.  Always returns true.
+bool
+build_plaintext_tsas()
+{
+  if (with_sfas)
+    build_mmTSA<L2R_Token<SimpleWordId> >(tmpFile, baseName + ".sfa");
+  if (with_pfas)
+    build_mmTSA<R2L_Token<SimpleWordId> >(tmpFile, baseName + ".pfa");
+  return true;
+}
+// Builds the arrays requested for a CoNLL track.  For each enabled family
+// (suffix ".sfa-*", prefix ".pfa-*", dependency chain ".dca-*") three
+// arrays are written: over surface forms, lemmas and minor pos tags.
+// The arrays are built one after another in this process, so there are no
+// child processes to count or wait for.
+void build_conll_tsas()
+{
+  string bn  = baseName;
+  string mtt = tmpFile;
+  if (with_sfas)
+    {
+      build_mmTSA<L2R_Token<Conll_Sform> >(mtt,bn+".sfa-sform");
+      build_mmTSA<L2R_Token<Conll_Lemma> >(mtt,bn+".sfa-lemma");
+      build_mmTSA<L2R_Token<Conll_MinPos> >(mtt,bn+".sfa-minpos");
+    }
+  if (with_pfas)
+    {
+      build_mmTSA<R2L_Token<Conll_Sform> >(mtt,bn+".pfa-sform");
+      build_mmTSA<R2L_Token<Conll_Lemma> >(mtt,bn+".pfa-lemma");
+      build_mmTSA<R2L_Token<Conll_MinPos> >(mtt,bn+".pfa-minpos");
+    }
+  if (with_dcas)
+    {
+      build_mmTSA<ConllBottomUpToken<Conll_Sform> >(mtt,bn+".dca-sform");
+      build_mmTSA<ConllBottomUpToken<Conll_Lemma> >(mtt,bn+".dca-lemma");
+      build_mmTSA<ConllBottomUpToken<Conll_MinPos> >(mtt,bn+".dca-minpos");
+    }
+}
+// Entry point: numberizes stdin into the temporary track file, remaps ids
+// and saves the vocabularies if any vocabulary grew, builds the requested
+// arrays, and finally moves the temporary track file to its real name.
+// Returns 0 on success and 1 if that final rename fails.
+int main(int argc, char* argv[])
+{
+  init(argc,argv);
+  numberize();
+  bool const vocabGrew = (SF.totalVocabSize() > SF.knownVocabSize() ||
+                          LM.totalVocabSize() > LM.knownVocabSize() ||
+                          PS.totalVocabSize() > PS.knownVocabSize() ||
+                          DT.totalVocabSize() > DT.knownVocabSize());
+  if (vocabGrew)
+    {
+      remap();
+      save_vocabs();
+    }
+  if (is_conll) build_conll_tsas();
+  else          build_plaintext_tsas();
+  if (!quiet) cerr << endl;
+  // a failed rename would otherwise leave only the "_"-suffixed temp file
+  // behind without any hint to the user
+  if (rename(tmpFile.c_str(),mttFile.c_str()) != 0)
+    {
+      perror(("cannot rename " + tmpFile + " to " + mttFile).c_str());
+      return 1;
+    }
+  return 0;
+}
+// Parses the command line into the global settings.  The single
+// positional argument is the output base name.  Prints the usage text and
+// exits if --help is given or no output name was supplied.  Finally
+// derives mttFile (".mtt" for CoNLL, ".mct" for plain text) and tmpFile
+// (mttFile followed by "_") from the base name.
+void
+interpret_args(int ac, char* av[])
+{
+  po::options_description o("Options");
+  o.add_options()
+    ("help,h",  "print this message")
+    ("quiet,q", po::bool_switch(&quiet),
+     "don't print progress information")
+    ("incremental,i", po::bool_switch(&incremental),
+     "incremental mode; rewrites vocab files!")
+    ("vocab-base,v", po::value<string>(&vocabBase),
+     "base name of various vocabularies")
+    ("output,o", po::value<string>(&baseName),
+     "base file name of the resulting file(s)")
+    ("sfa,s", po::value<int>(&with_sfas)->default_value(1),
+     "also build suffix arrays")
+    ("pfa,p", po::value<int>(&with_pfas)
+     ->default_value(0)->implicit_value(1),
+     "also build prefix arrays")
+    ("dca,d", po::value<int>(&with_dcas)
+     ->default_value(0)->implicit_value(1),
+     "also build dependency chain arrays")
+    ("conll,c", po::bool_switch(&is_conll),
+     "corpus is in CoNLL format (default: plain text)")
+    ("unk,u", po::value<string>(&UNK)->default_value("UNK"),
+     "label for unknown tokens")
+    // ("map,m", po::value<string>(&vmap),
+    // "map words to word classes for indexing")
+    ;
+  // there are no hidden options of its own; h just wraps the visible ones
+  po::options_description h("Hidden Options");
+  h.add(o);
+  po::positional_options_description a;
+  a.add("output",1);
+  po::variables_map vm;
+  po::parsed_options const parsed =
+    po::command_line_parser(ac,av).options(h).positional(a).run();
+  po::store(parsed,vm);
+  po::notify(vm);
+  bool const wantHelp  = vm.count("help")   != 0;
+  bool const hasOutput = vm.count("output") != 0;
+  if (wantHelp || !hasOutput)
+    {
+      cout << "\nusage:\n\t cat <corpus> | " << av[0]
+           << " [options] <output .mtt file>" << endl;
+      cout << o << endl;
+      exit(0);
+    }
+  mttFile  = baseName;
+  mttFile += (is_conll ? ".mtt" : ".mct");
+  tmpFile  = mttFile + "_";
+}

mosesdecoder/moses/TranslationModel/UG/mm/mtt-dump.cc ADDED Viewed

	@@ -0,0 +1,166 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+// (c) 2008-2010 Ulrich Germann
+#include <boost/program_options.hpp>
+#include <iomanip>
+#include "tpt_typedefs.h"
+#include "ug_mm_ttrack.h"
+#include "tpt_tokenindex.h"
+#include "ug_deptree.h"
+#include "ug_corpus_token.h"
+using namespace std;
+using namespace sapt;
+namespace po = boost::program_options;
+string bname,mtt,mct; // base name; mtt/mct are bname + ".mtt" / ".mct"
+vector<string> range; // positional range arguments, "N" or "N-M"
+typedef L2R_Token<Conll_Sform> Token;
+TokenIndex SF,LM,PS,DT; // vocabularies, opened in main()
+mmTtrack<Token> MTT; // track opened from the .mtt file
+mmTtrack<SimpleWordId> MCT; // track opened from the .mct file
+bool sform; // -s: surface forms only; forced to true in main() for .mct tracks
+bool have_mtt, have_mct; // which track file was found; set in main()
+bool with_sids; // -n: print sentence ids as first token
+bool with_positions; // -p: show word positions
+// Parses the command line: the first positional argument is the base
+// name, all further ones are ranges.  Prints the usage text and exits if
+// --help is given or the base name is missing, and derives the two
+// candidate track file names from the base name.
+void
+interpret_args(int ac, char* av[])
+{
+  po::options_description o("Options");
+  o.add_options()
+    ("help,h",    "print this message")
+    ("numbers,n", po::bool_switch(&with_sids), "print sentence ids as first token")
+    ("sform,s", po::bool_switch(&sform), "sform only")
+    ("with-positions,p", po::bool_switch(&with_positions), "show word positions")
+    ;
+  po::options_description h("Hidden Options");
+  h.add_options()
+    ("bname", po::value<string>(&bname), "base name")
+    ("range", po::value<vector<string> >(&range), "range")
+    ;
+  h.add(o);
+  po::positional_options_description a;
+  a.add("bname",1);
+  a.add("range",-1);
+  po::variables_map vm;
+  po::store(po::command_line_parser(ac,av).options(h).positional(a).run(),
+            vm);
+  po::notify(vm); // IMPORTANT
+  bool const wantHelp = vm.count("help") != 0;
+  if (wantHelp || bname.empty())
+    {
+      cout << "usage:\n\t"
+           << av[0] << " track name [<range>]\n"
+           << endl;
+      cout << o << endl;
+      exit(0);
+    }
+  mtt = bname;
+  mtt += ".mtt";
+  mct = bname;
+  mct += ".mct";
+}
+// Prints sentences [start,stop) of the .mtt track to stdout.
+// With sform set, each sentence is one line of surface forms (optionally
+// prefixed by "<position>:"); otherwise every token gets its own line
+// with number, surface form, both pos tags, lemma, parent number and
+// dependency type.  Each sentence is followed by an end-of-line.
+void
+printRangeMTT(size_t start, size_t stop)
+{
+  for (size_t sid = start; sid < stop; ++sid)
+    {
+      Token const* const first = MTT.sntStart(sid);
+      Token const* const last  = MTT.sntEnd(sid);
+      if (with_sids) cout << sid << " ";
+      size_t num = 0; // 1-based token number within the sentence
+      for (Token const* tok = first; tok < last; ++tok)
+        {
+          if (sform)
+            {
+              if (with_positions) cout << tok-first << ":";
+              cout << SF[tok->id()] << " ";
+              continue;
+            }
+          ++num;
+          cout << setw(2)  << right << num << " ";
+          cout << setw(30) << right << SF[tok->sform] << " ";
+          cout << setw(4)  << right << PS[tok->majpos]   << " ";
+          cout << setw(4)  << right << PS[tok->minpos]   << " ";
+          cout << setw(30) << left  << LM[tok->lemma] << " ";
+          cout << num+tok->parent << " "; // parent is stored as an offset
+          cout << DT[tok->dtype] << endl;
+        }
+      cout << endl;
+    }
+}
+// Prints sentences [start,stop) of the .mct track to stdout, one sentence
+// per line, as space-terminated surface forms; with_sids prefixes the
+// sentence id, with_positions prefixes each word with "<position>:".
+void
+printRangeMCT(size_t start, size_t stop)
+{
+  for (size_t sid = start; sid < stop; ++sid)
+    {
+      SimpleWordId const* const first = MCT.sntStart(sid);
+      SimpleWordId const* const last  = MCT.sntEnd(sid);
+      if (with_sids) cout << sid << " ";
+      for (SimpleWordId const* tok = first; tok < last; ++tok)
+        {
+          if (with_positions) cout << tok-first << ":";
+          cout << SF[tok->id()] << " ";
+        }
+      cout << endl;
+    }
+}
+// Entry point: dumps a track as text.  Uses <bname>.mtt if it exists,
+// otherwise <bname>.mct; exits with status 1 if neither exists.
+// Without range arguments the whole track is printed; otherwise each
+// argument "N" or "N-M" (inclusive sentence ids) is printed in turn.
+// Malformed range arguments are reported on stderr and skipped; ranges
+// whose end lies beyond the track are skipped silently.
+int
+main(int argc, char*argv[])
+{
+  interpret_args(argc,argv);
+  have_mtt = !access(mtt.c_str(),F_OK);
+  have_mct = !have_mtt && !access(mct.c_str(),F_OK);
+  if (!have_mtt && !have_mct)
+    {
+      cerr << "FATAL ERROR: neither " << mtt << " nor " << mct << " exist." << endl;
+      exit(1);
+    }
+  if (have_mtt)
+    {
+      SF.open(bname+".tdx.sfo"); SF.iniReverseIndex();
+      LM.open(bname+".tdx.lem"); LM.iniReverseIndex();
+      PS.open(bname+".tdx.pos"); PS.iniReverseIndex();
+      DT.open(bname+".tdx.drl"); DT.iniReverseIndex();
+      MTT.open(mtt);
+    }
+  else
+    {
+      sform = true;
+      SF.open(bname+".tdx"); SF.iniReverseIndex();
+      MCT.open(mct);
+    }
+  // only the track that was actually opened is ever queried
+  size_t const numSents = have_mtt ? MTT.size() : MCT.size();
+  if (range.empty())
+    {
+      if (have_mtt) printRangeMTT(0, numSents);
+      else          printRangeMCT(0, numSents);
+      return 0;
+    }
+  for (size_t i = 0; i < range.size(); i++)
+    {
+      istringstream buf(range[i]);
+      size_t first = 0, last = 0;
+      bool ok = bool(buf >> first);
+      if (ok && buf.peek() == '-')
+        {
+          buf.get(); // consume the '-'
+          ok = bool(buf >> last);
+        }
+      else last = first;
+      if (!ok)
+        {
+          cerr << "WARNING: ignoring malformed range '" << range[i]
+               << "'" << endl;
+          continue;
+        }
+      if (last >= numSents) continue;
+      if (have_mtt) printRangeMTT(first,last+1);
+      else          printRangeMCT(first,last+1);
+    }
+  return 0;
+}

mosesdecoder/moses/TranslationModel/UG/mm/mtt.count.cc ADDED Viewed

	@@ -0,0 +1,77 @@

+// For each phrase read from stdin (one per line), print its raw occurrence
+// count in the corpus, looked up via the suffix array.
+#include "ug_mm_ttrack.h"
+#include "ug_mm_tsa.h"
+#include "tpt_tokenindex.h"
+#include "ug_corpus_token.h"
+#include <string>
+#include <vector>
+#include <cassert>
+#include <boost/unordered_map.hpp>
+#include <boost/foreach.hpp>
+#include <iomanip>
+#include "ug_typedefs.h"
+#include "tpt_pickler.h"
+#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
+#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
+#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
+#include <algorithm>
+#include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
+using namespace std;
+using namespace ugdiss;
+using namespace Moses;
+typedef sapt::L2R_Token<sapt::SimpleWordId> Token;
+typedef sapt::mmTSA<Token>::tree_iterator iter; // not referenced in the visible part of this file
+typedef boost::unordered_map<pair<size_t,size_t>,size_t> phrase_counter_t; // not referenced in the visible part of this file
+#define CACHING_THRESHOLD 1000 // not referenced in the visible part of this file
+sapt::mmTtrack<Token> T; // token tracks
+sapt::TokenIndex      V; // vocabs
+sapt::mmTSA<Token>    I; // suffix arrays
+void interpret_args(int ac, char* av[]); // defined below main()
+string bname; // base name of the .mct / .tdx / .sfa files opened in main()
+bool   echo; // -e: repeat each lookup phrase before its count
+// Entry point: opens track, vocabulary and suffix array for <bname>, then
+// reads phrases from stdin (one per line) and prints for each the raw
+// count of the phrase in the suffix array, or 0 if the phrase cannot be
+// matched in full.  With echo set the phrase is printed before the count.
+int main(int argc, char* argv[])
+{
+  interpret_args(argc,argv);
+  T.open(bname+".mct");
+  V.open(bname+".tdx");
+  V.iniReverseIndex();
+  I.open(bname+".sfa",&T);
+  for (string line; getline(cin,line); )
+    {
+      vector<id_type> ids;
+      V.fillIdSeq(line,ids);
+      TSA<Token>::tree_iterator node(&I);
+      // extend the match token by token until it fails or the phrase ends
+      size_t matched = 0;
+      while (matched < ids.size() && node.extend(ids[matched]))
+        ++matched;
+      if (echo) cout << line << ": ";
+      if (matched == ids.size()) cout << node.rawCnt() << endl;
+      else                       cout << 0 << endl;
+    }
+  exit(0);
+}
+// Reads the program options through get_options: --echo/-e and the
+// positional base name.  Prints the usage text and exits if --help is
+// given (status 0) or if no base name was supplied (status 1); before,
+// --help was declared but never acted upon and a missing base name went
+// unnoticed.
+void
+interpret_args(int ac, char* av[])
+{
+  namespace po=boost::program_options;
+  po::variables_map vm;
+  po::options_description o("Options");
+  po::options_description h("Hidden Options");
+  po::positional_options_description a;
+  o.add_options()
+    ("help,h",    "print this message")
+    ("echo,e", po::bool_switch(&echo), "repeat lookup phrases")
+    ;
+  h.add_options()
+    ("bname", po::value<string>(&bname), "base name")
+    ;
+  a.add("bname",1);
+  get_options(ac,av,h.add(o),a,vm);
+  bool const wantHelp = vm.count("help") != 0;
+  if (wantHelp || bname.empty())
+    {
+      cout << "usage:\n\t" << av[0]
+           << " [options] <basename> < <phrases, one per line>\n" << endl;
+      cout << o << endl;
+      exit(wantHelp ? 0 : 1);
+    }
+}

mosesdecoder/moses/TranslationModel/UG/mm/num_read_write.cc ADDED Viewed

	@@ -0,0 +1,74 @@

+#include "num_read_write.h"
+namespace tpt {
+  typedef unsigned char uchar;
+  // The numwrite overloads write x to <out> as 2, 4 or 8 bytes, least
+  // significant byte first.  The numread overloads read such a number
+  // back from <src> into x and return the address just past the bytes
+  // consumed.
+  void
+  numwrite(std::ostream& out, uint16_t const& x)
+  {
+    char bytes[2];
+    for (int i = 0; i < 2; ++i)
+      bytes[i] = (x >> (8*i)) % 256;
+    out.write(bytes,2);
+  }
+  void
+  numwrite(std::ostream& out, uint32_t const& x)
+  {
+    char bytes[4];
+    for (int i = 0; i < 4; ++i)
+      bytes[i] = (x >> (8*i)) % 256;
+    out.write(bytes,4);
+  }
+  void
+  numwrite(std::ostream& out, uint64_t const& x)
+  {
+    char bytes[8];
+    for (int i = 0; i < 8; ++i)
+      bytes[i] = (x >> (8*i)) % 256;
+    out.write(bytes,8);
+  }
+  char const*
+  numread(char const* src, uint16_t & x)
+  {
+    uchar const* d = reinterpret_cast<uchar const*>(src);
+    // assemble in a local and assign once, after all bytes have been read
+    uint16_t value = 0;
+    for (int i = 1; i >= 0; --i)
+      value = uint16_t((value << 8) | d[i]);
+    x = value;
+    return src+2;
+  }
+  char const*
+  numread(char const* src, uint32_t & x)
+  {
+    uchar const* d = reinterpret_cast<uchar const*>(src);
+    uint32_t value = 0;
+    for (int i = 3; i >= 0; --i)
+      value = (value << 8) | uint32_t(d[i]);
+    x = value;
+    return src+4;
+  }
+  char const*
+  numread(char const* src, uint64_t & x)
+  {
+    uchar const* d = reinterpret_cast<uchar const*>(src);
+    uint64_t value = 0;
+    for (int i = 7; i >= 0; --i)
+      value = (value << 8) | uint64_t(d[i]);
+    x = value;
+    return src+8;
+  }
+}

mosesdecoder/moses/TranslationModel/UG/mm/test-http-client.cc ADDED Viewed

	@@ -0,0 +1,27 @@

+// -*- c++ -*-
+#include "ug_http_client.h"
+// Fetches the URL given as the only argument with Moses::http_client and
+// prints the content to stdout.
+// Returns 0 on success and 1 on wrong usage or if an exception was
+// thrown; the exception text is reported on stderr.
+int main(int argc, char* argv[])
+{
+  if (argc != 2)
+    {
+      std::cout << "Usage: async_client <url>\n";
+      std::cout << "Example:\n";
+      std::cout << "  async_client www.boost.org/LICENSE_1_0.txt\n";
+      return 1;
+    }
+  try
+    {
+      boost::asio::io_service io_service;
+      Moses::http_client c(io_service, argv[1]);
+      io_service.run();
+      std::cout << c.content() << std::endl;
+    }
+  catch (std::exception const& e)
+    {
+      // a failed request must not look like success to the caller
+      std::cerr << "Exception: " << e.what() << "\n";
+      return 1;
+    }
+  return 0;
+}

mosesdecoder/moses/TranslationModel/UG/mm/test-xml-escaping.cc ADDED Viewed

	@@ -0,0 +1,13 @@

+#include <iostream>
+#include <string>
+#include <iomanip>
+#include "ug_http_client.h"
+using namespace std;
+// Copies stdin to stdout line by line, passing every line through
+// Moses::uri_encode.
+int main()
+{
+  for (string line; getline(cin,line); )
+    {
+      cout << Moses::uri_encode(line) << endl;
+    }
+}

mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.cc ADDED Viewed

	@@ -0,0 +1,594 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+// (c) 2007,2008 Ulrich Germann
+/* Functions for writing indices tightly (use only the bytes you need).
+ * The first bit indicates whether a byte belongs to a key or a value.
+ * The remaining 7 bits are part of the respective integer value.
+ * (c) 2007 Ulrich Germann
+ */
+//
+// ugTightIndex.cc
+//
+// Made by Ulrich Germann
+// Login   <germann@germann-laptop>
+//
+// Started on  Tue Jul 17 15:09:33 2007 Ulrich Germann
+// Started on  Tue Jul 17 15:09:33 2007 Ulrich Germann
+//
+#include <iostream>
+#include <cassert>
+#include "tpt_tightindex.h"
+namespace tpt
+{
+  // #define LOG_WRITE_ACTIVITY
+  // Writes data into a tight index, 7 payload bits per byte, lowest
+  // group first.  flag tells whether the number is a key or a value:
+  // if it is true every byte written has its top bit set, if it is
+  // false the top bit of every byte is clear.
+  void tightwrite(std::ostream& out, uint64_t data, bool flag)
+  {
+#ifdef LOG_WRITE_ACTIVITY
+    size_t bytes_written=1;
+    std::cerr << "starting at file position " << out.tellp()
+              << ": tightwrite " << data;
+    std::cerr << (flag ? " with flag 1 " : " with flag 0 ");
+#endif
+    // OR-ed into every byte; it is the only difference between both modes
+    char const marker = flag ? char(-128) : char(0);
+    for (; data >= 128; data >>= 7)
+      {
+        char c = char(data%128)|marker;
+        out.put(c);
+#ifdef LOG_WRITE_ACTIVITY
+        bytes_written++;
+#endif
+      }
+    char last = char(data%128)|marker;
+    out.put(last);
+#ifdef LOG_WRITE_ACTIVITY
+    std::cerr << " in " << bytes_written << " bytes" << std::endl;
+#endif
+  }
+// For the code below: does it make a difference if I hard-code the
+// unraveled loop or does code optimization by the compiler take care
+// of that?
+#define DEBUG_TIGHTREAD 0
+  // Reads one number from a tight index; filepos_type must be at least as
+  // large as count_type.
+  // The first byte fixes the kind of number via its top bit; following
+  // bytes are consumed for as long as they carry the same top bit and the
+  // read position is below the stop position.  Each byte contributes 7
+  // bits, lowest group first.
+  //   stop: position at which reading must end; 0 means "as far as the
+  //         stream buffer currently reaches".  A non-zero stop is also
+  //         capped at that point.
+  // Returns the decoded number.
+  filepos_type
+  tightread(std::istream& in, std::ios::pos_type stop)
+  {
+    // debug=true;
+    assert(in.rdbuf()->in_avail() > 0);
+    filepos_type data     = 0;
+    short int bitshift = 7;
+    // only used for the diagnostic below; streamoff avoids cutting the
+    // position down to an int on large files
+    std::streamoff pos = in.tellg();
+#if DEBUG_TIGHTREAD
+    if (debug)
+      cerr << bitpattern(uint(in.peek())) << " " << in.peek()
+	   << " pos=" << in.tellg() << "\n";
+#endif
+    int buf = in.get();
+    if (stop == std::ios::pos_type(0))
+      stop = size_t(in.tellg())+in.rdbuf()->in_avail();
+    else
+      stop = std::min(size_t(stop),size_t(in.tellg())+in.rdbuf()->in_avail());
+    if (buf < 0)
+      std::cerr << "number read: " << buf << " " << pos << " "
+		<< in.tellg() << std::endl;
+    assert (buf>=0);
+    if (buf >= 128) // continuation bit is 1
+      {
+	data = buf-128; // unset the bit
+	while (in.tellg() < stop && in.peek() >= 128)
+	  {
+#if DEBUG_TIGHTREAD
+	    if (debug)
+	      cerr << bitpattern(uint(in.peek())) << " " << in.peek();
+#endif
+	    // cerr << bitpattern(size_t(in.peek())) << std::endl;
+	    // widen to the result type before shifting, so that the shift
+	    // stays valid where size_t is narrower than filepos_type
+	    data += filepos_type(in.get()-128)<<bitshift;
+	    bitshift += 7;
+#if DEBUG_TIGHTREAD
+	    if (debug)
+	      cerr << " " << data << " pos=" << in.tellg() << std::endl;
+#endif
+	  }
+      }
+    else
+      {
+	data = buf;
+	while (in.tellg() < stop && in.peek() < 128)
+	  {
+	    // cerr << bitpattern(size_t(in.peek())) << std::endl;
+#if DEBUG_TIGHTREAD
+	    if (debug)
+	      cerr << bitpattern(uint(in.peek())) << " " << in.peek();
+#endif
+	    data += filepos_type(in.get())<<bitshift;
+	    bitshift += 7;
+#if DEBUG_TIGHTREAD
+	    if (debug)
+	      cerr << " " << data << " pos=" << in.tellg() << "\n";
+#endif
+	  }
+      }
+    return data;
+  }
+#define DEBUG_TIGHTFIND 0
+#if DEBUG_TIGHTFIND
+bool debug=true;
+#endif
+  // Seek /in/ to (start+stop)/2 and then advance to the beginning of the
+  // next [index key/file offset] pair, since the jump may land in the
+  // middle of a number. Bytes belonging to an index key have the leftmost
+  // bit set to 0, bytes belonging to a file offset have it set to 1.
+  // Returns false if end-of-file is hit while skipping key bytes.
+  bool
+  tightfind_midpoint(std::istream& in, filepos_type start, filepos_type stop)
+  {
+    in.seekg((start+stop)/2);
+    // Consume the remainder of the key we may have landed in. The byte that
+    // terminates this loop (the first one >= 128) is consumed as well.
+    for (;;)
+      {
+        if (static_cast<filepos_type>(in.tellg()) >= stop) break;
+        if (in.get() >= 128) break;
+#if DEBUG_TIGHTFIND
+        if (debug)
+          {
+            in.unget();
+            char c = in.get();
+            std::cerr << in.tellg() << " skipped key byte " << c << std::endl;
+          }
+#endif
+        if (in.eof()) return false;
+      }
+    // Then skip whatever is left of the associated file offset.
+    while (static_cast<filepos_type>(in.tellg()) < stop && in.peek() >= 128)
+      {
+#if DEBUG_TIGHTFIND
+        int r = in.get();
+        if (debug)
+          std::cerr << in.tellg() << " skipped value byte " << r
+               << " next is " << in.peek()
+               << std::endl;
+#else
+        in.get();
+#endif
+      }
+    return true;
+  }
+  // In-memory counterpart of the stream version: starting from the middle
+  // of [start,stop), walk backwards over any flagged bytes (top bit 1) and
+  // then over the unflagged bytes (top bit 0) in front of them, never going
+  // below /start/. Returns the first unflagged byte of the run reached.
+  // Requires start < stop.
+  // The top bit is tested on the unsigned byte value; the former test
+  // `*mp < 0` only works where plain char is signed.
+  char const*
+  tightfind_midpoint(char const* const start,
+                     char const* const stop)
+  {
+    char const* mp = start + (stop - start)/2;
+    while (mp > start && (static_cast<unsigned char>(*mp) & 0x80u) != 0) --mp;
+    while (mp > start && (static_cast<unsigned char>(*mp) & 0x80u) == 0) --mp;
+    return ((static_cast<unsigned char>(*mp) & 0x80u) != 0) ? mp + 1 : mp;
+  }
+  // Scan the entries in file position range [start,stop) of /in/ front to
+  // back until one is found whose key (the stored number shifted right by
+  // FLAGBITS) is not smaller than /key/. On an exact match the low bits of
+  // the stored number, masked with FLAGMASK, are returned in /flags/.
+  // Returns true iff /key/ was found.
+  bool
+  linear_search(std::istream& in, filepos_type start, filepos_type stop,
+                id_type key, unsigned char& flags)
+  {
+    in.seekg(start);
+#if DEBUG_TIGHTFIND
+    if (debug) std::cerr << in.tellg() << " ";
+#endif
+    // ATTENTION! The bitshift operations below are important: some of the
+    // bits in the stored key value hold additional information about what
+    // and where node information is stored.
+    id_type entry = tightread(in,stop);
+    while ((entry>>FLAGBITS) < key)
+      {
+        // skip the value associated with the key just read
+        while (static_cast<filepos_type>(in.tellg()) < stop
+               && in.peek() >= 128) in.get();
+#if DEBUG_TIGHTFIND
+        if (debug)
+          std::cerr << (entry>>FLAGBITS) << " [" << key << "] "
+               << in.tellg() << std::endl;
+#endif
+        if (in.tellg() == std::ios::pos_type(stop))
+          return false; // ran off the end of the range: not found
+        entry = tightread(in,stop);
+      }
+#if DEBUG_TIGHTFIND
+    if (debug && (entry>>FLAGBITS)==key)
+      std::cerr << "found entry for " << key << std::endl;
+    std::cerr << "current file position is " << in.tellg()
+              << " (value read: " << key << std::endl;
+#endif
+    assert(static_cast<filepos_type>(in.tellg()) < stop);
+    if ((entry>>FLAGBITS) != key)
+      return false;
+    flags = (entry%256);
+    flags &= FLAGMASK;
+    return true;
+  }
+  // Look up /key/ among the entries stored in file position range
+  // [start,stop) of /in/ by recursive bisection, falling back to
+  // linear_search() once the range is short. On success the low bits of
+  // the stored number, masked with FLAGMASK, are returned in /flags/.
+  // Returns true iff /key/ was found.
+  // NOTE(review): bisection presumes the entries are sorted by key in
+  // ascending order -- confirm against the index writer.
+  bool
+  tightfind(std::istream& in, filepos_type start, filepos_type stop,
+            id_type key, unsigned char& flags)
+  {
+#if DEBUG_TIGHTFIND
+    if (debug)
+      std::cerr << "looking for " << key
+           << " in range [" << start << ":" << stop << "]" << std::endl;
+#endif
+    if (start==stop) return false;
+    assert(stop>start);
+    if ((start+1)==stop) return false; // list is empty
+    // granularity: point where we should switch to linear search,
+    // because otherwise we might skip over the entry we are looking for
+    // because we land right in the middle of it.
+    unsigned int const granularity = sizeof(filepos_type)*5;
+    bool const shortRange = (stop <= start + granularity);
+    if (!shortRange && !tightfind_midpoint(in,start,stop))
+      return false; // something went wrong (empty index)
+    // Also go linear if the midpoint search ran all the way to /stop/.
+    if (shortRange || in.tellg() == std::ios::pos_type(stop))
+      return linear_search(in,start,stop,key,flags);
+    // bisection step: inspect the entry the midpoint search stopped at
+    filepos_type const probePos = in.tellg();
+    id_type const entry    = tightread(in,stop);
+    id_type const entryKey = entry>>FLAGBITS;
+    if (entryKey == key)
+      {
+        flags  = entry%256;
+        flags &= FLAGMASK;
+#if DEBUG_TIGHTFIND
+        if (debug) std::cerr << "found entry for " << key << std::endl;
+#endif
+        return true; // done, found
+      }
+    if (entryKey > key)
+      { // continue in the lower half
+#if DEBUG_TIGHTFIND
+        if (debug) std::cerr << entry << " > " << key << std::endl;
+#endif
+        return tightfind(in,start,probePos,key,flags);
+      }
+    // continue in the upper half, after skipping the value of this entry
+    while (static_cast<filepos_type>(in.tellg()) < stop
+           && in.rdbuf()->in_avail() > 0 // is that still necessary???
+           && in.peek() >= 128)
+      in.get();
+    if (in.rdbuf()->in_avail() == 0 || in.tellg() == std::ios::pos_type(stop))
+      return false;
+#if DEBUG_TIGHTFIND
+    if (debug) std::cerr << entry << " < " << key << std::endl;
+#endif
+    return tightfind(in,in.tellg(),stop,key,flags);
+  }
+  // In-memory lookup of /key/ among the entries stored in [start,stop) by
+  // recursive bisection. On success the low bits of the stored number,
+  // masked with FLAGMASK, are returned in /flags/.
+  // @return pointer to the first byte after the matching key, or NULL if
+  //         /key/ is not found
+  // NOTE(review): bisection presumes the entries are sorted by key in
+  // ascending order -- confirm against the index writer.
+  char const*
+  tightfind(char const* const start,
+            char const* const stop,
+            id_type key,
+            unsigned char& flags)
+  {
+    if (start==stop) return NULL;
+    assert(stop>start);
+    if ((start+1)==stop) return NULL; // list is empty
+    char const* p = tightfind_midpoint(start,stop);
+    // if ids can be larger than 67,108,864 on 32-bit machines
+    // (i.e., 2**(28-flagbits)), dest must be declared as uint64_t
+    size_t foo;
+    char const* after = tightread(p,stop,foo);
+    id_type tmpId = foo>>FLAGBITS;
+    if (tmpId == key)
+      {
+        flags  = foo%256;
+        flags &= FLAGMASK;
+        return after;
+      }
+    if (tmpId > key) // look in the lower half
+      return tightfind(start,p,key,flags);
+    // Look in the upper half: first skip the value that belongs to the key
+    // just read. The bound is checked BEFORE dereferencing (the old loop
+    // read *after even when after == stop), and the top bit is tested on
+    // the unsigned byte so that it also works where plain char is unsigned.
+    while (after < stop && (static_cast<unsigned char>(*after) & 0x80u) != 0)
+      ++after;
+    if (after == stop) return NULL;
+    return tightfind(after,stop,key,flags);
+  }
+  // In-memory lookup of /key/ among the entries stored in [start,stop) by
+  // recursive bisection; the stored numbers are compared to /key/ as they
+  // are, without splitting off any flag bits.
+  // @return pointer to the first byte after the matching key, or NULL if
+  //         /key/ is not found
+  // NOTE(review): bisection presumes the entries are sorted by key in
+  // ascending order -- confirm against the index writer.
+  char const*
+  tightfind_noflags(char const* const start,
+                    char const* const stop,
+                    id_type key)
+  {
+    if (start==stop) return NULL;
+    assert(stop>start);
+    if ((start+1)==stop) return NULL; // list is empty
+    char const* p = tightfind_midpoint(start,stop);
+    // if ids can be larger than 67,108,864 on 32-bit machines
+    // (i.e., 2**(28-flagbits)), dest must be declared as uint64_t
+    size_t foo;
+    char const* after = tightread(p,stop,foo);
+    if (foo == key)
+      return after;
+    if (foo > key) // look in the lower half
+      return tightfind_noflags(start,p,key);
+    // Look in the upper half: first skip the value that belongs to the key
+    // just read. The bound is checked BEFORE dereferencing (the old loop
+    // read *after even when after == stop), and the top bit is tested on
+    // the unsigned byte so that it also works where plain char is unsigned.
+    while (after < stop && (static_cast<unsigned char>(*after) & 0x80u) != 0)
+      ++after;
+    if (after == stop) return NULL;
+    return tightfind_noflags(after,stop,key);
+  }
+  // Scan the entries in file position range [start,stop) of /in/ front to
+  // back until a stored key is reached that is not smaller than /key/.
+  // Returns true iff that key equals /key/.
+  bool
+  linear_search_noflags(std::istream& in, filepos_type start,
+                filepos_type stop, id_type key)
+  {
+    std::ios::pos_type const limit = stop;
+    in.seekg(start);
+    id_type current = tightread(in,stop);
+    while (current < key)
+      {
+        // skip the value associated with the key just read
+        while (in.tellg() < limit && in.peek() >= 128)
+          in.get();
+        if (in.tellg() == limit)
+          return false; // ran off the end of the range: not found
+        current = tightread(in,stop);
+      }
+    assert(in.tellg() < limit);
+    return (current==key);
+  }
+  // Look up /key/ among the entries stored in file position range
+  // [start,stop) of /in/ by recursive bisection, falling back to
+  // linear_search_noflags() once the range is short. Stored numbers are
+  // compared to /key/ as they are. Returns true iff /key/ was found.
+  // NOTE(review): bisection presumes the entries are sorted by key in
+  // ascending order -- confirm against the index writer.
+  bool
+  tightfind_noflags(std::istream& in, filepos_type start,
+                    filepos_type stop, id_type key)
+  {
+    if (start==stop) return false;
+    assert(stop>start);
+    if ((start+1)==stop) return false; // list is empty
+    // granularity: point where we should switch to linear search,
+    // because otherwise we might skip over the entry we are looking for
+    // because we land right in the middle of it.
+    // UG: why 5? we should be able to get away with less!
+    unsigned int const granularity = sizeof(filepos_type)*5;
+    bool const shortRange = (stop <= start + granularity);
+    if (!shortRange && !tightfind_midpoint(in,start,stop))
+      return false; // something went wrong (empty index)
+    // Short range, or the midpoint search ran all the way to /stop/:
+    // a linear search is affordable and cannot skip the entry.
+    if (shortRange || in.tellg() == std::ios::pos_type(stop))
+      return linear_search_noflags(in,start,stop,key);
+    // bisection step: inspect the entry the midpoint search stopped at
+    filepos_type const probePos = in.tellg();
+    id_type const probe = tightread(in,stop);
+    if (probe == key)
+      return true; // done, found
+    if (probe > key) // search first half
+      return tightfind_noflags(in,start,probePos,key);
+    // search second half, after skipping the value of this entry
+    std::ios::pos_type const limit = stop;
+    while (in.tellg() < limit
+           && in.rdbuf()->in_avail() > 0 // is that still necessary???
+           && in.peek() >= 128)
+      in.get();
+    if (in.rdbuf()->in_avail() == 0 || in.tellg() == limit)
+      return false;
+    return tightfind_noflags(in,in.tellg(),stop,key);
+  }
+  // Same as tightwrite, but uses basic storage units of size 2: /data/ is
+  // written as a sequence of 16-bit units carrying 15 payload bits each,
+  // least significant group first. The top bit of every unit is set iff
+  // /flag/ is true. Units are written in the byte order of the host.
+  // The unit is built in a uint16_t; the former code set the top bit by
+  // adding 32768 to a `short`, which relies on implementation-defined
+  // narrowing of an out-of-range value.
+  void tightwrite2(std::ostream& out, size_t data, bool flag)
+  {
+    uint16_t const marker = flag ? uint16_t(0x8000u) : uint16_t(0);
+    for (;;)
+      {
+        uint16_t const unit = uint16_t(uint16_t(data % 32768) | marker);
+        out.write(reinterpret_cast<char const*>(&unit),2);
+        if (data < 32768) break; // that was the most significant group
+        data >>= 15;
+      }
+  }
+  namespace
+  {
+    // True iff the top bit of /c/ is set. Tested on the unsigned value so
+    // that the result does not depend on whether plain char is signed
+    // (the former `*start < 0` tests never fire where char is unsigned).
+    inline bool
+    tight_flagged(char const c)
+    {
+      return (static_cast<unsigned char>(c) & 0x80u) != 0;
+    }
+    // Shared decoder behind tightread2/4/8. Reads the number starting at
+    // /start/ (which must be a readable byte): each byte contributes its
+    // low 7 bits, least significant group first. Decoding ends at /stop/,
+    // at the first byte whose top bit differs from that of the first byte,
+    // or after /maxBytes/ bytes, whichever comes first.
+    // @return first memory position after the number
+    template<typename numType, int maxBytes>
+    char const*
+    tightread_bounded(char const* start, char const* stop, numType& dest)
+    {
+      bool const flagged = tight_flagged(*start);
+      dest = 0;
+      for (int i = 0; ; ++i)
+        {
+          dest += numType(static_cast<unsigned char>(*start) & 0x7fu) << (7*i);
+          if (i+1 == maxBytes) break; // number uses the maximum length
+          if (++start == stop || tight_flagged(*start) != flagged)
+            return start;
+        }
+      assert(start<stop);
+      return ++start;
+    }
+  }
+  // Read a number of at most 10 bytes (shifts 0..63) into a uint64_t.
+  char const*
+  tightread8(char const* start,
+             char const* stop,
+             uint64_t& dest)
+  {
+    return tightread_bounded<uint64_t,10>(start,stop,dest);
+  }
+  // Read a number of at most 5 bytes (shifts 0..28) into a uint32_t.
+  char const*
+  tightread4(char const* start,
+             char const* stop,
+             uint32_t& dest)
+  {
+    return tightread_bounded<uint32_t,5>(start,stop,dest);
+  }
+  // Read a number of at most 3 bytes (shifts 0..14) into a uint16_t.
+  char const*
+  tightread2(char const* start,
+             char const* stop,
+             uint16_t& dest)
+  {
+    return tightread_bounded<uint16_t,3>(start,stop,dest);
+  }
+} // end namespace tpt

mosesdecoder/moses/TranslationModel/UG/mm/tpt_tightindex.h ADDED Viewed

	@@ -0,0 +1,176 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+// (c) 2007,2008 Ulrich Germann
+/* Functions for writing indices tightly (use only the bytes you need).
+ * The first bit indicates whether a byte belongs to a key or a value.
+ * The remaining 7 bits are part of the respective integer value.
+ */
+#ifndef __ugTightIndex
+#define __ugTightIndex
+#include <map>
+#include <iostream>
+#include <sstream>
+#include "tpt_typedefs.h"
+#include <cassert>
+#ifndef uchar
+#endif
+#define FLAGBITS 2
+#define FLAGMASK (uchar(3))
+#define HAS_VALUE_MASK (uchar(2))
+#define HAS_CHILD_MASK (uchar(1))
+extern bool debug;
+namespace tpt
+{
+  // void tightwritex(iostream& out, size_t data, bool flag);
+  void
+  tightwrite(std::ostream& out, uint64_t data, bool flag);
+  filepos_type
+  tightread(std::istream& in, std::ios::pos_type stop);
+  bool
+  tightfind(std::istream& in,
+	    filepos_type start,
+	    filepos_type stop,
+	    id_type key,
+	    unsigned char& flags);
+  bool
+  tightfind_noflags(std::istream& in,
+                    filepos_type start,
+                    filepos_type stop,
+                    id_type key);
+  char const*
+  tightfind(char const* const start,
+            char const* const stop,
+            id_type key,
+            unsigned char& flags);
+  char const*
+  tightfind_noflags(char const* const start,
+                    char const* const stop,
+                    id_type key);
+  /** move the read head of istream /in/ to the first entry after the midpoint of
+   *  file position range [start,stop) in a 'tight' index
+   *  @param in the data input stream
+   *  @param start start of the search range
+   *  @param stop  end   of the search range
+   *  @return true if no errors occurred
+   */
+  bool
+  tightfind_midpoint(std::istream& in, filepos_type start, filepos_type stop);
+  // the bitpattern functions below are for debugging
+  // They return a string showing the bits of the argument value
+//   std::string bitpattern(unsigned int s);
+//   std::string bitpattern(unsigned char c);
+//   std::string bitpattern(char c);
+  /** read a number from a tight index directly from a memory location
+   *  @param start start of read range
+   *  @param stop  non-inclusive end of read range
+   *  @param dest  destination
+   *  @return first memory position after the number
+   */
+  char const*
+  tightread2(char const* start, char const* stop, uint16_t& dest);
+  char const*
+  tightread4(char const* start, char const* stop, uint32_t& dest);
+  char const*
+  tightread8(char const* start, char const* stop, uint64_t& dest);
+  /** Generic front end for the in-memory readers: forwards to tightread2,
+   *  tightread4 or tightread8 according to sizeof(numType), treating /dest/
+   *  as the unsigned integer type of that size.
+   *  For any other size the assertion fails and NULL is returned.
+   */
+  template<typename numType>
+  char const*
+  tightread(char const* start, char const* stop, numType& dest)
+  {
+    switch (sizeof(numType))
+      {
+      case 2:
+        return tightread2(start,stop,reinterpret_cast<uint16_t&>(dest));
+      case 4:
+        return tightread4(start,stop,reinterpret_cast<uint32_t&>(dest));
+      case 8:
+        return tightread8(start,stop,reinterpret_cast<uint64_t&>(dest));
+      default:
+        break;
+      }
+    assert(0);
+    return NULL;
+  }
+//   char const*
+//   tightread(char const* start, char const* stop, uint64_t& dest);
+//   char const*
+//   tightread(char const* start, char const* stop, filepos_type& dest);
+#if 0
+  template<typename dtype>
+  char const*
+  tightread(char const* start,
+            char const* stop,
+            dtype& dest)
+  {
+    static char bitmask=127;
+    dest = 0;
+    if (*start < 0)
+      {
+        dest = (*start)&bitmask;
+        if (++start==stop || *start >= 0) return start;
+        dest += dtype((*start)&bitmask)<<7;
+        if (++start==stop || *start >= 0) return start;
+        dest += dtype((*start)&bitmask)<<14;
+        if (++start==stop || *start >= 0) return start;
+        dest += dtype((*start)&bitmask)<<21;
+        if (++start==stop || *start >= 0) return start;
+        dest += dtype((*start)&bitmask)<<28;
+        if (++start==stop || *start >= 0) return start;
+        assert(sizeof(dtype) > 4);
+        dest += dtype((*start)&bitmask)<<35;
+        if (++start==stop || *start >= 0) return start;
+        dest += dtype((*start)&bitmask)<<42;
+        if (++start==stop || *start >= 0) return start;
+        dest += dtype((*start)&bitmask)<<49;
+        if (++start==stop || *start >= 0) return start;
+        dest += dtype((*start)&bitmask)<<56;
+        if (++start==stop || *start >= 0) return start;
+        dest += dtype((*start)&bitmask)<<63;
+      }
+    else
+      {
+        dest = *start;
+        if (++start==stop || *start < 0) return start;
+        dest += dtype(*start)<<7;
+        if (++start==stop || *start < 0) return start;
+        dest += dtype(*start)<<14;
+        if (++start==stop || *start < 0) return start;
+        dest += dtype(*start)<<21;
+        if (++start==stop || *start < 0) return start;
+        dest += dtype(*start)<<28;
+        if (++start==stop || *start < 0) return start;
+        assert(sizeof(dtype) > 4);
+        dest += dtype(*start)<<35;
+        if (++start==stop || *start < 0) return start;
+        dest += dtype(*start)<<42;
+        if (++start==stop || *start < 0) return start;
+        dest += dtype(*start)<<49;
+        if (++start==stop || *start < 0) return start;
+        dest += dtype(*start)<<56;
+        if (++start==stop || *start < 0) return start;
+        dest += dtype(*start)<<63;
+      }
+    assert(start<stop);
+    return ++start;
+  }
+#endif
+}
+#endif

mosesdecoder/moses/TranslationModel/UG/mm/tpt_tokenindex.cc ADDED Viewed

	@@ -0,0 +1,420 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+// (c) 2007-2013 Ulrich Germann
+#include <sstream>
+#include <cstring>
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+#include <boost/pool/pool_alloc.hpp>
+#include "tpt_tokenindex.h"
+#include "ug_typedefs.h"
+using namespace std;
+namespace sapt
+{
+  // Construct an index with an empty lookup table (startIdx == endIdx),
+  // UNK label /unkToken/ and UNK id 1, and create the mutex that guards
+  // lazily built state.
+  TokenIndex::
+  TokenIndex(string unkToken)
+    : ridx(0), unkLabel(unkToken), unkId(1), numTokens(0)
+    , startIdx(0), endIdx(0)
+  {
+    // operator[] reads /dynamic/; it used to stay uninitialized until
+    // open() or setDynamic() was called. Assigned here rather than in the
+    // initializer list so as not to depend on member declaration order.
+    dynamic = false;
+    lock.reset(new boost::mutex());
+  }
+#if 0
+  TokenIndex::
+  TokenIndex(string fname, string unkToken,bool dyna)
+    : ridx(0),unkLabel(unkToken)
+  {
+    this->open(fname,unkToken,dyna);
+  };
+#endif
+  // Open the token index file /fname/ and point the lookup table at its
+  // contents. Layout as read here: a uint32_t entry count, an id_type UNK
+  // id, /numTokens/ Entry records, and right behind them the data that
+  // Entry::offset is relative to.
+  // If /unkToken/ is non-empty, the UNK id from the file is replaced by the
+  // id of that token, or by numTokens if the token is not in the index.
+  // With /dyna/ set, fresh containers for dynamically added words are
+  // created.
+  // Throws std::runtime_error if the file does not exist or cannot be
+  // opened.
+  void
+  TokenIndex::
+  open(string fname, string unkToken,bool dyna)
+  {
+    if (access(fname.c_str(),F_OK))
+      throw std::runtime_error("TokenIndex::open: File '" + fname
+                               + "' does not exist.");
+    file.open(fname);
+    if (!file.is_open())
+      throw std::runtime_error("TokenIndex::open: Error opening file '"
+                               + fname + "'.");
+    char const* const raw = file.data();
+    this->numTokens = *(reinterpret_cast<uint32_t const*>(raw));
+    unkId     = *(reinterpret_cast<id_type const*>(raw+4));
+    startIdx  = reinterpret_cast<Entry const*>(raw+4+sizeof(id_type));
+    endIdx    = startIdx + numTokens;
+    comp.base = reinterpret_cast<char const*>(endIdx);
+    if (!unkToken.empty())
+      {
+        Entry const* hit = lower_bound(startIdx,endIdx,unkToken.c_str(),comp);
+        bool const known = (hit < endIdx && unkToken == comp.base+hit->offset);
+        unkId = (known ? hit->id : numTokens);
+      }
+    this->dynamic=dyna;
+    if (!dyna) return;
+    this->str2idExtra.reset(new map<string,id_type>());
+    this->newWords.reset(new vector<string>());
+  }
+  // Close the underlying index file.
+  void
+  TokenIndex::
+  close()
+  {
+    this->file.close();
+  }
+  // Nothing to set up here; /base/ is assigned by TokenIndex::open().
+  TokenIndex::
+  CompFunc::
+  CompFunc()
+  {
+  }
+  // Ordering predicate for lower_bound(): true iff the string of entry /A/
+  // (found at base + A.offset) sorts before /w/ according to strcmp.
+  bool
+  TokenIndex::
+  CompFunc::
+  operator()(Entry const& A, char const* w)
+  {
+    char const* const entryText = base + A.offset;
+    return strcmp(entryText,w) < 0;
+  }
+  // Map token string /p/ to its id.
+  // - If /p/ is in the static lookup table, its stored id is returned.
+  // - Otherwise, if the index is not dynamic: unkId when the table is
+  //   non-empty; with an empty table, 0 for "NULL" and otherwise 1 or 0
+  //   depending on whether unkId is non-zero.
+  //   NOTE(review): the latter yields the truth value of
+  //   `strcmp(p,"NULL") && unkId`, not unkId itself -- confirm intended.
+  // - Otherwise (dynamic index) the token is looked up in, and if necessary
+  //   added to, the table of extra words under the lock; new ids are
+  //   numTokens plus the number of extra words present at that moment.
+  //   numTokens itself is not changed: it stays the number of pre-existing
+  //   tokens.
+  id_type
+  TokenIndex::
+  operator[](char const* p) const
+  {
+    bool const haveTable = (startIdx != endIdx);
+    if (haveTable)
+      {
+        Entry const* hit = lower_bound(startIdx,endIdx,p,comp);
+        if (hit != endIdx && !strcmp(comp.base+hit->offset,p))
+          return hit->id;
+      }
+    if (!dynamic)
+      {
+        if (haveTable) return unkId;
+        return strcmp(p,"NULL") && unkId;
+      }
+    boost::lock_guard<boost::mutex> lk(*this->lock);
+    typedef map<string,id_type> extra_map_t;
+    extra_map_t::value_type candidate(p,str2idExtra->size()+numTokens);
+    pair<extra_map_t::iterator,bool> slot = str2idExtra->insert(candidate);
+    if (slot.second) // it actually is a new item
+      newWords->push_back(slot.first->first);
+    return slot.first->second;
+  }
+  // Convenience overload: same as operator[](char const*) on w.c_str().
+  id_type
+  TokenIndex::
+  operator[](string const& w) const
+  {
+    char const* const text = w.c_str();
+    return (*this)[text];
+  }
+  // Build the id -> string table for the static part of the index. The
+  // result starts out with one NULL slot per entry and grows as needed so
+  // that every entry's id is a valid position; slots of ids that do not
+  // occur stay NULL.
+  vector<char const*>
+  TokenIndex::
+  reverseIndex() const
+  {
+    size_t const entryCount = endIdx-startIdx;
+    vector<char const*> table(entryCount,NULL);
+    for (Entry const* e = startIdx; e != endIdx; ++e)
+      {
+        if (e->id >= table.size())
+          table.resize(e->id+1);
+        table[e->id] = comp.base+e->offset;
+      }
+    return table;
+  }
+  // Map /id/ back to its token string. The reverse index is built on first
+  // use. Ids within the reverse index are answered from it; for a dynamic
+  // index, ids in the range right behind it are answered from the list of
+  // dynamically added words (under the lock); anything else yields the UNK
+  // label.
+  char const* const
+  TokenIndex::
+  operator[](id_type id) const
+  {
+    if (ridx.empty())
+      {
+        boost::lock_guard<boost::mutex> lk(*this->lock);
+        // Someone else (multi-threading!) may have created the
+        // reverse index in the meantime, so let's check again
+        if (ridx.empty()) ridx = reverseIndex();
+      }
+    if (id < ridx.size())
+      return ridx[id];
+    boost::lock_guard<boost::mutex> lk(*this->lock);
+    bool const isExtra = dynamic && id < ridx.size()+newWords->size();
+    return (isExtra
+            ? (*newWords)[id-ridx.size()].c_str()
+            : unkLabel.c_str());
+  }
+  // Build the reverse (id -> string) index now unless it already exists.
+  void
+  TokenIndex::
+  iniReverseIndex()
+  {
+    if (!ridx.empty()) return;
+    boost::lock_guard<boost::mutex> lk(*this->lock);
+    // re-check under the lock: another thread may have built it meanwhile
+    if (ridx.empty()) ridx = reverseIndex();
+  }
+  // Non-const id -> string lookup. Performs exactly the steps of the const
+  // overload, so it simply forwards to it.
+  char const* const
+  TokenIndex::
+  operator[](id_type id)
+  {
+    TokenIndex const& self = *this;
+    return self[id];
+  }
+  // Non-const variant of toString(vector<id_type> const&): performs the
+  // same steps as the const overload, so it simply forwards to it.
+  string
+  TokenIndex::
+  toString(vector<id_type> const& v)
+  {
+    TokenIndex const& self = *this;
+    return self.toString(v);
+  }
+  // Render the id sequence /v/ as the corresponding token strings,
+  // separated by single blanks. Builds the reverse index first if needed.
+  string
+  TokenIndex::
+  toString(vector<id_type> const& v) const
+  {
+    if (ridx.empty())
+      {
+        boost::lock_guard<boost::mutex> lk(*this->lock);
+        if (ridx.empty()) ridx = reverseIndex();
+      }
+    ostringstream out;
+    for (size_t i = 0; i < v.size(); ++i)
+      {
+        if (i) out << " ";
+        out << (*this)[v[i]];
+      }
+    return out.str();
+  }
+  // Non-const variant of toString(id_type const*, id_type const*): performs
+  // the same steps as the const overload, so it simply forwards to it.
+  string
+  TokenIndex::
+  toString(id_type const* start, id_type const* const stop)
+  {
+    TokenIndex const& self = *this;
+    return self.toString(start,stop);
+  }
+  // Render the ids in [start,stop) as the corresponding token strings,
+  // separated by single blanks; an empty range gives an empty string.
+  // Builds the reverse index first if needed.
+  string
+  TokenIndex::
+  toString(id_type const* start, id_type const* const stop) const
+  {
+    if (ridx.empty())
+      {
+        boost::lock_guard<boost::mutex> lk(*this->lock);
+        if (ridx.empty()) ridx = reverseIndex();
+      }
+    ostringstream out;
+    for (id_type const* cur = start; cur < stop; ++cur)
+      {
+        if (cur != start) out << " ";
+        out << (*this)[*cur];
+      }
+    return out.str();
+  }
+  // Split /line/ at whitespace and return the id of every token, in order.
+  vector<id_type>
+  TokenIndex::
+  toIdSeq(string const& line) const
+  {
+    vector<id_type> ids;
+    istringstream tokens(line);
+    for (string word; tokens >> word; )
+      ids.push_back((*this)[word]);
+    return ids;
+  }
+  /// Replace the contents of /v/ with the ids of the whitespace-separated
+  /// tokens of /line/.
+  /// @return false if any of the ids is 0 or 1, true otherwise
+  /// NOTE(review): the check is against the literal 1, not unkId; ids 0 and
+  /// 1 are presumably NULL and UNK -- confirm.
+  bool
+  TokenIndex::
+  fillIdSeq(string const& line, vector<id_type> & v) const
+  {
+    v.clear();
+    bool allgood = true;
+    istringstream tokens(line);
+    string word;
+    while (tokens >> word)
+      {
+        id_type const id = (*this)[word];
+        v.push_back(id);
+        if (!(id > 1)) allgood = false;
+      }
+    return allgood;
+  }
+  // Number of tokens in the static part of the index (numTokens).
+  id_type
+  TokenIndex::
+  getNumTokens() const
+  {
+    return this->numTokens;
+  }
+  // Id currently used for unknown tokens.
+  id_type
+  TokenIndex::
+  getUnkId() const
+  {
+    return this->unkId;
+  }
+  // Label used for unknown tokens. This is the stored label, not the
+  // result of looking up unkId.
+  char const* const
+  TokenIndex::
+  getUnkToken() const
+  {
+    char const* const label = unkLabel.c_str();
+    return label;
+  }
+  // Size of the static vocabulary, i.e. numTokens; dynamically added words
+  // are not counted.
+  id_type
+  TokenIndex::
+  knownVocabSize() const
+  {
+    return this->numTokens;
+  }
+  // Short name for knownVocabSize(): returns numTokens.
+  id_type
+  TokenIndex::
+  ksize() const
+  {
+    return this->numTokens;
+  }
+  // Long name for tsize().
+  id_type
+  TokenIndex::
+  totalVocabSize() const
+  {
+    return this->tsize();
+  }
+  // Total vocabulary size: numTokens plus, if the list of dynamically
+  // added words exists, the number of words in it.
+  id_type
+  TokenIndex::
+  tsize() const
+  {
+    id_type total = numTokens;
+    if (newWords != NULL)
+      total += newWords->size();
+    return total;
+  }
+  // Write a token index file. File layout as written here:
+  //   uint32_t  number of entries
+  //   id_type   id of /unkToken/ (tok.size() if it is not among /tok/)
+  //   per entry: uint32_t offset of its string in the string block,
+  //              id_type  its id
+  //   string block: all token strings, each followed by a 0 byte
+  // Entries are written in the order given.
+  // NOTE(review): TokenIndex::open() runs lower_bound over the entries, so
+  // /tok/ presumably has to arrive sorted by string -- TokenIndex::write()
+  // sorts before calling; verify for other callers.
+  // Numbers are written in the byte order of the host.
+  // @param tok      (token string, id) pairs
+  // @param ofile    name of the output file
+  // @param unkToken label of the unknown token
+  // Throws std::runtime_error if /ofile/ cannot be opened for writing.
+  void
+  write_tokenindex_to_disk(vector<pair<string,uint32_t> > const& tok,
+                           string const& ofile, string const& unkToken)
+  {
+    typedef pair<uint32_t,id_type> IndexEntry; // offset and id
+    // Write token strings to a buffer, keep track of offsets
+    vector<IndexEntry> index(tok.size());
+    ostringstream data;
+    id_type unkId = tok.size();
+    for (size_t i = 0; i < tok.size(); i++)
+      {
+        if (tok[i].first == unkToken)
+          unkId = tok[i].second;
+        index[i].first  = data.tellp();   // offset of string
+        index[i].second = tok[i].second;  // respective ID
+        data<<tok[i].first<<char(0);      // write string to buffer
+      }
+    // Now write the actual file. The content is binary, so the stream must
+    // not be opened in text mode, where some platforms translate line ends.
+    ofstream out(ofile.c_str(), std::ios::out | std::ios::binary);
+    if (!out)
+      throw std::runtime_error("write_tokenindex_to_disk: Error opening file '"
+                               + ofile + "' for writing.");
+    uint32_t vsize = index.size(); // how many vocab items?
+    out.write(reinterpret_cast<char*>(&vsize),4);
+    out.write(reinterpret_cast<char*>(&unkId),sizeof(id_type));
+    for (size_t i = 0; i < index.size(); i++)
+      {
+        out.write(reinterpret_cast<char*>(&index[i].first),4);
+        out.write(reinterpret_cast<char*>(&index[i].second),sizeof(id_type));
+      }
+    string const strings = data.str();
+    out.write(strings.data(),strings.size());
+  }
+  // Write the complete vocabulary (ids 0 .. totalVocabSize()-1, each with
+  // the string that operator[] reports for it) to file /fname/, sorted by
+  // (string, id), using write_tokenindex_to_disk().
+  void
+  TokenIndex::
+  write(string fname)
+  {
+    typedef pair<string,uint32_t>  Token;      // token and id
+    id_type const total = totalVocabSize();
+    vector<Token> tok;
+    tok.reserve(total);
+    for (id_type id = 0; id < total; ++id)
+      tok.push_back(Token((*this)[id],id));
+    sort(tok.begin(),tok.end());
+    write_tokenindex_to_disk(tok,fname,unkLabel);
+  }
+  // Tells whether unknown tokens are currently assigned new ids on lookup.
+  bool
+  TokenIndex::
+  isDynamic() const
+  {
+    return this->dynamic;
+  }
+  // Switch dynamic mode on or off and return the previous setting.
+  // When switching on, the containers for dynamically added words are
+  // created if they do not exist yet, and "NULL" and the UNK label are
+  // looked up right away, in that order.
+  bool
+  TokenIndex::
+  setDynamic(bool on)
+  {
+    bool const previous = dynamic;
+    if (!on)
+      {
+        dynamic = false;
+        return previous;
+      }
+    if (this->str2idExtra == NULL)
+      {
+        this->str2idExtra.reset(new map<string,id_type>());
+        this->newWords.reset(new vector<string>());
+      }
+    dynamic = true;
+    (*this)["NULL"];
+    (*this)[unkLabel];
+    return previous;
+  }
+  // Make /unk/ the label for unknown tokens: unkId becomes whatever id
+  // operator[] reports for /unk/, then the label itself is replaced.
+  void
+  TokenIndex::
+  setUnkLabel(string unk)
+  {
+    id_type const newId = (*this)[unk];
+    unkId    = newId;
+    unkLabel = unk;
+  }
+}

mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.cc ADDED Viewed

	@@ -0,0 +1,171 @@

+//-*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
+#include "ug_bitext.h"
+#include <algorithm>
+#include <boost/math/distributions/binomial.hpp>
+namespace sapt
+{
+  // Lower bound on the success probability of a binomial distribution
+  // given /succ/ successes in /tries/ trials. With confidence == 0 this is
+  // the plain ratio succ/tries; otherwise it is whatever boost's
+  // find_lower_bound_on_p returns for (tries, succ, confidence).
+  // NOTE(review): tries == 0 is not guarded against in the ratio branch.
+  float
+  lbop(size_t const tries, size_t const succ, float const confidence)
+  {
+    if (confidence == 0)
+      return float(succ)/tries;
+    return (boost::math::binomial_distribution<>::
+            find_lower_bound_on_p(tries, succ, confidence));
+  }
+  // Append all sentences in /snt/ to the token track and rebuild the index.
+  // Each sentence is split at whitespace, its words are mapped to ids via
+  // /V/, and the sequence is appended to /track/; the position each
+  // sentence gets in the track (track->size() before appending, 0 if there
+  // is no track yet) is recorded. Afterwards a new imTSA replaces /index/:
+  // derived from the old one plus the new sentence positions if an index
+  // existed, built from the track alone otherwise.
+  void
+  snt_adder<L2R_Token<SimpleWordId> >::
+  operator()()
+  {
+    typedef L2R_Token<SimpleWordId> tkn;
+    std::vector<id_type> sids;
+    sids.reserve(snt.size());
+    for (size_t k = 0; k < snt.size(); ++k)
+      {
+        sids.push_back(track ? track->size() : 0);
+        std::istringstream words(snt[k]);
+        std::vector<tkn> toks;
+        toks.reserve(100);
+        for (std::string w; words >> w; )
+          toks.push_back(tkn(V[w]));
+        track = append(track,toks);
+      }
+    if (!index)
+      index.reset(new imTSA<tkn>(track,NULL,NULL));
+    else
+      index.reset(new imTSA<tkn>(*index,track,sids,V.tsize()));
+  }
+  // Bind the adder to the sentences /s/, the vocabulary /v/, and the
+  // track /t/ and index /i/ that operator()() works on.
+  snt_adder<L2R_Token<SimpleWordId> >::
+  snt_adder(std::vector<std::string> const& s, TokenIndex& v,
+            SPTR<imTtrack<L2R_Token<SimpleWordId> > >& t,
+            SPTR<imTSA<L2R_Token<SimpleWordId> > >& i)
+    : snt(s)
+    , V(v)
+    , track(t)
+    , index(i)
+  {
+  }
+  // Grow a phrase pair, starting from position /s2/ on side 2, by following
+  // alignment links back and forth between the two sides until the agenda of
+  // unprocessed positions is empty.
+  //
+  // a1[p] / a2[p]: link lists indexed by position on side 1 / side 2
+  //   (presumably the positions on the other side that p is aligned to;
+  //   the values are used to index the other side's structures below).
+  // s2:     seed position on side 2
+  // L1, R1: span on side 1 that the new phrase must not touch; a seed or
+  //         link inside [L1,R1) makes the function fail
+  // s1, e1, e2: outputs; on success e1 and e2 have been incremented once
+  //         more after the loop (see the end of the function).
+  //
+  // Returns false if a2[s2] is empty, if the seed's first link lies in
+  // [L1,R1), if a side-1 word links to a side-2 position before s2, or if
+  // a side-2 word links into or across [L1,R1); true otherwise.
+  bool
+  expand_phrase_pair
+  (std::vector<std::vector<ushort> >& a1,
+   std::vector<std::vector<ushort> >& a2,
+   ushort const s2, // next word on in target side
+   ushort const L1, ushort const R1, // limits of previous phrase
+   ushort & s1, ushort & e1, ushort& e2) // start/end src; end trg
+  {
+    // nothing to expand from if the seed word has no links; the location
+    // is printed to stdout as a debugging aid
+    if (a2[s2].size() == 0)
+      {
+        std::cout << __FILE__ << ":" << __LINE__ << std::endl;
+        return false;
+      }
+    // done1/done2 record which positions have been taken off the agenda
+    bitvector done1(a1.size());
+    bitvector done2(a2.size());
+    std::vector<std::pair<ushort,ushort> > agenda;
+    // x.first:  side (1 or 2)
+    // x.second: word position
+    agenda.reserve(a1.size() + a2.size());
+    agenda.push_back(std::pair<ushort,ushort>(2,s2));
+    e2 = s2;
+    s1 = e1 = a2[s2].front();
+    if (s1 >= L1 && s1 < R1)
+      {
+        std::cout << __FILE__ << ":" << __LINE__ << std::endl;
+        return false;
+      }
+    // NOTE(review): (2,s2) was already pushed above, so the seed ends up on
+    // the agenda twice and is processed twice -- looks redundant; confirm.
+    agenda.push_back(std::pair<ushort,ushort>(2,s2));
+    while (agenda.size())
+      {
+        ushort side = agenda.back().first;
+        ushort p    = agenda.back().second;
+        agenda.pop_back();
+        if (side == 1)
+          {
+            done1.set(p);
+            BOOST_FOREACH(ushort i, a1[p])
+              {
+                // a link to a position before the seed makes expansion fail
+                if (i < s2)
+                  {
+                    // cout << __FILE__ << ":" << __LINE__ << endl;
+                    return false;
+                  }
+                if (done2[i]) continue;
+                // schedule every unprocessed side-2 position from e2 up to
+                // and including i; e2 is left at i+1 afterwards
+                for (;e2 <= i;++e2)
+                  if (!done2[e2])
+                    agenda.push_back(std::pair<ushort,ushort>(2,e2));
+              }
+          }
+        else
+          {
+            done2.set(p);
+            BOOST_FOREACH(ushort i, a2[p])
+              {
+                // fail if the link would make the span jump over or land
+                // inside [L1,R1)
+                if ((e1 < L1 && i >= L1) ||
+                    (s1 >= R1 && i < R1) ||
+                    (i >= L1 && i < R1))
+                  {
+                    // cout << __FILE__ << ":" << __LINE__ << " "
+                    // << L1 << "-" << R1 << " " << i << " "
+                    // << s1 << "-" << e1<< endl;
+                    return false;
+                  }
+                if (e1 < i)
+                  {
+                    // extend to the right: schedule e1..i, leaving e1 at i+1
+                    for (; e1 <= i; ++e1)
+                      if (!done1[e1])
+                        agenda.push_back(std::pair<ushort,ushort>(1,e1));
+                  }
+                else if (s1 > i)
+                  {
+                    // schedule i..s1 on side 1.
+                    // NOTE(review): this loop advances the local copy i and
+                    // never assigns to s1, so s1 keeps its initial value for
+                    // the whole call -- confirm that this is intended.
+                    for (; i <= s1; ++i)
+                      if (!done1[i])
+                        agenda.push_back(std::pair<ushort,ushort>(1,i));
+                  }
+              }
+          }
+      }
+    // final increment of both end markers before reporting success
+    ++e1;
+    ++e2;
+    return true;
+  }
+  // Debugging aid: print an alignment matrix to stdout.
+  // a1[j] lists the column positions set in row j; len2 is the number of
+  // columns. Cells inside the box [b1,e1) x [b2,e2) are drawn as 'x' (set)
+  // or '-' (unset), all other cells as 'o' (set) or '.' (unset).
+  void
+  print_amatrix(std::vector<std::vector<ushort> > a1, uint32_t len2,
+                ushort b1, ushort e1, ushort b2, ushort e2)
+  {
+    // build one bit row per entry of a1
+    std::vector<bitvector> grid(a1.size(),bitvector(len2));
+    for (ushort row = 0; row < a1.size(); ++row)
+      for (size_t n = 0; n < a1[row].size(); ++n)
+        grid[row].set(a1[row][n]);
+
+    // header: the box limits, then a ruler with the last digit of each column
+    std::cout << b1 << "-" << e1 << " " << b2 << "-" << e2 << std::endl;
+    std::cout << "   ";
+    for (size_t col = 0; col < len2; ++col)
+      std::cout << col%10;
+    std::cout << std::endl;
+
+    for (size_t row = 0; row < grid.size(); ++row)
+      {
+        std::cout << std::setw(3) << row << " ";
+        bool const row_in_box = (b1 <= row) && (row < e1);
+        for (size_t col = 0; col < grid[row].size(); ++col)
+          {
+            bool const in_box = row_in_box && b2 <= col && col < e2;
+            bool const linked = grid[row][col];
+            char mark;
+            if (in_box) mark = linked ? 'x' : '-';
+            else        mark = linked ? 'o' : '.';
+            std::cout << mark;
+          }
+        std::cout << std::endl;
+      }
+    std::cout << std::string(90,'-') << std::endl;
+  }
+  // Write the positions of all set bits in /v/ to /out/ as a
+  // comma-separated list (nothing is written if no bit is set).
+  void
+  write_bitvector(bitvector const& v, std::ostream& out)
+  {
+    size_t pos = v.find_first();
+    while (pos < v.size())
+      {
+        out << pos;
+        pos = v.find_next(pos);
+        // separator only if another set bit follows
+        if (pos < v.size())
+          out << ",";
+      }
+  }
+}

mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext.h ADDED Viewed

	@@ -0,0 +1,782 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+#pragma once
+// Implementations of word-aligned bitext.
+// Written by Ulrich Germann
+//
+// mmBitext: static, memory-mapped bitext
+// imBitext: dynamic, in-memory bitext
+//
+// Ideas for speeding things up:
+// - set up threads at startup time that force the
+//   data into memory sequentially
+//
+// - use multiple agendas for better load balancing and to avoid
+//   competition for locks
+//
+#define UG_BITEXT_TRACK_ACTIVE_THREADS 0
+#include <string>
+#include <vector>
+#include <cassert>
+#include <iomanip>
+#include <algorithm>
+#include <boost/foreach.hpp>
+#include <boost/random.hpp>
+#include <boost/format.hpp>
+#include <boost/thread.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/math/distributions/binomial.hpp>
+#include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
+#include "moses/TranslationModel/UG/generic/sampling/Sampling.h"
+#include "moses/TranslationModel/UG/generic/file_io/ug_stream.h"
+#include "moses/TranslationModel/UG/generic/threading/ug_thread_safe_counter.h"
+#include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
+// #include "moses/FF/LexicalReordering/LexicalReorderingState.h"
+#include "moses/Util.h"
+#ifndef NO_MOSES
+// #pragma message "COMPILING WITH MOSES SUPPORT!"
+#include "moses/StaticData.h"
+#include "moses/thread_safe_container.h"
+#include "moses/ContextScope.h"
+#include "moses/TranslationTask.h"
+#else
+// #pragma message "COMPILING WITHOUT MOSES SUPPORT!"
+#endif
+#include "util/exception.hh"
+// #include "util/check.hh"
+#include "ug_typedefs.h"
+#include "ug_mm_ttrack.h"
+#include "ug_im_ttrack.h"
+#include "ug_mm_tsa.h"
+#include "ug_im_tsa.h"
+#include "tpt_tokenindex.h"
+#include "ug_corpus_token.h"
+#include "tpt_pickler.h"
+#include "ug_lexical_phrase_scorer2.h"
+#include "ug_lru_cache.h"
+#include "ug_lexical_reordering.h"
+#include "ug_sampling_bias.h"
+#include "ug_phrasepair.h"
+#include "ug_bitext_phrase_extraction_record.h"
+#include "moses/TranslationModel/UG/generic/threading/ug_ref_counter.h"
+// Minimum source count for caching phrase lookup statistics.
+// If source phrase occurs less frequently, never cache;
+// always re-compute.
+#define PSTATS_CACHE_THRESHOLD 50
+namespace Moses { class Mmsapt; }
+namespace sapt
+{
+  using Moses::ttasksptr;
+  using Moses::ttaskwptr;
+  using tpt::binread;
+  using tpt::binwrite;
+  float lbop(size_t const tries, size_t const succ, float const confidence);
+  void write_bitvector(bitvector const& v, std::ostream& out);
+#ifndef NO_MOSES
+  // Per-query context: a sampling bias, two pstats caches, and an
+  // optional log stream for the bias, together with a lock member.
+  struct ContextForQuery
+  {
+    // needs to be made thread-safe
+    // ttasksptr const m_ttask;
+    // size_t max_samples;
+    boost::shared_mutex   lock;
+    SPTR<SamplingBias>    bias;
+    SPTR<pstats::cache_t> cache1;
+    SPTR<pstats::cache_t> cache2;
+    std::ostream*         bias_log; // NULL unless set by the user of this struct
+
+    ContextForQuery()
+      : bias_log(NULL)
+    { }
+  };
+#endif
+  template<typename Token> class BitextSampler;
+  // Abstract base class for word-aligned bitexts over tokens of type TKN.
+  // It owns (via shared pointers) the token tracks, vocabularies, indices
+  // and the word alignment track of both sides, the caches for sampling
+  // results, and the agenda that schedules sampling jobs. Concrete
+  // bitexts must implement open().
+  template<typename TKN>
+  class Bitext // : public Moses::reference_counter
+  {
+  public:
+    template<typename Token> friend class BitextSampler;
+    typedef TKN Token;
+    typedef typename TSA<Token>::tree_iterator   iter;
+    typedef typename std::vector<PhrasePair<Token> > vec_ppair;
+    typedef typename lru_cache::LRU_Cache<uint64_t, vec_ppair> pplist_cache_t;
+    typedef TSA<Token> tsa;
+    friend class Moses::Mmsapt;
+  protected:
+    mutable boost::shared_mutex m_lock; // for thread-safe operation
+    class agenda; // for parallel sampling see ug_bitext_agenda.h
+    mutable SPTR<agenda> ag; // created lazily by prep2()
+    size_t m_num_workers; // number of workers available to the agenda
+    size_t m_default_sample_size;
+    size_t m_pstats_cache_threshold; // threshold for caching sampling results
+    SPTR<pstats::cache_t> m_cache1, m_cache2; // caches for sampling results
+    std::vector<std::string> m_docname;
+    std::map<std::string,id_type>  m_docname2docid; // maps from doc names to ids
+    SPTR<std::vector<id_type> >   m_sid2docid; // maps from sentences to docs (ids)
+    mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
+    // caches for unbiased sampling; biased sampling uses the caches that
+    // are stored locally on the translation task
+  public:
+    SPTR<Ttrack<char> >  Tx; // word alignments
+    SPTR<Ttrack<Token> > T1; // token track
+    SPTR<Ttrack<Token> > T2; // token track
+    SPTR<TokenIndex>     V1; // vocab
+    SPTR<TokenIndex>     V2; // vocab
+    SPTR<TSA<Token> >    I1; // indices
+    SPTR<TSA<Token> >    I2; // indices
+    /// given the source phrase sid[start:stop]
+    //  find the possible start (s1 .. s2) and end (e1 .. e2)
+    //  points of the target phrase; if non-NULL, store word
+    //  alignments in *core_alignment. If /flip/, source phrase is
+    //  L2.
+    // Convenience overload: takes all arguments from /rec/.
+    bool find_trg_phr_bounds(PhraseExtractionRecord& rec) const;
+    bool find_trg_phr_bounds
+    ( size_t const sid,    // sentence to investigate
+      size_t const start,  // start of source phrase
+      size_t const stop,   // end of source phrase (exclusive: the
+                           // definition treats src >= stop as outside)
+      size_t & s1, size_t & s2, // beginning and end of target start
+      size_t & e1, size_t & e2, // beginning and end of target end
+      int& po_fwd, int& po_bwd, // phrase orientations
+      std::vector<unsigned char> * core_alignment, // stores the core alignment
+      bitvector* full_alignment, // stores full word alignment for this sent.
+      bool const flip) const;   // flip source and target (reverse lookup)
+    // prep2 launches sampling and returns immediately.
+    // lookup (below) waits for the job to finish before it returns
+    // A negative max_sample selects the default sample size.
+    SPTR<pstats>
+    prep2(iter const& phrase, int max_sample = -1) const;
+#ifndef NO_MOSES
+    SPTR<pstats>
+    prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
+          int max_sample = -1) const;
+#endif
+  protected:
+    // Both constructors set the worker count, the default sample size and
+    // the cache threshold (PSTATS_CACHE_THRESHOLD) and create fresh pstats
+    // caches; the second one additionally takes over the given tracks,
+    // vocabularies and indices by storing them in the shared pointers above.
+    Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16);
+    Bitext(Ttrack<Token>* const t1, Ttrack<Token>* const t2,
+           Ttrack<char>*  const tx,
+           TokenIndex*    const v1, TokenIndex*    const v2,
+           TSA<Token>*    const i1, TSA<Token>*    const i2,
+           size_t const max_sample=1000,
+           size_t const xnum_workers=16);
+  public:
+    // to be implemented by concrete bitexts
+    virtual void
+    open(std::string const base, std::string const L1, std::string const L2) = 0;
+    SPTR<pstats>
+    lookup(iter const& phrase, int max_sample = -1) const;
+    // schedules /phrase/ for sampling with the default sample size
+    void prep(iter const& phrase) const;
+#ifndef NO_MOSES
+    SPTR<pstats>
+    lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
+    void prep(ttasksptr const& ttask, iter const& phrase, bool const track_sids) const;
+#endif
+    // changing the default sample size replaces both pstats caches
+    void   setDefaultSampleSize(size_t const max_samples);
+    size_t getDefaultSampleSize() const;
+    // renders the phrase identified by /pid/ as a space-separated string,
+    // using T2/V2 if /isL2/ is non-zero and T1/V1 otherwise
+    std::string toString(uint64_t pid, int isL2) const;
+    virtual size_t revision() const { return 0; }
+    // reads one float per sentence of T1 from the file /fname/
+    SPTR<SentenceBias>
+    loadSentenceBias(std::string const& fname) const;
+    // both SetupDocumentBias variants throw if no document map is loaded
+    SPTR<DocumentBias>
+    SetupDocumentBias(std::string const& bserver, std::string const& text,
+                      std::ostream* log) const;
+    SPTR<DocumentBias>
+    SetupDocumentBias(std::map<std::string,float> context_weights,
+                      std::ostream* log) const;
+    // sets the bits in /check/ for the positions in [start,end) that are
+    // covered by an occurrence of the phrase /m/
+    void
+    mark_match(Token const* start, Token const* end, iter const& m,
+               bitvector& check) const;
+    void
+    write_yawat_alignment
+    ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const;
+    // returns "" if /sid/ has no known document name
+    std::string sid2docname(id_type const sid) const;
+    // NOTE: the argument is a document id (the definition names it /did/);
+    // unknown ids are rendered as their decimal number
+    std::string docid2name(id_type const sid) const;
+    // returns -1 for unknown document names
+    int docname2docid(std::string const& name) const;
+    // raw access to the sentence-to-document map (may be NULL)
+    std::vector<id_type> const* sid2did() const;
+    // returns -1 if no sentence-to-document map is loaded
+    int sid2did(uint32_t sid) const;
+  };
+  #include "ug_bitext_agenda.h"
+  // Look up the numeric id of the document called /name/.
+  // Returns -1 if the name is not in m_docname2docid.
+  template<typename Token>
+  int
+  Bitext<Token>::
+  docname2docid(std::string const& name) const
+  {
+    typedef std::map<std::string,id_type>::const_iterator map_iter;
+    map_iter const hit = m_docname2docid.find(name);
+    if (hit == m_docname2docid.end())
+      return -1;
+    return hit->second;
+  }
+  // Return the name of document /did/; ids without an entry in m_docname
+  // are rendered as their decimal number.
+  template<typename Token>
+  std::string
+  Bitext<Token>::
+  docid2name(id_type const did) const
+  {
+    if (did >= m_docname.size())
+      return (boost::format("%d") % did).str();
+    return m_docname[did];
+  }
+  // Return the name of the document that sentence /sid/ belongs to, or
+  // the empty string if that cannot be determined: no sentence-to-document
+  // map is loaded, /sid/ is outside the map, or the mapped document id has
+  // no entry in m_docname.
+  template<typename Token>
+  std::string
+  Bitext<Token>::
+  sid2docname(id_type const sid) const
+  {
+    // m_sid2docid may be NULL (sid2did(uint32_t) and SetupDocumentBias
+    // both check for that), so it must not be dereferenced blindly
+    if (!m_sid2docid)
+      return "";
+    if (sid < m_sid2docid->size() && (*m_sid2docid)[sid] < m_docname.size())
+      return m_docname[(*m_sid2docid)[sid]];
+    else
+      return "";
+  }
+  // Raw, non-owning access to the sentence-to-document map; the result is
+  // whatever pointer m_sid2docid currently holds.
+  template<typename Token>
+  std::vector<id_type> const*
+  Bitext<Token>::sid2did() const
+  {
+    std::vector<id_type> const* const raw = m_sid2docid.get();
+    return raw;
+  }
+  // Return the document id of sentence /sid/, or -1 if no
+  // sentence-to-document map is loaded. The lookup uses at().
+  template<typename Token>
+  int
+  Bitext<Token>::
+  sid2did(uint32_t sid) const
+  {
+    if (!m_sid2docid)
+      return -1;
+    return m_sid2docid->at(sid);
+  }
+  // Read a sentence-level bias from the text file /fname/, which must
+  // contain exactly one float per sentence of T1 (whitespace-separated).
+  // Throws (via UTIL_THROW_IF2) if the number of values read differs from
+  // T1->size(). A file that cannot be opened yields zero values and is
+  // therefore reported through the same size check.
+  template<typename Token>
+  SPTR<SentenceBias>
+  Bitext<Token>::
+  loadSentenceBias(std::string const& fname) const
+  {
+    size_t const expected = T1->size();
+    SPTR<SentenceBias> ret(new SentenceBias(expected));
+    std::ifstream in(fname.c_str());
+    size_t i = 0;
+    float v;
+    while (in>>v)
+      {
+        // refuse surplus values before storing them, so that an over-long
+        // file can never index past the end of the bias vector
+        UTIL_THROW_IF2(i >= expected,
+                       "Mismatch between bias vector size and corpus size at "
+                       << HERE);
+        (*ret)[i++] = v;
+      }
+    UTIL_THROW_IF2(i != expected,
+                   "Mismatch between bias vector size and corpus size at "
+                   << HERE);
+    return ret;
+  }
+  // Render the phrase identified by /pid/ as a string of vocabulary items
+  // separated by single spaces. /pid/ is decomposed by parse_pid into
+  // sentence id, offset and length; track and vocabulary are T2/V2 if
+  // /isL2/ is non-zero and T1/V1 otherwise.
+  template<typename Token>
+  std::string
+  Bitext<Token>::
+  toString(uint64_t pid, int isL2) const
+  {
+    std::ostringstream buf;
+    uint32_t sid,off,len;
+    parse_pid(pid,sid,off,len);
+    Token const* const first = (isL2 ? T2 : T1)->sntStart(sid) + off;
+    Token const* const stop  = first + len;
+    TokenIndex const& V = isL2 ? *V2 : *V1;
+    for (Token const* cur = first; cur < stop; ++cur)
+      {
+        if (cur != first) buf << " ";
+        buf << V[cur->id()];
+      }
+    return buf.str();
+  }
+  // Return the current default sample size (m_default_sample_size).
+  template<typename Token>
+  size_t
+  Bitext<Token>::getDefaultSampleSize() const
+  {
+    return this->m_default_sample_size;
+  }
+  // Change the default sample size. If the value actually changes, both
+  // pstats caches are replaced by fresh, empty ones. The whole operation
+  // runs under an exclusive lock on m_lock.
+  template<typename Token>
+  void
+  Bitext<Token>::
+  setDefaultSampleSize(size_t const max_samples)
+  {
+    boost::unique_lock<boost::shared_mutex> guard(m_lock);
+    if (max_samples == m_default_sample_size)
+      return; // nothing to do
+    m_cache1.reset(new pstats::cache_t);
+    m_cache2.reset(new pstats::cache_t);
+    m_default_sample_size = max_samples;
+  }
+  // Construct an empty bitext: records the worker count and default sample
+  // size, sets the cache threshold to PSTATS_CACHE_THRESHOLD, and creates
+  // two empty pstats caches. Tracks, vocabularies and indices stay unset.
+  template<typename Token>
+  Bitext<Token>::
+  Bitext(size_t const max_sample, size_t const xnum_workers)
+    : m_num_workers(xnum_workers),
+      m_default_sample_size(max_sample),
+      m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD),
+      m_cache1(new pstats::cache_t),
+      m_cache2(new pstats::cache_t)
+  {
+  }
+  // Construct a bitext from existing components. Besides the settings made
+  // by the other constructor, the raw pointers to the tracks (t1, t2, tx),
+  // vocabularies (v1, v2) and indices (i1, i2) are stored in the
+  // corresponding shared-pointer members.
+  template<typename Token>
+  Bitext<Token>::
+  Bitext(Ttrack<Token>* const t1,
+         Ttrack<Token>* const t2,
+         Ttrack<char>*  const tx,
+         TokenIndex*    const v1,
+         TokenIndex*    const v2,
+         TSA<Token>* const i1,
+         TSA<Token>* const i2,
+         size_t const max_sample,
+         size_t const xnum_workers)
+    : m_num_workers(xnum_workers),
+      m_default_sample_size(max_sample),
+      m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD),
+      m_cache1(new pstats::cache_t),
+      m_cache2(new pstats::cache_t),
+      Tx(tx),
+      T1(t1),
+      T2(t2),
+      V1(v1),
+      V2(v2),
+      I1(i1),
+      I2(i2)
+  {
+  }
+  // Functor that adds a batch of sentences to an in-memory track and its
+  // index; only the specialization for L2R_Token<SimpleWordId> is declared
+  // here. The member functions are defined out of line.
+  template<typename TKN> class snt_adder;
+  template<>             class snt_adder<L2R_Token<SimpleWordId> >;
+  template<>
+  class snt_adder<L2R_Token<SimpleWordId> >
+  {
+    typedef L2R_Token<SimpleWordId> TKN;
+    // all members are references: the adder works on the caller's objects
+    std::vector<std::string> const & snt; // sentences to be added
+    TokenIndex           & V;             // vocabulary
+    SPTR<imTtrack<TKN> > & track;         // token track
+    SPTR<imTSA<TKN > >   & index;         // index over the track
+  public:
+    snt_adder(std::vector<std::string> const& s, TokenIndex& v,
+              SPTR<imTtrack<TKN> >& t, SPTR<imTSA<TKN> >& i);
+    void operator()();
+  };
+  // Convenience overload: unpack /rec/ and forward its fields to the
+  // full find_trg_phr_bounds; the outputs are written back into /rec/.
+  template<typename Token>
+  bool
+  Bitext<Token>::
+  find_trg_phr_bounds(PhraseExtractionRecord& rec) const
+  {
+    return this->find_trg_phr_bounds
+      (rec.sid,
+       rec.start, rec.stop,
+       rec.s1, rec.s2,
+       rec.e1, rec.e2,
+       rec.po_fwd, rec.po_bwd,
+       rec.aln, rec.full_aln,
+       rec.flip);
+  }
+  // For the source phrase [start,stop) in sentence /sid/, determine the
+  // target span it is aligned to and how far that span may be extended
+  // over unaligned words on either side.
+  //
+  // Outputs on success:
+  //   s2 = leftmost aligned target position, s1 = leftmost position reached
+  //        from s2 by moving left over non-forbidden positions (s1 <= s2);
+  //   e1 = one past the rightmost aligned target position, e2 = end reached
+  //        from e1 by moving right over non-forbidden positions (e1 <= e2);
+  //   *core_alignment (if non-NULL): pairs (source offset from start,
+  //        target offset from s2) for all links of the source phrase;
+  //   po_fwd/po_bwd: set via find_po_fwd/find_po_bwd, only when
+  //        core_alignment is non-NULL;
+  //   *full_alignment (if non-NULL): bit src*slen2+trg set for every link.
+  // Returns false if a target position inside the aligned span is linked
+  // to a source word outside the phrase, or if the phrase has no links.
+  // Throws (UTIL_THROW_IF2) if an alignment point lies outside the sentence.
+  template<typename Token>
+  bool
+  Bitext<Token>::
+  find_trg_phr_bounds
+  ( size_t const sid,    // sentence to investigate
+    size_t const start,  // start of source phrase
+    size_t const stop,   // end of source phrase (exclusive, see src >= stop)
+    size_t & s1, size_t & s2, // beginning and end of target start
+    size_t & e1, size_t & e2, // beginning and end of target end
+    int& po_fwd, int& po_bwd, // phrase orientations
+    std::vector<unsigned char> * core_alignment, // stores the core alignment
+    bitvector* full_alignment, // stores full word alignment for this sent.
+    bool const flip) const     // flip source and target (reverse lookup)
+  {
+    // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
+    // a word on the core_alignment (core_alignment):
+    //
+    // Since fringe words ([s1,...,s2),[e1,..,e2) if s1 < s2, or e1
+    // < e2, respectively) are by definition unaligned, we store
+    // only the core alignment in *aln. It is up to the calling
+    // function to shift alignment points over for start positions
+    // of extracted phrases that start with a fringe word
+    assert(T1);
+    assert(T2);
+    assert(Tx);
+    // slen1: length of the "source" side, slen2: length of the "target"
+    // side; which track is which depends on /flip/
+    size_t slen1,slen2;
+    if (flip)
+      {
+        slen1 = T2->sntLen(sid);
+        slen2 = T1->sntLen(sid);
+      }
+    else
+      {
+        slen1 = T1->sntLen(sid);
+        slen2 = T2->sntLen(sid);
+      }
+    // target positions linked to source words outside [start,stop)
+    bitvector forbidden(slen2);
+    if (full_alignment)
+      {
+        // grow generously (twice the need) so later calls resize less often
+        if (slen1*slen2 > full_alignment->size())
+          full_alignment->resize(slen1*slen2*2);
+        full_alignment->reset();
+      }
+    size_t src,trg;
+    // lft/rgt: leftmost/rightmost target position linked to the phrase;
+    // initialized to an "empty" range (lft > rgt) unless slen2 is 0.
+    // NOTE(review): with slen2 == 0 both start at 0, so the loop over
+    // [lft,rgt] below would test forbidden[0] on an empty bitvector --
+    // confirm that empty target sentences cannot occur here.
+    size_t lft = forbidden.size();
+    size_t rgt = 0;
+    std::vector<std::vector<ushort> > aln1(slen1),aln2(slen2);
+    // process word alignment for this sentence
+    char const* p = Tx->sntStart(sid);
+    char const* x = Tx->sntEnd(sid);
+    while (p < x)
+      {
+        // each alignment point is stored as two numbers; read them in
+        // swapped roles when /flip/ is set
+        if (flip)
+          {
+            p = binread(p,trg);
+            assert(p<x);
+            p = binread(p,src);
+          }
+        else
+          {
+            p = binread(p,src);
+            assert(p<x);
+            p = binread(p,trg);
+          }
+        UTIL_THROW_IF2((src >= slen1 || trg >= slen2),
+                       "Alignment range error at sentence " << sid << "!\n"
+                       << src << "/" << slen1 << " " << trg << "/" << slen2);
+        if (src < start || src >= stop)
+          forbidden.set(trg);
+        else
+          {
+            lft = std::min(lft,trg);
+            rgt = std::max(rgt,trg);
+          }
+        // per-word link lists are only needed for the core alignment and
+        // the phrase orientation computed below
+        if (core_alignment)
+          {
+            aln1[src].push_back(trg);
+            aln2[trg].push_back(src);
+          }
+        if (full_alignment)
+          full_alignment->set(src*slen2 + trg);
+      }
+    // the aligned target span must not contain forbidden positions
+    for (size_t i = lft; i <= rgt; ++i)
+      if (forbidden[i])
+        return false;
+    // extend over non-forbidden positions to the left and to the right
+    s2 = lft;   for (s1 = s2; s1 && !forbidden[s1-1]; --s1);
+    e1 = rgt+1; for (e2 = e1; e2 < forbidden.size() && !forbidden[e2]; ++e2);
+    // no link from the phrase at all; note that s1,s2,e1,e2 have already
+    // been overwritten at this point
+    if (lft > rgt) return false;
+    if (core_alignment)
+      {
+        core_alignment->clear();
+        for (size_t i = start; i < stop; ++i)
+          {
+            // this x (a target position) shadows the char const* x above
+            BOOST_FOREACH(ushort x, aln1[i])
+              {
+                core_alignment->push_back(i - start);
+                core_alignment->push_back(x - lft);
+              }
+          }
+        // now determine fwd and bwd phrase orientation
+        po_fwd = find_po_fwd(aln1,aln2,start,stop,s1,e2);
+        po_bwd = find_po_bwd(aln1,aln2,start,stop,s1,e2);
+      }
+    // always true here because of the check above
+    return lft <= rgt;
+  }
+  // Create a document-level sampling bias from the arguments /bserver/
+  // and /text/, which are passed through to the DocumentBias constructor
+  // together with the sentence-to-document map, the document name map
+  // and /log/. Throws if no sentence-to-document map has been loaded.
+  template<typename Token>
+  SPTR<DocumentBias>
+  Bitext<Token>::
+  SetupDocumentBias
+  ( std::string const& bserver, std::string const& text, std::ostream* log ) const
+  {
+    UTIL_THROW_IF2(m_sid2docid == NULL,
+                   "Document bias requested but no document map loaded.");
+    return SPTR<DocumentBias>
+      (new DocumentBias(*m_sid2docid, m_docname2docid, bserver, text, log));
+  }
+  // Create a document-level sampling bias from /context_weights/, which
+  // is passed through to the DocumentBias constructor together with the
+  // sentence-to-document map, the document name map and /log/.
+  // Throws if no sentence-to-document map has been loaded.
+  template<typename Token>
+  SPTR<DocumentBias>
+  Bitext<Token>::
+  SetupDocumentBias
+  ( std::map<std::string,float> context_weights, std::ostream* log ) const
+  {
+    UTIL_THROW_IF2(m_sid2docid == NULL,
+                   "Document bias requested but no document map loaded.");
+    return SPTR<DocumentBias>
+      (new DocumentBias(*m_sid2docid, m_docname2docid, context_weights, log));
+  }
+  // Schedule /phrase/ for sampling with the default sample size; the
+  // pstats handle returned by prep2 is discarded.
+  template<typename Token>
+  void
+  Bitext<Token>::prep(iter const& phrase) const
+  {
+    int const sample_size = m_default_sample_size;
+    this->prep2(phrase, sample_size);
+  }
+  // prep2 schedules a phrase for sampling and returns immediately.
+  // The member function lookup retrieves the respective pstats instance
+  // and waits until the sampling is finished before it returns.
+  // This allows sampling in the background.
+  //
+  // phrase:     the phrase to sample
+  // max_sample: sample size; a negative value selects the default
+  // Returns the pstats object of the (possibly cached) job; throws via
+  // UTIL_THROW_IF2 if the job could not be scheduled.
+  template<typename Token>
+  SPTR<pstats>
+  Bitext<Token>
+  ::prep2
+  (iter const& phrase, int max_sample) const
+  {
+    if (max_sample < 0) max_sample = m_default_sample_size;
+    SPTR<SamplingBias> bias; // stays empty: this overload samples unbiased
+    SPTR<pstats::cache_t> cache;
+    // - no caching for rare phrases and special requests (max_sample)
+    //   (still need to test what a good caching threshold is ...)
+    // - use the task-specific cache when there is a sampling bias
+    if (max_sample == int(m_default_sample_size)
+        && phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
+      {
+        cache = (phrase.root == I1.get() ? m_cache1 : m_cache2);
+      }
+    SPTR<pstats> ret;
+    SPTR<pstats> const* cached;
+    if (cache && (cached = cache->get(phrase.getPid(), ret)) && *cached)
+      return *cached;
+    boost::unique_lock<boost::shared_mutex> guard(m_lock);
+    // the agenda is created on first use
+    if (!ag)
+      {
+        ag.reset(new agenda(*this));
+        if (m_num_workers > 1)
+          ag->add_workers(m_num_workers);
+      }
+    // agenda::add_job takes five arguments and has no default for
+    // track_sids; without a translation task there is no request for
+    // sentence ids, so tracking is switched off here
+    ret = ag->add_job(this, phrase, max_sample, bias, false);
+    // check before caching so that a failed job never enters the cache
+    UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
+    if (cache) cache->set(phrase.getPid(),ret);
+    return ret;
+  }
+  // worker for scoring and sorting phrase table entries in parallel
+  //
+  // Converts the target-side entries of a pstats object into PhrasePair
+  // instances, drops pairs that fail a hard-coded 1/128 threshold, scores
+  // the rest with the given scorer (if any) and appends them to the
+  // destination list, which is finally sorted in descending order if a
+  // scorer was given.
+  template<typename Token>
+  class pstats2pplist
+  {
+    Ttrack<Token> const& m_other;  // track used to locate the target phrases
+    SPTR<pstats> m_pstats;         // statistics to be converted
+    std::vector<PhrasePair<Token> >& m_pplist; // destination list
+    typename PhrasePair<Token>::Scorer const* m_scorer; // may be NULL
+    PhrasePair<Token> m_pp;        // scratch phrase pair, reused per entry
+    Token const* m_token;          // first token of the source phrase
+    size_t m_len;                  // length of the source phrase
+    uint64_t m_pid1;               // phrase id of the source phrase
+    bool m_is_inverse;             // always false (see constructor)
+  public:
+    // CONSTRUCTOR
+    // m:      the source phrase
+    // other:  token track to resolve target phrases against
+    // ps:     the statistics collected for /m/
+    // dest:   list that receives the phrase pairs
+    // scorer: optional scorer; NULL disables scoring and sorting
+    pstats2pplist(typename TSA<Token>::tree_iterator const& m,
+                  Ttrack<Token> const& other,
+                  SPTR<pstats> const& ps,
+                  std::vector<PhrasePair<Token> >& dest,
+                  typename PhrasePair<Token>::Scorer const* scorer)
+      : m_other(other)
+      , m_pstats(ps)
+      , m_pplist(dest)
+      , m_scorer(scorer)
+      , m_token(m.getToken(0))
+      , m_len(m.size())
+      , m_pid1(m.getPid())
+      , m_is_inverse(false)
+    { }
+    // WORKER
+    void
+    operator()()
+    {
+      // wait till all statistics have been collected
+      boost::unique_lock<boost::mutex> lock(m_pstats->lock);
+      while (m_pstats->in_progress)
+        m_pstats->ready.wait(lock);
+      m_pp.init(m_pid1, m_is_inverse, m_token,m_len,m_pstats.get(),0);
+      // convert pstats entries to phrase pairs
+      pstats::trg_map_t::iterator a;
+      for (a = m_pstats->trg.begin(); a != m_pstats->trg.end(); ++a)
+        {
+          uint32_t sid,off,len;
+          parse_pid(a->first, sid, off, len);
+          m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second);
+          // std:: is spelled out so that this header does not depend on a
+          // using-directive leaking in from another header
+          m_pp.good2 = std::max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),
+                                m_pp.joint);
+          // Poor man's early pruning: if p(f|e) or p(e|f) < 1/128, don't
+          // even consider the phrase pair, as it is unlikely to ever be
+          // considered as a valid translation.
+          size_t J = m_pp.joint<<7; // hard coded threshold of 1/128
+          if (m_pp.good1 > J || m_pp.good2 > J) continue;
+          if (m_scorer)
+            {
+              (*m_scorer)(m_pp);
+            }
+          m_pplist.push_back(m_pp);
+        }
+      // sorting is only meaningful once the pairs have been scored
+      std::greater<PhrasePair<Token> > sorter;
+      if (m_scorer) std::sort(m_pplist.begin(), m_pplist.end(),sorter);
+    }
+  };
+  // Mark all occurrences of the phrase /m/ in the token range [start,end).
+  // /check/ is resized to end-start bits and cleared; afterwards bit k is
+  // set iff token start+k is covered by an occurrence of /m/. Tokens are
+  // compared by id and traversed with Token::next().
+  template<typename Token>
+  void
+  Bitext<Token>
+  ::mark_match(Token const* start, Token const* end,
+               iter const& m, bitvector& check) const
+  {
+    check.resize(end-start);
+    check.reset();
+    Token const* x = m.getToken(0);
+    for (Token const* s = start; s < end; ++s)
+      {
+        if (s->id() != x->id()) continue;
+        Token const* a = x;
+        Token const* b = s;
+        size_t i = 0;
+        // Stop as soon as the whole phrase has been matched (no further
+        // dereference), and never follow b outside [start,end): /check/
+        // has exactly end-start bits, so a match leaving the range could
+        // not be recorded anyway and must not count as a match.
+        while (i < m.size() && a && b
+               && start <= b && b < end
+               && a->id() == b->id())
+          {
+            ++i;
+            a = a->next();
+            b = b->next();
+          }
+        if (i == m.size())
+          {
+            // full match: mark every token of this occurrence
+            b = s;
+            while (i-- > 0) { check.set(b-start); b = b->next(); }
+          }
+      }
+  }
+  // Write the word alignment of sentence /sid/ to /out/ as a sequence of
+  // space-terminated items of the form "<positions1>:<positions2>:<label>".
+  // Linked words are collected into alignment groups; a group is labelled
+  // "infocusbi" if one of the links that formed it touches a word matched
+  // by /m1/ (side 1) or /m2/ (side 2), and "unspec" otherwise. Unaligned
+  // words that are matched by m1/m2 are reported as "infocusmono".
+  // m1 and m2 may be NULL (no phrase to highlight on that side).
+  template<typename Token>
+  void
+  Bitext<Token>::
+  write_yawat_alignment
+  ( id_type const sid, iter const* m1, iter const* m2, std::ostream& out ) const
+  {
+    // a1[i] / a2[j]: group id of word i on side 1 / word j on side 2,
+    // or -1 while the word is not part of any group
+    std::vector<int> a1(T1->sntLen(sid),-1), a2(T2->sntLen(sid),-1);
+    // f1 / f2: words covered by a match of *m1 / *m2
+    bitvector f1(a1.size()), f2(a2.size());
+    if (m1) mark_match(T1->sntStart(sid), T1->sntEnd(sid), *m1, f1);
+    if (m2) mark_match(T2->sntStart(sid), T2->sntEnd(sid), *m2, f2);
+    // agroups[g]: member positions of group g on side 1 (.first) and on
+    // side 2 (.second); grouplabel[g]: its label
+    std::vector<std::pair<bitvector, bitvector> > agroups;
+    std::vector<std::string> grouplabel;
+    // scratch pair for creating new groups (this local hides the member ag)
+    std::pair<bitvector, bitvector> ag;
+    ag.first.resize(a1.size());
+    ag.second.resize(a2.size());
+    char const* x = Tx->sntStart(sid);
+    size_t a, b;
+    // read the alignment points (a,b) one by one
+    while (x < Tx->sntEnd(sid))
+      {
+        x = binread(x,a);
+        x = binread(x,b);
+        if (a1.at(a) < 0 && a2.at(b) < 0)
+          {
+            // neither word is grouped yet: open a new group
+            a1[a] = a2[b] = agroups.size();
+            ag.first.reset();
+            ag.second.reset();
+            ag.first.set(a);
+            ag.second.set(b);
+            agroups.push_back(ag);
+            grouplabel.push_back(f1[a] || f2[b] ? "infocusbi" : "unspec");
+          }
+        else if (a1.at(a) < 0)
+          {
+            // b is grouped already: add a to b's group
+            a1[a] = a2[b];
+            agroups[a2[b]].first.set(a);
+            if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
+          }
+        else if (a2.at(b) < 0)
+          {
+            // a is grouped already: add b to a's group
+            a2[b] = a1[a];
+            agroups[a1[a]].second.set(b);
+            if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
+          }
+        else
+          {
+            // both are grouped: fold b's group into a's group.
+            // NOTE(review): only a2[b] is redirected; the other members of
+            // b's former group keep the old group id, and the old group's
+            // bit vectors and label are left untouched -- confirm that
+            // this cannot lead to duplicated or stale output.
+            agroups[a1[a]].first  |= agroups[a2[b]].first;
+            agroups[a1[a]].second |= agroups[a2[b]].second;
+            a2[b] = a1[a];
+            if (f1[a] || f2[b]) grouplabel[a1[a]] = "infocusbi";
+          }
+      }
+    for (a = 0; a < a1.size(); ++a)
+      {
+        if (a1[a] < 0)
+          {
+            // unaligned word on side 1: report only if it is in focus
+            if (f1[a]) out << a << "::" << "infocusmono ";
+            continue;
+          }
+        bitvector const& A = agroups[a1[a]].first;
+        bitvector const& B = agroups[a1[a]].second;
+        // print each group only once, at its first member on side 1
+        if (A.find_first() < a) continue;
+        write_bitvector(A,out); out << ":";
+        write_bitvector(B,out); out << ":";
+        out << grouplabel[a1[a]] << " ";
+      }
+    for (b = 0; b < a2.size(); ++b)
+      {
+        // unaligned, in-focus word on side 2.
+        // NOTE(review): unlike the side-1 case above, the position b is
+        // not written here -- confirm whether ":" << b << ":" was intended.
+        if (a2[b] < 0 && f2[b])
+          out <<  "::" << "infocusmono ";
+      }
+  }
+  // Turn every target-side entry of /ps/ into a PhrasePair and append it
+  // to /dest/. /m/ is the phrase that /ps/ belongs to; if m is rooted in
+  // bt.I1 the target phrases are located in bt.T2 (and the pairs are
+  // initialized as non-inverse), otherwise in bt.T1. /log/ is not used.
+  template<typename Token>
+  void
+  expand(typename Bitext<Token>::iter const& m,
+         Bitext<Token> const& bt, pstats const& ps,
+         std::vector<PhrasePair<Token> >& dest, std::ostream* log)
+  {
+    bool const forward = (m.root == bt.I1.get());
+    dest.reserve(ps.trg.size());
+    PhrasePair<Token> pair;
+    pair.init(m.getPid(), !forward, m.getToken(0), m.size(), &ps, 0);
+    // cout << HERE << " "
+    // << toString(*(fwd ? bt.V1 : bt.V2), pp.start1,pp.len1) << std::endl;
+    typedef pstats::trg_map_t::const_iterator trg_iter;
+    for (trg_iter t = ps.trg.begin(); t != ps.trg.end(); ++t)
+      {
+        uint32_t sid,off,len;
+        parse_pid(t->first, sid, off, len);
+        Token const* const phrase_start
+          = (forward ? bt.T2 : bt.T1)->sntStart(sid) + off;
+        pair.update(t->first, phrase_start, len, t->second);
+        dest.push_back(pair);
+      }
+  }
+} // end of namespace sapt
+#include "ug_im_bitext.h"
+#include "ug_mm_bitext.h"
+#include "ug_bitext_moses.h"

mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_agenda.h ADDED Viewed

	@@ -0,0 +1,188 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+// to be included from ug_bitext.h
+// The agenda handles parallel sampling.
+// It maintains a queue of unfinished sampling jobs and
+// assigns them to a pool of workers.
+//
+// Scheduler for sampling jobs: keeps a list of unfinished jobs and a pool
+// of worker threads; the workers obtain their work through get_job().
+template<typename Token>
+class Bitext<Token>
+::agenda
+{
+public:
+  class job;
+  class worker;
+private:
+  boost::mutex lock;             // guards the members below
+  std::list<SPTR<job> > joblist; // unfinished jobs, in order of arrival
+  std::vector<SPTR<boost::thread> > workers; // the worker threads
+  bool shutdown;                 // set by the destructor
+  size_t doomed;                 // number of workers that are to quit
+public:
+  Bitext<Token>   const& bt;     // the bitext this agenda belongs to
+  agenda(Bitext<Token> const& bitext);
+  // sets the shutdown flag and joins all worker threads
+  ~agenda();
+  // adjust the size of the worker pool by n
+  void
+  add_workers(int n);
+  // queue a new sampling job and return the pstats object that will
+  // receive its results
+  SPTR<pstats>
+  add_job(Bitext<Token> const* const theBitext,
+	  typename TSA<Token>::tree_iterator const& phrase,
+	  size_t const max_samples, SPTR<SamplingBias const> const& bias,
+    bool const track_sids);
+    // older signature, kept for reference:
+    // add_job(Bitext<Token> const* const theBitext,
+    // 	  typename TSA<Token>::tree_iterator const& phrase,
+    // 	  size_t const max_samples, SamplingBias const* const bias);
+  // hand out the next job to work on; an empty pointer tells the calling
+  // worker that there is nothing to do for it (shutdown, doomed worker,
+  // or empty job list)
+  SPTR<job>
+  get_job();
+};
+// Callable that is run in each worker thread of an agenda; it only keeps
+// a reference to the agenda it serves. operator()() is defined elsewhere
+// (presumably in ug_bitext_agenda_worker.h, which is included below).
+template<typename Token>
+class Bitext<Token>::agenda::worker
+{
+  agenda& ag; // the agenda this worker belongs to
+public:
+  worker(agenda& a)
+    : ag(a)
+  { }
+  void operator()();
+};
+#include "ug_bitext_agenda_worker.h"
+#include "ug_bitext_agenda_job.h"
+// Change the size of the worker pool by /n/ (which may be negative).
+// The new target size is at least 1. Threads that have already finished
+// are removed from the pool first; if the pool is then still larger than
+// the target, the surplus is recorded in /doomed/ so that get_job() can
+// tell that many workers to quit; if it is smaller, new threads are
+// started. Runs under the agenda lock.
+template<typename Token>
+void Bitext<Token>
+::agenda
+::add_workers(int n)
+{
+  static boost::posix_time::time_duration nodelay(0,0,0,0);
+  boost::lock_guard<boost::mutex> guard(this->lock);
+  // workers already doomed do not count towards the current pool size;
+  // note that the target is computed before finished threads are removed
+  int target  = std::max(1, int(n + workers.size() - this->doomed));
+  // house keeping: remove all workers that have finished
+  for (size_t i = 0; i < workers.size(); )
+    {
+      // timed_join with a zero duration does not wait; a true result is
+      // taken to mean that the thread has finished
+      if (workers[i]->timed_join(nodelay))
+        {
+          // unordered removal: move the last entry into slot i
+          if (i + 1 < workers.size())
+            workers[i].swap(workers.back());
+          workers.pop_back();
+        }
+      else ++i;
+    }
+  // cerr << workers.size() << "/" << target << " active" << std::endl;
+  if (int(workers.size()) > target)
+    this->doomed = workers.size() - target;
+  else
+    while (int(workers.size()) < target)
+      {
+        SPTR<boost::thread> w(new boost::thread(worker(*this)));
+        workers.push_back(w);
+      }
+}
+// Create a sampling job for /phrase/ and append it to the job list.
+// The job samples from bt.I1 if the phrase is rooted there and from bt.I2
+// otherwise. If the new job is the only one in the list, worker threads
+// that have already finished are either dropped (while /doomed/ is
+// non-zero) or replaced by fresh threads. Runs under the agenda lock.
+// Returns the pstats object of the new job.
+template<typename Token>
+SPTR<pstats> Bitext<Token>
+::agenda
+::add_job(Bitext<Token> const* const theBitext,
+	  typename TSA<Token>::tree_iterator const& phrase,
+	  size_t const max_samples, SPTR<SamplingBias const> const& bias,
+	  bool const track_sids)
+{
+  boost::unique_lock<boost::mutex> lk(this->lock);
+  static boost::posix_time::time_duration nodelay(0,0,0,0);
+  bool fwd = phrase.root == bt.I1.get();
+  SPTR<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
+		      max_samples, fwd, bias, track_sids));
+  // the matching release() is issued by get_job() once the job is done
+  j->stats->register_worker();
+  joblist.push_back(j);
+  // the list was empty before this job arrived, so workers may have run
+  // out of work and terminated in the meantime
+  if (joblist.size() == 1)
+    {
+      size_t i = 0;
+      while (i < workers.size())
+	{
+	  // zero-duration timed_join: true is taken to mean "has finished"
+	  if (workers[i]->timed_join(nodelay))
+	    {
+	      if (doomed)
+		{
+		  // the pool is to shrink: drop the finished thread
+		  if (i+1 < workers.size())
+		    workers[i].swap(workers.back());
+		  workers.pop_back();
+		  --doomed;
+		}
+	      else
+		// otherwise put a fresh thread into the same slot
+		workers[i++] = SPTR<boost::thread>(new boost::thread(worker(*this)));
+	    }
+	  else ++i;
+	}
+    }
+  return j->stats;
+}
+// Hands out the next job to work on.
+//
+// Returns a null pointer when
+// - the agenda is shutting down,
+// - the number of workers has been reduced (/doomed/ is non-zero); a null
+//   return is how a redundant worker is told to quit, or
+// - the job queue is empty.
+// Otherwise returns a job and increments that job's worker count. Jobs that
+// report done() are released and removed from the queue along the way.
+template<typename Token>
+SPTR<typename Bitext<Token>::agenda::job>
+Bitext<Token>
+::agenda
+::get_job()
+{
+  // cerr << workers.size() << " workers on record" << std::endl;
+  SPTR<job> ret;
+  // Acquire the agenda mutex before looking at /shutdown/: the destructor
+  // sets the flag while holding this mutex, so reading it unlocked would be
+  // a data race.
+  boost::unique_lock<boost::mutex> guard(this->lock);
+  if (this->shutdown) return ret;
+  if (this->doomed)
+    { // the number of workers has been reduced, tell the redundant ones to quit
+      --this->doomed;
+      return ret;
+    }
+  typename std::list<SPTR<job> >::iterator j = joblist.begin();
+  while (j != joblist.end())
+    {
+      if ((*j)->done())
+        {
+          (*j)->stats->release();
+          joblist.erase(j++); // post-increment keeps j valid across the erase
+        }
+      else if ((*j)->workers >= 4) ++j; // no more than 4 workers per job
+      else break; // found one
+    }
+  if (!joblist.empty())
+    {
+      // if we've reached the end of the queue (all jobs have 4 workers on them),
+      // take the first in the queue
+      ret = j == joblist.end() ? joblist.front() : *j;
+      boost::lock_guard<boost::mutex> jguard(ret->lock);
+      ++ret->workers;
+    }
+  return ret;
+}
+// Raises the shutdown flag while holding the agenda mutex, then waits for
+// every thread in /workers/ to finish.
+template<typename Token>
+Bitext<Token>::agenda::~agenda()
+{
+  { // the mutex is held only while the flag is set, not while joining
+    boost::lock_guard<boost::mutex> guard(this->lock);
+    this->shutdown = true;
+  }
+  size_t k = 0;
+  while (k < workers.size())
+    {
+      workers[k]->join();
+      ++k;
+    }
+}
+// Sets up an agenda for /thebitext/: not shutting down, no workers marked
+// for removal. The constructor itself starts no threads.
+template<typename Token>
+Bitext<Token>::agenda::agenda(Bitext<Token> const& thebitext)
+  : shutdown(false)
+  , doomed(0)
+  , bt(thebitext)
+{ }

mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_jstats.h ADDED Viewed

	@@ -0,0 +1,58 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+#pragma once
+#include <string>
+#include <stdint.h>
+#include "ug_typedefs.h"
+#include "ug_lexical_reordering.h"
+#include <boost/thread.hpp>
+namespace sapt
+{
+  // "joint" (i.e., phrase pair) statistics
+  class
+  jstats
+  {
+    boost::mutex lock; // NOTE(review): presumably serializes updates to the members below; its use is not visible in this header
+    uint32_t my_rcnt; // unweighted joint count
+    uint32_t my_cnt2; // raw counts L2
+    float    my_wcnt; // weighted joint count
+    float    my_bcnt; // cumulative bias
+    // to do: use a static alignment pattern store that stores each pattern only
+    // once, so that we don't have to store so many alignment vectors
+    std::vector<std::pair<size_t, std::vector<unsigned char> > > my_aln;
+    // internal word alignment
+    uint32_t ofwd[LRModel::NONE+1]; //  forward distortion type counts
+    uint32_t obwd[LRModel::NONE+1]; // backward distortion type counts
+  public:
+    SPTR<std::vector<uint32_t> > sids; // list of sentence ids in this sample
+    std::map<uint32_t,uint32_t> indoc; // NOTE(review): presumably document id -> number of samples from that document (cf. the commented-out vector below) - confirm
+    // std::vector<uint32_t> indoc; // counts origin of samples (for biased sampling)
+    jstats(); // default constructor (defined outside this header)
+    jstats(jstats const& other); // copy constructor (defined outside this header)
+    uint32_t rcnt() const; // raw joint counts
+    uint32_t cnt2() const; // raw target phrase occurrence count
+    float    wcnt() const; // weighted joint counts
+    float    bcnt() const; // cumulative bias scores
+    std::vector<std::pair<size_t, std::vector<unsigned char> > > const & aln() const; // read-only access to the stored alignments (presumably my_aln)
+    size_t
+    add(float w, float b, std::vector<unsigned char> const& a, uint32_t const cnt2,
+	uint32_t fwd_orient, uint32_t bwd_orient, int const docid, uint32_t const sid,
+	bool const track_sid); // NOTE(review): presumably records one sampled occurrence; the meaning of the returned size_t is defined in the implementation - confirm
+    void invalidate(); // NOTE(review): invalidate/validate/valid manage a validity state whose representation is not visible in this header
+    void validate();
+    bool valid();
+    uint32_t dcnt_fwd(PhraseOrientation const idx) const; // presumably the forward distortion count for idx (cf. ofwd) - confirm
+    uint32_t dcnt_bwd(PhraseOrientation const idx) const; // presumably the backward distortion count for idx (cf. obwd) - confirm
+    void fill_lr_vec(LRModel::Direction const& dir,
+                     LRModel::ModelType const& mdl,
+                     std::vector<float>& v); // NOTE(review): presumably fills v with lexical reordering values for dir/mdl; layout is defined in the implementation
+  };
+}

mosesdecoder/moses/TranslationModel/UG/mm/ug_bitext_moses.h ADDED Viewed

	@@ -0,0 +1,87 @@

+// -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; cc-style: moses-cc-style -*-
+#pragma once
+#ifndef NO_MOSES
+namespace sapt {
+// Returns the sampling statistics for /phrase/, waiting until sampling has
+// finished.
+//
+// The job is scheduled via prep2(). With at most one worker configured
+// (m_num_workers <= 1) the sampling is run right here in the calling thread;
+// otherwise the call blocks on the job's condition variable until a
+// background worker has completed it.
+//
+// Throws (UTIL_THROW_IF2) if prep2() hands back a null pointer.
+template<typename Token>
+SPTR<pstats>
+Bitext<Token>::
+lookup(ttasksptr const& ttask, iter const& phrase, int max_sample) const
+{
+  // The third parameter of prep2() is /bool track_sids/. Calling it with
+  // (ttask, phrase, max_sample) converted max_sample into that flag and never
+  // forwarded the requested sample size. Pass both explicitly; track_sids is
+  // true because that is what every non-zero max_sample converted to before.
+  // NOTE(review): confirm that no caller relies on lookup(.., 0) switching
+  // sentence-id tracking off.
+  SPTR<pstats> ret = prep2(ttask, phrase, true, max_sample);
+  UTIL_THROW_IF2(!ret, "Got NULL pointer where I expected a valid pointer.");
+  // Why were we locking here?
+  if (m_num_workers <= 1)
+    {
+      // no background threads: process the agenda in this thread
+      boost::unique_lock<boost::shared_mutex> guard(m_lock);
+      typename agenda::worker(*this->ag)();
+    }
+  else
+    {
+      // wait until a worker signals that the statistics are complete
+      boost::unique_lock<boost::mutex> lock(ret->lock);
+      while (ret->in_progress)
+        ret->ready.wait(lock);
+    }
+  return ret;
+}
+// Schedules /phrase/ for sampling with the default sample size
+// (m_default_sample_size). The statistics handle returned by prep2() is
+// not kept.
+template<typename Token>
+void
+Bitext<Token>::prep(ttasksptr const& ttask, iter const& phrase,
+                    bool const track_sids) const
+{
+  prep2(ttask, phrase, track_sids, m_default_sample_size);
+}
+// prep2 puts a phrase on the sampling agenda and returns without waiting
+// for the sampling to complete; lookup() is the member function that
+// fetches the corresponding pstats instance and blocks until it is ready.
+// This is what makes sampling in the background possible.
+// A negative /max_sample/ stands for m_default_sample_size.
+template<typename Token>
+SPTR<pstats>
+Bitext<Token>::
+prep2(ttasksptr const& ttask, iter const& phrase, bool const track_sids,
+      int max_sample) const
+{
+  if (max_sample < 0)
+    max_sample = m_default_sample_size;
+  // a query-specific context, if present in the task's scope, supplies the bias
+  SPTR<SamplingBias> bias;
+  SPTR<Moses::ContextScope> scope = ttask->GetScope();
+  SPTR<ContextForQuery> context = scope->get<ContextForQuery>(this);
+  if (context)
+    bias = context->bias;
+  // Caching policy:
+  // - no caching for rare phrases and special requests (max_sample)
+  //   (still need to test what a good caching threshold is ...)
+  // - use the task-specific cache when there is a sampling bias
+  SPTR<pstats::cache_t> cache;
+  if (max_sample == int(m_default_sample_size)
+      && phrase.approxOccurrenceCount() > m_pstats_cache_threshold)
+    {
+      if (phrase.root == I1.get())
+        cache = bias ? context->cache1 : m_cache1;
+      else
+        cache = bias ? context->cache2 : m_cache2;
+    }
+  SPTR<pstats> ret;
+  if (cache)
+    {
+      // a non-null cache entry means the phrase has been scheduled before
+      SPTR<pstats> const* cached = cache->get(phrase.getPid(), ret);
+      if (cached && *cached)
+        return *cached;
+    }
+  boost::unique_lock<boost::shared_mutex> guard(m_lock);
+  if (!ag)
+    { // first request: create the agenda, with worker threads if configured
+      ag.reset(new agenda(*this));
+      if (m_num_workers > 1)
+        ag->add_workers(m_num_workers);
+    }
+  ret = ag->add_job(this, phrase, max_sample, bias, track_sids);
+  if (cache)
+    cache->set(phrase.getPid(),ret);
+  UTIL_THROW_IF2(ret == NULL, "Couldn't schedule sampling job.");
+  return ret;
+}
+}
+#endif