sleepyhead111 committed on Apr 20, 2025

Commit

76efa37

verified ·

1 Parent(s): 1747e32

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp +94 -0
mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h +105 -0
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp +195 -0
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h +94 -0
mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h +143 -0
mosesdecoder/moses/TranslationModel/CompactPT/ListCoders.h +387 -0
mosesdecoder/moses/TranslationModel/CompactPT/PackedArray.h +187 -0
mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp +194 -0
mosesdecoder/moses/TranslationModel/CompactPT/StringVectorTemp.h +430 -0
mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h +163 -0
mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp +30 -0
mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.h +31 -0
mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.cpp +65 -0
mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.cpp +33 -0
mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.cpp +260 -0
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp +417 -0
mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp +398 -0
mosesdecoder/moses/TranslationModel/RuleTable/Trie.h +63 -0
mosesdecoder/moses/TranslationModel/RuleTable/UTrie.cpp +96 -0
mosesdecoder/moses/TranslationModel/RuleTable/UTrie.h +73 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/Alignments.h +20 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp +1029 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h +91 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/Match.h +34 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.cpp +25 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.h +43 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.cpp +240 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.h +69 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.cpp +71 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.h +46 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.cpp +387 -0
mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.h +5 -0
mosesdecoder/moses/server/Hypothesis_4server.cpp +37 -0
mosesdecoder/moses/server/Optimizer.h +17 -0
mosesdecoder/moses/server/PackScores.cpp +45 -0
mosesdecoder/moses/server/PackScores.h +10 -0
mosesdecoder/moses/server/Server.h +46 -0
mosesdecoder/moses/server/Session.h +75 -0
mosesdecoder/moses/server/TranslationRequest.cpp +524 -0
mosesdecoder/moses/server/Updater.cpp +58 -0
mosesdecoder/moses/server/Updater.h +44 -0
mosesdecoder/util/bit_packing_test.cc +59 -0
mosesdecoder/util/ersatz_progress.hh +57 -0
mosesdecoder/util/exception.hh +165 -0
mosesdecoder/util/fake_ostream.hh +111 -0
mosesdecoder/util/file_piece.hh +175 -0
mosesdecoder/util/file_piece_test.cc +154 -0
mosesdecoder/util/generator.hh +34 -0
mosesdecoder/util/getopt.c +78 -0
mosesdecoder/util/integer_to_string_test.cc +81 -0

mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp ADDED Viewed

	@@ -0,0 +1,94 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifdef HAVE_CMPH
+#include "CmphStringVectorAdapter.h"
+namespace Moses
+{
+// cmph "dispose" callback installed by CmphStringVectorAdapter(): releases a
+// key buffer that the read callback allocated with new[].
+//   data   - adapter state (not needed to free the key)
+//   key    - buffer to release
+//   keylen - length of the key (not needed to free the key)
+void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
+{
+  // Only the buffer matters; the other arguments exist for the callback signature.
+  (void)data;
+  (void)keylen;
+  delete[] key;
+}
+// cmph "rewind" callback installed by CmphStringVectorAdapter(): moves the
+// cursor stored in `data` (a cmph_vector_t) back to the first key.
+void CmphStringVectorAdapterRewind(void *data)
+{
+  static_cast<cmph_vector_t *>(data)->position = 0;
+}
+//************************************************************************//
+// Allocates a cmph key source whose state points at `v`.
+// Both the adapter and its cursor are obtained with malloc(). Only `data` and
+// `nkeys` are filled in here; the callbacks are left for the caller to set
+// (CmphVectorAdapter() does that).
+// `v` is referenced, not copied, so it must outlive the returned adapter.
+cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v)
+{
+  cmph_io_adapter_t *adapter =
+    static_cast<cmph_io_adapter_t *>(malloc(sizeof(cmph_io_adapter_t)));
+  cmph_vector_t *cursor =
+    static_cast<cmph_vector_t *>(malloc(sizeof(cmph_vector_t)));
+  assert(adapter);
+  assert(cursor);
+
+  // The cursor starts at the first key of the caller's vector.
+  cursor->position = 0;
+  cursor->vector = static_cast<void *>(&v);
+
+  adapter->nkeys = v.size();
+  adapter->data = static_cast<void *>(cursor);
+  return adapter;
+}
+// cmph "read" callback for std::vector<std::string> key sources.
+// Copies the key at the current cursor position into a fresh new[] buffer
+// (released later by the dispose callback), stores its length in *keylen,
+// advances the cursor and returns the key length.
+//   data   - cmph_vector_t cursor created by CmphVectorAdapterNew()
+//   key    - receives the newly allocated, NUL-terminated copy of the key
+//   keylen - receives the number of key bytes (terminator not counted)
+int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
+{
+  cmph_vector_t *cmph_vector = static_cast<cmph_vector_t *>(data);
+  const std::vector<std::string>* v =
+    static_cast<const std::vector<std::string>*>(cmph_vector->vector);
+  // Reading past the last key is a caller bug; catch it in debug builds.
+  assert(cmph_vector->position < v->size());
+  // Bind by reference: no temporary copy of the key is needed.
+  const std::string& source = (*v)[cmph_vector->position];
+  const size_t size = source.size();
+  *keylen = static_cast<cmph_uint32>(size);
+  *key = new char[size + 1];
+  // memcpy (not strcpy) so that keys containing '\0' are copied in full and
+  // every one of the *keylen bytes reported to cmph is initialised.
+  std::memcpy(*key, source.data(), size);
+  (*key)[size] = '\0';
+  cmph_vector->position = cmph_vector->position + 1;
+  return (int)(*keylen);
+}
+// cmph "dispose" callback installed by CmphVectorAdapter(): releases a key
+// buffer that CmphVectorAdapterRead() allocated with new[].
+//   data   - adapter state (not needed to free the key)
+//   key    - buffer to release
+//   keylen - length of the key (not needed to free the key)
+void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen)
+{
+  // Only the buffer matters; the other arguments exist for the callback signature.
+  (void)data;
+  (void)keylen;
+  delete[] key;
+}
+// cmph "rewind" callback installed by CmphVectorAdapter(): moves the cursor
+// stored in `data` (a cmph_vector_t) back to the first key.
+void CmphVectorAdapterRewind(void *data)
+{
+  static_cast<cmph_vector_t *>(data)->position = 0;
+}
+// Builds a complete cmph key source over `v`: allocates the adapter with
+// CmphVectorAdapterNew() and installs the read / dispose / rewind callbacks
+// defined in this file. `v` must outlive the returned adapter.
+cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v)
+{
+  cmph_io_adapter_t * const adapter = CmphVectorAdapterNew(v);
+  adapter->rewind = CmphVectorAdapterRewind;
+  adapter->dispose = CmphVectorAdapterDispose;
+  adapter->read = CmphVectorAdapterRead;
+  return adapter;
+}
+}
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h ADDED Viewed

	@@ -0,0 +1,105 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_CmphStringVectorAdapterNew_h
+#define moses_CmphStringVectorAdapterNew_h
+#include <cassert>
+#include <cstring>
+#ifdef HAVE_CMPH
+#include "cmph.h"
+#include "StringVector.h"
+namespace Moses
+{
+// Cursor over a key container, stored in cmph_io_adapter_t::data by the
+// adapter factories in this header.
+struct cmph_vector_t {
+  // Type-erased pointer to the key container; the read callbacks cast it back.
+  void *vector;
+  // Index of the next key a read callback will hand out.
+  cmph_uint32 position;
+};
+// Allocates a cmph key source whose state points at the StringVector `sv`.
+// Both the adapter and its cursor are obtained with malloc(). Only `data` and
+// `nkeys` are filled in here; the callbacks are installed by
+// CmphStringVectorAdapter(). `sv` is referenced, not copied.
+template <typename ValueT, typename PosT, template <typename> class Allocator>
+cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector<ValueT, PosT, Allocator>& sv)
+{
+  cmph_io_adapter_t *adapter =
+    static_cast<cmph_io_adapter_t *>(malloc(sizeof(cmph_io_adapter_t)));
+  cmph_vector_t *cursor =
+    static_cast<cmph_vector_t *>(malloc(sizeof(cmph_vector_t)));
+  assert(adapter);
+  assert(cursor);
+
+  // The cursor starts at the first key of the caller's container.
+  cursor->position = 0;
+  cursor->vector = static_cast<void *>(&sv);
+
+  adapter->nkeys = sv.size();
+  adapter->data = static_cast<void *>(cursor);
+  return adapter;
+}
+// cmph "read" callback for StringVector key sources.
+// Copies the key at the current cursor position into a fresh new[] buffer
+// (released later by CmphStringVectorAdapterDispose), stores its length in
+// *keylen, advances the cursor and returns the key length.
+//   data   - cmph_vector_t cursor created by CmphStringVectorAdapterNew()
+//   key    - receives the newly allocated, NUL-terminated copy of the key
+//   keylen - receives the number of key bytes (terminator not counted)
+template <typename ValueT, typename PosT, template <typename> class Allocator>
+int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen)
+{
+  cmph_vector_t *cmph_vector = (cmph_vector_t *)data;
+  StringVector<ValueT, PosT, Allocator>* sv = (StringVector<ValueT, PosT, Allocator>*)cmph_vector->vector;
+  // Materialise the key once; the length is taken from that same string so
+  // the buffer size and the reported length can never disagree.
+  const std::string temp = (*sv)[cmph_vector->position];
+  const size_t size = temp.size();
+  *keylen = static_cast<cmph_uint32>(size);
+  *key = new char[size + 1];
+  // memcpy (not strcpy) so that keys containing '\0' are copied in full and
+  // every one of the *keylen bytes reported to cmph is initialised.
+  std::memcpy(*key, temp.data(), size);
+  (*key)[size] = '\0';
+  cmph_vector->position = cmph_vector->position + 1;
+  return (int)(*keylen);
+}
+void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); // dispose callback: frees `key` with delete[] (defined in CmphStringVectorAdapter.cpp)
+void CmphStringVectorAdapterRewind(void *data); // rewind callback: resets the cmph_vector_t cursor in `data` to position 0
+// Builds a complete cmph key source over the StringVector `sv`: allocates the
+// adapter with CmphStringVectorAdapterNew() and installs the matching
+// read / dispose / rewind callbacks. `sv` must outlive the returned adapter.
+template <typename ValueT, typename PosT, template <typename> class Allocator>
+cmph_io_adapter_t* CmphStringVectorAdapter(StringVector<ValueT, PosT, Allocator>& sv)
+{
+  cmph_io_adapter_t * const adapter = CmphStringVectorAdapterNew(sv);
+  adapter->rewind = CmphStringVectorAdapterRewind;
+  adapter->dispose = CmphStringVectorAdapterDispose;
+  adapter->read = CmphStringVectorAdapterRead<ValueT, PosT, Allocator>;
+  return adapter;
+}
+//************************************************************************//
+cmph_io_adapter_t *CmphVectorAdapterNew(std::vector<std::string>& v); // mallocs an adapter + cursor over `v`; callbacks are not set yet
+int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen); // read callback: copies the next key into a new[] buffer, returns its length
+void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); // dispose callback: frees `key` with delete[]
+void CmphVectorAdapterRewind(void *data); // rewind callback: resets the cursor in `data` to position 0
+cmph_io_adapter_t* CmphVectorAdapter(std::vector<std::string>& v); // CmphVectorAdapterNew() plus the three callbacks above
+}
+#endif
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp ADDED Viewed

	@@ -0,0 +1,195 @@

+// -*- c++ -*-
+// vim:tabstop=2
+// $Id$
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include "LexicalReorderingTableCompact.h"
+#include "moses/parameters/OOVHandlingOptions.h"
+namespace Moses
+{
+bool LexicalReorderingTableCompact::s_inMemoryByDefault = false; // copied into m_inMemory by the constructors; changed via SetStaticDefaultParameters()
+// Constructs the table with the built-in defaults and immediately loads the
+// compact file at `filePath`.
+//   filePath  - compact reordering table to read (passed to Load)
+//   f_factors, e_factors, c_factors - factor lists forwarded to the base class
+LexicalReorderingTableCompact::LexicalReorderingTableCompact(
+  const std::string& filePath,
+  const std::vector<FactorType>& f_factors,
+  const std::vector<FactorType>& e_factors,
+  const std::vector<FactorType>& c_factors)
+  : LexicalReorderingTable(f_factors, e_factors, c_factors),
+    m_inMemory(s_inMemoryByDefault),  // storage mode follows the static default
+    m_numScoreComponent(6),           // placeholder; Load() reads the real value
+    m_multipleScoreTrees(true),       // placeholder; Load() reads the real value
+    m_hash(10, 16),                   // NOTE(review): same numbers as the creator's orderBits/fingerPrintBits defaults - confirm
+    m_scoreTrees(1)
+{
+  Load(filePath);
+}
+// Constructs an empty table with the built-in defaults; nothing is read from
+// disk until Load() is called.
+//   f_factors, e_factors, c_factors - factor lists forwarded to the base class
+LexicalReorderingTableCompact::LexicalReorderingTableCompact(
+  const std::vector<FactorType>& f_factors,
+  const std::vector<FactorType>& e_factors,
+  const std::vector<FactorType>& c_factors)
+  : LexicalReorderingTable(f_factors, e_factors, c_factors),
+    m_inMemory(s_inMemoryByDefault),  // storage mode follows the static default
+    m_numScoreComponent(6),           // placeholder; Load() reads the real value
+    m_multipleScoreTrees(true),       // placeholder; Load() reads the real value
+    m_hash(10, 16),                   // NOTE(review): same numbers as the creator's orderBits/fingerPrintBits defaults - confirm
+    m_scoreTrees(1)
+{
+}
+// Releases every score tree owned by this table.
+LexicalReorderingTableCompact::~LexicalReorderingTableCompact()
+{
+  std::vector<ScoreTree*>::iterator tree = m_scoreTrees.begin();
+  for(; tree != m_scoreTrees.end(); ++tree) {
+    delete *tree;
+  }
+}
+// Looks up the reordering scores for source phrase `f`, target phrase `e`
+// and context `c`.
+// The context is backed off: the full context is tried first, then ever
+// shorter suffixes, and finally the empty context; the first key found in
+// the hash wins. Returns m_numScoreComponent decoded scores, or an empty
+// vector when no key matches.
+std::vector<float>
+LexicalReorderingTableCompact::
+GetScore(const Phrase& f, const Phrase& e, const Phrase& c)
+{
+  const size_t contextSize = c.GetSize();
+  // Without context factors the context never reaches the key (see MakeKey),
+  // so only the final, empty-context step is needed. This is the single
+  // lookup that was performed before the back-off was added.
+  size_t i = m_FactorsC.empty() ? contextSize : 0;
+  for(; i <= contextSize; ++i) {
+    std::string key;
+    if(0 == contextSize)
+      key = MakeKey(f, e, c);
+    else {
+      // i == contextSize yields an empty range, i.e. the empty context.
+      Phrase sub_c(c.GetSubString(Range(i, contextSize - 1)));
+      key = MakeKey(f, e, sub_c);
+    }
+
+    const size_t index = m_hash[key];
+    // m_hash.GetSize() is the "not found" marker: back off to a shorter context.
+    if(m_hash.GetSize() == index)
+      continue;
+
+    std::string scoresString;
+    if(m_inMemory)
+      scoresString = m_scoresMemory[index].str();
+    else
+      scoresString = m_scoresMapped[index].str();
+
+    // Decode one value per score component from the stored bit stream.
+    Scores scores;
+    BitWrapper<> bitStream(scoresString);
+    for(size_t s = 0; s < m_numScoreComponent; s++)
+      scores.push_back(m_scoreTrees[m_multipleScoreTrees ? s : 0]->Read(bitStream));
+    return scores;
+  }
+  return Scores();
+}
+// Builds the lookup key for three phrases: renders each phrase with its
+// factor list, trims the result and delegates to the string overload.
+std::string
+LexicalReorderingTableCompact::
+MakeKey(const Phrase& f,
+        const Phrase& e,
+        const Phrase& c) const
+{
+  const std::string source  = Trim(f.GetStringRep(m_FactorsF));
+  const std::string target  = Trim(e.GetStringRep(m_FactorsE));
+  const std::string context = Trim(c.GetStringRep(m_FactorsC));
+  return MakeKey(source, target, context);
+}
+// Joins the already rendered phrase strings into a lookup key.
+// `f` is always used; `e` and `c` are appended only when the corresponding
+// factor list is non-empty. Parts are separated by " ||| " and the key
+// always ends with a trailing " ||| ".
+std::string
+LexicalReorderingTableCompact::
+MakeKey(const std::string& f,
+        const std::string& e,
+        const std::string& c) const
+{
+  static const char* const separator = " ||| ";
+
+  // Starting from a copy of f is the same as appending f only when non-empty.
+  std::string key(f);
+
+  if(!m_FactorsE.empty()) {
+    if(!key.empty())
+      key.append(separator);
+    key.append(e);
+  }
+
+  if(!m_FactorsC.empty()) {
+    if(!key.empty())
+      key.append(separator);
+    key.append(c);
+  }
+
+  key.append(separator);
+  return key;
+}
+// Returns a newly allocated compact table if a ".minlexr" file can be found
+// for `filePath`, otherwise 0.
+// Two spellings are accepted: `filePath` without the suffix (the suffix is
+// appended) and `filePath` already ending in ".minlexr".
+// Without HAVE_CMPH the function always returns 0.
+//   f_factors, e_factors, c_factors - forwarded to the constructor
+LexicalReorderingTable*
+LexicalReorderingTableCompact::
+CheckAndLoad
+(const std::string& filePath,
+ const std::vector<FactorType>& f_factors,
+ const std::vector<FactorType>& e_factors,
+ const std::vector<FactorType>& c_factors)
+{
+#ifdef HAVE_CMPH
+  std::string minlexr = ".minlexr";
+  // file name is specified without suffix
+  if(FileExists(filePath + minlexr)) {
+    //there exists a compact binary version use that
+    VERBOSE(2,"Using compact lexical reordering table" << std::endl);
+    return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors);
+  }
+  // file name is specified with suffix
+  // The length test comes first: for a path shorter than the suffix the
+  // unsigned subtraction would wrap around and the comparison would throw
+  // std::out_of_range.
+  if(filePath.length() >= minlexr.length()
+      && filePath.compare(filePath.length() - minlexr.length(), minlexr.length(), minlexr) == 0
+      && FileExists(filePath)) {
+    //there exists a compact binary version use that
+    VERBOSE(2,"Using compact lexical reordering table" << std::endl);
+    return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors);
+  }
+#endif
+  return 0;
+}
+// Reads a compact reordering table from `filePath`.
+// File layout as consumed here: hash index, number of score components,
+// "multiple score trees" flag, one Huffman tree per component (or a single
+// shared tree), then the score storage, which is loaded into memory or
+// memory-mapped depending on m_inMemory.
+// Throws (via UTIL_THROW_IF2) when the file cannot be opened or the two
+// header fields cannot be read.
+void
+LexicalReorderingTableCompact::
+Load(std::string filePath)
+{
+  // Binary mode: the file is raw binary data, text mode would corrupt it on
+  // platforms that translate line endings.
+  std::FILE* pFile = std::fopen(filePath.c_str(), "rb");
+  UTIL_THROW_IF2(pFile == NULL, "File " << filePath << " could not be opened");
+
+  //if(m_inMemory)
+  m_hash.Load(pFile);
+  //else
+  //m_hash.LoadIndex(pFile);
+
+  size_t read = 0;
+  read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile);
+  read += std::fread(&m_multipleScoreTrees,
+                     sizeof(m_multipleScoreTrees), 1, pFile);
+  // Both header fields must be present; otherwise the tree count below
+  // would be based on stale values.
+  UTIL_THROW_IF2(read != 2, "File " << filePath << " is truncated: could not read the score tree header");
+
+  // Release trees left over from the constructor placeholder or an earlier
+  // Load() so they are not leaked when the slots are overwritten.
+  for(size_t i = 0; i < m_scoreTrees.size(); i++)
+    delete m_scoreTrees[i];
+  m_scoreTrees.clear();
+
+  // One tree per score component, or a single tree shared by all of them.
+  m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
+  for(size_t i = 0; i < m_scoreTrees.size(); i++)
+    m_scoreTrees[i] = new CanonicalHuffman<float>(pFile);
+
+  if(m_inMemory)
+    m_scoresMemory.load(pFile, false);
+  else
+    m_scoresMapped.load(pFile, true);
+
+  // NOTE(review): pFile is deliberately not closed here - the mapped score
+  // storage and the hash index may keep using the handle; confirm before
+  // adding an fclose().
+}
+// Initialises s_inMemoryByDefault from the "minlexr-memory" parameter.
+// `false` is passed as the third argument of SetParameter
+// (presumably the value used when the parameter is absent - confirm against Parameter).
+void
+LexicalReorderingTableCompact::
+SetStaticDefaultParameters(Parameter const& param)
+{
+  const bool fallbackInMemory = false;
+  param.SetParameter(s_inMemoryByDefault, "minlexr-memory", fallbackInMemory);
+}
+}

mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h ADDED Viewed

	@@ -0,0 +1,94 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_LexicalReorderingTableCompact_h
+#define moses_LexicalReorderingTableCompact_h
+#include "moses/FF/LexicalReordering/LexicalReorderingTable.h"
+#include "moses/StaticData.h"
+#include "moses/TranslationModel/PhraseDictionary.h"
+#include "moses/GenerationDictionary.h"
+#include "moses/TargetPhrase.h"
+#include "moses/TargetPhraseCollection.h"
+#include "BlockHashIndex.h"
+#include "CanonicalHuffman.h"
+#include "StringVector.h"
+namespace Moses
+{
+class LexicalReorderingTableCompact: // lexical reordering table backed by a compact binary file (read by Load)
+  public LexicalReorderingTable
+{
+private:
+  static bool s_inMemoryByDefault; // initial m_inMemory of new instances; set by SetStaticDefaultParameters
+  bool m_inMemory; // true: scores come from m_scoresMemory, false: from m_scoresMapped
+  size_t m_numScoreComponent; // scores decoded per entry; 6 until Load reads the value from the file
+  bool m_multipleScoreTrees; // true: one tree per score component, false: all components use m_scoreTrees[0]
+  BlockHashIndex m_hash; // maps a key string to an index into the score storage; GetSize() marks a miss
+  typedef CanonicalHuffman<float> ScoreTree;
+  std::vector<ScoreTree*> m_scoreTrees; // owned pointers, deleted in the destructor
+  StringVector<unsigned char, unsigned long, MmapAllocator>  m_scoresMapped; // score storage used when m_inMemory is false
+  StringVector<unsigned char, unsigned long, std::allocator> m_scoresMemory; // score storage used when m_inMemory is true
+  std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const; // renders the phrases and delegates to the string overload
+  std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const; // joins the parts with " ||| "
+public:
+  LexicalReorderingTableCompact(const std::string& filePath, // constructs and immediately calls Load(filePath)
+                                const std::vector<FactorType>& f_factors,
+                                const std::vector<FactorType>& e_factors,
+                                const std::vector<FactorType>& c_factors);
+  LexicalReorderingTableCompact(const std::vector<FactorType>& f_factors, // constructs without loading anything
+                                const std::vector<FactorType>& e_factors,
+                                const std::vector<FactorType>& c_factors);
+  virtual
+  ~LexicalReorderingTableCompact();
+  virtual
+  std::vector<float>
+  GetScore(const Phrase& f, const Phrase& e, const Phrase& c); // returns the decoded scores, or an empty vector when the key is unknown
+  static
+  LexicalReorderingTable*
+  CheckAndLoad(const std::string& filePath, // returns a new table if a ".minlexr" file exists for filePath, else 0
+               const std::vector<FactorType>& f_factors,
+               const std::vector<FactorType>& e_factors,
+               const std::vector<FactorType>& c_factors);
+  void
+  Load(std::string filePath); // reads hash index, header, score trees and score storage from filePath
+  static void
+  SetStaticDefaultParameters(Parameter const& param); // reads "minlexr-memory" into s_inMemoryByDefault
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h ADDED Viewed

	@@ -0,0 +1,143 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_LexicalReorderingTableCreator_h
+#define moses_LexicalReorderingTableCreator_h
+#include "PhraseTableCreator.h"
+namespace Moses
+{
+class LexicalReorderingTableCreator // NOTE(review): declaration only - presumably builds the compact ".minlexr" table; implementation not visible here
+{
+private:
+  std::string m_inPath; // presumably the constructor's inPath - confirm in the .cpp
+  std::string m_outPath; // presumably the constructor's outPath - confirm in the .cpp
+  std::string m_tempfilePath; // presumably the constructor's tempfilePath - confirm in the .cpp
+  std::FILE* m_outFile;
+  size_t m_orderBits; // presumably the constructor's orderBits (default 10)
+  size_t m_fingerPrintBits; // presumably the constructor's fingerPrintBits (default 16)
+  size_t m_numScoreComponent;
+  bool m_multipleScoreTrees;
+  bool m_quantize; // NOTE(review): the constructor parameter `quantize` is size_t, this member is bool - confirm the conversion is intended
+  std::string m_separator;
+  BlockHashIndex m_hash;
+  typedef Counter<float> ScoreCounter;
+  typedef CanonicalHuffman<float> ScoreTree;
+  std::vector<ScoreCounter*> m_scoreCounters; // raw pointers; ownership is not visible in this header
+  std::vector<ScoreTree*> m_scoreTrees; // raw pointers; ownership is not visible in this header
+  StringVector<unsigned char, unsigned long, MmapAllocator>* m_encodedScores;
+  StringVector<unsigned char, unsigned long, MmapAllocator>* m_compressedScores;
+  std::priority_queue<PackedItem> m_queue; // used by the Add*/Flush* members below (presumably to restore line order - confirm)
+  long m_lastFlushedLine;
+  long m_lastFlushedSourceNum;
+  std::string m_lastFlushedSourcePhrase;
+  std::vector<std::string> m_lastRange;
+#ifdef WITH_THREADS
+  size_t m_threads; // only present in threaded builds
+#endif
+  void PrintInfo();
+  void EncodeScores();
+  void CalcHuffmanCodes();
+  void CompressScores();
+  void Save();
+  std::string MakeSourceTargetKey(std::string&, std::string&);
+  std::string EncodeLine(std::vector<std::string>& tokens);
+  void AddEncodedLine(PackedItem& pi);
+  void FlushEncodedQueue(bool force = false);
+  std::string CompressEncodedScores(std::string &encodedScores);
+  void AddCompressedScores(PackedItem& pi);
+  void FlushCompressedQueue(bool force = false);
+public:
+  LexicalReorderingTableCreator(std::string inPath, // the trailing `threads` parameter exists only when WITH_THREADS is defined
+                                std::string outPath,
+                                std::string tempfilePath,
+                                size_t orderBits = 10,
+                                size_t fingerPrintBits = 16,
+                                bool multipleScoreTrees = true,
+                                size_t quantize = 0
+#ifdef WITH_THREADS
+                                    , size_t threads = 2
+#endif
+                               );
+  ~LexicalReorderingTableCreator();
+  friend class EncodingTaskReordering; // the task classes declared below need access to the private members
+  friend class CompressionTaskReordering;
+};
+class EncodingTaskReordering // callable task (operator()) bound to an input stream and a creator; presumably one encoding worker - confirm in the .cpp
+{
+private:
+#ifdef WITH_THREADS
+  static boost::mutex m_mutex; // static: shared by all task instances
+  static boost::mutex m_fileMutex; // static: shared by all task instances
+#endif
+  static size_t m_lineNum; // static: one counter for all task instances
+  static size_t m_sourcePhraseNum; // static: one counter for all task instances
+  static std::string m_lastSourcePhrase; // static: shared by all task instances
+  InputFileStream& m_inFile; // reference: the stream must outlive the task
+  LexicalReorderingTableCreator& m_creator; // reference: the creator must outlive the task
+public:
+  EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator);
+  void operator()();
+};
+class CompressionTaskReordering // callable task (operator()) bound to the encoded scores and a creator; presumably one compression worker - confirm in the .cpp
+{
+private:
+#ifdef WITH_THREADS
+  static boost::mutex m_mutex; // static: shared by all task instances
+#endif
+  static size_t m_scoresNum; // static: one counter for all task instances
+  StringVector<unsigned char, unsigned long, MmapAllocator> &m_encodedScores; // reference: the vector must outlive the task
+  LexicalReorderingTableCreator &m_creator; // reference: the creator must outlive the task
+public:
+  CompressionTaskReordering(StringVector<unsigned char, unsigned long, MmapAllocator>&
+                            m_encodedScores, LexicalReorderingTableCreator& creator); // NOTE(review): the parameter is named like the member it initialises
+  void operator()();
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/ListCoders.h ADDED Viewed

	@@ -0,0 +1,387 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_ListCoders_h
+#define moses_ListCoders_h
+#include <cmath>
+#include <cassert>
+namespace Moses
+{
+// Variable-length integer codec.
+// A value is split into groups of (bits of T - 1) payload bits, least
+// significant group first. Each group is stored in one unit of type T; the
+// top bit of a unit is set when another unit follows. Zero is stored as a
+// single zero unit. Values to encode must be non-negative.
+template <typename T = unsigned int>
+class VarIntType
+{
+private:
+  // Number of payload bits per unit (every bit of T except the top one).
+  static size_t PayloadBits() {
+    return sizeof(T) * 8 - 1;
+  }
+  // Continuation flag: the top bit of T. Computed in T (not int) so that it
+  // is correct for unit types as wide as or wider than int.
+  static T ContinuationBit() {
+    return static_cast<T>(static_cast<T>(1) << PayloadBits());
+  }
+  // Writes the units of one value to `output` and advances it.
+  // `output` is taken by reference so that the caller's iterator moves on;
+  // otherwise plain pointers/iterators would be overwritten by every symbol.
+  template <typename IntType, typename OutIt>
+  static void EncodeSymbol(IntType input, OutIt &output) {
+    const size_t payloadBits = PayloadBits();
+    const T msb = ContinuationBit();
+    const T mask = static_cast<T>(~msb);
+    do {
+      T res = static_cast<T>(static_cast<T>(input) & mask);
+      // Shifting by the full width of IntType (or more) is undefined; in that
+      // case the whole value already fitted into this unit.
+      if(payloadBits >= sizeof(IntType) * 8)
+        input = 0;
+      else
+        input >>= payloadBits;
+      if(input)
+        res |= msb;
+      *output = res;
+      ++output;
+    } while(input);
+  }
+  // Reads the units of one value starting at `it`, leaves `it` behind the
+  // last unit consumed and stores the value in `output`.
+  // Input that ends inside a symbol trips the assert; in release builds the
+  // function returns the bits read so far instead of reading past `end`.
+  template <typename InIt, typename IntType>
+  static void DecodeSymbol(InIt &it, InIt end, IntType &output) {
+    const size_t payloadBits = PayloadBits();
+    const T msb = ContinuationBit();
+    const T mask = static_cast<T>(~msb);
+    output = 0;
+    size_t shift = 0;
+    while(it != end) {
+      // Convert to the unit type first: a signed char element must not
+      // sign-extend into the upper bits of the result.
+      const T unit = static_cast<T>(*it);
+      ++it;
+      // Groups that do not fit into IntType are dropped instead of shifting
+      // by an out-of-range amount.
+      if(shift < sizeof(IntType) * 8)
+        output |= static_cast<IntType>(static_cast<IntType>(unit & mask) << shift);
+      shift += payloadBits;
+      if(!(unit & msb))
+        return;
+    }
+    assert(!"VarIntType: input ended inside a symbol");
+  }
+public:
+  // Encodes every value in [it, end) to `outIt`.
+  template <typename InIt, typename OutIt>
+  static void Encode(InIt it, InIt end, OutIt outIt) {
+    while(it != end) {
+      EncodeSymbol(*it, outIt);
+      it++;
+    }
+  }
+  // Decodes all symbols in [it, end) to `outIt`; `it` ends up equal to `end`.
+  template <typename InIt, typename OutIt>
+  static void Decode(InIt &it, InIt end, OutIt outIt) {
+    while(it != end) {
+      size_t output;
+      DecodeSymbol(it, end, output);
+      *outIt = output;
+      outIt++;
+    }
+  }
+  // Decodes at most `num` symbols starting at `it` and returns their sum;
+  // `it` is left behind the last symbol consumed.
+  template <typename InIt>
+  static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
+    size_t sum = 0;
+    size_t curr = 0;
+    while(it != end && curr < num) {
+      size_t output;
+      DecodeSymbol(it, end, output);
+      sum += output;
+      curr++;
+    }
+    return sum;
+  }
+};
+typedef VarIntType<unsigned char> VarByte; // codec with unsigned char units
+typedef VarByte VarInt8; // alias of VarByte
+typedef VarIntType<unsigned short> VarInt16; // codec with unsigned short units
+typedef VarIntType<unsigned int>   VarInt32; // codec with unsigned int units
+// Simple9 codec: packs small unsigned integers into 32-bit words.
+// The top 4 bits of a word hold a selector (1..9), the lower 28 bits hold
+// 1, 2, 3, 4, 5, 7, 9, 14 or 28 equally wide slots. Values are stored most
+// significant slot first. Values above 2^28-1 cannot be encoded.
+class Simple9
+{
+private:
+  typedef unsigned int uint;
+  // Number of values per word for selectors 1..9 (index = selector - 1).
+  static inline const uint* SlotCounts() {
+    static const uint counts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 };
+    return counts;
+  }
+  // Slot geometry for a selector: slot width, shift of the first (most
+  // significant) slot and value mask. Unknown selectors yield all zeros,
+  // which makes the decoders emit a single 0.
+  static inline void SlotLayout(uint type, uint &bitlen, uint &shift, uint &mask) {
+    bitlen = 0;
+    shift = 0;
+    mask = 0;
+    if(type < 1 || type > 9)
+      return;
+    const uint count = SlotCounts()[type - 1];
+    bitlen = 28 / count;
+    shift = bitlen * (count - 1);
+    mask = (1u << bitlen) - 1;
+  }
+  // Packs the values in [it, end) into one word. The number of values must
+  // be one of the slot counts; any other count leaves selector 0.
+  template <typename InIt>
+  inline static void EncodeSymbol(uint &output, InIt it, InIt end) {
+    uint length = end - it;
+    uint type = 0;
+    uint bitlength = 0;
+    for(uint k = 0; k < 9; ++k) {
+      if(SlotCounts()[k] == length) {
+        type = k + 1;
+        bitlength = 28 / length;
+        break;
+      }
+    }
+    output = 0;
+    output |= (type << 28);
+    uint i = 0;
+    while(it != end) {
+      UTIL_THROW_IF2(*it > 268435455, "You are trying to encode " << *it
+                     << " with Simple9. Cannot encode numbers larger than 268435455 (2^28-1)");
+      // First value goes into the most significant slot.
+      uint l = bitlength * (length-i-1);
+      output |= *it << l;
+      it++;
+      i++;
+    }
+  }
+  // Unpacks every slot of `input` to `outIt`.
+  template <typename OutIt>
+  static inline void DecodeSymbol(uint input, OutIt outIt) {
+    uint bitlen = 0;
+    uint shift = 0;
+    uint mask = 0;
+    SlotLayout(input >> 28, bitlen, shift, mask);
+    while(shift > 0) {
+      *outIt = (input >> shift) & mask;
+      shift -= bitlen;
+      outIt++;
+    }
+    *outIt = input & mask;
+    outIt++;
+  }
+  // Sums the slots of `input`, stopping as soon as `curr` (the running count
+  // of values consumed, updated in place) reaches `num`.
+  static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) {
+    uint bitlen = 0;
+    uint shift = 0;
+    uint mask = 0;
+    SlotLayout(input >> 28, bitlen, shift, mask);
+    size_t sum = 0;
+    while(shift > 0) {
+      sum += (input >> shift) & mask;
+      shift -= bitlen;
+      if(++curr == num)
+        return sum;
+    }
+    sum += input & mask;
+    curr++;
+    return sum;
+  }
+public:
+  // Encodes [it, end) into 32-bit words written to `outIt`. For each word
+  // the longest run of upcoming values that fits one of the slot layouts is
+  // chosen. Requires random-access input iterators.
+  template <typename InIt, typename OutIt>
+  static void Encode(InIt it, InIt end, OutIt outIt) {
+    const uint* parts = SlotCounts();
+    uint buffer[28];
+    for(InIt i = it; i < end; i++) {
+      uint lastbit = 1;   // widest slot needed by the values seen so far
+      uint lastpos = 0;   // look-ahead position relative to i
+      uint lastyes = 0;   // last position that completed a valid layout
+      uint j = 0;         // layout currently being filled
+      while(j < 9 && lastpos < 28 && (i+lastpos) < end) {
+        if(lastpos >= parts[j])
+          j++;
+        buffer[lastpos] = *(i + lastpos);
+        // Every value is encoded eventually, so an oversized one can be
+        // rejected as soon as it is seen; this also bounds reqbit by 28.
+        UTIL_THROW_IF2(buffer[lastpos] > 268435455, "You are trying to encode " << *(i + lastpos)
+                       << " with Simple9. Cannot encode numbers larger than 268435455 (2^28-1)");
+        // Bits needed for the value, counted with integer arithmetic.
+        // Zero needs one bit; the former log()-based formula produced 0 bits
+        // for it and then divided by zero.
+        uint reqbit = 1;
+        for(uint rest = buffer[lastpos] >> 1; rest != 0; rest >>= 1)
+          ++reqbit;
+        // Width of the narrowest slot layout that can hold reqbit bits.
+        uint bit = 28 / (28 / reqbit);
+        if(lastbit < bit)
+          lastbit = bit;
+        if(parts[j] > 28/lastbit)
+          break;
+        else if(lastpos == parts[j]-1)
+          lastyes = lastpos;
+        lastpos++;
+      }
+      i += lastyes;
+      uint length = lastyes + 1;
+      uint output;
+      EncodeSymbol(output, buffer, buffer + length);
+      *outIt = output;
+      outIt++;
+    }
+  }
+  // Decodes every word in [it, end) to `outIt`; `it` ends up equal to `end`.
+  template <typename InIt, typename OutIt>
+  static void Decode(InIt &it, InIt end, OutIt outIt) {
+    while(it != end) {
+      DecodeSymbol(*it, outIt);
+      it++;
+    }
+  }
+  // Returns the sum of the first `num` values stored in [it, end); `it` is
+  // left behind the last word consumed. Asserts that `num` values existed.
+  template <typename InIt>
+  static size_t DecodeAndSum(InIt &it, InIt end, size_t num) {
+    size_t sum = 0;
+    size_t curr = 0;
+    while(it != end && curr < num) {
+      sum += DecodeAndSumSymbol(*it, num, curr);
+      it++;
+    }
+    assert(curr == num);
+    return sum;
+  }
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/PackedArray.h ADDED Viewed

	@@ -0,0 +1,187 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_PackedArray_h
+#define moses_PackedArray_h
+#include <vector>
+#include <cmath>
+#include <cstring>
+#include <cstdio>
+#include "ThrowingFwrite.h"
+namespace Moses
+{
+/**
+ * Array of fixed-width unsigned values stored back to back in a buffer of
+ * storage units of type D, without padding between elements.
+ *
+ * The element width is not stored in the object: callers pass the same
+ * `bits` value to the constructor, Get() and Set().
+ *
+ * @tparam T  value type returned by Get() / accepted by Set()
+ * @tparam D  storage unit type; m_dataBits holds its width in bits
+ */
+template <typename T = size_t, typename D = unsigned char>
+class PackedArray
+{
+protected:
+  static size_t m_dataBits;   // number of bits in one storage unit D
+  size_t m_size;              // number of logical elements
+  size_t m_storageSize;       // number of D units in m_storage
+  D* m_storage;               // owned buffer, released in the destructor
+public:
+  /** Creates an empty array. */
+  PackedArray() {
+    m_size = 0;
+    m_storageSize = 0;
+    m_storage = new D[0];
+  }
+  /**
+   * Allocates zero-filled storage for `size` elements of `bits` bits each.
+   * The unit count is rounded up with integer arithmetic; going through
+   * float loses precision beyond 2^24 bits and could under-allocate.
+   */
+  PackedArray(size_t size, size_t bits) : m_size(size) {
+    m_storageSize = (bits * size + m_dataBits - 1) / m_dataBits;
+    m_storage = new D[m_storageSize]();
+  }
+  /** Deep copy. */
+  PackedArray(const PackedArray<T, D> &c) {
+    m_size = c.m_size;
+    m_storageSize = c.m_storageSize;
+    m_storage = new D[m_storageSize];
+    std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D));
+  }
+  /**
+   * Deep copy assignment. Without it the compiler-generated version would
+   * copy the raw pointer and both objects would delete the same buffer.
+   */
+  PackedArray<T, D>& operator=(const PackedArray<T, D> &c) {
+    if(this != &c) {
+      // Allocate first so *this stays intact if the allocation throws.
+      D* fresh = new D[c.m_storageSize];
+      std::memcpy(fresh, c.m_storage, c.m_storageSize * sizeof(D));
+      delete [] m_storage;
+      m_storage = fresh;
+      m_size = c.m_size;
+      m_storageSize = c.m_storageSize;
+    }
+    return *this;
+  }
+  virtual ~PackedArray() {
+    delete [] m_storage;
+    m_size = 0;
+    m_storageSize = 0;
+    m_storage = 0;
+  }
+  /**
+   * Reads element `i`, interpreting the array as elements of `bits` bits.
+   * No bounds check is performed.
+   */
+  T Get(size_t i, size_t bits) const {
+    T out = 0;
+    size_t bitstart = (i * bits);
+    size_t bitpos = bitstart;
+    // Mask with the low `bits` bits set; built in size_t and guarded so that
+    // a full-width field does not shift by the width of the type.
+    const size_t wordBits = sizeof(size_t) * 8;
+    size_t zero = (bits >= wordBits) ? ~size_t(0) : ((size_t(1) << bits) - 1);
+    while(bitpos - bitstart < bits) {
+      size_t pos = bitpos / m_dataBits;
+      size_t off = bitpos % m_dataBits;
+      // First unit: drop the `off` low bits that belong to the previous
+      // element. Later units start at off == 0 and are shifted up to their
+      // position inside the value.
+      out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off;
+      bitpos += (m_dataBits - off);
+    }
+    out &= zero;
+    return out;
+  }
+  /**
+   * Writes `v` into element `i` using `bits` bits. Bits of neighbouring
+   * elements that share a storage unit are preserved. No bounds check is
+   * performed; `v` is expected to fit into `bits` bits.
+   */
+  void Set(size_t i, T v, size_t bits) {
+    size_t bitstart = (i * bits);
+    size_t bitpos = bitstart;
+    while(bitpos - bitstart < bits) {
+      size_t pos = bitpos / m_dataBits;
+      size_t off = bitpos % m_dataBits;
+      size_t rest = bits - (bitpos - bitstart);
+      // Bits to keep in this unit: everything below `off` and everything at
+      // or above `rest + off`. When the field runs past the end of the unit
+      // there is nothing to keep on the high side, which also avoids
+      // shifting by more than the width of the mask type.
+      size_t top = rest + off;
+      size_t keepHigh = (top >= m_dataBits) ? 0 : ~((size_t(1) << top) - 1);
+      size_t keepLow = (size_t(1) << off) - 1;
+      D zero = D(keepHigh | keepLow);
+      m_storage[pos] &= zero;
+      m_storage[pos] |= v << off;
+      // NOTE(review): shifts by the full width of T when D is as wide as T
+      // and off == 0 -- fine for the default D = unsigned char.
+      v = v >> (m_dataBits - off);
+      bitpos += (m_dataBits - off);
+    }
+  }
+  virtual D*& GetStorage() {
+    return m_storage;
+  }
+  virtual size_t GetStorageSize() const {
+    return m_storageSize;
+  }
+  virtual size_t Size() const {
+    return m_size;
+  }
+  /**
+   * Reads element count, unit count and raw storage from `in`, in the
+   * layout written by Save().
+   *
+   * The header is read into temporaries so that a truncated header leaves
+   * the array untouched instead of allocating from garbage sizes. A short
+   * payload read still installs the (zero-padded) buffer, as before.
+   *
+   * @return number of bytes the stream position advanced
+   */
+  virtual size_t Load(std::FILE* in) {
+    size_t a1 = std::ftell(in);
+    size_t newSize = 0;
+    size_t newStorageSize = 0;
+    bool headerOk = std::fread(&newSize, sizeof(newSize), 1, in) == 1;
+    headerOk = headerOk
+               && std::fread(&newStorageSize, sizeof(newStorageSize), 1, in) == 1;
+    if(headerOk) {
+      // Allocate before releasing the old buffer (exception safety).
+      D* fresh = new D[newStorageSize]();
+      std::fread(fresh, sizeof(D), newStorageSize, in);
+      delete [] m_storage;
+      m_storage = fresh;
+      m_size = newSize;
+      m_storageSize = newStorageSize;
+    }
+    size_t a2 = std::ftell(in);
+    return a2 - a1;
+  }
+  /**
+   * Writes element count, unit count and raw storage to `out` through
+   * ThrowingFwrite().
+   *
+   * @return number of bytes the stream position advanced
+   */
+  virtual size_t Save(std::FILE* out) {
+    size_t a1 = std::ftell(out);
+    ThrowingFwrite(&m_size, sizeof(m_size), 1, out);
+    ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out);
+    ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out);
+    size_t a2 = std::ftell(out);
+    return a2 - a1;
+  }
+};
+// Width of one storage unit D in bits (8 bits per byte of sizeof(D)).
+template <typename T, typename D>
+size_t PackedArray<T, D>::m_dataBits = sizeof(D)*8;
+/**************************************************************************/
+/**
+ * PackedArray whose elements are pairs of values stored side by side:
+ * the first component occupies the low `bits1` bits, the second component
+ * the `bits2` bits above it.
+ */
+template <typename T = size_t, typename D = unsigned char>
+class PairedPackedArray : public PackedArray<T,D>
+{
+public:
+  PairedPackedArray() : PackedArray<T,D>() {}
+  /** Allocates room for `size` pairs of (bits1 + bits2) bits each. */
+  PairedPackedArray(size_t size, size_t bits1, size_t bits2)
+    : PackedArray<T, D>(size, bits1 + bits2) { }
+  /**
+   * Stores the pair (a, b) at index `i`. `a` is expected to fit into
+   * `bits1` bits; it is not masked.
+   */
+  void Set(size_t i, T a, T b, size_t bits1, size_t bits2) {
+    T c = a | (b << bits1);
+    PackedArray<T,D>::Set(i, c, bits1 + bits2);
+  }
+  /**
+   * Stores `p` at index `i` so that Get() returns the same pair.
+   * The earlier body called the base Set() without the bit width (which
+   * does not compile once instantiated) and laid the components out in the
+   * opposite order to Get().
+   */
+  void Set(size_t i, std::pair<T,T> p, size_t bits1, size_t bits2) {
+    Set(i, p.first, p.second, bits1, bits2);
+  }
+  /** Returns the pair stored at index `i` as (low part, high part). */
+  std::pair<T, T> Get(size_t i, size_t bits1, size_t bits2) const {
+    T v = PackedArray<T, D>::Get(i, bits1 + bits2);
+    // Build the mask in T: a plain `1 << bits1` is an int shift and
+    // overflows for bits1 >= 31.
+    T a = v & ((T(1) << bits1) - 1);
+    T b = v >> bits1;
+    return std::pair<T, T>(a, b);
+  }
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp ADDED Viewed

	@@ -0,0 +1,194 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <queue>
+#include <algorithm>
+#include <sys/stat.h>
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/thread/tss.hpp>
+#include "PhraseDictionaryCompact.h"
+#include "moses/FactorCollection.h"
+#include "moses/Word.h"
+#include "moses/Util.h"
+#include "moses/InputFileStream.h"
+#include "moses/StaticData.h"
+#include "moses/Range.h"
+#include "moses/ThreadPool.h"
+#include "util/exception.hh"
+using namespace std;
+using namespace boost::algorithm;
+namespace Moses
+{
+// Storage for the static sentence cache that CacheForCleanup() fills and
+// CleanUpAfterSentenceProcessing() empties.
+PhraseDictionaryCompact::SentenceCache PhraseDictionaryCompact::m_sentenceCache;
+/**
+ * Builds the feature from its configuration line. The phrase decoder is
+ * left null here and created in Load(); the in-memory flag starts from the
+ * static default before ReadParameters() is called.
+ *
+ * @param line  configuration line forwarded to the PhraseDictionary base
+ */
+PhraseDictionaryCompact::PhraseDictionaryCompact(const std::string &line)
+  :PhraseDictionary(line, true)
+  ,m_inMemory(s_inMemoryByDefault)
+  ,m_useAlignmentInfo(true)
+  ,m_hash(10, 16)
+  ,m_phraseDecoder(0)
+{
+  ReadParameters();
+}
+/**
+ * Opens the compact phrase table (m_filePath, with ".minphr" appended when
+ * missing) and loads, in file order, the source phrase index, the phrase
+ * decoder data and the target phrase collections.
+ *
+ * @param opts  options object stored in m_options
+ * @throws runtime_error when the file does not exist or cannot be opened;
+ *         UTIL_THROW_IF2 fires when any of the three sections reports a
+ *         size of zero.
+ */
+void PhraseDictionaryCompact::Load(AllOptions::ptr const& opts)
+{
+  m_options = opts;
+  const StaticData &staticData = StaticData::Instance();
+  SetFeaturesToApply();
+  std::string tFilePath = m_filePath;
+  std::string suffix = ".minphr";
+  if (!ends_with(tFilePath, suffix)) tFilePath += suffix;
+  if (!FileExists(tFilePath))
+    throw runtime_error("Error: File " + tFilePath + " does not exist.");
+  m_phraseDecoder
+  = new PhraseDecoder(*this, &m_input, &m_output, m_numScoreComponents);
+  // The table is binary data: open in binary mode so platforms that
+  // translate line endings in text mode do not corrupt it.
+  std::FILE* pFile = std::fopen(tFilePath.c_str() , "rb");
+  // The file may exist and still be unreadable (e.g. permissions); fail
+  // here instead of handing a null stream to the loaders below.
+  if (pFile == NULL)
+    throw runtime_error("Error: File " + tFilePath + " could not be opened.");
+  size_t indexSize;
+  //if(m_inMemory)
+  // Load source phrase index into memory
+  indexSize = m_hash.Load(pFile);
+  // else
+  // Keep source phrase index on disk
+  //indexSize = m_hash.LoadIndex(pFile);
+  size_t coderSize = m_phraseDecoder->Load(pFile);
+  size_t phraseSize;
+  if(m_inMemory)
+    // Load target phrase collections into memory
+    phraseSize = m_targetPhrasesMemory.load(pFile, false);
+  else
+    // Keep target phrase collections on disk
+    phraseSize = m_targetPhrasesMapped.load(pFile, true);
+  // NOTE(review): pFile is never closed here. The mapped variant may need
+  // the stream to stay open -- confirm before adding an fclose().
+  UTIL_THROW_IF2(indexSize == 0 || coderSize == 0 || phraseSize == 0,
+                 "Not successfully loaded");
+}
+/**
+ * Looks up the translations of `sourcePhrase`, keeps at most m_tableLimit
+ * of them (all of them when the limit is 0) as selected by
+ * CompareTargetPhrase, and returns them as a freshly allocated collection.
+ * The collection is also registered with CacheForCleanup().
+ *
+ * @return the collection, or an empty pointer when the phrase is longer
+ *         than any source phrase known to the decoder or has no entries
+ */
+TargetPhraseCollection::shared_ptr
+PhraseDictionaryCompact::
+GetTargetPhraseCollectionNonCacheLEGACY(const Phrase &sourcePhrase) const
+{
+  TargetPhraseCollection::shared_ptr none;
+  // A phrase longer than the longest source phrase observed during
+  // compilation cannot be in the table.
+  if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
+    return none;
+  // Retrieve the decoded entries from the phrase table.
+  TargetPhraseVectorPtr decoded
+  = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, true);
+  if(!decoded || decoded->empty())
+    return none;
+  // Work on a private copy: the partial ordering below reorders elements.
+  TargetPhraseVectorPtr working(new TargetPhraseVector(*decoded));
+  TargetPhraseCollection::shared_ptr result(new TargetPhraseCollection);
+  // Apply the table limit where it is set and actually smaller than the
+  // number of candidates.
+  TargetPhraseVector::iterator cut = working->end();
+  if(m_tableLimit != 0 && !(working->size() < m_tableLimit))
+    cut = working->begin() + m_tableLimit;
+  NTH_ELEMENT4(working->begin(), cut, working->end(), CompareTargetPhrase());
+  for(TargetPhraseVector::iterator cur = working->begin(); cur != cut; ++cur)
+    result->Add(new TargetPhrase(*cur));
+  // Keep the collection alive until the end-of-sentence clean-up.
+  const_cast<PhraseDictionaryCompact*>(this)->CacheForCleanup(result);
+  return result;
+}
+/**
+ * Returns the decoder's target phrase vector for `sourcePhrase` without
+ * applying the table limit or building a TargetPhraseCollection.
+ *
+ * @return the decoder's result, or an empty pointer when the phrase is
+ *         longer than the longest source phrase seen during compilation
+ */
+TargetPhraseVectorPtr
+PhraseDictionaryCompact::
+GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase) const
+{
+  if(!(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())) {
+    // Short enough to possibly exist: ask the phrase table.
+    return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, false);
+  }
+  return TargetPhraseVectorPtr();
+}
+/** Releases the phrase decoder created in Load(), if any. */
+PhraseDictionaryCompact::
+~PhraseDictionaryCompact()
+{
+  // Deleting a null pointer is a no-op, so no explicit check is required.
+  delete m_phraseDecoder;
+}
+/**
+ * Appends `tpc` to the sentence cache, creating the cache on first use,
+ * so the collection stays referenced until the cache is cleared.
+ */
+void
+PhraseDictionaryCompact::
+CacheForCleanup(TargetPhraseCollection::shared_ptr  tpc)
+{
+  if(m_sentenceCache.get() == NULL) {
+    // First use: create the cache.
+    m_sentenceCache.reset(new PhraseCache());
+  }
+  m_sentenceCache->push_back(tpc);
+}
+/** Intentionally empty: both arguments are ignored. */
+void
+PhraseDictionaryCompact::
+AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase)
+{ }
+/**
+ * End-of-sentence housekeeping: prunes the decoder's cache, drops the
+ * collections registered through CacheForCleanup() and calls ReduceCache().
+ *
+ * @param source  the processed input; not used here
+ */
+void
+PhraseDictionaryCompact::
+CleanUpAfterSentenceProcessing(const InputType &source)
+{
+  if(m_sentenceCache.get() == NULL) {
+    // Nothing was cached yet; create an empty cache so clear() is valid.
+    m_sentenceCache.reset(new PhraseCache());
+  }
+  m_phraseDecoder->PruneCache();
+  m_sentenceCache->clear();
+  ReduceCache();
+}
+// Default for m_inMemory of newly constructed instances (see constructor).
+bool PhraseDictionaryCompact::s_inMemoryByDefault = false;
+/**
+ * Sets s_inMemoryByDefault from the "minphr-memory" parameter, passing
+ * `false` as the fallback argument to SetParameter.
+ */
+void
+PhraseDictionaryCompact::
+SetStaticDefaultParameters(Parameter const& param)
+{
+  param.SetParameter(s_inMemoryByDefault, "minphr-memory", false);
+}
+}

mosesdecoder/moses/TranslationModel/CompactPT/StringVectorTemp.h ADDED Viewed

	@@ -0,0 +1,430 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_StringVectorTemp_h
+#define moses_StringVectorTemp_h
+#include <vector>
+#include <algorithm>
+#include <string>
+#include <iterator>
+#include <cstdio>
+#include <cassert>
+#include <boost/iterator/iterator_facade.hpp>
+#include "ThrowingFwrite.h"
+#include "StringVector.h"
+#include "MmapAllocator.h"
+namespace Moses
+{
+// ********** StringVectorTemp **********
+/**
+ * Sequence of strings stored as one contiguous array of ValueT plus a
+ * vector of start offsets, one per string. The end of a string is the
+ * start of the next one (or the end of the array for the last string).
+ *
+ * @tparam ValueT     element type of the character array
+ * @tparam PosT       type used for offsets, indices and sizes
+ * @tparam Allocator  allocator template of the character array
+ */
+template <typename ValueT = unsigned char, typename PosT = unsigned int,
+         template <typename> class Allocator = std::allocator>
+class StringVectorTemp
+{
+protected:
+  bool m_sorted;         // true while strings were appended in increasing order
+  bool m_memoryMapped;   // initialised to false by the constructors
+  std::vector<ValueT, Allocator<ValueT> >* m_charArray;  // owned character data
+  std::vector<PosT> m_positions;                         // start offset per string
+  virtual const ValueT* value_ptr(PosT i) const;
+public:
+  //typedef ValueIteratorRange<typename std::vector<ValueT, Allocator<ValueT> >::const_iterator> range;
+  typedef ValueIteratorRange<const ValueT *> range;
+  // ********** RangeIterator **********
+  // Random access iterator over the strings; dereferences to a `range`.
+  class RangeIterator : public boost::iterator_facade<RangeIterator,
+    range, std::random_access_iterator_tag, range, PosT>
+  {
+  private:
+    PosT m_index;
+    StringVectorTemp<ValueT, PosT, Allocator>* m_container;
+  public:
+    RangeIterator();
+    RangeIterator(StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index=0);
+    PosT get_index();
+  private:
+    friend class boost::iterator_core_access;
+    range dereference() const;
+    bool equal(RangeIterator const& other) const;
+    void increment();
+    void decrement();
+    void advance(PosT n);
+    PosT distance_to(RangeIterator const& other) const;
+  };
+  // ********** StringIterator **********
+  // Random access iterator over the strings; dereferences to a std::string copy.
+  class StringIterator : public boost::iterator_facade<StringIterator,
+    std::string, std::random_access_iterator_tag, const std::string, PosT>
+  {
+  private:
+    PosT m_index;
+    StringVectorTemp<ValueT, PosT, Allocator>* m_container;
+  public:
+    StringIterator();
+    StringIterator(StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index=0);
+    PosT get_index();
+  private:
+    friend class boost::iterator_core_access;
+    const std::string dereference() const;
+    bool equal(StringIterator const& other) const;
+    void increment();
+    void decrement();
+    void advance(PosT n);
+    PosT distance_to(StringIterator const& other) const;
+  };
+  typedef RangeIterator iterator;
+  typedef StringIterator string_iterator;
+  StringVectorTemp();
+  StringVectorTemp(Allocator<ValueT> alloc);
+  /**
+   * Deep copy. The class owns m_charArray through a raw pointer, so the
+   * compiler-generated copy would share the buffer and delete it twice.
+   */
+  StringVectorTemp(const StringVectorTemp<ValueT, PosT, Allocator> &c)
+    : m_sorted(c.m_sorted), m_memoryMapped(c.m_memoryMapped),
+      m_charArray(new std::vector<ValueT, Allocator<ValueT> >(*c.m_charArray)),
+      m_positions(c.m_positions) { }
+  /** Deep copy assignment; see the copy constructor. */
+  StringVectorTemp<ValueT, PosT, Allocator>&
+  operator=(const StringVectorTemp<ValueT, PosT, Allocator> &c) {
+    if(this != &c) {
+      // Copy first so *this stays intact if the allocation throws.
+      std::vector<ValueT, Allocator<ValueT> >* fresh
+        = new std::vector<ValueT, Allocator<ValueT> >(*c.m_charArray);
+      delete m_charArray;
+      m_charArray = fresh;
+      m_positions = c.m_positions;
+      m_sorted = c.m_sorted;
+      m_memoryMapped = c.m_memoryMapped;
+    }
+    return *this;
+  }
+  virtual ~StringVectorTemp() {
+    delete m_charArray;
+  }
+  /**
+   * Exchanges offsets, character data and the sorted flag with `c`.
+   * NOTE(review): m_memoryMapped is not exchanged -- confirm whether that
+   * is intended.
+   */
+  void swap(StringVectorTemp<ValueT, PosT, Allocator> &c) {
+    m_positions.swap(c.m_positions);
+    m_charArray->swap(*c.m_charArray);
+    std::swap(m_sorted, c.m_sorted);
+  }
+  bool is_sorted() const;
+  PosT size() const;           // number of strings
+  virtual PosT size2() const;  // number of stored ValueT elements
+  template<class Iterator> Iterator begin() const;
+  template<class Iterator> Iterator end() const;
+  iterator begin() const;
+  iterator end() const;
+  PosT length(PosT i) const;   // length of string i
+  //typename std::vector<ValueT, Allocator<ValueT> >::const_iterator begin(PosT i) const;
+  //typename std::vector<ValueT, Allocator<ValueT> >::const_iterator end(PosT i) const;
+  const ValueT* begin(PosT i) const;
+  const ValueT* end(PosT i) const;
+  /** Removes all strings; an empty container counts as sorted. */
+  void clear() {
+    m_charArray->clear();
+    m_sorted = true;
+    m_positions.clear();
+  }
+  range at(PosT i) const;
+  range operator[](PosT i) const;
+  range back() const;
+  template <typename StringT>
+  void push_back(StringT s);
+  void push_back(const char* c);
+  template <typename StringT>
+  PosT find(StringT &s) const;
+  PosT find(const char* c) const;
+};
+// ********** Implementation **********
+// StringVectorTemp
+// Default constructor: empty, sorted, not memory mapped; allocates the
+// character array with a default-constructed allocator.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+StringVectorTemp<ValueT, PosT, Allocator>::StringVectorTemp()
+  : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >()) { }
+// Same as above, but the character array is built with the given allocator.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+StringVectorTemp<ValueT, PosT, Allocator>::StringVectorTemp(Allocator<ValueT> alloc)
+  : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector<ValueT, Allocator<ValueT> >(alloc)) { }
+// Appends `s` (anything with begin()/end()) as a new string. The sorted
+// flag is cleared as soon as a string is appended that is not greater than
+// the current last string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+template <typename StringT>
+void StringVectorTemp<ValueT, PosT, Allocator>::push_back(StringT s)
+{
+  if(is_sorted() && size()) {
+    if(!(back() < s))
+      m_sorted = false;
+  }
+  // The new string starts where the character data currently ends.
+  const PosT start = size2();
+  m_positions.push_back(start);
+  std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray));
+}
+// Convenience overload for C strings; forwards through std::string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+void StringVectorTemp<ValueT, PosT, Allocator>::push_back(const char* c)
+{
+  push_back(std::string(c));
+}
+// Iterator of the requested type positioned at the first string. The
+// iterator constructors take a non-const container, hence the const_cast.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+template <typename Iterator>
+Iterator StringVectorTemp<ValueT, PosT, Allocator>::begin() const
+{
+  StringVectorTemp<ValueT, PosT, Allocator> &self
+    = const_cast<StringVectorTemp<ValueT, PosT, Allocator>&>(*this);
+  return Iterator(self, 0);
+}
+// Iterator of the requested type positioned one past the last string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+template <typename Iterator>
+Iterator StringVectorTemp<ValueT, PosT, Allocator>::end() const
+{
+  StringVectorTemp<ValueT, PosT, Allocator> &self
+    = const_cast<StringVectorTemp<ValueT, PosT, Allocator>&>(*this);
+  return Iterator(self, size());
+}
+// Range iterator positioned at the first string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+typename StringVectorTemp<ValueT, PosT, Allocator>::iterator StringVectorTemp<ValueT, PosT, Allocator>::begin() const
+{
+  StringVectorTemp<ValueT, PosT, Allocator> &self
+    = const_cast<StringVectorTemp<ValueT, PosT, Allocator>&>(*this);
+  return iterator(self, 0);
+}
+// Range iterator positioned one past the last string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+typename StringVectorTemp<ValueT, PosT, Allocator>::iterator StringVectorTemp<ValueT, PosT, Allocator>::end() const
+{
+  StringVectorTemp<ValueT, PosT, Allocator> &self
+    = const_cast<StringVectorTemp<ValueT, PosT, Allocator>&>(*this);
+  return iterator(self, size());
+}
+// True while every string appended so far was greater than its predecessor.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+bool StringVectorTemp<ValueT, PosT, Allocator>::is_sorted() const
+{
+  return m_sorted;
+}
+// Number of strings stored.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+PosT StringVectorTemp<ValueT, PosT, Allocator>::size() const
+{
+  return m_positions.size();
+}
+// Total number of ValueT elements in the character array.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+PosT StringVectorTemp<ValueT, PosT, Allocator>::size2() const
+{
+  return m_charArray->size();
+}
+// View of string i as a [begin, end) pointer range. No bounds check.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+typename StringVectorTemp<ValueT, PosT, Allocator>::range StringVectorTemp<ValueT, PosT, Allocator>::at(PosT i) const
+{
+  const ValueT* first = begin(i);
+  const ValueT* last = end(i);
+  return range(first, last);
+}
+// Same as at(i).
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+typename StringVectorTemp<ValueT, PosT, Allocator>::range StringVectorTemp<ValueT, PosT, Allocator>::operator[](PosT i) const
+{
+  const ValueT* first = begin(i);
+  const ValueT* last = end(i);
+  return range(first, last);
+}
+// View of the last string; the container must not be empty.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+typename StringVectorTemp<ValueT, PosT, Allocator>::range StringVectorTemp<ValueT, PosT, Allocator>::back() const
+{
+  const PosT last = size() - 1;
+  return at(last);
+}
+// Length of string i: distance from its start offset to the start of the
+// next string, or to the end of the character data for the last string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+PosT StringVectorTemp<ValueT, PosT, Allocator>::length(PosT i) const
+{
+  const bool hasSuccessor = i+1 < size();
+  const PosT stop = hasSuccessor ? m_positions[i+1] : size2();
+  return stop - m_positions[i];
+}
+// Pointer to the first element of string i inside the character array.
+// Computed as base pointer plus offset: indexing the vector directly at
+// the offset is out of bounds when the last string is empty, because its
+// offset then equals the size of the array. Returns a null pointer when
+// no character data is stored at all.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+const ValueT* StringVectorTemp<ValueT, PosT, Allocator>::value_ptr(PosT i) const
+{
+  if(m_charArray->empty())
+    return 0;
+  return &(*m_charArray)[0] + m_positions[i];
+}
+// Pointer to the first element of string i (delegates to value_ptr).
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+const ValueT* StringVectorTemp<ValueT, PosT, Allocator>::begin(PosT i) const
+{
+  const ValueT* first = value_ptr(i);
+  return first;
+}
+// Pointer one past the last element of string i.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+const ValueT* StringVectorTemp<ValueT, PosT, Allocator>::end(PosT i) const
+{
+  const ValueT* first = value_ptr(i);
+  return first + length(i);
+}
+// Index lookup for `s`.
+//  - sorted container: binary search; returns the index of the first
+//    string that is not less than `s`. That string is not necessarily
+//    equal to `s`, so callers have to compare the result themselves.
+//  - unsorted container: linear search; returns size() when `s` is absent.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+template <typename StringT>
+PosT StringVectorTemp<ValueT, PosT, Allocator>::find(StringT &s) const
+{
+  const iterator first = begin();
+  const iterator last = end();
+  const iterator hit = m_sorted ? std::lower_bound(first, last, s)
+                       : std::find(first, last, s);
+  return std::distance(first, hit);
+}
+// Convenience overload for C strings; forwards through std::string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+PosT StringVectorTemp<ValueT, PosT, Allocator>::find(const char* c) const
+{
+  std::string key(c);
+  return find(key);
+}
+// RangeIterator
+// Default-constructed iterator: index 0, no container attached.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::RangeIterator()
+  : m_index(0), m_container(0) { }
+// Iterator over `sv`, positioned at string `index`.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::RangeIterator(
+  StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index)
+  : m_index(index), m_container(&sv) { }
+// Index of the string the iterator currently refers to.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+PosT StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::get_index()
+{
+  return m_index;
+}
+// Pointer range covering the current string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+typename StringVectorTemp<ValueT, PosT, Allocator>::range
+StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::dereference() const
+{
+  return m_container->at(m_index);
+}
+// Two iterators are equal when they share container and index.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+bool StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::equal(
+  RangeIterator const& other) const
+{
+  if(m_container != other.m_container)
+    return false;
+  return m_index == other.m_index;
+}
+// Step to the next string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+void StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::increment()
+{
+  ++m_index;
+}
+// Step to the previous string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+void StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::decrement()
+{
+  --m_index;
+}
+// Move by n strings.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+void StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::advance(PosT n)
+{
+  m_index = m_index + n;
+}
+// Number of steps from this iterator to `other`, computed in PosT.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+PosT StringVectorTemp<ValueT, PosT, Allocator>::RangeIterator::distance_to(
+  RangeIterator const& other) const
+{
+  const PosT target = other.m_index;
+  return target - m_index;
+}
+// StringIterator
+// Default-constructed iterator: index 0, no container attached.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::StringIterator()
+  : m_index(0), m_container(0) { }
+// Iterator over `sv`, positioned at string `index`.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::StringIterator(
+  StringVectorTemp<ValueT, PosT, Allocator> &sv, PosT index)
+  : m_index(index), m_container(&sv) { }
+// Index of the string the iterator currently refers to.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+PosT StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::get_index()
+{
+  return m_index;
+}
+// Copy of the current string, produced by the range's str().
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+const std::string StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::dereference() const
+{
+  return m_container->at(m_index).str();
+}
+// Two iterators are equal when they share container and index.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+bool StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::equal(
+  StringIterator const& other) const
+{
+  if(m_container != other.m_container)
+    return false;
+  return m_index == other.m_index;
+}
+// Step to the next string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+void StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::increment()
+{
+  ++m_index;
+}
+// Step to the previous string.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+void StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::decrement()
+{
+  --m_index;
+}
+// Move by n strings.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+void StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::advance(PosT n)
+{
+  m_index = m_index + n;
+}
+// Number of steps from this iterator to `other`, computed in PosT.
+template<typename ValueT, typename PosT, template <typename> class Allocator>
+PosT StringVectorTemp<ValueT, PosT, Allocator>::StringIterator::distance_to(
+  StringIterator const& other) const
+{
+  const PosT target = other.m_index;
+  return target - m_index;
+}
+// ********** Some typedefs **********
+// Byte strings with `unsigned int` offsets and indices.
+typedef StringVectorTemp<unsigned char, unsigned int> MediumStringVectorTemp;
+// Byte strings with `unsigned long` offsets and indices.
+typedef StringVectorTemp<unsigned char, unsigned long> LongStringVectorTemp;
+}
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h ADDED Viewed

	@@ -0,0 +1,163 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_TargetPhraseCollectionCache_h
+#define moses_TargetPhraseCollectionCache_h
+#include <map>
+#include <set>
+#include <vector>
+#include <boost/thread/tss.hpp>
+#include <boost/shared_ptr.hpp>
+#include "moses/Phrase.h"
+#include "moses/TargetPhraseCollection.h"
+namespace Moses
+{
+// Avoid using new due to locking
+// Target phrases are held by value in a plain vector ...
+typedef std::vector<TargetPhrase> TargetPhraseVector;
+// ... and the vector itself is shared through a reference-counted pointer.
+typedef boost::shared_ptr<TargetPhraseVector> TargetPhraseVectorPtr;
+/** Implementation of Persistent Cache **/
+class TargetPhraseCollectionCache
+{
+private:
+  size_t m_max;
+  float m_tolerance;
+  struct LastUsed {
+    clock_t m_clock;
+    TargetPhraseVectorPtr m_tpv;
+    size_t m_bitsLeft;
+    LastUsed() : m_clock(0), m_bitsLeft(0) {}
+    LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0)
+      : m_clock(clock), m_tpv(tpv), m_bitsLeft(bitsLeft) {}
+  };
+  typedef std::map<Phrase, LastUsed> CacheMap;
+  static boost::thread_specific_ptr<CacheMap> m_phraseCache;
+public:
+  typedef CacheMap::iterator iterator;
+  typedef CacheMap::const_iterator const_iterator;
+  TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2)
+    : m_max(max), m_tolerance(tolerance) {
+  }
+  iterator Begin() {
+    if(!m_phraseCache.get())
+      m_phraseCache.reset(new CacheMap());
+    return m_phraseCache->begin();
+  }
+  const_iterator Begin() const {
+    if(!m_phraseCache.get())
+      m_phraseCache.reset(new CacheMap());
+    return m_phraseCache->begin();
+  }
+  iterator End() {
+    if(!m_phraseCache.get())
+      m_phraseCache.reset(new CacheMap());
+    return m_phraseCache->end();
+  }
+  const_iterator End() const {
+    if(!m_phraseCache.get())
+      m_phraseCache.reset(new CacheMap());
+    return m_phraseCache->end();
+  }
+  /** retrieve translations for source phrase from persistent cache **/
+  void Cache(const Phrase &sourcePhrase, TargetPhraseVectorPtr tpv,
+             size_t bitsLeft = 0, size_t maxRank = 0) {
+    if(!m_phraseCache.get())
+      m_phraseCache.reset(new CacheMap());
+    // check if source phrase is already in cache
+    iterator it = m_phraseCache->find(sourcePhrase);
+    if(it != m_phraseCache->end())
+      // if found, just update clock
+      it->second.m_clock = clock();
+    else {
+      // else, add to cache
+      if(maxRank && tpv->size() > maxRank) {
+        TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector());
+        tpv_temp->resize(maxRank);
+        std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin());
+        (*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft);
+      } else
+        (*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft);
+    }
+  }
+  std::pair<TargetPhraseVectorPtr, size_t> Retrieve(const Phrase &sourcePhrase) {
+    if(!m_phraseCache.get())
+      m_phraseCache.reset(new CacheMap());
+    iterator it = m_phraseCache->find(sourcePhrase);
+    if(it != m_phraseCache->end()) {
+      LastUsed &lu = it->second;
+      lu.m_clock = clock();
+      return std::make_pair(lu.m_tpv, lu.m_bitsLeft);
+    } else
+      return std::make_pair(TargetPhraseVectorPtr(), 0);
+  }
+  // if cache full, reduce
+  void Prune() {
+    if(!m_phraseCache.get())
+      m_phraseCache.reset(new CacheMap());
+    if(m_phraseCache->size() > m_max * (1 + m_tolerance)) {
+      typedef std::set<std::pair<clock_t, Phrase> > Cands;
+      Cands cands;
+      for(CacheMap::iterator it = m_phraseCache->begin();
+          it != m_phraseCache->end(); it++) {
+        LastUsed &lu = it->second;
+        cands.insert(std::make_pair(lu.m_clock, it->first));
+      }
+      for(Cands::iterator it = cands.begin(); it != cands.end(); it++) {
+        const Phrase& p = it->second;
+        m_phraseCache->erase(p);
+        if(m_phraseCache->size() < (m_max * (1 - m_tolerance)))
+          break;
+      }
+    }
+  }
+  void CleanUp() {
+    if(!m_phraseCache.get())
+      m_phraseCache.reset(new CacheMap());
+    m_phraseCache->clear();
+  }
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp ADDED Viewed

	@@ -0,0 +1,30 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include "ThrowingFwrite.h"
+/**
+ * std::fwrite wrapper that reports short writes through UTIL_THROW_IF2
+ * instead of returning a smaller count.
+ *
+ * @param ptr     data to write
+ * @param size    size of one element in bytes; asserted to be non-zero
+ * @param count   number of elements to write
+ * @param stream  destination stream
+ * @return        the number of elements written, which equals `count`
+ */
+size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream)
+{
+  assert(size);
+  size_t returnValue = std::fwrite(ptr, size, count, stream);
+  // Fewer elements written than requested means the write failed part way.
+  UTIL_THROW_IF2(count != returnValue, "Short fwrite; requested size " << size);
+  return returnValue;
+}

mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.h ADDED Viewed

	@@ -0,0 +1,31 @@

+// $Id$
+// vim:tabstop=2
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#ifndef moses_ThrowingFwrite_h
+#define moses_ThrowingFwrite_h
+#include <cassert>
+#include <cstdio>
+#include "util/exception.hh"
+size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream); // fwrite() wrapper; the definition in ThrowingFwrite.cpp throws when fewer than `count` items are written
+#endif

mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.cpp ADDED Viewed

	@@ -0,0 +1,65 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include "LoaderFactory.h"
+#include "moses/Util.h"
+#include "moses/InputFileStream.h"
+#include "LoaderCompact.h"
+#include "LoaderHiero.h"
+#include "LoaderStandard.h"
+#include <sstream>
+#include <iostream>
+using namespace std;
+namespace Moses
+{
+// Determines the rule table type by peeking at the first line of the file,
+// then creates a suitable RuleTableLoader object:
+//   - exactly one token equal to "1"         -> RuleTableLoaderCompact
+//   - exactly one token with any other value -> unsupported; a message is
+//                                               written to std::cerr and an
+//                                               empty auto_ptr is returned
+//   - first two tokens are "[X]" and "|||"   -> RuleTableLoaderHiero
+//   - anything else, including a file with no
+//     first line or a first line that yields
+//     no tokens                              -> RuleTableLoaderStandard
+std::auto_ptr<RuleTableLoader>
+RuleTableLoaderFactory::
+Create(const std::string &path)
+{
+  InputFileStream input(path);
+  std::string line;
+  if (!std::getline(input, line)) {
+    // empty phrase table
+    return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderStandard());
+  }
+  std::vector<std::string> tokens;
+  Tokenize(tokens, line);
+  if (tokens.size() == 1) {
+    if (tokens[0] == "1") {
+      return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderCompact());
+    }
+    std::cerr << "Unsupported compact rule table format: " << tokens[0];
+    return std::auto_ptr<RuleTableLoader>();
+  }
+  // Only index tokens[0] / tokens[1] when both exist; a first line that
+  // produces fewer than two tokens must not be read out of bounds.
+  if (tokens.size() >= 2 && tokens[0] == "[X]" && tokens[1] == "|||") {
+    return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderHiero());
+  }
+  return std::auto_ptr<RuleTableLoader>(new RuleTableLoaderStandard());
+}
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.cpp ADDED Viewed

	@@ -0,0 +1,33 @@

+//
+//  RuleTableLoaderHiero.cpp
+//  moses
+//
+//  Created by Hieu Hoang on 04/11/2011.
+//  Copyright 2011 __MyCompanyName__. All rights reserved.
+//
+#include <iostream>
+#include "LoaderHiero.h"
+using namespace std;
+namespace Moses
+{
+// Loads a Hiero-format rule table by forwarding every argument to
+// RuleTableLoaderStandard::Load with the format fixed to HieroFormat.
+// Returns whatever that call returns.
+bool RuleTableLoaderHiero::Load(AllOptions const& opts,
+                                const std::vector<FactorType> &input,
+                                const std::vector<FactorType> &output,
+                                const std::string &inFile,
+                                size_t tableLimit,
+                                RuleTableTrie &ruleTable)
+{
+  return RuleTableLoaderStandard::Load(opts, HieroFormat,
+                                       input, output,
+                                       inFile,
+                                       tableLimit,
+                                       ruleTable);
+}
+}

mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.cpp ADDED Viewed

	@@ -0,0 +1,260 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#include "LoaderStandard.h"
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+#include <iostream>
+#include <sys/stat.h>
+#include <cstdlib>
+#include <boost/algorithm/string/predicate.hpp>
+#include "Trie.h"
+#include "moses/FactorCollection.h"
+#include "moses/Word.h"
+#include "moses/Util.h"
+#include "moses/InputFileStream.h"
+#include "moses/StaticData.h"
+#include "moses/Range.h"
+#include "moses/ChartTranslationOptionList.h"
+#include "moses/FactorCollection.h"
+#include "util/file_piece.hh"
+#include "util/string_piece.hh"
+#include "util/tokenize_piece.hh"
+#include "util/double-conversion/double-conversion.h"
+#include "util/exception.hh"
+using namespace std;
+using namespace boost::algorithm;
+namespace Moses
+{
+// Convenience overload: loads `inFile` into `ruleTable` using MosesFormat by
+// delegating to the overload that takes an explicit FormatType.
+bool RuleTableLoaderStandard::Load(AllOptions const& opts,
+                                   const std::vector<FactorType> &input,
+                                   const std::vector<FactorType> &output,
+                                   const std::string &inFile,
+                                   size_t tableLimit,
+                                   RuleTableTrie &ruleTable)
+{
+  const FormatType format = MosesFormat;
+  return Load(opts, format, input, output, inFile, tableLimit, ruleTable);
+}
+// Rewrites one side of a Hiero rule in place.
+// The phrase is split on spaces. Every token that starts with "[" and ends
+// with "]" is treated as a non-terminal: it is split on ",", the first part
+// is rewritten as "[X]" + part + "]", and the second part is parsed as the
+// co-index. The token's position is stored in ntAlign[co-index], as .first
+// when sourceTarget == 0 and as .second otherwise. Finally the tokens are
+// re-joined with spaces and " [X]" is appended.
+// Throws when a non-terminal does not split into exactly two parts.
+void ReformatHieroRule(int sourceTarget, string &phrase, map<size_t, pair<size_t, size_t> > &ntAlign)
+{
+  vector<string> words;
+  Tokenize(words, phrase, " ");
+  const bool isSourceSide = (sourceTarget == 0);
+  for (size_t pos = 0; pos < words.size(); ++pos) {
+    string &word = words[pos];
+    const bool isNonTerm = starts_with(word, "[") && ends_with(word, "]");
+    if (!isNonTerm) {
+      continue; // terminals are left untouched
+    }
+    vector<string> parts = Tokenize(word, ",");
+    UTIL_THROW_IF2(parts.size() != 2,
+                   "Incorrectly formmatted non-terminal: " << word);
+    word = "[X]" + parts[0] + "]";
+    const size_t coIndex = Scan<size_t>(parts[1]);
+    pair<size_t, size_t> &point = ntAlign[coIndex];
+    if (isSourceSide) {
+      point.first = pos;
+    } else {
+      point.second = pos;
+    }
+  }
+  phrase = Join(" ", words) + " [X]";
+}
+// Converts a Hiero score string in place. Each space-separated token must
+// split on "=" into exactly two parts (name and value); the token is replaced
+// by the printed value of exp(-value). The converted tokens are re-joined
+// with single spaces. Throws on a token that does not have that shape.
+void ReformateHieroScore(string &scoreString)
+{
+  vector<string> fields;
+  Tokenize(fields, scoreString, " ");
+  for (vector<string>::iterator field = fields.begin(); field != fields.end(); ++field) {
+    vector<string> nameValue = Tokenize(*field, "=");
+    UTIL_THROW_IF2(nameValue.size() != 2,
+                   "Incorrectly formatted score: " << *field);
+    float value = Scan<float>(nameValue[1]);
+    value = exp(-value);
+    *field = SPrint(value);
+  }
+  scoreString = Join(" ", fields);
+}
+// Converts one line of a Hiero-format rule table into the Moses text format.
+// The line is split on "|||"; fields 1, 2 and 3 are taken as the source
+// phrase, target phrase and score string (field 0 is not used here). Both
+// phrases and the scores are rewritten by the helpers above, and `out`
+// receives "<source> ||| <target> ||| <scores> ||| <alignments>", where the
+// alignments are the "src-tgt " pairs collected for the non-terminals.
+// Throws when the line has fewer than four fields.
+void ReformatHieroRule(const string &lineOrig, string &out)
+{
+  vector<string> tokens;
+  TokenizeMultiCharSeparator(tokens, lineOrig, "|||" );
+  // tokens[1..3] are indexed below, so reject short lines instead of
+  // reading past the end of the vector.
+  UTIL_THROW_IF2(tokens.size() < 4,
+                 "Incorrectly formatted Hiero rule (expected at least 4 fields): " << lineOrig);
+  string &sourcePhraseString = tokens[1]
+                               , &targetPhraseString = tokens[2]
+                                   , &scoreString        = tokens[3];
+  map<size_t, pair<size_t, size_t> > ntAlign;
+  ReformatHieroRule(0, sourcePhraseString, ntAlign);
+  ReformatHieroRule(1, targetPhraseString, ntAlign);
+  ReformateHieroScore(scoreString);
+  util::StringStream align;
+  map<size_t, pair<size_t, size_t> >::const_iterator iterAlign;
+  for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign) {
+    const pair<size_t, size_t> &alignPoint = iterAlign->second;
+    align << alignPoint.first << "-" << alignPoint.second << " ";
+  }
+  util::StringStream ret;
+  ret << sourcePhraseString << " ||| "
+      << targetPhraseString << " ||| "
+      << scoreString << " ||| "
+      << align.str();
+  out = ret.str();
+}
+// Loads a text rule table from `inFile` into `ruleTable`.
+//
+// Every line is split on "|||" into source phrase, target phrase and scores,
+// optionally followed by alignment, a counts field (skipped), sparse scores
+// and properties. When `format` is HieroFormat the line is first rewritten
+// with ReformatHieroRule. Lines whose source side is blank are skipped unless
+// opts.unk.word_deletion_enabled is set. The table-limit argument is unused.
+//
+// Throws when a score parses to NaN or when the number of scores differs
+// from ruleTable.GetNumScoreComponents(). Returns true after all lines have
+// been read and SortAndPrune(ruleTable) has run.
+bool RuleTableLoaderStandard::Load(AllOptions const& opts, FormatType format
+                                   , const std::vector<FactorType> &input
+                                   , const std::vector<FactorType> &output
+                                   , const std::string &inFile
+                                   , size_t /* tableLimit */
+                                   , RuleTableTrie &ruleTable)
+{
+  PrintUserTime(string("Start loading text phrase table. ") + (format==MosesFormat?"Moses":"Hiero") + " format");
+  // index of the line being processed; used in diagnostics
+  size_t count = 0;
+  std::ostream *progress = NULL;
+  IFVERBOSE(1) progress = &std::cerr;
+  util::FilePiece in(inFile.c_str(), progress);
+  // reused variables
+  vector<float> scoreVector;
+  StringPiece line;
+  std::string hiero_before, hiero_after;
+  // empty and junk strings both convert to NaN, which is rejected below
+  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");
+  while(true) {
+    try {
+      line = in.ReadLine();
+    } catch (const util::EndOfFileException &e) {
+      break;
+    }
+    if (format == HieroFormat) { // inefficiently reformat line
+      hiero_before.assign(line.data(), line.size());
+      ReformatHieroRule(hiero_before, hiero_after);
+      line = hiero_after;
+    }
+    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
+    StringPiece sourcePhraseString(*pipes);
+    StringPiece targetPhraseString(*++pipes);
+    StringPiece scoreString(*++pipes);
+    StringPiece alignString;
+    if (++pipes) {
+      StringPiece temp(*pipes);
+      alignString = temp;
+    }
+    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
+    if (isLHSEmpty && !opts.unk.word_deletion_enabled) {
+      // It is the source side that is tested here, so say so in the message.
+      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty source, skipping\n");
+      // Skipped lines still count, otherwise later line numbers drift.
+      ++count;
+      continue;
+    }
+    scoreVector.clear();
+    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
+      int processed;
+      float score = converter.StringToFloat(s->data(), s->length(), &processed);
+      UTIL_THROW_IF2(isnan(score), "Bad score " << *s << " on line " << count);
+      scoreVector.push_back(FloorScore(TransformScore(score)));
+    }
+    const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
+    if (scoreVector.size() != numScoreComponents) {
+      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
+                  << numScoreComponents << ") of score components on line " << count);
+    }
+    // parse source & find pt node
+    // constituent labels
+    Word *sourceLHS = NULL;
+    Word *targetLHS = NULL;
+    // create target phrase obj
+    TargetPhrase *targetPhrase = new TargetPhrase(&ruleTable);
+    targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
+    // source
+    Phrase sourcePhrase;
+    sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);
+    // rest of target phrase
+    targetPhrase->SetAlignmentInfo(alignString);
+    targetPhrase->SetTargetLHS(targetLHS);
+    ++pipes;  // skip over counts field
+    if (++pipes) {
+      StringPiece sparseString(*pipes);
+      targetPhrase->SetSparseScore(&ruleTable, sparseString);
+    }
+    if (++pipes) {
+      StringPiece propertiesString(*pipes);
+      targetPhrase->SetProperties(propertiesString);
+    }
+    targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
+    targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply());
+    TargetPhraseCollection::shared_ptr phraseColl
+    = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase,
+                                        *targetPhrase, sourceLHS);
+    phraseColl->Add(targetPhrase);
+    // not implemented correctly in memory pt. just delete it for now
+    delete sourceLHS;
+    count++;
+  }
+  // sort and prune each target phrase collection
+  SortAndPrune(ruleTable);
+  return true;
+}
+}

mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp ADDED Viewed

	@@ -0,0 +1,417 @@

+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <climits>
+#include <sys/types.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+#include "Loader.h"
+#include "LoaderFactory.h"
+#include "PhraseDictionaryFuzzyMatch.h"
+#include "moses/FactorCollection.h"
+#include "moses/Word.h"
+#include "moses/Util.h"
+#include "moses/InputFileStream.h"
+#include "moses/StaticData.h"
+#include "moses/Range.h"
+#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h"
+#include "moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h"
+#include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h"
+#include "moses/TranslationTask.h"
+#include "util/file.hh"
+#include "util/exception.hh"
+#include "util/random.hh"
+using namespace std;
+#if defined __MINGW32__ && !defined mkdtemp
+#include <windows.h>
+#include <cerrno>
+// Minimal mkdtemp() replacement for MinGW builds.
+// `tempbuf` must end in "XXXXXX"; otherwise errno is set to EINVAL and NULL
+// is returned. The last path component of the template, with the X's
+// replaced by a random number, is created as a directory under the location
+// reported by GetTempPath(). On success `tempbuf` holds the created path and
+// is returned; NULL is returned (with errno set) when the directory could not
+// be created.
+// NOTE(review): GetTempPath() is told it may write MAX_PATH characters into
+// `tempbuf` and strcat() then appends to it, so the caller's buffer has to be
+// large enough for the complete temporary path. A buffer sized only for the
+// template is too small; confirm the call sites.
+char *mkdtemp(char *tempbuf)
+{
+  int rand_value = 0;
+  char* tempbase = NULL;
+  char tempbasebuf[MAX_PATH] = "";
+  // Check the length first: strlen(tempbuf)-6 wraps around for short strings.
+  const size_t templateLen = strlen(tempbuf);
+  if (templateLen < 6 || strcmp(&tempbuf[templateLen-6], "XXXXXX")) {
+    errno = EINVAL;
+    return NULL;
+  }
+  util::rand_init();
+  rand_value = util::rand_excl(1e6);
+  tempbase = strrchr(tempbuf, '/');
+  tempbase = tempbase ? tempbase+1 : tempbuf;
+  // The base name is copied into a fixed-size buffer; refuse names that do not fit.
+  if (strlen(tempbase) >= MAX_PATH) {
+    errno = ENAMETOOLONG;
+    return NULL;
+  }
+  strcpy(tempbasebuf, tempbase);
+  sprintf(&tempbasebuf[strlen(tempbasebuf)-6], "%d", rand_value);
+  ::GetTempPath(MAX_PATH, tempbuf);
+  strcat(tempbuf, tempbasebuf);
+  // Callers test the result against NULL, so a failed creation must not be
+  // reported as success.
+  if (!::CreateDirectory(tempbuf, NULL)) {
+    errno = (::GetLastError() == ERROR_ALREADY_EXISTS) ? EEXIST : EACCES;
+    return NULL;
+  }
+  return tempbuf;
+}
+#endif
+namespace Moses
+{
+// Constructs the dictionary from its configuration line. m_config starts
+// with three entries (SetParameter fills them from the "source", "target"
+// and "alignment" keys) and the fuzzy-match wrapper stays NULL until Load().
+PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line)
+  : PhraseDictionary(line, true),
+    m_config(3),
+    m_FuzzyMatchWrapper(NULL)
+{
+  ReadParameters();
+}
+PhraseDictionaryFuzzyMatch::~PhraseDictionaryFuzzyMatch()
+{
+  delete m_FuzzyMatchWrapper; // allocated with new in Load(); still NULL (so this is a no-op) when Load() was never called
+}
+void PhraseDictionaryFuzzyMatch::Load(AllOptions::ptr const& opts)
+{
+  m_options = opts; // remember the options handed in by the caller
+  SetFeaturesToApply();
+  m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]); // m_config[0..2] hold the "source", "target" and "alignment" parameters (see SetParameter); released in the destructor. NOTE(review): a second call would overwrite the pointer without deleting the previous wrapper.
+}
+// Creates a new per-sentence in-memory rule lookup manager bound to this
+// dictionary. The object is allocated with new and returned as a raw pointer.
+// The max-chart-span argument is not used.
+ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager(
+  const ChartParser &parser,
+  const ChartCellCollectionBase &cellCollection,
+  std::size_t /*maxChartSpan*/)
+{
+  ChartRuleLookupManager *manager =
+    new ChartRuleLookupManagerMemoryPerSentence(parser, cellCollection, *this);
+  return manager;
+}
+// Stores the "source", "target" and "alignment" parameters in m_config[0],
+// m_config[1] and m_config[2] respectively; every other key is passed on to
+// PhraseDictionary::SetParameter.
+void
+PhraseDictionaryFuzzyMatch::
+SetParameter(const std::string& key, const std::string& value)
+{
+  if (key == "source") {
+    m_config[0] = value;
+    return;
+  }
+  if (key == "target") {
+    m_config[1] = value;
+    return;
+  }
+  if (key == "alignment") {
+    m_config[2] = value;
+    return;
+  }
+  PhraseDictionary::SetParameter(key, value);
+}
+// Recursively deletes `dirname` and everything beneath it.
+// Returns 0 when the directory cannot be opened (after reporting through
+// perror) and 1 otherwise; failures to delete individual entries are not
+// reported. On MinGW builds the body is empty and 1 is returned.
+int removedirectoryrecursively(const char *dirname)
+{
+#if defined __MINGW32__
+  //TODO(jie): replace this function with boost implementation
+#else
+  DIR *dir;
+  struct dirent *entry;
+  char path[PATH_MAX];
+  dir = opendir(dirname);
+  if (dir == NULL) {
+    perror("Error opendir()");
+    return 0;
+  }
+  while ((entry = readdir(dir)) != NULL) {
+    if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) {
+      int written = snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name);
+      if (written < 0 || written >= PATH_MAX) {
+        // The full path did not fit. Skip the entry rather than delete
+        // whatever the truncated name happens to refer to.
+        continue;
+      }
+      if (entry->d_type == DT_DIR) {
+        removedirectoryrecursively(path);
+      }
+      // Deletes plain files. For a directory the recursive call above has
+      // already tried to remove it, so a failure here is ignored.
+      remove(path);
+    }
+  }
+  closedir(dir);
+  // The entries have been processed; finally delete the directory itself.
+  rmdir(dirname);
+#endif
+  return 1;
+}
+// Builds the per-sentence rule table for the sentence carried by `ttask`.
+//
+// Steps: create a temporary directory, write the input words to "<dir>/in",
+// let the fuzzy-match wrapper Extract() a phrase table file for this
+// translation id, then parse that file (text format with 4 or 5
+// "|||"-separated fields: source, target, scores, alignment) into
+// m_collection[translationId] and sort/prune the result.
+//
+// Throws when the temporary directory cannot be created, when a line does
+// not have 4 or 5 fields, or when the number of scores differs from
+// GetNumScoreComponents().
+// NOTE(review): the temporary directory is not removed afterwards; the
+// cleanup call at the end of the function is commented out.
+void PhraseDictionaryFuzzyMatch::InitializeForInput(ttasksptr const& ttask)
+{
+  InputType const& inputSentence = *ttask->GetSource();
+#if defined __MINGW32__
+  char dirName[] = "moses.XXXXXX";
+#else
+  char dirName[] = "/tmp/moses.XXXXXX";
+#endif // defined
+  char *temp = mkdtemp(dirName);
+  UTIL_THROW_IF2(temp == NULL,
+                 "Couldn't create temporary directory " << dirName);
+  string dirNameStr(dirName);
+  string inFileName(dirNameStr + "/in");
+  ofstream inFile(inFileName.c_str());
+  // The first and last positions are not written. "i + 1 < size" expresses
+  // the same range as "i < size - 1" without wrapping around when size is 0.
+  for (size_t i = 1; i + 1 < inputSentence.GetSize(); ++i) {
+    inFile << inputSentence.GetWord(i);
+  }
+  inFile << endl;
+  inFile.close();
+  long translationId = inputSentence.GetTranslationId();
+  string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);
+  // populate with rules for this sentence
+  PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
+  FormatType format = MosesFormat;
+  // data from file
+  InputFileStream inStream(ptFileName);
+  // copied from class LoaderStandard
+  PrintUserTime("Start loading fuzzy-match phrase model");
+  string lineOrig;
+  // index of the line being processed; used in diagnostics
+  size_t count = 0;
+  while(getline(inStream, lineOrig)) {
+    if (format == HieroFormat) {
+      UTIL_THROW(util::Exception, "Cannot be Hiero format");
+    }
+    // Moses format: the line is used exactly as read.
+    const string &line = lineOrig;
+    vector<string> tokens;
+    vector<float> scoreVector;
+    TokenizeMultiCharSeparator(tokens, line , "|||" );
+    if (tokens.size() != 4 && tokens.size() != 5) {
+      UTIL_THROW2("Syntax error at " << ptFileName << ":" << count);
+    }
+    const string &sourcePhraseString = tokens[0]
+                                       , &targetPhraseString = tokens[1]
+                                           , &scoreString        = tokens[2]
+                                               , &alignString        = tokens[3];
+    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
+    if (isLHSEmpty && !ttask->options()->unk.word_deletion_enabled) {
+      // It is the source side that is tested here, so say so in the message.
+      TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty source, skipping\n");
+      // Skipped lines still count, otherwise later line numbers drift.
+      ++count;
+      continue;
+    }
+    Tokenize<float>(scoreVector, scoreString);
+    const size_t numScoreComponents = GetNumScoreComponents();
+    if (scoreVector.size() != numScoreComponents) {
+      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
+                  << numScoreComponents << ") of score components on line " << count);
+    }
+    // parse source & find pt node
+    // constituent labels
+    Word *sourceLHS = NULL;
+    Word *targetLHS = NULL;
+    // source
+    Phrase sourcePhrase( 0);
+    sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS);
+    // create target phrase obj
+    TargetPhrase *targetPhrase = new TargetPhrase(this);
+    targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS);
+    // rest of target phrase
+    targetPhrase->SetAlignmentInfo(alignString);
+    targetPhrase->SetTargetLHS(targetLHS);
+    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
+    // component score, for n-best output
+    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
+    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);
+    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
+    targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply());
+    TargetPhraseCollection::shared_ptr phraseColl
+    = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase,
+                                        *targetPhrase, sourceLHS);
+    phraseColl->Add(targetPhrase);
+    // GetOrCreateNode does not keep sourceLHS, so release it here instead of
+    // leaking one Word per rule.
+    delete sourceLHS;
+    count++;
+  }
+  // sort and prune each target phrase collection
+  SortAndPrune(rootNode);
+  //removedirectoryrecursively(dirName);
+}
+// Returns the target phrase collection stored at the trie node for `source`
+// under `rootNode`; the node is looked up (or created) by GetOrCreateNode.
+TargetPhraseCollection::shared_ptr
+PhraseDictionaryFuzzyMatch::
+GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode
+                                  , const Phrase &source
+                                  , const TargetPhrase &target
+                                  , const Word *sourceLHS)
+{
+  return GetOrCreateNode(rootNode, source, target, sourceLHS).GetTargetPhraseCollection();
+}
+// Walks the trie under `rootNode` along the words of `source`, creating
+// nodes as needed, and returns the node that is reached.
+// Terminals descend through GetOrCreateChild(word). Each non-terminal
+// consumes the next entry of target.GetAlignNonTerm(): that entry must exist
+// and its source index must equal the current position; the aligned target
+// word is then used to pick the child.
+// `sourceLHS` is currently not used.
+// Throws when an alignment entry is missing or mismatched, or when a child
+// lookup yields NULL.
+PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode
+    , const Phrase &source
+    , const TargetPhrase &target
+    , const Word *sourceLHS)
+{
+  // Debug dump of every rule; only emitted at high verbosity so that normal
+  // runs do not flood stderr with one pair of lines per rule.
+  IFVERBOSE(3) cerr << source << endl << target << endl;
+  const size_t size = source.GetSize();
+  const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm();
+  AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin();
+  PhraseDictionaryNodeMemory *currNode = &rootNode;
+  for (size_t pos = 0 ; pos < size ; ++pos) {
+    const Word& word = source.GetWord(pos);
+    if (word.IsNonTerminal()) {
+      // indexed by source label 1st
+      const Word &sourceNonTerm = word;
+      UTIL_THROW_IF2(iterAlign == alignmentInfo.end(),
+                     "No alignment for non-term at position " << pos);
+      UTIL_THROW_IF2(iterAlign->first != pos,
+                     "Alignment info incorrect at position " << pos);
+      size_t targetNonTermInd = iterAlign->second;
+      ++iterAlign;
+      const Word &targetNonTerm = target.GetWord(targetNonTermInd);
+#if defined(UNLABELLED_SOURCE)
+      currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm);
+#else
+      currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm);
+#endif
+    } else {
+      currNode = currNode->GetOrCreateChild(word);
+    }
+    UTIL_THROW_IF2(currNode == NULL,
+                   "Node not found at position " << pos);
+  }
+  // finally, the source LHS
+  //currNode = currNode->GetOrCreateChild(sourceLHS);
+  return *currNode;
+}
+// Sorts the trie under `rootNode` with the configured table limit.
+// Nothing is done when the table limit is zero.
+void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode)
+{
+  if (!GetTableLimit()) {
+    return;
+  }
+  rootNode.Sort(GetTableLimit());
+}
+// Drops the per-sentence rule trie stored for this input's translation id.
+void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source)
+{
+  const long translationId = source.GetTranslationId();
+  m_collection.erase(translationId);
+}
+// Returns the root trie node stored for `translationId`.
+// Throws when no entry exists for that id.
+const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(long translationId) const
+{
+  typedef std::map<long, PhraseDictionaryNodeMemory>::const_iterator ConstIter;
+  const ConstIter found = m_collection.find(translationId);
+  UTIL_THROW_IF2(found == m_collection.end(),
+                 "Couldn't find root node for input: " << translationId);
+  return found->second;
+}
+// Returns the mutable root trie node stored for the translation id of
+// `source`. Throws when no entry exists for that id.
+PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source)
+{
+  typedef std::map<long, PhraseDictionaryNodeMemory>::iterator Iter;
+  long transId = source.GetTranslationId();
+  const Iter found = m_collection.find(transId);
+  UTIL_THROW_IF2(found == m_collection.end(),
+                 "Couldn't find root node for input: " << transId);
+  return found->second;
+}
+TO_STRING_BODY(PhraseDictionaryFuzzyMatch);
+// friend: stream-output operator. The dump code in the body is commented out, so this currently writes nothing and returns `out` unchanged.
+ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict)
+{
+  /*
+  typedef PhraseDictionaryNodeMemory::TerminalMap TermMap;
+  typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap;
+  const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection;
+  for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) {
+    const Word &sourceNonTerm = p->first.first;
+    out << sourceNonTerm;
+  }
+  for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) {
+    const Word &sourceTerm = p->first;
+    out << sourceTerm;
+  }
+   */
+  return out;
+}
+}

mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp ADDED Viewed

	@@ -0,0 +1,398 @@

+// vim:tabstop=2
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 Hieu Hoang
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "PhraseDictionaryOnDisk.h"
+#include "moses/InputFileStream.h"
+#include "moses/StaticData.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/InputPath.h"
+#include "moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h"
+#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h"
+#include "moses/TranslationTask.h"
+#include "OnDiskPt/OnDiskWrapper.h"
+#include "OnDiskPt/Word.h"
+#include "util/tokenize_piece.hh"
+using namespace std;
+namespace Moses
+{
+// Constructs the dictionary from its configuration line. Both maximum-span
+// members start out as NOT_FOUND before the parameters are read.
+PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line)
+  : MyBase(line, true),
+    m_maxSpanDefault(NOT_FOUND),
+    m_maxSpanLabelled(NOT_FOUND)
+{
+  ReadParameters();
+}
+PhraseDictionaryOnDisk::~PhraseDictionaryOnDisk()
+{ // empty body: no explicit cleanup is performed here
+}
+void PhraseDictionaryOnDisk::Load(AllOptions::ptr const& opts)
+{
+  m_options = opts; // only records the options; the on-disk table itself is opened in InitializeForInput()
+  SetFeaturesToApply();
+}
+// Creates a new on-disk rule lookup manager bound to this dictionary and to
+// the wrapper returned by GetImplementation() (which throws when no wrapper
+// has been created yet). The object is allocated with new and returned as a
+// raw pointer. The max-chart-span argument is not used.
+ChartRuleLookupManager *PhraseDictionaryOnDisk::CreateRuleLookupManager(
+  const ChartParser &parser,
+  const ChartCellCollectionBase &cellCollection,
+  std::size_t /*maxChartSpan*/)
+{
+  OnDiskPt::OnDiskWrapper &wrapper = GetImplementation();
+  return new ChartRuleLookupManagerOnDisk(parser, cellCollection, *this,
+                                          wrapper,
+                                          m_input,
+                                          m_output);
+}
+// Returns the wrapper currently held by m_implementation.
+// Throws when none has been set yet.
+OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation()
+{
+  OnDiskPt::OnDiskWrapper *const wrapper = m_implementation.get();
+  UTIL_THROW_IF2(wrapper == NULL, "Dictionary object not yet created for this thread");
+  return *wrapper;
+}
+// Const overload: returns the wrapper currently held by m_implementation as
+// a const reference. Throws when none has been set yet.
+const OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation() const
+{
+  OnDiskPt::OnDiskWrapper *const wrapper = m_implementation.get();
+  UTIL_THROW_IF2(wrapper == NULL, "Dictionary object not yet created for this thread");
+  return *wrapper;
+}
+// Prepares the dictionary for a new input: trims the cache, opens a fresh
+// OnDiskWrapper on m_filePath, checks that its stored version, factor counts
+// and score count match what this dictionary was configured with, and then
+// installs it in m_implementation.
+// Throws when any of those checks fails; in that case the new wrapper is
+// deleted and m_implementation is left untouched. `ttask` is not used.
+void PhraseDictionaryOnDisk::InitializeForInput(ttasksptr const& ttask)
+{
+  ReduceCache();
+  OnDiskPt::OnDiskWrapper *obj = new OnDiskPt::OnDiskWrapper();
+  try {
+    obj->BeginLoad(m_filePath);
+    UTIL_THROW_IF2(obj->GetMisc("Version") != OnDiskPt::OnDiskWrapper::VERSION_NUM,
+                   "On-disk phrase table is version " <<  obj->GetMisc("Version")
+                   << ". It is not compatible with version " << OnDiskPt::OnDiskWrapper::VERSION_NUM);
+    UTIL_THROW_IF2(obj->GetMisc("NumSourceFactors") != m_input.size(),
+                   "On-disk phrase table has " <<  obj->GetMisc("NumSourceFactors") << " source factors."
+                   << ". The ini file specified " << m_input.size() << " source factors");
+    UTIL_THROW_IF2(obj->GetMisc("NumTargetFactors") != m_output.size(),
+                   "On-disk phrase table has " <<  obj->GetMisc("NumTargetFactors") << " target factors."
+                   << ". The ini file specified " << m_output.size() << " target factors");
+    UTIL_THROW_IF2(obj->GetMisc("NumScores") != m_numScoreComponents,
+                   "On-disk phrase table has " <<  obj->GetMisc("NumScores") << " scores."
+                   << ". The ini file specified " << m_numScoreComponents << " scores");
+  } catch (...) {
+    // Nothing owns the wrapper until reset() below, so release it before
+    // letting the error propagate.
+    delete obj;
+    throw;
+  }
+  m_implementation.reset(obj);
+}
+// Looks up target phrases for every input path in the queue, in order, and
+// afterwards deletes the phrase-table node that was saved on each path.
+void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const
+{
+  typedef InputPathList::const_iterator PathIter;
+  // first pass: per-path lookup
+  for (PathIter path = inputPathQueue.begin(); path != inputPathQueue.end(); ++path) {
+    InputPath &current = **path;
+    GetTargetPhraseCollectionBatch(current);
+  }
+  // second pass: delete nodes that's been saved
+  for (PathIter path = inputPathQueue.begin(); path != inputPathQueue.end(); ++path) {
+    InputPath &current = **path;
+    delete static_cast<const OnDiskPt::PhraseNode*>(current.GetPtNode(*this));
+  }
+}
+// Looks up the target phrases for a single input path.
+// The parent node is the node saved on the previous path, or the root source
+// node when there is no previous path. If the backoff condition is not
+// satisfied, or there is no parent node, nothing is set on the path.
+// Otherwise the last word of the phrase is converted to its on-disk form;
+// when that fails (the word is unknown to this table) an empty collection
+// and a NULL node are stored, else the child node and its target phrases.
+void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(InputPath &inputPath) const
+{
+  OnDiskPt::OnDiskWrapper &wrapper = const_cast<OnDiskPt::OnDiskWrapper&>(GetImplementation());
+  const Phrase &phrase = inputPath.GetPhrase();
+  const OnDiskPt::PhraseNode *parentNode = NULL;
+  if (const InputPath *prevPath = inputPath.GetPrevPath()) {
+    parentNode = static_cast<const OnDiskPt::PhraseNode*>(prevPath->GetPtNode(*this));
+  } else {
+    // Starting subphrase.
+    assert(phrase.GetSize() == 1);
+    parentNode = &wrapper.GetRootSourceNode();
+  }
+  // backoff
+  if (!SatisfyBackoff(inputPath) || parentNode == NULL) {
+    return;
+  }
+  Word lastWord = phrase.GetWord(phrase.GetSize() - 1);
+  lastWord.OnlyTheseFactors(m_inputFactors);
+  OnDiskPt::Word *diskWord = ConvertFromMoses(wrapper, m_input, lastWord);
+  TargetPhraseCollection::shared_ptr tpc;
+  if (diskWord == NULL) {
+    // OOV according to this phrase table. Not possible to extend
+    inputPath.SetTargetPhrases(*this, tpc, NULL);
+    return;
+  }
+  OnDiskPt::PhraseNode const* ptNode = parentNode->GetChild(*diskWord, wrapper);
+  if (ptNode) {
+    tpc = GetTargetPhraseCollection(ptNode);
+  }
+  inputPath.SetTargetPhrases(*this, tpc, ptNode);
+  delete diskWord;
+}
+// Returns the target phrase collection for `ptNode`, using the cache keyed
+// by the node's file position. On a hit the entry's timestamp is refreshed
+// with clock(); on a miss the collection is loaded from the phrase table and
+// stored in the cache together with the current clock() value.
+TargetPhraseCollection::shared_ptr
+PhraseDictionaryOnDisk::
+GetTargetPhraseCollection(const OnDiskPt::PhraseNode *ptNode) const
+{
+  CacheColl &cache = GetCache();
+  const size_t key = (size_t) ptNode->GetFilePos();
+  CacheColl::iterator hit = cache.find(key);
+  if (hit != cache.end()) {
+    // in cache. just use it
+    hit->second.second = clock();
+    return hit->second.first;
+  }
+  // not in cache, need to look up from phrase table
+  TargetPhraseCollection::shared_ptr loaded = GetTargetPhraseCollectionNonCache(ptNode);
+  std::pair<TargetPhraseCollection::shared_ptr , clock_t> entry(loaded, clock());
+  cache[key] = entry;
+  return loaded;
+}
+// Reads the target phrases of ptNode from the on-disk table (limited by
+// m_tableLimit) and converts them to Moses target phrases, bypassing the cache.
+TargetPhraseCollection::shared_ptr
+PhraseDictionaryOnDisk::
+GetTargetPhraseCollectionNonCache(const OnDiskPt::PhraseNode *ptNode) const
+{
+  OnDiskPt::OnDiskWrapper &wrapper =
+    const_cast<OnDiskPt::OnDiskWrapper&>(GetImplementation());
+  vector<float> weightT = StaticData::Instance().GetWeights(this);
+  OnDiskPt::Vocab &vocab = wrapper.GetVocab();
+
+  OnDiskPt::TargetPhraseCollection::shared_ptr onDiskPhrases =
+    ptNode->GetTargetPhraseCollection(m_tableLimit, wrapper);
+
+  // The on-disk collection is held by a shared_ptr, so no explicit delete.
+  // The final 'false' is the isSyntax argument of ConvertToMoses().
+  return ConvertToMoses(onDiskPhrases, m_input, m_output, *this,
+                        weightT, vocab, false);
+}
+// Converts a whole on-disk target phrase collection into a new Moses
+// TargetPhraseCollection: each phrase is converted individually, added to the
+// result, and the result is finally sorted with the dictionary's table limit.
+Moses::TargetPhraseCollection::shared_ptr
+PhraseDictionaryOnDisk::ConvertToMoses(
+  const OnDiskPt::TargetPhraseCollection::shared_ptr targetPhrasesOnDisk
+  , const std::vector<Moses::FactorType> &inputFactors
+  , const std::vector<Moses::FactorType> &outputFactors
+  , const Moses::PhraseDictionary &phraseDict
+  , const std::vector<float> &weightT
+  , OnDiskPt::Vocab &vocab
+  , bool isSyntax) const
+{
+  Moses::TargetPhraseCollection::shared_ptr converted(new Moses::TargetPhraseCollection);
+
+  const size_t numPhrases = targetPhrasesOnDisk->GetSize();
+  for (size_t idx = 0; idx != numPhrases; ++idx) {
+    const OnDiskPt::TargetPhrase &onDiskPhrase = targetPhrasesOnDisk->GetTargetPhrase(idx);
+    Moses::TargetPhrase *mosesPhrase =
+      ConvertToMoses(onDiskPhrase, inputFactors, outputFactors, vocab,
+                     phraseDict, weightT, isSyntax);
+    converted->Add(mosesPhrase);
+  }
+
+  converted->Sort(true, phraseDict.GetTableLimit());
+  return converted;
+}
+// Builds a newly allocated Moses::TargetPhrase from one on-disk target phrase.
+//
+// targetPhraseOnDisk  phrase to convert; must contain at least one word
+// inputFactors        factor types used when converting the source phrase
+// outputFactors       factor types used when converting target-side words
+// vocab               vocabulary used to resolve on-disk vocab ids to strings
+// phraseDict          dictionary the phrase and its scores are attributed to
+// weightT             not used by this function
+// isSyntax            when true, the last on-disk word is the target LHS
+//                     rather than part of the phrase
+//
+// Returns the new phrase; the caller receives the raw pointer.
+// Throws (via UTIL_THROW_IF2) if the on-disk phrase is empty.
+Moses::TargetPhrase *PhraseDictionaryOnDisk::ConvertToMoses(const OnDiskPt::TargetPhrase &targetPhraseOnDisk
+    , const std::vector<Moses::FactorType> &inputFactors
+    , const std::vector<Moses::FactorType> &outputFactors
+    , const OnDiskPt::Vocab &vocab
+    , const Moses::PhraseDictionary &phraseDict
+    , const std::vector<float> &weightT
+    , bool isSyntax) const
+{
+  // Validate before allocating, so that a throw here cannot leak the phrase.
+  size_t phraseSize = targetPhraseOnDisk.GetSize();
+  UTIL_THROW_IF2(phraseSize == 0, "Target phrase cannot be empty");
+  if (isSyntax) {
+    // last word is lhs
+    --phraseSize;
+  }
+  Moses::TargetPhrase *ret = new Moses::TargetPhrase(&phraseDict);
+  // words
+  for (size_t pos = 0; pos < phraseSize; ++pos) {
+    const OnDiskPt::Word &wordOnDisk = targetPhraseOnDisk.GetWord(pos);
+    ConvertToMoses(wordOnDisk, outputFactors, vocab, ret->AddWord());
+  }
+  // alignments: split the alignment points by whether the target-side word
+  // is a non-terminal
+  Moses::AlignmentInfo::CollType alignTerm, alignNonTerm;
+  const OnDiskPt::PhrasePtr sp = targetPhraseOnDisk.GetSourcePhrase();
+  for (size_t ind = 0; ind < targetPhraseOnDisk.GetAlign().size(); ++ind) {
+    const std::pair<size_t, size_t> &entry = targetPhraseOnDisk.GetAlign()[ind];
+    size_t sourcePos = entry.first;
+    size_t targetPos = entry.second;
+    if (targetPhraseOnDisk.GetWord(targetPos).IsNonTerminal()) {
+      alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
+    } else {
+      alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
+    }
+  }
+  ret->SetAlignTerm(alignTerm);
+  ret->SetAlignNonTerm(alignNonTerm);
+  if (isSyntax) {
+    // The target LHS is the last on-disk word, which was excluded from the
+    // word loop above.
+    Moses::Word *lhsTarget = new Moses::Word(true);
+    const OnDiskPt::Word &lhsOnDisk = targetPhraseOnDisk.GetWord(targetPhraseOnDisk.GetSize() - 1);
+    ConvertToMoses(lhsOnDisk, outputFactors, vocab, *lhsTarget);
+    ret->SetTargetLHS(lhsTarget);
+  }
+  // set source phrase
+  Moses::Phrase mosesSP(Moses::Input);
+  for (size_t pos = 0; pos < sp->GetSize(); ++pos) {
+    ConvertToMoses(sp->GetWord(pos), inputFactors, vocab, mosesSP.AddWord());
+  }
+  // scores
+  ret->GetScoreBreakdown().Assign(&phraseDict, targetPhraseOnDisk.GetScores());
+  // sparse features
+  ret->GetScoreBreakdown().Assign(&phraseDict, targetPhraseOnDisk.GetSparseFeatures());
+  // property
+  ret->SetProperties(targetPhraseOnDisk.GetProperty());
+  ret->EvaluateInIsolation(mosesSP, phraseDict.GetFeaturesToApply());
+  return ret;
+}
+// Converts one on-disk word into a Moses word, overwriting 'overwrite'.
+// A non-terminal's vocabulary string becomes factor 0 as a whole. A
+// terminal's string is split on '|' and must yield exactly one token per
+// entry of outputFactorsVec; otherwise UTIL_THROW_IF2 throws.
+void PhraseDictionaryOnDisk::ConvertToMoses(
+  const OnDiskPt::Word &wordOnDisk,
+  const std::vector<Moses::FactorType> &outputFactorsVec,
+  const OnDiskPt::Vocab &vocab,
+  Moses::Word &overwrite) const
+{
+  typedef std::vector<Moses::FactorType>::const_iterator FactorTypeIter;
+
+  Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
+  overwrite = Moses::Word(wordOnDisk.IsNonTerminal());
+
+  if (wordOnDisk.IsNonTerminal()) {
+    // Non-terminal: the full string is the single factor.
+    const std::string &label = vocab.GetString(wordOnDisk.GetVocabId());
+    overwrite.SetFactor(0, factorColl.AddFactor(label, wordOnDisk.IsNonTerminal()));
+    return;
+  }
+
+  // Terminal: one '|'-separated token per requested factor type.
+  // TODO: this conversion should have been done at load time.
+  util::TokenIter<util::SingleCharacter> tok(vocab.GetString(wordOnDisk.GetVocabId()), '|');
+  FactorTypeIter t = outputFactorsVec.begin();
+  while (t != outputFactorsVec.end()) {
+    UTIL_THROW_IF2(!tok, "Too few factors in \"" << vocab.GetString(wordOnDisk.GetVocabId()) << "\"; was expecting " << outputFactorsVec.size());
+    overwrite.SetFactor(*t, factorColl.AddFactor(*tok, wordOnDisk.IsNonTerminal()));
+    ++t;
+    ++tok;
+  }
+  UTIL_THROW_IF2(tok, "Too many factors in \"" << vocab.GetString(wordOnDisk.GetVocabId()) << "\"; was expecting " << outputFactorsVec.size());
+}
+// Converts a Moses word into a newly allocated on-disk word.
+//
+// The lookup key is the word's factor strings for the types in factorsVec,
+// joined with '|'. The first factor is mandatory; the key stops at the first
+// later factor the word does not carry.
+//
+// Returns NULL when the key is not in the wrapper's vocabulary (the word, and
+// therefore any phrase containing it, is not in the phrase table). Otherwise
+// returns a new OnDiskPt::Word which the caller must delete.
+// Throws (via UTIL_THROW_IF2) if factorsVec is empty or the word lacks the
+// first factor.
+OnDiskPt::Word *PhraseDictionaryOnDisk::ConvertFromMoses(OnDiskPt::OnDiskWrapper &wrapper, const std::vector<Moses::FactorType> &factorsVec
+    , const Moses::Word &origWord) const
+{
+  UTIL_THROW_IF2(factorsVec.empty(), "Expecting at least one factor type");
+  util::StringStream strme;
+  // first factor: required
+  const size_t firstFactorType = factorsVec[0];
+  const Moses::Factor *firstFactor = origWord.GetFactor(firstFactorType);
+  UTIL_THROW_IF2(firstFactor == NULL, "Expecting factor " << firstFactorType);
+  strme << firstFactor->GetString();
+  // remaining factors: optional
+  for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) {
+    const Moses::Factor *factor = origWord.GetFactor(factorsVec[ind]);
+    if (factor == NULL) {
+      // can have less factors than factorsVec.size()
+      break;
+    }
+    strme << "|" << factor->GetString();
+  }
+  bool found;
+  uint64_t vocabId = wrapper.GetVocab().GetVocabId(strme.str(), found);
+  if (!found) {
+    // factor not in phrase table -> phrase definitely not in. exit
+    return NULL;
+  }
+  // Allocate only once nothing can throw or bail out, so the word cannot leak.
+  OnDiskPt::Word *newWord = new OnDiskPt::Word(origWord.IsNonTerminal());
+  newWord->SetVocabId(vocabId);
+  return newWord;
+}
+// Handles the two span-limit options this class reads itself and passes
+// every other key on to PhraseDictionary::SetParameter().
+void PhraseDictionaryOnDisk::SetParameter(const std::string& key, const std::string& value)
+{
+  if (key == "max-span-default") {
+    m_maxSpanDefault = Scan<size_t>(value);
+    return;
+  }
+  if (key == "max-span-labelled") {
+    m_maxSpanLabelled = Scan<size_t>(value);
+    return;
+  }
+  PhraseDictionary::SetParameter(key, value);
+}
+} // namespace

mosesdecoder/moses/TranslationModel/RuleTable/Trie.h ADDED Viewed

	@@ -0,0 +1,63 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#pragma once
+#include "moses/TranslationModel/PhraseDictionary.h"
+#include "moses/TypeDef.h"
+#include <string>
+#include <vector>
+namespace Moses
+{
+class Phrase;
+class TargetPhrase;
+class TargetPhraseCollection;
+class Word;
+/*** Implementation of a SCFG rule table in a trie.  Looking up a rule of
+ * length n symbols requires n look-ups to find the TargetPhraseCollection.
+ * @todo why need this and PhraseDictionaryMemory?
+ */
+class RuleTableTrie : public PhraseDictionary
+{
+public:
+  // Forwards the configuration line to the PhraseDictionary base, passing
+  // 'true' as the base constructor's second argument.
+  RuleTableTrie(const std::string &line)
+    : PhraseDictionary(line, true) {
+  }
+  virtual ~RuleTableTrie();
+  // Defined out of line; presumably populates the trie from the configured
+  // rule table -- confirm against the implementation.
+  void Load(AllOptions::ptr const& opts);
+private:
+  // RuleTableLoader is the only outside class allowed to call the private
+  // hooks below.
+  friend class RuleTableLoader;
+  // Implemented by concrete tries: returns the target phrase collection for
+  // the given rule, creating it if needed (per the name; the contract is
+  // defined by the subclasses).
+  virtual TargetPhraseCollection::shared_ptr
+  GetOrCreateTargetPhraseCollection(const Phrase &source,
+                                    const TargetPhrase &target,
+                                    const Word *sourceLHS) = 0;
+  // Implemented by concrete tries: sort/prune hook.
+  virtual void SortAndPrune() = 0;
+};
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/RuleTable/UTrie.cpp ADDED Viewed

	@@ -0,0 +1,96 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ ***********************************************************************/
+#include "moses/NonTerminal.h"
+#include "moses/TranslationModel/Scope3Parser/Parser.h"
+#include "moses/StaticData.h"
+#include "moses/TargetPhrase.h"
+#include "moses/TargetPhraseCollection.h"
+#include "moses/Util.h"
+#include "moses/Word.h"
+#include "UTrie.h"
+#include "Trie.h"
+#include "UTrieNode.h"
+#include <boost/functional/hash.hpp>
+#include <boost/unordered_map.hpp>
+#include <boost/version.hpp>
+#include <map>
+#include <vector>
+namespace Moses
+{
+// Finds (or creates) the trie node for the rule's source side and asks that
+// node for the target phrase collection belonging to 'target'.
+TargetPhraseCollection::shared_ptr
+RuleTableUTrie::
+GetOrCreateTargetPhraseCollection(const Phrase &source,
+                                  const TargetPhrase &target,
+                                  const Word *sourceLHS)
+{
+  return GetOrCreateNode(source, target, sourceLHS)
+         .GetOrCreateTargetPhraseCollection(target);
+}
+// Walks the trie from the root along the symbols of 'source', creating
+// missing children on the way, and returns the node reached.
+// Terminals are keyed by the source word itself. Non-terminals are keyed by
+// the aligned target-side word, taken from the target's non-terminal
+// alignment, which is consumed in order (the asserts check that order).
+// The source LHS argument is ignored.
+UTrieNode &RuleTableUTrie::GetOrCreateNode(const Phrase &source,
+    const TargetPhrase &target,
+    const Word */*sourceLHS*/)
+{
+  const size_t numSymbols = source.GetSize();
+  const AlignmentInfo &nonTermAlign = target.GetAlignNonTerm();
+  AlignmentInfo::const_iterator alignIt = nonTermAlign.begin();
+
+  UTrieNode *node = &m_root;
+  for (size_t pos = 0; pos < numSymbols; ++pos) {
+    const Word &symbol = source.GetWord(pos);
+    if (!symbol.IsNonTerminal()) {
+      node = node->GetOrCreateTerminalChild(symbol);
+    } else {
+      // The next unused alignment point must belong to this position.
+      assert(alignIt != nonTermAlign.end());
+      assert(alignIt->first == pos);
+      const size_t targetPos = alignIt->second;
+      ++alignIt;
+      node = node->GetOrCreateNonTerminalChild(target.GetWord(targetPos));
+    }
+    assert(node != NULL);
+  }
+  return *node;
+}
+// Creates the rule lookup manager for this table: a newly allocated
+// Scope3Parser bound to the given parser, cell collection and span limit.
+ChartRuleLookupManager *RuleTableUTrie::CreateRuleLookupManager(
+  const ChartParser &parser,
+  const ChartCellCollectionBase &cellCollection,
+  std::size_t maxChartSpan)
+{
+  ChartRuleLookupManager *manager =
+    new Scope3Parser(parser, cellCollection, *this, maxChartSpan);
+  return manager;
+}
+// Sorts the trie from the root down using the table limit. A table limit of
+// zero leaves the trie untouched.
+void RuleTableUTrie::SortAndPrune()
+{
+  if (!GetTableLimit()) {
+    return;
+  }
+  m_root.Sort(GetTableLimit());
+}
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/RuleTable/UTrie.h ADDED Viewed

	@@ -0,0 +1,73 @@

+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+#pragma once
+#include "Trie.h"
+#include "UTrieNode.h"
+#include "moses/TargetPhraseCollection.h"
+namespace Moses
+{
+class Phrase;
+class TargetPhrase;
+class Word;
+class ChartParser;
+/** Implementation of RuleTableTrie.  A RuleTableUTrie is designed to store
+ * string-to-tree SCFG grammars only (i.e. rules can have distinct labels on
+ * the target side, but only a generic non-terminal on the source side).
+ * A key is the source RHS (one symbol per edge) of a rule and a mapped value
+ * is the collection of grammar rules that share the same source RHS.
+ *
+ * (The 'U' in UTrie stands for 'unlabelled' -- the keys are unlabelled and
+ * the target labels are stored on the node values, as opposed to the grammar
+ * being a monolingual projection with target labels projected onto the source
+ * side.)
+ */
+class RuleTableUTrie : public RuleTableTrie
+{
+public:
+  // Forwards the configuration line to RuleTableTrie.
+  RuleTableUTrie(const std::string &line)
+    : RuleTableTrie(line) {
+  }
+  // Read-only access to the root node of the trie.
+  const UTrieNode &GetRootNode() const {
+    return m_root;
+  }
+  // Creates the rule lookup manager for this table (defined in UTrie.cpp).
+  ChartRuleLookupManager *CreateRuleLookupManager(const ChartParser &,
+      const ChartCellCollectionBase &, std::size_t);
+private:
+  // Implementation of the RuleTableTrie hook of the same name.
+  TargetPhraseCollection::shared_ptr
+  GetOrCreateTargetPhraseCollection(const Phrase &source,
+                                    const TargetPhrase &target,
+                                    const Word *sourceLHS);
+  // Returns the node for the given source side, creating nodes as needed
+  // (per the name; see the definition in UTrie.cpp).
+  UTrieNode &GetOrCreateNode(const Phrase &source, const TargetPhrase &target,
+                             const Word *sourceLHS);
+  // Implementation of the RuleTableTrie hook of the same name.
+  void SortAndPrune();
+  // Root of the trie, stored by value.
+  UTrieNode m_root;
+};
+}  // namespace Moses

mosesdecoder/moses/TranslationModel/fuzzy-match/Alignments.h ADDED Viewed

	@@ -0,0 +1,20 @@

+#pragma once
+#include <string>
+#include <vector>
+#include <map>
+// Holder for a word alignment in two directions.
+// NOTE(review): judging by the names, m_alignS2T is indexed by source
+// position and m_alignT2S by target position -- confirm against the
+// constructor definition, which is not in this header.
+class Alignments
+{
+public:
+  // One int -> int map per position, one vector for each direction.
+  std::vector< std::map<int, int> > m_alignS2T, m_alignT2S;
+  // Builds the alignment from its string form; sourceSize and targetSize are
+  // presumably the sentence lengths used to size the vectors -- TODO confirm.
+  Alignments(const std::string &align, size_t sourceSize, size_t targetSize);
+protected:
+};

mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp ADDED Viewed

	@@ -0,0 +1,1029 @@

+//
+//  FuzzyMatchWrapper.cpp
+//  moses
+//
+//  Created by Hieu Hoang on 26/07/2012.
+//  Copyright 2012 __MyCompanyName__. All rights reserved.
+//
+#include <iostream>
+#include "FuzzyMatchWrapper.h"
+#include "SentenceAlignment.h"
+#include "Match.h"
+#include "create_xml.h"
+#include "moses/Util.h"
+#include "moses/StaticData.h"
+#include "util/file.hh"
+using namespace std;
+namespace tmmt
+{
+// Sets the matching options to their defaults and loads the translation
+// memory: a suffix array built from sourcePath, the target sentences from
+// targetPath and their alignments from alignmentPath. Progress is reported
+// on stderr.
+FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath)
+  : basic_flag(false)
+  , lsed_flag(true)
+  , refined_flag(true)
+  , length_filter_flag(true)
+  , parse_flag(true)
+  , min_match(70)
+  , multiple_flag(true)
+  , multiple_slack(0)
+  , multiple_max(100)
+{
+  // The source side is held only inside the suffix array; it is not loaded
+  // into a separate corpus.
+  std::cerr << "creating suffix array" << std::endl;
+  suffixArray = new tmmt::SuffixArray(sourcePath);
+
+  std::cerr << "loading target data" << std::endl;
+  load_target(targetPath, targetAndAlignment);
+
+  std::cerr << "loading alignment" << std::endl;
+  load_alignment(alignmentPath, targetAndAlignment);
+
+  std::cerr << "loading completed" << std::endl;
+}
+// Runs a shell command and reports a non-zero status on stderr. Execution
+// continues either way, as it did when the status was ignored.
+static void RunShellCommandOrWarn(const std::string &cmd)
+{
+  const int status = system(cmd.c_str());
+  if (status != 0) {
+    std::cerr << "WARNING: command returned status " << status << ": " << cmd << std::endl;
+  }
+}
+// Builds a phrase table from the fuzzy matches for one input sentence.
+//
+// translationId  id passed through to ExtractTM()
+// dirNameStr     working directory passed through to ExtractTM()
+//
+// Steps: ExtractTM() writes the fuzzy match file, create_xml() derives the
+// extract files from it, the extract files are sorted and gzipped, and
+// train-model.perl (step 6 only) scores them into a phrase table.
+//
+// Returns the path "<fuzzyMatchFile>.pt.gz". The path is returned even when
+// one of the external commands failed; failures are only reported on stderr.
+//
+// NOTE(review): the paths are pasted unquoted into shell command lines, so a
+// working directory containing spaces or shell metacharacters will break them.
+string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr)
+{
+  const Moses::StaticData &staticData = Moses::StaticData::Instance();
+  WordIndex wordIndex;
+  string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr);
+  // create extract files
+  create_xml(fuzzyMatchFile);
+  // create phrase table with usual Moses scoring and consolidate programs
+  string cmd;
+  cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > "
+        + fuzzyMatchFile + ".extract.sorted.gz";
+  RunShellCommandOrWarn(cmd);
+  cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > "
+        + fuzzyMatchFile + ".extract.inv.sorted.gz";
+  RunShellCommandOrWarn(cmd);
+  // Location of the Moses binaries; the training script is found relative to it.
+#ifdef IS_XCODE
+  cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin";
+#elif IS_ECLIPSE
+  cmd = "/home/hieu/workspace/github/moses-smt/bin";
+#else
+  cmd = staticData.GetBinDirectory();
+#endif
+  cmd += string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ")
+         + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" "
+         + " -phrase-translation-table " + fuzzyMatchFile + ".pt";
+  RunShellCommandOrWarn(cmd);
+  return fuzzyMatchFile + ".pt.gz";
+}
+// Finds the translation-memory sentences that best match one input sentence
+// and writes the corresponding extract data to "<dirNameStr>/fuzzyMatchFile".
+//
+// wordIndex      passed through to init_short_matches()/add_short_matches()
+// translationId  passed through to init_short_matches()/add_short_matches()
+// dirNameStr     directory holding the input file "in" (exactly one sentence
+//                is expected) and receiving the output file
+//
+// Phases:
+//   1. for every start position of the input, find the suffix-array ranges
+//      of all substrings starting there;
+//   2. turn those ranges into per-sentence matches, longest first, pruning by
+//      lower/upper bounds on the edit cost;
+//   3. validate every candidate sentence (string edit distance or
+//      parse_matches) and collect the sentences reaching the best cost;
+//   4. write extracts for all best sentences (multiple_flag) or for a single
+//      best one.
+//
+// Returns the path of the fuzzy match file.
+// Throws (via UTIL_THROW_IF2) if the input file holds no sentence, or if no
+// match was found and the source corpus is empty.
+string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr)
+{
+  const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus();
+  string inputPath = dirNameStr + "/in";
+  string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile";
+  ofstream fuzzyMatchStream(fuzzyMatchFile.c_str());
+  vector< vector< WORD_ID > > input;
+  load_corpus(inputPath, input);
+  // input[0] is read below, so an empty file must be rejected even when
+  // asserts are compiled out.
+  UTIL_THROW_IF2(input.empty(), "No input sentence found in " << inputPath);
+  assert(input.size() == 1);
+  size_t sentenceInd = 0;
+  clock_t start_clock = clock();
+  // establish some basic statistics
+  int input_length = input[sentenceInd].size();
+  // initial cost ceiling, derived from the minimum match percentage
+  int best_cost = input_length * (100-min_match) / 100 + 1;
+  int match_count = 0; // how many substring matches to be considered
+  // find match ranges in suffix array:
+  // match_range[start][len-1] is the suffix-array range of the input
+  // substring of length len beginning at start
+  vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range;
+  for(int start=0; start<input[sentenceInd].size(); start++) {
+    SuffixArray::INDEX prior_first_match = 0;
+    SuffixArray::INDEX prior_last_match = suffixArray->GetSize()-1;
+    vector< string > substring;
+    bool stillMatched = true;
+    vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart;
+    // grow the substring word by word until it no longer occurs
+    for(size_t word=start; stillMatched && word<input[sentenceInd].size(); word++) {
+      substring.push_back( GetVocabulary().GetWord( input[sentenceInd][word] ) );
+      SuffixArray::INDEX first_match, last_match;
+      stillMatched = false;
+      if (suffixArray->FindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) {
+        stillMatched = true;
+        matchedAtThisStart.push_back( make_pair( first_match, last_match ) );
+        // the next, longer substring is searched within this range
+        prior_first_match = first_match;
+        prior_last_match = last_match;
+      }
+    }
+    match_range.push_back( matchedAtThisStart );
+  }
+  clock_t clock_range = clock();
+  map< int, vector< Match > > sentence_match;
+  map< int, int > sentence_match_word_count;
+  // go through all matches, longest first
+  for(int length = input[sentenceInd].size(); length >= 1; length--) {
+    // do not create matches, if these are handled by the short match function
+    if (length <= short_match_max_length( input_length ) ) {
+      continue;
+    }
+    for(int start = 0; start <= input[sentenceInd].size() - length; start++) {
+      if (match_range[start].size() >= length) {
+        pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1];
+        for(SuffixArray::INDEX i=range.first; i<=range.second; i++) {
+          size_t position = suffixArray->GetPosition( i );
+          // sentence length mismatch
+          size_t sentence_id = suffixArray->GetSentence( position );
+          int sentence_length = suffixArray->GetSentenceLength( sentence_id );
+          int diff = abs( (int)sentence_length - (int)input_length );
+          if (diff > best_cost)
+            continue;
+          // compute minimal cost
+          int start_pos = suffixArray->GetWordInSentence( position );
+          int end_pos = start_pos + length-1;
+          // different number of prior words -> cost is at least diff
+          int min_cost = abs( start - start_pos );
+          // same number of words, but not sent. start -> cost is at least 1
+          if (start == start_pos && start>0)
+            min_cost++;
+          // different number of remaining words -> cost is at least diff
+          min_cost += abs( ( sentence_length-1 - end_pos ) -
+                           ( input_length-1 - (start+length-1) ) );
+          // same number of words, but not sent. end -> cost is at least 1
+          if ( sentence_length-1 - end_pos ==
+               input_length-1 - (start+length-1)
+               && end_pos != sentence_length-1 )
+            min_cost++;
+          if (min_cost > best_cost)
+            continue;
+          // valid match
+          match_count++;
+          // compute maximal cost
+          int max_cost = max( start, start_pos )
+                         + max( sentence_length-1 - end_pos,
+                                input_length-1 - (start+length-1) );
+          Match m = Match( start, start+length-1,
+                           start_pos, start_pos+length-1,
+                           min_cost, max_cost, 0);
+          sentence_match[ sentence_id ].push_back( m );
+          sentence_match_word_count[ sentence_id ] += length;
+          // the upper bound of this match tightens the cost ceiling
+          if (max_cost < best_cost) {
+            best_cost = max_cost;
+            if (best_cost == 0) break;
+          }
+        }
+      }
+      if (best_cost == 0) break;
+    }
+    if (best_cost == 0) break;
+  }
+  cerr << match_count << " matches in " << sentence_match.size() << " sentences." << endl;
+  clock_t clock_matches = clock();
+  // consider each sentence for which we have matches
+  int old_best_cost = best_cost;
+  int tm_count_word_match = 0;
+  int tm_count_word_match2 = 0;
+  int pruned_match_count = 0;
+  if (short_match_max_length( input_length )) {
+    init_short_matches(wordIndex, translationId, input[sentenceInd] );
+  }
+  vector< int > best_tm;
+  typedef map< int, vector< Match > >::iterator I;
+  clock_t clock_validation_sum = 0;
+  for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) {
+    int tmID = tm->first;
+    int tm_length = suffixArray->GetSentenceLength(tmID);
+    vector< Match > &match = tm->second;
+    add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost );
+    // quick look: how many words are matched
+    int words_matched = 0;
+    for(size_t m=0; m<match.size(); m++) {
+      if (match[m].min_cost <= best_cost) // makes no difference
+        words_matched += match[m].input_end - match[m].input_start + 1;
+    }
+    if (max(input_length,tm_length) - words_matched > best_cost) {
+      if (length_filter_flag) continue;
+    }
+    tm_count_word_match++;
+    // prune, check again how many words are matched
+    vector< Match > pruned = prune_matches( match, best_cost );
+    words_matched = 0;
+    for(size_t p=0; p<pruned.size(); p++) {
+      words_matched += pruned[p].input_end - pruned[p].input_start + 1;
+    }
+    if (max(input_length,tm_length) - words_matched > best_cost) {
+      if (length_filter_flag) continue;
+    }
+    tm_count_word_match2++;
+    pruned_match_count += pruned.size();
+    int prior_best_cost = best_cost;
+    int cost;
+    clock_t clock_validation_start = clock();
+    if (! parse_flag ||
+        pruned.size()>=10) { // to prevent worst cases
+      string path;
+      cost = sed( input[sentenceInd], source[tmID], path, false );
+      if (cost <  best_cost) {
+        best_cost = cost;
+      }
+    }
+    else {
+      // parse_matches is handed best_cost and may change it; the check
+      // below relies on that.
+      cost = parse_matches( pruned, input_length, tm_length, best_cost );
+    }
+    clock_validation_sum += clock() - clock_validation_start;
+    // A changed best cost invalidates every candidate collected so far,
+    // whichever validation route found it. Previously only the
+    // parse_matches route cleared the list, so stale candidates survived
+    // an improvement found by sed().
+    if (prior_best_cost != best_cost) {
+      best_tm.clear();
+    }
+    if (cost == best_cost) {
+      best_tm.push_back( tmID );
+    }
+  }
+  cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl;
+  cerr << "tm considered: " << sentence_match.size()
+       << " word-matched: " << tm_count_word_match
+       << " word-matched2: " << tm_count_word_match2
+       << " best: " << best_tm.size() << endl;
+  cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl;
+  // create xml and extract files
+  string inputStr;
+  for (size_t pos = 0; pos < input_length; ++pos) {
+    inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " ";
+  }
+  // do not try to find the best ... report multiple matches
+  if (multiple_flag) {
+    for(size_t si=0; si<best_tm.size(); si++) {
+      int s = best_tm[si];
+      string path;
+      sed( input[sentenceInd], source[s], path, true );
+      const vector<WORD_ID> &sourceSentence = source[s];
+      vector<SentenceAlignment> &targets = targetAndAlignment[s];
+      create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream);
+    }
+  } // if (multiple_flag)
+  else {
+    // find the best matches according to letter sed
+    string best_path = "";
+    int best_match = -1;
+    unsigned int best_letter_cost;
+    if (lsed_flag) {
+      best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
+      for(size_t si=0; si<best_tm.size(); si++) {
+        int s = best_tm[si];
+        string path;
+        unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
+        if (letter_cost < best_letter_cost) {
+          best_letter_cost = letter_cost;
+          best_path = path;
+          best_match = s;
+        }
+      }
+    }
+    // if letter sed turned off, just compute path for first match
+    else {
+      if (best_tm.size() > 0) {
+        string path;
+        sed( input[sentenceInd], source[best_tm[0]], path, false );
+        best_path = path;
+        best_match = best_tm[0];
+      }
+    }
+    cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC)
+         << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC)
+         << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC)
+         << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC)
+         << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")"
+         << " )" << endl;
+    // no match found: fall back to the first source sentence
+    if (best_match == -1) {
+      UTIL_THROW_IF2(source.size() == 0, "Empty source phrase");
+      best_match = 0;
+    }
+    // create xml & extracts
+    const vector<WORD_ID> &sourceSentence = source[best_match];
+    vector<SentenceAlignment> &targets = targetAndAlignment[best_match];
+    create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream);
+  } // else if (multiple_flag)
+  fuzzyMatchStream.close();
+  return fuzzyMatchFile;
+}
+// Reads fileName line by line, tokenizes each line with the vocabulary and
+// appends the resulting word-id vector to corpus. Terminates the process
+// with exit(1) if the file cannot be opened.
+void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus )
+{
+  ifstream in(fileName.c_str());
+  if (!in) {
+    cerr << "file not found: " << fileName << endl;
+    exit(1);
+  }
+  cerr << "loading " << fileName << endl;
+
+  // one corpus entry per input line
+  for (string line; getline(in, line); ) {
+    corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) );
+  }
+}
+// Loads the target side of the translation memory.
+//
+// Each line of fileName yields one entry of corpus. A line has the form
+//   <count> <word> ... [ ||| <count> <word> ... ]*
+// so a source sentence may carry several target sentences, each with its own
+// count; every target becomes one SentenceAlignment in the entry.
+//
+// Terminates the process with exit(1) if the file cannot be opened.
+// Throws (via UTIL_THROW_IF2) on an empty line or when a "|||" delimiter is
+// not followed by a count; both cases used to index past the token vector.
+void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus)
+{
+  ifstream fileStream;
+  fileStream.open(fileName.c_str());
+  if (!fileStream) {
+    cerr << "file not found: " << fileName << endl;
+    exit(1);
+  }
+  cerr << "loading " << fileName << endl;
+  WORD_ID delimiter = GetVocabulary().StoreIfNew("|||");
+  size_t lineNum = 0; // 1-based once inside the loop, used for error messages
+  string line;
+  while(getline(fileStream, line)) {
+    ++lineNum;
+    vector<WORD_ID> toks = GetVocabulary().Tokenize( line.c_str() );
+    UTIL_THROW_IF2(toks.empty(),
+                   "Empty line " << lineNum << " in " << fileName
+                   << "; expected a count followed by the target words");
+    corpus.push_back(vector< SentenceAlignment >());
+    vector< SentenceAlignment > &vec = corpus.back();
+    vec.push_back(SentenceAlignment());
+    SentenceAlignment *sentence = &vec.back();
+    // the first token is the count of the first target sentence
+    const WORD &firstCountStr = GetVocabulary().GetWord(toks[0]);
+    sentence->count = atoi(firstCountStr.c_str());
+    for (size_t i = 1; i < toks.size(); ++i) {
+      WORD_ID wordId = toks[i];
+      if (wordId == delimiter) {
+        // target and alignments can have multiple sentences.
+        vec.push_back(SentenceAlignment());
+        // push_back may reallocate, so the pointer is taken afresh
+        sentence = &vec.back();
+        // the token after the delimiter is the count of the new target
+        ++i;
+        UTIL_THROW_IF2(i >= toks.size(),
+                       "Missing count after \"|||\" on line " << lineNum
+                       << " of " << fileName);
+        const WORD &countStr = GetVocabulary().GetWord(toks[i]);
+        sentence->count = atoi(countStr.c_str());
+      } else {
+        // just a normal word, add
+        sentence->target.push_back(wordId);
+      }
+    }
+  }
+}
+/* Load word alignments and attach them to the entries already present in
+ `corpus` (line N of the file belongs to corpus[N]).
+ Tokens of the form "a-b" are appended as alignment points to the current
+ segment; a "|||" token moves on to the next segment of the same line and the
+ token immediately following it is skipped.
+ Prints a message and terminates the process if the file cannot be opened, if
+ the file has more lines or segments than `corpus`, or if an alignment point
+ does not consist of exactly two numbers. */
+void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus )
+{
+  ifstream fileStream;
+  fileStream.open(fileName.c_str());
+  if (!fileStream) {
+    cerr << "file not found: " << fileName << endl;
+    exit(1);
+  }
+  cerr << "loading " << fileName << endl;
+  const string delimiter = "|||";
+  size_t lineNum = 0;
+  string line;
+  while(getline(fileStream, line)) {
+    if (lineNum >= corpus.size()) {
+      cerr << "alignment file has more lines than the target corpus: " << fileName << endl;
+      exit(1);
+    }
+    vector< SentenceAlignment > &vec = corpus[lineNum];
+    size_t targetInd = 0;
+    const vector<string> toks = Moses::Tokenize(line);
+    for (size_t i = 0; i < toks.size(); ++i) {
+      const string &tok = toks[i];
+      if (tok == delimiter) {
+        // target and alignments can have multiple sentences.
+        ++targetInd;
+        ++i; // the token right after the delimiter is not an alignment point
+        continue;
+      }
+      // the segment is only looked up when a point is actually added to it
+      if (targetInd >= vec.size()) {
+        cerr << "alignment file has more segments than the target corpus at line "
+             << lineNum << ": " << fileName << endl;
+        exit(1);
+      }
+      // just a normal alignment, add
+      const vector<int> alignPoint = Moses::Tokenize<int>(tok, "-");
+      if (alignPoint.size() != 2) {
+        cerr << "malformed alignment point '" << tok << "' at line "
+             << lineNum << ": " << fileName << endl;
+        exit(1);
+      }
+      vec[targetInd].alignment.push_back(pair<int,int>(alignPoint[0], alignPoint[1]));
+    }
+    ++lineNum;
+  }
+}
+/* Look up a cached letter edit distance for a pair of word ids.
+ Returns true and writes the distance to `value` on a hit; returns false and
+ leaves `value` untouched on a miss. Takes a shared (reader) lock when built
+ with WITH_THREADS. */
+bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
+{
+#ifdef WITH_THREADS
+  boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+#endif
+  typedef map< pair< WORD_ID, WORD_ID >, unsigned int > Cache;
+  const Cache::const_iterator hit = m_lsed.find( key );
+  if (hit == m_lsed.end()) {
+    return false;
+  }
+  value = hit->second;
+  return true;
+}
+/* Store (or overwrite) the letter edit distance for a pair of word ids.
+ Takes an exclusive (writer) lock when built with WITH_THREADS. */
+void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
+{
+#ifdef WITH_THREADS
+  boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+#endif
+  unsigned int &slot = m_lsed[ key ];
+  slot = value;
+}
+/* Letter string edit distance, e.g. sub 'their' to 'there' costs 2.
+ Computes the character-level edit distance (insert, delete and substitute
+ each cost 1) between the surface forms of the two word ids. Results are
+ memoised per ordered id pair through GetLSEDCache / SetLSEDCache.
+ The cost matrix lives in std::vector, so it is released on every exit path
+ and an allocation failure raises std::bad_alloc instead of dereferencing a
+ null pointer. */
+unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx )
+{
+  // check if already computed -> lookup in cache
+  const pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx );
+  unsigned int cached;
+  if (GetLSEDCache(pIdx, cached)) {
+    return cached;
+  }
+  // get surface strings for word indices
+  const string &a = GetVocabulary().GetWord( aIdx );
+  const string &b = GetVocabulary().GetWord( bIdx );
+  // initialize cost matrix: first column / row are pure deletions / insertions
+  const size_t rows = a.size() + 1;
+  const size_t cols = b.size() + 1;
+  vector< vector< unsigned int > > cost( rows, vector< unsigned int >( cols, 0 ) );
+  for( size_t i=0; i<rows; i++ ) {
+    cost[i][0] = i;
+  }
+  for( size_t j=0; j<cols; j++ ) {
+    cost[0][j] = j;
+  }
+  // core string edit distance loop
+  for( size_t i=1; i<rows; i++ ) {
+    for( size_t j=1; j<cols; j++ ) {
+      const unsigned int ins = cost[i-1][j] + 1;
+      const unsigned int del = cost[i][j-1] + 1;
+      // compare the characters directly, no temporary substrings
+      const unsigned int diag = cost[i-1][j-1] + ((a[i-1] == b[j-1]) ? 0 : 1);
+      cost[i][j] = min( diag, min( ins, del ) );
+    }
+  }
+  // cache and return result
+  const unsigned int distance = cost[a.size()][b.size()];
+  SetLSEDCache(pIdx, distance);
+  return distance;
+}
+/* string edit distance implementation
+ Computes the edit distance between word-id sequences `a` and `b` and writes
+ the edit script to `best_path`, one character per step:
+   'I' consumes a word of `a`, 'D' consumes a word of `b`,
+   'M' consumes one word of each at zero cost, 'S' one of each at non-zero cost.
+ With use_letter_sed the cost of consuming a word is its length in characters
+ and the cost of pairing two words is their letter edit distance; otherwise
+ every operation costs 1 and identical ids cost 0.
+ Returns the total cost.
+ The matrices are std::vectors, so nothing leaks if a callee throws, and the
+ path is collected back-to-front and reversed once instead of being prepended
+ to character by character. */
+unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed )
+{
+  // initialize cost and path matrices
+  const size_t rows = a.size() + 1;
+  const size_t cols = b.size() + 1;
+  vector< vector< unsigned int > > cost( rows, vector< unsigned int >( cols, 0 ) );
+  vector< vector< char > > path( rows, vector< char >( cols, 0 ) );
+  // first column: only words of `a` consumed
+  path[0][0] = 'I';
+  for( size_t i=1; i<rows; i++ ) {
+    cost[i][0] = cost[i-1][0];
+    if (use_letter_sed) {
+      cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size();
+    } else {
+      cost[i][0]++;
+    }
+    path[i][0] = 'I';
+  }
+  // first row: only words of `b` consumed (also overwrites path[0][0], which
+  // the back-trace below never reads)
+  path[0][0] = 'D';
+  for( size_t j=1; j<cols; j++ ) {
+    cost[0][j] = cost[0][j-1];
+    if (use_letter_sed) {
+      cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size();
+    } else {
+      cost[0][j]++;
+    }
+    path[0][j] = 'D';
+  }
+  // core string edit distance algorithm
+  for( size_t i=1; i<rows; i++ ) {
+    for( size_t j=1; j<cols; j++ ) {
+      unsigned int ins = cost[i-1][j];
+      unsigned int del = cost[i][j-1];
+      unsigned int match;
+      if (use_letter_sed) {
+        ins += GetVocabulary().GetWord( a[i-1] ).size();
+        del += GetVocabulary().GetWord( b[j-1] ).size();
+        match = letter_sed( a[i-1], b[j-1] );
+      } else {
+        ins++;
+        del++;
+        match = ( a[i-1] == b[j-1] ) ? 0 : 1;
+      }
+      const unsigned int diag = cost[i-1][j-1] + match;
+      // ties between 'I' and 'D' go to 'D'; the diagonal wins only if strictly cheaper
+      char action = (ins < del) ? 'I' : 'D';
+      unsigned int best = (ins < del) ? ins : del;
+      if (diag < best) {
+        action = (match>0) ? 'S' : 'M';
+        best = diag;
+      }
+      cost[i][j] = best;
+      path[i][j] = action;
+    }
+  }
+  // construct string for best path: walk back from the bottom-right corner
+  string reversed;
+  size_t i = a.size();
+  size_t j = b.size();
+  while( i>0 || j>0 ) {
+    const char action = path[i][j];
+    reversed += action;
+    if (action == 'I') {
+      i--;
+    } else if (action == 'D') {
+      j--;
+    } else {
+      i--;
+      j--;
+    }
+  }
+  best_path.assign( reversed.rbegin(), reversed.rend() );
+  // return result
+  return cost[a.size()][b.size()];
+}
+/* utility function: length of a sentence in characters, i.e. the sum of the
+ lengths of the surface forms of all its words (spaces do not count) */
+unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentence )
+{
+  unsigned int total = 0;
+  for( vector< WORD_ID >::const_iterator word = sentence.begin(); word != sentence.end(); ++word ) {
+    total += GetVocabulary().GetWord( *word ).size();
+  }
+  return total;
+}
+/* brute force method: compare every input sentence to all corpus sentences
+ with sed() and track the cheapest one. The result is currently only kept in
+ locals (the reporting line is commented out), so the function has no
+ visible output. */
+void FuzzyMatchWrapper::basic_fuzzy_match( vector< vector< WORD_ID > > source,
+    vector< vector< WORD_ID > > input )
+{
+  // word-level distances only; flip to compare on characters instead
+  const bool use_letter_sed = false;
+  for(size_t sentenceNo=0; sentenceNo<input.size(); sentenceNo++) {
+    const vector< WORD_ID > &query = input[sentenceNo];
+    // sentence length and worst cost that still counts as a match
+    const unsigned int input_length = use_letter_sed ? compute_length( query ) : query.size();
+    unsigned int best_cost = input_length * (100-min_match) / 100 + 2;
+    string best_path = "";
+    //int best_match = -1;
+    for(size_t s=0; s<source.size(); s++) {
+      const vector< WORD_ID > &candidate = source[s];
+      const int source_length = use_letter_sed ? compute_length( candidate ) : candidate.size();
+      // the length difference is a lower bound on the edit distance
+      const int diff = abs((int)source_length - (int)input_length);
+      if (!length_filter_flag || diff < best_cost) {
+        string path;
+        const unsigned int cost = sed( query, candidate, path, use_letter_sed );
+        // update if new best
+        if (cost < best_cost) {
+          best_cost = cost;
+          best_path = path;
+          //best_match = s;
+        }
+      }
+    }
+    //cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl;
+  }
+}
+/* definition of short matches
+ very short n-gram matches (1-grams) will not be looked up in
+ the suffix array, since there are too many matches
+ and for longer sentences, at least one 2-gram match must occur.
+ Returns 1 when refined matching is enabled and the input has at least five
+ words, 0 otherwise. */
+int FuzzyMatchWrapper::short_match_max_length( int input_length )
+{
+  const bool unigrams_are_short = refined_flag && input_length >= 5;
+  return unigrams_are_short ? 1 : 0;
+}
+/* if we have non-short matches in a sentence, we need to
+ take a closer look at it.
+ this function rebuilds `wordIndex` as a map from every input word to the
+ list of positions at which it occurs (to be used by add_short_matches);
+ done here because it has to be done only once per input sentence.
+ Leaves `wordIndex` untouched when short matches are disabled for this
+ input length. */
+void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input )
+{
+  if (short_match_max_length( input.size() ) == 0)
+    return;
+  wordIndex.clear();
+  // operator[] creates the empty position list on first sight of a word
+  for(size_t pos=0; pos<input.size(); pos++) {
+    wordIndex[ input[pos] ].push_back( pos );
+  }
+}
+/* add all short (single word) matches between the input, as indexed in
+ `wordIndex`, and the corpus sentence `tm` to `match`. A match is kept only
+ if its lower cost bound does not exceed best_cost. */
+void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost )
+{
+  if (short_match_max_length( input_length ) == 0)
+    return;
+  const int tm_length = tm.size();
+  for(int t_pos=0; t_pos<tm_length; t_pos++) {
+    const WordIndex::iterator hit = wordIndex.find( tm[t_pos] );
+    if (hit == wordIndex.end())
+      continue;
+    const vector< int > &positions = hit->second;
+    const int tm_rest = tm_length - t_pos;
+    for(size_t j=0; j<positions.size(); j++) {
+      const int i_pos = positions[j];
+      const int input_rest = input_length - i_pos;
+      // before match: equal-length prefixes still cost at least 1 unless empty
+      int min_cost = abs( i_pos - t_pos );
+      if ( i_pos>0 && i_pos == t_pos )
+        min_cost++;
+      // after match
+      min_cost += abs( input_rest - tm_rest );
+      if ( i_pos != input_length-1 && input_rest == tm_rest )
+        min_cost++;
+      if (min_cost > best_cost)
+        continue;
+      const int max_cost = max( i_pos , t_pos ) + max( input_rest , tm_rest );
+      match.push_back( Match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 ) );
+    }
+  }
+}
+/* remove matches that are subsumed by a larger match.
+ A match is dropped when another match covers an input span at least as long
+ and shares either its start (input and tm) or its end (input and tm), or
+ when its minimum cost exceeds best_cost. Survivors are returned in reverse
+ order of `match`. */
+vector< Match > FuzzyMatchWrapper::prune_matches( const vector< Match > &match, int best_cost )
+{
+  vector< Match > kept;
+  const int count = match.size();
+  for(int i=count-1; i>=0; i--) {
+    const Match &cand = match[i];
+    if (cand.min_cost > best_cost)
+      continue;
+    const int cand_span = cand.input_end - cand.input_start;
+    bool subsumed = false;
+    for(int j=count-1; j>=0 && !subsumed; j--) {
+      if (j == i) // do not compare match with itself
+        continue;
+      const Match &other = match[j];
+      if (cand_span > other.input_end - other.input_start) // other is shorter
+        continue;
+      const bool same_start = cand.input_start == other.input_start
+                              && cand.tm_start == other.tm_start;
+      const bool same_end = cand.input_end == other.input_end
+                            && cand.tm_end == other.tm_end;
+      subsumed = same_start || same_end;
+    }
+    if (!subsumed) {
+      kept.push_back( cand );
+    }
+  }
+  return kept;
+}
+/* A* parsing method to compute string edit distance.
+ Combines the n-gram matches of one corpus sentence bottom-up: level 0 holds
+ the given matches, level k holds combinations built from a match of level f
+ and one of level k-f-1. Every combination gets a lower (min_cost) and upper
+ (max_cost) bound on the edit distance; combinations whose lower bound
+ exceeds best_cost are discarded.
+ Returns the smallest upper bound found for this sentence and lowers
+ best_cost whenever a combination improves on it. */
+int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost )
+{
+  if (match.size() == 1)
+    return match[0].max_cost;
+  if (match.empty())
+    return input_length+tm_length;
+  int this_best_cost = input_length + tm_length;
+  for(vector< Match >::const_iterator m = match.begin(); m != match.end(); ++m) {
+    if (m->max_cost < this_best_cost)
+      this_best_cost = m->max_cost;
+  }
+  // bottom up combination of spans
+  vector< vector< Match > > levels;
+  levels.push_back( match );
+  for(int level = 1; !levels[ level-1 ].empty(); level++) {
+    levels.push_back( vector< Match >() );
+    for(int lo = 0; lo <= (level-1)/2; lo++) {
+      const int hi = level - lo - 1;
+      // both references point below `level`, so appending to levels[level]
+      // never invalidates them
+      vector< Match > &lo_matches = levels[ lo ];
+      vector< Match > &hi_matches = levels[ hi ];
+      for(size_t a = 0; a < lo_matches.size(); a++) {
+        // within one level, do not combine the same pair twice
+        for(size_t b = (lo == hi) ? a+1 : 0; b < hi_matches.size(); b++) {
+          // get sorted matches (first is before second)
+          const bool a_leads = lo_matches[a].input_start < hi_matches[b].input_start;
+          const Match &first  = a_leads ? lo_matches[a] : hi_matches[b];
+          const Match &second = a_leads ? hi_matches[b] : lo_matches[a];
+          // skip pairs that overlap in the input or are out of order in the tm
+          if (first.input_end >= second.input_start || first.tm_end >= second.tm_start)
+            continue;
+          // initial: words before the first match
+          int min_cost = abs( first.input_start - first.tm_start );
+          int max_cost = max( first.input_start, first.tm_start );
+          // same number of words, but not sent. start -> cost is at least 1
+          if (first.input_start == first.tm_start && first.input_start > 0)
+            min_cost++;
+          // in-between: words skipped between the two matches
+          const int skipped_words = second.input_start - first.input_end -1;
+          const int skipped_words_tm = second.tm_start - first.tm_end -1;
+          const int internal_cost = max( skipped_words, skipped_words_tm )
+                                    + first.internal_cost + second.internal_cost;
+          min_cost += internal_cost;
+          max_cost += internal_cost;
+          // final: words after the second match
+          const int tail_input = input_length-1 - second.input_end;
+          const int tail_tm = tm_length-1 - second.tm_end;
+          min_cost += abs( tail_tm - tail_input );
+          max_cost += max( tail_tm, tail_input );
+          // same number of words, but not sent. end -> cost is at least 1
+          if (tail_input == tail_tm && input_length-1 != second.input_end)
+            min_cost++;
+          // if worse than best cost, forget it
+          if (min_cost > best_cost)
+            continue;
+          // add match
+          levels[ level ].push_back( Match( first.input_start,
+                                            second.input_end,
+                                            first.tm_start,
+                                            second.tm_end,
+                                            min_cost,
+                                            max_cost,
+                                            internal_cost ) );
+          // possibly updating this_best_cost, and through it best_cost
+          if (max_cost < this_best_cost) {
+            this_best_cost = max_cost;
+            if (max_cost < best_cost)
+              best_cost = max_cost;
+          }
+        }
+      }
+    }
+  }
+  return this_best_cost;
+}
+/* Write one record per target sentence of a fuzzy match to `outputFile`.
+ A record is eight lines: sentence index, cost, source words, input string,
+ target words, alignment points, edit path and the target's count. */
+void FuzzyMatchWrapper::create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector<SentenceAlignment> &targets, const string &inputStr, const string  &path, ofstream &outputFile)
+{
+  // space-terminated surface form of the source sentence
+  string sourceStr;
+  for (vector< WORD_ID >::const_iterator id = sourceSentence.begin(); id != sourceSentence.end(); ++id) {
+    sourceStr += GetVocabulary().GetWord(*id);
+    sourceStr += " ";
+  }
+  for (vector<SentenceAlignment>::const_iterator target = targets.begin(); target != targets.end(); ++target) {
+    const string targetStr = target->getTargetString(GetVocabulary());
+    const string alignStr = target->getAlignmentString();
+    outputFile << sentenceInd << endl;
+    outputFile << cost << endl;
+    outputFile << sourceStr << endl;
+    outputFile << inputStr << endl;
+    outputFile << targetStr << endl;
+    outputFile << alignStr << endl;
+    outputFile << path << endl;
+    outputFile << target->count << endl;
+  }
+}
+} // namespace

mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h ADDED Viewed

	@@ -0,0 +1,91 @@

+//
+//  FuzzyMatchWrapper.h
+//  moses
+//
+//  Created by Hieu Hoang on 26/07/2012.
+//  Copyright 2012 __MyCompanyName__. All rights reserved.
+//
+#ifndef moses_FuzzyMatchWrapper_h
+#define moses_FuzzyMatchWrapper_h
+#ifdef WITH_THREADS
+#include <boost/thread/shared_mutex.hpp>
+#endif
+#include <fstream>
+#include <string>
+#include "SuffixArray.h"
+#include "Vocabulary.h"
+#include "Match.h"
+#include "moses/InputType.h"
+namespace tmmt
+{
+class Match;
+struct SentenceAlignment;
+/** Fuzzy matching of input sentences against a translation memory (tm-mt).
+ *  Bundles the loaders for the corpus files, the string edit distance
+ *  routines and the n-gram match combination used to find the closest
+ *  translation memory sentences. */
+class FuzzyMatchWrapper
+{
+public:
+  FuzzyMatchWrapper(const std::string &source, const std::string &target, const std::string &alignment);
+  std::string Extract(long translationId, const std::string &dirNameStr);
+protected:
+  // tm-mt
+  /** per corpus line: the target sentences with their counts and alignments */
+  std::vector< std::vector< tmmt::SentenceAlignment > > targetAndAlignment;
+  /** provides the vocabulary used by every routine of this class (see GetVocabulary()) */
+  tmmt::SuffixArray *suffixArray;
+  // configuration; NOTE(review): the values are set outside this header
+  int basic_flag;
+  int lsed_flag;
+  /** non-zero enables the separate handling of short (1-gram) matches */
+  int refined_flag;
+  /** non-zero lets basic_fuzzy_match skip sentences whose length differs too much */
+  int length_filter_flag;
+  int parse_flag;
+  /** used as a percentage when deriving the worst allowed cost from the input length */
+  int min_match;
+  int multiple_flag;
+  int multiple_slack;
+  int multiple_max;
+  /** word id -> positions of that word in the input sentence */
+  typedef std::map< WORD_ID,std::vector< int > > WordIndex;
+  // global cache for word pairs
+  std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > m_lsed;
+#ifdef WITH_THREADS
+  //reader-writer lock
+  mutable boost::shared_mutex m_accessLock;
+#endif
+  /** read one tokenised sentence per line and append its word ids to corpus */
+  void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus );
+  /** read "count words ||| count words ..." lines, one corpus entry per line */
+  void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus);
+  /** attach "a-b" alignment points to the entries created by load_target */
+  void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );
+  /** brute force method: compare input to all corpus sentences */
+  void basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
+                          std::vector< std::vector< tmmt::WORD_ID > > input ) ;
+  /** utility function: compute length of sentence in characters
+   (spaces do not count) */
+  unsigned int compute_length( const std::vector< tmmt::WORD_ID > &sentence );
+  /** character-level edit distance between two words, cached per id pair */
+  unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx );
+  /** edit distance between two sentences; best_path receives the edit script */
+  unsigned int sed( const std::vector< WORD_ID > &a, const std::vector< WORD_ID > &b, std::string &best_path, bool use_letter_sed );
+  /** rebuild wordIndex from the input sentence (no-op when short matches are disabled) */
+  void init_short_matches(WordIndex &wordIndex, long translationId, const std::vector< WORD_ID > &input );
+  /** 1 if 1-gram matches are handled as short matches for this input length, else 0 */
+  int short_match_max_length( int input_length );
+  /** append the single word matches between the input and tm to match */
+  void add_short_matches(WordIndex &wordIndex, long translationId, std::vector< Match > &match, const std::vector< WORD_ID > &tm, int input_length, int best_cost );
+  /** drop matches subsumed by a longer match or costlier than best_cost */
+  std::vector< Match > prune_matches( const std::vector< Match > &match, int best_cost );
+  /** combine matches bottom-up; returns the best upper cost bound and may lower best_cost */
+  int parse_matches( std::vector< Match > &match, int input_length, int tm_length, int &best_cost );
+  /** write one eight-line record per target sentence to outputFile */
+  void create_extract(int sentenceInd, int cost, const std::vector< WORD_ID > &sourceSentence, const std::vector<SentenceAlignment> &targets, const std::string &inputStr, const std::string  &path, std::ofstream &outputFile);
+  std::string ExtractTM(WordIndex &wordIndex, long translationId, const std::string &inputPath);
+  /** vocabulary owned by the suffix array */
+  Vocabulary &GetVocabulary() {
+    return suffixArray->GetVocabulary();
+  }
+  /** cache lookup; returns false on a miss */
+  bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const;
+  /** cache insert or overwrite */
+  void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value);
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/fuzzy-match/Match.h ADDED Viewed

	@@ -0,0 +1,34 @@

+//
+//  Match.h
+//  fuzzy-match
+//
+//  Created by Hieu Hoang on 25/07/2012.
+//  Copyright 2012 __MyCompanyName__. All rights reserved.
+//
+#ifndef fuzzy_match_Match_h
+#define fuzzy_match_Match_h
+namespace tmmt
+{
+/* data structure for n-gram match between input and corpus:
+ an input span, the corpus (tm) span it matches, and cost bookkeeping */
+class Match
+{
+public:
+  // matched span in the input sentence
+  int input_start;
+  int input_end;
+  // matched span in the corpus sentence
+  int tm_start;
+  int tm_end;
+  // lower and upper bound on the cost of a full alignment using this match
+  int min_cost;
+  int max_cost;
+  // cost accumulated inside the span when matches are combined
+  int internal_cost;
+  Match( int inputStart, int inputEnd, int tmStart, int tmEnd, int minCost, int maxCost, int internalCost )
+    : input_start(inputStart)
+    , input_end(inputEnd)
+    , tm_start(tmStart)
+    , tm_end(tmEnd)
+    , min_cost(minCost)
+    , max_cost(maxCost)
+    , internal_cost(internalCost)
+  {}
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.cpp ADDED Viewed

	@@ -0,0 +1,25 @@

+//
+//  SentenceAlignment.cpp
+//  moses
+//
+//  Created by Hieu Hoang on 26/07/2012.
+//  Copyright 2012 __MyCompanyName__. All rights reserved.
+//
+#include <iostream>
+#include "util/string_stream.hh"
+#include "SentenceAlignment.h"
+namespace tmmt
+{
+// Surface form of the target sentence: every word followed by one space.
+std::string SentenceAlignment::getTargetString(const Vocabulary &vocab) const
+{
+  util::StringStream out;
+  for (std::vector< WORD_ID >::const_iterator id = target.begin(); id != target.end(); ++id) {
+    out << vocab.GetWord(*id) << " ";
+  }
+  return out.str();
+}
+}

mosesdecoder/moses/TranslationModel/fuzzy-match/SentenceAlignment.h ADDED Viewed

	@@ -0,0 +1,43 @@

+//
+//  SentenceAlignment.h
+//  fuzzy-match
+//
+//  Created by Hieu Hoang on 25/07/2012.
+//  Copyright 2012 __MyCompanyName__. All rights reserved.
+//
+#ifndef fuzzy_match_SentenceAlignment_h
+#define fuzzy_match_SentenceAlignment_h
+#include <sstream>
+#include <vector>
+#include "Vocabulary.h"
+#include "util/string_stream.hh"
+namespace tmmt
+{
+// One target sentence of a translation memory entry, together with its count
+// and its word alignment points.
+struct SentenceAlignment {
+  // count parsed from the token that precedes the sentence in the target file
+  int count;
+  // word ids of the target sentence
+  std::vector< WORD_ID > target;
+  // alignment points in the order they were read
+  std::vector< std::pair<int,int> > alignment;
+  // count starts at 0 so a default-constructed object never holds an
+  // indeterminate value
+  SentenceAlignment() : count(0) {
+  }
+  // target words separated (and terminated) by a space
+  std::string getTargetString(const Vocabulary &vocab) const;
+  // alignment points as "a-b", each followed by a space
+  std::string getAlignmentString() const {
+    util::StringStream strme;
+    for (size_t i = 0; i < alignment.size(); ++i) {
+      const std::pair<int,int> &alignPair = alignment[i];
+      strme << alignPair.first << "-" << alignPair.second << " ";
+    }
+    return strme.str();
+  }
+};
+}
+#endif

mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.cpp ADDED Viewed

	@@ -0,0 +1,240 @@

+#include "SuffixArray.h"
+#include <string>
+#include <stdlib.h>
+#include <cstring>
+using namespace std;
+namespace tmmt
+{
+// Build a suffix array over the corpus in `fileName` (one sentence per line).
+// The file is read twice: once to count the words, once to fill the arrays.
+// Every sentence is followed by an end-of-sentence marker in m_array.
+// Prints a message and terminates the process if the file cannot be opened.
+// An empty corpus yields an empty, unsorted array instead of calling Sort on
+// a wrapped-around range.
+// NOTE(review): word positions and sentence lengths are stored in `char`
+// arrays, so values above the range of char are truncated.
+SuffixArray::SuffixArray( string fileName )
+{
+  m_vcb.StoreIfNew( "<uNk>" );
+  m_endOfSentence = m_vcb.StoreIfNew( "<s>" );
+  ifstream extractFile;
+  // count the number of words first;
+  extractFile.open(fileName.c_str());
+  if (!extractFile) {
+    cerr << "file not found: " << fileName << endl;
+    exit(1);
+  }
+  istream *fileP = &extractFile;
+  m_size = 0;
+  size_t sentenceCount = 0;
+  string line;
+  while(getline(*fileP, line)) {
+    vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
+    m_size += words.size() + 1;
+    sentenceCount++;
+  }
+  extractFile.close();
+  // the first pass ended on eof/fail; reset so the second pass can read
+  extractFile.clear();
+  cerr << m_size << " words (incl. sentence boundaries)" << endl;
+  // allocate memory
+  m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size );
+  m_index = (INDEX*) calloc( sizeof( INDEX ), m_size );
+  m_wordInSentence = (char*) calloc( sizeof( char ), m_size );
+  m_sentence = (size_t*) calloc( sizeof( size_t ), m_size );
+  m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount );
+  // fill the array
+  int wordIndex = 0;
+  int sentenceId = 0;
+  extractFile.open(fileName.c_str());
+  fileP = &extractFile;
+  while(getline(*fileP, line)) {
+    vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() );
+    // add to corpus vector
+    corpus.push_back(words);
+    // create SA
+    vector< WORD_ID >::const_iterator i;
+    for( i=words.begin(); i!=words.end(); i++) {
+      m_index[ wordIndex ] = wordIndex;
+      m_sentence[ wordIndex ] = sentenceId;
+      m_wordInSentence[ wordIndex ] = i-words.begin();
+      m_array[ wordIndex++ ] = *i;
+    }
+    m_index[ wordIndex ] = wordIndex;
+    m_array[ wordIndex++ ] = m_endOfSentence;
+    m_sentenceLength[ sentenceId++ ] = words.size();
+  }
+  extractFile.close();
+  cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." << endl;
+  // List(0,9);
+  // sort; m_size-1 would wrap around for an empty corpus
+  if (m_size > 0) {
+    m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size );
+    Sort( 0, m_size-1 );
+    free( m_buffer );
+  }
+  m_buffer = 0; // scratch space is gone, do not keep a dangling pointer
+  cerr << "done sorting" << endl;
+}
+// Recursive merge sort of m_index[start..end] (both inclusive), ordering the
+// positions by the suffixes that start at them (see CompareIndex).
+// m_buffer is used as scratch space for the merge.
+void SuffixArray::Sort(INDEX start, INDEX end)
+{
+  if (start == end) return;
+  const INDEX mid = (start+end+1)/2;
+  Sort( start, mid-1 );
+  Sort( mid, end );
+  // merge the two sorted halves into the buffer
+  size_t left = start;
+  size_t right = mid;
+  size_t out = 0;
+  while( left < mid && right <= end ) {
+    // on ties the element of the right half goes first
+    if (CompareIndex( m_index[left], m_index[right] ) < 0) {
+      m_buffer[ out++ ] = m_index[ left++ ];
+    } else {
+      m_buffer[ out++ ] = m_index[ right++ ];
+    }
+  }
+  while( left < mid ) {
+    m_buffer[ out++ ] = m_index[ left++ ];
+  }
+  while( right <= end ) {
+    m_buffer[ out++ ] = m_index[ right++ ];
+  }
+  // copy the merged run back in place
+  memcpy( m_index + start, m_buffer, sizeof( INDEX ) * out );
+}
+// Release every array allocated by the constructor. m_buffer is not freed
+// here: the constructor already releases it right after sorting.
+SuffixArray::~SuffixArray()
+{
+  free(m_index);
+  free(m_array);
+  free(m_wordInSentence);
+  free(m_sentence);
+  free(m_sentenceLength);
+}
+// Compare the suffixes starting at corpus positions a and b.
+// Returns a negative value if suffix a sorts first, a positive value if
+// suffix b sorts first. A suffix that runs into the end of the corpus sorts
+// before a longer one with the same prefix.
+int SuffixArray::CompareIndex( INDEX a, INDEX b ) const
+{
+  // skip over identical words
+  INDEX posA = a;
+  INDEX posB = b;
+  while( posA < m_size && posB < m_size && m_array[ posA ] == m_array[ posB ] ) {
+    ++posA;
+    ++posB;
+  }
+  if( posA == m_size ) return -1;
+  if( posB == m_size ) return 1;
+  return CompareWord( m_array[ posA ], m_array[ posB ] );
+}
+// Order two word ids by comparing their surface strings (std::string::compare).
+inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const
+{
+  const WORD &left = m_vcb.GetWord(a);
+  const WORD &right = m_vcb.GetWord(b);
+  return left.compare( right );
+}
+// Number of occurrences of `phrase`, searching the whole array.
+int SuffixArray::Count( const vector< WORD > &phrase )
+{
+  INDEX unused; // receives both match bounds, which the caller does not need
+  const INDEX lastIndex = m_size-1;
+  return LimitedCount( phrase, m_size, unused, unused, 0, lastIndex );
+}
+// True if `phrase` occurs at least `min` times in the whole array.
+bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min )
+{
+  INDEX unused; // receives both match bounds, which the caller does not need
+  const INDEX lastIndex = m_size-1;
+  return LimitedCount( phrase, min, unused, unused, 0, lastIndex ) >= min;
+}
+// True if `phrase` occurs anywhere in the array (existence check only).
+bool SuffixArray::Exists( const vector< WORD > &phrase )
+{
+  INDEX unused; // receives both match bounds, which the caller does not need
+  const INDEX lastIndex = m_size-1;
+  return LimitedCount( phrase, 1, unused, unused, 0, lastIndex ) == 1;
+}
+// Count the occurrences of `phrase` inside [search_start, search_end] and
+// report the first and last matching array index through the out parameters.
+int SuffixArray::FindMatches( const vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
+{
+  const INDEX noLimit = m_size;
+  return LimitedCount( phrase, noLimit, firstMatch, lastMatch, search_start, search_end );
+}
+// Count the occurrences of `phrase` between search_start and search_end
+// (search_end == -1 means "up to the last index").
+// Returns 0 if there is no match. With min == 1 only existence is checked:
+// 1 is returned and firstMatch / lastMatch are left untouched. Otherwise the
+// bounds of the run of matches are written to firstMatch and lastMatch.
+int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end )
+{
+  INDEX lower = search_start;
+  INDEX upper = (search_end == -1) ? (m_size-1) : search_end;
+  // FindFirst narrows lower / upper while it looks for any match
+  const INDEX hit = FindFirst( phrase, lower, upper );
+  if (hit == m_size) return 0; // no matches
+  if (min == 1) return 1;      // only existence check
+  int matchCount = 1;
+  // firstMatch and lastMatch may refer to the same variable, so each bound
+  // is consumed before the other one is written
+  const INDEX first = FindLast( phrase, hit, lower, -1 );
+  firstMatch = first;
+  matchCount += hit - first;
+  const INDEX last = FindLast( phrase, hit, upper, 1 );
+  lastMatch = last;
+  matchCount += last - hit;
+  return matchCount;
+}
+// Find the boundary of the run of matches that contains `start`.
+// `start` must be an index that matches `phrase`; `end` is the limit of the
+// search in the given direction (-1: towards the front, 1: towards the back).
+// Returns the outermost matching index in that direction.
+// The neighbour of the probed index can lie outside the array when the run
+// touches either end; it is then treated as a non-match instead of being read.
+SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction )
+{
+  // move the limit one step outwards; unsigned wrap-around for a limit of 0
+  // and direction -1 is intended and cancels out in the midpoint below
+  end += direction;
+  while(true) {
+    INDEX mid = ( start + end + (direction>0 ? 0 : 1) )/2;
+    int match = Match( phrase, mid );
+    // a wrapped-around -1 is >= m_size as well, so one test covers both ends
+    const INDEX next = mid + direction;
+    int matchNext = (next < m_size) ? Match( phrase, next ) : 1;
+    if (match == 0 && matchNext != 0) return mid;
+    if (match == 0) // mid point is a match
+      start = mid;
+    else
+      end = mid;
+  }
+}
+// Binary search for any index in [start, end] whose suffix begins with
+// `phrase`. Returns that index, or m_size if there is none.
+// `start` and `end` are narrowed in place; LimitedCount reuses the narrowed
+// bounds as limits for FindLast.
+// An empty or out-of-array range is reported as "no match" without probing,
+// so the search never reads one element past `end`.
+SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end )
+{
+  if (end >= m_size) return m_size;
+  while(true) {
+    // start can step past end after "start = mid+1" with mid == end
+    if (start > end) return m_size;
+    INDEX mid = ( start + end + 1 )/2;
+    int match = Match( phrase, mid );
+    if (match == 0) return mid;
+    if (start >= end && match != 0 ) return m_size;
+    if (match > 0)
+      start = mid+1;
+    else
+      end = mid-1;
+  }
+}
+// Compare `phrase` with the suffix referenced by array index `index`.
+// Returns 0 if all compared words are equal, otherwise the result of
+// CompareWord for the first differing word (phrase word first). The
+// comparison stops at the end of the corpus, so a phrase that runs past it
+// still counts as equal.
+int SuffixArray::Match( const vector< WORD > &phrase, INDEX index )
+{
+  const INDEX pos = m_index[ index ];
+  INDEX offset = 0;
+  while( offset<phrase.size() && offset+pos<m_size ) {
+    const int cmp = CompareWord( m_vcb.GetWordID( phrase[offset] ), m_array[ pos+offset ] );
+    if (cmp != 0)
+      return cmp;
+    ++offset;
+  }
+  return 0;
+}
+// Debugging aid: walks array indices start..end and up to five words of each
+// suffix. All output statements are commented out, so nothing is printed.
+void SuffixArray::List(INDEX start, INDEX end)
+{
+  INDEX i = start;
+  while( i<=end ) {
+    INDEX pos = m_index[ i ];
+    // cerr << i << ":" << pos << "\t";
+    int j = 0;
+    while( j<5 && j+pos<m_size ) {
+      //cout << " " << m_vcb.GetWord( m_array[ pos+j ] );
+      j++;
+    }
+    // cerr << "\n";
+    i++;
+  }
+}
+}

mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.h ADDED Viewed

	@@ -0,0 +1,69 @@

+#include "Vocabulary.h"
+#pragma once
+#define LINE_MAX_LENGTH 10000
+namespace tmmt
+{
+// Suffix array over a tokenised corpus. The corpus is stored as one long
+// sequence of word ids (sentences separated by an end-of-sentence marker);
+// m_index holds the corpus positions sorted by the suffix starting there.
+class SuffixArray
+{
+public:
+  typedef unsigned int INDEX;
+private:
+  // the sentences as read from the file
+  std::vector< std::vector< WORD_ID > > corpus;
+  // word ids of the whole corpus, including sentence boundaries
+  WORD_ID *m_array;
+  // corpus positions in sorted suffix order
+  INDEX *m_index;
+  // scratch space used while sorting
+  INDEX *m_buffer;
+  // per corpus position: offset of the word inside its sentence
+  char *m_wordInSentence;
+  // per corpus position: id of the sentence it belongs to
+  size_t *m_sentence;
+  // per sentence: number of words
+  char *m_sentenceLength;
+  WORD_ID m_endOfSentence;
+  Vocabulary m_vcb;
+  // number of entries in m_array / m_index
+  INDEX m_size;
+public:
+  SuffixArray( std::string fileName );
+  ~SuffixArray();
+  void Sort(INDEX start, INDEX end);
+  int CompareIndex( INDEX a, INDEX b ) const;
+  inline int CompareWord( WORD_ID a, WORD_ID b ) const;
+  int Count( const std::vector< WORD > &phrase );
+  bool MinCount( const std::vector< WORD > &phrase, INDEX min );
+  bool Exists( const std::vector< WORD > &phrase );
+  // search_end == -1 means "up to the last index"
+  int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
+  // defaults cover the whole array, like FindMatches: search_end == -1 is
+  // translated to m_size-1 by the implementation
+  int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 );
+  INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end );
+  INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction );
+  int Match( const std::vector< WORD > &phrase, INDEX index );
+  void List( INDEX start, INDEX end );
+  // corpus position stored at a sorted array index
+  inline INDEX GetPosition( INDEX index ) {
+    return m_index[ index ];
+  }
+  inline size_t GetSentence( INDEX position ) {
+    return m_sentence[position];
+  }
+  inline char GetWordInSentence( INDEX position ) {
+    return m_wordInSentence[position];
+  }
+  inline char GetSentenceLength( size_t sentenceId ) {
+    return m_sentenceLength[sentenceId];
+  }
+  inline INDEX GetSize() {
+    return m_size;
+  }
+  Vocabulary &GetVocabulary() {
+    return m_vcb;
+  }
+  const std::vector< std::vector< WORD_ID > > &GetCorpus() const {
+    return corpus;
+  }
+};
+}

mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.cpp ADDED Viewed

	@@ -0,0 +1,71 @@

+// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $
+#include "Vocabulary.h"
+#ifdef WITH_THREADS
+#include <boost/thread/locks.hpp>
+#endif
+using namespace std;
+namespace tmmt
+{
+// as in beamdecoder/tables.cpp
+// Splits a NUL-terminated line on spaces and tabs and returns the id of each
+// word in order. Every word is passed through StoreIfNew, so unseen words are
+// added to the vocabulary as a side effect.
+vector<WORD_ID> Vocabulary::Tokenize( const char input[] )
+{
+  vector< WORD_ID > ids;
+  bool inWord = false;
+  int wordBegin = 0;
+  int pos = 0;
+  while (input[pos] != '\0') {
+    const char c = input[pos];
+    const bool blank = (c == ' ' || c == '\t');
+    if (blank) {
+      // A separator right after a word closes that word.
+      if (inWord) {
+        ids.push_back( StoreIfNew ( string( input+wordBegin, pos-wordBegin ) ) );
+        inWord = false;
+      }
+    } else if (!inWord) {
+      // First character of a new word.
+      wordBegin = pos;
+      inWord = true;
+    }
+    ++pos;
+  }
+  // Flush a word that runs up to the terminating NUL.
+  if (inWord)
+    ids.push_back( StoreIfNew ( string( input+wordBegin, pos-wordBegin ) ) );
+  return ids;
+}
+// Returns the id of `word`, adding it to the vocabulary if it is not present.
+// New words receive the next free index of `vocab`.
+//
+// With WITH_THREADS the lookup first runs under a shared lock. Because that
+// lock is released before the exclusive lock is taken, another thread may
+// store the same word in between; the lookup is therefore repeated under the
+// exclusive lock so a word is never stored twice with two different ids.
+WORD_ID Vocabulary::StoreIfNew( const WORD& word )
+{
+  {
+    // read-lock scope: fast path for words that are already known
+#ifdef WITH_THREADS
+    boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+#endif
+    map<WORD, WORD_ID>::iterator i = lookup.find( word );
+    if( i != lookup.end() )
+      return i->second;
+  }
+#ifdef WITH_THREADS
+  boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+  // Re-check: the word may have been inserted while no lock was held.
+  map<WORD, WORD_ID>::iterator again = lookup.find( word );
+  if( again != lookup.end() )
+    return again->second;
+#endif
+  WORD_ID id = vocab.size();
+  vocab.push_back( word );
+  lookup[ word ] = id;
+  return id;
+}
+// Looks up `word` without modifying the vocabulary.
+// Returns the stored id, or 0 when the word is unknown.
+// NOTE(review): StoreIfNew hands out ids starting from vocab.size(), so 0 may
+// also be the id of a real word — callers cannot tell the two cases apart.
+WORD_ID Vocabulary::GetWordID( const WORD &word )
+{
+#ifdef WITH_THREADS
+  boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+#endif
+  map<WORD, WORD_ID>::iterator hit = lookup.find( word );
+  return ( hit != lookup.end() ) ? (WORD_ID) hit->second : 0;
+}
+}

mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.h ADDED Viewed

	@@ -0,0 +1,46 @@

+// $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $
+#pragma once
+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include <cstdlib>
+#include <string>
+#include <queue>
+#include <map>
+#include <cmath>
+#ifdef WITH_THREADS
+#include <boost/thread/shared_mutex.hpp>
+#endif
+namespace tmmt
+{
+typedef std::string WORD;
+typedef unsigned int WORD_ID;
+// Bidirectional word <-> id table: `lookup` maps a word to its id and `vocab`
+// maps an id (used as index) back to the word.
+// NOTE(review): std::vector is used below, but <vector> is not among the
+// headers this file includes directly — it is presumably pulled in
+// transitively (e.g. via <queue>); confirm.
+class Vocabulary
+{
+public:
+  // Both containers are public; code that touches them directly bypasses
+  // m_accessLock.
+  std::map<WORD, WORD_ID> lookup;
+  std::vector< WORD > vocab;
+  WORD_ID StoreIfNew( const WORD& );
+  WORD_ID GetWordID( const WORD& );
+  std::vector<WORD_ID> Tokenize( const char[] );
+  // Returns a mutable reference to the stored word even though the method is
+  // const: the C-style cast strips constness. No bounds check on `id`, and no
+  // lock is taken here.
+  inline WORD &GetWord( WORD_ID id ) const {
+    WORD &i = (WORD&) vocab[ id ];
+    return i;
+  }
+protected:
+#ifdef WITH_THREADS
+  //reader-writer lock
+  mutable boost::shared_mutex m_accessLock;
+#endif
+};
+}

mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.cpp ADDED Viewed

	@@ -0,0 +1,387 @@

+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include <vector>
+#include <string>
+#include "moses/Util.h"
+#include "Alignments.h"
+using namespace std;
+using namespace Moses;
+// Returns a copy of `str` with every leading and trailing character that
+// occurs in `dropChars` removed. A string made up only of such characters
+// (or an empty string) yields an empty result.
+inline const std::string TrimInternal(const std::string& str, const std::string dropChars = " \t\n\r")
+{
+  // npos + 1 wraps to 0, so an all-droppable input is cut down to "".
+  const std::string::size_type lastKept = str.find_last_not_of(dropChars);
+  std::string trimmed(str, 0, lastKept + 1);
+  const std::string::size_type firstKept = trimmed.find_first_not_of(dropChars);
+  trimmed.erase(0, firstKept);
+  return trimmed;
+}
+// Plain result bundle returned by createXML().
+class CreateXMLRetValues
+{
+public:
+  string frame;            // XML frame text
+  string ruleS;            // source side of the rule
+  string ruleT;            // target side of the rule
+  string ruleAlignment;    // "s-t" alignment points
+  string ruleAlignmentInv; // the same points written as "t-s"
+};
+CreateXMLRetValues createXML(int ruleCount, const string &source, const string &input, const string &target, const string &align, const string &path );
+// Converts a fuzzy-match result file into rule-extract files.
+//
+// The file at `inPath` is read as a sequence of 8-line records:
+//   0 sentence id (parsed, unused)   4 target
+//   1 score (parsed, unused)         5 alignment
+//   2 source                         6 edit path
+//   3 input sentence                 7 count
+// For every complete record one line is written to `inPath + ".extract"` and
+// one (with source/target swapped) to `inPath + ".extract.inv"`.
+//
+// The input sentence is expected to be the same in every record; this is only
+// checked with assert().
+void create_xml(const string &inPath)
+{
+  ifstream inStrme(inPath.c_str());
+  ofstream rule((inPath + ".extract").c_str());
+  ofstream ruleInv((inPath + ".extract.inv").c_str());
+  string source, target, align, path;
+  string input;            // input sentence taken from the first record
+  bool haveInput = false;  // true once `input` has been set
+  int ruleCount = 1;
+  string inLine;
+  int step = 0;
+  // Loop on the result of getline() itself so that a failed read at end of
+  // file is never processed as if it were an (empty) line.
+  while (getline(inStrme, inLine)) {
+    switch (step) {
+    case 0:
+      /* sentence id, not used */
+      Scan<int>(inLine);
+      ++step;
+      break;
+    case 1:
+      /* score, not used */
+      Scan<float>(inLine);
+      ++step;
+      break;
+    case 2:
+      source = inLine;
+      ++step;
+      break;
+    case 3:
+      if (!haveInput) {
+        input = inLine;
+        haveInput = true;
+      } else {
+        assert(inLine == input);
+      }
+      ++step;
+      break;
+    case 4:
+      target = inLine;
+      ++step;
+      break;
+    case 5:
+      align = inLine;
+      ++step;
+      break;
+    case 6:
+      // The trailing "X" is a sentinel action that closes the last mismatch.
+      path = inLine + "X";
+      ++step;
+      break;
+    case 7: {
+      const int count = Scan<int>(inLine);
+      CreateXMLRetValues ret = createXML(ruleCount, source, input, target, align, path);
+      rule << ret.ruleS << " [X] ||| " << ret.ruleT << " [X] ||| " << ret.ruleAlignment
+           << " ||| " << count << endl;
+      ruleInv << ret.ruleT << " [X] ||| " << ret.ruleS << " [X] ||| " << ret.ruleAlignmentInv
+              << " ||| " << count << endl;
+      ++ruleCount;
+      step = 0;
+      break;
+    }
+    }
+  }
+  ruleInv.close();
+  rule.close();
+  inStrme.close();
+}
+// Builds one hierarchical rule and an XML frame from a fuzzy-match record.
+//
+//   ruleCount  running rule number (currently unused)
+//   source     source side of the matched translation-memory entry
+//   input      sentence to be translated
+//   target     target side of the matched entry
+//   align      word alignment between source and target
+//   path       edit path; each character is one action. "M" and "X" are
+//              treated as matching, "I" does not advance the source position
+//              and "D" does not advance the input position.
+//
+// Returns the rule source/target sides, the rule alignment ("s-t" points), the
+// inverse alignment ("t-s" points) and the frame string. The frame is also
+// written to stderr.
+//
+// Fixes relative to the previous version:
+//  * inverse alignment points are now separated by a blank instead of being
+//    glued together;
+//  * input inserted before the first target word (frameInput[-1]) is now
+//    copied to the frame when it exists, instead of only when it is absent;
+//  * an included target span is closed when the current word is NOT included
+//    (or input is inserted here), as the accompanying comment describes.
+CreateXMLRetValues createXML(int ruleCount, const string &source, const string &input, const string &target, const string &align, const string &path)
+{
+  CreateXMLRetValues ret;
+  vector<string> sourceToks  = Tokenize(source, " ");
+  vector<string> inputToks   = Tokenize(input, " ");
+  vector<string> targetsToks = Tokenize(target, " ");
+  Alignments alignments(align, sourceToks.size(), targetsToks.size());
+  map<int, string> frameInput;
+  map<int, int> alignI2S;
+  vector< map<string, int> > nonTerms;
+  vector<bool> targetBitmap(targetsToks.size(), true);
+  vector<bool> inputBitmap;
+  // STEP 1: FIND MISMATCHES
+  int s = 0, i = 0;
+  bool currently_matching = false;
+  int start_s = 0, start_i = 0;
+  for ( int p = 0 ; p < int(path.length()) ; p++ ) {
+    string action = path.substr(p, 1);
+    // beginning of a mismatch
+    if ( currently_matching && action != "M" && action != "X" ) {
+      start_i            = i;
+      start_s            = s;
+      currently_matching = 0;
+    } // if ( currently_matching
+    // end of a mismatch
+    else if ( !currently_matching && ( action == "M" || action == "X" ) ) {
+      // remove use of affected target words
+      for ( int ss = start_s ; ss < s ; ss++ ) {
+        const std::map<int, int> &targets = alignments.m_alignS2T[ss];
+        std::map<int, int>::const_iterator iter;
+        for (iter = targets.begin(); iter != targets.end(); ++iter) {
+          int tt = iter->first;
+          targetBitmap[tt] = 0;
+        }
+        // also remove enclosed unaligned words?
+      } //for ( int ss = start_s ; ss < s ; ss++ ) {
+      // are there input words that need to be inserted ?
+      if (start_i < i ) {
+        // take note of input words to be inserted
+        string insertion = "";
+        for (int ii = start_i ; ii < i ; ii++ ) {
+          insertion += inputToks[ii] + " ";
+        }
+        // find position for inserted input words
+        // find first removed target word
+        // (1000 is the "not found" sentinel for start_t)
+        int start_t = 1000;
+        for ( int ss = start_s ; ss < s ; ss++ ) {
+          const std::map<int, int> &targets = alignments.m_alignS2T[ss];
+          std::map<int, int>::const_iterator iter;
+          for (iter = targets.begin(); iter != targets.end(); ++iter) {
+            int tt = iter->first;
+            if (tt < start_t) {
+              start_t = tt;
+            }
+          }
+        }
+        // end of sentence? add to end
+        if ( start_t == 1000 && i > int(inputToks.size()) - 1 ) {
+          start_t = targetsToks.size() - 1;
+        }
+        // backtrack to previous words if unaligned
+        if ( start_t == 1000 ) {
+          start_t = -1;
+          for ( int ss = s - 1 ; start_t == -1 && ss >= 0 ; ss-- ) {
+            const std::map<int, int> &targets = alignments.m_alignS2T[ss];
+            std::map<int, int>::const_iterator iter;
+            for (iter = targets.begin(); iter != targets.end(); ++iter) {
+              int tt = iter->first;
+              if (tt > start_t) {
+                start_t = tt;
+              }
+            }
+          }
+        } // if ( start_t == 1000 ) {
+        frameInput[start_t] += insertion;
+        map<string, int> nt;
+        nt["start_t"] = start_t;
+        nt["start_i"] = start_i;
+        nonTerms.push_back(nt);
+      } // if (start_i < i ) {
+      currently_matching = 1;
+    } // else if ( !currently_matching
+    // Debug traversal of the targets aligned to s; its output is disabled.
+    if ( action != "I" ) {
+      if (s < int(alignments.m_alignS2T.size())) {
+        const std::map<int, int> &targets = alignments.m_alignS2T[s];
+        std::map<int, int>::const_iterator iter;
+        for (iter = targets.begin(); iter != targets.end(); ++iter) {
+          // int tt = iter->first;
+          //cerr << " " << tt;
+        }
+      }
+    }
+    if (action != "I")
+      s++;
+    if (action != "D") {
+      i++;
+      alignI2S[i] = s;
+    }
+    if (action == "M") {
+      inputBitmap.push_back(1);
+    } else if (action == "I" || action == "S") {
+      inputBitmap.push_back(0);
+    }
+  } // for ( int p = 0
+  // STEP 2: BUILD RULE AND FRAME
+  // hierarchical rule
+  int rule_pos_s = 0;
+  map<int, int> ruleAlignS;
+  for (int i = 0 ; i < int(inputBitmap.size()) ; ++i ) {
+    if ( inputBitmap[i] ) {
+      ret.ruleS += inputToks[i] + " ";
+      ruleAlignS[ alignI2S[i] ] = rule_pos_s++;
+    }
+    for (size_t j = 0; j < nonTerms.size(); ++j) {
+      map<string, int> &nt = nonTerms[j];
+      if (i == nt["start_i"]) {
+        ret.ruleS += "[X][X] ";
+        nt["rule_pos_s"] = rule_pos_s++;
+      }
+    }
+  }
+  int rule_pos_t = 0;
+  map<int, int> ruleAlignT;
+  // t starts at -1 so that non-terminals placed before the first target word
+  // (start_t == -1) are emitted too.
+  for (int t = -1 ; t < (int) targetBitmap.size(); t++ ) {
+    if (t >= 0 && targetBitmap[t]) {
+      ret.ruleT += targetsToks[t] + " ";
+      ruleAlignT[t] = rule_pos_t++;
+    }
+    for (size_t i = 0; i < nonTerms.size(); ++i) {
+      map<string, int> &nt = nonTerms[i];
+      if (t == nt["start_t"]) {
+        ret.ruleT += "[X][X] ";
+        nt["rule_pos_t"] = rule_pos_t++;
+      }
+    }
+  }
+  ret.ruleAlignment = "";
+  for (map<int, int>::const_iterator iter = ruleAlignS.begin(); iter != ruleAlignS.end(); ++iter) {
+    int s = iter->first;
+    if (s < int(alignments.m_alignS2T.size())) {
+      const std::map<int, int> &targets = alignments.m_alignS2T[s];
+      std::map<int, int>::const_iterator iter;
+      for (iter = targets.begin(); iter != targets.end(); ++iter) {
+        int t =iter->first;
+        if (ruleAlignT.find(t) == ruleAlignT.end())
+          continue;
+        ret.ruleAlignment += SPrint(ruleAlignS[s]) + "-" + SPrint(ruleAlignT[t]) + " ";
+      }
+    }
+  }
+  for (size_t i = 0; i < nonTerms.size(); ++i) {
+    map<string, int> &nt = nonTerms[i];
+    ret.ruleAlignment += SPrint(nt["rule_pos_s"]) + "-" + SPrint(nt["rule_pos_t"]) + " ";
+  }
+  ret.ruleS = TrimInternal(ret.ruleS);
+  ret.ruleT = TrimInternal(ret.ruleT);
+  ret.ruleAlignment = TrimInternal(ret.ruleAlignment);
+  vector<string> ruleAlignmentToks = Tokenize(ret.ruleAlignment);
+  for (size_t i = 0; i < ruleAlignmentToks.size(); ++i) {
+    const string &alignPoint = ruleAlignmentToks[i];
+    vector<string> toks = Tokenize(alignPoint, "-");
+    assert(toks.size() == 2);
+    // Keep the points blank-separated; the trailing blank is trimmed below.
+    ret.ruleAlignmentInv += toks[1] + "-" + toks[0] + " ";
+  }
+  ret.ruleAlignmentInv = TrimInternal(ret.ruleAlignmentInv);
+  // frame
+  // Start with input that has to go before the first target word, if any.
+  map<int, string>::const_iterator leading = frameInput.find(-1);
+  if (leading != frameInput.end())
+    ret.frame = leading->second;
+  int currently_included = 0;
+  int start_t            = -1;
+  // Sentinel "not included" entry so the last span is closed inside the loop.
+  targetBitmap.push_back(0);
+  for (size_t t = 0 ; t <= targetsToks.size() ; t++ ) {
+    // beginning of tm target inclusion
+    if ( !currently_included && targetBitmap[t] ) {
+      start_t            = t;
+      currently_included = 1;
+    }
+    // end of tm target inclusion (not included word or inserted input)
+    else if (currently_included
+             && ( !targetBitmap[t] || frameInput.find(t) != frameInput.end() )
+            ) {
+      // add xml (unless change is at the beginning of the sentence
+      if ( start_t >= 0 ) {
+        string target = "";
+        // When the span ends because of inserted input, word t itself is
+        // still included, hence the "+ targetBitmap[t]".
+        for (size_t tt = start_t ; tt < t + targetBitmap[t] ; tt++ ) {
+          target += targetsToks[tt] + " ";
+        }
+        // target = Trim(target); TODO
+        ret.frame += "<xml translation=\"" + target + "\"> x </xml> ";
+      }
+      currently_included = 0;
+    }
+    if (frameInput.find(t) != frameInput.end())
+      ret.frame += frameInput[t];
+  } //for (int t = 0
+  cerr << ret.frame << "\n-------------------------------------\n";
+  return ret;
+}

mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.h ADDED Viewed

	@@ -0,0 +1,5 @@

+#pragma once
+#include <string>
+void create_xml(const std::string &inPath);

mosesdecoder/moses/server/Hypothesis_4server.cpp ADDED Viewed

	@@ -0,0 +1,37 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+#include "moses/Hypothesis.h"
+#include "moses/Manager.h"
+#include <boost/foreach.hpp>
+namespace Moses {
+  // Appends one XML-RPC struct per word-alignment point of this hypothesis's
+  // current target phrase to `dest`. Each struct has the keys "source-word"
+  // and "target-word"; the values are the alignment offsets shifted by the
+  // start of the current source / target word range.
+  void
+  Hypothesis::
+  OutputLocalWordAlignment(std::vector<xmlrpc_c::value>& dest) const
+  {
+    using namespace std;
+    typedef pair<size_t,size_t> AlnPoint;
+    Range const& srcRange = this->GetCurrSourceWordsRange();
+    Range const& trgRange = this->GetCurrTargetWordsRange();
+    WordAlignmentSort sortOrder = m_manager.options()->output.WA_SortOrder;
+    vector<AlnPoint const* > points
+      = this->GetCurrTargetPhrase().GetAlignTerm().GetSortedAlignments(sortOrder);
+    for (vector<AlnPoint const* >::const_iterator it = points.begin();
+         it != points.end(); ++it) {
+      AlnPoint const* point = *it;
+      map<string, xmlrpc_c::value> entry;
+      entry["source-word"] = xmlrpc_c::value_int(srcRange.GetStartPos() + point->first);
+      entry["target-word"] = xmlrpc_c::value_int(trgRange.GetStartPos() + point->second);
+      dest.push_back(xmlrpc_c::value_struct(entry));
+    }
+  }
+  // Emits the word alignment of the whole derivation: follows GetPrevHypo()
+  // back from this hypothesis, then outputs the local alignment of every
+  // hypothesis on that chain, earliest one first.
+  void
+  Hypothesis::
+  OutputWordAlignment(std::vector<xmlrpc_c::value>& out) const
+  {
+    typedef std::vector<Hypothesis const*> chain_t;
+    chain_t chain;
+    Hypothesis const* cur = this;
+    while (cur) {
+      chain.push_back(cur);
+      cur = cur->GetPrevHypo();
+    }
+    // The chain was collected newest-first; walk it backwards.
+    for (chain_t::const_reverse_iterator r = chain.rbegin(); r != chain.rend(); ++r)
+      (*r)->OutputLocalWordAlignment(out);
+  }
+}

mosesdecoder/moses/server/Optimizer.h ADDED Viewed

	@@ -0,0 +1,17 @@

+// -*- c++ -*-
+// Guard against multiple inclusion: this header defines a class and is
+// included by Server.h.
+#pragma once
+#include <xmlrpc-c/base.hpp>
+#include <xmlrpc-c/registry.hpp>
+#include <xmlrpc-c/server_abyss.hpp>
+namespace MosesServer
+{
+// XML-RPC method object for the server's optimizer call.
+class
+  Optimizer : public xmlrpc_c::method
+{
+public:
+  Optimizer();
+  // Handles one request: reads its arguments from `paramList` and stores the
+  // result through `retvalP`.
+  void execute(xmlrpc_c::paramList const& paramList,
+               xmlrpc_c::value *   const  retvalP);
+};
+}

mosesdecoder/moses/server/PackScores.cpp ADDED Viewed

	@@ -0,0 +1,45 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+#include "PackScores.h"
+#include "moses/FF/StatefulFeatureFunction.h"
+#include "moses/FF/StatelessFeatureFunction.h"
+#include <boost/foreach.hpp>
+namespace Moses {
+// Packs the scores of one feature function into M, keyed by the feature
+// function's score producer description. The stored value is a two-element
+// array:
+//   [0] array of the dense scores of ff's tuneable components,
+//   [1] struct mapping feature name -> value for every entry iterated by
+//       S.cbegin()..S.cend().
+// NOTE(review): the second element is built from all of S, not only from
+// entries belonging to ff — confirm this is intended.
+void
+PackScores(FeatureFunction const& ff, FVector const& S,
+	   std::map<std::string, xmlrpc_c::value>& M)
+{
+  size_t const numComponents = ff.GetNumScoreComponents();
+  size_t const offset = ff.GetIndex();
+
+  std::vector<xmlrpc_c::value> denseScores;
+  denseScores.reserve(numComponents);
+  for (size_t k = 0; k < numComponents; ++k) {
+    if (!ff.IsTuneableComponent(k)) continue;
+    denseScores.push_back(xmlrpc_c::value_double(S[offset+k]));
+  }
+
+  std::map<std::string,xmlrpc_c::value> sparseScores;
+  for (FVector::FNVmap::const_iterator f = S.cbegin(); f != S.cend(); ++f)
+    sparseScores[f->first.name()] = xmlrpc_c::value_double(f->second);
+
+  std::vector<xmlrpc_c::value> packed;
+  packed.push_back(xmlrpc_c::value_array(denseScores));
+  packed.push_back(xmlrpc_c::value_struct(sparseScores));
+  M[ff.GetScoreProducerDescription()] = xmlrpc_c::value_array(packed);
+}
+// Packs the scores in S for every tuneable feature function — stateful ones
+// first, then stateless ones — and returns them as one XML-RPC struct keyed
+// by feature function description.
+xmlrpc_c::value
+PackScores(ScoreComponentCollection const& S)
+{
+  typedef StatefulFeatureFunction Stateful;
+  typedef StatelessFeatureFunction Stateless;
+  std::map<std::string, xmlrpc_c::value> packed;
+  BOOST_FOREACH(Stateful const* sf, Stateful::GetStatefulFeatureFunctions()) {
+    if (!sf->IsTuneable()) continue;
+    PackScores(*sf, S.GetScoresVector(), packed);
+  }
+  BOOST_FOREACH(Stateless const* sl, Stateless::GetStatelessFeatureFunctions()) {
+    if (!sl->IsTuneable()) continue;
+    PackScores(*sl, S.GetScoresVector(), packed);
+  }
+  return xmlrpc_c::value_struct(packed);
+}
+}

mosesdecoder/moses/server/PackScores.h ADDED Viewed

	@@ -0,0 +1,10 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+#pragma once
+#include <xmlrpc-c/base.hpp>
+#include "moses/FF/FeatureFunction.h"
+#include "moses/ScoreComponentCollection.h"
+namespace Moses {
+  // Returns the scores in S as an XML-RPC struct with one entry per tuneable
+  // feature function, keyed by the feature function's description.
+  xmlrpc_c::value
+  PackScores(ScoreComponentCollection const& S);
+}

mosesdecoder/moses/server/Server.h ADDED Viewed

	@@ -0,0 +1,46 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
+#pragma once
+#include "moses/TypeDef.h"
+#ifdef WITH_THREADS
+#include <boost/thread.hpp>
+#include "moses/ThreadPool.h"
+#endif
+#include <xmlrpc-c/base.hpp>
+#include <xmlrpc-c/registry.hpp>
+#include <xmlrpc-c/server_abyss.hpp>
+#include "Translator.h"
+#include "Optimizer.h"
+#include "Updater.h"
+#include "CloseSession.h"
+#include "Session.h"
+#include "moses/parameters/ServerOptions.h"
+#include <string>
+namespace MosesServer
+{
+  // Declares the Moses XML-RPC server object: it owns the server options, the
+  // session cache, the XML-RPC registry and one method object per call
+  // (updater, optimizer, translator, close-session).
+  // NOTE(review): members are initialized in declaration order, so the
+  // constructor's initializer list should follow the order below — the
+  // constructor itself is not visible here.
+  class Server
+  {
+    Moses::ServerOptions m_server_options;
+    SessionCache   m_session_cache;
+    xmlrpc_c::registry m_registry;
+    xmlrpc_c::methodPtr const m_updater;
+    xmlrpc_c::methodPtr const m_optimizer;
+    xmlrpc_c::methodPtr const m_translator;
+    xmlrpc_c::methodPtr const m_close_session;
+    std::string m_pidfile;
+  public:
+    Server(Moses::Parameter& params);
+    ~Server();
+    // presumably starts serving requests and returns an exit status — TODO confirm
+    int run();
+    // Session ids are 64-bit in this interface.
+    void delete_session(uint64_t const session_id);
+    Moses::ServerOptions const&
+    options() const;
+    Session const&
+    get_session(uint64_t session_id);
+  };
+}

mosesdecoder/moses/server/Session.h ADDED Viewed

	@@ -0,0 +1,75 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width: 2 -*-
+#pragma once
+#include "moses/Util.h"
+#include "moses/ContextScope.h"
+#include "moses/parameters/AllOptions.h"
+#include <sys/time.h>
+#include <boost/unordered_map.hpp>
+#ifdef WITH_THREADS
+#include <boost/thread/shared_mutex.hpp>
+#include <boost/thread/locks.hpp>
+#endif
+namespace MosesServer{
+  // State kept for one client session: its id, creation and last-access
+  // times, a ContextScope of its own, and optional context weights.
+  struct Session
+  {
+    uint64_t const id;
+    time_t start_time;   // set once, at construction
+    time_t last_access;  // equals start_time until it is updated elsewhere
+    boost::shared_ptr<Moses::ContextScope> const scope; // stores local info
+    SPTR<std::map<std::string,float> > m_context_weights;
+
+    // Creates a session with a fresh ContextScope; both timestamps are set to
+    // the current time.
+    Session(uint64_t const session_id)
+      : id(session_id)
+      , start_time(time(NULL))
+      , last_access(start_time)
+      , scope(new Moses::ContextScope)
+    {
+    }
+
+    // True while the last-access time still equals the creation time.
+    bool is_new() const
+    {
+      return start_time == last_access;
+    }
+
+    void setup(std::map<std::string, xmlrpc_c::value> const& params);
+  };
+  // Cache of sessions keyed by 64-bit session id.
+  //
+  // Ids are handed out from m_session_counter, which starts at 1 and is
+  // incremented before use, so the first session gets id 2. Ids 0 and 1 are
+  // never looked up and always lead to the creation of a new session.
+  //
+  // The id parameters are uint64_t to match the counter, the map key and
+  // Session::id; the previous uint32_t parameters silently truncated ids.
+  class SessionCache
+  {
+    mutable boost::shared_mutex m_lock;
+    uint64_t m_session_counter;
+    boost::unordered_map<uint64_t,Session> m_cache;
+  public:
+    SessionCache() : m_session_counter(1) {}
+    // Returns the session with the given id and refreshes its last-access
+    // time. If the id is 0 or 1, or unknown, a new session with a fresh id is
+    // created and returned instead.
+    // NOTE(review): the returned reference is not protected by the lock once
+    // this function returns; a concurrent erase() would invalidate it.
+    Session const&
+    operator[](uint64_t id)
+    {
+      boost::upgrade_lock<boost::shared_mutex> lock(m_lock);
+      if (id > 1)
+        {
+          boost::unordered_map<uint64_t, Session>::iterator m = m_cache.find(id);
+          if (m != m_cache.end())
+            {
+              m->second.last_access = time(NULL);
+              return m->second;
+            }
+        }
+      // Not found: switch to exclusive access before modifying the cache.
+      boost::upgrade_to_unique_lock<boost::shared_mutex> xlock(lock);
+      id = ++m_session_counter;
+      std::pair<uint64_t, Session> foo(id, Session(id));
+      return m_cache.insert(foo).first->second;
+    }
+    // Removes the session with the given id, if present.
+    void
+    erase(uint64_t const id)
+    {
+      boost::unique_lock<boost::shared_mutex> lock(m_lock);
+      m_cache.erase(id);
+    }
+  };
+}

mosesdecoder/moses/server/TranslationRequest.cpp ADDED Viewed

	@@ -0,0 +1,524 @@

+#include "TranslationRequest.h"
+#include "PackScores.h"
+#include "moses/ContextScope.h"
+#include <boost/foreach.hpp>
+#include "moses/Util.h"
+#include "moses/Hypothesis.h"
+namespace MosesServer
+{
+using namespace std;
+using Moses::Hypothesis;
+using Moses::StaticData;
+using Moses::Range;
+using Moses::ChartHypothesis;
+using Moses::Phrase;
+using Moses::Manager;
+using Moses::SearchGraphNode;
+using Moses::TrellisPathList;
+using Moses::TranslationOptionCollection;
+using Moses::TranslationOptionList;
+using Moses::TranslationOption;
+using Moses::TargetPhrase;
+using Moses::FValue;
+using Moses::PhraseDictionaryMultiModel;
+using Moses::FindPhraseDictionary;
+using Moses::Sentence;
+using Moses::TokenizeMultiCharSeparator;
+using Moses::FeatureFunction;
+using Moses::Scan;
+// Factory: builds a TranslationRequest for `paramList`, stores a handle to
+// itself in m_self and records the translator it belongs to.
+boost::shared_ptr<TranslationRequest>
+TranslationRequest::
+create(Translator* translator, xmlrpc_c::paramList const& paramList,
+       boost::condition_variable& cond, boost::mutex& mut)
+{
+  boost::shared_ptr<TranslationRequest> request(new TranslationRequest(paramList, cond, mut));
+  request->m_self = request;
+  request->m_translator = translator;
+  return request;
+}
+// Converts the XML-RPC struct `w` (name -> double) into a name -> float map
+// and installs it as the context weights of scope `s`.
+void
+SetContextWeights(Moses::ContextScope& s, xmlrpc_c::value const& w)
+{
+  typedef std::map<std::string,xmlrpc_c::value> tmap;
+  typedef std::map<std::string,float> wmap;
+  SPTR<wmap> weights(new wmap);
+  tmap const entries = static_cast<tmap>(xmlrpc_c::value_struct(w));
+  tmap::const_iterator e = entries.begin();
+  while (e != entries.end()) {
+    (*weights)[e->first] = xmlrpc_c::value_double(e->second);
+    ++e;
+  }
+  s.SetContextWeights(weights);
+}
+// Executes the request: parses the parameter struct, applies optional
+// "context-weights" to the request's scope, runs the chart or the phrase
+// decoder depending on the search algorithm, then marks the request as done
+// and notifies the waiting condition variable.
+void
+TranslationRequest::
+Run()
+{
+  typedef std::map<std::string,xmlrpc_c::value> param_t;
+  param_t const& params = m_paramList.getStruct(0);
+  parse_request(params);
+
+  // settings within the session scope
+  param_t::const_iterator weights = params.find("context-weights");
+  if (weights != params.end()) {
+    SetContextWeights(*m_scope, weights->second);
+  }
+
+  Moses::StaticData const& SD = Moses::StaticData::Instance();
+
+  if (is_syntax(m_options->search.algo)) {
+    run_chart_decoder();
+  } else {
+    run_phrase_decoder();
+  }
+
+  {
+    // Set the flag under the mutex, release it, then wake the waiter.
+    boost::lock_guard<boost::mutex> lock(m_mutex);
+    m_done = true;
+  }
+  m_cond.notify_one();
+}
+/// Appends the phrase alignment of hypothesis h to aInfo as a struct with the
+/// keys "tgt-start", "tgt-end", "src-start" and "src-end". Does nothing
+/// unless alignment info was requested (m_withAlignInfo).
+void
+TranslationRequest::
+add_phrase_aln_info(Hypothesis const& h, vector<xmlrpc_c::value>& aInfo) const
+{
+  if (m_withAlignInfo) {
+    Range const& trgRange = h.GetCurrTargetWordsRange();
+    Range const& srcRange = h.GetCurrSourceWordsRange();
+    std::map<std::string, xmlrpc_c::value> span;
+    span["tgt-start"] = xmlrpc_c::value_int(trgRange.GetStartPos());
+    span["tgt-end"] = xmlrpc_c::value_int(trgRange.GetEndPos());
+    span["src-start"] = xmlrpc_c::value_int(srcRange.GetStartPos());
+    span["src-end"]   = xmlrpc_c::value_int(srcRange.GetEndPos());
+    aInfo.push_back(xmlrpc_c::value_struct(span));
+  }
+}
+// Writes the output phrase of a chart hypothesis to `out`, factor 0 of each
+// word followed by a blank. The first and the last word of the phrase are
+// dropped before printing.
+void
+TranslationRequest::
+outputChartHypo(ostream& out, const ChartHypothesis* hypo)
+{
+  Phrase words(20);
+  hypo->GetOutputPhrase(words);
+
+  // delete 1st & last
+  assert(words.GetSize() >= 2);
+  words.RemoveWord(0);
+  words.RemoveWord(words.GetSize() - 1);
+
+  size_t idx = 0;
+  while (idx < words.GetSize()) {
+    out << *words.GetFactor(idx, 0) << " ";
+    idx++;
+  }
+}
+// Ordering predicate: true when a's hypothesis id is smaller than b's.
+bool
+TranslationRequest::
+compareSearchGraphNode(const Moses::SearchGraphNode& a,
+                       const Moses::SearchGraphNode& b)
+{
+  return b.hypo->GetId() > a.hypo->GetId();
+}
+// Stores the sorted search graph under retData["sg"] as an array of structs.
+// Every node carries "forward", "fscore", "hyp" and "stack"; nodes whose
+// hypothesis id is not 0 additionally carry "back", "score", "transition",
+// "cover-start", "cover-end", "out" and, when the node was recombined,
+// "recombined".
+void
+TranslationRequest::
+insertGraphInfo(Manager& manager, map<string, xmlrpc_c::value>& retData)
+{
+  using xmlrpc_c::value_int;
+  using xmlrpc_c::value_double;
+  using xmlrpc_c::value_struct;
+  using xmlrpc_c::value_string;
+  vector<SearchGraphNode> nodes;
+  manager.GetSearchGraph(nodes);
+  std::sort(nodes.begin(), nodes.end());
+
+  vector<xmlrpc_c::value> packedNodes;
+  for (size_t k = 0; k < nodes.size(); ++k) {
+    Moses::SearchGraphNode const& node = nodes[k];
+    const Hypothesis* cur = node.hypo;
+    map<string, xmlrpc_c::value> fields;
+    fields["forward"] = value_double(node.forward);
+    fields["fscore"] = value_double(node.fscore);
+    fields["hyp"] = value_int(cur->GetId());
+    fields["stack"] = value_int(cur->GetWordsBitmap().GetNumWordsCovered());
+    if (cur->GetId() != 0) {
+      // Everything below relates the hypothesis to its predecessor.
+      const Hypothesis *prev = cur->GetPrevHypo();
+      fields["back"] = value_int(prev->GetId());
+      fields["score"] = value_double(cur->GetScore());
+      fields["transition"] = value_double(cur->GetScore() - prev->GetScore());
+      if (node.recombinationHypo)
+        fields["recombined"] = value_int(node.recombinationHypo->GetId());
+      fields["cover-start"] = value_int(cur->GetCurrSourceWordsRange().GetStartPos());
+      fields["cover-end"] = value_int(cur->GetCurrSourceWordsRange().GetEndPos());
+      fields["out"] = value_string(cur->GetCurrTargetPhrase().GetStringRep(options()->output.factor_order));
+    }
+    packedNodes.push_back(value_struct(fields));
+  }
+  retData["sg"] = xmlrpc_c::value_array(packedNodes);
+}
+// Computes the n-best list and stores it under retData["nbest"] as an array
+// of structs. Each entry holds the packed hypothesis, "totalScore" and, when
+// a score breakdown was requested, "fvals" (text) and "scores" (structured).
+// The n-best list is also written to cout via Manager::OutputNBest.
+void
+TranslationRequest::
+outputNBest(const Manager& manager, map<string, xmlrpc_c::value>& retData)
+{
+  Moses::NBestOptions const& nbo = m_options->nbest;
+  TrellisPathList paths;
+  manager.CalcNBest(nbo.nbest_size, paths, nbo.only_distinct);
+  manager.OutputNBest(cout, paths);
+
+  vector<xmlrpc_c::value> packedPaths;
+  BOOST_FOREACH(Moses::TrellisPath const* path, paths) {
+    vector<const Hypothesis *> const& edges = path->GetEdges();
+    // Paths without edges are skipped.
+    if (edges.empty()) continue;
+    std::map<std::string, xmlrpc_c::value> item;
+    pack_hypothesis(manager, edges, "hyp", item);
+    if (m_withScoreBreakdown) {
+      // should the score breakdown be reported in a more structured manner?
+      ostringstream text;
+      bool with_labels = nbo.include_feature_labels;
+      path->GetScoreBreakdown()->OutputAllFeatureScores(text, with_labels);
+      item["fvals"] = xmlrpc_c::value_string(text.str());
+      item["scores"] = PackScores(*path->GetScoreBreakdown());
+    }
+    // weighted score
+    item["totalScore"] = xmlrpc_c::value_double(path->GetFutureScore());
+    packedPaths.push_back(xmlrpc_c::value_struct(item));
+  }
+  retData["nbest"] = xmlrpc_c::value_array(packedPaths);
+}
+// Stores all translation options of the sentence under retData["topt"] as an
+// array of structs with the keys "phrase", "fscore", "start", "end", "scores"
+// (core feature values) and "labelledScores" (structured, via PackScores).
+//
+// Options are enumerated per source start position s; for each s the end
+// position e grows until GetTranslationOptionList(s,e) returns NULL.
+//
+// The previous version also formatted every option's feature scores into a
+// string stream that was never read; that dead work has been removed.
+void
+TranslationRequest::
+insertTranslationOptions(Moses::Manager& manager,
+                         std::map<std::string, xmlrpc_c::value>& retData)
+{
+  std::vector<Moses::FactorType> const& ofactor_order = options()->output.factor_order;
+  const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions();
+  vector<xmlrpc_c::value> toptsXml;
+  size_t const stop = toptsColl->GetSource().GetSize();
+  TranslationOptionList const* tol;
+  for (size_t s = 0 ; s < stop ; ++s) {
+    for (size_t e=s;(tol=toptsColl->GetTranslationOptionList(s,e))!=NULL;++e) {
+      BOOST_FOREACH(TranslationOption const* topt, *tol) {
+        std::map<std::string, xmlrpc_c::value> toptXml;
+        TargetPhrase const& tp = topt->GetTargetPhrase();
+        std::string tphrase = tp.GetStringRep(ofactor_order);
+        toptXml["phrase"] = xmlrpc_c::value_string(tphrase);
+        toptXml["fscore"] = xmlrpc_c::value_double(topt->GetFutureScore());
+        toptXml["start"]  = xmlrpc_c::value_int(s);
+        toptXml["end"]    = xmlrpc_c::value_int(e);
+        vector<xmlrpc_c::value> scoresXml;
+        const std::valarray<FValue> &scores
+          = topt->GetScoreBreakdown().getCoreFeatures();
+        for (size_t j = 0; j < scores.size(); ++j)
+          scoresXml.push_back(xmlrpc_c::value_double(scores[j]));
+        toptXml["scores"] = xmlrpc_c::value_array(scoresXml);
+        toptXml["labelledScores"] = PackScores(topt->GetScoreBreakdown());
+        toptsXml.push_back(xmlrpc_c::value_struct(toptXml));
+      }
+    }
+  }
+  retData["topt"] = xmlrpc_c::value_array(toptsXml);
+}
+// Stores the condition variable, the mutex and the parameter list of the
+// request; the request starts as not done and with session id 0.
+TranslationRequest::
+TranslationRequest(xmlrpc_c::paramList const& paramList,
+                   boost::condition_variable& cond, boost::mutex& mut)
+  : m_cond(cond)
+  , m_mutex(mut)
+  , m_done(false)
+  , m_paramList(paramList)
+  , m_session_id(0)
+{
+}
+// Reads an on/off switch from the request parameters.
+// Returns false when `key` is absent. A boolean value is returned as is; any
+// other value is read as a string and counts as true only if it is exactly
+// "true", "True", "TRUE" or "1".
+bool
+check(std::map<std::string, xmlrpc_c::value> const& param,
+      std::string const key)
+{
+  typedef std::map<std::string, xmlrpc_c::value> params_t;
+  params_t::const_iterator hit = param.find(key);
+  if (hit == param.end()) return false;
+
+  xmlrpc_c::value const& v = hit->second;
+  if (v.type() == xmlrpc_c::value::TYPE_BOOLEAN)
+    return xmlrpc_c::value_boolean(v);
+
+  std::string const text = static_cast<std::string>(xmlrpc_c::value_string(v));
+  return text == "true" || text == "True" || text == "TRUE" || text == "1";
+}
+/// Parse the XML-RPC parameter struct of one translation request.
+///
+/// Handles the optional keys "session-id", "sg", "topt",
+/// "add-score-breakdown", "lambda" (together with "model_name"), "context",
+/// "context-scope", "align", "word-align" and "weights", and the mandatory
+/// key "text".  On return m_source holds the sentence built from the source
+/// text and interpret_dlt() has been called.
+///
+/// @param params key/value pairs of the request
+/// @throws xmlrpc_c::fault with CODE_PARSE when "text" is missing or when
+///         "model_name" does not resolve to a phrase dictionary
+void
+TranslationRequest::
+parse_request(std::map<std::string, xmlrpc_c::value> const& params)
+{
+  // parse XMLRPC request
+  m_paramList.verifyEnd(1); // ??? UG
+  typedef std::map<std::string, xmlrpc_c::value> params_t;
+  params_t::const_iterator si;
+  // A request that names a session shares that session's scope; any other
+  // request gets a fresh scope of its own.
+  si = params.find("session-id");
+  if (si != params.end()) {
+    m_session_id = xmlrpc_c::value_int(si->second);
+    Session const& S = m_translator->get_session(m_session_id);
+    m_scope = S.scope;
+    m_session_id = S.id;
+  } else {
+    m_session_id = 0;
+    m_scope.reset(new Moses::ContextScope);
+  }
+  // Work on a private copy of the global options; the request parameters
+  // are applied to that copy only.
+  boost::shared_ptr<Moses::AllOptions> opts(new Moses::AllOptions(*StaticData::Instance().options()));
+  opts->update(params);
+  m_withGraphInfo = check(params, "sg");
+  if (m_withGraphInfo || opts->nbest.nbest_size > 0) {
+    opts->output.SearchGraph = "true";
+    opts->nbest.enabled = true;
+  }
+  m_options = opts;
+  // source text must be given, or we don't know what to translate
+  si = params.find("text");
+  if (si == params.end())
+    throw xmlrpc_c::fault("Missing source text", xmlrpc_c::fault::CODE_PARSE);
+  m_source_string = xmlrpc_c::value_string(si->second);
+  XVERBOSE(1,"Input: " << m_source_string << endl);
+  m_withTopts           = check(params, "topt");
+  m_withScoreBreakdown  = check(params, "add-score-breakdown");
+  // Temporary multi-model weights: "lambda" is an array of doubles that is
+  // applied to the phrase dictionary named by "model_name".
+  si = params.find("lambda");
+  if (si != params.end()) {
+    // muMo = multiModel
+    xmlrpc_c::value_array muMoArray = xmlrpc_c::value_array(si->second);
+    vector<xmlrpc_c::value> muMoValVec(muMoArray.vectorValueValue());
+    vector<float> w(muMoValVec.size());
+    for (size_t i = 0; i < muMoValVec.size(); ++i)
+      w[i] = xmlrpc_c::value_double(muMoValVec[i]);
+    if (w.size() && (si = params.find("model_name")) != params.end()) {
+      string const model_name = xmlrpc_c::value_string(si->second);
+      PhraseDictionaryMultiModel* pdmm
+        = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name);
+      // The name comes from the client: report an unknown model as a fault
+      // rather than dereferencing a null pointer.
+      // NOTE(review): assumes FindPhraseDictionary yields a null pointer
+      // for an unknown name -- confirm.
+      if (!pdmm)
+        throw xmlrpc_c::fault("Unknown model_name: " + model_name, xmlrpc_c::fault::CODE_PARSE);
+      pdmm->SetTemporaryMultiModelWeightsVector(w);
+    }
+  }
+  si = params.find("context");
+  if (si != params.end()) {
+    string context = xmlrpc_c::value_string(si->second);
+    VERBOSE(1,"CONTEXT " << context);
+    m_context.reset(new std::vector<std::string>(1,context));
+  }
+  si = params.find("context-scope");
+  if (si != params.end()) {
+    string context = xmlrpc_c::value_string(si->second);
+    string groupSeparator("Moses::ContextScope::GroupSeparator");
+    string recordSeparator("Moses::ContextScope::RecordSeparator");
+    // Here, we assume that any XML-RPC value
+    //       associated with the key "context-scope"
+    //       has the following format:
+    //
+    // FeatureFunctionName followed by recordSeparator
+    //                     followed by the value of interest
+    //                     followed by groupSeparator
+    //
+    // In the following code, the value of interest will be stored
+    //        in contextScope under the key FeatureFunctionName,
+    //        where FeatureFunctionName is the actual name of the feature function
+    boost::shared_ptr<Moses::ContextScope> contextScope = GetScope();
+    BOOST_FOREACH(string const& group, TokenizeMultiCharSeparator(context, groupSeparator)) {
+      vector<string> record = TokenizeMultiCharSeparator(group, recordSeparator);
+      // A group needs both a name and a value; skip anything else instead
+      // of reading past the end of record.
+      if (record.size() < 2) {
+        if (!group.empty()) {
+          TRACE_ERR("WARNING: XML-RPC context-scope entry was improperly formatted:\t" << group << "\n");
+        }
+        continue;
+      }
+      // Use the feature function whose name is record[0] as a key
+      FeatureFunction& ff = Moses::FeatureFunction::FindFeatureFunction(record[0]);
+      void const* key = static_cast<void const*>(&ff);
+      // Store (in the context scope) record[1] as the value associated with that key
+      boost::shared_ptr<string> value = contextScope->get<string>(key,true);
+      value->replace(value->begin(), value->end(), record[1]);
+    }
+  }
+  // Report alignment info if Moses config says to or if XML request says to
+  m_withAlignInfo = options()->output.ReportSegmentation || check(params, "align");
+  // Report word alignment info if Moses config says to or if XML request says to
+  m_withWordAlignInfo = options()->output.PrintAlignmentInfo || check(params, "word-align");
+  // "weights" is a tab-separated list of name=w1 w2 ... entries.
+  si = params.find("weights");
+  if (si != params.end()) {
+    // Index the feature functions by their description for the lookups below.
+    boost::unordered_map<string, FeatureFunction*> map;
+    {
+      const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
+      BOOST_FOREACH(FeatureFunction* const& ff, ffs) {
+        map[ff->GetScoreProducerDescription()] = ff;
+      }
+    }
+    string allValues = xmlrpc_c::value_string(si->second);
+    BOOST_FOREACH(string const& values, TokenizeMultiCharSeparator(allValues, "\t")) {
+      vector<string> record = TokenizeMultiCharSeparator(values, "=");
+      if (record.size() == 2) {
+        string featureName = record[0];
+        string featureWeights = record[1];
+        boost::unordered_map<string, FeatureFunction*>::iterator ffi = map.find(featureName);
+        if (ffi != map.end()) {
+          FeatureFunction* ff = ffi->second;
+          vector<float> ffWeights;
+          BOOST_FOREACH(string const& weight, TokenizeMultiCharSeparator(featureWeights, " ")) {
+            ffWeights.push_back(Scan<float>(weight));
+          }
+          if (ffWeights.size() == ff->GetNumScoreComponents()) {
+            // XXX: This is NOT thread-safe
+            Moses::StaticData::InstanceNonConst().SetWeights(ff, ffWeights);
+            VERBOSE(1, "WARNING: THIS IS NOT THREAD-SAFE!\tUpdating weights for " << featureName << " to " << featureWeights << "\n");
+          } else {
+            TRACE_ERR("ERROR: Unable to update weights for " << featureName << " because " << ff->GetNumScoreComponents() << " weights are required but only " << ffWeights.size() << " were provided\n");
+          }
+        } else {
+          TRACE_ERR("ERROR: No FeatureFunction with name " << featureName << ", no weight update\n");
+        }
+      } else {
+        TRACE_ERR("WARNING: XML-RPC weights update was improperly formatted:\t" << values << "\n");
+      }
+    }
+  }
+  // // biased sampling for suffix-array-based sampling phrase table?
+  // if ((si = params.find("bias")) != params.end())
+  //   {
+  // 	std::vector<xmlrpc_c::value> tmp
+  // 	  = xmlrpc_c::value_array(si->second).cvalue();
+  // 	for (size_t i = 1; i < tmp.size(); i += 2)
+  // 	  m_bias[xmlrpc_c::value_int(tmp[i-1])] = xmlrpc_c::value_double(tmp[i]);
+  //   }
+  // NOTE(review): both branches build the same Sentence; presumably the
+  // syntax case was meant to use a different input type -- confirm.
+  if (is_syntax(m_options->search.algo)) {
+    m_source.reset(new Sentence(m_options,0,m_source_string));
+  } else {
+    m_source.reset(new Sentence(m_options,0,m_source_string));
+  }
+  interpret_dlt();
+} // end of TranslationRequest::parse_request()
+// Decode the request with the chart decoder and store the result in
+// m_retData: "text" always, "sg" (search graph dump) when m_withGraphInfo.
+void
+TranslationRequest::
+run_chart_decoder()
+{
+  Moses::ChartManager chartManager(this->self());
+  chartManager.Decode();
+  // Render the best hypothesis; the text stays empty when there is none.
+  const Moses::ChartHypothesis *best = chartManager.GetBestHypothesis();
+  ostringstream rendered;
+  if (best) {
+    outputChartHypo(rendered, best);
+  }
+  m_target_string = rendered.str();
+  m_retData["text"] = xmlrpc_c::value_string(m_target_string);
+  if (!m_withGraphInfo) {
+    return;
+  }
+  std::ostringstream graphDump;
+  chartManager.OutputSearchGraphMoses(graphDump);
+  m_retData["sg"] = xmlrpc_c::value_string(graphDump.str());
+} // end of TranslationRequest::run_chart_decoder()
+// Write one hypothesis path into dest: the surface string under key and,
+// when requested, phrase alignment ("align") and word alignment
+// ("word-align").  edges is walked back to front each time.
+void
+TranslationRequest::
+pack_hypothesis(const Moses::Manager& manager,
+		vector<Hypothesis const* > const& edges, string const& key,
+                map<string, xmlrpc_c::value> & dest) const
+{
+  typedef vector<Hypothesis const*>::const_reverse_iterator edge_riter;
+  // target string
+  ostringstream surface;
+  for (edge_riter e = edges.rbegin(); e != edges.rend(); ++e) {
+    manager.OutputSurface(surface, **e);
+  }
+  XVERBOSE(1, "BEST TRANSLATION: " << *(manager.GetBestHypothesis())
+	   << std::endl);
+  dest[key] = xmlrpc_c::value_string(surface.str());
+  // phrase alignment, if requested
+  if (m_withAlignInfo) {
+    vector<xmlrpc_c::value> phraseAln;
+    for (edge_riter e = edges.rbegin(); e != edges.rend(); ++e) {
+      add_phrase_aln_info(**e, phraseAln);
+    }
+    dest["align"] = xmlrpc_c::value_array(phraseAln);
+  }
+  // word alignment, if requested
+  if (m_withWordAlignInfo) {
+    vector<xmlrpc_c::value> wordAln;
+    for (edge_riter e = edges.rbegin(); e != edges.rend(); ++e) {
+      (*e)->OutputLocalWordAlignment(wordAln);
+    }
+    dest["word-align"] = xmlrpc_c::value_array(wordAln);
+  }
+}
+// Convenience overload: collect h and all of its predecessors (following
+// GetPrevHypo() until it yields null) and pack that chain.
+void
+TranslationRequest::
+pack_hypothesis(const Moses::Manager& manager, Hypothesis const* h, string const& key,
+                map<string, xmlrpc_c::value>& dest) const
+{
+  using namespace std;
+  vector<Hypothesis const*> chain;
+  while (h) {
+    chain.push_back(h);
+    h = h->GetPrevHypo();
+  }
+  pack_hypothesis(manager, chain, key, dest);
+}
+// Decode the request with the phrase-based decoder and fill m_retData with
+// the best translation plus whatever extras the request asked for.
+void
+TranslationRequest::
+run_phrase_decoder()
+{
+  Manager decoder(this->self());
+  decoder.Decode();
+  pack_hypothesis(decoder, decoder.GetBestHypothesis(), "text", m_retData);
+  // Session id 0 means "no session"; only real ids are echoed back.
+  if (m_session_id) {
+    m_retData["session-id"] = xmlrpc_c::value_int(m_session_id);
+  }
+  if (m_withGraphInfo) {
+    insertGraphInfo(decoder, m_retData);
+  }
+  if (m_withTopts) {
+    insertTranslationOptions(decoder, m_retData);
+  }
+  if (m_options->nbest.nbest_size) {
+    outputNBest(decoder, m_retData);
+  }
+}
+}

mosesdecoder/moses/server/Updater.cpp ADDED Viewed

	@@ -0,0 +1,58 @@

+// -*- mode: c++; indent-tabs-mode: nil; tab-width:2  -*-
+#include "Updater.h"
+namespace MosesServer
+{
+using namespace Moses;
+using namespace std;
+// Set the introspection strings of the method.  Clients can query them
+// with the system.methodSignature and system.methodHelp RPCs.
+Updater::
+Updater()
+{
+  _signature = "S:S";
+  _help = "Updates stuff";
+}
+/// XML-RPC entry point: add one aligned sentence pair to the first phrase
+/// dictionary, which is treated as an Mmsapt.
+///
+/// @param paramList parameter list; element 0 must be a struct with the
+///                  keys read by breakOutParams()
+/// @param retvalP   receives the confirmation string on success
+/// @throws xmlrpc_c::fault when a mandatory key is missing, when no phrase
+///         dictionary is loaded, or when the server was built without PT_UG
+void
+Updater::
+execute(xmlrpc_c::paramList const& paramList,
+        xmlrpc_c::value *   const  retvalP)
+{
+#if PT_UG
+  const params_t params = paramList.getStruct(0);
+  breakOutParams(params);
+  // Indexing an empty collection would be undefined behaviour.
+  if (PhraseDictionary::GetColl().empty())
+    throw xmlrpc_c::fault("No phrase table loaded; nothing to update",
+                          xmlrpc_c::fault::CODE_UNSPECIFIED);
+  // NOTE(review): assumes the first phrase dictionary is an Mmsapt -- the
+  // cast is unchecked.
+  Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(PhraseDictionary::GetColl()[0]);
+  pdsa->add(m_src, m_trg, m_aln);
+  XVERBOSE(1,"Done inserting\n");
+  *retvalP = xmlrpc_c::value_string("Phrase table updated");
+#else
+  // Without PT_UG there is nothing to update.  Fail explicitly instead of
+  // returning without ever setting *retvalP.
+  (void) paramList;
+  (void) retvalP;
+  throw xmlrpc_c::fault("Updater is unavailable: server was built without PT_UG support",
+                        xmlrpc_c::fault::CODE_UNSPECIFIED);
+#endif
+}
+// Copy the request fields into the members: "source", "target" and
+// "alignment" are mandatory (a CODE_PARSE fault is thrown when one is
+// missing); "bounded" and "updateORLM" are flags that count as set when the
+// key is present, whatever its value.
+void
+Updater::
+breakOutParams(const params_t& params)
+{
+  params_t::const_iterator hit;
+  if ((hit = params.find("source")) == params.end()) {
+    throw xmlrpc_c::fault("Missing source sentence",
+                          xmlrpc_c::fault::CODE_PARSE);
+  }
+  m_src = xmlrpc_c::value_string(hit->second);
+  XVERBOSE(1,"source = " << m_src << endl);
+  if ((hit = params.find("target")) == params.end()) {
+    throw xmlrpc_c::fault("Missing target sentence",
+                          xmlrpc_c::fault::CODE_PARSE);
+  }
+  m_trg = xmlrpc_c::value_string(hit->second);
+  XVERBOSE(1,"target = " << m_trg << endl);
+  if ((hit = params.find("alignment")) == params.end()) {
+    throw xmlrpc_c::fault("Missing alignment", xmlrpc_c::fault::CODE_PARSE);
+  }
+  m_aln = xmlrpc_c::value_string(hit->second);
+  XVERBOSE(1,"alignment = " << m_aln << endl);
+  m_bounded  = params.find("bounded") != params.end();
+  m_add2ORLM = params.find("updateORLM") != params.end();
+}
+}

mosesdecoder/moses/server/Updater.h ADDED Viewed

	@@ -0,0 +1,44 @@

+// -*- c++ -*-
+#pragma once
+#include "moses/Util.h"
+#include "moses/ChartManager.h"
+#include "moses/Hypothesis.h"
+#include "moses/Manager.h"
+#include "moses/StaticData.h"
+#include "moses/ThreadPool.h"
+#if PT_UG
+#include "moses/TranslationModel/UG/mmsapt.h"
+#endif
+#include <xmlrpc-c/base.hpp>
+#include <xmlrpc-c/registry.hpp>
+#include <xmlrpc-c/server_abyss.hpp>
+namespace MosesServer
+{
+// XML-RPC method object that feeds a new aligned sentence pair to the
+// server.  breakOutParams() fills the members from the request struct;
+// execute() is the entry point called for each request.
+class Updater : public xmlrpc_c::method
+{
+  typedef std::map<std::string, xmlrpc_c::value> params_t;
+  // Source sentence, target sentence and alignment string of the request.
+  std::string m_src, m_trg, m_aln;
+  // Flags of the request; they are assigned by breakOutParams().
+  bool m_bounded, m_add2ORLM;
+public:
+  Updater();
+  void execute(xmlrpc_c::paramList const& paramList,
+               xmlrpc_c::value * const  retvalP);
+  void breakOutParams(const params_t& params);
+};
+}

mosesdecoder/util/bit_packing_test.cc ADDED Viewed

	@@ -0,0 +1,59 @@

+#include "util/bit_packing.hh"
+#define BOOST_TEST_MODULE BitPackingTest
+#include <boost/test/unit_test.hpp>
+#include <cstring>
+namespace util {
+namespace {
+const uint64_t test57 = 0x123456789abcdefULL; // Pattern that needs exactly 57 bits (bit 56 is its highest set bit).
+const uint32_t test25 = 0x1234567; // Pattern that needs exactly 25 bits (bit 24 is its highest set bit).
+// A 57-bit value written at bit offset 0 must read back unchanged.
+BOOST_AUTO_TEST_CASE(ZeroBit57) {
+  char buffer[16];
+  memset(buffer, 0, sizeof(buffer));
+  const uint64_t mask57 = (1ULL << 57) - 1;
+  WriteInt57(buffer, 0, 57, test57);
+  const uint64_t read_back = ReadInt57(buffer, 0, 57, mask57);
+  BOOST_CHECK_EQUAL(test57, read_back);
+}
+// Round-trip the 57-bit pattern at every bit offset 0..7 within the first
+// byte, starting from a zeroed buffer each time.
+BOOST_AUTO_TEST_CASE(EachBit57) {
+  char buffer[16];
+  const uint64_t mask57 = (1ULL << 57) - 1;
+  uint8_t offset = 0;
+  while (offset < 8) {
+    memset(buffer, 0, sizeof(buffer));
+    WriteInt57(buffer, offset, 57, test57);
+    BOOST_CHECK_EQUAL(test57, ReadInt57(buffer, offset, 57, mask57));
+    ++offset;
+  }
+}
+// Pack eight 57-bit values back to back.  Each must read back right after
+// it is written and again once all of them are in place, i.e. later writes
+// must not disturb earlier values.
+BOOST_AUTO_TEST_CASE(Consecutive57) {
+  char buffer[57+8];
+  memset(buffer, 0, sizeof(buffer));
+  const uint64_t mask57 = (1ULL << 57) - 1;
+  const uint64_t end_bit = 57 * 8;
+  for (uint64_t bit = 0; bit < end_bit; bit += 57) {
+    WriteInt57(buffer, bit, 57, test57);
+    BOOST_CHECK_EQUAL(test57, ReadInt57(buffer, bit, 57, mask57));
+  }
+  for (uint64_t bit = 0; bit < end_bit; bit += 57) {
+    BOOST_CHECK_EQUAL(test57, ReadInt57(buffer, bit, 57, mask57));
+  }
+}
+// Same as Consecutive57, for eight back-to-back 25-bit values.
+BOOST_AUTO_TEST_CASE(Consecutive25) {
+  char buffer[25+8];
+  memset(buffer, 0, sizeof(buffer));
+  const uint64_t end_bit = 25 * 8;
+  for (uint64_t bit = 0; bit < end_bit; bit += 25) {
+    WriteInt25(buffer, bit, 25, test25);
+    BOOST_CHECK_EQUAL(test25, ReadInt25(buffer, bit, 25, (1ULL << 25) - 1));
+  }
+  for (uint64_t bit = 0; bit < end_bit; bit += 25) {
+    BOOST_CHECK_EQUAL(test25, ReadInt25(buffer, bit, 25, (1ULL << 25) - 1));
+  }
+}
+// Run the library's own self-check.
+BOOST_AUTO_TEST_CASE(Sanity)
+{
+  BitPackingSanity();
+}
+} // namespace
+} // namespace util

mosesdecoder/util/ersatz_progress.hh ADDED Viewed

	@@ -0,0 +1,57 @@

+#ifndef UTIL_ERSATZ_PROGRESS_H
+#define UTIL_ERSATZ_PROGRESS_H
+#include <iostream>
+#include <string>
+#include <stdint.h>
+// Ersatz version of boost::progress so core language model doesn't depend on
+// boost.  Also adds option to print nothing.
+namespace util {
+extern const char kProgressBanner[];
+// Progress meter.  The counter current_ is advanced by the caller; whenever
+// it reaches next_, Milestone() (defined out of line) is invoked.
+class ErsatzProgress {
+  public:
+    // Meter that produces no output.
+    ErsatzProgress();
+    // A null `to` also means no output, which lets a caller simply forward
+    // whatever ostream pointer it was handed.
+    explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");
+    ~ErsatzProgress();
+    // Advance by one step.
+    ErsatzProgress &operator++() {
+      ++current_;
+      if (current_ >= next_) Milestone();
+      return *this;
+    }
+    // Advance by amount steps.
+    ErsatzProgress &operator+=(uint64_t amount) {
+      current_ += amount;
+      if (current_ >= next_) Milestone();
+      return *this;
+    }
+    // Jump to an absolute position.
+    void Set(uint64_t to) {
+      current_ = to;
+      if (current_ >= next_) Milestone();
+    }
+    // Jump to the position given as `complete` at construction.
+    void Finished() {
+      Set(complete_);
+    }
+  private:
+    void Milestone();
+    uint64_t current_, next_, complete_;
+    unsigned char stones_written_;
+    std::ostream *out_;
+    // Copying is disabled: declared here, never defined.
+    ErsatzProgress(const ErsatzProgress &other);
+    ErsatzProgress &operator=(const ErsatzProgress &other);
+};
+} // namespace util
+#endif // UTIL_ERSATZ_PROGRESS_H

mosesdecoder/util/exception.hh ADDED Viewed

	@@ -0,0 +1,165 @@

+#ifndef UTIL_EXCEPTION_H
+#define UTIL_EXCEPTION_H
+#include "util/string_stream.hh"
+#include <exception>
+#include <limits>
+#include <string>
+#include <stdint.h>
+// TODO(hieu): delete this
+#include <sstream>
+namespace util {
+template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data); // Forward declaration so that Exception can name this operator as a friend.
+class Exception : public std::exception { /* Base class of the util exceptions.  The text returned by what() is
+   * accumulated in what_; it is appended to through the operator<< template
+   * declared above and, presumably, by SetLocation (defined elsewhere). */
+  public:
+    Exception() throw();
+    virtual ~Exception() throw();
+    const char *what() const throw() { return what_.str().c_str(); } // NOTE(review): the pointer stays valid only if StringStream::str() returns a reference to storage owned by what_ -- confirm in util/string_stream.hh.
+    // For use by the UTIL_THROW macros.
+    void SetLocation(
+        const char *file,
+        unsigned int line,
+        const char *func,
+        const char *child_name,
+        const char *condition);
+  private:
+    template <class Except, class Data> friend typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
+    // This helps restrict operator<< defined below.
+    template <class T> struct ExceptionTag { // The operator's return type names Except::ExceptionTag, so the operator only exists for types that have this member.
+      typedef T Identity;
+    };
+    StringStream what_; // Accumulated message text.
+};
+/* Streaming operator shared by Exception and all of its children: appends
+ * the textual form of data to the exception's message and returns the
+ * exception so that calls can be chained.
+ * The return type only exists for types that have an ExceptionTag member,
+ * so through SFINAE the template applies to Exception only.  Think of this
+ * as an ersatz boost::enable_if.
+ */
+template <class Except, class Data>
+typename Except::template ExceptionTag<Except&>::Identity
+operator<<(Except &e, const Data &data) {
+  // TODO(hieu): delete this.
+  // Format through a standard stream first, then append the resulting text.
+  std::stringstream formatted;
+  formatted << data;
+  e.what_ << formatted.str();
+  return e;
+}
+// Name of the enclosing function as recorded by the throw macros, or NULL
+// when neither branch below applies.
+#if defined(__GNUC__)
+#  define UTIL_FUNC_NAME __PRETTY_FUNCTION__
+#elif defined(_WIN32)
+#  define UTIL_FUNC_NAME __FUNCTION__
+#else
+#  define UTIL_FUNC_NAME NULL
+#endif
+/* Create an instance of Exception, add the message Modify, and throw it.
+ * Modify is appended to the what() message and can contain << for ostream
+ * operations.
+ *
+ * do .. while kludge to swallow trailing ; character
+ * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html .
+ * Arg can be a constructor argument to the exception.
+ *
+ * Condition is the text of the failed check, or NULL for an unconditional
+ * throw; it is handed to SetLocation together with the file, line,
+ * function name and the spelled-out exception type.
+ *
+ * UTIL_THROW_ARG, UTIL_THROW and UTIL_THROW2 are the unconditional front
+ * ends; UTIL_THROW2 always throws util::Exception.
+ *
+ * NOTE(review): UTIL_THROW and UTIL_THROW2 end in a ';' of their own, so
+ * "UTIL_THROW(...);" expands to two statements and cannot be used as the
+ * unbraced body of an if that is followed by else.
+ */
+#define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \
+  Exception UTIL_e Arg; \
+  UTIL_e.SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, #Exception, Condition); \
+  UTIL_e << Modify; \
+  throw UTIL_e; \
+} while (0)
+#define UTIL_THROW_ARG(Exception, Arg, Modify) \
+  UTIL_THROW_BACKEND(NULL, Exception, Arg, Modify)
+#define UTIL_THROW(Exception, Modify) \
+  UTIL_THROW_BACKEND(NULL, Exception, , Modify);
+#define UTIL_THROW2(Modify) \
+  UTIL_THROW_BACKEND(NULL, util::Exception, , Modify);
+// Branch prediction hints: __builtin_expect where __GNUC__ >= 3, otherwise
+// the bare expression.
+#if __GNUC__ >= 3
+#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0)
+#define UTIL_LIKELY(x) __builtin_expect (!!(x), 1)
+#else
+#define UTIL_UNLIKELY(x) (x)
+#define UTIL_LIKELY(x) (x)
+#endif
+#define UTIL_THROW_IF_ARG(Condition, Exception, Arg, Modify) do { /* Throw Exception, constructed with Arg, when Condition is true. */ \
+  if (UTIL_UNLIKELY(Condition)) { /* the throwing path is hinted as unlikely */ \
+    UTIL_THROW_BACKEND(#Condition, Exception, Arg, Modify); /* #Condition: the source text of the check is what SetLocation receives */ \
+  } \
+} while (0)
+#define UTIL_THROW_IF(Condition, Exception, Modify) \
+  UTIL_THROW_IF_ARG(Condition, Exception, , Modify) // Same, for exceptions constructed without arguments.
+#define UTIL_THROW_IF2(Condition, Modify) \
+  UTIL_THROW_IF_ARG(Condition, util::Exception, , Modify) // Same, always throwing util::Exception.
+// Exception that records errno and adds it to the message.
+// Error() returns the recorded value.
+class ErrnoException : public Exception
+{
+  public:
+    ErrnoException() throw();
+    virtual ~ErrnoException() throw();
+    int Error() const throw() {
+      return errno_;
+    }
+  private:
+    int errno_;
+};
+// Thrown when a file is missing or cannot be opened for some other reason.
+class FileOpenException : public Exception
+{
+  public:
+    FileOpenException() throw() {}
+    ~FileOpenException() throw() {}
+};
+// Utilities for overflow checking.
+// OverflowException is what CheckOverflowInternal throws when a value does
+// not fit.
+class OverflowException : public Exception
+{
+  public:
+    OverflowException() throw();
+    ~OverflowException() throw();
+};
+// Convert a 64-bit value to std::size_t.  The generic version throws
+// OverflowException when the value exceeds the std::size_t range.
+template <unsigned len>
+inline std::size_t CheckOverflowInternal(uint64_t value)
+{
+  UTIL_THROW_IF(value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), OverflowException, "Integer overflow detected.  This model is too big for 32-bit code.");
+  return value;
+}
+// Specialisation for len == 8: no check is performed.
+template <>
+inline std::size_t CheckOverflowInternal<8>(uint64_t value)
+{
+  return value;
+}
+// Entry point: selects the version matching sizeof(std::size_t).
+inline std::size_t CheckOverflow(uint64_t value)
+{
+  return CheckOverflowInternal<sizeof(std::size_t)>(value);
+}
+#if defined(_WIN32) || defined(_WIN64)
+// Thrown for Windows specific operations; only declared on Windows builds.
+class WindowsException : public Exception
+{
+  public:
+    WindowsException() throw();
+    ~WindowsException() throw();
+};
+#endif
+} // namespace util
+#endif // UTIL_EXCEPTION_H

mosesdecoder/util/fake_ostream.hh ADDED Viewed

	@@ -0,0 +1,111 @@

+#ifndef UTIL_FAKE_OSTREAM_H
+#define UTIL_FAKE_OSTREAM_H
+#include "util/float_to_string.hh"
+#include "util/integer_to_string.hh"
+#include "util/string_piece.hh"
+#include <cassert>
+#include <limits>
+#include <stdint.h>
+namespace util {
+/* Like std::ostream but without being incredibly slow.
+ * Supports most of the built-in types except for long double.
+ *
+ * The FakeOStream class is intended to be inherited from.  The inheriting class
+ * should provide:
+ * public:
+ *   Derived &flush();
+ *   Derived &write(const void *data, std::size_t length);
+ *
+ * private: or protected:
+ *   friend class FakeOStream;
+ *   char *Ensure(std::size_t amount);
+ *   void AdvanceTo(char *to);
+ *
+ * The Ensure function makes enough space for an in-place write and returns
+ * where to write.  The AdvanceTo function happens after the write, saying how
+ * much was actually written.
+ *
+ * Precondition:
+ * amount <= kToStringMaxBytes for in-place writes.
+ */
+template <class Derived> class FakeOStream { /* CRTP base: every operator<< formats its argument and forwards to
+   * Derived's write() or Ensure()/AdvanceTo(), returning Derived& so that
+   * calls can be chained. */
+  public:
+    FakeOStream() {}
+    // This also covers std::string and char*
+    Derived &operator<<(StringPiece str) {
+      return C().write(str.data(), str.size());
+    }
+    // Handle integers by size and signedness.
+  private:
+    template <class Arg> struct EnableIfKludge { // Resolves to Derived for any Arg; naming Coerce<From>::To as Arg removes the operator below when Coerce has no To.
+      typedef Derived type;
+    };
+    template <class From, unsigned Length = sizeof(From), bool Signed = std::numeric_limits<From>::is_signed, bool IsInteger = std::numeric_limits<From>::is_integer> struct Coerce {}; // Primary template has no To: only the 2/4/8-byte integer specialisations below define it.
+    template <class From> struct Coerce<From, 2, false, true> { typedef uint16_t To; };
+    template <class From> struct Coerce<From, 4, false, true> { typedef uint32_t To; };
+    template <class From> struct Coerce<From, 8, false, true> { typedef uint64_t To; };
+    template <class From> struct Coerce<From, 2, true, true> { typedef int16_t To; };
+    template <class From> struct Coerce<From, 4, true, true> { typedef int32_t To; };
+    template <class From> struct Coerce<From, 8, true, true> { typedef int64_t To; };
+  public:
+    template <class From> typename EnableIfKludge<typename Coerce<From>::To>::type &operator<<(const From value) { // Integers: cast to the fixed-width type of the same size and signedness, then format.
+      return CallToString(static_cast<typename Coerce<From>::To>(value));
+    }
+    // Character types that get copied as bytes instead of displayed as integers.
+    Derived &operator<<(char val) { return put(val); }
+    Derived &operator<<(signed char val) { return put(static_cast<char>(val)); }
+    Derived &operator<<(unsigned char val) { return put(static_cast<char>(val)); }
+    Derived &operator<<(bool val) { return put(val + '0'); } // Writes the single character '0' or '1'.
+    // enums will fall back to int but are not caught by the template.
+    Derived &operator<<(int val) { return CallToString(static_cast<typename Coerce<int>::To>(val)); }
+    Derived &operator<<(float val) { return CallToString(val); }
+    Derived &operator<<(double val) { return CallToString(val); }
+    // This is here to catch all the other pointer types.
+    Derived &operator<<(const void *value) { return CallToString(value); }
+    // This is here because the above line also catches const char*.
+    Derived &operator<<(const char *value) { return *this << StringPiece(value); }
+    Derived &operator<<(char *value) { return *this << StringPiece(value); }
+    Derived &put(char val) { // Append one byte through Derived's Ensure()/AdvanceTo().
+      char *c = C().Ensure(1);
+      *c = val;
+      C().AdvanceTo(++c);
+      return C();
+    }
+    char widen(char val) const { return val; } // Returns val unchanged.
+  private:
+    // References to derived class for convenience.
+    Derived &C() {
+      return *static_cast<Derived*>(this);
+    }
+    const Derived &C() const {
+      return *static_cast<const Derived*>(this);
+    }
+    // This is separate to prevent an infinite loop if the compiler considers
+    // types the same (i.e. gcc std::size_t and uint64_t or uint32_t).
+    template <class T> Derived &CallToString(const T value) { // Reserve ToStringBuf<T>::kBytes, let ToString format in place, then advance to the pointer it returns.
+      C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf<T>::kBytes)));
+      return C();
+    }
+};
+} // namespace
+#endif // UTIL_FAKE_OSTREAM_H

mosesdecoder/util/file_piece.hh ADDED Viewed

	@@ -0,0 +1,175 @@

+#ifndef UTIL_FILE_PIECE_H
+#define UTIL_FILE_PIECE_H
+#include "util/ersatz_progress.hh"
+#include "util/exception.hh"
+#include "util/file.hh"
+#include "util/mmap.hh"
+#include "util/read_compressed.hh"
+#include "util/string_piece.hh"
+#include <cstddef>
+#include <iosfwd>
+#include <string>
+#include <cassert>
+#include <stdint.h>
+namespace util {
+// Exception constructed from the text of a number that failed to parse.
+class ParseNumberException : public Exception
+{
+  public:
+    explicit ParseNumberException(StringPiece value) throw();
+    ~ParseNumberException() throw() {}
+};
+extern const bool kSpaces[256];
+// Memory backing the returned StringPiece may vanish on the next call.
+class FilePiece { // Tokenising reader over a file, file descriptor or istream; the current window is [position_, position_end_) and Shift() refills it.
+  public:
+    // 1 MB default.
+    explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
+    // Takes ownership of fd.  name is used for messages.
+    explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576);
+    /* Read from an istream.  Don't use this if you can avoid it.  Raw fd IO is
+     * much faster.  But sometimes you just have an istream like Boost's HTTP
+     * server and want to parse it the same way.
+     * name is just used for messages and FileName().
+     */
+    explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576);
+    ~FilePiece();
+    char get() { // Return the next byte; throws EndOfFileException when at_end_ is set after refilling.
+      if (position_ == position_end_) {
+        Shift();
+        if (at_end_) throw EndOfFileException();
+      }
+      return *(position_++);
+    }
+    // Leaves the delimiter, if any, to be returned by get().  Delimiters are the characters flagged in delim (kSpaces by default).
+    StringPiece ReadDelimited(const bool *delim = kSpaces) {
+      SkipSpaces(delim);
+      return Consume(FindDelimiterOrEOF(delim));
+    }
+    /// Read word until the line or file ends.  Returns false, leaving `to` untouched, when the line or file ends before a word starts.
+    bool ReadWordSameLine(StringPiece &to, const bool *delim = kSpaces) {
+      assert(delim[static_cast<unsigned char>('\n')]);
+      // Skip non-enter spaces.
+      for (; ; ++position_) {
+        if (position_ == position_end_) {
+          try {
+            Shift();
+          } catch (const util::EndOfFileException &e) { return false; }
+          // And break out at end of file.
+          if (position_ == position_end_) return false;
+        }
+        if (!delim[static_cast<unsigned char>(*position_)]) break;
+        if (*position_ == '\n') return false; // The newline itself is not consumed.
+      }
+      // We can't be at the end of file because there's at least one character open.
+      to = Consume(FindDelimiterOrEOF(delim));
+      return true;
+    }
+    /** Read a line of text from the file.
+     *
+     * Unlike ReadDelimited, this includes leading spaces and consumes the
+     * delimiter.   It is similar to getline in that way.
+     *
+     * If strip_cr is true, any trailing carriage return (as would be found on
+     * a file written on Windows) will be left out of the returned line.
+     *
+     * Throws EndOfFileException if the end of the file is encountered.  If the
+     * file does not end in a newline, this could mean that the last line is
+     * never read.
+     */
+    StringPiece ReadLine(char delim = '\n', bool strip_cr = true);
+    /** Read a line of text from the file, or return false on EOF.
+     *
+     * This is like ReadLine, except it returns false where ReadLine throws
+     * EndOfFileException.  Like ReadLine it may not read the last line in the
+     * file if the file does not end in a newline.
+     *
+     * If strip_cr is true, any trailing carriage return (as would be found on
+     * a file written on Windows) will be left out of the returned line.
+     */
+    bool ReadLineOrEOF(StringPiece &to, char delim = '\n', bool strip_cr = true);
+    float ReadFloat();
+    double ReadDouble();
+    long int ReadLong();
+    unsigned long int ReadULong();
+    // Skip the characters flagged in delim (kSpaces by default).
+    void SkipSpaces(const bool *delim = kSpaces) {
+      assert(position_ <= position_end_);
+      for (; ; ++position_) {
+        if (position_ == position_end_) {
+          Shift();
+          // And break out at end of file.
+          if (position_ == position_end_) return;
+        }
+        assert(position_ < position_end_);
+        if (!delim[static_cast<unsigned char>(*position_)]) return;
+      }
+    }
+    uint64_t Offset() const { // Position in the file: offset of the mapped window plus the position inside it.
+      return position_ - data_.begin() + mapped_offset_;
+    }
+    const std::string &FileName() const { return file_name_; }
+  private:
+    void InitializeNoRead(const char *name, std::size_t min_buffer);
+    // Calls InitializeNoRead, so don't call both.
+    void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer);
+    template <class T> T ReadNumber();
+    StringPiece Consume(const char *to) { // Return [position_, to) and advance position_ to `to`.
+      assert(to >= position_);
+      StringPiece ret(position_, to - position_);
+      position_ = to;
+      return ret;
+    }
+    const char *FindDelimiterOrEOF(const bool *delim = kSpaces);
+    void Shift();
+    // Backends to Shift().
+    void MMapShift(uint64_t desired_begin);
+    void TransitionToRead();
+    void ReadShift();
+    const char *position_, *last_space_, *position_end_;
+    scoped_fd file_;
+    const uint64_t total_size_;
+    const uint64_t page_;
+    std::size_t default_map_size_;
+    uint64_t mapped_offset_;
+    // Order matters: file_ should always be destroyed after this.
+    scoped_memory data_;
+    bool at_end_;
+    bool fallback_to_read_;
+    ErsatzProgress progress_;
+    std::string file_name_;
+    ReadCompressed fell_back_;
+};
+} // namespace util
+#endif // UTIL_FILE_PIECE_H

mosesdecoder/util/file_piece_test.cc ADDED Viewed

	@@ -0,0 +1,154 @@

+// Tests might fail if you have creative characters in your path.  Sue me.
+#include "util/file_piece.hh"
+#include "util/file_stream.hh"
+#include "util/file.hh"
+#include "util/scoped.hh"
+#define BOOST_TEST_MODULE FilePieceTest
+#include <boost/test/unit_test.hpp>
+#include <fstream>
+#include <iostream>
+#include <cstdio>
+#include <sys/types.h>
+#include <sys/stat.h>
+namespace util {
+namespace {
+// Path of the reference text file: the first command line argument of the
+// test binary when one is given, otherwise "file_piece.cc".
+std::string FileLocation() {
+  if (boost::unit_test::framework::master_test_suite().argc >= 2) {
+    return std::string(boost::unit_test::framework::master_test_suite().argv[1]);
+  }
+  return "file_piece.cc";
+}
+/* istream: FilePiece reading from a std::istream must produce the same
+ * lines as getline and then keep reporting end of file. */
+BOOST_AUTO_TEST_CASE(IStream) {
+  std::fstream ref(FileLocation().c_str(), std::ios::in);
+  std::fstream backing(FileLocation().c_str(), std::ios::in);
+  FilePiece test(backing);
+  for (std::string expected; getline(ref, expected); ) {
+    StringPiece actual(test.ReadLine());
+    BOOST_CHECK_EQUAL(expected, actual);
+  }
+  // Asked twice on purpose: the end-of-file state must persist.
+  BOOST_CHECK_THROW(test.get(), EndOfFileException);
+  BOOST_CHECK_THROW(test.get(), EndOfFileException);
+}
+/* mmap implementation: open by file name with a minimum buffer of 1 and
+ * compare every line against getline. */
+BOOST_AUTO_TEST_CASE(MMapReadLine) {
+  std::fstream ref(FileLocation().c_str(), std::ios::in);
+  FilePiece test(FileLocation().c_str(), NULL, 1);
+  for (std::string expected; getline(ref, expected); ) {
+    StringPiece actual(test.ReadLine());
+    // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
+    // Two empty lines are therefore not compared.
+    if (!(actual.empty() && expected.empty())) {
+      BOOST_CHECK_EQUAL(expected, actual);
+    }
+  }
+  BOOST_CHECK_THROW(test.get(), EndOfFileException);
+}
+#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__)
+/* Apple isn't happy with the popen, fileno, dup.  And I don't want to
+ * reimplement popen.  This is an issue with the test.
+ */
+/* read() implementation: the data arrives through the pipe of a `cat`
+ * child process instead of a regular file. */
+BOOST_AUTO_TEST_CASE(StreamReadLine) {
+  std::fstream ref(FileLocation().c_str(), std::ios::in);
+  const std::string cat_command = "cat \"" + FileLocation() + '"';
+  FILE *catter = popen(cat_command.c_str(), "r");
+  BOOST_REQUIRE(catter);
+  // FilePiece takes ownership of the duplicated descriptor.
+  FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1);
+  for (std::string expected; getline(ref, expected); ) {
+    StringPiece actual(test.ReadLine());
+    // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
+    if (!(actual.empty() && expected.empty())) {
+      BOOST_CHECK_EQUAL(expected, actual);
+    }
+  }
+  BOOST_CHECK_THROW(test.get(), EndOfFileException);
+  BOOST_REQUIRE(!pclose(catter));
+}
+#endif
+#ifdef HAVE_ZLIB
+// gzip file: compress the reference file next to itself, open the .gz by
+// name and compare line by line with the uncompressed original.
+BOOST_AUTO_TEST_CASE(PlainZipReadLine) {
+  const std::string location(FileLocation());
+  const std::string zipped(location + ".gz");
+  std::fstream ref(location.c_str(), std::ios::in);
+  std::string command("gzip <\"");
+  command += location;
+  command += "\" >\"";
+  command += location;
+  command += "\".gz";
+  BOOST_REQUIRE_EQUAL(0, system(command.c_str()));
+  FilePiece test(zipped.c_str(), NULL, 1);
+  // The file is already open, so its name can be removed right away.
+  unlink(zipped.c_str());
+  for (std::string expected; getline(ref, expected); ) {
+    StringPiece actual(test.ReadLine());
+    // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
+    if (!(actual.empty() && expected.empty())) {
+      BOOST_CHECK_EQUAL(expected, actual);
+    }
+  }
+  BOOST_CHECK_THROW(test.get(), EndOfFileException);
+}
+// gzip stream.  Apple doesn't like popen, fileno, dup.  This is an issue with
+// the test.
+// The compressed data arrives through the pipe of a `gzip` child process.
+#if !defined __APPLE__ && !defined __MINGW32__
+BOOST_AUTO_TEST_CASE(StreamZipReadLine) {
+  std::fstream ref(FileLocation().c_str(), std::ios::in);
+  const std::string command = "gzip <\"" + FileLocation() + "\"";
+  FILE * catter = popen(command.c_str(), "r");
+  BOOST_REQUIRE(catter);
+  FilePiece test(dup(fileno(catter)), "file_piece.cc.gz", NULL, 1);
+  for (std::string expected; getline(ref, expected); ) {
+    StringPiece actual(test.ReadLine());
+    // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924
+    if (!(actual.empty() && expected.empty())) {
+      BOOST_CHECK_EQUAL(expected, actual);
+    }
+  }
+  BOOST_CHECK_THROW(test.get(), EndOfFileException);
+  BOOST_REQUIRE(!pclose(catter));
+}
+#endif // !__APPLE__ && !__MINGW32__
+#endif // HAVE_ZLIB
+// Number parsing: an integer too large for unsigned long must raise
+// ParseNumberException and remain readable as a plain token afterwards;
+// the float and the small integer that follow must parse exactly.
+BOOST_AUTO_TEST_CASE(Numbers) {
+  scoped_fd file(MakeTemp(FileLocation()));
+  const float floating = 3.2;
+  {
+    // The stream is scoped so that it is destroyed before the file is read back.
+    util::FileStream writing(file.get());
+    writing << "94389483984398493890287 ";
+    writing << floating;
+    writing << " 5";
+  }
+  SeekOrThrow(file.get(), 0);
+  // FilePiece takes over the descriptor.
+  util::FilePiece f(file.release());
+  BOOST_CHECK_THROW(f.ReadULong(), ParseNumberException);
+  BOOST_CHECK_EQUAL("94389483984398493890287", f.ReadDelimited());
+  // Yes, exactly equal.  Isn't double-conversion wonderful?
+  BOOST_CHECK_EQUAL(floating, f.ReadFloat());
+  BOOST_CHECK_EQUAL(5, f.ReadULong());
+}
+} // namespace
+} // namespace util

mosesdecoder/util/generator.hh ADDED Viewed

	@@ -0,0 +1,34 @@

+#pragma once
+// generator/continuation for C++
+// author: Andrew Fedoniouk @ terrainformatica.com
+// idea borrowed from: "coroutines in C" Simon Tatham,
+//                     http://www.chiark.greenend.org.uk/~sgtatham/coroutines.html
+// BSD license
+// CRTP base for generators written with the $generator/$emit/$yield macros.
+//
+// T is the derived generator type.  The base stores the resume point (_line)
+// and a singly linked list of heap-allocated snapshots of the whole derived
+// object (_stack), which $restart uses to save the current state.
+//
+// NOTE: copies are shallow.  _push/_pop rely on T's copy assignment copying
+// the _stack pointer verbatim, so copying a generator that has saved frames
+// makes both copies refer to (and eventually delete) the same frames.
+template<typename T>
+  struct _generator
+  {
+    T* _stack;  // Most recently saved frame, or 0 when nothing is saved.
+    int _line;  // Resume point used by the switch in $emit; -1 before first use.
+    _generator():_stack(0), _line(-1) {}
+    // Save a snapshot of the current object on top of the frame list.  The
+    // snapshot's own _stack keeps pointing at the previous top.
+    void _push() { T* n = new T; *n = *static_cast<T*>(this); _stack = n; }
+    // Restore the most recently saved snapshot into *this and free it.
+    // Returns false when there is nothing to restore.
+    bool _pop() {
+      if(!_stack) return false;
+      T* t = _stack;
+      *static_cast<T*>(this) = *_stack;  // also restores _stack to the older frame
+      t->_stack = 0;                     // detach so deleting t frees only this frame
+      delete t;
+      return true;
+    }
+    // Free all saved frames.  This deliberately does not go through _pop():
+    // when this destructor runs the derived T part has already been
+    // destroyed, so assigning a frame back into *this would write to dead
+    // members.
+    ~_generator() {
+      while(_stack) {
+        T* t = _stack;
+        _stack = t->_stack;
+        t->_stack = 0;  // keep the frame's own destructor from walking the list
+        delete t;
+      }
+    }
+  };
+  // NOTE(review): '$' in identifiers is an implementation extension, not
+  // standard C++ -- confirm the target compilers accept it.
+  //
+  // $generator(NAME): opens the declaration of struct NAME deriving from
+  // _generator<NAME>; the caller supplies the braces and the body.
+  #define $generator(NAME) struct NAME : public _generator<NAME>
+  // $emit(T): opens operator()(T& _rv) and the switch on _line that jumps to
+  // the saved resume point.  A negative _line (fresh object) starts at case 0.
+  #define $emit(T) bool operator()(T& _rv) { \
+                      if(_line < 0) _line=0; \
+                      $START: switch(_line) { case 0:;
+  // $stop: closes the switch and the operator.  Resets _line to 0; if a saved
+  // frame can be popped, re-enters the switch with it, otherwise returns false.
+  #define $stop  } _line = 0; if(_pop()) goto $START; return false; }
+  // $restart(WITH): saves the current state as a frame that will resume at the
+  // case label on this line, resets _line to 0, executes WITH, and re-enters
+  // the switch from the top.
+  #define $restart(WITH) { _push(); _stack->_line = __LINE__; _line=0; WITH; goto $START; case __LINE__:; }
+  // $yield(V): records this line as the resume point, stores V in _rv and
+  // returns true; the next call re-enters at the case label after the return.
+  #define $yield(V)     \
+          do {\
+              _line=__LINE__;\
+              _rv = (V); return true; case __LINE__:;\
+          } while (0)

mosesdecoder/util/getopt.c ADDED Viewed

	@@ -0,0 +1,78 @@

+/*
+POSIX getopt for Windows
+AT&T Public License
+Code given out at the 1985 UNIFORUM conference in Dallas.
+*/
+#ifndef __GNUC__
+#include "getopt.hh"
+#include <stdio.h>
+#include <string.h>
+#define NULL	0
+#define EOF	(-1)
+#define ERR(s, c)	if(opterr){\
+	char errbuf[2];\
+	errbuf[0] = c; errbuf[1] = '\n';\
+	fputs(argv[0], stderr);\
+	fputs(s, stderr);\
+	fputc(c, stderr);}
+	//(void) write(2, argv[0], (unsigned)strlen(argv[0]));\
+	//(void) write(2, s, (unsigned)strlen(s));\
+	//(void) write(2, errbuf, 2);}
+int	opterr = 1;	/* non-zero: ERR() prints diagnostics on stderr */
+int	optind = 1;	/* index in argv of the element getopt() examines next */
+int	optopt;	/* option character most recently examined by getopt() */
+char	*optarg;	/* argument of the option just returned; NULL if it takes none */
+/*
+ * Return the next option character found in argv.
+ *
+ * argc, argv: the argument vector to scan.
+ * opts:       the accepted option characters; a character followed by ':'
+ *             takes an argument.
+ *
+ * Returns the option character; '?' for an unknown option or a missing
+ * argument (after reporting through ERR); EOF when argv[optind] is not an
+ * option or is the "--" terminator, which is skipped.  Updates optind,
+ * optopt and optarg as it goes.  sp tracks the position inside a group of
+ * clustered options and persists between calls.
+ */
+int
+getopt(int argc, char **argv, char *opts)
+{
+	static int sp = 1;
+	int c;
+	char *cp;
+
+	if(sp == 1) {
+		/* At the start of a new argv element: decide whether it is an option. */
+		if(optind >= argc ||
+		   argv[optind][0] != '-' || argv[optind][1] == '\0')
+			return(EOF);
+		if(strcmp(argv[optind], "--") == 0) {
+			optind++;
+			return(EOF);
+		}
+	}
+	optopt = c = argv[optind][sp];
+	if(c == ':' || (cp=strchr(opts, c)) == NULL) {
+		ERR(": illegal option -- ", c);
+		/* Skip the bad character; move to the next element when the group ends. */
+		if(argv[optind][++sp] == '\0') {
+			optind++;
+			sp = 1;
+		}
+		return('?');
+	}
+	if(*++cp == ':') {
+		/* Option takes an argument: rest of this element, or the next element. */
+		if(argv[optind][sp+1] != '\0')
+			optarg = &argv[optind++][sp+1];
+		else if(++optind >= argc) {
+			ERR(": option requires an argument -- ", c);
+			sp = 1;
+			return('?');
+		} else
+			optarg = argv[optind++];
+		sp = 1;
+	} else {
+		if(argv[optind][++sp] == '\0') {
+			sp = 1;
+			optind++;
+		}
+		optarg = NULL;
+	}
+	return(c);
+}
+#endif  /* __GNUC__ */

mosesdecoder/util/integer_to_string_test.cc ADDED Viewed

	@@ -0,0 +1,81 @@

+#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE
+#include "util/integer_to_string.hh"
+#include "util/string_piece.hh"
+#define BOOST_TEST_MODULE IntegerToStringTest
+#include <boost/test/unit_test.hpp>
+#include <boost/lexical_cast.hpp>
+#include <limits>
+namespace util {
+namespace {
+// Render value with ToString and check both the length bound advertised by
+// ToStringBuf<T>::kBytes and the text, using boost::lexical_cast as reference.
+template <class T> void TestValue(const T value) {
+  char storage[ToStringBuf<T>::kBytes];
+  const StringPiece rendered(storage, ToString(value, storage) - storage);
+  BOOST_REQUIRE_GE(static_cast<std::size_t>(ToStringBuf<T>::kBytes), rendered.size());
+  if (!value) {
+    // Platforms can do void * as 0x0 or 0.
+    BOOST_CHECK(rendered == "0x0" || rendered == "0");
+    return;
+  }
+  BOOST_CHECK_EQUAL(boost::lexical_cast<std::string>(value), rendered);
+}
+// Exercise TestValue on the extremes reported by numeric_limits<T> and on
+// 0, -1 and 1 converted to T.
+template <class T> void TestCorners() {
+  typedef std::numeric_limits<T> Limits;
+  TestValue(Limits::min());
+  TestValue(Limits::max());
+  // C-style casts on purpose: T may be a pointer type.
+  const T around_zero[] = {(T)0, (T)-1, (T)1};
+  for (std::size_t i = 0; i < sizeof(around_zero) / sizeof(around_zero[0]); ++i) {
+    TestValue(around_zero[i]);
+  }
+}
+BOOST_AUTO_TEST_CASE(Corners) {  // Corner values for every instantiated type.
+  TestCorners<uint16_t>();
+  TestCorners<uint32_t>();
+  TestCorners<uint64_t>();
+  TestCorners<int16_t>();
+  TestCorners<int32_t>();
+  TestCorners<int64_t>();
+  TestCorners<const void*>();  // Pointer formatting goes through the same helper.
+}
+// Run TestValue on every representable value of T, in increasing order.
+template <class T> void TestAll() {
+  const T last = std::numeric_limits<T>::max();
+  T current = std::numeric_limits<T>::min();
+  // Stop one short of the maximum so the increment can never wrap.
+  while (current < last) {
+    TestValue(current);
+    ++current;
+  }
+  TestValue(last);
+}
+BOOST_AUTO_TEST_CASE(Short) {  // Exhaustive check of both 16-bit types.
+  TestAll<uint16_t>();
+  TestAll<int16_t>();
+}
+// Run TestValue on each power of ten below numeric_limits<T>::max() / 10 and
+// on its two neighbours, where the number of digits changes.
+template <class T> void Test10s() {
+  const T limit = std::numeric_limits<T>::max() / 10;
+  T power = 1;
+  while (power < limit) {
+    TestValue(power);
+    TestValue(power - 1);
+    TestValue(power + 1);
+    power *= 10;
+  }
+}
+BOOST_AUTO_TEST_CASE(Tens) {  // Powers of ten for the 32- and 64-bit types.
+  Test10s<uint64_t>();
+  Test10s<int64_t>();
+  Test10s<uint32_t>();
+  Test10s<int32_t>();
+}
+// Pointer formatting: powers of ten reinterpreted as addresses, then every
+// value in [0, 256) both as-is and offset by 0xf00.
+BOOST_AUTO_TEST_CASE(Pointers) {
+  const uintptr_t limit = std::numeric_limits<uintptr_t>::max() / 10;
+  for (uintptr_t power = 1; power < limit; power *= 10) {
+    TestValue(reinterpret_cast<const void*>(power));
+  }
+  for (uintptr_t low = 0; low < 256; ++low) {
+    TestValue(reinterpret_cast<const void*>(low));
+    TestValue(reinterpret_cast<const void*>(low + 0xf00));
+  }
+}
+}} // namespaces