diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp b/mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8e4d1641ffd2f6a9f94cbd42baef5db2b2609e72 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.cpp @@ -0,0 +1,94 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifdef HAVE_CMPH + +#include "CmphStringVectorAdapter.h" + +namespace Moses +{ + +void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) +{ + delete[] key; +} + +void CmphStringVectorAdapterRewind(void *data) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + cmph_vector->position = 0; +} + +//************************************************************************// + +cmph_io_adapter_t *CmphVectorAdapterNew(std::vector& v) +{ + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); + cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t)); + assert(key_source); + assert(cmph_vector); + + cmph_vector->vector = (void *)&v; + cmph_vector->position = 0; + key_source->data = (void *)cmph_vector; + key_source->nkeys = v.size(); + + return key_source; +} + +int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + std::vector* v = (std::vector*)cmph_vector->vector; + size_t size; + *keylen = (*v)[cmph_vector->position].size(); + size = *keylen; + *key = new char[size + 1]; + std::string temp = (*v)[cmph_vector->position]; + strcpy(*key, temp.c_str()); + cmph_vector->position = cmph_vector->position + 1; + return (int)(*keylen); +} + +void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen) +{ + delete[] key; +} + +void CmphVectorAdapterRewind(void *data) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + cmph_vector->position = 0; +} + +cmph_io_adapter_t* CmphVectorAdapter(std::vector& v) +{ + cmph_io_adapter_t * key_source = CmphVectorAdapterNew(v); + + key_source->read = CmphVectorAdapterRead; + 
key_source->dispose = CmphVectorAdapterDispose; + key_source->rewind = CmphVectorAdapterRewind; + return key_source; +} + +} + +#endif diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h b/mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h new file mode 100644 index 0000000000000000000000000000000000000000..4a532c289d3c2b5d8ceb511667e0d5c20ef8770f --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/CmphStringVectorAdapter.h @@ -0,0 +1,105 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_CmphStringVectorAdapterNew_h +#define moses_CmphStringVectorAdapterNew_h + +#include +#include + +#ifdef HAVE_CMPH +#include "cmph.h" + +#include "StringVector.h" + +namespace Moses +{ + +typedef struct { + void *vector; + cmph_uint32 position; +} +cmph_vector_t; + + +template class Allocator> +cmph_io_adapter_t *CmphStringVectorAdapterNew(StringVector& sv) +{ + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); + cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t)); + assert(key_source); + assert(cmph_vector); + + cmph_vector->vector = (void *)&sv; + cmph_vector->position = 0; + key_source->data = (void *)cmph_vector; + key_source->nkeys = sv.size(); + + return key_source; +} + +template class Allocator> +int CmphStringVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + StringVector* sv = (StringVector*)cmph_vector->vector; + size_t size; + *keylen = (*sv)[cmph_vector->position].size(); + size = *keylen; + *key = new char[size + 1]; + std::string temp = (*sv)[cmph_vector->position]; + std::strcpy(*key, temp.c_str()); + cmph_vector->position = cmph_vector->position + 1; + return (int)(*keylen); +} + +void CmphStringVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); + +void CmphStringVectorAdapterRewind(void *data); + +template class Allocator> +cmph_io_adapter_t* CmphStringVectorAdapter(StringVector& sv) +{ + cmph_io_adapter_t * key_source = CmphStringVectorAdapterNew(sv); + + key_source->read = CmphStringVectorAdapterRead; + key_source->dispose = CmphStringVectorAdapterDispose; + key_source->rewind = 
CmphStringVectorAdapterRewind; + return key_source; +} + +//************************************************************************// + +cmph_io_adapter_t *CmphVectorAdapterNew(std::vector& v); + +int CmphVectorAdapterRead(void *data, char **key, cmph_uint32 *keylen); + +void CmphVectorAdapterDispose(void *data, char *key, cmph_uint32 keylen); + +void CmphVectorAdapterRewind(void *data); + +cmph_io_adapter_t* CmphVectorAdapter(std::vector& v); + +} + +#endif + +#endif diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp b/mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp new file mode 100644 index 0000000000000000000000000000000000000000..484f6c8c1140aae473222ce78cd646ab66b7b870 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.cpp @@ -0,0 +1,195 @@ +// -*- c++ -*- +// vim:tabstop=2 +// $Id$ +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "LexicalReorderingTableCompact.h" +#include "moses/parameters/OOVHandlingOptions.h" + +namespace Moses +{ +bool LexicalReorderingTableCompact::s_inMemoryByDefault = false; + +LexicalReorderingTableCompact:: +LexicalReorderingTableCompact(const std::string& filePath, + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors) + : LexicalReorderingTable(f_factors, e_factors, c_factors) + , m_inMemory(s_inMemoryByDefault) + , m_numScoreComponent(6) + , m_multipleScoreTrees(true) + , m_hash(10, 16) + , m_scoreTrees(1) +{ + Load(filePath); +} + +LexicalReorderingTableCompact:: +LexicalReorderingTableCompact(const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors) + : LexicalReorderingTable(f_factors, e_factors, c_factors) + , m_inMemory(s_inMemoryByDefault) + , m_numScoreComponent(6) + , m_multipleScoreTrees(true) + , m_hash(10, 16) + , m_scoreTrees(1) +{ } + +LexicalReorderingTableCompact:: +~LexicalReorderingTableCompact() +{ + for(size_t i = 0; i < m_scoreTrees.size(); i++) + delete m_scoreTrees[i]; +} + +std::vector +LexicalReorderingTableCompact:: +GetScore(const Phrase& f, const Phrase& e, const Phrase& c) +{ + std::string key; + Scores scores; + + if(0 == c.GetSize()) + key = MakeKey(f, e, c); + else + for(size_t i = 0; i <= c.GetSize(); ++i) { + Phrase sub_c(c.GetSubString(Range(i,c.GetSize()-1))); + key = MakeKey(f,e,sub_c); + } + + size_t index = m_hash[key]; + if(m_hash.GetSize() != index) { + std::string scoresString; + if(m_inMemory) + scoresString = m_scoresMemory[index].str(); + else + scoresString = m_scoresMapped[index].str(); + + BitWrapper<> bitStream(scoresString); + for(size_t i = 
0; i < m_numScoreComponent; i++) + scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream)); + + return scores; + } + + return Scores(); +} + +std::string +LexicalReorderingTableCompact:: +MakeKey(const Phrase& f, + const Phrase& e, + const Phrase& c) const +{ + return MakeKey(Trim(f.GetStringRep(m_FactorsF)), + Trim(e.GetStringRep(m_FactorsE)), + Trim(c.GetStringRep(m_FactorsC))); +} + +std::string +LexicalReorderingTableCompact:: +MakeKey(const std::string& f, + const std::string& e, + const std::string& c) const +{ + std::string key; + if(!f.empty()) key += f; + if(!m_FactorsE.empty()) { + if(!key.empty()) key += " ||| "; + key += e; + } + if(!m_FactorsC.empty()) { + if(!key.empty()) key += " ||| "; + key += c; + } + key += " ||| "; + return key; +} + +LexicalReorderingTable* +LexicalReorderingTableCompact:: +CheckAndLoad +(const std::string& filePath, + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors) +{ +#ifdef HAVE_CMPH + std::string minlexr = ".minlexr"; + // file name is specified without suffix + if(FileExists(filePath + minlexr)) { + //there exists a compact binary version use that + VERBOSE(2,"Using compact lexical reordering table" << std::endl); + return new LexicalReorderingTableCompact(filePath + minlexr, f_factors, e_factors, c_factors); + } + // file name is specified with suffix + if(filePath.substr(filePath.length() - minlexr.length(), minlexr.length()) == minlexr + && FileExists(filePath)) { + //there exists a compact binary version use that + VERBOSE(2,"Using compact lexical reordering table" << std::endl); + return new LexicalReorderingTableCompact(filePath, f_factors, e_factors, c_factors); + } +#endif + return 0; +} + +void +LexicalReorderingTableCompact:: +Load(std::string filePath) +{ + std::FILE* pFile = std::fopen(filePath.c_str(), "r"); + UTIL_THROW_IF2(pFile == NULL, "File " << filePath << " could not be opened"); + + //if(m_inMemory) + m_hash.Load(pFile); + //else + 
//m_hash.LoadIndex(pFile); + + size_t read = 0; + read += std::fread(&m_numScoreComponent, sizeof(m_numScoreComponent), 1, pFile); + read += std::fread(&m_multipleScoreTrees, + sizeof(m_multipleScoreTrees), 1, pFile); + + if(m_multipleScoreTrees) { + m_scoreTrees.resize(m_numScoreComponent); + for(size_t i = 0; i < m_numScoreComponent; i++) + m_scoreTrees[i] = new CanonicalHuffman(pFile); + } else { + m_scoreTrees.resize(1); + m_scoreTrees[0] = new CanonicalHuffman(pFile); + } + + if(m_inMemory) + m_scoresMemory.load(pFile, false); + else + m_scoresMapped.load(pFile, true); +} + +void +LexicalReorderingTableCompact:: +SetStaticDefaultParameters(Parameter const& param) +{ + param.SetParameter(s_inMemoryByDefault, "minlexr-memory", false); +} + + +} diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h b/mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h new file mode 100644 index 0000000000000000000000000000000000000000..ce4f5b10e9cd88e503ff902ee593e7f16ef19fe8 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCompact.h @@ -0,0 +1,94 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_LexicalReorderingTableCompact_h +#define moses_LexicalReorderingTableCompact_h + +#include "moses/FF/LexicalReordering/LexicalReorderingTable.h" +#include "moses/StaticData.h" +#include "moses/TranslationModel/PhraseDictionary.h" +#include "moses/GenerationDictionary.h" +#include "moses/TargetPhrase.h" +#include "moses/TargetPhraseCollection.h" + +#include "BlockHashIndex.h" +#include "CanonicalHuffman.h" +#include "StringVector.h" + +namespace Moses +{ + +class LexicalReorderingTableCompact: + public LexicalReorderingTable +{ +private: + static bool s_inMemoryByDefault; + bool m_inMemory; + + size_t m_numScoreComponent; + bool m_multipleScoreTrees; + + BlockHashIndex m_hash; + + typedef CanonicalHuffman ScoreTree; + std::vector m_scoreTrees; + + StringVector m_scoresMapped; + StringVector m_scoresMemory; + + std::string MakeKey(const Phrase& f, const Phrase& e, const Phrase& c) const; + std::string MakeKey(const std::string& f, const std::string& e, const std::string& c) const; + +public: + LexicalReorderingTableCompact(const std::string& filePath, + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors); + + LexicalReorderingTableCompact(const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors); + + virtual + ~LexicalReorderingTableCompact(); + + virtual + std::vector + GetScore(const Phrase& f, const Phrase& e, const Phrase& c); + + static + LexicalReorderingTable* + CheckAndLoad(const std::string& filePath, + const std::vector& f_factors, + const std::vector& e_factors, + const std::vector& c_factors); + + void + Load(std::string filePath); + + static void + 
SetStaticDefaultParameters(Parameter const& param); + +}; + +} + +#endif diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h b/mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h new file mode 100644 index 0000000000000000000000000000000000000000..1bf8444fe8d62de9d57ab7570c319d7279004aa8 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/LexicalReorderingTableCreator.h @@ -0,0 +1,143 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_LexicalReorderingTableCreator_h +#define moses_LexicalReorderingTableCreator_h + +#include "PhraseTableCreator.h" + +namespace Moses +{ + +class LexicalReorderingTableCreator +{ +private: + std::string m_inPath; + std::string m_outPath; + std::string m_tempfilePath; + + std::FILE* m_outFile; + + size_t m_orderBits; + size_t m_fingerPrintBits; + + size_t m_numScoreComponent; + + bool m_multipleScoreTrees; + bool m_quantize; + + std::string m_separator; + + BlockHashIndex m_hash; + + typedef Counter ScoreCounter; + typedef CanonicalHuffman ScoreTree; + + std::vector m_scoreCounters; + std::vector m_scoreTrees; + + StringVector* m_encodedScores; + StringVector* m_compressedScores; + + std::priority_queue m_queue; + long m_lastFlushedLine; + long m_lastFlushedSourceNum; + std::string m_lastFlushedSourcePhrase; + std::vector m_lastRange; + +#ifdef WITH_THREADS + size_t m_threads; +#endif + + void PrintInfo(); + + void EncodeScores(); + void CalcHuffmanCodes(); + void CompressScores(); + void Save(); + + std::string MakeSourceTargetKey(std::string&, std::string&); + + std::string EncodeLine(std::vector& tokens); + void AddEncodedLine(PackedItem& pi); + void FlushEncodedQueue(bool force = false); + + std::string CompressEncodedScores(std::string &encodedScores); + void AddCompressedScores(PackedItem& pi); + void FlushCompressedQueue(bool force = false); + +public: + LexicalReorderingTableCreator(std::string inPath, + std::string outPath, + std::string tempfilePath, + size_t orderBits = 10, + size_t fingerPrintBits = 16, + bool multipleScoreTrees = true, + size_t quantize = 0 +#ifdef WITH_THREADS + , size_t threads = 2 +#endif + ); + + ~LexicalReorderingTableCreator(); + + 
friend class EncodingTaskReordering; + friend class CompressionTaskReordering; +}; + +class EncodingTaskReordering +{ +private: +#ifdef WITH_THREADS + static boost::mutex m_mutex; + static boost::mutex m_fileMutex; +#endif + static size_t m_lineNum; + static size_t m_sourcePhraseNum; + static std::string m_lastSourcePhrase; + + InputFileStream& m_inFile; + LexicalReorderingTableCreator& m_creator; + +public: + EncodingTaskReordering(InputFileStream& inFile, LexicalReorderingTableCreator& creator); + void operator()(); +}; + +class CompressionTaskReordering +{ +private: +#ifdef WITH_THREADS + static boost::mutex m_mutex; +#endif + static size_t m_scoresNum; + StringVector &m_encodedScores; + LexicalReorderingTableCreator &m_creator; + +public: + CompressionTaskReordering(StringVector& + m_encodedScores, LexicalReorderingTableCreator& creator); + void operator()(); +}; + +} + +#endif diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/ListCoders.h b/mosesdecoder/moses/TranslationModel/CompactPT/ListCoders.h new file mode 100644 index 0000000000000000000000000000000000000000..b78dbdd8a603b1f8f786ce6ceee32c86fb405431 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/ListCoders.h @@ -0,0 +1,387 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_ListCoders_h +#define moses_ListCoders_h + +#include +#include + +namespace Moses +{ + +template +class VarIntType +{ +private: + template + static void EncodeSymbol(IntType input, OutIt output) { + if(input == 0) { + *output = 0; + output++; + return; + } + + T msb = 1 << (sizeof(T)*8-1); + IntType mask = ~msb; + IntType shift = (sizeof(T)*8-1); + + while(input) { + T res = input & mask; + input >>= shift; + if(input) + res |= msb; + *output = res; + output++; + } + }; + + template + static void DecodeSymbol(InIt &it, InIt end, IntType &output) { + T msb = 1 << (sizeof(T)*8-1); + IntType shift = (sizeof(T)*8-1); + + output = 0; + size_t i = 0; + while(it != end && *it & msb) { + IntType temp = *it & ~msb; + temp <<= shift*i; + output |= temp; + it++; + i++; + } + assert(it != end); + + IntType temp = *it; + temp <<= shift*i; + output |= temp; + it++; + } + +public: + + template + static void Encode(InIt it, InIt end, OutIt outIt) { + while(it != end) { + EncodeSymbol(*it, outIt); + it++; + } + } + + template + static void Decode(InIt &it, InIt end, OutIt outIt) { + while(it != end) { + size_t output; + DecodeSymbol(it, end, output); + *outIt = output; + outIt++; + } + } + + template + static size_t DecodeAndSum(InIt &it, InIt end, size_t num) { + size_t sum = 0; + size_t curr = 0; + + while(it != end && curr < num) { + size_t output; + DecodeSymbol(it, end, output); + sum += output; + curr++; + } + + return sum; + } + +}; + +typedef VarIntType VarByte; + +typedef VarByte VarInt8; +typedef VarIntType VarInt16; +typedef VarIntType VarInt32; + +class Simple9 +{ +private: + typedef unsigned int uint; + + template + inline static void EncodeSymbol(uint &output, InIt 
it, InIt end) { + uint length = end - it; + + uint type = 0; + uint bitlength = 0; + + switch(length) { + case 1: + type = 1; + bitlength = 28; + break; + case 2: + type = 2; + bitlength = 14; + break; + case 3: + type = 3; + bitlength = 9; + break; + case 4: + type = 4; + bitlength = 7; + break; + case 5: + type = 5; + bitlength = 5; + break; + case 7: + type = 6; + bitlength = 4; + break; + case 9: + type = 7; + bitlength = 3; + break; + case 14: + type = 8; + bitlength = 2; + break; + case 28: + type = 9; + bitlength = 1; + break; + } + + output = 0; + output |= (type << 28); + + uint i = 0; + while(it != end) { + UTIL_THROW_IF2(*it > 268435455, "You are trying to encode " << *it + << " with Simple9. Cannot encode numbers larger than 268435455 (2^28-1)"); + + uint l = bitlength * (length-i-1); + output |= *it << l; + it++; + i++; + } + } + + template + static inline void DecodeSymbol(uint input, OutIt outIt) { + uint type = (input >> 28); + + uint bitlen = 0; + uint shift = 0; + uint mask = 0; + + switch(type) { + case 1: + bitlen = 28; + shift = 0; + mask = 268435455; + break; + case 2: + bitlen = 14; + shift = 14; + mask = 16383; + break; + case 3: + bitlen = 9; + shift = 18; + mask = 511; + break; + case 4: + bitlen = 7; + shift = 21; + mask = 127; + break; + case 5: + bitlen = 5; + shift = 20; + mask = 31; + break; + case 6: + bitlen = 4; + shift = 24; + mask = 15; + break; + case 7: + bitlen = 3; + shift = 24; + mask = 7; + break; + case 8: + bitlen = 2; + shift = 26; + mask = 3; + break; + case 9: + bitlen = 1; + shift = 27; + mask = 1; + break; + } + + while(shift > 0) { + *outIt = (input >> shift) & mask; + shift -= bitlen; + outIt++; + } + *outIt = input & mask; + outIt++; + } + + static inline size_t DecodeAndSumSymbol(uint input, size_t num, size_t &curr) { + uint type = (input >> 28); + + uint bitlen = 0; + uint shift = 0; + uint mask = 0; + + switch(type) { + case 1: + bitlen = 28; + shift = 0; + mask = 268435455; + break; + case 2: + bitlen = 14; + 
shift = 14; + mask = 16383; + break; + case 3: + bitlen = 9; + shift = 18; + mask = 511; + break; + case 4: + bitlen = 7; + shift = 21; + mask = 127; + break; + case 5: + bitlen = 5; + shift = 20; + mask = 31; + break; + case 6: + bitlen = 4; + shift = 24; + mask = 15; + break; + case 7: + bitlen = 3; + shift = 24; + mask = 7; + break; + case 8: + bitlen = 2; + shift = 26; + mask = 3; + break; + case 9: + bitlen = 1; + shift = 27; + mask = 1; + break; + } + + size_t sum = 0; + while(shift > 0) { + sum += (input >> shift) & mask; + shift -= bitlen; + if(++curr == num) + return sum; + } + sum += input & mask; + curr++; + return sum; + } + +public: + template + static void Encode(InIt it, InIt end, OutIt outIt) { + uint parts[] = { 1, 2, 3, 4, 5, 7, 9, 14, 28 }; + + uint buffer[28]; + for(InIt i = it; i < end; i++) { + uint lastbit = 1; + uint lastpos = 0; + uint lastyes = 0; + uint j = 0; + + double log2 = log(2); + while(j < 9 && lastpos < 28 && (i+lastpos) < end) { + if(lastpos >= parts[j]) + j++; + + buffer[lastpos] = *(i + lastpos); + + uint reqbit = ceil(log(buffer[lastpos]+1)/log2); + assert(reqbit <= 28); + + uint bit = 28/floor(28/reqbit); + if(lastbit < bit) + lastbit = bit; + + if(parts[j] > 28/lastbit) + break; + else if(lastpos == parts[j]-1) + lastyes = lastpos; + + lastpos++; + } + i += lastyes; + + uint length = lastyes + 1; + uint output; + EncodeSymbol(output, buffer, buffer + length); + + *outIt = output; + outIt++; + } + } + + template + static void Decode(InIt &it, InIt end, OutIt outIt) { + while(it != end) { + DecodeSymbol(*it, outIt); + it++; + } + } + + template + static size_t DecodeAndSum(InIt &it, InIt end, size_t num) { + size_t sum = 0; + size_t curr = 0; + while(it != end && curr < num) { + sum += DecodeAndSumSymbol(*it, num, curr); + it++; + } + assert(curr == num); + return sum; + } +}; + +} + +#endif diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/PackedArray.h b/mosesdecoder/moses/TranslationModel/CompactPT/PackedArray.h 
new file mode 100644 index 0000000000000000000000000000000000000000..479c2cc79cde5082b290dd765b28fe3f9d42abfc --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/PackedArray.h @@ -0,0 +1,187 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_PackedArray_h +#define moses_PackedArray_h + +#include +#include +#include +#include + +#include "ThrowingFwrite.h" + +namespace Moses +{ + +template +class PackedArray +{ +protected: + static size_t m_dataBits; + + size_t m_size; + size_t m_storageSize; + D* m_storage; + +public: + PackedArray() { + m_size = 0; + m_storageSize = 0; + m_storage = new D[0]; + } + + PackedArray(size_t size, size_t bits) : m_size(size) { + m_storageSize = ceil(float(bits * size) / float(m_dataBits)); + m_storage = new D[m_storageSize]; + } + + PackedArray(const PackedArray &c) { + m_size = c.m_size; + + m_storageSize = c.m_storageSize; + m_storage = new D[m_storageSize]; + + std::memcpy(m_storage, c.m_storage, m_storageSize * sizeof(D)); + } + + virtual ~PackedArray() { + delete [] m_storage; + 
m_size = 0; + m_storageSize = 0; + m_storage = 0; + } + + T Get(size_t i, size_t bits) const { + T out = 0; + + size_t bitstart = (i * bits); + size_t bitpos = bitstart; + + size_t zero = ((1ul << (bits)) - 1); + + while(bitpos - bitstart < bits) { + size_t pos = bitpos / m_dataBits; + size_t off = bitpos % m_dataBits; + + out |= (T(m_storage[pos]) << (bitpos - bitstart)) >> off; + + bitpos += (m_dataBits - off); + } + + out &= zero; + return out; + } + + void Set(size_t i, T v, size_t bits) { + size_t bitstart = (i * bits); + size_t bitpos = bitstart; + + while(bitpos - bitstart < bits) { + size_t pos = bitpos / m_dataBits; + size_t off = bitpos % m_dataBits; + + size_t rest = bits - (bitpos - bitstart); + D zero = ~((1ul << (rest + off)) - 1) | ((1ul << off) - 1); + + m_storage[pos] &= zero; + m_storage[pos] |= v << off; + v = v >> (m_dataBits - off); + bitpos += (m_dataBits - off); + } + } + + virtual D*& GetStorage() { + return m_storage; + } + + virtual size_t GetStorageSize() const { + return m_storageSize; + } + + virtual size_t Size() const { + return m_size; + } + + virtual size_t Load(std::FILE* in) { + size_t a1 = std::ftell(in); + + size_t read = 0; + read += std::fread(&m_size, sizeof(m_size), 1, in); + read += std::fread(&m_storageSize, sizeof(m_storageSize), 1, in); + delete [] m_storage; + m_storage = new D[m_storageSize]; + read += std::fread(m_storage, sizeof(D), m_storageSize, in); + + size_t a2 = std::ftell(in); + return a2 - a1; + } + + virtual size_t Save(std::FILE* out) { + size_t a1 = std::ftell(out); + + ThrowingFwrite(&m_size, sizeof(m_size), 1, out); + ThrowingFwrite(&m_storageSize, sizeof(m_storageSize), 1, out); + ThrowingFwrite(m_storage, sizeof(D), m_storageSize, out); + + size_t a2 = std::ftell(out); + return a2 - a1; + } + +}; + +template +size_t PackedArray::m_dataBits = sizeof(D)*8; + +/**************************************************************************/ + +template +class PairedPackedArray : public PackedArray +{ +public: 
+ PairedPackedArray() : PackedArray() {} + + PairedPackedArray(size_t size, size_t bits1, size_t bits2) + : PackedArray(size, bits1 + bits2) { } + + void Set(size_t i, T a, T b, size_t bits1, size_t bits2) { + T c = 0; + c = a | (b << bits1); + PackedArray::Set(i, c, bits1 + bits2); + } + + void Set(size_t i, std::pair p, size_t bits1, size_t bits2) { + T c = 0; + c = p.second | (p.first << bits1); + PackedArray::Set(i, c); + } + + std::pair Get(size_t i, size_t bits1, size_t bits2) { + T v = PackedArray::Get(i, bits1 + bits2); + T a = v & ((1 << bits1) - 1); + T b = v >> bits1; + return std::pair(a, b); + } +}; + +} + +#endif diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp b/mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d93613b8fdf8d4cf8e722e137d3241e5e4fde755 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/PhraseDictionaryCompact.cpp @@ -0,0 +1,194 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "PhraseDictionaryCompact.h" +#include "moses/FactorCollection.h" +#include "moses/Word.h" +#include "moses/Util.h" +#include "moses/InputFileStream.h" +#include "moses/StaticData.h" +#include "moses/Range.h" +#include "moses/ThreadPool.h" +#include "util/exception.hh" + +using namespace std; +using namespace boost::algorithm; + +namespace Moses +{ + +PhraseDictionaryCompact::SentenceCache PhraseDictionaryCompact::m_sentenceCache; + +PhraseDictionaryCompact::PhraseDictionaryCompact(const std::string &line) + :PhraseDictionary(line, true) + ,m_inMemory(s_inMemoryByDefault) + ,m_useAlignmentInfo(true) + ,m_hash(10, 16) + ,m_phraseDecoder(0) +{ + ReadParameters(); +} + +void PhraseDictionaryCompact::Load(AllOptions::ptr const& opts) +{ + m_options = opts; + const StaticData &staticData = StaticData::Instance(); + + SetFeaturesToApply(); + + std::string tFilePath = m_filePath; + + std::string suffix = ".minphr"; + if (!ends_with(tFilePath, suffix)) tFilePath += suffix; + if (!FileExists(tFilePath)) + throw runtime_error("Error: File " + tFilePath + " does not exist."); + + m_phraseDecoder + = new PhraseDecoder(*this, &m_input, &m_output, m_numScoreComponents); + + std::FILE* pFile = std::fopen(tFilePath.c_str() , "r"); + + size_t indexSize; + //if(m_inMemory) + // Load source phrase index into memory + indexSize = m_hash.Load(pFile); + // else + // Keep source phrase index on disk + //indexSize = m_hash.LoadIndex(pFile); + + size_t coderSize = m_phraseDecoder->Load(pFile); + + size_t phraseSize; + if(m_inMemory) + // Load target phrase collections into memory + phraseSize = 
m_targetPhrasesMemory.load(pFile, false); + else + // Keep target phrase collections on disk + phraseSize = m_targetPhrasesMapped.load(pFile, true); + + UTIL_THROW_IF2(indexSize == 0 || coderSize == 0 || phraseSize == 0, + "Not successfully loaded"); +} + +TargetPhraseCollection::shared_ptr +PhraseDictionaryCompact:: +GetTargetPhraseCollectionNonCacheLEGACY(const Phrase &sourcePhrase) const +{ + //cerr << "sourcePhrase=" << sourcePhrase << endl; + + TargetPhraseCollection::shared_ptr ret; + // There is no souch source phrase if source phrase is longer than longest + // observed source phrase during compilation + if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength()) + return ret; + + // Retrieve target phrase collection from phrase table + TargetPhraseVectorPtr decodedPhraseColl + = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, true); + + if(decodedPhraseColl != NULL && decodedPhraseColl->size()) { + TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl)); + TargetPhraseCollection::shared_ptr phraseColl(new TargetPhraseCollection); + + // Score phrases and if possible apply ttable_limit + TargetPhraseVector::iterator nth = + (m_tableLimit == 0 || tpv->size() < m_tableLimit) ? 
+ tpv->end() : tpv->begin() + m_tableLimit; + NTH_ELEMENT4(tpv->begin(), nth, tpv->end(), CompareTargetPhrase()); + for(TargetPhraseVector::iterator it = tpv->begin(); it != nth; it++) { + TargetPhrase *tp = new TargetPhrase(*it); + phraseColl->Add(tp); + } + + // Cache phrase pair for clean-up or retrieval with PREnc + const_cast(this)->CacheForCleanup(phraseColl); + + return phraseColl; + } else + return ret; +} + +TargetPhraseVectorPtr +PhraseDictionaryCompact:: +GetTargetPhraseCollectionRaw(const Phrase &sourcePhrase) const +{ + + // There is no such source phrase if source phrase is longer than longest + // observed source phrase during compilation + if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength()) + return TargetPhraseVectorPtr(); + + // Retrieve target phrase collection from phrase table + return m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true, false); +} + +PhraseDictionaryCompact:: +~PhraseDictionaryCompact() +{ + if(m_phraseDecoder) + delete m_phraseDecoder; +} + +void +PhraseDictionaryCompact:: +CacheForCleanup(TargetPhraseCollection::shared_ptr tpc) +{ + if(!m_sentenceCache.get()) + m_sentenceCache.reset(new PhraseCache()); + m_sentenceCache->push_back(tpc); +} + +void +PhraseDictionaryCompact:: +AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase) +{ } + +void +PhraseDictionaryCompact:: +CleanUpAfterSentenceProcessing(const InputType &source) +{ + if(!m_sentenceCache.get()) + m_sentenceCache.reset(new PhraseCache()); + + m_phraseDecoder->PruneCache(); + m_sentenceCache->clear(); + + ReduceCache(); +} + +bool PhraseDictionaryCompact::s_inMemoryByDefault = false; +void +PhraseDictionaryCompact:: +SetStaticDefaultParameters(Parameter const& param) +{ + param.SetParameter(s_inMemoryByDefault, "minphr-memory", false); +} +} + diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/StringVectorTemp.h b/mosesdecoder/moses/TranslationModel/CompactPT/StringVectorTemp.h new file mode 100644 index 
0000000000000000000000000000000000000000..ffac0b718f7bc55fea6d4445e04746599d4fc2df --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/StringVectorTemp.h @@ -0,0 +1,430 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_StringVectorTemp_h +#define moses_StringVectorTemp_h + +#include +#include +#include +#include +#include +#include + +#include + +#include "ThrowingFwrite.h" +#include "StringVector.h" + +#include "MmapAllocator.h" + +namespace Moses +{ + + +// ********** StringVectorTemp ********** + +template class Allocator = std::allocator> +class StringVectorTemp +{ +protected: + bool m_sorted; + bool m_memoryMapped; + + std::vector >* m_charArray; + std::vector m_positions; + + virtual const ValueT* value_ptr(PosT i) const; + +public: + //typedef ValueIteratorRange >::const_iterator> range; + typedef ValueIteratorRange range; + + // ********** RangeIterator ********** + + class RangeIterator : public boost::iterator_facade + { + + private: + PosT m_index; + StringVectorTemp* m_container; + + public: + 
RangeIterator(); + RangeIterator(StringVectorTemp &sv, PosT index=0); + + PosT get_index(); + + private: + friend class boost::iterator_core_access; + + range dereference() const; + bool equal(RangeIterator const& other) const; + void increment(); + void decrement(); + void advance(PosT n); + + PosT distance_to(RangeIterator const& other) const; + }; + + // ********** StringIterator ********** + + class StringIterator : public boost::iterator_facade + { + + private: + PosT m_index; + StringVectorTemp* m_container; + + public: + StringIterator(); + StringIterator(StringVectorTemp &sv, PosT index=0); + + PosT get_index(); + + private: + friend class boost::iterator_core_access; + + const std::string dereference() const; + bool equal(StringIterator const& other) const; + void increment(); + void decrement(); + void advance(PosT n); + PosT distance_to(StringIterator const& other) const; + }; + + typedef RangeIterator iterator; + typedef StringIterator string_iterator; + + StringVectorTemp(); + StringVectorTemp(Allocator alloc); + + virtual ~StringVectorTemp() { + delete m_charArray; + } + + void swap(StringVectorTemp &c) { + m_positions.swap(c.m_positions); + m_charArray->swap(*c.m_charArray); + + bool temp = m_sorted; + m_sorted = c.m_sorted; + c.m_sorted = temp; + } + + bool is_sorted() const; + PosT size() const; + virtual PosT size2() const; + + template Iterator begin() const; + template Iterator end() const; + + iterator begin() const; + iterator end() const; + + PosT length(PosT i) const; + //typename std::vector >::const_iterator begin(PosT i) const; + //typename std::vector >::const_iterator end(PosT i) const; + const ValueT* begin(PosT i) const; + const ValueT* end(PosT i) const; + + void clear() { + m_charArray->clear(); + m_sorted = true; + m_positions.clear(); + } + + range at(PosT i) const; + range operator[](PosT i) const; + range back() const; + + template + void push_back(StringT s); + void push_back(const char* c); + + template + PosT find(StringT &s) 
const; + PosT find(const char* c) const; +}; + +// ********** Implementation ********** + +// StringVectorTemp + +template class Allocator> +StringVectorTemp::StringVectorTemp() + : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >()) { } + +template class Allocator> +StringVectorTemp::StringVectorTemp(Allocator alloc) + : m_sorted(true), m_memoryMapped(false), m_charArray(new std::vector >(alloc)) { } + +template class Allocator> +template +void StringVectorTemp::push_back(StringT s) +{ + if(is_sorted() && size() && !(back() < s)) + m_sorted = false; + + m_positions.push_back(size2()); + std::copy(s.begin(), s.end(), std::back_inserter(*m_charArray)); +} + +template class Allocator> +void StringVectorTemp::push_back(const char* c) +{ + std::string dummy(c); + push_back(dummy); +} + +template class Allocator> +template +Iterator StringVectorTemp::begin() const +{ + return Iterator(const_cast&>(*this), 0); +} + +template class Allocator> +template +Iterator StringVectorTemp::end() const +{ + return Iterator(const_cast&>(*this), size()); +} + +template class Allocator> +typename StringVectorTemp::iterator StringVectorTemp::begin() const +{ + return begin(); +}; + +template class Allocator> +typename StringVectorTemp::iterator StringVectorTemp::end() const +{ + return end(); +}; + +template class Allocator> +bool StringVectorTemp::is_sorted() const +{ + return m_sorted; +} + +template class Allocator> +PosT StringVectorTemp::size() const +{ + return m_positions.size(); +} + +template class Allocator> +PosT StringVectorTemp::size2() const +{ + return m_charArray->size(); +} + +template class Allocator> +typename StringVectorTemp::range StringVectorTemp::at(PosT i) const +{ + return range(begin(i), end(i)); +} + +template class Allocator> +typename StringVectorTemp::range StringVectorTemp::operator[](PosT i) const +{ + return at(i); +} + +template class Allocator> +typename StringVectorTemp::range StringVectorTemp::back() const +{ + return 
at(size()-1); +} + +template class Allocator> +PosT StringVectorTemp::length(PosT i) const +{ + if(i+1 < size()) + return m_positions[i+1] - m_positions[i]; + else + return size2() - m_positions[i]; +} + +template class Allocator> +const ValueT* StringVectorTemp::value_ptr(PosT i) const +{ + return &(*m_charArray)[m_positions[i]]; +} + +template class Allocator> +//typename std::vector >::const_iterator StringVectorTemp::begin(PosT i) const +const ValueT* StringVectorTemp::begin(PosT i) const +{ + //return typename std::vector >::const_iterator(value_ptr(i)); + return value_ptr(i); +} + +template class Allocator> +//typename std::vector >::const_iterator StringVectorTemp::end(PosT i) const +const ValueT* StringVectorTemp::end(PosT i) const +{ + //return typename std::vector >::const_iterator(value_ptr(i) + length(i)); + return value_ptr(i) + length(i); +} + +template class Allocator> +template +PosT StringVectorTemp::find(StringT &s) const +{ + if(m_sorted) + return std::distance(begin(), std::lower_bound(begin(), end(), s)); + return std::distance(begin(), std::find(begin(), end(), s)); +} + +template class Allocator> +PosT StringVectorTemp::find(const char* c) const +{ + std::string s(c); + return find(s); +} + +// RangeIterator + +template class Allocator> +StringVectorTemp::RangeIterator::RangeIterator() : m_index(0), m_container(0) { } + +template class Allocator> +StringVectorTemp::RangeIterator::RangeIterator(StringVectorTemp &sv, PosT index) + : m_index(index), m_container(&sv) { } + +template class Allocator> +PosT StringVectorTemp::RangeIterator::get_index() +{ + return m_index; +} + +template class Allocator> +typename StringVectorTemp::range +StringVectorTemp::RangeIterator::dereference() const +{ + return typename StringVectorTemp::range( + m_container->begin(m_index), + m_container->end(m_index) + ); +} + +template class Allocator> +bool StringVectorTemp::RangeIterator::equal( + StringVectorTemp::RangeIterator const& other) const +{ + return m_index 
== other.m_index && m_container == other.m_container; +} + +template class Allocator> +void StringVectorTemp::RangeIterator::increment() +{ + m_index++; +} + +template class Allocator> +void StringVectorTemp::RangeIterator::decrement() +{ + m_index--; +} + +template class Allocator> +void StringVectorTemp::RangeIterator::advance(PosT n) +{ + m_index += n; +} + +template class Allocator> +PosT StringVectorTemp::RangeIterator::distance_to( + StringVectorTemp::RangeIterator const& other) const +{ + return other.m_index - m_index; +} + +// StringIterator + +template class Allocator> +StringVectorTemp::StringIterator::StringIterator() + : m_index(0), m_container(0) { } + +template class Allocator> +StringVectorTemp::StringIterator::StringIterator( + StringVectorTemp &sv, PosT index) : m_index(index), + m_container(&sv) { } + +template class Allocator> +PosT StringVectorTemp::StringIterator::get_index() +{ + return m_index; +} + +template class Allocator> +const std::string StringVectorTemp::StringIterator::dereference() const +{ + return StringVectorTemp::range(m_container->begin(m_index), + m_container->end(m_index)).str(); +} + +template class Allocator> +bool StringVectorTemp::StringIterator::equal( + StringVectorTemp::StringIterator const& other) const +{ + return m_index == other.m_index && m_container == other.m_container; +} + +template class Allocator> +void StringVectorTemp::StringIterator::increment() +{ + m_index++; +} + +template class Allocator> +void StringVectorTemp::StringIterator::decrement() +{ + m_index--; +} + +template class Allocator> +void StringVectorTemp::StringIterator::advance(PosT n) +{ + m_index += n; +} + +template class Allocator> +PosT StringVectorTemp::StringIterator::distance_to( + StringVectorTemp::StringIterator const& other) const +{ + return other.m_index - m_index; +} + +// ********** Some typedefs ********** + +typedef StringVectorTemp MediumStringVectorTemp; +typedef StringVectorTemp LongStringVectorTemp; + +} + +#endif diff 
--git a/mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h b/mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h new file mode 100644 index 0000000000000000000000000000000000000000..e017a3c1914397f84bcffb93240b5c3e1fc40e55 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/TargetPhraseCollectionCache.h @@ -0,0 +1,163 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_TargetPhraseCollectionCache_h +#define moses_TargetPhraseCollectionCache_h + +#include +#include +#include + +#include +#include + +#include "moses/Phrase.h" +#include "moses/TargetPhraseCollection.h" + +namespace Moses +{ + +// Avoid using new due to locking +typedef std::vector TargetPhraseVector; +typedef boost::shared_ptr TargetPhraseVectorPtr; + +/** Implementation of Persistent Cache **/ +class TargetPhraseCollectionCache +{ +private: + size_t m_max; + float m_tolerance; + + struct LastUsed { + clock_t m_clock; + TargetPhraseVectorPtr m_tpv; + size_t m_bitsLeft; + + LastUsed() : m_clock(0), m_bitsLeft(0) {} + + LastUsed(clock_t clock, TargetPhraseVectorPtr tpv, size_t bitsLeft = 0) + : m_clock(clock), m_tpv(tpv), m_bitsLeft(bitsLeft) {} + }; + + typedef std::map CacheMap; + static boost::thread_specific_ptr m_phraseCache; + +public: + + typedef CacheMap::iterator iterator; + typedef CacheMap::const_iterator const_iterator; + + TargetPhraseCollectionCache(size_t max = 5000, float tolerance = 0.2) + : m_max(max), m_tolerance(tolerance) { + } + + iterator Begin() { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + return m_phraseCache->begin(); + } + + const_iterator Begin() const { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + return m_phraseCache->begin(); + } + + iterator End() { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + return m_phraseCache->end(); + } + + const_iterator End() const { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + return m_phraseCache->end(); + } + + /** store translations for source phrase in persistent cache; an entry that is already present only gets its clock refreshed, and a new entry is cut to the first maxRank phrases when maxRank is non-zero **/ + void Cache(const Phrase 
&sourcePhrase, TargetPhraseVectorPtr tpv, + size_t bitsLeft = 0, size_t maxRank = 0) { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + // check if source phrase is already in cache + iterator it = m_phraseCache->find(sourcePhrase); + if(it != m_phraseCache->end()) + // if found, just update clock + it->second.m_clock = clock(); + else { + // else, add to cache + if(maxRank && tpv->size() > maxRank) { + TargetPhraseVectorPtr tpv_temp(new TargetPhraseVector()); + tpv_temp->resize(maxRank); + std::copy(tpv->begin(), tpv->begin() + maxRank, tpv_temp->begin()); + (*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv_temp, bitsLeft); + } else + (*m_phraseCache)[sourcePhrase] = LastUsed(clock(), tpv, bitsLeft); + } + } + + /** look up source phrase; on a hit refreshes its clock and returns (phrases, bitsLeft), otherwise returns (empty pointer, 0) **/ + std::pair Retrieve(const Phrase &sourcePhrase) { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + iterator it = m_phraseCache->find(sourcePhrase); + if(it != m_phraseCache->end()) { + LastUsed &lu = it->second; + lu.m_clock = clock(); + return std::make_pair(lu.m_tpv, lu.m_bitsLeft); + } else + return std::make_pair(TargetPhraseVectorPtr(), 0); + } + + // if cache full, reduce: once it holds more than m_max * (1 + m_tolerance) + // entries, erase entries in ascending m_clock order until fewer than + // m_max * (1 - m_tolerance) remain + void Prune() { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + if(m_phraseCache->size() > m_max * (1 + m_tolerance)) { + typedef std::set > Cands; + Cands cands; + for(CacheMap::iterator it = m_phraseCache->begin(); + it != m_phraseCache->end(); it++) { + LastUsed &lu = it->second; + cands.insert(std::make_pair(lu.m_clock, it->first)); + } + + for(Cands::iterator it = cands.begin(); it != cands.end(); it++) { + const Phrase& p = it->second; + m_phraseCache->erase(p); + + if(m_phraseCache->size() < (m_max * (1 - m_tolerance))) + break; + } + } + } + + // remove every entry from the cache map held in m_phraseCache + void CleanUp() { + if(!m_phraseCache.get()) + m_phraseCache.reset(new CacheMap()); + m_phraseCache->clear(); + } + +}; + +} + +#endif diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp 
b/mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp new file mode 100644 index 0000000000000000000000000000000000000000..554c4ed7b599da6b089a6dfede35479158b20bf1 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.cpp @@ -0,0 +1,30 @@ +// $Id$ +// vim:tabstop=2 +/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "ThrowingFwrite.h" + +// fwrite wrapper that throws (via UTIL_THROW_IF2) when fewer than count +// elements were written. Returns the number of elements written, which on +// success equals count. size must be non-zero (asserted). +size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream) +{ + assert(size); + size_t returnValue = std::fwrite(ptr, size, count, stream); + UTIL_THROW_IF2(count != returnValue, "Short fwrite; requested size " << size); + return returnValue; +} diff --git a/mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.h b/mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.h new file mode 100644 index 0000000000000000000000000000000000000000..466d3973b33eb78dd9dd80854da5a229ceceb97e --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/CompactPT/ThrowingFwrite.h @@ -0,0 +1,31 @@ +// $Id$ +// vim:tabstop=2 
+/*********************************************************************** +Moses - factored phrase-based language decoder +Copyright (C) 2006 University of Edinburgh + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#ifndef moses_ThrowingFwrite_h +#define moses_ThrowingFwrite_h + +#include +#include +#include "util/exception.hh" + +size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE* stream); + +#endif diff --git a/mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.cpp b/mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e7ec1d48dde290750903313bf0d84aa475265ea3 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/RuleTable/LoaderFactory.cpp @@ -0,0 +1,65 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2011 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "LoaderFactory.h" + +#include "moses/Util.h" +#include "moses/InputFileStream.h" +#include "LoaderCompact.h" +#include "LoaderHiero.h" +#include "LoaderStandard.h" + +#include +#include + +using namespace std; + +namespace Moses +{ + +// Determines the rule table type by peeking inside the file then creates +// a suitable RuleTableLoader object. +std::auto_ptr +RuleTableLoaderFactory:: +Create(const std::string &path) +{ + InputFileStream input(path); + std::string line; + + if (std::getline(input, line)) { + std::vector tokens; + Tokenize(tokens, line); + if (tokens.size() == 1) { + if (tokens[0] == "1") { + return std::auto_ptr(new RuleTableLoaderCompact()); + } + std::cerr << "Unsupported compact rule table format: " << tokens[0]; + return std::auto_ptr(); + } else if (tokens[0] == "[X]" && tokens[1] == "|||") { + return std::auto_ptr(new RuleTableLoaderHiero()); + } + + return std::auto_ptr(new RuleTableLoaderStandard()); + } else { + // empty phrase table + return std::auto_ptr(new RuleTableLoaderStandard()); + } +} + +} // namespace Moses diff --git a/mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.cpp b/mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eb81d56772e07a4ba7737ed338c2346751245888 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/RuleTable/LoaderHiero.cpp @@ -0,0 +1,33 @@ +// +// RuleTableLoaderHiero.cpp +// 
moses +// +// Created by Hieu Hoang on 04/11/2011. +// Copyright 2011 __MyCompanyName__. All rights reserved. +// + +#include +#include "LoaderHiero.h" + +using namespace std; + +namespace Moses +{ + +// Loads a rule table by forwarding every argument to +// RuleTableLoaderStandard::Load with the format fixed to HieroFormat, and +// returns that call's result. +bool RuleTableLoaderHiero::Load(AllOptions const& opts, + const std::vector &input, + const std::vector &output, + const std::string &inFile, + size_t tableLimit, + RuleTableTrie &ruleTable) +{ + bool ret = RuleTableLoaderStandard::Load(opts, HieroFormat + ,input, output + ,inFile + ,tableLimit + ,ruleTable); + return ret; +} + +} + diff --git a/mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.cpp b/mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c84286588830eec2eef9405e64727d6ed82c9be3 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/RuleTable/LoaderStandard.cpp @@ -0,0 +1,260 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2011 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "LoaderStandard.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include "Trie.h" +#include "moses/FactorCollection.h" +#include "moses/Word.h" +#include "moses/Util.h" +#include "moses/InputFileStream.h" +#include "moses/StaticData.h" +#include "moses/Range.h" +#include "moses/ChartTranslationOptionList.h" +#include "moses/FactorCollection.h" +#include "util/file_piece.hh" +#include "util/string_piece.hh" +#include "util/tokenize_piece.hh" +#include "util/double-conversion/double-conversion.h" +#include "util/exception.hh" + +using namespace std; +using namespace boost::algorithm; + +namespace Moses +{ + +// Loads a rule table in MosesFormat by forwarding to the format-aware overload. +bool +RuleTableLoaderStandard:: +Load(AllOptions const& opts + , const std::vector &input + , const std::vector &output + , const std::string &inFile + , size_t tableLimit + , RuleTableTrie &ruleTable) +{ + return Load(opts, MosesFormat,input, output ,inFile ,tableLimit ,ruleTable); +} + +// Rewrites one side of a Hiero rule in place: every bracketed token "[A,n]" +// becomes "[X]" + A + "]", its position is recorded in ntAlign[n] (first for +// sourceTarget == 0, second otherwise), and " [X]" is appended to the phrase. +void ReformatHieroRule(int sourceTarget, string &phrase, map > &ntAlign) +{ + vector toks; + Tokenize(toks, phrase, " "); + + for (size_t i = 0; i < toks.size(); ++i) { + string &tok = toks[i]; + if (starts_with(tok, "[") && ends_with(tok, "]")) { + // no-term + vector split = Tokenize(tok, ","); + UTIL_THROW_IF2(split.size() != 2, + "Incorrectly formmatted non-terminal: " << tok); + + tok = "[X]" + split[0] + "]"; + size_t coIndex = Scan(split[1]); + + pair &alignPoint = ntAlign[coIndex]; + if (sourceTarget == 0) { + alignPoint.first = i; + } else { + alignPoint.second = i; + } + } + } + + phrase = Join(" ", toks) + " [X]"; + +} + +// Rewrites a space-separated list of "name=value" scores in place as the list +// of exp(-value) values. +void ReformateHieroScore(string &scoreString) +{ + vector toks; + Tokenize(toks, scoreString, " "); + + for 
(size_t i = 0; i < toks.size(); ++i) { + string &tok = toks[i]; + vector nameValue = Tokenize(tok, "="); + UTIL_THROW_IF2(nameValue.size() != 2, + "Incorrectly formatted score: " << tok); + + float score = Scan(nameValue[1]); + score = exp(-score); + tok = SPrint(score); + } + + scoreString = Join(" ", toks); +} + +// Converts one "|||"-separated Hiero rule line into the layout parsed by Load +// below: source ||| target ||| scores ||| alignment. Fields 1, 2 and 3 of the +// input are used; field 0 is ignored. +void ReformatHieroRule(const string &lineOrig, string &out) +{ + vector tokens; + vector scoreVector; + + TokenizeMultiCharSeparator(tokens, lineOrig, "|||" ); + // tokens[1..3] are referenced below, so reject shorter lines up front. + UTIL_THROW_IF2(tokens.size() < 4, + "Incorrectly formatted Hiero rule: " << lineOrig); + + string &sourcePhraseString = tokens[1] + , &targetPhraseString = tokens[2] + , &scoreString = tokens[3]; + + map > ntAlign; + ReformatHieroRule(0, sourcePhraseString, ntAlign); + ReformatHieroRule(1, targetPhraseString, ntAlign); + ReformateHieroScore(scoreString); + + util::StringStream align; + map >::const_iterator iterAlign; + for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign) { + const pair &alignPoint = iterAlign->second; + align << alignPoint.first << "-" << alignPoint.second << " "; + } + + util::StringStream ret; + ret << sourcePhraseString << " ||| " + << targetPhraseString << " ||| " + << scoreString << " ||| " + << align.str(); + + out = ret.str(); +} + +// Reads inFile line by line (Hiero lines are first converted with +// ReformatHieroRule), builds a TargetPhrase per line, adds it to ruleTable and +// finally calls SortAndPrune. Throws on a line with fewer than three fields, +// a score that does not parse, or a score count that differs from +// ruleTable.GetNumScoreComponents(). Returns true. +bool RuleTableLoaderStandard::Load(AllOptions const& opts, FormatType format + , const std::vector &input + , const std::vector &output + , const std::string &inFile + , size_t /* tableLimit */ + , RuleTableTrie &ruleTable) +{ + PrintUserTime(string("Start loading text phrase table. 
") + (format==MosesFormat?"Moses":"Hiero") + " format"); + + // const StaticData &staticData = StaticData::Instance(); + + string lineOrig; + size_t count = 0; + + std::ostream *progress = NULL; + IFVERBOSE(1) progress = &std::cerr; + util::FilePiece in(inFile.c_str(), progress); + + // reused variables + vector scoreVector; + StringPiece line; + std::string hiero_before, hiero_after; + + double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); + + while(true) { + try { + line = in.ReadLine(); + } catch (const util::EndOfFileException &e) { + break; + } + + if (format == HieroFormat) { // inefficiently reformat line + hiero_before.assign(line.data(), line.size()); + ReformatHieroRule(hiero_before, hiero_after); + line = hiero_after; + } + + // The first three fields are mandatory; check the iterator before each + // dereference instead of reading past the last field. + util::TokenIter pipes(line, "|||"); + UTIL_THROW_IF2(!pipes, "Missing source phrase on line " << count); + StringPiece sourcePhraseString(*pipes); + ++pipes; + UTIL_THROW_IF2(!pipes, "Missing target phrase on line " << count); + StringPiece targetPhraseString(*pipes); + ++pipes; + UTIL_THROW_IF2(!pipes, "Missing scores on line " << count); + StringPiece scoreString(*pipes); + + StringPiece alignString; + if (++pipes) { + StringPiece temp(*pipes); + alignString = temp; + } + + // NOTE(review): the test looks at the source field while the message says + // "empty target" - confirm which of the two is intended. + bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); + if (isLHSEmpty && !opts.unk.word_deletion_enabled) { + TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n"); + continue; + } + + scoreVector.clear(); + for (util::TokenIter s(scoreString, " \t"); s; ++s) { + int processed; + float score = converter.StringToFloat(s->data(), s->length(), &processed); + UTIL_THROW_IF2(isnan(score), "Bad score " << *s << " on line " << count); + scoreVector.push_back(FloorScore(TransformScore(score))); + } + const size_t numScoreComponents = ruleTable.GetNumScoreComponents(); + if (scoreVector.size() != numScoreComponents) { + UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!=" + << numScoreComponents << ") of score components on line " << count); + } + + // parse source & find pt node + + // constituent labels 
+ Word *sourceLHS = NULL; + Word *targetLHS = NULL; + + // create target phrase obj + TargetPhrase *targetPhrase = new TargetPhrase(&ruleTable); + targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS); + // source + Phrase sourcePhrase; + sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS); + + // rest of target phrase + targetPhrase->SetAlignmentInfo(alignString); + targetPhrase->SetTargetLHS(targetLHS); + + ++pipes; // skip over counts field + + if (++pipes) { + StringPiece sparseString(*pipes); + targetPhrase->SetSparseScore(&ruleTable, sparseString); + } + + if (++pipes) { + StringPiece propertiesString(*pipes); + targetPhrase->SetProperties(propertiesString); + } + + targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector); + targetPhrase->EvaluateInIsolation(sourcePhrase, ruleTable.GetFeaturesToApply()); + + TargetPhraseCollection::shared_ptr phraseColl + = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, + *targetPhrase, sourceLHS); + phraseColl->Add(targetPhrase); + + // not implemented correctly in memory pt. 
just delete it for now + delete sourceLHS; + + count++; + } + + // sort and prune each target phrase collection + SortAndPrune(ruleTable); + + return true; +} + +} diff --git a/mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp b/mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp new file mode 100644 index 0000000000000000000000000000000000000000..50dd4bb8afa4aab8510c1d3ad8420376112079f5 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryFuzzyMatch.cpp @@ -0,0 +1,417 @@ +// vim:tabstop=2 + +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2006 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "Loader.h" +#include "LoaderFactory.h" +#include "PhraseDictionaryFuzzyMatch.h" +#include "moses/FactorCollection.h" +#include "moses/Word.h" +#include "moses/Util.h" +#include "moses/InputFileStream.h" +#include "moses/StaticData.h" +#include "moses/Range.h" +#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.h" +#include "moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h" +#include "moses/TranslationModel/fuzzy-match/SentenceAlignment.h" +#include "moses/TranslationTask.h" +#include "util/file.hh" +#include "util/exception.hh" +#include "util/random.hh" + +using namespace std; + +#if defined __MINGW32__ && !defined mkdtemp +#include +#include +char *mkdtemp(char *tempbuf) +{ + int rand_value = 0; + char* tempbase = NULL; + char tempbasebuf[MAX_PATH] = ""; + + if (strcmp(&tempbuf[strlen(tempbuf)-6], "XXXXXX")) { + errno = EINVAL; + return NULL; + } + + util::rand_init(); + rand_value = util::rand_excl(1e6); + tempbase = strrchr(tempbuf, '/'); + tempbase = tempbase ? 
tempbase+1 : tempbuf; + strcpy(tempbasebuf, tempbase); + sprintf(&tempbasebuf[strlen(tempbasebuf)-6], "%d", rand_value); + ::GetTempPath(MAX_PATH, tempbuf); + strcat(tempbuf, tempbasebuf); + ::CreateDirectory(tempbuf, NULL); + return tempbuf; +} +#endif + +namespace Moses +{ + +PhraseDictionaryFuzzyMatch::PhraseDictionaryFuzzyMatch(const std::string &line) + :PhraseDictionary(line, true) + ,m_config(3) + ,m_FuzzyMatchWrapper(NULL) +{ + ReadParameters(); +} + +PhraseDictionaryFuzzyMatch::~PhraseDictionaryFuzzyMatch() +{ + delete m_FuzzyMatchWrapper; +} + +void PhraseDictionaryFuzzyMatch::Load(AllOptions::ptr const& opts) +{ + m_options = opts; + SetFeaturesToApply(); + + m_FuzzyMatchWrapper = new tmmt::FuzzyMatchWrapper(m_config[0], m_config[1], m_config[2]); +} + +ChartRuleLookupManager *PhraseDictionaryFuzzyMatch::CreateRuleLookupManager( + const ChartParser &parser, + const ChartCellCollectionBase &cellCollection, + std::size_t /*maxChartSpan*/) +{ + return new ChartRuleLookupManagerMemoryPerSentence(parser, cellCollection, *this); +} + +void +PhraseDictionaryFuzzyMatch:: +SetParameter(const std::string& key, const std::string& value) +{ + if (key == "source") { + m_config[0] = value; + } else if (key == "target") { + m_config[1] = value; + } else if (key == "alignment") { + m_config[2] = value; + } else { + PhraseDictionary::SetParameter(key, value); + } +} + +int removedirectoryrecursively(const char *dirname) +{ +#if defined __MINGW32__ + //TODO(jie): replace this function with boost implementation +#else + DIR *dir; + struct dirent *entry; + char path[PATH_MAX]; + + dir = opendir(dirname); + if (dir == NULL) { + perror("Error opendir()"); + return 0; + } + + while ((entry = readdir(dir)) != NULL) { + if (strcmp(entry->d_name, ".") && strcmp(entry->d_name, "..")) { + snprintf(path, (size_t) PATH_MAX, "%s/%s", dirname, entry->d_name); + if (entry->d_type == DT_DIR) { + removedirectoryrecursively(path); + } + + remove(path); + /* + * Here, the actual deletion 
must be done. Beacuse this is + * quite a dangerous thing to do, and this program is not very + * well tested, we are just printing as if we are deleting. + */ + //printf("(not really) Deleting: %s\n", path); + /* + * When you are finished testing this and feel you are ready to do the real + * deleting, use this: remove*STUB*(path); + * (see "man 3 remove") + * Please note that I DONT TAKE RESPONSIBILITY for data you delete with this! + */ + } + + } + closedir(dir); + + rmdir(dirname); + /* + * Now the directory is emtpy, finally delete the directory itself. (Just + * printing here, see above) + */ + //printf("(not really) Deleting: %s\n", dirname); +#endif + return 1; +} + +void PhraseDictionaryFuzzyMatch::InitializeForInput(ttasksptr const& ttask) +{ + InputType const& inputSentence = *ttask->GetSource(); +#if defined __MINGW32__ + char dirName[] = "moses.XXXXXX"; +#else + char dirName[] = "/tmp/moses.XXXXXX"; +#endif // defined + char *temp = mkdtemp(dirName); + UTIL_THROW_IF2(temp == NULL, + "Couldn't create temporary directory " << dirName); + + string dirNameStr(dirName); + + string inFileName(dirNameStr + "/in"); + + ofstream inFile(inFileName.c_str()); + + for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) { + inFile << inputSentence.GetWord(i); + } + inFile << endl; + inFile.close(); + + long translationId = inputSentence.GetTranslationId(); + string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr); + + // populate with rules for this sentence + PhraseDictionaryNodeMemory &rootNode = m_collection[translationId]; + FormatType format = MosesFormat; + + // data from file + InputFileStream inStream(ptFileName); + + // copied from class LoaderStandard + PrintUserTime("Start loading fuzzy-match phrase model"); + + const StaticData &staticData = StaticData::Instance(); + + + string lineOrig; + size_t count = 0; + + while(getline(inStream, lineOrig)) { + const string *line; + if (format == HieroFormat) { // reformat line + 
UTIL_THROW(util::Exception, "Cannot be Hiero format"); + //line = ReformatHieroRule(lineOrig); + } else { + // do nothing to format of line + line = &lineOrig; + } + + vector tokens; + vector scoreVector; + + TokenizeMultiCharSeparator(tokens, *line , "|||" ); + + if (tokens.size() != 4 && tokens.size() != 5) { + UTIL_THROW2("Syntax error at " << ptFileName << ":" << count); + } + + const string &sourcePhraseString = tokens[0] + , &targetPhraseString = tokens[1] + , &scoreString = tokens[2] + , &alignString = tokens[3]; + + bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); + if (isLHSEmpty && !ttask->options()->unk.word_deletion_enabled) { + TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n"); + continue; + } + + Tokenize(scoreVector, scoreString); + const size_t numScoreComponents = GetNumScoreComponents(); + if (scoreVector.size() != numScoreComponents) { + UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!=" + << numScoreComponents << ") of score components on line " << count); + } + + UTIL_THROW_IF2(scoreVector.size() != numScoreComponents, + "Number of scores incorrectly specified"); + + // parse source & find pt node + + // constituent labels + Word *sourceLHS; + Word *targetLHS; + + // source + Phrase sourcePhrase( 0); + sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, &sourceLHS); + + // create target phrase obj + TargetPhrase *targetPhrase = new TargetPhrase(this); + targetPhrase->CreateFromString(Output, m_output, targetPhraseString, &targetLHS); + + // rest of target phrase + targetPhrase->SetAlignmentInfo(alignString); + targetPhrase->SetTargetLHS(targetLHS); + //targetPhrase->SetDebugOutput(string("New Format pt ") + line); + + // component score, for n-best output + std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore); + std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); + 
+ targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); + targetPhrase->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply()); + + TargetPhraseCollection::shared_ptr phraseColl + = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, + *targetPhrase, sourceLHS); + phraseColl->Add(targetPhrase); + + count++; + + if (format == HieroFormat) { // reformat line + delete line; + } else { + // do nothing + } + + } + + // sort and prune each target phrase collection + SortAndPrune(rootNode); + + //removedirectoryrecursively(dirName); +} + +TargetPhraseCollection::shared_ptr +PhraseDictionaryFuzzyMatch:: +GetOrCreateTargetPhraseCollection(PhraseDictionaryNodeMemory &rootNode + , const Phrase &source + , const TargetPhrase &target + , const Word *sourceLHS) +{ + PhraseDictionaryNodeMemory &currNode = GetOrCreateNode(rootNode, source, target, sourceLHS); + return currNode.GetTargetPhraseCollection(); +} + +PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetOrCreateNode(PhraseDictionaryNodeMemory &rootNode + , const Phrase &source + , const TargetPhrase &target + , const Word *sourceLHS) +{ + cerr << source << endl << target << endl; + const size_t size = source.GetSize(); + + const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm(); + AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin(); + + PhraseDictionaryNodeMemory *currNode = &rootNode; + for (size_t pos = 0 ; pos < size ; ++pos) { + const Word& word = source.GetWord(pos); + + if (word.IsNonTerminal()) { + // indexed by source label 1st + const Word &sourceNonTerm = word; + + UTIL_THROW_IF2(iterAlign == alignmentInfo.end(), + "No alignment for non-term at position " << pos); + UTIL_THROW_IF2(iterAlign->first != pos, + "Alignment info incorrect at position " << pos); + + size_t targetNonTermInd = iterAlign->second; + ++iterAlign; + const Word &targetNonTerm = target.GetWord(targetNonTermInd); + +#if defined(UNLABELLED_SOURCE) + currNode = 
currNode->GetOrCreateNonTerminalChild(targetNonTerm); +#else + currNode = currNode->GetOrCreateChild(sourceNonTerm, targetNonTerm); +#endif + } else { + currNode = currNode->GetOrCreateChild(word); + } + + UTIL_THROW_IF2(currNode == NULL, + "Node not found at position " << pos); + + } + + // finally, the source LHS + //currNode = currNode->GetOrCreateChild(sourceLHS); + + return *currNode; +} + +void PhraseDictionaryFuzzyMatch::SortAndPrune(PhraseDictionaryNodeMemory &rootNode) +{ + if (GetTableLimit()) { + rootNode.Sort(GetTableLimit()); + } +} + +void PhraseDictionaryFuzzyMatch::CleanUpAfterSentenceProcessing(const InputType &source) +{ + m_collection.erase(source.GetTranslationId()); +} + +const PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(long translationId) const +{ + std::map::const_iterator iter = m_collection.find(translationId); + UTIL_THROW_IF2(iter == m_collection.end(), + "Couldn't find root node for input: " << translationId); + return iter->second; +} +PhraseDictionaryNodeMemory &PhraseDictionaryFuzzyMatch::GetRootNode(const InputType &source) +{ + long transId = source.GetTranslationId(); + std::map::iterator iter = m_collection.find(transId); + UTIL_THROW_IF2(iter == m_collection.end(), + "Couldn't find root node for input: " << transId); + return iter->second; +} + +TO_STRING_BODY(PhraseDictionaryFuzzyMatch); + +// friend +ostream& operator<<(ostream& out, const PhraseDictionaryFuzzyMatch& phraseDict) +{ + /* + typedef PhraseDictionaryNodeMemory::TerminalMap TermMap; + typedef PhraseDictionaryNodeMemory::NonTerminalMap NonTermMap; + + const PhraseDictionaryNodeMemory &coll = phraseDict.m_collection; + for (NonTermMap::const_iterator p = coll.m_nonTermMap.begin(); p != coll.m_nonTermMap.end(); ++p) { + const Word &sourceNonTerm = p->first.first; + out << sourceNonTerm; + } + for (TermMap::const_iterator p = coll.m_sourceTermMap.begin(); p != coll.m_sourceTermMap.end(); ++p) { + const Word &sourceTerm = p->first; + out << 
sourceTerm; + } + */ + + return out; +} + +} diff --git a/mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7766c897a898f04bc69662ac73910a7325be83e1 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp @@ -0,0 +1,398 @@ +// vim:tabstop=2 +/*********************************************************************** + Moses - factored phrase-based language decoder + Copyright (C) 2010 Hieu Hoang + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "PhraseDictionaryOnDisk.h" +#include "moses/InputFileStream.h" +#include "moses/StaticData.h" +#include "moses/TargetPhraseCollection.h" +#include "moses/InputPath.h" +#include "moses/TranslationModel/CYKPlusParser/DotChartOnDisk.h" +#include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerOnDisk.h" +#include "moses/TranslationTask.h" + +#include "OnDiskPt/OnDiskWrapper.h" +#include "OnDiskPt/Word.h" + +#include "util/tokenize_piece.hh" + +using namespace std; + + +namespace Moses +{ +PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line) + : MyBase(line, true) + , m_maxSpanDefault(NOT_FOUND) + , m_maxSpanLabelled(NOT_FOUND) +{ + ReadParameters(); +} + +PhraseDictionaryOnDisk::~PhraseDictionaryOnDisk() +{ +} + +void PhraseDictionaryOnDisk::Load(AllOptions::ptr const& opts) +{ + m_options = opts; + SetFeaturesToApply(); +} + +ChartRuleLookupManager *PhraseDictionaryOnDisk::CreateRuleLookupManager( + const ChartParser &parser, + const ChartCellCollectionBase &cellCollection, + std::size_t /*maxChartSpan*/) +{ + return new ChartRuleLookupManagerOnDisk(parser, cellCollection, *this, + GetImplementation(), + m_input, + m_output); +} + +OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation() +{ + OnDiskPt::OnDiskWrapper* dict; + dict = m_implementation.get(); + UTIL_THROW_IF2(dict == NULL, "Dictionary object not yet created for this thread"); + return *dict; +} + +const OnDiskPt::OnDiskWrapper &PhraseDictionaryOnDisk::GetImplementation() const +{ + OnDiskPt::OnDiskWrapper* dict; + dict = m_implementation.get(); + UTIL_THROW_IF2(dict == NULL, "Dictionary object not yet created for this thread"); + return *dict; +} + +void 
PhraseDictionaryOnDisk::InitializeForInput(ttasksptr const& ttask) +{ + InputType const& source = *ttask->GetSource(); + ReduceCache(); + + OnDiskPt::OnDiskWrapper *obj = new OnDiskPt::OnDiskWrapper(); + obj->BeginLoad(m_filePath); + + UTIL_THROW_IF2(obj->GetMisc("Version") != OnDiskPt::OnDiskWrapper::VERSION_NUM, + "On-disk phrase table is version " << obj->GetMisc("Version") + << ". It is not compatible with version " << OnDiskPt::OnDiskWrapper::VERSION_NUM); + + UTIL_THROW_IF2(obj->GetMisc("NumSourceFactors") != m_input.size(), + "On-disk phrase table has " << obj->GetMisc("NumSourceFactors") << " source factors." + << ". The ini file specified " << m_input.size() << " source factors"); + + UTIL_THROW_IF2(obj->GetMisc("NumTargetFactors") != m_output.size(), + "On-disk phrase table has " << obj->GetMisc("NumTargetFactors") << " target factors." + << ". The ini file specified " << m_output.size() << " target factors"); + + UTIL_THROW_IF2(obj->GetMisc("NumScores") != m_numScoreComponents, + "On-disk phrase table has " << obj->GetMisc("NumScores") << " scores." + << ". 
The ini file specified " << m_numScoreComponents << " scores"); + + m_implementation.reset(obj); +} + +void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const +{ + InputPathList::const_iterator iter; + for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) { + InputPath &inputPath = **iter; + GetTargetPhraseCollectionBatch(inputPath); + } + + // delete nodes that's been saved + for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) { + InputPath &inputPath = **iter; + const OnDiskPt::PhraseNode *ptNode = static_cast(inputPath.GetPtNode(*this)); + delete ptNode; + } + +} + +void PhraseDictionaryOnDisk::GetTargetPhraseCollectionBatch(InputPath &inputPath) const +{ + OnDiskPt::OnDiskWrapper &wrapper = const_cast(GetImplementation()); + const Phrase &phrase = inputPath.GetPhrase(); + const InputPath *prevInputPath = inputPath.GetPrevPath(); + + const OnDiskPt::PhraseNode *prevPtNode = NULL; + + if (prevInputPath) { + prevPtNode = static_cast(prevInputPath->GetPtNode(*this)); + } else { + // Starting subphrase. + assert(phrase.GetSize() == 1); + prevPtNode = &wrapper.GetRootSourceNode(); + } + + // backoff + if (!SatisfyBackoff(inputPath)) { + return; + } + + if (prevPtNode) { + Word lastWord = phrase.GetWord(phrase.GetSize() - 1); + lastWord.OnlyTheseFactors(m_inputFactors); + OnDiskPt::Word *lastWordOnDisk = ConvertFromMoses(wrapper, m_input, lastWord); + + TargetPhraseCollection::shared_ptr tpc; + if (lastWordOnDisk == NULL) { + // OOV according to this phrase table. 
Not possible to extend + inputPath.SetTargetPhrases(*this, tpc, NULL); + } else { + OnDiskPt::PhraseNode const* ptNode; + ptNode = prevPtNode->GetChild(*lastWordOnDisk, wrapper); + if (ptNode) tpc = GetTargetPhraseCollection(ptNode); + inputPath.SetTargetPhrases(*this, tpc, ptNode); + + delete lastWordOnDisk; + } + } +} + +TargetPhraseCollection::shared_ptr +PhraseDictionaryOnDisk:: +GetTargetPhraseCollection(const OnDiskPt::PhraseNode *ptNode) const +{ + TargetPhraseCollection::shared_ptr ret; + + CacheColl &cache = GetCache(); + size_t hash = (size_t) ptNode->GetFilePos(); + + CacheColl::iterator iter; + + iter = cache.find(hash); + + if (iter == cache.end()) { + // not in cache, need to look up from phrase table + ret = GetTargetPhraseCollectionNonCache(ptNode); + + std::pair value(ret, clock()); + cache[hash] = value; + } else { + // in cache. just use it + iter->second.second = clock(); + ret = iter->second.first; + } + + return ret; +} + +TargetPhraseCollection::shared_ptr +PhraseDictionaryOnDisk:: +GetTargetPhraseCollectionNonCache(const OnDiskPt::PhraseNode *ptNode) const +{ + OnDiskPt::OnDiskWrapper& wrapper + = const_cast(GetImplementation()); + + vector weightT = StaticData::Instance().GetWeights(this); + OnDiskPt::Vocab &vocab = wrapper.GetVocab(); + + OnDiskPt::TargetPhraseCollection::shared_ptr targetPhrasesOnDisk + = ptNode->GetTargetPhraseCollection(m_tableLimit, wrapper); + TargetPhraseCollection::shared_ptr targetPhrases + = ConvertToMoses(targetPhrasesOnDisk, m_input, m_output, *this, + weightT, vocab, false); + + // delete targetPhrasesOnDisk; + + return targetPhrases; +} + +Moses::TargetPhraseCollection::shared_ptr +PhraseDictionaryOnDisk::ConvertToMoses( + const OnDiskPt::TargetPhraseCollection::shared_ptr targetPhrasesOnDisk + , const std::vector &inputFactors + , const std::vector &outputFactors + , const Moses::PhraseDictionary &phraseDict + , const std::vector &weightT + , OnDiskPt::Vocab &vocab + , bool isSyntax) const +{ + 
Moses::TargetPhraseCollection::shared_ptr ret; + ret.reset(new Moses::TargetPhraseCollection); + + for (size_t i = 0; i < targetPhrasesOnDisk->GetSize(); ++i) { + const OnDiskPt::TargetPhrase &tp = targetPhrasesOnDisk->GetTargetPhrase(i); + Moses::TargetPhrase *mosesPhrase + = ConvertToMoses(tp, inputFactors, outputFactors, vocab, + phraseDict, weightT, isSyntax); + + /* + // debugging output + stringstream strme; + strme << filePath << " " << *mosesPhrase; + mosesPhrase->SetDebugOutput(strme.str()); + */ + + ret->Add(mosesPhrase); + } + + ret->Sort(true, phraseDict.GetTableLimit()); + + return ret; +} + +Moses::TargetPhrase *PhraseDictionaryOnDisk::ConvertToMoses(const OnDiskPt::TargetPhrase &targetPhraseOnDisk + , const std::vector &inputFactors + , const std::vector &outputFactors + , const OnDiskPt::Vocab &vocab + , const Moses::PhraseDictionary &phraseDict + , const std::vector &weightT + , bool isSyntax) const +{ + Moses::TargetPhrase *ret = new Moses::TargetPhrase(&phraseDict); + + // words + size_t phraseSize = targetPhraseOnDisk.GetSize(); + UTIL_THROW_IF2(phraseSize == 0, "Target phrase cannot be empty"); // last word is lhs + if (isSyntax) { + --phraseSize; + } + + for (size_t pos = 0; pos < phraseSize; ++pos) { + const OnDiskPt::Word &wordOnDisk = targetPhraseOnDisk.GetWord(pos); + ConvertToMoses(wordOnDisk, outputFactors, vocab, ret->AddWord()); + } + + // alignments + // int index = 0; + Moses::AlignmentInfo::CollType alignTerm, alignNonTerm; + std::set > alignmentInfo; + const OnDiskPt::PhrasePtr sp = targetPhraseOnDisk.GetSourcePhrase(); + for (size_t ind = 0; ind < targetPhraseOnDisk.GetAlign().size(); ++ind) { + const std::pair &entry = targetPhraseOnDisk.GetAlign()[ind]; + alignmentInfo.insert(entry); + size_t sourcePos = entry.first; + size_t targetPos = entry.second; + + if (targetPhraseOnDisk.GetWord(targetPos).IsNonTerminal()) { + alignNonTerm.insert(std::pair(sourcePos, targetPos)); + } else { + alignTerm.insert(std::pair(sourcePos, 
targetPos)); + } + + } + ret->SetAlignTerm(alignTerm); + ret->SetAlignNonTerm(alignNonTerm); + + if (isSyntax) { + Moses::Word *lhsTarget = new Moses::Word(true); + const OnDiskPt::Word &lhsOnDisk = targetPhraseOnDisk.GetWord(targetPhraseOnDisk.GetSize() - 1); + ConvertToMoses(lhsOnDisk, outputFactors, vocab, *lhsTarget); + ret->SetTargetLHS(lhsTarget); + } + + // set source phrase + Moses::Phrase mosesSP(Moses::Input); + for (size_t pos = 0; pos < sp->GetSize(); ++pos) { + ConvertToMoses(sp->GetWord(pos), inputFactors, vocab, mosesSP.AddWord()); + } + + // scores + ret->GetScoreBreakdown().Assign(&phraseDict, targetPhraseOnDisk.GetScores()); + + // sparse features + ret->GetScoreBreakdown().Assign(&phraseDict, targetPhraseOnDisk.GetSparseFeatures()); + + // property + ret->SetProperties(targetPhraseOnDisk.GetProperty()); + + ret->EvaluateInIsolation(mosesSP, phraseDict.GetFeaturesToApply()); + + return ret; +} + +void PhraseDictionaryOnDisk::ConvertToMoses( + const OnDiskPt::Word &wordOnDisk, + const std::vector &outputFactorsVec, + const OnDiskPt::Vocab &vocab, + Moses::Word &overwrite) const +{ + Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance(); + overwrite = Moses::Word(wordOnDisk.IsNonTerminal()); + + if (wordOnDisk.IsNonTerminal()) { + const std::string &tok = vocab.GetString(wordOnDisk.GetVocabId()); + overwrite.SetFactor(0, factorColl.AddFactor(tok, wordOnDisk.IsNonTerminal())); + } else { + // TODO: this conversion should have been done at load time. 
+ util::TokenIter tok(vocab.GetString(wordOnDisk.GetVocabId()), '|'); + + for (std::vector::const_iterator t = outputFactorsVec.begin(); t != outputFactorsVec.end(); ++t, ++tok) { + UTIL_THROW_IF2(!tok, "Too few factors in \"" << vocab.GetString(wordOnDisk.GetVocabId()) << "\"; was expecting " << outputFactorsVec.size()); + overwrite.SetFactor(*t, factorColl.AddFactor(*tok, wordOnDisk.IsNonTerminal())); + } + UTIL_THROW_IF2(tok, "Too many factors in \"" << vocab.GetString(wordOnDisk.GetVocabId()) << "\"; was expecting " << outputFactorsVec.size()); + } +} + +OnDiskPt::Word *PhraseDictionaryOnDisk::ConvertFromMoses(OnDiskPt::OnDiskWrapper &wrapper, const std::vector &factorsVec + , const Moses::Word &origWord) const +{ + bool isNonTerminal = origWord.IsNonTerminal(); + OnDiskPt::Word *newWord = new OnDiskPt::Word(isNonTerminal); + + util::StringStream strme; + + size_t factorType = factorsVec[0]; + const Moses::Factor *factor = origWord.GetFactor(factorType); + UTIL_THROW_IF2(factor == NULL, "Expecting factor " << factorType); + strme << factor->GetString(); + + for (size_t ind = 1 ; ind < factorsVec.size() ; ++ind) { + size_t factorType = factorsVec[ind]; + const Moses::Factor *factor = origWord.GetFactor(factorType); + if (factor == NULL) { + // can have less factors than factorType.size() + break; + } + UTIL_THROW_IF2(factor == NULL, + "Expecting factor " << factorType << " at position " << ind); + strme << "|" << factor->GetString(); + } // for (size_t factorType + + bool found; + uint64_t vocabId = wrapper.GetVocab().GetVocabId(strme.str(), found); + if (!found) { + // factor not in phrase table -> phrse definately not in. 
exit + delete newWord; + return NULL; + } else { + newWord->SetVocabId(vocabId); + return newWord; + } + +} + +void PhraseDictionaryOnDisk::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "max-span-default") { + m_maxSpanDefault = Scan(value); + } else if (key == "max-span-labelled") { + m_maxSpanLabelled = Scan(value); + } else { + PhraseDictionary::SetParameter(key, value); + } +} + + +} // namespace + diff --git a/mosesdecoder/moses/TranslationModel/RuleTable/Trie.h b/mosesdecoder/moses/TranslationModel/RuleTable/Trie.h new file mode 100644 index 0000000000000000000000000000000000000000..7a9e12e8dd88c37af2a1793c960ba1b4717de5d2 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/RuleTable/Trie.h @@ -0,0 +1,63 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include "moses/TranslationModel/PhraseDictionary.h" +#include "moses/TypeDef.h" + +#include +#include + +namespace Moses +{ + +class Phrase; +class TargetPhrase; +class TargetPhraseCollection; +class Word; + +/*** Implementation of a SCFG rule table in a trie. Looking up a rule of + * length n symbols requires n look-ups to find the TargetPhraseCollection. + * @todo why need this and PhraseDictionaryMemory? + */ +class RuleTableTrie : public PhraseDictionary +{ +public: + RuleTableTrie(const std::string &line) + : PhraseDictionary(line, true) { + } + + virtual ~RuleTableTrie(); + + void Load(AllOptions::ptr const& opts); + +private: + friend class RuleTableLoader; + + virtual TargetPhraseCollection::shared_ptr + GetOrCreateTargetPhraseCollection(const Phrase &source, + const TargetPhrase &target, + const Word *sourceLHS) = 0; + + virtual void SortAndPrune() = 0; + +}; + +} // namespace Moses diff --git a/mosesdecoder/moses/TranslationModel/RuleTable/UTrie.cpp b/mosesdecoder/moses/TranslationModel/RuleTable/UTrie.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eee88a11b2036a46b52f422e6ae9244accae8fe7 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/RuleTable/UTrie.cpp @@ -0,0 +1,96 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + ***********************************************************************/ + +#include "moses/NonTerminal.h" +#include "moses/TranslationModel/Scope3Parser/Parser.h" +#include "moses/StaticData.h" +#include "moses/TargetPhrase.h" +#include "moses/TargetPhraseCollection.h" +#include "moses/Util.h" +#include "moses/Word.h" +#include "UTrie.h" +#include "Trie.h" +#include "UTrieNode.h" + +#include +#include +#include + +#include +#include + +namespace Moses +{ + +TargetPhraseCollection::shared_ptr +RuleTableUTrie:: +GetOrCreateTargetPhraseCollection(const Phrase &source, + const TargetPhrase &target, + const Word *sourceLHS) +{ + UTrieNode &currNode = GetOrCreateNode(source, target, sourceLHS); + return currNode.GetOrCreateTargetPhraseCollection(target); +} + +UTrieNode &RuleTableUTrie::GetOrCreateNode(const Phrase &source, + const TargetPhrase &target, + const Word */*sourceLHS*/) +{ + const size_t size = source.GetSize(); + + const AlignmentInfo &alignmentInfo = target.GetAlignNonTerm(); + AlignmentInfo::const_iterator iterAlign = alignmentInfo.begin(); + + UTrieNode *currNode = &m_root; + for (size_t pos = 0 ; pos < size ; ++pos) { + const Word &word = source.GetWord(pos); + + if (word.IsNonTerminal()) { + assert(iterAlign != alignmentInfo.end()); + assert(iterAlign->first == pos); + size_t targetNonTermInd = iterAlign->second; + ++iterAlign; + const Word &targetNonTerm = target.GetWord(targetNonTermInd); + currNode = currNode->GetOrCreateNonTerminalChild(targetNonTerm); + } else { + currNode = 
currNode->GetOrCreateTerminalChild(word); + } + + assert(currNode != NULL); + } + + return *currNode; +} + +ChartRuleLookupManager *RuleTableUTrie::CreateRuleLookupManager( + const ChartParser &parser, + const ChartCellCollectionBase &cellCollection, + std::size_t maxChartSpan) +{ + return new Scope3Parser(parser, cellCollection, *this, maxChartSpan); +} + +void RuleTableUTrie::SortAndPrune() +{ + if (GetTableLimit()) { + m_root.Sort(GetTableLimit()); + } +} + +} // namespace Moses diff --git a/mosesdecoder/moses/TranslationModel/RuleTable/UTrie.h b/mosesdecoder/moses/TranslationModel/RuleTable/UTrie.h new file mode 100644 index 0000000000000000000000000000000000000000..708bf866e2b9efc4dbbd9ad87f5d9fc25ebc8d41 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/RuleTable/UTrie.h @@ -0,0 +1,73 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include "Trie.h" +#include "UTrieNode.h" +#include "moses/TargetPhraseCollection.h" + +namespace Moses +{ + +class Phrase; +class TargetPhrase; +class Word; +class ChartParser; + +/** Implementation of RuleTableTrie. A RuleTableUTrie is designed to store + * string-to-tree SCFG grammars only (i.e. rules can have distinct labels on + * the target side, but only a generic non-terminal on the source side). + * A key is the source RHS (one symbol per edge) of a rule and a mapped value + * is the collection of grammar rules that share the same source RHS. + * + * (The 'U' in UTrie stands for 'unlabelled' -- the keys are unlabelled and + * the target labels are stored on the node values, as opposed to the grammar + * being a monolingual projection with target labels projected onto the source + * side.) 
+ */ +class RuleTableUTrie : public RuleTableTrie +{ +public: + RuleTableUTrie(const std::string &line) + : RuleTableTrie(line) { + } + + const UTrieNode &GetRootNode() const { + return m_root; + } + + ChartRuleLookupManager *CreateRuleLookupManager(const ChartParser &, + const ChartCellCollectionBase &, std::size_t); + +private: + TargetPhraseCollection::shared_ptr + GetOrCreateTargetPhraseCollection(const Phrase &source, + const TargetPhrase &target, + const Word *sourceLHS); + + UTrieNode &GetOrCreateNode(const Phrase &source, const TargetPhrase &target, + const Word *sourceLHS); + + void SortAndPrune(); + + UTrieNode m_root; +}; + +} // namespace Moses diff --git a/mosesdecoder/moses/TranslationModel/fuzzy-match/Alignments.h b/mosesdecoder/moses/TranslationModel/fuzzy-match/Alignments.h new file mode 100644 index 0000000000000000000000000000000000000000..f54372d27b020f6e36db0ebe6921d30f827fc575 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/fuzzy-match/Alignments.h @@ -0,0 +1,20 @@ +#pragma once + +#include +#include +#include + +class Alignments +{ +public: + std::vector< std::map > m_alignS2T, m_alignT2S; + + Alignments(const std::string &align, size_t sourceSize, size_t targetSize); + + +protected: + +}; + + + diff --git a/mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp new file mode 100644 index 0000000000000000000000000000000000000000..89287ca9127418fdcabd665abaee9b61977a76f5 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp @@ -0,0 +1,1029 @@ +// +// FuzzyMatchWrapper.cpp +// moses +// +// Created by Hieu Hoang on 26/07/2012. +// Copyright 2012 __MyCompanyName__. All rights reserved. 
+// + +#include +#include "FuzzyMatchWrapper.h" +#include "SentenceAlignment.h" +#include "Match.h" +#include "create_xml.h" +#include "moses/Util.h" +#include "moses/StaticData.h" +#include "util/file.hh" + +using namespace std; + +namespace tmmt +{ + +FuzzyMatchWrapper::FuzzyMatchWrapper(const std::string &sourcePath, const std::string &targetPath, const std::string &alignmentPath) + :basic_flag(false) + ,lsed_flag(true) + ,refined_flag(true) + ,length_filter_flag(true) + ,parse_flag(true) + ,min_match(70) + ,multiple_flag(true) + ,multiple_slack(0) + ,multiple_max(100) +{ + cerr << "creating suffix array" << endl; + suffixArray = new tmmt::SuffixArray( sourcePath ); + + //cerr << "loading source data" << endl; + //load_corpus(sourcePath, source); + + cerr << "loading target data" << endl; + load_target(targetPath, targetAndAlignment); + + cerr << "loading alignment" << endl; + load_alignment(alignmentPath, targetAndAlignment); + + // create suffix array + //load_corpus(m_config[0], input); + + cerr << "loading completed" << endl; +} + +string FuzzyMatchWrapper::Extract(long translationId, const string &dirNameStr) +{ + const Moses::StaticData &staticData = Moses::StaticData::Instance(); + + WordIndex wordIndex; + + string fuzzyMatchFile = ExtractTM(wordIndex, translationId, dirNameStr); + + // create extrac files + create_xml(fuzzyMatchFile); + + // create phrase table with usual Moses scoring and consolidate programs + string cmd; + cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract | gzip -c > " + + fuzzyMatchFile + ".extract.sorted.gz"; + system(cmd.c_str()); + cmd = "LC_ALL=C sort " + fuzzyMatchFile + ".extract.inv | gzip -c > " + + fuzzyMatchFile + ".extract.inv.sorted.gz"; + system(cmd.c_str()); + +#ifdef IS_XCODE + cmd = "/Users/hieuhoang/unison/workspace/github/moses-smt/bin"; +#elif IS_ECLIPSE + cmd = "/home/hieu/workspace/github/moses-smt/bin"; +#else + cmd = staticData.GetBinDirectory(); +#endif + + cmd += 
string("/../scripts/training/train-model.perl -dont-zip -first-step 6 -last-step 6 -f en -e fr -hierarchical ") + + " -extract-file " + fuzzyMatchFile + ".extract -lexical-file - -score-options \"--NoLex\" " + + " -phrase-translation-table " + fuzzyMatchFile + ".pt"; + system(cmd.c_str()); + + + return fuzzyMatchFile + ".pt.gz"; +} + +string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, const string &dirNameStr) +{ + const std::vector< std::vector< WORD_ID > > &source = suffixArray->GetCorpus(); + + string inputPath = dirNameStr + "/in"; + string fuzzyMatchFile = dirNameStr + "/fuzzyMatchFile"; + ofstream fuzzyMatchStream(fuzzyMatchFile.c_str()); + + vector< vector< WORD_ID > > input; + load_corpus(inputPath, input); + + assert(input.size() == 1); + size_t sentenceInd = 0; + + clock_t start_clock = clock(); + // if (i % 10 == 0) cerr << "."; + + // establish some basic statistics + + // int input_length = compute_length( input[i] ); + int input_length = input[sentenceInd].size(); + int best_cost = input_length * (100-min_match) / 100 + 1; + + int match_count = 0; // how many substring matches to be considered + //cerr << endl << "sentence " << i << ", length " << input_length << ", best_cost " << best_cost << endl; + + // find match ranges in suffix array + vector< vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > > match_range; + for(int start=0; startGetSize()-1; + vector< string > substring; + bool stillMatched = true; + vector< pair< SuffixArray::INDEX, SuffixArray::INDEX > > matchedAtThisStart; + //cerr << "start: " << start; + for(size_t word=start; stillMatched && wordFindMatches( substring, first_match, last_match, prior_first_match, prior_last_match ) ) { + stillMatched = true; + matchedAtThisStart.push_back( make_pair( first_match, last_match ) ); + //cerr << " (" << first_match << "," << last_match << ")"; + //cerr << " " << ( last_match - first_match + 1 ); + prior_first_match = first_match; + prior_last_match = 
last_match; + } + //} + } + //cerr << endl; + match_range.push_back( matchedAtThisStart ); + } + + clock_t clock_range = clock(); + + map< int, vector< Match > > sentence_match; + map< int, int > sentence_match_word_count; + + // go through all matches, longest first + for(int length = input[sentenceInd].size(); length >= 1; length--) { + // do not create matches, if these are handled by the short match function + if (length <= short_match_max_length( input_length ) ) { + continue; + } + + unsigned int count = 0; + for(int start = 0; start <= input[sentenceInd].size() - length; start++) { + if (match_range[start].size() >= length) { + pair< SuffixArray::INDEX, SuffixArray::INDEX > &range = match_range[start][length-1]; + // cerr << " (" << range.first << "," << range.second << ")"; + count += range.second - range.first + 1; + + for(SuffixArray::INDEX i=range.first; i<=range.second; i++) { + size_t position = suffixArray->GetPosition( i ); + + // sentence length mismatch + size_t sentence_id = suffixArray->GetSentence( position ); + int sentence_length = suffixArray->GetSentenceLength( sentence_id ); + int diff = abs( (int)sentence_length - (int)input_length ); + // cerr << endl << i << "\tsentence " << sentence_id << ", length " << sentence_length; + //if (length <= 2 && input_length>=5 && + // sentence_match.find( sentence_id ) == sentence_match.end()) + // continue; + + if (diff > best_cost) + continue; + + // compute minimal cost + int start_pos = suffixArray->GetWordInSentence( position ); + int end_pos = start_pos + length-1; + // cerr << endl << "\t" << start_pos << "-" << end_pos << " (" << sentence_length << ") vs. " + // << start << "-" << (start+length-1) << " (" << input_length << ")"; + // different number of prior words -> cost is at least diff + int min_cost = abs( start - start_pos ); + + // same number of words, but not sent. 
start -> cost is at least 1 + if (start == start_pos && start>0) + min_cost++; + + // different number of remaining words -> cost is at least diff + min_cost += abs( ( sentence_length-1 - end_pos ) - + ( input_length-1 - (start+length-1) ) ); + + // same number of words, but not sent. end -> cost is at least 1 + if ( sentence_length-1 - end_pos == + input_length-1 - (start+length-1) + && end_pos != sentence_length-1 ) + min_cost++; + + // cerr << " -> min_cost " << min_cost; + if (min_cost > best_cost) + continue; + + // valid match + match_count++; + + // compute maximal cost + int max_cost = max( start, start_pos ) + + max( sentence_length-1 - end_pos, + input_length-1 - (start+length-1) ); + // cerr << ", max_cost " << max_cost; + + Match m = Match( start, start+length-1, + start_pos, start_pos+length-1, + min_cost, max_cost, 0); + sentence_match[ sentence_id ].push_back( m ); + sentence_match_word_count[ sentence_id ] += length; + + if (max_cost < best_cost) { + best_cost = max_cost; + if (best_cost == 0) break; + } + //if (match_count >= MAX_MATCH_COUNT) break; + } + } + // cerr << endl; + if (best_cost == 0) break; + //if (match_count >= MAX_MATCH_COUNT) break; + } + // cerr << count << " matches at length " << length << " in " << sentence_match.size() << " tm." << endl; + + if (best_cost == 0) break; + //if (match_count >= MAX_MATCH_COUNT) break; + } + cerr << match_count << " matches in " << sentence_match.size() << " sentences." 
<< endl; + + clock_t clock_matches = clock(); + + // consider each sentence for which we have matches + int old_best_cost = best_cost; + int tm_count_word_match = 0; + int tm_count_word_match2 = 0; + int pruned_match_count = 0; + if (short_match_max_length( input_length )) { + init_short_matches(wordIndex, translationId, input[sentenceInd] ); + } + vector< int > best_tm; + typedef map< int, vector< Match > >::iterator I; + + clock_t clock_validation_sum = 0; + + for(I tm=sentence_match.begin(); tm!=sentence_match.end(); tm++) { + int tmID = tm->first; + int tm_length = suffixArray->GetSentenceLength(tmID); + vector< Match > &match = tm->second; + add_short_matches(wordIndex, translationId, match, source[tmID], input_length, best_cost ); + + //cerr << "match in sentence " << tmID << ": " << match.size() << " [" << tm_length << "]" << endl; + + // quick look: how many words are matched + int words_matched = 0; + for(size_t m=0; m best_cost) { + if (length_filter_flag) continue; + } + tm_count_word_match++; + + // prune, check again how many words are matched + vector< Match > pruned = prune_matches( match, best_cost ); + words_matched = 0; + for(size_t p=0; p best_cost) { + if (length_filter_flag) continue; + } + tm_count_word_match2++; + + pruned_match_count += pruned.size(); + int prior_best_cost = best_cost; + int cost; + + clock_t clock_validation_start = clock(); + if (! 
parse_flag || + pruned.size()>=10) { // to prevent worst cases + string path; + cost = sed( input[sentenceInd], source[tmID], path, false ); + if (cost < best_cost) { + best_cost = cost; + } + } + + else { + cost = parse_matches( pruned, input_length, tm_length, best_cost ); + if (prior_best_cost != best_cost) { + best_tm.clear(); + } + } + clock_validation_sum += clock() - clock_validation_start; + if (cost == best_cost) { + best_tm.push_back( tmID ); + } + } + cerr << "reduced best cost from " << old_best_cost << " to " << best_cost << endl; + cerr << "tm considered: " << sentence_match.size() + << " word-matched: " << tm_count_word_match + << " word-matched2: " << tm_count_word_match2 + << " best: " << best_tm.size() << endl; + + cerr << "pruned matches: " << ((float)pruned_match_count/(float)tm_count_word_match2) << endl; + + // create xml and extract files + string inputStr, sourceStr; + for (size_t pos = 0; pos < input_length; ++pos) { + inputStr += GetVocabulary().GetWord(input[sentenceInd][pos]) + " "; + } + + // do not try to find the best ... 
report multiple matches + if (multiple_flag) { + for(size_t si=0; si &sourceSentence = source[s]; + vector &targets = targetAndAlignment[s]; + create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, path, fuzzyMatchStream); + + } + } // if (multiple_flag) + else { + + // find the best matches according to letter sed + string best_path = ""; + int best_match = -1; + unsigned int best_letter_cost; + if (lsed_flag) { + best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1; + for(size_t si=0; si 0) { + string path; + sed( input[sentenceInd], source[best_tm[0]], path, false ); + best_path = path; + best_match = best_tm[0]; + } + } + cerr << "elapsed: " << (1000 * (clock()-start_clock) / CLOCKS_PER_SEC) + << " ( range: " << (1000 * (clock_range-start_clock) / CLOCKS_PER_SEC) + << " match: " << (1000 * (clock_matches-clock_range) / CLOCKS_PER_SEC) + << " tm: " << (1000 * (clock()-clock_matches) / CLOCKS_PER_SEC) + << " (validation: " << (1000 * (clock_validation_sum) / CLOCKS_PER_SEC) << ")" + << " )" << endl; + if (lsed_flag) { + //cout << best_letter_cost << "/" << compute_length( input[sentenceInd] ) << " ("; + } + //cout << best_cost <<"/" << input_length; + if (lsed_flag) { + //cout << ")"; + } + //cout << " ||| " << best_match << " ||| " << best_path << endl; + + if (best_match == -1) { + UTIL_THROW_IF2(source.size() == 0, "Empty source phrase"); + best_match = 0; + } + + // creat xml & extracts + const vector &sourceSentence = source[best_match]; + vector &targets = targetAndAlignment[best_match]; + create_extract(sentenceInd, best_cost, sourceSentence, targets, inputStr, best_path, fuzzyMatchStream); + + } // else if (multiple_flag) + + fuzzyMatchStream.close(); + + return fuzzyMatchFile; +} + +void FuzzyMatchWrapper::load_corpus( const std::string &fileName, vector< vector< WORD_ID > > &corpus ) +{ + // source + ifstream fileStream; + fileStream.open(fileName.c_str()); + if (!fileStream) { + cerr << "file not found: " << 
fileName << endl; + exit(1); + } + cerr << "loading " << fileName << endl; + + istream *fileStreamP = &fileStream; + + string line; + while(getline(*fileStreamP, line)) { + corpus.push_back( GetVocabulary().Tokenize( line.c_str() ) ); + } +} + +void FuzzyMatchWrapper::load_target(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus) +{ + ifstream fileStream; + fileStream.open(fileName.c_str()); + if (!fileStream) { + cerr << "file not found: " << fileName << endl; + exit(1); + } + cerr << "loading " << fileName << endl; + + istream *fileStreamP = &fileStream; + + WORD_ID delimiter = GetVocabulary().StoreIfNew("|||"); + + int lineNum = 0; + string line; + while(getline(*fileStreamP, line)) { + vector toks = GetVocabulary().Tokenize( line.c_str() ); + + corpus.push_back(vector< SentenceAlignment >()); + vector< SentenceAlignment > &vec = corpus.back(); + + vec.push_back(SentenceAlignment()); + SentenceAlignment *sentence = &vec.back(); + + const WORD &countStr = GetVocabulary().GetWord(toks[0]); + sentence->count = atoi(countStr.c_str()); + + for (size_t i = 1; i < toks.size(); ++i) { + WORD_ID wordId = toks[i]; + + if (wordId == delimiter) { + // target and alignments can have multiple sentences. 
+ vec.push_back(SentenceAlignment()); + sentence = &vec.back(); + + // count + ++i; + + const WORD &countStr = GetVocabulary().GetWord(toks[i]); + sentence->count = atoi(countStr.c_str()); + } else { + // just a normal word, add + sentence->target.push_back(wordId); + } + } + + ++lineNum; + + } + +} + + +void FuzzyMatchWrapper::load_alignment(const std::string &fileName, vector< vector< SentenceAlignment > > &corpus ) +{ + ifstream fileStream; + fileStream.open(fileName.c_str()); + if (!fileStream) { + cerr << "file not found: " << fileName << endl; + exit(1); + } + cerr << "loading " << fileName << endl; + + istream *fileStreamP = &fileStream; + + string delimiter = "|||"; + + int lineNum = 0; + string line; + while(getline(*fileStreamP, line)) { + vector< SentenceAlignment > &vec = corpus[lineNum]; + size_t targetInd = 0; + SentenceAlignment *sentence = &vec[targetInd]; + + vector toks = Moses::Tokenize(line); + + for (size_t i = 0; i < toks.size(); ++i) { + string &tok = toks[i]; + + if (tok == delimiter) { + // target and alignments can have multiple sentences. + ++targetInd; + sentence = &vec[targetInd]; + + ++i; + } else { + // just a normal alignment, add + vector alignPoint = Moses::Tokenize(tok, "-"); + assert(alignPoint.size() == 2); + sentence->alignment.push_back(pair(alignPoint[0], alignPoint[1])); + } + } + + ++lineNum; + + } +} + +bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const +{ +#ifdef WITH_THREADS + boost::shared_lock read_lock(m_accessLock); +#endif + map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key ); + if (lookup != m_lsed.end()) { + value = lookup->second; + return true; + } + + return false; +} + +void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value) +{ +#ifdef WITH_THREADS + boost::unique_lock lock(m_accessLock); +#endif + m_lsed[ key ] = value; +} + +/* Letter string edit distance, e.g. 
sub 'their' to 'there' costs 2 */ + +unsigned int FuzzyMatchWrapper::letter_sed( WORD_ID aIdx, WORD_ID bIdx ) +{ + // check if already computed -> lookup in cache + pair< WORD_ID, WORD_ID > pIdx = make_pair( aIdx, bIdx ); + unsigned int value; + bool ret = GetLSEDCache(pIdx, value); + if (ret) { + return value; + } + + // get surface strings for word indices + const string &a = GetVocabulary().GetWord( aIdx ); + const string &b = GetVocabulary().GetWord( bIdx ); + + // initialize cost matrix + unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 ); + for( unsigned int i=0; i<=a.size(); i++ ) { + cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 ); + cost[i][0] = i; + } + for( unsigned int j=0; j<=b.size(); j++ ) { + cost[0][j] = j; + } + + // core string edit distance loop + for( unsigned int i=1; i<=a.size(); i++ ) { + for( unsigned int j=1; j<=b.size(); j++ ) { + + unsigned int ins = cost[i-1][j] + 1; + unsigned int del = cost[i][j-1] + 1; + bool match = (a.substr(i-1,1).compare( b.substr(j-1,1) ) == 0); + unsigned int diag = cost[i-1][j-1] + (match ? 0 : 1); + + unsigned int min = (ins < del) ? ins : del; + min = (diag < min) ? 
diag : min; + + cost[i][j] = min; + } + } + + // clear out memory + unsigned int final = cost[a.size()][b.size()]; + for( unsigned int i=0; i<=a.size(); i++ ) { + free( cost[i] ); + } + free( cost ); + + // cache and return result + SetLSEDCache(pIdx, final); + return final; +} + +/* string edit distance implementation */ + +unsigned int FuzzyMatchWrapper::sed( const vector< WORD_ID > &a, const vector< WORD_ID > &b, string &best_path, bool use_letter_sed ) +{ + + // initialize cost and path matrices + unsigned int **cost = (unsigned int**) calloc( sizeof( unsigned int* ), a.size()+1 ); + char **path = (char**) calloc( sizeof( char* ), a.size()+1 ); + + for( unsigned int i=0; i<=a.size(); i++ ) { + cost[i] = (unsigned int*) calloc( sizeof(unsigned int), b.size()+1 ); + path[i] = (char*) calloc( sizeof(char), b.size()+1 ); + if (i>0) { + cost[i][0] = cost[i-1][0]; + if (use_letter_sed) { + cost[i][0] += GetVocabulary().GetWord( a[i-1] ).size(); + } else { + cost[i][0]++; + } + } else { + cost[i][0] = 0; + } + path[i][0] = 'I'; + } + + for( unsigned int j=0; j<=b.size(); j++ ) { + if (j>0) { + cost[0][j] = cost[0][j-1]; + if (use_letter_sed) { + cost[0][j] += GetVocabulary().GetWord( b[j-1] ).size(); + } else { + cost[0][j]++; + } + } else { + cost[0][j] = 0; + } + path[0][j] = 'D'; + } + + // core string edit distance algorithm + for( unsigned int i=1; i<=a.size(); i++ ) { + for( unsigned int j=1; j<=b.size(); j++ ) { + unsigned int ins = cost[i-1][j]; + unsigned int del = cost[i][j-1]; + unsigned int match; + if (use_letter_sed) { + ins += GetVocabulary().GetWord( a[i-1] ).size(); + del += GetVocabulary().GetWord( b[j-1] ).size(); + match = letter_sed( a[i-1], b[j-1] ); + } else { + ins++; + del++; + match = ( a[i-1] == b[j-1] ) ? 0 : 1; + } + unsigned int diag = cost[i-1][j-1] + match; + + char action = (ins < del) ? 'I' : 'D'; + unsigned int min = (ins < del) ? ins : del; + if (diag < min) { + action = (match>0) ? 
'S' : 'M'; + min = diag; + } + + cost[i][j] = min; + path[i][j] = action; + } + } + + // construct string for best path + unsigned int i = a.size(); + unsigned int j = b.size(); + best_path = ""; + while( i>0 || j>0 ) { + best_path = path[i][j] + best_path; + if (path[i][j] == 'I') { + i--; + } else if (path[i][j] == 'D') { + j--; + } else { + i--; + j--; + } + } + + + // clear out memory + unsigned int final = cost[a.size()][b.size()]; + + for( unsigned int i=0; i<=a.size(); i++ ) { + free( cost[i] ); + free( path[i] ); + } + free( cost ); + free( path ); + + // return result + return final; +} + +/* utlility function: compute length of sentence in characters + (spaces do not count) */ + +unsigned int FuzzyMatchWrapper::compute_length( const vector< WORD_ID > &sentence ) +{ + unsigned int length = 0; + for( unsigned int i=0; i > source, + vector< vector< WORD_ID > > input ) +{ + // go through input set... + for(unsigned int i=0; i= best_cost)) { + continue; + } + + // compute string edit distance + string path; + unsigned int cost = sed( input[i], source[s], path, use_letter_sed ); + + // update if new best + if (cost < best_cost) { + best_cost = cost; + best_path = path; + //best_match = s; + } + } + //cout << best_cost << " ||| " << best_match << " ||| " << best_path << endl; + } +} + +/* definition of short matches + very short n-gram matches (1-grams) will not be looked up in + the suffix array, since there are too many matches + and for longer sentences, at least one 2-gram match must occur */ + +int FuzzyMatchWrapper::short_match_max_length( int input_length ) +{ + if ( ! refined_flag ) + return 0; + if ( input_length >= 5 ) + return 1; + return 0; +} + + +/* if we have non-short matches in a sentence, we need to + take a closer look at it. 
+ this function creates a hash map for all input words and their positions + (to be used by the next function) + (done here, because this has be done only once for an input sentence) */ + +void FuzzyMatchWrapper::init_short_matches(WordIndex &wordIndex, long translationId, const vector< WORD_ID > &input ) +{ + int max_length = short_match_max_length( input.size() ); + if (max_length == 0) + return; + + wordIndex.clear(); + + // store input words and their positions in hash map + for(size_t i=0; i position_vector; + wordIndex[ input[i] ] = position_vector; + } + wordIndex[ input[i] ].push_back( i ); + } +} + +/* add all short matches to list of matches for a sentence */ + +void FuzzyMatchWrapper::add_short_matches(WordIndex &wordIndex, long translationId, vector< Match > &match, const vector< WORD_ID > &tm, int input_length, int best_cost ) +{ + int max_length = short_match_max_length( input_length ); + if (max_length == 0) + return; + + int tm_length = tm.size(); + map< WORD_ID,vector< int > >::iterator input_word_hit; + for(int t_pos=0; t_pos &position_vector = input_word_hit->second; + for(size_t j=0; j0 && i_pos == t_pos ) + min_cost++; + + // after match + max_cost += max( (input_length-i_pos) , (tm_length-t_pos)); + min_cost += abs( (input_length-i_pos) - (tm_length-t_pos)); + if ( i_pos != input_length-1 && (input_length-i_pos) == (tm_length-t_pos)) + min_cost++; + + if (min_cost <= best_cost) { + Match new_match( i_pos,i_pos, t_pos,t_pos, min_cost,max_cost,0 ); + match.push_back( new_match ); + } + } + } + } +} + +/* remove matches that are subsumed by a larger match */ + +vector< Match > FuzzyMatchWrapper::prune_matches( const vector< Match > &match, int best_cost ) +{ + //cerr << "\tpruning"; + vector< Match > pruned; + for(int i=match.size()-1; i>=0; i--) { + //cerr << " (" << match[i].input_start << "," << match[i].input_end + // << " ; " << match[i].tm_start << "," << match[i].tm_end + // << " * " << match[i].min_cost << ")"; + + //if (match[i].min_cost 
> best_cost) + // continue; + + bool subsumed = false; + for(int j=match.size()-1; j>=0; j--) { + if (i!=j // do not compare match with itself + && ( match[i].input_end - match[i].input_start <= + match[j].input_end - match[j].input_start ) // i shorter than j + && ((match[i].input_start == match[j].input_start && + match[i].tm_start == match[j].tm_start ) || + (match[i].input_end == match[j].input_end && + match[i].tm_end == match[j].tm_end) ) ) { + subsumed = true; + } + } + if (! subsumed && match[i].min_cost <= best_cost) { + //cerr << "*"; + pruned.push_back( match[i] ); + } + } + //cerr << endl; + return pruned; +} + +/* A* parsing method to compute string edit distance */ + +int FuzzyMatchWrapper::parse_matches( vector< Match > &match, int input_length, int tm_length, int &best_cost ) +{ + // cerr << "sentence has " << match.size() << " matches, best cost: " << best_cost << ", lengths input: " << input_length << " tm: " << tm_length << endl; + + if (match.size() == 1) + return match[0].max_cost; + if (match.size() == 0) + return input_length+tm_length; + + int this_best_cost = input_length + tm_length; + for(size_t i=0; i > multi_match; + multi_match.push_back( match ); + + int match_level = 1; + while(multi_match[ match_level-1 ].size()>0) { + // init vector + vector< Match > empty; + multi_match.push_back( empty ); + + for(int first_level = 0; first_level <= (match_level-1)/2; first_level++) { + int second_level = match_level - first_level -1; + //cerr << "\tcombining level " << first_level << " and " << second_level << endl; + + vector< Match > &first_match = multi_match[ first_level ]; + vector< Match > &second_match = multi_match[ second_level ]; + + for(size_t i1 = 0; i1 < first_match.size(); i1++) { + for(size_t i2 = 0; i2 < second_match.size(); i2++) { + + // do not combine the same pair twice + if (first_level == second_level && i2 <= i1) { + continue; + } + + // get sorted matches (first is before second) + Match *first, *second; + if 
(first_match[i1].input_start < second_match[i2].input_start ) { + first = &first_match[i1]; + second = &second_match[i2]; + } else { + second = &first_match[i1]; + first = &second_match[i2]; + } + + //cerr << "\tcombining " + // << "(" << first->input_start << "," << first->input_end << "), " + // << first->tm_start << " [" << first->internal_cost << "]" + // << " with " + // << "(" << second->input_start << "," << second->input_end << "), " + // << second->tm_start<< " [" << second->internal_cost << "]" + // << endl; + + // do not process overlapping matches + if (first->input_end >= second->input_start) { + continue; + } + + // no overlap / mismatch in tm + if (first->tm_end >= second->tm_start) { + continue; + } + + // compute cost + int min_cost = 0; + int max_cost = 0; + + // initial + min_cost += abs( first->input_start - first->tm_start ); + max_cost += max( first->input_start, first->tm_start ); + + // same number of words, but not sent. start -> cost is at least 1 + if (first->input_start == first->tm_start && first->input_start > 0) { + min_cost++; + } + + // in-between + int skipped_words = second->input_start - first->input_end -1; + int skipped_words_tm = second->tm_start - first->tm_end -1; + int internal_cost = max( skipped_words, skipped_words_tm ); + internal_cost += first->internal_cost + second->internal_cost; + min_cost += internal_cost; + max_cost += internal_cost; + + // final + min_cost += abs( (tm_length-1 - second->tm_end) - + (input_length-1 - second->input_end) ); + max_cost += max( (tm_length-1 - second->tm_end), + (input_length-1 - second->input_end) ); + + // same number of words, but not sent. 
end -> cost is at least 1 + if ( ( input_length-1 - second->input_end + == tm_length-1 - second->tm_end ) + && input_length-1 != second->input_end ) { + min_cost++; + } + + // cerr << "\tcost: " << min_cost << "-" << max_cost << endl; + + // if worst than best cost, forget it + if (min_cost > best_cost) { + continue; + } + + // add match + Match new_match( first->input_start, + second->input_end, + first->tm_start, + second->tm_end, + min_cost, + max_cost, + internal_cost); + multi_match[ match_level ].push_back( new_match ); + // cerr << "\tstored\n"; + + // possibly updating this_best_cost + if (max_cost < this_best_cost) { + // cerr << "\tupdating this best cost to " << max_cost << "\n"; + this_best_cost = max_cost; + + // possibly updating best_cost + if (max_cost < best_cost) { + // cerr << "\tupdating best cost to " << max_cost << "\n"; + best_cost = max_cost; + } + } + } + } + } + match_level++; + } + return this_best_cost; +} + + +void FuzzyMatchWrapper::create_extract(int sentenceInd, int cost, const vector< WORD_ID > &sourceSentence, const vector &targets, const string &inputStr, const string &path, ofstream &outputFile) +{ + string sourceStr; + for (size_t pos = 0; pos < sourceSentence.size(); ++pos) { + WORD_ID wordId = sourceSentence[pos]; + sourceStr += GetVocabulary().GetWord(wordId) + " "; + } + + for (size_t targetInd = 0; targetInd < targets.size(); ++targetInd) { + const SentenceAlignment &sentenceAlignment = targets[targetInd]; + string targetStr = sentenceAlignment.getTargetString(GetVocabulary()); + string alignStr = sentenceAlignment.getAlignmentString(); + + outputFile + << sentenceInd << endl + << cost << endl + << sourceStr << endl + << inputStr << endl + << targetStr << endl + << alignStr << endl + << path << endl + << sentenceAlignment.count << endl; + + } +} + +} // namespace diff --git a/mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h b/mosesdecoder/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.h new file 
/**
 * Fuzzy-match front end over a translation memory.
 * Built from three file names (source, target, alignment); Extract() is the
 * only public operation. Everything else is internal machinery.
 */
class FuzzyMatchWrapper
{
public:
  FuzzyMatchWrapper(const std::string &source, const std::string &target, const std::string &alignment);

  std::string Extract(long translationId, const std::string &dirNameStr);

protected:
  // tm-mt
  std::vector< std::vector< tmmt::SentenceAlignment > > targetAndAlignment;
  // NOTE(review): raw pointer and no destructor is declared in this class —
  // confirm who releases the SuffixArray.
  tmmt::SuffixArray *suffixArray;
  int basic_flag;
  int lsed_flag;
  int refined_flag;
  int length_filter_flag;
  int parse_flag;
  int min_match;
  int multiple_flag;
  int multiple_slack;
  int multiple_max;

  typedef std::map< WORD_ID,std::vector< int > > WordIndex;

  // global cache for word pairs
  std::map< std::pair< WORD_ID, WORD_ID >, unsigned int > m_lsed;
#ifdef WITH_THREADS
  //reader-writer lock
  mutable boost::shared_mutex m_accessLock;
#endif

  void load_corpus( const std::string &fileName, std::vector< std::vector< tmmt::WORD_ID > > &corpus );
  void load_target( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus);
  void load_alignment( const std::string &fileName, std::vector< std::vector< tmmt::SentenceAlignment > > &corpus );

  /** brute force method: compare input to all corpus sentences */
  // NOTE(review): both corpora are taken by value, i.e. copied on every call.
  void basic_fuzzy_match( std::vector< std::vector< tmmt::WORD_ID > > source,
                          std::vector< std::vector< tmmt::WORD_ID > > input ) ;

  /** utility function: compute length of sentence in characters
     (spaces do not count) */
  unsigned int compute_length( const std::vector< tmmt::WORD_ID > &sentence );
  unsigned int letter_sed( WORD_ID aIdx, WORD_ID bIdx );
  unsigned int sed( const std::vector< WORD_ID > &a, const std::vector< WORD_ID > &b, std::string &best_path, bool use_letter_sed );
  void init_short_matches(WordIndex &wordIndex, long translationId, const std::vector< WORD_ID > &input );
  int short_match_max_length( int input_length );
  void add_short_matches(WordIndex &wordIndex, long translationId, std::vector< Match > &match, const std::vector< WORD_ID > &tm, int input_length, int best_cost );
  std::vector< Match > prune_matches( const std::vector< Match > &match, int best_cost );
  int parse_matches( std::vector< Match > &match, int input_length, int tm_length, int &best_cost );

  // NOTE(review): the element type of `targets` is missing in this copy of the
  // header — presumably SentenceAlignment; confirm against the repository.
  void create_extract(int sentenceInd, int cost, const std::vector< WORD_ID > &sourceSentence, const std::vector &targets, const std::string &inputStr, const std::string &path, std::ofstream &outputFile);

  std::string ExtractTM(WordIndex &wordIndex, long translationId, const std::string &inputPath);

  // Vocabulary is owned by the suffix array; this just forwards to it.
  Vocabulary &GetVocabulary() {
    return suffixArray->GetVocabulary();
  }

  // Accessors for the m_lsed word-pair cache declared above.
  bool GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const;
  void SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value);

};
/* data structure for n-gram match between input and corpus */
class Match
{
public:
  // span on the input side
  int input_start;
  int input_end;
  // span on the translation-memory side
  int tm_start;
  int tm_end;
  // cost bookkeeping, stored exactly as passed to the constructor
  int min_cost;
  int max_cost;
  int internal_cost;

  /* Stores the seven values verbatim; no validation is performed. */
  Match( int inputStart, int inputEnd,
         int tmStart, int tmEnd,
         int minCost, int maxCost, int internalCost )
    : input_start(inputStart)
    , input_end(inputEnd)
    , tm_start(tmStart)
    , tm_end(tmEnd)
    , min_cost(minCost)
    , max_cost(maxCost)
    , internal_cost(internalCost)
  {}
};
+// + +#ifndef fuzzy_match_SentenceAlignment_h +#define fuzzy_match_SentenceAlignment_h + +#include +#include +#include "Vocabulary.h" +#include "util/string_stream.hh" + +namespace tmmt +{ + +struct SentenceAlignment { + int count; + std::vector< WORD_ID > target; + std::vector< std::pair > alignment; + + SentenceAlignment() { + } + + std::string getTargetString(const Vocabulary &vocab) const; + + std::string getAlignmentString() const { + util::StringStream strme; + for (size_t i = 0; i < alignment.size(); ++i) { + const std::pair &alignPair = alignment[i]; + strme << alignPair.first << "-" << alignPair.second << " "; + } + return strme.str(); + } + +}; + +} + +#endif diff --git a/mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.cpp b/mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2930147ab03dc313e0ba8c4ae4df209895799b85 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/fuzzy-match/SuffixArray.cpp @@ -0,0 +1,240 @@ +#include "SuffixArray.h" +#include +#include +#include + +using namespace std; + +namespace tmmt +{ + +SuffixArray::SuffixArray( string fileName ) +{ + m_vcb.StoreIfNew( "" ); + m_endOfSentence = m_vcb.StoreIfNew( "" ); + + ifstream extractFile; + + // count the number of words first; + extractFile.open(fileName.c_str()); + istream *fileP = &extractFile; + m_size = 0; + size_t sentenceCount = 0; + string line; + while(getline(*fileP, line)) { + + vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() ); + m_size += words.size() + 1; + sentenceCount++; + } + extractFile.close(); + cerr << m_size << " words (incl. 
sentence boundaries)" << endl; + + // allocate memory + m_array = (WORD_ID*) calloc( sizeof( WORD_ID ), m_size ); + m_index = (INDEX*) calloc( sizeof( INDEX ), m_size ); + m_wordInSentence = (char*) calloc( sizeof( char ), m_size ); + m_sentence = (size_t*) calloc( sizeof( size_t ), m_size ); + m_sentenceLength = (char*) calloc( sizeof( char ), sentenceCount ); + + // fill the array + int wordIndex = 0; + int sentenceId = 0; + extractFile.open(fileName.c_str()); + fileP = &extractFile; + while(getline(*fileP, line)) { + vector< WORD_ID > words = m_vcb.Tokenize( line.c_str() ); + + // add to corpus vector + corpus.push_back(words); + + // create SA + + vector< WORD_ID >::const_iterator i; + for( i=words.begin(); i!=words.end(); i++) { + m_index[ wordIndex ] = wordIndex; + m_sentence[ wordIndex ] = sentenceId; + m_wordInSentence[ wordIndex ] = i-words.begin(); + m_array[ wordIndex++ ] = *i; + } + m_index[ wordIndex ] = wordIndex; + m_array[ wordIndex++ ] = m_endOfSentence; + m_sentenceLength[ sentenceId++ ] = words.size(); + } + extractFile.close(); + cerr << "done reading " << wordIndex << " words, " << sentenceId << " sentences." 
<< endl; + // List(0,9); + + // sort + m_buffer = (INDEX*) calloc( sizeof( INDEX ), m_size ); + Sort( 0, m_size-1 ); + free( m_buffer ); + cerr << "done sorting" << endl; +} + +// good ol' quick sort +void SuffixArray::Sort(INDEX start, INDEX end) +{ + if (start == end) return; + INDEX mid = (start+end+1)/2; + Sort( start, mid-1 ); + Sort( mid, end ); + + // merge + size_t i = start; + size_t j = mid; + size_t k = 0; + size_t length = end-start+1; + while( k end ) { + m_buffer[ k++ ] = m_index[ i++ ]; + } else { + if (CompareIndex( m_index[i], m_index[j] ) < 0) { + m_buffer[ k++ ] = m_index[ i++ ]; + } else { + m_buffer[ k++ ] = m_index[ j++ ]; + } + } + } + + memcpy( ((char*)m_index) + sizeof( INDEX ) * start, + ((char*)m_buffer), sizeof( INDEX ) * (end-start+1) ); +} + +SuffixArray::~SuffixArray() +{ + free(m_index); + free(m_array); +} + +int SuffixArray::CompareIndex( INDEX a, INDEX b ) const +{ + // skip over identical words + INDEX offset = 0; + while( a+offset < m_size && + b+offset < m_size && + m_array[ a+offset ] == m_array[ b+offset ] ) { + offset++; + } + + if( a+offset == m_size ) return -1; + if( b+offset == m_size ) return 1; + return CompareWord( m_array[ a+offset ], m_array[ b+offset ] ); +} + +inline int SuffixArray::CompareWord( WORD_ID a, WORD_ID b ) const +{ + // cerr << "c(" << m_vcb.GetWord(a) << ":" << m_vcb.GetWord(b) << ")=" << m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ) << endl; + return m_vcb.GetWord(a).compare( m_vcb.GetWord(b) ); +} + +int SuffixArray::Count( const vector< WORD > &phrase ) +{ + INDEX dummy; + return LimitedCount( phrase, m_size, dummy, dummy, 0, m_size-1 ); +} + +bool SuffixArray::MinCount( const vector< WORD > &phrase, INDEX min ) +{ + INDEX dummy; + return LimitedCount( phrase, min, dummy, dummy, 0, m_size-1 ) >= min; +} + +bool SuffixArray::Exists( const vector< WORD > &phrase ) +{ + INDEX dummy; + return LimitedCount( phrase, 1, dummy, dummy, 0, m_size-1 ) == 1; +} + +int SuffixArray::FindMatches( const vector< 
WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end ) +{ + return LimitedCount( phrase, m_size, firstMatch, lastMatch, search_start, search_end ); +} + +int SuffixArray::LimitedCount( const vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start, INDEX search_end ) +{ + // cerr << "FindFirst\n"; + INDEX start = search_start; + INDEX end = (search_end == -1) ? (m_size-1) : search_end; + INDEX mid = FindFirst( phrase, start, end ); + // cerr << "done\n"; + if (mid == m_size) return 0; // no matches + if (min == 1) return 1; // only existance check + + int matchCount = 1; + + //cerr << "before...\n"; + firstMatch = FindLast( phrase, mid, start, -1 ); + matchCount += mid - firstMatch; + + //cerr << "after...\n"; + lastMatch = FindLast( phrase, mid, end, 1 ); + matchCount += lastMatch - mid; + + return matchCount; +} + +SuffixArray::INDEX SuffixArray::FindLast( const vector< WORD > &phrase, INDEX start, INDEX end, int direction ) +{ + end += direction; + while(true) { + INDEX mid = ( start + end + (direction>0 ? 
0 : 1) )/2; + + int match = Match( phrase, mid ); + int matchNext = Match( phrase, mid+direction ); + //cerr << "\t" << start << ";" << mid << ";" << end << " -> " << match << "," << matchNext << endl; + + if (match == 0 && matchNext != 0) return mid; + + if (match == 0) // mid point is a match + start = mid; + else + end = mid; + } +} + +SuffixArray::INDEX SuffixArray::FindFirst( const vector< WORD > &phrase, INDEX &start, INDEX &end ) +{ + while(true) { + INDEX mid = ( start + end + 1 )/2; + //cerr << "FindFirst(" << start << ";" << mid << ";" << end << ")\n"; + int match = Match( phrase, mid ); + + if (match == 0) return mid; + if (start >= end && match != 0 ) return m_size; + + if (match > 0) + start = mid+1; + else + end = mid-1; + } +} + +int SuffixArray::Match( const vector< WORD > &phrase, INDEX index ) +{ + INDEX pos = m_index[ index ]; + for(INDEX i=0; i > corpus; + + WORD_ID *m_array; + INDEX *m_index; + INDEX *m_buffer; + char *m_wordInSentence; + size_t *m_sentence; + char *m_sentenceLength; + WORD_ID m_endOfSentence; + Vocabulary m_vcb; + INDEX m_size; + +public: + SuffixArray( std::string fileName ); + ~SuffixArray(); + + void Sort(INDEX start, INDEX end); + int CompareIndex( INDEX a, INDEX b ) const; + inline int CompareWord( WORD_ID a, WORD_ID b ) const; + int Count( const std::vector< WORD > &phrase ); + bool MinCount( const std::vector< WORD > &phrase, INDEX min ); + bool Exists( const std::vector< WORD > &phrase ); + int FindMatches( const std::vector< WORD > &phrase, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = 0, INDEX search_end = -1 ); + int LimitedCount( const std::vector< WORD > &phrase, INDEX min, INDEX &firstMatch, INDEX &lastMatch, INDEX search_start = -1, INDEX search_end = 0 ); + INDEX FindFirst( const std::vector< WORD > &phrase, INDEX &start, INDEX &end ); + INDEX FindLast( const std::vector< WORD > &phrase, INDEX start, INDEX end, int direction ); + int Match( const std::vector< WORD > &phrase, INDEX index ); + void 
List( INDEX start, INDEX end ); + inline INDEX GetPosition( INDEX index ) { + return m_index[ index ]; + } + inline size_t GetSentence( INDEX position ) { + return m_sentence[position]; + } + inline char GetWordInSentence( INDEX position ) { + return m_wordInSentence[position]; + } + inline char GetSentenceLength( size_t sentenceId ) { + return m_sentenceLength[sentenceId]; + } + inline INDEX GetSize() { + return m_size; + } + + Vocabulary &GetVocabulary() { + return m_vcb; + } + const std::vector< std::vector< WORD_ID > > &GetCorpus() const { + return corpus; + } +}; + +} + diff --git a/mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.cpp b/mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b70eb98cab099790dbce73ff4060192fb5ae2e2f --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.cpp @@ -0,0 +1,71 @@ +// $Id: Vocabulary.cpp 1565 2008-02-22 14:42:01Z bojar $ +#include "Vocabulary.h" +#ifdef WITH_THREADS +#include +#endif + +using namespace std; + +namespace tmmt +{ + +// as in beamdecoder/tables.cpp +vector Vocabulary::Tokenize( const char input[] ) +{ + vector< WORD_ID > token; + bool betweenWords = true; + int start=0; + int i=0; + for(; input[i] != '\0'; i++) { + bool isSpace = (input[i] == ' ' || input[i] == '\t'); + + if (!isSpace && betweenWords) { + start = i; + betweenWords = false; + } else if (isSpace && !betweenWords) { + token.push_back( StoreIfNew ( string( input+start, i-start ) ) ); + betweenWords = true; + } + } + if (!betweenWords) + token.push_back( StoreIfNew ( string( input+start, i-start ) ) ); + return token; +} + +WORD_ID Vocabulary::StoreIfNew( const WORD& word ) +{ + + { + // read=lock scope +#ifdef WITH_THREADS + boost::shared_lock read_lock(m_accessLock); +#endif + map::iterator i = lookup.find( word ); + + if( i != lookup.end() ) + return i->second; + } + +#ifdef WITH_THREADS + boost::unique_lock lock(m_accessLock); 
+#endif + WORD_ID id = vocab.size(); + vocab.push_back( word ); + lookup[ word ] = id; + return id; +} + +WORD_ID Vocabulary::GetWordID( const WORD &word ) +{ +#ifdef WITH_THREADS + boost::shared_lock read_lock(m_accessLock); +#endif + map::iterator i = lookup.find( word ); + if( i == lookup.end() ) + return 0; + WORD_ID w= (WORD_ID) i->second; + return w; +} + +} + diff --git a/mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.h b/mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.h new file mode 100644 index 0000000000000000000000000000000000000000..f5245ebe3b72521fe6d3d65efb04eb5c53aa946a --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/fuzzy-match/Vocabulary.h @@ -0,0 +1,46 @@ +// $Id: tables-core.h 1470 2007-10-02 21:43:54Z redpony $ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef WITH_THREADS +#include +#endif + +namespace tmmt +{ +typedef std::string WORD; +typedef unsigned int WORD_ID; + +class Vocabulary +{ +public: + std::map lookup; + std::vector< WORD > vocab; + WORD_ID StoreIfNew( const WORD& ); + WORD_ID GetWordID( const WORD& ); + std::vector Tokenize( const char[] ); + inline WORD &GetWord( WORD_ID id ) const { + WORD &i = (WORD&) vocab[ id ]; + return i; + } + +protected: +#ifdef WITH_THREADS + //reader-writer lock + mutable boost::shared_mutex m_accessLock; +#endif + + +}; + +} + diff --git a/mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.cpp b/mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0a31b9b2861ee494b9416cdeeb9c0481896b8e07 --- /dev/null +++ b/mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.cpp @@ -0,0 +1,387 @@ + +#include +#include +#include +#include +#include +#include "moses/Util.h" +#include "Alignments.h" + +using namespace std; +using namespace Moses; + +inline const std::string TrimInternal(const std::string& str, const std::string dropChars = 
" \t\n\r") +{ + std::string res = str; + res.erase(str.find_last_not_of(dropChars)+1); + return res.erase(0, res.find_first_not_of(dropChars)); +} + +class CreateXMLRetValues +{ +public: + string frame, ruleS, ruleT, ruleAlignment, ruleAlignmentInv; +}; + +CreateXMLRetValues createXML(int ruleCount, const string &source, const string &input, const string &target, const string &align, const string &path ); + +void create_xml(const string &inPath) +{ + ifstream inStrme(inPath.c_str()); + ofstream rule((inPath + ".extract").c_str()); + ofstream ruleInv((inPath + ".extract.inv").c_str()); + + // int setenceId; + // float score; + string source, target, align, path; + string *input = NULL; + int count; + + int lineCount = 1; + int ruleCount = 1; + string inLine; + + int step = 0; + while (!inStrme.eof()) { + getline(inStrme, inLine); + //cout << inLine << endl; + switch (step) { + case 0: + /*setenceId = */ + Scan(inLine); + ++step; + break; + case 1: + /*score = */ + Scan(inLine); + ++step; + break; + case 2: + source = inLine; + ++step; + break; + case 3: + if (input == NULL) { + input = new string(inLine); + } else { + assert(inLine == *input); + } + ++step; + break; + case 4: + target = inLine; + ++step; + break; + case 5: + align = inLine; + ++step; + break; + case 6: + path = inLine + "X"; + ++step; + break; + case 7: + count = Scan(inLine); + CreateXMLRetValues ret = createXML(ruleCount, source, *input, target, align, path); + + //print STDOUT $frame."\n"; + rule << ret.ruleS << " [X] ||| " << ret.ruleT << " [X] ||| " << ret.ruleAlignment + << " ||| " << count << endl; + ruleInv << ret.ruleT << " [X] ||| " << ret.ruleS << " [X] ||| " << ret.ruleAlignmentInv + << " ||| " << count << endl; + + //print STDOUT "$sentenceInd ||| $score ||| $count\n"; + ++ruleCount; + step = 0; + break; + } + + ++lineCount; + } + + delete input; + ruleInv.close(); + rule.close(); + inStrme.close(); + +} + + +CreateXMLRetValues createXML(int ruleCount, const string &source, const string 
&input, const string &target, const string &align, const string &path) +{ + CreateXMLRetValues ret; + vector sourceToks = Tokenize(source, " ") + ,inputToks = Tokenize(input, " ") + ,targetsToks = Tokenize(target, " "); + Alignments alignments(align, sourceToks.size(), targetsToks.size()); + map frameInput; + map alignI2S; + vector< map > nonTerms; + vector targetBitmap(targetsToks.size(), true); + vector inputBitmap; + + // STEP 1: FIND MISMATCHES + int s = 0, i = 0; + bool currently_matching = false; + int start_s = 0, start_i = 0; + + //cerr << input << endl << source << endl << target << endl << path << endl; + for ( int p = 0 ; p < int(path.length()) ; p++ ) { + string action = path.substr(p, 1); + + // beginning of a mismatch + if ( currently_matching && action != "M" && action != "X" ) { + start_i = i; + start_s = s; + currently_matching = 0; + } // if ( currently_matching + // end of a mismatch + else if ( !currently_matching && ( action == "M" || action == "X" ) ) { + + // remove use of affected target words + for ( int ss = start_s ; ss < s ; ss++ ) { + const std::map &targets = alignments.m_alignS2T[ss]; + + std::map::const_iterator iter; + for (iter = targets.begin(); iter != targets.end(); ++iter) { + int tt = iter->first; + targetBitmap[tt] = 0; + } + + // also remove enclosed unaligned words? + } //for ( int ss = start_s ; ss < s ; ss++ ) { + + // are there input words that need to be inserted ? + //cerr << start_i << "<" << i << "?" 
<< endl; + if (start_i < i ) { + + // take note of input words to be inserted + string insertion = ""; + for (int ii = start_i ; ii < i ; ii++ ) { + insertion += inputToks[ii] + " "; + } + + // find position for inserted input words + + // find first removed target word + int start_t = 1000; + for ( int ss = start_s ; ss < s ; ss++ ) { + const std::map &targets = alignments.m_alignS2T[ss]; + + std::map::const_iterator iter; + for (iter = targets.begin(); iter != targets.end(); ++iter) { + int tt = iter->first; + if (tt < start_t) { + start_t = tt; + } + } + } + + // end of sentence? add to end + if ( start_t == 1000 && i > int(inputToks.size()) - 1 ) { + start_t = targetsToks.size() - 1; + } + + // backtrack to previous words if unaligned + if ( start_t == 1000 ) { + start_t = -1; + for ( int ss = s - 1 ; start_t == -1 && ss >= 0 ; ss-- ) { + const std::map &targets = alignments.m_alignS2T[ss]; + + std::map::const_iterator iter; + for (iter = targets.begin(); iter != targets.end(); ++iter) { + int tt = iter->first; + if (tt > start_t) { + start_t = tt; + } + } + } + } // if ( start_t == 1000 ) { + + frameInput[start_t] += insertion; + map nt; + nt["start_t"] = start_t; + nt["start_i"] = start_i; + nonTerms.push_back(nt); + + } // if (start_i < i ) { + + currently_matching = 1; + } // else if ( !currently_matching + + /* + cerr << action << " " << s << " " << i + << "(" << start_s << " " << start_i << ")" + << currently_matching; + */ + + if ( action != "I" ) { + //cerr << " ->"; + + if (s < int(alignments.m_alignS2T.size())) { + const std::map &targets = alignments.m_alignS2T[s]; + //cerr << "s=" << s << endl; + + std::map::const_iterator iter; + for (iter = targets.begin(); iter != targets.end(); ++iter) { + // int tt = iter->first; + //cerr << " " << tt; + } + } + } + //cerr << endl; + + if (action != "I") + s++; + if (action != "D") { + i++; + alignI2S[i] = s; + } + + if (action == "M") { + inputBitmap.push_back(1); + } else if (action == "I" || action == "S") { 
+ inputBitmap.push_back(0); + } + + } // for ( int p = 0 + + //cerr << target << endl; + for (size_t i = 0; i < targetBitmap.size(); ++i) { + //cerr << targetBitmap[i]; + } + //cerr << endl; + + for (map::const_iterator iter = frameInput.begin(); iter != frameInput.end(); ++iter) { + //cerr << iter->first << ":" <second << endl; + } + + // STEP 2: BUILD RULE AND FRAME + + // hierarchical rule + int rule_pos_s = 0; + map ruleAlignS; + + for (int i = 0 ; i < int(inputBitmap.size()) ; ++i ) { + if ( inputBitmap[i] ) { + ret.ruleS += inputToks[i] + " "; + ruleAlignS[ alignI2S[i] ] = rule_pos_s++; + } + + for (size_t j = 0; j < nonTerms.size(); ++j) { + map &nt = nonTerms[j]; + if (i == nt["start_i"]) { + ret.ruleS += "[X][X] "; + nt["rule_pos_s"] = rule_pos_s++; + } + } + } + + int rule_pos_t = 0; + map ruleAlignT; + + for (int t = -1 ; t < (int) targetBitmap.size(); t++ ) { + if (t >= 0 && targetBitmap[t]) { + ret.ruleT += targetsToks[t] + " "; + ruleAlignT[t] = rule_pos_t++; + } + + for (size_t i = 0; i < nonTerms.size(); ++i) { + map &nt = nonTerms[i]; + + if (t == nt["start_t"]) { + ret.ruleT += "[X][X] "; + nt["rule_pos_t"] = rule_pos_t++; + } + } + } + + int numAlign = 0; + ret.ruleAlignment = ""; + + for (map::const_iterator iter = ruleAlignS.begin(); iter != ruleAlignS.end(); ++iter) { + int s = iter->first; + + if (s < int(alignments.m_alignS2T.size())) { + const std::map &targets = alignments.m_alignS2T[s]; + + std::map::const_iterator iter; + for (iter = targets.begin(); iter != targets.end(); ++iter) { + int t =iter->first; + if (ruleAlignT.find(t) == ruleAlignT.end()) + continue; + ret.ruleAlignment += SPrint(ruleAlignS[s]) + "-" + SPrint(ruleAlignT[t]) + " "; + ++numAlign; + } + } + } + + //cerr << "numAlign=" << numAlign << endl; + + for (size_t i = 0; i < nonTerms.size(); ++i) { + map &nt = nonTerms[i]; + ret.ruleAlignment += SPrint(nt["rule_pos_s"]) + "-" + SPrint(nt["rule_pos_t"]) + " "; + ++numAlign; + } + + //cerr << "numAlign=" << numAlign << endl; 
+ + ret.ruleS = TrimInternal(ret.ruleS); + ret.ruleT = TrimInternal(ret.ruleT); + ret.ruleAlignment = TrimInternal(ret.ruleAlignment); + + vector ruleAlignmentToks = Tokenize(ret.ruleAlignment); + for (size_t i = 0; i < ruleAlignmentToks.size(); ++i) { + const string &alignPoint = ruleAlignmentToks[i]; + vector toks = Tokenize(alignPoint, "-"); + assert(toks.size() == 2); + ret.ruleAlignmentInv += toks[1] + "-" +toks[0]; + } + ret.ruleAlignmentInv = TrimInternal(ret.ruleAlignmentInv); + + // frame + // ret.frame; + if (frameInput.find(-1) == frameInput.end()) + ret.frame = frameInput[-1]; + + int currently_included = 0; + int start_t = -1; + targetBitmap.push_back(0); + + for (size_t t = 0 ; t <= targetsToks.size() ; t++ ) { + // beginning of tm target inclusion + if ( !currently_included && targetBitmap[t] ) { + start_t = t; + currently_included = 1; + } + // end of tm target inclusion (not included word or inserted input) + else if (currently_included + && ( targetBitmap[t] || frameInput.find(t) != frameInput.end() ) + ) { + // add xml (unless change is at the beginning of the sentence + if ( start_t >= 0 ) { + string target = ""; + //cerr << "for(tt=$start_t;tt<$t+$TARGET_BITMAP[$t]);\n"; + for (size_t tt = start_t ; tt < t + targetBitmap[t] ; tt++ ) { + target += targetsToks[tt] + " "; + } + // target = Trim(target); TODO + ret.frame += " x "; + } + currently_included = 0; + } + + if (frameInput.find(t) != frameInput.end()) + ret.frame += frameInput[t]; + //cerr << targetBitmap[t] << " " << t << " " << "(" << start_t << ")" + // << currently_included << endl; + + } //for (int t = 0 + + cerr << ret.frame << "\n-------------------------------------\n"; + return ret; + +} + + + diff --git a/mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.h b/mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.h new file mode 100644 index 0000000000000000000000000000000000000000..3a49a1fc09b46664c3da127a7c60772824b1644b --- /dev/null +++ 
b/mosesdecoder/moses/TranslationModel/fuzzy-match/create_xml.h @@ -0,0 +1,5 @@ +#pragma once + +#include + +void create_xml(const std::string &inPath); diff --git a/mosesdecoder/moses/server/Hypothesis_4server.cpp b/mosesdecoder/moses/server/Hypothesis_4server.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9ace9c9672bfa69c9c5dcc2cdfd504ab6bbc6724 --- /dev/null +++ b/mosesdecoder/moses/server/Hypothesis_4server.cpp @@ -0,0 +1,37 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- +#include "moses/Hypothesis.h" +#include "moses/Manager.h" +#include +namespace Moses { + void + Hypothesis:: + OutputLocalWordAlignment(std::vector& dest) const + { + using namespace std; + Range const& src = this->GetCurrSourceWordsRange(); + Range const& trg = this->GetCurrTargetWordsRange(); + + WordAlignmentSort waso = m_manager.options()->output.WA_SortOrder; + vector const* > a + = this->GetCurrTargetPhrase().GetAlignTerm().GetSortedAlignments(waso); + typedef pair item; + BOOST_FOREACH(item const* p, a) { + map M; + M["source-word"] = xmlrpc_c::value_int(src.GetStartPos() + p->first); + M["target-word"] = xmlrpc_c::value_int(trg.GetStartPos() + p->second); + dest.push_back(xmlrpc_c::value_struct(M)); + } + } + + void + Hypothesis:: + OutputWordAlignment(std::vector& out) const + { + std::vector tmp; + for (Hypothesis const* h = this; h; h = h->GetPrevHypo()) + tmp.push_back(h); + for (size_t i = tmp.size(); i-- > 0;) + tmp[i]->OutputLocalWordAlignment(out); + } + +} diff --git a/mosesdecoder/moses/server/Optimizer.h b/mosesdecoder/moses/server/Optimizer.h new file mode 100644 index 0000000000000000000000000000000000000000..da84df0238b30dc9ff9eb320f5cf6e541de9850f --- /dev/null +++ b/mosesdecoder/moses/server/Optimizer.h @@ -0,0 +1,17 @@ +// -*- c++ -*- + +#include +#include +#include + +namespace MosesServer +{ +class + Optimizer : public xmlrpc_c::method +{ +public: + Optimizer(); + void execute(xmlrpc_c::paramList const& paramList, + 
xmlrpc_c::value * const retvalP); +}; +} diff --git a/mosesdecoder/moses/server/PackScores.cpp b/mosesdecoder/moses/server/PackScores.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4ec6109d3eeb38fc665e94f427d824da7495de0c --- /dev/null +++ b/mosesdecoder/moses/server/PackScores.cpp @@ -0,0 +1,45 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- +#include "PackScores.h" +#include "moses/FF/StatefulFeatureFunction.h" +#include "moses/FF/StatelessFeatureFunction.h" +#include +namespace Moses { + +void +PackScores(FeatureFunction const& ff, FVector const& S, + std::map& M) +{ + std::vector v; + size_t N = ff.GetNumScoreComponents(); + + std::vector dense; + dense.reserve(N); + size_t o = ff.GetIndex(); + for (size_t i = 0; i < N; ++i) + if (ff.IsTuneableComponent(i)) + dense.push_back(xmlrpc_c::value_double(S[o+i])); + v.push_back(xmlrpc_c::value_array(dense)); + + std::map sparse; + typedef FVector::FNVmap::const_iterator iter; + for(iter m = S.cbegin(); m != S.cend(); ++m) + sparse[m->first.name()] = xmlrpc_c::value_double(m->second); + v.push_back(xmlrpc_c::value_struct(sparse)); + M[ff.GetScoreProducerDescription()] = xmlrpc_c::value_array(v); +} + +xmlrpc_c::value +PackScores(ScoreComponentCollection const& S) +{ + std::map M; + typedef StatefulFeatureFunction SFFF; + typedef StatelessFeatureFunction SLFF; + BOOST_FOREACH(SFFF const* ff, SFFF::GetStatefulFeatureFunctions()) + if (ff->IsTuneable()) + PackScores(*ff, S.GetScoresVector(), M); + BOOST_FOREACH(SLFF const* ff, SLFF::GetStatelessFeatureFunctions()) + if (ff->IsTuneable()) + PackScores(*ff, S.GetScoresVector(), M); + return xmlrpc_c::value_struct(M); +} +} diff --git a/mosesdecoder/moses/server/PackScores.h b/mosesdecoder/moses/server/PackScores.h new file mode 100644 index 0000000000000000000000000000000000000000..5d875bc3bacc940181f4b98afe30db24f08b2bb0 --- /dev/null +++ b/mosesdecoder/moses/server/PackScores.h @@ -0,0 +1,10 @@ +// -*- mode: c++; indent-tabs-mode: 
  /// XML-RPC server object: holds the server options, the session cache, the
  /// xmlrpc-c method registry and one handler object per exposed method.
  class Server
  {
    Moses::ServerOptions m_server_options;
    SessionCache m_session_cache;
    xmlrpc_c::registry m_registry;
    // method handlers (presumably registered with m_registry in the
    // constructor — the definition is not part of this header)
    xmlrpc_c::methodPtr const m_updater;
    xmlrpc_c::methodPtr const m_optimizer;
    xmlrpc_c::methodPtr const m_translator;
    xmlrpc_c::methodPtr const m_close_session;
    std::string m_pidfile;
  public:
    Server(Moses::Parameter& params);
    ~Server();
    int run();
    void delete_session(uint64_t const session_id);

    Moses::ServerOptions const&
    options() const;

    // Session ids are 64-bit here.
    // NOTE(review): SessionCache::operator[] takes a 32-bit id — confirm the
    // two agree.
    Session const&
    get_session(uint64_t session_id);

  };
Session + { + uint64_t const id; + time_t start_time; + time_t last_access; + boost::shared_ptr const scope; // stores local info + SPTR > m_context_weights; + + + Session(uint64_t const session_id) + : id(session_id) + , scope(new Moses::ContextScope) + { + last_access = start_time = time(NULL); + } + + bool is_new() const { return last_access == start_time; } + + void setup(std::map const& params); + }; + + class SessionCache + { + mutable boost::shared_mutex m_lock; + uint64_t m_session_counter; + boost::unordered_map m_cache; + public: + + SessionCache() : m_session_counter(1) {} + + Session const& + operator[](uint32_t id) + { + boost::upgrade_lock lock(m_lock); + if (id > 1) + { + boost::unordered_map::iterator m = m_cache.find(id); + if (m != m_cache.end()) + { + m->second.last_access = time(NULL); + return m->second; + } + } + boost::upgrade_to_unique_lock xlock(lock); + id = ++m_session_counter; + std::pair foo(id, Session(id)); + return m_cache.insert(foo).first->second; + } + + void + erase(uint32_t const id) + { + boost::unique_lock lock(m_lock); + m_cache.erase(id); + } + + + }; + + +} diff --git a/mosesdecoder/moses/server/TranslationRequest.cpp b/mosesdecoder/moses/server/TranslationRequest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d2118ad8f79c5908236483816a2b65773f5d2207 --- /dev/null +++ b/mosesdecoder/moses/server/TranslationRequest.cpp @@ -0,0 +1,524 @@ +#include "TranslationRequest.h" +#include "PackScores.h" +#include "moses/ContextScope.h" +#include +#include "moses/Util.h" +#include "moses/Hypothesis.h" + +namespace MosesServer +{ +using namespace std; +using Moses::Hypothesis; +using Moses::StaticData; +using Moses::Range; +using Moses::ChartHypothesis; +using Moses::Phrase; +using Moses::Manager; +using Moses::SearchGraphNode; +using Moses::TrellisPathList; +using Moses::TranslationOptionCollection; +using Moses::TranslationOptionList; +using Moses::TranslationOption; +using Moses::TargetPhrase; +using 
Moses::FValue; +using Moses::PhraseDictionaryMultiModel; +using Moses::FindPhraseDictionary; +using Moses::Sentence; +using Moses::TokenizeMultiCharSeparator; +using Moses::FeatureFunction; +using Moses::Scan; + +boost::shared_ptr +TranslationRequest:: +create(Translator* translator, xmlrpc_c::paramList const& paramList, + boost::condition_variable& cond, boost::mutex& mut) +{ + boost::shared_ptr ret; + ret.reset(new TranslationRequest(paramList, cond, mut)); + ret->m_self = ret; + ret->m_translator = translator; + return ret; +} + +void +SetContextWeights(Moses::ContextScope& s, xmlrpc_c::value const& w) +{ + SPTR > M(new std::map); + typedef std::map tmap; + tmap const tmp = static_cast(xmlrpc_c::value_struct(w)); + for(tmap::const_iterator m = tmp.begin(); m != tmp.end(); ++m) + (*M)[m->first] = xmlrpc_c::value_double(m->second); + s.SetContextWeights(M); +} + +void +TranslationRequest:: +Run() +{ + typedef std::map param_t; + param_t const& params = m_paramList.getStruct(0); + parse_request(params); + // cerr << "SESSION ID" << ret->m_session_id << endl; + + + // settings within the session scope + param_t::const_iterator si = params.find("context-weights"); + if (si != params.end()) SetContextWeights(*m_scope, si->second); + + Moses::StaticData const& SD = Moses::StaticData::Instance(); + + if (is_syntax(m_options->search.algo)) + run_chart_decoder(); + else + run_phrase_decoder(); + + { + boost::lock_guard lock(m_mutex); + m_done = true; + } + m_cond.notify_one(); + +} + +/// add phrase alignment information from a Hypothesis +void +TranslationRequest:: +add_phrase_aln_info(Hypothesis const& h, vector& aInfo) const +{ + if (!m_withAlignInfo) return; + // if (!options()->output.ReportSegmentation) return; + Range const& trg = h.GetCurrTargetWordsRange(); + Range const& src = h.GetCurrSourceWordsRange(); + + std::map pAlnInfo; + pAlnInfo["tgt-start"] = xmlrpc_c::value_int(trg.GetStartPos()); + pAlnInfo["tgt-end"] = xmlrpc_c::value_int(trg.GetEndPos()); + 
pAlnInfo["src-start"] = xmlrpc_c::value_int(src.GetStartPos()); + pAlnInfo["src-end"] = xmlrpc_c::value_int(src.GetEndPos()); + aInfo.push_back(xmlrpc_c::value_struct(pAlnInfo)); +} + +void +TranslationRequest:: +outputChartHypo(ostream& out, const ChartHypothesis* hypo) +{ + Phrase outPhrase(20); + hypo->GetOutputPhrase(outPhrase); + + // delete 1st & last + assert(outPhrase.GetSize() >= 2); + outPhrase.RemoveWord(0); + outPhrase.RemoveWord(outPhrase.GetSize() - 1); + for (size_t pos = 0 ; pos < outPhrase.GetSize() ; pos++) + out << *outPhrase.GetFactor(pos, 0) << " "; +} + +bool +TranslationRequest:: +compareSearchGraphNode(const Moses::SearchGraphNode& a, + const Moses::SearchGraphNode& b) +{ + return a.hypo->GetId() < b.hypo->GetId(); +} + +void +TranslationRequest:: +insertGraphInfo(Manager& manager, map& retData) +{ + using xmlrpc_c::value_int; + using xmlrpc_c::value_double; + using xmlrpc_c::value_struct; + using xmlrpc_c::value_string; + vector searchGraphXml; + vector searchGraph; + manager.GetSearchGraph(searchGraph); + std::sort(searchGraph.begin(), searchGraph.end()); + BOOST_FOREACH(Moses::SearchGraphNode const& n, searchGraph) { + map x; // search graph xml node + x["forward"] = value_double(n.forward); + x["fscore"] = value_double(n.fscore); + const Hypothesis* hypo = n.hypo; + x["hyp"] = value_int(hypo->GetId()); + x["stack"] = value_int(hypo->GetWordsBitmap().GetNumWordsCovered()); + if (hypo->GetId() != 0) { + const Hypothesis *prevHypo = hypo->GetPrevHypo(); + x["back"] = value_int(prevHypo->GetId()); + x["score"] = value_double(hypo->GetScore()); + x["transition"] = value_double(hypo->GetScore() - prevHypo->GetScore()); + if (n.recombinationHypo) + x["recombined"] = value_int(n.recombinationHypo->GetId()); + x["cover-start"] = value_int(hypo->GetCurrSourceWordsRange().GetStartPos()); + x["cover-end"] = value_int(hypo->GetCurrSourceWordsRange().GetEndPos()); + x["out"] = 
value_string(hypo->GetCurrTargetPhrase().GetStringRep(options()->output.factor_order)); + } + searchGraphXml.push_back(value_struct(x)); + } + retData["sg"] = xmlrpc_c::value_array(searchGraphXml); +} + +void +TranslationRequest:: +outputNBest(const Manager& manager, map& retData) +{ + TrellisPathList nBestList; + vector nBestXml; + + Moses::NBestOptions const& nbo = m_options->nbest; + manager.CalcNBest(nbo.nbest_size, nBestList, nbo.only_distinct); + manager.OutputNBest(cout, nBestList); + + BOOST_FOREACH(Moses::TrellisPath const* path, nBestList) { + vector const& E = path->GetEdges(); + if (!E.size()) continue; + std::map nBestXmlItem; + pack_hypothesis(manager, E, "hyp", nBestXmlItem); + if (m_withScoreBreakdown) { + // should the score breakdown be reported in a more structured manner? + ostringstream buf; + bool with_labels = nbo.include_feature_labels; + path->GetScoreBreakdown()->OutputAllFeatureScores(buf, with_labels); + nBestXmlItem["fvals"] = xmlrpc_c::value_string(buf.str()); + nBestXmlItem["scores"] = PackScores(*path->GetScoreBreakdown()); + } + + // weighted score + nBestXmlItem["totalScore"] = xmlrpc_c::value_double(path->GetFutureScore()); + nBestXml.push_back(xmlrpc_c::value_struct(nBestXmlItem)); + } + retData["nbest"] = xmlrpc_c::value_array(nBestXml); +} + +void +TranslationRequest:: +insertTranslationOptions(Moses::Manager& manager, + std::map& retData) +{ + std::vector const& ofactor_order = options()->output.factor_order; + + const TranslationOptionCollection* toptsColl = manager.getSntTranslationOptions(); + vector toptsXml; + size_t const stop = toptsColl->GetSource().GetSize(); + TranslationOptionList const* tol; + for (size_t s = 0 ; s < stop ; ++s) { + for (size_t e=s;(tol=toptsColl->GetTranslationOptionList(s,e))!=NULL;++e) { + BOOST_FOREACH(TranslationOption const* topt, *tol) { + std::map toptXml; + TargetPhrase const& tp = topt->GetTargetPhrase(); + std::string tphrase = tp.GetStringRep(ofactor_order); + toptXml["phrase"] = 
xmlrpc_c::value_string(tphrase); + toptXml["fscore"] = xmlrpc_c::value_double(topt->GetFutureScore()); + toptXml["start"] = xmlrpc_c::value_int(s); + toptXml["end"] = xmlrpc_c::value_int(e); + vector scoresXml; + const std::valarray &scores + = topt->GetScoreBreakdown().getCoreFeatures(); + for (size_t j = 0; j < scores.size(); ++j) + scoresXml.push_back(xmlrpc_c::value_double(scores[j])); + toptXml["scores"] = xmlrpc_c::value_array(scoresXml); + ostringstream buf; + topt->GetScoreBreakdown().OutputAllFeatureScores(buf, true); + toptXml["labelledScores"] = PackScores(topt->GetScoreBreakdown()); + toptsXml.push_back(xmlrpc_c::value_struct(toptXml)); + } + } + } + retData["topt"] = xmlrpc_c::value_array(toptsXml); +} + +TranslationRequest:: +TranslationRequest(xmlrpc_c::paramList const& paramList, + boost::condition_variable& cond, boost::mutex& mut) + : m_cond(cond), m_mutex(mut), m_done(false), m_paramList(paramList) + , m_session_id(0) +{ + +} + +bool +check(std::map const& param, + std::string const key) +{ + std::map::const_iterator m = param.find(key); + if(m == param.end()) return false; + + if (m->second.type() == xmlrpc_c::value::TYPE_BOOLEAN) + return xmlrpc_c::value_boolean(m->second); + + std::string val = string(xmlrpc_c::value_string(m->second)); + if(val == "true" || val == "True" || val == "TRUE" || val == "1") return true; + return false; +} + +void +TranslationRequest:: +parse_request(std::map const& params) +{ + // parse XMLRPC request + m_paramList.verifyEnd(1); // ??? 
UG + + typedef std::map params_t; + params_t::const_iterator si; + + si = params.find("session-id"); + if (si != params.end()) + { + m_session_id = xmlrpc_c::value_int(si->second); + Session const& S = m_translator->get_session(m_session_id); + m_scope = S.scope; + m_session_id = S.id; + } + else + { + m_session_id = 0; + m_scope.reset(new Moses::ContextScope); + } + + boost::shared_ptr opts(new Moses::AllOptions(*StaticData::Instance().options())); + opts->update(params); + + m_withGraphInfo = check(params, "sg"); + if (m_withGraphInfo || opts->nbest.nbest_size > 0) { + opts->output.SearchGraph = "true"; + opts->nbest.enabled = true; + } + + m_options = opts; + + // source text must be given, or we don't know what to translate + si = params.find("text"); + if (si == params.end()) + throw xmlrpc_c::fault("Missing source text", xmlrpc_c::fault::CODE_PARSE); + m_source_string = xmlrpc_c::value_string(si->second); + XVERBOSE(1,"Input: " << m_source_string << endl); + + m_withTopts = check(params, "topt"); + m_withScoreBreakdown = check(params, "add-score-breakdown"); + si = params.find("lambda"); + if (si != params.end()) + { + // muMo = multiModel + xmlrpc_c::value_array muMoArray = xmlrpc_c::value_array(si->second); + vector muMoValVec(muMoArray.vectorValueValue()); + vector w(muMoValVec.size()); + for (size_t i = 0; i < muMoValVec.size(); ++i) + w[i] = xmlrpc_c::value_double(muMoValVec[i]); + if (w.size() && (si = params.find("model_name")) != params.end()) + { + string const model_name = xmlrpc_c::value_string(si->second); + PhraseDictionaryMultiModel* pdmm + = (PhraseDictionaryMultiModel*) FindPhraseDictionary(model_name); + pdmm->SetTemporaryMultiModelWeightsVector(w); + } + } + + si = params.find("context"); + if (si != params.end()) + { + string context = xmlrpc_c::value_string(si->second); + VERBOSE(1,"CONTEXT " << context); + m_context.reset(new std::vector(1,context)); + } + + si = params.find("context-scope"); + if (si != params.end()) + { + + string 
context = xmlrpc_c::value_string(si->second); + + string groupSeparator("Moses::ContextScope::GroupSeparator"); + string recordSeparator("Moses::ContextScope::RecordSeparator"); + + // Here, we assume that any XML-RPC value + // associated with the key "context-scope" + // has the following format: + // + // FeatureFunctionName followed by recordSeparator + // followed by the value of interest + // followed by groupSeparator + // + // In the following code, the value of interest will be stored + // in contextScope under the key FeatureFunctionName, + // where FeatureFunctionName is the actual name of the feature function + + boost::shared_ptr contextScope = GetScope(); + + BOOST_FOREACH(string group, TokenizeMultiCharSeparator(context, groupSeparator)) { + + vector record = TokenizeMultiCharSeparator(group, recordSeparator); + + // Use the feature function whose name is record[0] as a key + FeatureFunction& ff = Moses::FeatureFunction::FindFeatureFunction(record[0]); + void const* key = static_cast(&ff); + + // Store (in the context scope) record[1] as the value associated with that key + boost::shared_ptr value = contextScope->get(key,true); + value->replace(value->begin(), value->end(), record[1]); + + } + } + + // Report alignment info if Moses config says to or if XML request says to + m_withAlignInfo = options()->output.ReportSegmentation || check(params, "align"); + + // Report word alignment info if Moses config says to or if XML request says to + m_withWordAlignInfo = options()->output.PrintAlignmentInfo || check(params, "word-align"); + + si = params.find("weights"); + if (si != params.end()) + { + + boost::unordered_map map; + { + const vector &ffs = FeatureFunction::GetFeatureFunctions(); + BOOST_FOREACH(FeatureFunction* const& ff, ffs) { + map[ff->GetScoreProducerDescription()] = ff; + } + } + + string allValues = xmlrpc_c::value_string(si->second); + + BOOST_FOREACH(string values, TokenizeMultiCharSeparator(allValues, "\t")) { + + vector record = 
TokenizeMultiCharSeparator(values, "="); + + if (record.size() == 2) { + string featureName = record[0]; + string featureWeights = record[1]; + + boost::unordered_map::iterator ffi = map.find(featureName); + + if (ffi != map.end()) { + FeatureFunction* ff = ffi->second; + + size_t prevNumWeights = ff->GetNumScoreComponents(); + + vector ffWeights; + BOOST_FOREACH(string weight, TokenizeMultiCharSeparator(featureWeights, " ")) { + ffWeights.push_back(Scan(weight)); + } + + if (ffWeights.size() == ff->GetNumScoreComponents()) { + + // XXX: This is NOT thread-safe + Moses::StaticData::InstanceNonConst().SetWeights(ff, ffWeights); + VERBOSE(1, "WARNING: THIS IS NOT THREAD-SAFE!\tUpdating weights for " << featureName << " to " << featureWeights << "\n"); + + } else { + TRACE_ERR("ERROR: Unable to update weights for " << featureName << " because " << ff->GetNumScoreComponents() << " weights are required but only " << ffWeights.size() << " were provided\n"); + } + + } else { + TRACE_ERR("ERROR: No FeatureFunction with name " << featureName << ", no weight update\n"); + } + + } else { + TRACE_ERR("WARNING: XML-RPC weights update was improperly formatted:\t" << values << "\n"); + } + + } + + } + + + // // biased sampling for suffix-array-based sampling phrase table? 
+ // if ((si = params.find("bias")) != params.end()) + // { + // std::vector tmp + // = xmlrpc_c::value_array(si->second).cvalue(); + // for (size_t i = 1; i < tmp.size(); i += 2) + // m_bias[xmlrpc_c::value_int(tmp[i-1])] = xmlrpc_c::value_double(tmp[i]); + // } + if (is_syntax(m_options->search.algo)) { + m_source.reset(new Sentence(m_options,0,m_source_string)); + } else { + m_source.reset(new Sentence(m_options,0,m_source_string)); + } + interpret_dlt(); +} // end of Translationtask::parse_request() + + +void +TranslationRequest:: +run_chart_decoder() +{ + Moses::ChartManager manager(this->self()); + manager.Decode(); + + const Moses::ChartHypothesis *hypo = manager.GetBestHypothesis(); + ostringstream out; + if (hypo) outputChartHypo(out,hypo); + + m_target_string = out.str(); + m_retData["text"] = xmlrpc_c::value_string(m_target_string); + + if (m_withGraphInfo) { + std::ostringstream sgstream; + manager.OutputSearchGraphMoses(sgstream); + m_retData["sg"] = xmlrpc_c::value_string(sgstream.str()); + } +} // end of TranslationRequest::run_chart_decoder() + +void +TranslationRequest:: +pack_hypothesis(const Moses::Manager& manager, + vector const& edges, string const& key, + map & dest) const +{ + // target string + ostringstream target; + BOOST_REVERSE_FOREACH(Hypothesis const* e, edges) { + manager.OutputSurface(target, *e); + } + XVERBOSE(1, "BEST TRANSLATION: " << *(manager.GetBestHypothesis()) + << std::endl); + dest[key] = xmlrpc_c::value_string(target.str()); + + if (m_withAlignInfo) { + // if (options()->output.ReportSegmentation) { + // phrase alignment, if requested + + vector p_aln; + BOOST_REVERSE_FOREACH(Hypothesis const* e, edges) + add_phrase_aln_info(*e, p_aln); + dest["align"] = xmlrpc_c::value_array(p_aln); + } + + if (m_withWordAlignInfo) { + //if (options()->output.PrintAlignmentInfo) { + // word alignment, if requested + vector w_aln; + BOOST_REVERSE_FOREACH(Hypothesis const* e, edges) + e->OutputLocalWordAlignment(w_aln); + 
dest["word-align"] = xmlrpc_c::value_array(w_aln); + } +} + +void +TranslationRequest:: +pack_hypothesis(const Moses::Manager& manager, Hypothesis const* h, string const& key, + map& dest) const +{ + using namespace std; + vector edges; + for (; h; h = h->GetPrevHypo()) + edges.push_back(h); + pack_hypothesis(manager, edges, key, dest); +} + + +void +TranslationRequest:: +run_phrase_decoder() +{ + Manager manager(this->self()); + manager.Decode(); + pack_hypothesis(manager, manager.GetBestHypothesis(), "text", m_retData); + if (m_session_id) + m_retData["session-id"] = xmlrpc_c::value_int(m_session_id); + + if (m_withGraphInfo) insertGraphInfo(manager,m_retData); + if (m_withTopts) insertTranslationOptions(manager,m_retData); + if (m_options->nbest.nbest_size) outputNBest(manager, m_retData); + +} +} diff --git a/mosesdecoder/moses/server/Updater.cpp b/mosesdecoder/moses/server/Updater.cpp new file mode 100644 index 0000000000000000000000000000000000000000..095af383866f1710ae85e18e9e99f8a78cd36dfd --- /dev/null +++ b/mosesdecoder/moses/server/Updater.cpp @@ -0,0 +1,58 @@ +// -*- mode: c++; indent-tabs-mode: nil; tab-width:2 -*- +#include "Updater.h" + +namespace MosesServer +{ +using namespace Moses; +using namespace std; + +Updater:: +Updater() +{ + // signature and help strings are documentation -- the client + // can query this information with a system.methodSignature and + // system.methodHelp RPC. 
+ this->_signature = "S:S"; + this->_help = "Updates stuff"; +} + +void +Updater:: +execute(xmlrpc_c::paramList const& paramList, + xmlrpc_c::value * const retvalP) +{ +#if PT_UG + const params_t params = paramList.getStruct(0); + breakOutParams(params); + Mmsapt* pdsa = reinterpret_cast(PhraseDictionary::GetColl()[0]); + pdsa->add(m_src, m_trg, m_aln); + XVERBOSE(1,"Done inserting\n"); + *retvalP = xmlrpc_c::value_string("Phrase table updated"); +#endif +}; + +void +Updater:: +breakOutParams(const params_t& params) +{ + params_t::const_iterator si = params.find("source"); + if(si == params.end()) + throw xmlrpc_c::fault("Missing source sentence", + xmlrpc_c::fault::CODE_PARSE); + m_src = xmlrpc_c::value_string(si->second); + XVERBOSE(1,"source = " << m_src << endl); + si = params.find("target"); + if(si == params.end()) + throw xmlrpc_c::fault("Missing target sentence", + xmlrpc_c::fault::CODE_PARSE); + m_trg = xmlrpc_c::value_string(si->second); + XVERBOSE(1,"target = " << m_trg << endl); + if((si = params.find("alignment")) == params.end()) + throw xmlrpc_c::fault("Missing alignment", xmlrpc_c::fault::CODE_PARSE); + m_aln = xmlrpc_c::value_string(si->second); + XVERBOSE(1,"alignment = " << m_aln << endl); + m_bounded = ((si = params.find("bounded")) != params.end()); + m_add2ORLM = ((si = params.find("updateORLM")) != params.end()); +}; + +} diff --git a/mosesdecoder/moses/server/Updater.h b/mosesdecoder/moses/server/Updater.h new file mode 100644 index 0000000000000000000000000000000000000000..e3eba52ef1edfc746befb78bcad5175f95d4ff44 --- /dev/null +++ b/mosesdecoder/moses/server/Updater.h @@ -0,0 +1,44 @@ +// -*- c++ -*- +#pragma once + +#include "moses/Util.h" +#include "moses/ChartManager.h" +#include "moses/Hypothesis.h" +#include "moses/Manager.h" +#include "moses/StaticData.h" +#include "moses/ThreadPool.h" + +#if PT_UG +#include "moses/TranslationModel/UG/mmsapt.h" +#endif + +#include +#include +#include + + +namespace MosesServer +{ +class + Updater: 
public xmlrpc_c::method +{ + + typedef std::map params_t; + + + std::string m_src, m_trg, m_aln; + bool m_bounded, m_add2ORLM; + +public: + Updater(); + + void + execute(xmlrpc_c::paramList const& paramList, + xmlrpc_c::value * const retvalP); + + void + breakOutParams(const params_t& params); + +}; + +} diff --git a/mosesdecoder/util/bit_packing_test.cc b/mosesdecoder/util/bit_packing_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4494b69fa3bf42506ded3bdc206f77cbc45a486 --- /dev/null +++ b/mosesdecoder/util/bit_packing_test.cc @@ -0,0 +1,59 @@ +#include "util/bit_packing.hh" + +#define BOOST_TEST_MODULE BitPackingTest +#include + +#include + +namespace util { +namespace { + +const uint64_t test57 = 0x123456789abcdefULL; +const uint32_t test25 = 0x1234567; + +BOOST_AUTO_TEST_CASE(ZeroBit57) { + char mem[16]; + memset(mem, 0, sizeof(mem)); + WriteInt57(mem, 0, 57, test57); + BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1)); +} + +BOOST_AUTO_TEST_CASE(EachBit57) { + char mem[16]; + for (uint8_t b = 0; b < 8; ++b) { + memset(mem, 0, sizeof(mem)); + WriteInt57(mem, b, 57, test57); + BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); + } +} + +BOOST_AUTO_TEST_CASE(Consecutive57) { + char mem[57+8]; + memset(mem, 0, sizeof(mem)); + for (uint64_t b = 0; b < 57 * 8; b += 57) { + WriteInt57(mem, b, 57, test57); + BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); + } + for (uint64_t b = 0; b < 57 * 8; b += 57) { + BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); + } +} + +BOOST_AUTO_TEST_CASE(Consecutive25) { + char mem[25+8]; + memset(mem, 0, sizeof(mem)); + for (uint64_t b = 0; b < 25 * 8; b += 25) { + WriteInt25(mem, b, 25, test25); + BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1)); + } + for (uint64_t b = 0; b < 25 * 8; b += 25) { + BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1)); + } +} + +BOOST_AUTO_TEST_CASE(Sanity) { + 
BitPackingSanity(); +} + +} // namespace +} // namespace util diff --git a/mosesdecoder/util/ersatz_progress.hh b/mosesdecoder/util/ersatz_progress.hh new file mode 100644 index 0000000000000000000000000000000000000000..b47aded7d9c6ff8d3e3a248e3c6a0b0c2e075246 --- /dev/null +++ b/mosesdecoder/util/ersatz_progress.hh @@ -0,0 +1,57 @@ +#ifndef UTIL_ERSATZ_PROGRESS_H +#define UTIL_ERSATZ_PROGRESS_H + +#include +#include +#include + +// Ersatz version of boost::progress so core language model doesn't depend on +// boost. Also adds option to print nothing. + +namespace util { + +extern const char kProgressBanner[]; + +class ErsatzProgress { + public: + // No output. + ErsatzProgress(); + + // Null means no output. The null value is useful for passing along the ostream pointer from another caller. + explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = ""); + + ~ErsatzProgress(); + + ErsatzProgress &operator++() { + if (++current_ >= next_) Milestone(); + return *this; + } + + ErsatzProgress &operator+=(uint64_t amount) { + if ((current_ += amount) >= next_) Milestone(); + return *this; + } + + void Set(uint64_t to) { + if ((current_ = to) >= next_) Milestone(); + } + + void Finished() { + Set(complete_); + } + + private: + void Milestone(); + + uint64_t current_, next_, complete_; + unsigned char stones_written_; + std::ostream *out_; + + // noncopyable + ErsatzProgress(const ErsatzProgress &other); + ErsatzProgress &operator=(const ErsatzProgress &other); +}; + +} // namespace util + +#endif // UTIL_ERSATZ_PROGRESS_H diff --git a/mosesdecoder/util/exception.hh b/mosesdecoder/util/exception.hh new file mode 100644 index 0000000000000000000000000000000000000000..b30183e7f913d307cce23be78fb22ac564fe4b0b --- /dev/null +++ b/mosesdecoder/util/exception.hh @@ -0,0 +1,165 @@ +#ifndef UTIL_EXCEPTION_H +#define UTIL_EXCEPTION_H + +#include "util/string_stream.hh" + +#include +#include +#include +#include + +// TODO(hieu): delete 
this +#include + +namespace util { + +template typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data); + +class Exception : public std::exception { + public: + Exception() throw(); + virtual ~Exception() throw(); + + const char *what() const throw() { return what_.str().c_str(); } + + // For use by the UTIL_THROW macros. + void SetLocation( + const char *file, + unsigned int line, + const char *func, + const char *child_name, + const char *condition); + + private: + template friend typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data); + + // This helps restrict operator<< defined below. + template struct ExceptionTag { + typedef T Identity; + }; + + StringStream what_; +}; + +/* This implements the normal operator<< for Exception and all its children. + * SFINAE means it only applies to Exception. Think of this as an ersatz + * boost::enable_if. + */ +template typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data) { + // TODO(hieu): delete this. + std::stringstream moses_hack; + moses_hack << data; + e.what_ << moses_hack.str(); + return e; +} + +#ifdef __GNUC__ +#define UTIL_FUNC_NAME __PRETTY_FUNCTION__ +#else +#ifdef _WIN32 +#define UTIL_FUNC_NAME __FUNCTION__ +#else +#define UTIL_FUNC_NAME NULL +#endif +#endif + +/* Create an instance of Exception, add the message Modify, and throw it. + * Modify is appended to the what() message and can contain << for ostream + * operations. + * + * do .. while kludge to swallow trailing ; character + * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html . + * Arg can be a constructor argument to the exception. 
+ */ +#define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \ + Exception UTIL_e Arg; \ + UTIL_e.SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, #Exception, Condition); \ + UTIL_e << Modify; \ + throw UTIL_e; \ +} while (0) + +#define UTIL_THROW_ARG(Exception, Arg, Modify) \ + UTIL_THROW_BACKEND(NULL, Exception, Arg, Modify) + +#define UTIL_THROW(Exception, Modify) \ + UTIL_THROW_BACKEND(NULL, Exception, , Modify); + +#define UTIL_THROW2(Modify) \ + UTIL_THROW_BACKEND(NULL, util::Exception, , Modify); + +#if __GNUC__ >= 3 +#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0) +#else +#define UTIL_UNLIKELY(x) (x) +#endif + +#if __GNUC__ >= 3 +#define UTIL_LIKELY(x) __builtin_expect (!!(x), 1) +#else +#define UTIL_LIKELY(x) (x) +#endif + +#define UTIL_THROW_IF_ARG(Condition, Exception, Arg, Modify) do { \ + if (UTIL_UNLIKELY(Condition)) { \ + UTIL_THROW_BACKEND(#Condition, Exception, Arg, Modify); \ + } \ +} while (0) + +#define UTIL_THROW_IF(Condition, Exception, Modify) \ + UTIL_THROW_IF_ARG(Condition, Exception, , Modify) + +#define UTIL_THROW_IF2(Condition, Modify) \ + UTIL_THROW_IF_ARG(Condition, util::Exception, , Modify) + +// Exception that records errno and adds it to the message. +class ErrnoException : public Exception { + public: + ErrnoException() throw(); + + virtual ~ErrnoException() throw(); + + int Error() const throw() { return errno_; } + + private: + int errno_; +}; + +// file wasn't there, or couldn't be open for some reason +class FileOpenException : public Exception { + public: + FileOpenException() throw() {} + ~FileOpenException() throw() {} +}; + +// Utilities for overflow checking. +class OverflowException : public Exception { + public: + OverflowException() throw(); + ~OverflowException() throw(); +}; + +template inline std::size_t CheckOverflowInternal(uint64_t value) { + UTIL_THROW_IF(value > static_cast(std::numeric_limits::max()), OverflowException, "Integer overflow detected. 
This model is too big for 32-bit code."); + return value; +} + +template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) { + return value; +} + +inline std::size_t CheckOverflow(uint64_t value) { + return CheckOverflowInternal(value); +} + +#if defined(_WIN32) || defined(_WIN64) +/* Thrown for Windows specific operations. */ +class WindowsException : public Exception { + public: + WindowsException() throw(); + ~WindowsException() throw(); +}; +#endif + +} // namespace util + +#endif // UTIL_EXCEPTION_H diff --git a/mosesdecoder/util/fake_ostream.hh b/mosesdecoder/util/fake_ostream.hh new file mode 100644 index 0000000000000000000000000000000000000000..2f76053cc583525bf67f77c51afca0da588db4f3 --- /dev/null +++ b/mosesdecoder/util/fake_ostream.hh @@ -0,0 +1,111 @@ +#ifndef UTIL_FAKE_OSTREAM_H +#define UTIL_FAKE_OSTREAM_H + +#include "util/float_to_string.hh" +#include "util/integer_to_string.hh" +#include "util/string_piece.hh" + +#include +#include + +#include + +namespace util { + +/* Like std::ostream but without being incredibly slow. + * Supports most of the built-in types except for long double. + * + * The FakeOStream class is intended to be inherited from. The inheriting class + * should provide: + * public: + * Derived &flush(); + * Derived &write(const void *data, std::size_t length); + * + * private: or protected: + * friend class FakeOStream; + * char *Ensure(std::size_t amount); + * void AdvanceTo(char *to); + * + * The Ensure function makes enough space for an in-place write and returns + * where to write. The AdvanceTo function happens after the write, saying how + * much was actually written. + * + * Precondition: + * amount <= kToStringMaxBytes for in-place writes. + */ +template class FakeOStream { + public: + FakeOStream() {} + + // This also covers std::string and char* + Derived &operator<<(StringPiece str) { + return C().write(str.data(), str.size()); + } + + // Handle integers by size and signedness.
+ private: + template struct EnableIfKludge { + typedef Derived type; + }; + template ::is_signed, bool IsInteger = std::numeric_limits::is_integer> struct Coerce {}; + + template struct Coerce { typedef uint16_t To; }; + template struct Coerce { typedef uint32_t To; }; + template struct Coerce { typedef uint64_t To; }; + + template struct Coerce { typedef int16_t To; }; + template struct Coerce { typedef int32_t To; }; + template struct Coerce { typedef int64_t To; }; + public: + template typename EnableIfKludge::To>::type &operator<<(const From value) { + return CallToString(static_cast::To>(value)); + } + + // Character types that get copied as bytes instead of displayed as integers. + Derived &operator<<(char val) { return put(val); } + Derived &operator<<(signed char val) { return put(static_cast(val)); } + Derived &operator<<(unsigned char val) { return put(static_cast(val)); } + + Derived &operator<<(bool val) { return put(val + '0'); } + // enums will fall back to int but are not caught by the template. + Derived &operator<<(int val) { return CallToString(static_cast::To>(val)); } + + Derived &operator<<(float val) { return CallToString(val); } + Derived &operator<<(double val) { return CallToString(val); } + + // This is here to catch all the other pointer types. + Derived &operator<<(const void *value) { return CallToString(value); } + // This is here because the above line also catches const char*. + Derived &operator<<(const char *value) { return *this << StringPiece(value); } + Derived &operator<<(char *value) { return *this << StringPiece(value); } + + Derived &put(char val) { + char *c = C().Ensure(1); + *c = val; + C().AdvanceTo(++c); + return C(); + } + + char widen(char val) const { return val; } + + private: + // References to derived class for convenience. 
+ Derived &C() { + return *static_cast(this); + } + + const Derived &C() const { + return *static_cast(this); + } + + // This is separate to prevent an infinite loop if the compiler considers + // types the same (i.e. gcc std::size_t and uint64_t or uint32_t). + template Derived &CallToString(const T value) { + C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf::kBytes))); + return C(); + } +}; + +} // namespace + +#endif // UTIL_FAKE_OSTREAM_H diff --git a/mosesdecoder/util/file_piece.hh b/mosesdecoder/util/file_piece.hh new file mode 100644 index 0000000000000000000000000000000000000000..d3d83054d6e6ccc06970b82708d45e69433ca80c --- /dev/null +++ b/mosesdecoder/util/file_piece.hh @@ -0,0 +1,175 @@ +#ifndef UTIL_FILE_PIECE_H +#define UTIL_FILE_PIECE_H + +#include "util/ersatz_progress.hh" +#include "util/exception.hh" +#include "util/file.hh" +#include "util/mmap.hh" +#include "util/read_compressed.hh" +#include "util/string_piece.hh" + +#include +#include +#include +#include +#include + +namespace util { + +class ParseNumberException : public Exception { + public: + explicit ParseNumberException(StringPiece value) throw(); + ~ParseNumberException() throw() {} +}; + +extern const bool kSpaces[256]; + +// Memory backing the returned StringPiece may vanish on the next call. +class FilePiece { + public: + // 1 MB default. + explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); + // Takes ownership of fd. name is used for messages. + explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); + + /* Read from an istream. Don't use this if you can avoid it. Raw fd IO is + * much faster. But sometimes you just have an istream like Boost's HTTP + * server and want to parse it the same way. + * name is just used for messages and FileName(). 
+ */ + explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576); + + ~FilePiece(); + + char get() { + if (position_ == position_end_) { + Shift(); + if (at_end_) throw EndOfFileException(); + } + return *(position_++); + } + + // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace(). + StringPiece ReadDelimited(const bool *delim = kSpaces) { + SkipSpaces(delim); + return Consume(FindDelimiterOrEOF(delim)); + } + + /// Read word until the line or file ends. + bool ReadWordSameLine(StringPiece &to, const bool *delim = kSpaces) { + assert(delim[static_cast('\n')]); + // Skip non-enter spaces. + for (; ; ++position_) { + if (position_ == position_end_) { + try { + Shift(); + } catch (const util::EndOfFileException &e) { return false; } + // And break out at end of file. + if (position_ == position_end_) return false; + } + if (!delim[static_cast(*position_)]) break; + if (*position_ == '\n') return false; + } + // We can't be at the end of file because there's at least one character open. + to = Consume(FindDelimiterOrEOF(delim)); + return true; + } + + /** Read a line of text from the file. + * + * Unlike ReadDelimited, this includes leading spaces and consumes the + * delimiter. It is similar to getline in that way. + * + * If strip_cr is true, any trailing carriage return (as would be found on + * a file written on Windows) will be left out of the returned line. + * + * Throws EndOfFileException if the end of the file is encountered. If the + * file does not end in a newline, this could mean that the last line is + * never read. + */ + StringPiece ReadLine(char delim = '\n', bool strip_cr = true); + + /** Read a line of text from the file, or return false on EOF. + * + * This is like ReadLine, except it returns false where ReadLine throws + * EndOfFileException. Like ReadLine it may not read the last line in the + * file if the file does not end in a newline.
+ * + * If strip_cr is true, any trailing carriage return (as would be found on + * a file written on Windows) will be left out of the returned line. + */ + bool ReadLineOrEOF(StringPiece &to, char delim = '\n', bool strip_cr = true); + + float ReadFloat(); + double ReadDouble(); + long int ReadLong(); + unsigned long int ReadULong(); + + // Skip spaces defined by isspace. + void SkipSpaces(const bool *delim = kSpaces) { + assert(position_ <= position_end_); + for (; ; ++position_) { + if (position_ == position_end_) { + Shift(); + // And break out at end of file. + if (position_ == position_end_) return; + } + assert(position_ < position_end_); + if (!delim[static_cast(*position_)]) return; + } + } + + uint64_t Offset() const { + return position_ - data_.begin() + mapped_offset_; + } + + const std::string &FileName() const { return file_name_; } + + private: + void InitializeNoRead(const char *name, std::size_t min_buffer); + // Calls InitializeNoRead, so don't call both. + void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); + + template T ReadNumber(); + + StringPiece Consume(const char *to) { + assert(to >= position_); + StringPiece ret(position_, to - position_); + position_ = to; + return ret; + } + + const char *FindDelimiterOrEOF(const bool *delim = kSpaces); + + void Shift(); + // Backends to Shift(). + void MMapShift(uint64_t desired_begin); + + void TransitionToRead(); + void ReadShift(); + + const char *position_, *last_space_, *position_end_; + + scoped_fd file_; + const uint64_t total_size_; + const uint64_t page_; + + std::size_t default_map_size_; + uint64_t mapped_offset_; + + // Order matters: file_ should always be destroyed after this.
+ scoped_memory data_; + + bool at_end_; + bool fallback_to_read_; + + ErsatzProgress progress_; + + std::string file_name_; + + ReadCompressed fell_back_; +}; + +} // namespace util + +#endif // UTIL_FILE_PIECE_H diff --git a/mosesdecoder/util/file_piece_test.cc b/mosesdecoder/util/file_piece_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d03cd312d0d7622fdee3af09e5063fb5d7591948 --- /dev/null +++ b/mosesdecoder/util/file_piece_test.cc @@ -0,0 +1,154 @@ +// Tests might fail if you have creative characters in your path. Sue me. +#include "util/file_piece.hh" + +#include "util/file_stream.hh" +#include "util/file.hh" +#include "util/scoped.hh" + +#define BOOST_TEST_MODULE FilePieceTest +#include +#include +#include +#include +#include +#include + +namespace util { +namespace { + +std::string FileLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "file_piece.cc"; + } + std::string ret(boost::unit_test::framework::master_test_suite().argv[1]); + return ret; +} + +/* istream */ +BOOST_AUTO_TEST_CASE(IStream) { + std::fstream ref(FileLocation().c_str(), std::ios::in); + std::fstream backing(FileLocation().c_str(), std::ios::in); + FilePiece test(backing); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + BOOST_CHECK_EQUAL(ref_line, test_line); + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} + +/* mmap implementation */ +BOOST_AUTO_TEST_CASE(MMapReadLine) { + std::fstream ref(FileLocation().c_str(), std::ios::in); + FilePiece test(FileLocation().c_str(), NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), 
EndOfFileException); +} + +#if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__) +/* Apple isn't happy with the popen, fileno, dup. And I don't want to + * reimplement popen. This is an issue with the test. + */ +/* read() implementation */ +BOOST_AUTO_TEST_CASE(StreamReadLine) { + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string popen_args = "cat \""; + popen_args += FileLocation(); + popen_args += '"'; + + FILE *catter = popen(popen_args.c_str(), "r"); + BOOST_REQUIRE(catter); + + FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); + BOOST_REQUIRE(!pclose(catter)); +} +#endif + +#ifdef HAVE_ZLIB + +// gzip file +BOOST_AUTO_TEST_CASE(PlainZipReadLine) { + std::string location(FileLocation()); + std::fstream ref(location.c_str(), std::ios::in); + + std::string command("gzip <\""); + command += location + "\" >\"" + location + "\".gz"; + + BOOST_REQUIRE_EQUAL(0, system(command.c_str())); + FilePiece test((location + ".gz").c_str(), NULL, 1); + unlink((location + ".gz").c_str()); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); +} + +// gzip stream. Apple doesn't like popen, fileno, dup. This is an issue with +// the test. 
+#if !defined __APPLE__ && !defined __MINGW32__ +BOOST_AUTO_TEST_CASE(StreamZipReadLine) { + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string command("gzip <\""); + command += FileLocation() + "\""; + + FILE * catter = popen(command.c_str(), "r"); + BOOST_REQUIRE(catter); + + FilePiece test(dup(fileno(catter)), "file_piece.cc.gz", NULL, 1); + std::string ref_line; + while (getline(ref, ref_line)) { + StringPiece test_line(test.ReadLine()); + // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 + if (!test_line.empty() || !ref_line.empty()) { + BOOST_CHECK_EQUAL(ref_line, test_line); + } + } + BOOST_CHECK_THROW(test.get(), EndOfFileException); + BOOST_REQUIRE(!pclose(catter)); +} +#endif // __APPLE__ + +#endif // HAVE_ZLIB + +BOOST_AUTO_TEST_CASE(Numbers) { + scoped_fd file(MakeTemp(FileLocation())); + const float floating = 3.2; + { + util::FileStream writing(file.get()); + writing << "94389483984398493890287 " << floating << " 5"; + } + SeekOrThrow(file.get(), 0); + util::FilePiece f(file.release()); + BOOST_CHECK_THROW(f.ReadULong(), ParseNumberException); + BOOST_CHECK_EQUAL("94389483984398493890287", f.ReadDelimited()); + // Yes, exactly equal. Isn't double-conversion wonderful? 
+ BOOST_CHECK_EQUAL(floating, f.ReadFloat()); + BOOST_CHECK_EQUAL(5, f.ReadULong()); +} + +} // namespace +} // namespace util diff --git a/mosesdecoder/util/generator.hh b/mosesdecoder/util/generator.hh new file mode 100644 index 0000000000000000000000000000000000000000..afa0db611c41dc6b9f33472b1e6ed54661f195d5 --- /dev/null +++ b/mosesdecoder/util/generator.hh @@ -0,0 +1,34 @@ +#pragma once + +// generator/continuation for C++ +// author: Andrew Fedoniouk @ terrainformatica.com +// idea borrowed from: "coroutines in C" Simon Tatham, +// http://www.chiark.greenend.org.uk/~sgtatham/coroutines.html +// BSD license + +template + struct _generator + { + T* _stack; + int _line; + _generator():_stack(0), _line(-1) {} + void _push() { T* n = new T; *n = *static_cast(this); _stack = n; } + bool _pop() { if(!_stack) return false; T* t = _stack; *static_cast(this) = *_stack; t->_stack = 0; delete t; return true; } + ~_generator() { while(_pop()); } + }; + + #define $generator(NAME) struct NAME : public _generator + + #define $emit(T) bool operator()(T& _rv) { \ + if(_line < 0) _line=0; \ + $START: switch(_line) { case 0:; + + #define $stop } _line = 0; if(_pop()) goto $START; return false; } + + #define $restart(WITH) { _push(); _stack->_line = __LINE__; _line=0; WITH; goto $START; case __LINE__:; } + + #define $yield(V) \ + do {\ + _line=__LINE__;\ + _rv = (V); return true; case __LINE__:;\ + } while (0) diff --git a/mosesdecoder/util/getopt.c b/mosesdecoder/util/getopt.c new file mode 100644 index 0000000000000000000000000000000000000000..50eef42cc25f0e0da6b6ffa00f04a92b1a4383c6 --- /dev/null +++ b/mosesdecoder/util/getopt.c @@ -0,0 +1,78 @@ +/* +POSIX getopt for Windows + +AT&T Public License + +Code given out at the 1985 UNIFORUM conference in Dallas. 
+*/ + +#ifndef __GNUC__ + +#include "getopt.hh" +#include +#include + +#define NULL 0 +#define EOF (-1) +#define ERR(s, c) if(opterr){\ + char errbuf[2];\ + errbuf[0] = c; errbuf[1] = '\n';\ + fputs(argv[0], stderr);\ + fputs(s, stderr);\ + fputc(c, stderr);} + //(void) write(2, argv[0], (unsigned)strlen(argv[0]));\ + //(void) write(2, s, (unsigned)strlen(s));\ + //(void) write(2, errbuf, 2);} + +int opterr = 1; +int optind = 1; +int optopt; +char *optarg; + +int +getopt(argc, argv, opts) +int argc; +char **argv, *opts; +{ + static int sp = 1; + register int c; + register char *cp; + + if(sp == 1) + if(optind >= argc || + argv[optind][0] != '-' || argv[optind][1] == '\0') + return(EOF); + else if(strcmp(argv[optind], "--") == NULL) { + optind++; + return(EOF); + } + optopt = c = argv[optind][sp]; + if(c == ':' || (cp=strchr(opts, c)) == NULL) { + ERR(": illegal option -- ", c); + if(argv[optind][++sp] == '\0') { + optind++; + sp = 1; + } + return('?'); + } + if(*++cp == ':') { + if(argv[optind][sp+1] != '\0') + optarg = &argv[optind++][sp+1]; + else if(++optind >= argc) { + ERR(": option requires an argument -- ", c); + sp = 1; + return('?'); + } else + optarg = argv[optind++]; + sp = 1; + } else { + if(argv[optind][++sp] == '\0') { + sp = 1; + optind++; + } + optarg = NULL; + } + return(c); +} + +#endif /* __GNUC__ */ diff --git a/mosesdecoder/util/integer_to_string_test.cc b/mosesdecoder/util/integer_to_string_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..136c88f62111e13580c696e63ea51da5da799b06 --- /dev/null +++ b/mosesdecoder/util/integer_to_string_test.cc @@ -0,0 +1,81 @@ +#define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE +#include "util/integer_to_string.hh" +#include "util/string_piece.hh" + +#define BOOST_TEST_MODULE IntegerToStringTest +#include +#include + +#include + +namespace util { +namespace { + +template void TestValue(const T value) { + char buf[ToStringBuf::kBytes]; + StringPiece result(buf, ToString(value, buf) - buf); + 
BOOST_REQUIRE_GE(static_cast(ToStringBuf::kBytes), result.size()); + if (value) { + BOOST_CHECK_EQUAL(boost::lexical_cast(value), result); + } else { + // Platforms can do void * as 0x0 or 0. + BOOST_CHECK(result == "0x0" || result == "0"); + } +} + +template void TestCorners() { + TestValue(std::numeric_limits::min()); + TestValue(std::numeric_limits::max()); + TestValue((T)0); + TestValue((T)-1); + TestValue((T)1); +} + +BOOST_AUTO_TEST_CASE(Corners) { + TestCorners(); + TestCorners(); + TestCorners(); + TestCorners(); + TestCorners(); + TestCorners(); + TestCorners(); +} + +template void TestAll() { + for (T i = std::numeric_limits::min(); i < std::numeric_limits::max(); ++i) { + TestValue(i); + } + TestValue(std::numeric_limits::max()); +} + +BOOST_AUTO_TEST_CASE(Short) { + TestAll(); + TestAll(); +} + +template void Test10s() { + for (T i = 1; i < std::numeric_limits::max() / 10; i *= 10) { + TestValue(i); + TestValue(i - 1); + TestValue(i + 1); + } +} + +BOOST_AUTO_TEST_CASE(Tens) { + Test10s(); + Test10s(); + Test10s(); + Test10s(); +} + +BOOST_AUTO_TEST_CASE(Pointers) { + for (uintptr_t i = 1; i < std::numeric_limits::max() / 10; i *= 10) { + TestValue((const void*)i); + } + for (uintptr_t i = 0; i < 256; ++i) { + TestValue((const void*)i); + TestValue((const void*)(i + 0xf00)); + } +} + +}} // namespaces diff --git a/mosesdecoder/util/joint_sort.hh b/mosesdecoder/util/joint_sort.hh new file mode 100644 index 0000000000000000000000000000000000000000..de4b554ff68c1ff305fe0ea6ea4375d1fdf6cbd6 --- /dev/null +++ b/mosesdecoder/util/joint_sort.hh @@ -0,0 +1,146 @@ +#ifndef UTIL_JOINT_SORT_H +#define UTIL_JOINT_SORT_H + +/* A terrifying amount of C++ to coax std::sort into soring one range while + * also permuting another range the same way. 
+ */ + +#include "util/proxy_iterator.hh" + +#include +#include + +namespace util { + +namespace detail { + +template class JointProxy; + +template class JointIter { + public: + JointIter() {} + + JointIter(const KeyIter &key_iter, const ValueIter &value_iter) : key_(key_iter), value_(value_iter) {} + + bool operator==(const JointIter &other) const { return key_ == other.key_; } + + bool operator<(const JointIter &other) const { return (key_ < other.key_); } + + std::ptrdiff_t operator-(const JointIter &other) const { return key_ - other.key_; } + + JointIter &operator+=(std::ptrdiff_t amount) { + key_ += amount; + value_ += amount; + return *this; + } + + friend void swap(JointIter &first, JointIter &second) { + using std::swap; + swap(first.key_, second.key_); + swap(first.value_, second.value_); + } + + void DeepSwap(JointIter &other) { + using std::swap; + swap(*key_, *other.key_); + swap(*value_, *other.value_); + } + + private: + friend class JointProxy; + KeyIter key_; + ValueIter value_; +}; + +template class JointProxy { + private: + typedef JointIter InnerIterator; + + public: + typedef struct { + typename std::iterator_traits::value_type key; + typename std::iterator_traits::value_type value; + const typename std::iterator_traits::value_type &GetKey() const { return key; } + } value_type; + + JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {} + JointProxy(const JointProxy &other) : inner_(other.inner_) {} + + operator value_type() const { + value_type ret; + ret.key = *inner_.key_; + ret.value = *inner_.value_; + return ret; + } + + JointProxy &operator=(const JointProxy &other) { + *inner_.key_ = *other.inner_.key_; + *inner_.value_ = *other.inner_.value_; + return *this; + } + + JointProxy &operator=(const value_type &other) { + *inner_.key_ = other.key; + *inner_.value_ = other.value; + return *this; + } + + typename std::iterator_traits::reference GetKey() const { + return *(inner_.key_); + } + + friend 
void swap(JointProxy first, JointProxy second) { + first.Inner().DeepSwap(second.Inner()); + } + + private: + friend class ProxyIterator >; + + InnerIterator &Inner() { return inner_; } + const InnerIterator &Inner() const { return inner_; } + InnerIterator inner_; +}; + +template class LessWrapper : public std::binary_function { + public: + explicit LessWrapper(const Less &less) : less_(less) {} + + bool operator()(const Proxy &left, const Proxy &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const Proxy &left, const typename Proxy::value_type &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const typename Proxy::value_type &left, const Proxy &right) const { + return less_(left.GetKey(), right.GetKey()); + } + bool operator()(const typename Proxy::value_type &left, const typename Proxy::value_type &right) const { + return less_(left.GetKey(), right.GetKey()); + } + + private: + const Less less_; +}; + +} // namespace detail + +template class PairedIterator : public ProxyIterator > { + public: + PairedIterator(const KeyIter &key, const ValueIter &value) : + ProxyIterator >(detail::JointProxy(key, value)) {} +}; + +template void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin, const Less &less) { + ProxyIterator > full_begin(detail::JointProxy(key_begin, value_begin)); + detail::LessWrapper, Less> less_wrap(less); + std::sort(full_begin, full_begin + (key_end - key_begin), less_wrap); +} + + +template void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin) { + JointSort(key_begin, key_end, value_begin, std::less::value_type>()); +} + +} // namespace util + +#endif // UTIL_JOINT_SORT_H diff --git a/mosesdecoder/util/mmap.hh b/mosesdecoder/util/mmap.hh new file mode 100644 index 0000000000000000000000000000000000000000..b474dc75ba619d94bfa0edd5d5bfab47d0dfb271 --- /dev/null +++ b/mosesdecoder/util/mmap.hh @@ -0,0 
+1,225 @@ +#ifndef UTIL_MMAP_H +#define UTIL_MMAP_H +// Utilities for mmaped files. + +#include +#include + +#include +#include + +namespace util { + +class scoped_fd; + +std::size_t SizePage(); + +// (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here. +class scoped_mmap { + public: + scoped_mmap() : data_((void*)-1), size_(0) {} + scoped_mmap(void *data, std::size_t size) : data_(data), size_(size) {} + ~scoped_mmap(); + + void *get() const { return data_; } + + const uint8_t *begin() const { return reinterpret_cast(data_); } + const uint8_t *end() const { return reinterpret_cast(data_) + size_; } + std::size_t size() const { return size_; } + + void reset(void *data, std::size_t size) { + scoped_mmap other(data_, size_); + data_ = data; + size_ = size; + } + + void reset() { + reset((void*)-1, 0); + } + + void *steal() { + void *ret = data_; + data_ = (void*)-1; + size_ = 0; + return ret; + } + + private: + void *data_; + std::size_t size_; + + scoped_mmap(const scoped_mmap &); + scoped_mmap &operator=(const scoped_mmap &); +}; + +/* For when the memory might come from mmap, new char[], or malloc. Uses NULL + * and 0 for blanks even though mmap signals errors with (void*)-1). The reset + * function checks that blank for mmap. + */ +class scoped_memory { + public: + typedef enum { + MMAP_ROUND_UP_ALLOCATED, // The size was rounded up to a multiple of page size. Do the same before munmap. + MMAP_ALLOCATED, // munmap + MALLOC_ALLOCATED, // free + NONE_ALLOCATED // nothing here! 
+ } Alloc; + + scoped_memory(void *data, std::size_t size, Alloc source) + : data_(data), size_(size), source_(source) {} + + scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {} + + // Calls HugeMalloc + scoped_memory(std::size_t to, bool zero_new); + + ~scoped_memory() { reset(); } + + void *get() const { return data_; } + const char *begin() const { return reinterpret_cast(data_); } + const char *end() const { return reinterpret_cast(data_) + size_; } + std::size_t size() const { return size_; } + + Alloc source() const { return source_; } + + void reset() { reset(NULL, 0, NONE_ALLOCATED); } + + void reset(void *data, std::size_t size, Alloc from); + + void *steal() { + void *ret = data_; + data_ = NULL; + size_ = 0; + source_ = NONE_ALLOCATED; + return ret; + } + + private: + void *data_; + std::size_t size_; + + Alloc source_; + + scoped_memory(const scoped_memory &); + scoped_memory &operator=(const scoped_memory &); +}; + +extern const int kFileFlags; + +// Cross-platform, error-checking wrapper for mmap(). +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0); + +// msync wrapper +void SyncOrThrow(void *start, size_t length); + +// Cross-platform, error-checking wrapper for munmap(). +void UnmapOrThrow(void *start, size_t length); + +// Allocate memory, promising that all/vast majority of it will be used. Tries +// hard to use huge pages on Linux. +// If you want zeroed memory, pass zeroed = true. +void HugeMalloc(std::size_t size, bool zeroed, scoped_memory &to); + +// Reallocates memory ala realloc but with option to zero the new memory. +// On Linux, the memory can come from anonymous mmap or malloc/calloc. +// On non-Linux, only malloc/calloc is supported. +// +// To summarize, any memory from HugeMalloc or HugeRealloc can be resized with +// this. 
+void HugeRealloc(std::size_t size, bool new_zeroed, scoped_memory &mem); + +typedef enum { + // mmap with no prepopulate + LAZY, + // On linux, pass MAP_POPULATE to mmap. + POPULATE_OR_LAZY, + // Populate on Linux. malloc and read on non-Linux. + POPULATE_OR_READ, + // malloc and read. + READ, + // malloc and read in parallel (recommended for Lustre) + PARALLEL_READ, +} LoadMethod; + +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out); + +// Open file name with mmap of size bytes, all of which are initially zero. +void *MapZeroedWrite(int fd, std::size_t size); +void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file); + +// Forward rolling memory map with no overlap. +class Rolling { + public: + Rolling() {} + + explicit Rolling(void *data) { Init(data); } + + Rolling(const Rolling ©_from, uint64_t increase = 0); + Rolling &operator=(const Rolling ©_from); + + // For an actual rolling mmap. + explicit Rolling(int fd, bool for_write, std::size_t block, std::size_t read_bound, uint64_t offset, uint64_t amount); + + // For a static mapping + void Init(void *data) { + ptr_ = data; + current_end_ = std::numeric_limits::max(); + current_begin_ = 0; + // Mark as a pass-through. + fd_ = -1; + } + + void IncreaseBase(uint64_t by) { + file_begin_ += by; + ptr_ = static_cast(ptr_) + by; + if (!IsPassthrough()) current_end_ = 0; + } + + void DecreaseBase(uint64_t by) { + file_begin_ -= by; + ptr_ = static_cast(ptr_) - by; + if (!IsPassthrough()) current_end_ = 0; + } + + void *ExtractNonRolling(scoped_memory &out, uint64_t index, std::size_t size); + + // Returns base pointer + void *get() const { return ptr_; } + + // Returns base pointer. + void *CheckedBase(uint64_t index) { + if (index >= current_end_ || index < current_begin_) { + Roll(index); + } + return ptr_; + } + + // Returns indexed pointer. 
+ void *CheckedIndex(uint64_t index) { + return static_cast(CheckedBase(index)) + index; + } + + private: + void Roll(uint64_t index); + + // True if this is just a thin wrapper on a pointer. + bool IsPassthrough() const { return fd_ == -1; } + + void *ptr_; + uint64_t current_begin_; + uint64_t current_end_; + + scoped_memory mem_; + + int fd_; + uint64_t file_begin_; + uint64_t file_end_; + + bool for_write_; + std::size_t block_; + std::size_t read_bound_; +}; + +} // namespace util + +#endif // UTIL_MMAP_H diff --git a/mosesdecoder/util/multi_intersection.hh b/mosesdecoder/util/multi_intersection.hh new file mode 100644 index 0000000000000000000000000000000000000000..73954608e6e8118e7dd4679e437d695599f7e9df --- /dev/null +++ b/mosesdecoder/util/multi_intersection.hh @@ -0,0 +1,80 @@ +#ifndef UTIL_MULTI_INTERSECTION_H +#define UTIL_MULTI_INTERSECTION_H + +#include +#include + +#include +#include +#include + +namespace util { + +namespace detail { +template struct RangeLessBySize : public std::binary_function { + bool operator()(const Range &left, const Range &right) const { + return left.size() < right.size(); + } +}; + +/* Takes sets specified by their iterators and a boost::optional containing + * the lowest intersection if any. Each set must be sorted in increasing + * order. sets is changed to truncate the beginning of each sequence to the + * location of the match or an empty set. Precondition: sets is not empty + * since the intersection over null is the universe and this function does not + * know the universe. + */ +template boost::optional::value_type> FirstIntersectionSorted(std::vector > &sets, const Less &less = std::less::value_type>()) { + typedef std::vector > Sets; + typedef typename std::iterator_traits::value_type Value; + + assert(!sets.empty()); + + if (sets.front().empty()) return boost::optional(); + // Possibly suboptimal to copy for general Value; makes unsigned int go slightly faster. 
+ Value highest(sets.front().front()); + for (typename Sets::iterator i(sets.begin()); i != sets.end(); ) { + i->advance_begin(std::lower_bound(i->begin(), i->end(), highest, less) - i->begin()); + if (i->empty()) return boost::optional(); + if (less(highest, i->front())) { + highest = i->front(); + // start over + i = sets.begin(); + } else { + ++i; + } + } + return boost::optional(highest); +} + +} // namespace detail + +template boost::optional::value_type> FirstIntersection(std::vector > &sets, const Less less) { + assert(!sets.empty()); + + std::sort(sets.begin(), sets.end(), detail::RangeLessBySize >()); + return detail::FirstIntersectionSorted(sets, less); +} + +template boost::optional::value_type> FirstIntersection(std::vector > &sets) { + return FirstIntersection(sets, std::less::value_type>()); +} + +template void AllIntersection(std::vector > &sets, Output &out, const Less less) { + typedef typename std::iterator_traits::value_type Value; + assert(!sets.empty()); + + std::sort(sets.begin(), sets.end(), detail::RangeLessBySize >()); + boost::optional ret; + for (boost::optional ret; (ret = detail::FirstIntersectionSorted(sets, less)); sets.front().advance_begin(1)) { + out(*ret); + } +} + +template void AllIntersection(std::vector > &sets, Output &out) { + AllIntersection(sets, out, std::less::value_type>()); +} + +} // namespace util + +#endif // UTIL_MULTI_INTERSECTION_H diff --git a/mosesdecoder/util/murmur_hash.hh b/mosesdecoder/util/murmur_hash.hh new file mode 100644 index 0000000000000000000000000000000000000000..f17157cd926ab763cd388758a907e684f971ed0e --- /dev/null +++ b/mosesdecoder/util/murmur_hash.hh @@ -0,0 +1,18 @@ +#ifndef UTIL_MURMUR_HASH_H +#define UTIL_MURMUR_HASH_H +#include +#include + +namespace util { + +// 64-bit machine version +uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0); +// 32-bit machine version (not the same function as above) +uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t 
seed = 0); +// Use the version for this arch. Because the values differ across +// architectures, really only use it for in-memory structures. +uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0); + +} // namespace util + +#endif // UTIL_MURMUR_HASH_H diff --git a/mosesdecoder/util/parallel_read.hh b/mosesdecoder/util/parallel_read.hh new file mode 100644 index 0000000000000000000000000000000000000000..1e96e79035a93a4a669a9d7d7bd14b146e0cb96a --- /dev/null +++ b/mosesdecoder/util/parallel_read.hh @@ -0,0 +1,16 @@ +#ifndef UTIL_PARALLEL_READ__ +#define UTIL_PARALLEL_READ__ + +/* Read pieces of a file in parallel. This has a very specific use case: + * reading files from Lustre is CPU bound so multiple threads actually + * increases throughput. Speed matters when an LM takes a terabyte. + */ + +#include +#include + +namespace util { +void ParallelRead(int fd, void *to, std::size_t amount, uint64_t offset); +} // namespace util + +#endif // UTIL_PARALLEL_READ__ diff --git a/mosesdecoder/util/pcqueue.hh b/mosesdecoder/util/pcqueue.hh new file mode 100644 index 0000000000000000000000000000000000000000..05c868fba5ccf6c5f5111b1d5fbd4922d88a82c7 --- /dev/null +++ b/mosesdecoder/util/pcqueue.hh @@ -0,0 +1,156 @@ +#ifndef UTIL_PCQUEUE_H +#define UTIL_PCQUEUE_H + +#include "util/exception.hh" + +#include +#include +#include +#include + +#include + +#ifdef __APPLE__ +#include +#include +#include +#include +#endif // __APPLE__ + +namespace util { + +/* OS X Maverick and Boost interprocess were doing "Function not implemented." + * So this is my own wrapper around the mach kernel APIs. 
+ */ +#ifdef __APPLE__ + +#define MACH_CALL(call) UTIL_THROW_IF(KERN_SUCCESS != (call), Exception, "Mach call failure") + +class Semaphore { + public: + explicit Semaphore(int value) : task_(mach_task_self()) { + MACH_CALL(semaphore_create(task_, &back_, SYNC_POLICY_FIFO, value)); + } + + ~Semaphore() { + MACH_CALL(semaphore_destroy(task_, back_)); + } + + void wait() { + MACH_CALL(semaphore_wait(back_)); + } + + void post() { + MACH_CALL(semaphore_signal(back_)); + } + + private: + semaphore_t back_; + task_t task_; +}; + +inline void WaitSemaphore(Semaphore &semaphore) { + semaphore.wait(); +} + +#else +typedef boost::interprocess::interprocess_semaphore Semaphore; + +inline void WaitSemaphore (Semaphore &on) { + while (1) { + try { + on.wait(); + break; + } + catch (boost::interprocess::interprocess_exception &e) { + if (e.get_native_error() != EINTR) { + throw; + } + } + } +} + +#endif // __APPLE__ + +/** + * Producer consumer queue safe for multiple producers and multiple consumers. + * T must be default constructable and have operator=. + * The value is copied twice for Consume(T &out) or three times for Consume(), + * so larger objects should be passed via pointer. + * Strong exception guarantee if operator= throws. Undefined if semaphores throw. + */ +template class PCQueue : boost::noncopyable { + public: + explicit PCQueue(size_t size) + : empty_(size), used_(0), + storage_(new T[size]), + end_(storage_.get() + size), + produce_at_(storage_.get()), + consume_at_(storage_.get()) {} + + // Add a value to the queue. + void Produce(const T &val) { + WaitSemaphore(empty_); + { + boost::unique_lock produce_lock(produce_at_mutex_); + try { + *produce_at_ = val; + } + catch (...) { + empty_.post(); + throw; + } + if (++produce_at_ == end_) produce_at_ = storage_.get(); + } + used_.post(); + } + + // Consume a value, assigning it to out. 
+ T& Consume(T &out) { + WaitSemaphore(used_); + { + boost::unique_lock consume_lock(consume_at_mutex_); + try { + out = *consume_at_; + } + catch (...) { + used_.post(); + throw; + } + if (++consume_at_ == end_) consume_at_ = storage_.get(); + } + empty_.post(); + return out; + } + + // Convenience version of Consume that copies the value to return. + // The other version is faster. + T Consume() { + T ret; + Consume(ret); + return ret; + } + + private: + // Number of empty spaces in storage_. + Semaphore empty_; + // Number of occupied spaces in storage_. + Semaphore used_; + + boost::scoped_array storage_; + + T *const end_; + + // Index for next write in storage_. + T *produce_at_; + boost::mutex produce_at_mutex_; + + // Index for next read from storage_. + T *consume_at_; + boost::mutex consume_at_mutex_; + +}; + +} // namespace util + +#endif // UTIL_PCQUEUE_H diff --git a/mosesdecoder/util/probing_hash_table_test.cc b/mosesdecoder/util/probing_hash_table_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6ed5414318fa5d3fff90d967e82ba7af07e9ec6f --- /dev/null +++ b/mosesdecoder/util/probing_hash_table_test.cc @@ -0,0 +1,102 @@ +#include "util/probing_hash_table.hh" + +#include "util/murmur_hash.hh" +#include "util/scoped.hh" + +#define BOOST_TEST_MODULE ProbingHashTableTest +#include +#include +#include +#include +#include +#include +#include + +namespace util { +namespace { + +struct Entry { + unsigned char key; + typedef unsigned char Key; + + unsigned char GetKey() const { + return key; + } + + void SetKey(unsigned char to) { + key = to; + } + + uint64_t GetValue() const { + return value; + } + + uint64_t value; +}; + +typedef ProbingHashTable > Table; + +BOOST_AUTO_TEST_CASE(simple) { + size_t size = Table::Size(10, 1.2); + boost::scoped_array mem(new char[size]); + memset(mem.get(), 0, size); + + Table table(mem.get(), size); + const Entry *i = NULL; + BOOST_CHECK(!table.Find(2, i)); + Entry to_ins; + to_ins.key = 3; + 
to_ins.value = 328920; + table.Insert(to_ins); + BOOST_REQUIRE(table.Find(3, i)); + BOOST_CHECK_EQUAL(3, i->GetKey()); + BOOST_CHECK_EQUAL(static_cast(328920), i->GetValue()); + BOOST_CHECK(!table.Find(2, i)); +} + +struct Entry64 { + uint64_t key; + typedef uint64_t Key; + + Entry64() {} + + explicit Entry64(uint64_t key_in) { + key = key_in; + } + + Key GetKey() const { return key; } + void SetKey(uint64_t to) { key = to; } +}; + +struct MurmurHashEntry64 { + std::size_t operator()(uint64_t value) const { + return util::MurmurHash64A(&value, 8); + } +}; + +typedef ProbingHashTable Table64; + +BOOST_AUTO_TEST_CASE(Double) { + for (std::size_t initial = 19; initial < 30; ++initial) { + size_t size = Table64::Size(initial, 1.2); + scoped_malloc mem(MallocOrThrow(size)); + Table64 table(mem.get(), size, std::numeric_limits::max()); + table.Clear(); + for (uint64_t i = 0; i < 19; ++i) { + table.Insert(Entry64(i)); + } + table.CheckConsistency(); + mem.call_realloc(table.DoubleTo()); + table.Double(mem.get()); + table.CheckConsistency(); + for (uint64_t i = 20; i < 40 ; ++i) { + table.Insert(Entry64(i)); + } + mem.call_realloc(table.DoubleTo()); + table.Double(mem.get()); + table.CheckConsistency(); + } +} + +} // namespace +} // namespace util diff --git a/mosesdecoder/util/random.hh b/mosesdecoder/util/random.hh new file mode 100644 index 0000000000000000000000000000000000000000..6c2773520a6d7c0454dd7cbc2695316effa5f215 --- /dev/null +++ b/mosesdecoder/util/random.hh @@ -0,0 +1,229 @@ +#ifndef UTIL_RANDOM_H +#define UTIL_RANDOM_H + +#include +#include + +namespace util +{ +/** Thread-safe, cross-platform random number generator. + * + * This is not for proper security-grade randomness, but should be "good + * enough" for producing arbitrary values of various numeric types. + * + * Before starting, call rand_init() to seed the randomizer. There is no need + * to do this more than once; in fact doing it more often is likely to make the + * randomizer less effective. 
Once that is done, call the rand(), rand_excl(), + * and rand_incl() functions as needed to generate pseudo-random numbers. + * + * Probability distribution is roughly uniform, but for integral types is + * skewed slightly towards lower numbers depending on how close "top" comes to + * RAND_MAX. + * + * For floating-point types, resolution is limited; there will actually be + * only RAND_MAX different possible values. + */ + +/** Initialize randomizer with a fixed seed. + * + * After this, unless the randomizer gets seeded again, consecutive calls to + * the random functions will return a sequence of pseudo-random numbers + * determined by the seed. Every time the randomizer is seeded with this same + * seed, it will again start returning the same sequence of numbers. + */ +void rand_init(unsigned int); + +/** Initialize randomizer based on current time. + * + * Call this to make the randomizer return hard-to-predict numbers. It won't + * produce high-grade randomness, but enough to make the program act + * differently on different runs. + * + * The seed will be based on the current time in seconds. So calling it twice + * within the same second will just reset the randomizer to where it was before. + * Don't do that. + */ +void rand_init(); + + +/** Return a pseudorandom number between 0 and RAND_MAX inclusive. + * + * Initialize (seed) the randomizer before starting to call this. + */ +template inline T rand(); + + +/** Return a pseudorandom number in the half-open interval [bottom, top). + * + * Generates a value between "bottom" (inclusive) and "top" (exclusive), + * assuming that (top - bottom) <= RAND_MAX. + */ +template inline T rand_excl(T bottom, T top); + + +/** Return a pseudorandom number in the half-open interval [0, top). + * + * Generates a value between 0 (inclusive) and "top" (exclusive), assuming that + * bottom <= RAND_MAX. + */ +template inline T rand_excl(T top); + + +/** Return a pseudorandom number in the open interval [bottom, top]. 
+ * + * Generates a value between "bottom" and "top" inclusive, assuming that + * (top - bottom) < RAND_MAX. + */ +template inline T rand_incl(T bottom, T top); + + +/** Return a pseudorandom number in the open interval [0, top]. + * + * Generates a value between 0 and "top" inclusive, assuming that + * bottom < RAND_MAX. + */ +template inline T rand_incl(T top); + + +/** Return a pseudorandom number which may be larger than RAND_MAX. + * + * The requested type must be integral, and its size must be an even multiple + * of the size of an int. The return value will combine one or more random + * ints into a single value, which could get quite large. + * + * The result is nonnegative. Because the constituent ints are also + * nonnegative, the most significant bit in each of the ints will be zero, + * so for a wider type, there will be "gaps" in the range of possible outputs. + */ +template inline T wide_rand(); + +/** Return a pseudorandom number in [0, top), not limited to RAND_MAX. + * + * Works like wide_rand(), but if the requested type is wider than an int, it + * accommodates larger top values than an int can represent. + */ +template inline T wide_rand_excl(T top); + +/** Return a pseudorandom number in [bottom, top), not limited to RAND_MAX. + * + * Works like wide_rand(), but if the requested type is wider than an int, it + * accommodates larger value ranges than an int can represent. + */ +template inline T wide_rand_excl(T bottom, T top); + +/** Return a pseudorandom number in [0, top], not limited to RAND_MAX. + * + * Works like wide_rand(), but if the requested type is wider than an int, it + * accommodates larger top values than an int can represent. + */ +template inline T wide_rand_incl(T top); + +/** Return a pseudorandom number in [bottom, top], not limited to RAND_MAX. + * + * Works like wide_rand(), but if the requested type is wider than an int, it + * accommodates larger top values than an int can represent. 
+ */ +template inline T wide_rand_incl(T bottom, T top); + + +/// Implementation detail. For the random module's internal use only. +namespace internal +{ +/// The central call to the randomizer upon which this whole module is built. +int rand_int(); + +/// Helper template: customize random values to required ranges. +template struct random_scaler; + +/// Specialized random_scaler for integral types. +template struct random_scaler +{ + static T rnd_excl(T value, T range) { return value % range; } + static T rnd_incl(T value, T range) { return value % (range + 1); } +}; + +/// Specialized random_scaler for non-integral types. +template struct random_scaler +{ + static T rnd_excl(T value, T range) + { + // Promote RAND_MAX to T before adding one to avoid overflow. + return range * value / (T(RAND_MAX) + 1); + } + static T rnd_incl(T value, T range) { return range * value / RAND_MAX; } +}; + +/// Helper for filling a wider variable with random ints. +template struct wide_random_collector +{ + static T generate() + { + T one_int = util::rand() << (8 * sizeof(int)); + return one_int | wide_random_collector::generate(); + } +}; +/// Specialized wide_random_collector for generating just a single int. 
+template struct wide_random_collector +{ + static T generate() { return util::rand(); } +}; + +} // namespace internal + + +template inline T rand() +{ + return T(util::internal::rand_int()); +} + +template inline T rand_excl(T top) +{ + typedef internal::random_scaler::is_integer> scaler; + return scaler::rnd_excl(util::rand(), top); +} + +template inline T rand_excl(T bottom, T top) +{ + return bottom + rand_excl(top - bottom); +} + +template inline T rand_incl(T top) +{ + typedef internal::random_scaler::is_integer> scaler; + return scaler::rnd_incl(util::rand(), top); +} + +template inline T rand_incl(T bottom, T top) +{ + return bottom + rand_incl(top - bottom); +} + +template inline T wide_rand() +{ + return internal::wide_random_collector::generate(); +} + +template inline T wide_rand_excl(T top) +{ + typedef internal::random_scaler::is_integer> scaler; + return scaler::rnd_excl(util::wide_rand(), top); +} + +template inline T wide_rand_excl(T bottom, T top) +{ + return bottom + wide_rand_excl(top - bottom); +} + +template inline T wide_rand_incl(T top) +{ + typedef internal::random_scaler::is_integer> scaler; + return scaler::rnd_incl(util::wide_rand(), top); +} + +template inline T wide_rand_incl(T bottom, T top) +{ + return bottom + wide_rand_incl(top - bottom); +} +} // namespace util + +#endif diff --git a/mosesdecoder/util/sorted_uniform_test.cc b/mosesdecoder/util/sorted_uniform_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..39f05e57ecacd50db36686f5a9a10c4e14a620fc --- /dev/null +++ b/mosesdecoder/util/sorted_uniform_test.cc @@ -0,0 +1,127 @@ +#include "util/sorted_uniform.hh" + +#include +#include +#include +#include +#include + +#define BOOST_TEST_MODULE SortedUniformTest +#include + +#include +#include +#include + +namespace util { +namespace { + +template struct Entry { + typedef KeyT Key; + typedef ValueT Value; + + Key key; + Value value; + + Key GetKey() const { + return key; + } + + Value GetValue() const { + 
return value; + } + + bool operator<(const Entry &other) const { + return key < other.key; + } +}; + +template struct Accessor { + typedef KeyT Key; + template Key operator()(const Entry *entry) const { + return entry->GetKey(); + } +}; + +template void Check(const Entry *begin, const Entry *end, const boost::unordered_map &reference, const Key key) { + typename boost::unordered_map::const_iterator ref = reference.find(key); + typedef const Entry *It; + // g++ can't tell that require will crash and burn. + It i = NULL; + bool ret = SortedUniformFind, Pivot64>(Accessor(), begin, end, key, i); + if (ref == reference.end()) { + BOOST_CHECK(!ret); + } else { + BOOST_REQUIRE(ret); + BOOST_CHECK_EQUAL(ref->second, i->GetValue()); + } +} + +BOOST_AUTO_TEST_CASE(empty) { + typedef const Entry T; + const T *i; + bool ret = SortedUniformFind, Pivot64>(Accessor(), (const T*)NULL, (const T*)NULL, (uint64_t)10, i); + BOOST_CHECK(!ret); +} + +template void RandomTest(Key upper, size_t entries, size_t queries) { + typedef unsigned char Value; + boost::mt19937 rng; + boost::uniform_int range_key(0, upper); + boost::uniform_int range_value(0, 255); + boost::variate_generator > gen_key(rng, range_key); + boost::variate_generator > gen_value(rng, range_value); + + typedef Entry Ent; + std::vector backing; + boost::unordered_map reference; + Ent ent; + for (size_t i = 0; i < entries; ++i) { + Key key = gen_key(); + unsigned char value = gen_value(); + if (reference.insert(std::make_pair(key, value)).second) { + ent.key = key; + ent.value = value; + backing.push_back(ent); + } + } + std::sort(backing.begin(), backing.end()); + + // Random queries. 
+ for (size_t i = 0; i < queries; ++i) { + const Key key = gen_key(); + Check(&*backing.begin(), &*backing.end(), reference, key); + } + + typename boost::unordered_map::const_iterator it = reference.begin(); + for (size_t i = 0; (i < queries) && (it != reference.end()); ++i, ++it) { + Check(&*backing.begin(), &*backing.end(), reference, it->second); + } +} + +BOOST_AUTO_TEST_CASE(basic) { + RandomTest(11, 10, 200); +} + +BOOST_AUTO_TEST_CASE(tiny_dense_random) { + RandomTest(11, 50, 200); +} + +BOOST_AUTO_TEST_CASE(small_dense_random) { + RandomTest(100, 100, 200); +} + +BOOST_AUTO_TEST_CASE(small_sparse_random) { + RandomTest(200, 15, 200); +} + +BOOST_AUTO_TEST_CASE(medium_sparse_random) { + RandomTest(32000, 1000, 2000); +} + +BOOST_AUTO_TEST_CASE(sparse_random) { + RandomTest(std::numeric_limits::max(), 100000, 2000); +} + +} // namespace +} // namespace util diff --git a/mosesdecoder/util/stream/block.hh b/mosesdecoder/util/stream/block.hh new file mode 100644 index 0000000000000000000000000000000000000000..42df13f3213522a8e10e0f278c6e90d46f825430 --- /dev/null +++ b/mosesdecoder/util/stream/block.hh @@ -0,0 +1,93 @@ +#ifndef UTIL_STREAM_BLOCK_H +#define UTIL_STREAM_BLOCK_H + +#include +#include + +namespace util { +namespace stream { + +/** + * Encapsulates a block of memory. + */ +class Block { + public: + + /** + * Constructs an empty block. + */ + Block() : mem_(NULL), valid_size_(0) {} + + /** + * Constructs a block that encapsulates a segment of memory. + * + * @param[in] mem The segment of memory to encapsulate + * @param[in] size The size of the memory segment in bytes + */ + Block(void *mem, std::size_t size) : mem_(mem), valid_size_(size) {} + + /** + * Set the number of bytes in this block that should be interpreted as valid. + * + * @param[in] to Number of bytes + */ + void SetValidSize(std::size_t to) { valid_size_ = to; } + + /** + * Gets the number of bytes in this block that should be interpreted as valid. 
+ * This is important because read might fill in less than Allocated at EOF. + */ + std::size_t ValidSize() const { return valid_size_; } + + /** Gets a void pointer to the memory underlying this block. */ + void *Get() { return mem_; } + + /** Gets a const void pointer to the memory underlying this block. */ + const void *Get() const { return mem_; } + + + /** + * Gets a const void pointer to the end of the valid section of memory + * encapsulated by this block. + */ + const void *ValidEnd() const { + return reinterpret_cast(mem_) + valid_size_; + } + + /** + * Returns true if this block encapsulates a valid (non-NULL) block of memory. + * + * This method is a user-defined implicit conversion function to boolean; + * among other things, this method enables bare instances of this class + * to be used as the condition of an if statement. + */ + operator bool() const { return mem_ != NULL; } + + /** + * Returns true if this block is empty. + * + * In other words, if Get()==NULL, this method will return true. + */ + bool operator!() const { return mem_ == NULL; } + + private: + friend class Link; + friend class RewindableStream; + + /** + * Points this block's memory at NULL. + * + * This class defines poison as a block whose memory pointer is NULL. 
+ */ + void SetToPoison() { + mem_ = NULL; + } + + void *mem_; + std::size_t valid_size_; +}; + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_BLOCK_H diff --git a/mosesdecoder/util/stream/chain.hh b/mosesdecoder/util/stream/chain.hh new file mode 100644 index 0000000000000000000000000000000000000000..2969822601630dd8c3c05924134f9c1c53c2037c --- /dev/null +++ b/mosesdecoder/util/stream/chain.hh @@ -0,0 +1,347 @@ +#ifndef UTIL_STREAM_CHAIN_H +#define UTIL_STREAM_CHAIN_H + +#include "util/stream/block.hh" +#include "util/stream/config.hh" +#include "util/stream/multi_progress.hh" +#include "util/scoped.hh" + +#include +#include + +#include +#include + +namespace util { +template class PCQueue; +namespace stream { + +class ChainConfigException : public Exception { + public: + ChainConfigException() throw(); + ~ChainConfigException() throw(); +}; + +class Chain; +class RewindableStream; + +/** + * Encapsulates a @ref PCQueue "producer queue" and a @ref PCQueue "consumer queue" within a @ref Chain "chain". + * + * Specifies position in chain for Link constructor. + */ +class ChainPosition { + public: + const Chain &GetChain() const { return *chain_; } + private: + friend class Chain; + friend class Link; + friend class RewindableStream; + ChainPosition(PCQueue &in, PCQueue &out, Chain *chain, MultiProgress &progress) + : in_(&in), out_(&out), chain_(chain), progress_(progress.Add()) {} + + PCQueue *in_, *out_; + + Chain *chain_; + + WorkerProgress progress_; +}; + + +/** + * Encapsulates a worker thread processing data at a given position in the chain. + * + * Each instance of this class owns one boost thread in which the worker is Run(). + */ +class Thread { + public: + + /** + * Constructs a new Thread in which the provided Worker is Run(). + * + * Position is usually ChainPosition but if there are multiple streams involved, this can be ChainPositions. 
+ * + * After a call to this constructor, the provided worker will be running within a boost thread owned by the newly constructed Thread object. + */ + template Thread(const Position &position, const Worker &worker) + : thread_(boost::ref(*this), position, worker) {} + + ~Thread(); + + /** + * Launches the provided worker in this object's boost thread. + * + * This method is called automatically by this class's @ref Thread() "constructor". + */ + template void operator()(const Position &position, Worker &worker) { +// try { + worker.Run(position); +// } catch (const std::exception &e) { +// UnhandledException(e); +// } + } + + private: + void UnhandledException(const std::exception &e); + + boost::thread thread_; +}; + +/** + * This resets blocks to full valid size. Used to close the loop in Chain by recycling blocks. + */ +class Recycler { + public: + /** + * Resets the blocks in the chain such that the blocks' respective valid sizes match the chain's block size. + * + * @see Block::SetValidSize() + * @see Chain::BlockSize() + */ + void Run(const ChainPosition &position); +}; + +extern const Recycler kRecycle; +class WriteAndRecycle; +class PWriteAndRecycle; + +/** + * Represents a sequence of workers, through which @ref Block "blocks" can pass. + */ +class Chain { + private: + template struct CheckForRun { + typedef Chain type; + }; + + public: + + /** + * Constructs a configured Chain. + * + * @param config Specifies how to configure the Chain. + */ + explicit Chain(const ChainConfig &config); + + /** + * Destructs a Chain. + * + * This method waits for the chain's threads to complete, + * and frees the memory held by this chain. + */ + ~Chain(); + + void ActivateProgress() { + assert(!Running()); + progress_.Activate(); + } + + void SetProgressTarget(uint64_t target) { + progress_.SetTarget(target); + } + + /** + * Gets the number of bytes in each record of a Block. 
+ * + * @see ChainConfig::entry_size + */ + std::size_t EntrySize() const { + return config_.entry_size; + } + + /** + * Gets the inital @ref Block::ValidSize "valid size" for @ref Block "blocks" in this chain. + * + * @see Block::ValidSize + */ + std::size_t BlockSize() const { + return block_size_; + } + + /** + * Number of blocks going through the Chain. + */ + std::size_t BlockCount() const { + return config_.block_count; + } + + /** Two ways to add to the chain: Add() or operator>>. */ + ChainPosition Add(); + + /** + * Adds a new worker to this chain, + * and runs that worker in a new Thread owned by this chain. + * + * The worker must have a Run method that accepts a position argument. + * + * @see Thread::operator()() + */ + template typename CheckForRun::type &operator>>(const Worker &worker) { + assert(!complete_called_); + threads_.push_back(new Thread(Add(), worker)); + return *this; + } + + /** + * Adds a new worker to this chain (but avoids copying that worker), + * and runs that worker in a new Thread owned by this chain. + * + * The worker must have a Run method that accepts a position argument. + * + * @see Thread::operator()() + */ + template typename CheckForRun::type &operator>>(const boost::reference_wrapper &worker) { + assert(!complete_called_); + threads_.push_back(new Thread(Add(), worker)); + return *this; + } + + // Note that Link and Stream also define operator>> outside this class. + + // To complete the loop, call CompleteLoop(), >> kRecycle, or the destructor. + void CompleteLoop() { + threads_.push_back(new Thread(Complete(), kRecycle)); + } + + /** + * Adds a Recycler worker to this chain, + * and runs that worker in a new Thread owned by this chain. + */ + Chain &operator>>(const Recycler &) { + CompleteLoop(); + return *this; + } + + /** + * Adds a WriteAndRecycle worker to this chain, + * and runs that worker in a new Thread owned by this chain. 
+ */ + Chain &operator>>(const WriteAndRecycle &writer); + Chain &operator>>(const PWriteAndRecycle &writer); + + // Chains are reusable. Call Wait to wait for everything to finish and free memory. + void Wait(bool release_memory = true); + + // Waits for the current chain to complete (if any) then starts again. + void Start(); + + bool Running() const { return !queues_.empty(); } + + private: + ChainPosition Complete(); + + ChainConfig config_; + + std::size_t block_size_; + + scoped_malloc memory_; + + boost::ptr_vector > queues_; + + bool complete_called_; + + boost::ptr_vector threads_; + + MultiProgress progress_; +}; + +// Create the link in the worker thread using the position token. +/** + * Represents a C++ style iterator over @ref Block "blocks". + */ +class Link { + public: + + // Either default construct and Init or just construct all at once. + + /** + * Constructs an @ref Init "initialized" link. + * + * @see Init + */ + explicit Link(const ChainPosition &position); + + /** + * Constructs a link that must subsequently be @ref Init "initialized". + * + * @see Init + */ + Link(); + + /** + * Initializes the link with the input @ref PCQueue "consumer queue" and output @ref PCQueue "producer queue" at a given @ref ChainPosition "position" in the @ref Chain "chain". + * + * @see Link() + */ + void Init(const ChainPosition &position); + + /** + * Destructs the link object. + * + * If necessary, this method will pass a poison block + * to this link's output @ref PCQueue "producer queue". + * + * @see Block::SetToPoison() + */ + ~Link(); + + /** + * Gets a reference to the @ref Block "block" at this link. + */ + Block &operator*() { return current_; } + + /** + * Gets a const reference to the @ref Block "block" at this link. + */ + const Block &operator*() const { return current_; } + + /** + * Gets a pointer to the @ref Block "block" at this link. 
+ */ + Block *operator->() { return ¤t_; } + + /** + * Gets a const pointer to the @ref Block "block" at this link. + */ + const Block *operator->() const { return ¤t_; } + + /** + * Gets the link at the next @ref ChainPosition "position" in the @ref Chain "chain". + */ + Link &operator++(); + + /** + * Returns true if the @ref Block "block" at this link encapsulates a valid (non-NULL) block of memory. + * + * This method is a user-defined implicit conversion function to boolean; + * among other things, this method enables bare instances of this class + * to be used as the condition of an if statement. + */ + operator bool() const { return current_; } + + /** + * @ref Block::SetToPoison() "Poisons" the @ref Block "block" at this link, + * and passes this now-poisoned block to this link's output @ref PCQueue "producer queue". + * + * @see Block::SetToPoison() + */ + void Poison(); + + private: + Block current_; + PCQueue *in_, *out_; + + bool poisoned_; + + WorkerProgress progress_; +}; + +inline Chain &operator>>(Chain &chain, Link &link) { + link.Init(chain.Add()); + return chain; +} + +} // namespace stream +} // namespace util + +#endif // UTIL_STREAM_CHAIN_H diff --git a/mosesdecoder/util/stream/count_records.hh b/mosesdecoder/util/stream/count_records.hh new file mode 100644 index 0000000000000000000000000000000000000000..e3f7c94afbc044a91dfac393febbe20ead5a8f9d --- /dev/null +++ b/mosesdecoder/util/stream/count_records.hh @@ -0,0 +1,20 @@ +#include + +namespace util { namespace stream { + +class ChainPosition; + +class CountRecords { + public: + explicit CountRecords(uint64_t *out) + : count_(out) { + *count_ = 0; + } + + void Run(const ChainPosition &position); + + private: + uint64_t *count_; +}; + +}} // namespaces diff --git a/mosesdecoder/util/stream/io.cc b/mosesdecoder/util/stream/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..c272d779c9b5adee651603051c7be04d2938a78c --- /dev/null +++ b/mosesdecoder/util/stream/io.cc @@ -0,0 
+1,78 @@ +#include "util/stream/io.hh" + +#include "util/file.hh" +#include "util/stream/chain.hh" + +#include + +namespace util { +namespace stream { + +ReadSizeException::ReadSizeException() throw() {} +ReadSizeException::~ReadSizeException() throw() {} + +void Read::Run(const ChainPosition &position) { + const std::size_t block_size = position.GetChain().BlockSize(); + const std::size_t entry_size = position.GetChain().EntrySize(); + for (Link link(position); link; ++link) { + std::size_t got = util::ReadOrEOF(file_, link->Get(), block_size); + UTIL_THROW_IF(got % entry_size, ReadSizeException, "File ended with " << got << " bytes, not a multiple of " << entry_size << "."); + if (got == 0) { + link.Poison(); + return; + } else { + link->SetValidSize(got); + } + } +} + +void PRead::Run(const ChainPosition &position) { + scoped_fd owner; + if (own_) owner.reset(file_); + const uint64_t size = SizeOrThrow(file_); + UTIL_THROW_IF(size % static_cast(position.GetChain().EntrySize()), ReadSizeException, "File size " << file_ << " size is " << size << " not a multiple of " << position.GetChain().EntrySize()); + const std::size_t block_size = position.GetChain().BlockSize(); + const uint64_t block_size64 = static_cast(block_size); + Link link(position); + uint64_t offset = 0; + for (; offset + block_size64 < size; offset += block_size64, ++link) { + ErsatzPRead(file_, link->Get(), block_size, offset); + link->SetValidSize(block_size); + } + // size - offset is <= block_size, so it casts to 32-bit fine. 
+ if (size - offset) { + ErsatzPRead(file_, link->Get(), size - offset, offset); + link->SetValidSize(size - offset); + ++link; + } + link.Poison(); +} + +void Write::Run(const ChainPosition &position) { + for (Link link(position); link; ++link) { + WriteOrThrow(file_, link->Get(), link->ValidSize()); + } +} + +void WriteAndRecycle::Run(const ChainPosition &position) { + const std::size_t block_size = position.GetChain().BlockSize(); + for (Link link(position); link; ++link) { + WriteOrThrow(file_, link->Get(), link->ValidSize()); + link->SetValidSize(block_size); + } +} + +void PWriteAndRecycle::Run(const ChainPosition &position) { + const std::size_t block_size = position.GetChain().BlockSize(); + uint64_t offset = 0; + for (Link link(position); link; ++link) { + ErsatzPWrite(file_, link->Get(), link->ValidSize(), offset); + offset += link->ValidSize(); + link->SetValidSize(block_size); + } + // Trim file to size. + util::ResizeOrThrow(file_, offset); +} + +} // namespace stream +} // namespace util diff --git a/mosesdecoder/util/stream/io.hh b/mosesdecoder/util/stream/io.hh new file mode 100644 index 0000000000000000000000000000000000000000..4605a8a79c1313dffb5186c3adb3e394070e63d1 --- /dev/null +++ b/mosesdecoder/util/stream/io.hh @@ -0,0 +1,87 @@ +#ifndef UTIL_STREAM_IO_H +#define UTIL_STREAM_IO_H + +#include "util/exception.hh" +#include "util/file.hh" + +namespace util { +namespace stream { + +class ChainPosition; + +class ReadSizeException : public util::Exception { + public: + ReadSizeException() throw(); + ~ReadSizeException() throw(); +}; + +class Read { + public: + explicit Read(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +// Like read but uses pread so that the file can be accessed from multiple threads. 
+class PRead { + public: + explicit PRead(int fd, bool take_own = false) : file_(fd), own_(take_own) {} + void Run(const ChainPosition &position); + private: + int file_; + bool own_; +}; + +class Write { + public: + explicit Write(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +// It's a common case that stuff is written and then recycled. So rather than +// spawn another thread to Recycle, this combines the two roles. +class WriteAndRecycle { + public: + explicit WriteAndRecycle(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + +class PWriteAndRecycle { + public: + explicit PWriteAndRecycle(int fd) : file_(fd) {} + void Run(const ChainPosition &position); + private: + int file_; +}; + + +// Reuse the same file over and over again to buffer output. +class FileBuffer { + public: + explicit FileBuffer(int fd) : file_(fd) {} + + PWriteAndRecycle Sink() const { + util::SeekOrThrow(file_.get(), 0); + return PWriteAndRecycle(file_.get()); + } + + PRead Source(bool discard = false) { + return PRead(discard ? file_.release() : file_.get(), discard); + } + + uint64_t Size() const { + return SizeOrThrow(file_.get()); + } + + private: + scoped_fd file_; +}; + +} // namespace stream +} // namespace util +#endif // UTIL_STREAM_IO_H diff --git a/mosesdecoder/util/stream/line_input.cc b/mosesdecoder/util/stream/line_input.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ad8800f614dd5bf4e332e49c0896a56e87ccd2e --- /dev/null +++ b/mosesdecoder/util/stream/line_input.cc @@ -0,0 +1,52 @@ +#include "util/stream/line_input.hh" + +#include "util/exception.hh" +#include "util/file.hh" +#include "util/read_compressed.hh" +#include "util/stream/chain.hh" + +#include +#include + +namespace util { namespace stream { + +void LineInput::Run(const ChainPosition &position) { + ReadCompressed reader(fd_); + // Holding area for beginning of line to be placed in next block. 
+ std::vector carry; + + for (Link block(position); ; ++block) { + char *to = static_cast(block->Get()); + char *begin = to; + char *end = to + position.GetChain().BlockSize(); + std::copy(carry.begin(), carry.end(), to); + to += carry.size(); + while (to != end) { + std::size_t got = reader.Read(to, end - to); + if (!got) { + // EOF + block->SetValidSize(to - begin); + ++block; + block.Poison(); + return; + } + to += got; + } + + // Find the last newline. + char *newline; + for (newline = to - 1; ; --newline) { + UTIL_THROW_IF(newline < begin, Exception, "Did not find a newline in " << position.GetChain().BlockSize() << " bytes of input of " << NameFromFD(fd_) << ". Is this a text file?"); + if (*newline == '\n') break; + } + + // Copy everything after the last newline to the carry. + carry.clear(); + carry.resize(to - (newline + 1)); + std::copy(newline + 1, to, &*carry.begin()); + + block->SetValidSize(newline + 1 - begin); + } +} + +}} // namespaces diff --git a/mosesdecoder/util/stream/line_input.hh b/mosesdecoder/util/stream/line_input.hh new file mode 100644 index 0000000000000000000000000000000000000000..a870a6648494775d7c1169e17e2b0a375e984803 --- /dev/null +++ b/mosesdecoder/util/stream/line_input.hh @@ -0,0 +1,22 @@ +#ifndef UTIL_STREAM_LINE_INPUT_H +#define UTIL_STREAM_LINE_INPUT_H +namespace util {namespace stream { + +class ChainPosition; + +/* Worker that reads input into blocks, ensuring that blocks contain whole + * lines. Assumes that the maximum size of a line is less than the block size + */ +class LineInput { + public: + // Takes ownership upon thread execution. 
+ explicit LineInput(int fd); + + void Run(const ChainPosition &position); + + private: + int fd_; +}; + +}} // namespaces +#endif // UTIL_STREAM_LINE_INPUT_H diff --git a/mosesdecoder/util/stream/multi_progress.cc b/mosesdecoder/util/stream/multi_progress.cc new file mode 100644 index 0000000000000000000000000000000000000000..59750f516a0e9c60707f06064a8238f41ebd5072 --- /dev/null +++ b/mosesdecoder/util/stream/multi_progress.cc @@ -0,0 +1,86 @@ +#include "util/stream/multi_progress.hh" + +// TODO: merge some functionality with the simple progress bar? +#include "util/ersatz_progress.hh" + +#include +#include + +#include + +#if !defined(_WIN32) && !defined(_WIN64) +#include +#endif + +namespace util { namespace stream { + +namespace { +const char kDisplayCharacters[] = "-+*#0123456789"; + +uint64_t Next(unsigned char stone, uint64_t complete) { + return (static_cast(stone + 1) * complete + MultiProgress::kWidth - 1) / MultiProgress::kWidth; +} + +} // namespace + +MultiProgress::MultiProgress() : active_(false), complete_(std::numeric_limits::max()), character_handout_(0) {} + +MultiProgress::~MultiProgress() { + if (active_ && complete_ != std::numeric_limits::max()) + std::cerr << '\n'; +} + +void MultiProgress::Activate() { + active_ = +#if !defined(_WIN32) && !defined(_WIN64) + // Is stderr a terminal? 
+ (isatty(2) == 1) +#else + true +#endif + ; +} + +void MultiProgress::SetTarget(uint64_t complete) { + if (!active_) return; + complete_ = complete; + if (!complete) complete_ = 1; + memset(display_, 0, sizeof(display_)); + character_handout_ = 0; + std::cerr << kProgressBanner; +} + +WorkerProgress MultiProgress::Add() { + if (!active_) + return WorkerProgress(std::numeric_limits::max(), *this, '\0'); + std::size_t character_index; + { + boost::unique_lock lock(mutex_); + character_index = character_handout_++; + if (character_handout_ == sizeof(kDisplayCharacters) - 1) + character_handout_ = 0; + } + return WorkerProgress(Next(0, complete_), *this, kDisplayCharacters[character_index]); +} + +void MultiProgress::Finished() { + if (!active_ || complete_ == std::numeric_limits::max()) return; + std::cerr << '\n'; + complete_ = std::numeric_limits::max(); +} + +void MultiProgress::Milestone(WorkerProgress &worker) { + if (!active_ || complete_ == std::numeric_limits::max()) return; + unsigned char stone = std::min(static_cast(kWidth), worker.current_ * kWidth / complete_); + for (char *i = &display_[worker.stone_]; i < &display_[stone]; ++i) { + *i = worker.character_; + } + worker.next_ = Next(stone, complete_); + worker.stone_ = stone; + { + boost::unique_lock lock(mutex_); + std::cerr << '\r' << display_ << std::flush; + } +} + +}} // namespaces diff --git a/mosesdecoder/util/stream/multi_progress.hh b/mosesdecoder/util/stream/multi_progress.hh new file mode 100644 index 0000000000000000000000000000000000000000..f9e6423e3e32773b9e19810691fb6562bd2d3a62 --- /dev/null +++ b/mosesdecoder/util/stream/multi_progress.hh @@ -0,0 +1,89 @@ +/* Progress bar suitable for chains of workers */ +#ifndef UTIL_STREAM_MULTI_PROGRESS_H +#define UTIL_STREAM_MULTI_PROGRESS_H + +#include + +#include +#include + +namespace util { namespace stream { + +class WorkerProgress; + +class MultiProgress { + public: + static const unsigned char kWidth = 100; + + MultiProgress(); + + 
~MultiProgress(); + + // Turns on showing (requires SetTarget too). + void Activate(); + + void SetTarget(uint64_t complete); + + WorkerProgress Add(); + + void Finished(); + + private: + friend class WorkerProgress; + void Milestone(WorkerProgress &worker); + + bool active_; + + uint64_t complete_; + + boost::mutex mutex_; + + // \0 at the end. + char display_[kWidth + 1]; + + std::size_t character_handout_; + + MultiProgress(const MultiProgress &); + MultiProgress &operator=(const MultiProgress &); +}; + +class WorkerProgress { + public: + // Default contrutor must be initialized with operator= later. + WorkerProgress() : parent_(NULL) {} + + // Not threadsafe for the same worker by default. + WorkerProgress &operator++() { + if (++current_ >= next_) { + parent_->Milestone(*this); + } + return *this; + } + + WorkerProgress &operator+=(uint64_t amount) { + current_ += amount; + if (current_ >= next_) { + parent_->Milestone(*this); + } + return *this; + } + + private: + friend class MultiProgress; + WorkerProgress(uint64_t next, MultiProgress &parent, char character) + : current_(0), next_(next), parent_(&parent), stone_(0), character_(character) {} + + uint64_t current_, next_; + + MultiProgress *parent_; + + // Previous milestone reached. + unsigned char stone_; + + // Character to display in bar. 
+ char character_; +}; + +}} // namespaces + +#endif // UTIL_STREAM_MULTI_PROGRESS_H diff --git a/mosesdecoder/util/stream/multi_stream.hh b/mosesdecoder/util/stream/multi_stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..6381fc2ed659705da65355544616c6c859327de3 --- /dev/null +++ b/mosesdecoder/util/stream/multi_stream.hh @@ -0,0 +1,124 @@ +#ifndef UTIL_STREAM_MULTI_STREAM_H +#define UTIL_STREAM_MULTI_STREAM_H + +#include "util/fixed_array.hh" +#include "util/scoped.hh" +#include "util/stream/chain.hh" +#include "util/stream/stream.hh" + +#include +#include + +#include +#include + +namespace util { namespace stream { + +class Chains; + +class ChainPositions : public util::FixedArray { + public: + ChainPositions() {} + + explicit ChainPositions(std::size_t bound) : + util::FixedArray(bound) {} + + void Init(Chains &chains); + + explicit ChainPositions(Chains &chains) { + Init(chains); + } +}; + +class Chains : public util::FixedArray { + private: + template struct CheckForRun { + typedef Chains type; + }; + + public: + // Must call Init. 
+ Chains() {} + + explicit Chains(std::size_t limit) : util::FixedArray(limit) {} + + template typename CheckForRun::type &operator>>(const Worker &worker) { + threads_.push_back(new util::stream::Thread(ChainPositions(*this), worker)); + return *this; + } + + template typename CheckForRun::type &operator>>(const boost::reference_wrapper &worker) { + threads_.push_back(new util::stream::Thread(ChainPositions(*this), worker)); + return *this; + } + + Chains &operator>>(const util::stream::Recycler &recycler) { + for (util::stream::Chain *i = begin(); i != end(); ++i) + *i >> recycler; + return *this; + } + + void Wait(bool release_memory = true) { + threads_.clear(); + for (util::stream::Chain *i = begin(); i != end(); ++i) { + i->Wait(release_memory); + } + } + + private: + boost::ptr_vector threads_; + + Chains(const Chains &); + void operator=(const Chains &); +}; + +inline void ChainPositions::Init(Chains &chains) { + util::FixedArray::Init(chains.size()); + for (util::stream::Chain *i = chains.begin(); i != chains.end(); ++i) { + // use "placement new" syntax to initialize ChainPosition in an already-allocated memory location + new (end()) util::stream::ChainPosition(i->Add()); Constructed(); + } +} + +inline Chains &operator>>(Chains &chains, ChainPositions &positions) { + positions.Init(chains); + return chains; +} + +template class GenericStreams : public util::FixedArray { + private: + typedef util::FixedArray P; + public: + GenericStreams() {} + + // Limit restricts to positions[0,limit) + void Init(const ChainPositions &positions, std::size_t limit) { + P::Init(limit); + for (const util::stream::ChainPosition *i = positions.begin(); i != positions.begin() + limit; ++i) { + P::push_back(*i); + } + } + void Init(const ChainPositions &positions) { + Init(positions, positions.size()); + } + + GenericStreams(const ChainPositions &positions) { + Init(positions); + } + + void Init(std::size_t amount) { + P::Init(amount); + } +}; + +template inline Chains 
&operator>>(Chains &chains, GenericStreams &streams) { + ChainPositions positions; + chains >> positions; + streams.Init(positions); + return chains; +} + +typedef GenericStreams Streams; + +}} // namespaces +#endif // UTIL_STREAM_MULTI_STREAM_H diff --git a/mosesdecoder/util/stream/rewindable_stream.hh b/mosesdecoder/util/stream/rewindable_stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..560825cde974cfac4e98a870d260a5fd481075fc --- /dev/null +++ b/mosesdecoder/util/stream/rewindable_stream.hh @@ -0,0 +1,132 @@ +#ifndef UTIL_STREAM_REWINDABLE_STREAM_H +#define UTIL_STREAM_REWINDABLE_STREAM_H + +#include "util/stream/chain.hh" + +#include + +#include + +namespace util { +namespace stream { + +/** + * A RewindableStream is like a Stream (but one that is only used for + * creating input at the start of a chain) except that it can be rewound to + * be able to re-write a part of the stream before it is sent. Rewinding + * has a limit of 2 * block_size_ - 1 in distance (it does *not* buffer an + * entire stream into memory, only a maximum of 2 * block_size_). + */ +class RewindableStream : boost::noncopyable { + public: + /** + * Creates an uninitialized RewindableStream. You **must** call Init() + * on it later! + */ + RewindableStream(); + + ~RewindableStream() { + Poison(); + } + + /** + * Initializes an existing RewindableStream at a specific position in + * a Chain. + * + * @param position The position in the chain to get input from and + * produce output on + */ + void Init(const ChainPosition &position); + + /** + * Constructs a RewindableStream at a specific position in a Chain all + * in one step. + * + * Equivalent to RewindableStream a; a.Init(....); + */ + explicit RewindableStream(const ChainPosition &position) + : in_(NULL) { + Init(position); + } + + /** + * Gets the record at the current stream position. Const version. 
+ */ + const void *Get() const { + assert(!poisoned_); + assert(current_); + return current_; + } + + /** + * Gets the record at the current stream position. + */ + void *Get() { + assert(!poisoned_); + assert(current_); + return current_; + } + + operator bool() const { return !poisoned_; } + + bool operator!() const { return poisoned_; } + + /** + * Marks the current position in the stream to be rewound to later. + * Note that you can only rewind back as far as 2 * block_size_ - 1! + */ + void Mark(); + + /** + * Rewinds the stream back to the marked position. This will throw an + * exception if the marked position is too far away. + */ + void Rewind(); + + /** + * Moves the stream forward to the next record. This internally may + * buffer a block for the purposes of rewinding. + */ + RewindableStream& operator++(); + + /** + * Poisons the stream. This sends any buffered blocks down the chain + * and sends a poison block as well (sending at most 2 non-poison and 1 + * poison block). + */ + void Poison(); + + private: + void AppendBlock(); + + void Flush(std::deque::iterator to); + + std::deque blocks_; + // current_ is in blocks_[blocks_it_] unless poisoned_. + std::size_t blocks_it_; + + std::size_t entry_size_; + std::size_t block_size_; + std::size_t block_count_; + + uint8_t *marked_, *current_; + const uint8_t *block_end_; + + PCQueue *in_, *out_; + + // Have we hit poison at the end of the stream, even if rewinding? + bool hit_poison_; + // Is the current position poison? 
+ bool poisoned_; + + WorkerProgress progress_; +}; + +inline Chain &operator>>(Chain &chain, RewindableStream &stream) { + stream.Init(chain.Add()); + return chain; +} + +} +} +#endif diff --git a/mosesdecoder/util/stream/rewindable_stream_test.cc b/mosesdecoder/util/stream/rewindable_stream_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8924c3c74db03e4107b1d8132cf8336cf8f50c2 --- /dev/null +++ b/mosesdecoder/util/stream/rewindable_stream_test.cc @@ -0,0 +1,41 @@ +#include "util/stream/io.hh" + +#include "util/stream/rewindable_stream.hh" +#include "util/file.hh" + +#define BOOST_TEST_MODULE RewindableStreamTest +#include + +namespace util { +namespace stream { +namespace { + +BOOST_AUTO_TEST_CASE(RewindableStreamTest) { + scoped_fd in(MakeTemp("io_test_temp")); + for (uint64_t i = 0; i < 100000; ++i) { + WriteOrThrow(in.get(), &i, sizeof(uint64_t)); + } + SeekOrThrow(in.get(), 0); + + ChainConfig config; + config.entry_size = 8; + config.total_memory = 100; + config.block_count = 6; + + Chain chain(config); + RewindableStream s; + chain >> Read(in.get()) >> s >> kRecycle; + uint64_t i = 0; + for (; s; ++s, ++i) { + BOOST_CHECK_EQUAL(i, *static_cast(s.Get())); + if (100000UL - i == 2) + s.Mark(); + } + BOOST_CHECK_EQUAL(100000ULL, i); + s.Rewind(); + BOOST_CHECK_EQUAL(100000ULL - 2, *static_cast(s.Get())); +} + +} +} +} diff --git a/mosesdecoder/util/stream/sort_test.cc b/mosesdecoder/util/stream/sort_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc97ffdbfea20507f9d7f1d3a5a833e6cdc22e07 --- /dev/null +++ b/mosesdecoder/util/stream/sort_test.cc @@ -0,0 +1,62 @@ +#include "util/stream/sort.hh" + +#define BOOST_TEST_MODULE SortTest +#include + +#include + +#include + +namespace util { namespace stream { namespace { + +struct CompareUInt64 : public std::binary_function { + bool operator()(const void *first, const void *second) const { + return *static_cast(first) < *reinterpret_cast(second); + } +}; + 
+const uint64_t kSize = 100000; + +struct Putter { + Putter(std::vector &shuffled) : shuffled_(shuffled) {} + + void Run(const ChainPosition &position) { + Stream put_shuffled(position); + for (uint64_t i = 0; i < shuffled_.size(); ++i, ++put_shuffled) { + *static_cast(put_shuffled.Get()) = shuffled_[i]; + } + put_shuffled.Poison(); + } + std::vector &shuffled_; +}; + +BOOST_AUTO_TEST_CASE(FromShuffled) { + std::vector shuffled; + shuffled.reserve(kSize); + for (uint64_t i = 0; i < kSize; ++i) { + shuffled.push_back(i); + } + std::random_shuffle(shuffled.begin(), shuffled.end()); + + ChainConfig config; + config.entry_size = 8; + config.total_memory = 800; + config.block_count = 3; + + SortConfig merge_config; + merge_config.temp_prefix = "sort_test_temp"; + merge_config.buffer_size = 800; + merge_config.total_memory = 3300; + + Chain chain(config); + chain >> Putter(shuffled); + BlockingSort(chain, merge_config, CompareUInt64(), NeverCombine()); + Stream sorted; + chain >> sorted >> kRecycle; + for (uint64_t i = 0; i < kSize; ++i, ++sorted) { + BOOST_CHECK_EQUAL(i, *static_cast(sorted.Get())); + } + BOOST_CHECK(!sorted); +} + +}}} // namespaces diff --git a/mosesdecoder/util/stream/timer.hh b/mosesdecoder/util/stream/timer.hh new file mode 100644 index 0000000000000000000000000000000000000000..9e9573d15bed888b194459ffffb52b069429ab2f --- /dev/null +++ b/mosesdecoder/util/stream/timer.hh @@ -0,0 +1,16 @@ +#ifndef UTIL_STREAM_TIMER_H +#define UTIL_STREAM_TIMER_H + +// Sorry Jon, this was adding library dependencies in Moses and people complained. + +/*#include + +#if BOOST_VERSION >= 104800 +#include +#define UTIL_TIMER(str) boost::timer::auto_cpu_timer timer(std::cerr, 1, (str)) +#else +//#warning Using Boost older than 1.48. 
Timing information will not be available.*/ +#define UTIL_TIMER(str) +//#endif + +#endif // UTIL_STREAM_TIMER_H diff --git a/mosesdecoder/util/string_piece.hh b/mosesdecoder/util/string_piece.hh new file mode 100644 index 0000000000000000000000000000000000000000..4288086922c0f8ad1d4ef73332902e46875e5520 --- /dev/null +++ b/mosesdecoder/util/string_piece.hh @@ -0,0 +1,278 @@ +/* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If + * you don't use ICU, then this will use the Google implementation from Chrome. + * This has been modified from the original version to let you choose. + */ + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// Copied from strings/stringpiece.h with modifications +// +// A string-like object that points to a sized piece of memory. +// +// Functions or methods may use const StringPiece& parameters to accept either +// a "const char*" or a "string" value that will be implicitly converted to +// a StringPiece. The implicit conversion means that it is often appropriate +// to include this .h file in other files rather than forward-declaring +// StringPiece as would be appropriate for most other Google classes. +// +// Systematic usage of StringPiece is encouraged as it will reduce unnecessary +// conversions from "const char*" to "string" and back again. +// + +#ifndef UTIL_STRING_PIECE_H +#define UTIL_STRING_PIECE_H + +#include "util/have.hh" + +#include +#include +#include + +#ifdef HAVE_ICU +#include +#include + +// Old versions of ICU don't define operator== and operator!=. +#if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4)) +#warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6. 
+inline bool operator==(const StringPiece& x, const StringPiece& y) { + if (x.size() != y.size()) + return false; + + return std::memcmp(x.data(), y.data(), x.size()) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} +#endif // old version of ICU + +U_NAMESPACE_BEGIN + +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { + int longersize = longer.size(), prefixsize = prefix.size(); + return longersize >= prefixsize && std::memcmp(longer.data(), prefix.data(), prefixsize) == 0; +} + +#else + +#include +#include +#include +#include + +#ifdef WIN32 +#undef max +#undef min +#endif + +class StringPiece { + public: + typedef size_t size_type; + + private: + const char* ptr_; + size_type length_; + + public: + // We provide non-explicit singleton constructors so users can pass + // in a "const char*" or a "string" wherever a "StringPiece" is + // expected. + StringPiece() : ptr_(NULL), length_(0) { } + StringPiece(const char* str) + : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { } + StringPiece(const std::string& str) + : ptr_(str.data()), length_(str.size()) { } + StringPiece(const char* offset, size_type len) + : ptr_(offset), length_(len) { } + + // data() may return a pointer to a buffer with embedded NULs, and the + // returned buffer may or may not be null terminated. Therefore it is + // typically a mistake to pass data() to a routine that expects a NUL + // terminated string. + const char* data() const { return ptr_; } + size_type size() const { return length_; } + size_type length() const { return length_; } + bool empty() const { return length_ == 0; } + + void clear() { ptr_ = NULL; length_ = 0; } + void set(const char* data, size_type len) { ptr_ = data; length_ = len; } + void set(const char* str) { + ptr_ = str; + length_ = str ? 
strlen(str) : 0; + } + void set(const void* data, size_type len) { + ptr_ = reinterpret_cast(data); + length_ = len; + } + + char operator[](size_type i) const { return ptr_[i]; } + + void remove_prefix(size_type n) { + ptr_ += n; + length_ -= n; + } + + void remove_suffix(size_type n) { + length_ -= n; + } + + int compare(const StringPiece& x) const { + int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_)); + if (r == 0) { + if (length_ < x.length_) r = -1; + else if (length_ > x.length_) r = +1; + } + return r; + } + + std::string as_string() const { + // std::string doesn't like to take a NULL pointer even with a 0 size. + return std::string(!empty() ? data() : "", size()); + } + + void CopyToString(std::string* target) const; + void AppendToString(std::string* target) const; + + // Does "this" start with "x" + bool starts_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (wordmemcmp(ptr_, x.ptr_, x.length_) == 0)); + } + + // Does "this" end with "x" + bool ends_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); + } + + // standard STL container boilerplate + typedef char value_type; + typedef const char* pointer; + typedef const char& reference; + typedef const char& const_reference; + typedef ptrdiff_t difference_type; + static const size_type npos; + typedef const char* const_iterator; + typedef const char* iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + iterator begin() const { return ptr_; } + iterator end() const { return ptr_ + length_; } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(ptr_ + length_); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(ptr_); + } + + size_type max_size() const { return length_; } + size_type capacity() const { return length_; } + + size_type copy(char* buf, size_type n, size_type pos 
= 0) const; + + size_type find(const StringPiece& s, size_type pos = 0) const; + size_type find(char c, size_type pos = 0) const; + size_type rfind(const StringPiece& s, size_type pos = npos) const; + size_type rfind(char c, size_type pos = npos) const; + + size_type find_first_of(const StringPiece& s, size_type pos = 0) const; + size_type find_first_of(char c, size_type pos = 0) const { + return find(c, pos); + } + size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const; + size_type find_first_not_of(char c, size_type pos = 0) const; + size_type find_last_of(const StringPiece& s, size_type pos = npos) const; + size_type find_last_of(char c, size_type pos = npos) const { + return rfind(c, pos); + } + size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const; + size_type find_last_not_of(char c, size_type pos = npos) const; + + StringPiece substr(size_type pos, size_type n = npos) const; + + static int wordmemcmp(const char* p, const char* p2, size_type N) { + return std::memcmp(p, p2, N); + } +}; + +inline bool operator==(const StringPiece& x, const StringPiece& y) { + if (x.size() != y.size()) + return false; + + return std::memcmp(x.data(), y.data(), x.size()) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} + +inline bool starts_with(const StringPiece& longer, const StringPiece& prefix) { + return longer.starts_with(prefix); +} + +#endif // HAVE_ICU undefined + +inline bool operator<(const StringPiece& x, const StringPiece& y) { + const int r = std::memcmp(x.data(), y.data(), + std::min(x.size(), y.size())); + return ((r < 0) || ((r == 0) && (x.size() < y.size()))); +} + +inline bool operator>(const StringPiece& x, const StringPiece& y) { + return y < x; +} + +inline bool operator<=(const StringPiece& x, const StringPiece& y) { + return !(x > y); +} + +inline bool operator>=(const StringPiece& x, const StringPiece& y) { + return !(x < y); +} + +inline StringPiece 
Trim(const StringPiece& str, const std::string dropChars = " \t\n\r") +{ + StringPiece::size_type startPos = str.find_first_not_of(dropChars); + StringPiece::size_type endPos = str.find_last_not_of(dropChars); + StringPiece ret = str.substr(startPos, endPos - startPos + 1); + return ret; +} + +// allow StringPiece to be logged (needed for unit testing). +inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { + return o.write(piece.data(), static_cast(piece.size())); +} + +#ifdef HAVE_ICU +U_NAMESPACE_END +using U_NAMESPACE_QUALIFIER StringPiece; +#endif + +#endif // UTIL_STRING_PIECE_H diff --git a/mosesdecoder/util/string_stream.hh b/mosesdecoder/util/string_stream.hh new file mode 100644 index 0000000000000000000000000000000000000000..28fdd4219900c500b6e7d7a89d9e01fbdc50f903 --- /dev/null +++ b/mosesdecoder/util/string_stream.hh @@ -0,0 +1,48 @@ +#ifndef UTIL_STRING_STREAM_H +#define UTIL_STRING_STREAM_H + +#include "util/fake_ostream.hh" + +#include +#include + +namespace util { + +class StringStream : public FakeOStream { + public: + StringStream() {} + + StringStream &flush() { return *this; } + + StringStream &write(const void *data, std::size_t length) { + out_.append(static_cast(data), length); + return *this; + } + + const std::string &str() const { return out_; } + + void str(const std::string &val) { out_ = val; } + + void swap(std::string &str) { std::swap(out_, str); } + + protected: + friend class FakeOStream; + char *Ensure(std::size_t amount) { + std::size_t current = out_.size(); + out_.resize(out_.size() + amount); + return &out_[current]; + } + + void AdvanceTo(char *to) { + assert(to <= &*out_.end()); + assert(to >= &*out_.begin()); + out_.resize(to - &*out_.begin()); + } + + private: + std::string out_; +}; + +} // namespace + +#endif // UTIL_STRING_STREAM_H